7
7
from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit
8
8
9
9
from django.utils.encoding import punycode
10
-
from django.utils.functional import Promise, keep_lazy, keep_lazy_text
10
+
from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text
11
11
from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS
12
12
from django.utils.regex_helper import _lazy_re_compile
13
13
from django.utils.safestring import SafeData, SafeString, mark_safe
@@ -225,6 +225,16 @@ def unquote_quote(segment):
225
225
return urlunsplit((scheme, netloc, path, query, fragment))
226
226
227
227
228
+
class CountsDict(dict):
    """
    Dict that lazily caches how often a substring occurs in a fixed word.

    Looking up a key that is not present stores ``word.count(key)`` under
    that key and returns it. Stored values are plain dict entries, so callers
    may adjust them afterwards without triggering a recount.
    """

    def __init__(self, *args, word, **kwargs):
        # Forward keyword arguments as keywords; unpacking them with a single
        # star would pass the keyword *names* to dict() as positional
        # arguments.
        super().__init__(*args, **kwargs)
        self.word = word

    def __missing__(self, key):
        # Only called by dict.__getitem__ when `key` is absent: compute the
        # count once and remember it.
        self[key] = self.word.count(key)
        return self[key]
236
+
237
+
228
238
class Urlizer:
229
239
"""
230
240
Convert any URLs in text into clickable links.
@@ -330,40 +340,72 @@ def trim_url(self, x, *, limit):
330
340
return x
331
341
return "%s…" % x[: max(0, limit - 1)]
332
342
343
+
@cached_property
def wrapping_punctuation_openings(self):
    """
    Return a string made of the opening character of every pair in
    `self.wrapping_punctuation`, suitable as the argument to `str.lstrip()`.
    """
    # Iterating a dict yields its keys, i.e. the opening halves of the pairs.
    pairs = dict(self.wrapping_punctuation)
    return "".join(pairs)
346
+
347
+
@cached_property
def trailing_punctuation_chars_no_semicolon(self):
    """
    Return `self.trailing_punctuation_chars` with every ";" removed.
    """
    chars = self.trailing_punctuation_chars
    return "".join(char for char in chars if char != ";")
350
+
351
+
@cached_property
def trailing_punctuation_chars_has_semicolon(self):
    """
    Return True if ";" is one of `self.trailing_punctuation_chars`.
    """
    chars = self.trailing_punctuation_chars
    return ";" in chars
354
+
333
355
def trim_punctuation(self, word):
    """
    Trim trailing and wrapping punctuation from `word`. Return the items of
    the new state.

    The result is a ``(lead, middle, trail)`` tuple of strings: ``lead`` is
    the run of opening wrapping punctuation removed from the front,
    ``trail`` is everything removed from the end, and ``middle`` is what
    remains. Concatenating the three gives back `word`.
    """
    # Strip all opening wrapping punctuation.
    middle = word.lstrip(self.wrapping_punctuation_openings)
    lead = word[: len(word) - len(middle)]
    trail = ""

    # Continue trimming until middle remains unchanged.
    trimmed_something = True
    # Character counts are computed lazily, once per character, from the
    # lstripped word and then adjusted by hand as closers are trimmed.
    counts = CountsDict(word=middle)
    while trimmed_something and middle:
        trimmed_something = False
        # Trim wrapping punctuation.
        for opening, closing in self.wrapping_punctuation:
            if counts[opening] < counts[closing]:
                rstripped = middle.rstrip(closing)
                if rstripped != middle:
                    # Number of closers that have no matching opener.
                    # NOTE(review): this can exceed the run of closers at
                    # the very end of `middle`, in which case the slice
                    # below also moves other characters into `trail` --
                    # confirm that is intended.
                    strip = counts[closing] - counts[opening]
                    # Prepend to whatever was trimmed on earlier passes;
                    # plain assignment would discard it.
                    trail = middle[-strip:] + trail
                    middle = middle[:-strip]
                    trimmed_something = True
                    counts[closing] -= strip

        # Trim trailing punctuation other than ";", which needs the entity
        # check below.
        rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
        if rstripped != middle:
            trail = middle[len(rstripped) :] + trail
            middle = rstripped
            trimmed_something = True

        if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
            # Only strip if not part of an HTML entity.
            amp = middle.rfind("&")
            if amp == -1:
                can_strip = True
            else:
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                # Strippable when nothing was unescaped, or when a ";" is
                # still left at the end after unescaping.
                can_strip = (escaped == potential_entity) or escaped.endswith(";")

            if can_strip:
                rstripped = middle.rstrip(";")
                amount_stripped = len(middle) - len(rstripped)
                if amp > -1 and amount_stripped > 1:
                    # Leave a trailing semicolon as might be an entity.
                    trail = middle[len(rstripped) + 1 :] + trail
                    middle = rstripped + ";"
                else:
                    trail = middle[len(rstripped) :] + trail
                    middle = rstripped
                trimmed_something = True

    return lead, middle, trail
368
410
369
411
@staticmethod
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4