@@ -23,8 +23,61 @@ def capfirst(x):
23
23
return x[0].upper() + x[1:]
24
24
25
25
26
-
# Set up regular expressions
27
-
re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
26
+
# ----- Begin security-related performance workaround -----
27
+
28
+
# We used to have, below
29
+
#
30
+
# re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
31
+
#
32
+
# But it was shown that this regex, in the way we use it here, has some
33
+
# catastrophic edge-case performance behavior, namely when it is applied to
34
+
# text with only open brackets "<<<...". The class below provides the services
35
+
# and correct answers for the use cases, but in these edge cases does it much
36
+
# faster.
37
+
# Captures one run of characters that are not "<", ">" or whitespace (a plain
# word) in group 1. WordsRegex.search falls back to it when a "<" has no
# closing ">" after it.
re_notag = _lazy_re_compile(r"([^<>\s]+)", re.S)
38
+
# Matches either a single "<" (group 1 is then unset) or a plain word captured
# in group 1. WordsRegex.search uses it as its first probe.
re_prt = _lazy_re_compile(r"<|([^<>\s]+)", re.S)
39
+
40
+
41
+
class WordsRegex:
    """Stand-in for a compiled pattern that exposes only ``search(text, pos)``.

    It yields either a word match (group 1 set) or a "<...>" span (group 1 is
    ``None``) without asking the regex engine to scan for the closing ">".
    """

    @staticmethod
    def search(text, pos):
        """Find the next word or "<...>" span in ``text`` at or after ``pos``.

        Returns whatever ``re_prt`` / ``re_notag`` return for a word (or for
        no match at all), or a ``FakeMatch`` covering the text from a "<" up
        to and including the first ">" that follows it.
        """
        found = re_prt.search(text, pos)
        # Nothing left to match, or a real word was captured: pass it through.
        if found is None or found[1] is not None:
            return found

        # A "<" was matched; a plain string search locates the next ">".
        close_idx = text.find(">", found.end(0))
        if close_idx >= 0:
            # "<" followed by a ">" somewhere later -- fake a match for it.
            stop = close_idx + 1
            return FakeMatch(text[found.start(0) : stop], stop)

        # No ">" anywhere after the "<", so only plain words can still match.
        return re_notag.search(text, pos + 1)
58
+
59
+
60
+
class FakeMatch:
    """Minimal imitation of a regex match object for a "<...>" span.

    Only the pieces of the match interface implemented here are supported:
    ``match[0]``, ``match[1]`` and ``match.end(0)``. Group 0 is the matched
    text; group 1 is always ``None``.
    """

    __slots__ = ["_text", "_end"]

    def __init__(self, text, end):
        """Store the matched ``text`` and the ``end`` offset reported by end()."""
        self._text, self._end = text, end

    def __repr__(self):
        return f"{type(self).__name__}({self._text!r}, {self._end!r})"

    def end(self, group=0):
        """Return the stored end offset; only ``group=0`` is supported.

        Raises:
            IndexError: if ``group`` is not 0, as a real match object does for
                an unknown group.
        """
        # Explicit raise rather than assert, so the check survives ``python -O``.
        if group != 0:
            raise IndexError("This specific object takes only group=0")
        return self._end

    def __getitem__(self, group):
        """Return the matched text for group 0 and ``None`` for group 1.

        Raises:
            IndexError: if ``group`` is neither 0 nor 1.
        """
        if group == 1:
            return None
        # Explicit raise rather than assert, so the check survives ``python -O``.
        if group != 0:
            raise IndexError("This specific object takes only group in {0,1}")
        return self._text
75
+
76
+
77
+
# ----- End security-related performance workaround -----
78
+
79
+
# Set up regular expressions.
80
+
# Bound to the WordsRegex class (not a compiled pattern); it offers only a
# static search(text, pos).
re_words = WordsRegex
28
81
# Matches a "<...>" span, or otherwise captures any single character in group 1
# (re.S lets "." match newlines too).
re_chars = _lazy_re_compile(r"<[^>]+?>|(.)", re.S)
29
82
# Groups: 1 = optional "/" right after "<", 2 = the run of non-whitespace that
# follows (the tag name), 3 = optional "/" (with any preceding whitespace)
# right before ">".
re_tag = _lazy_re_compile(r"<(/)?(\S+?)(?:(\s*/)|\s.*?)?>", re.S)
30
83
re_newlines = _lazy_re_compile(r"\r\n|\r") # Used in normalize_newlines
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4