@@ -18,8 +18,61 @@ def capfirst(x):
18
18
return x and str(x)[0].upper() + str(x)[1:]
19
19
20
20
21
-
# Set up regular expressions
22
-
# Former single-pattern version (on the removed side of this diff): matches a
# "<...>" span with at least one character between the brackets (not
# captured), or a run of characters that are not "<", ">" or whitespace
# (captured as group 1).
re_words = _lazy_re_compile(r'<[^>]+?>|([^<>\s]+)', re.S)
21
+
# ----- Begin security-related performance workaround -----
22
+
23
+
# We used to have, below
24
+
#
25
+
# re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
26
+
#
27
+
# But it was shown that this regex, in the way we use it here, has
28
+
# catastrophic performance in some edge cases, namely when it is applied to
29
+
# text containing only open brackets ("<<<..."). The class below provides the
30
+
# same service and the correct answers for our use cases, but handles these
31
+
# edge cases much faster.
32
+
# Matches one "word": a run of characters that are not "<", ">" or
# whitespace, captured as group 1.
re_notag = _lazy_re_compile(r"([^<>\s]+)", re.S)
33
+
# Matches either a single literal "<" (group 1 is then unset) or a run of
# characters that are not "<", ">" or whitespace (captured as group 1).
re_prt = _lazy_re_compile(r"<|([^<>\s]+)", re.S)
34
+
35
+
36
+
class WordsRegex:
    """Stand-in for a compiled words pattern that exposes only ``search``.

    The lookup is split into a cheap regex step (find a "<" or a word) and a
    plain ``str.find`` for the closing ">", instead of one regex that tries to
    match a whole "<...>" span.
    """

    @staticmethod
    def search(text, pos):
        """Find the next "<...>" span or word in ``text`` starting at ``pos``.

        Returns ``None`` when nothing is found, the result of
        ``re_prt.search`` / ``re_notag.search`` when a word is found (the word
        is group 1), or a ``FakeMatch`` covering the text from "<" through the
        first following ">" (group 1 is ``None``).
        """
        hit = re_prt.search(text, pos)
        if hit is None:
            # Neither "<" nor a word exists from pos onwards.
            return None
        if hit[1] is not None:
            # A word was captured; the regex result is already the answer.
            return hit

        # The match is a bare "<": scan for the ">" that would close it.
        close_at = text.find(">", hit.end(0))
        if close_at >= 0:
            # Report everything from "<" through ">" as one match.
            stop = close_at + 1
            return FakeMatch(text[hit.start(0):stop], stop)

        # No ">" anywhere after the "<", so only a word can still match.
        return re_notag.search(text, pos + 1)
53
+
54
+
55
+
class FakeMatch:
    """Minimal stand-in for a regex match object covering one "<...>" span.

    Only ``end(0)``, ``match[0]`` and ``match[1]`` are supported. Group 0 is
    the matched text and group 1 is always ``None``. Any other group raises
    ``IndexError``, as real match objects do for an unknown group. The check
    is a real ``raise`` rather than an ``assert`` so it still runs under
    ``python -O``.
    """

    __slots__ = ["_text", "_end"]

    def __init__(self, text, end):
        """Store the matched ``text`` and the index ``end`` just past it."""
        self._text, self._end = text, end

    def end(self, group=0):
        """Return the end index of the match; only ``group=0`` is accepted.

        Raises:
            IndexError: if ``group`` is anything other than 0.
        """
        if group != 0:
            raise IndexError("This specific object takes only group=0")
        return self._end

    def __getitem__(self, group):
        """Return the matched text for group 0 and ``None`` for group 1.

        Raises:
            IndexError: if ``group`` is neither 0 nor 1.
        """
        if group == 1:
            return None
        if group != 0:
            raise IndexError("This specific object takes only group in {0,1}")
        return self._text
70
+
71
+
72
+
# ----- End security-related performance workaround -----
73
+
74
+
# Set up regular expressions.
75
+
# re_words is the WordsRegex class itself rather than a compiled pattern; the
# class provides only a static search(text, pos) method.
re_words = WordsRegex
23
76
# Matches either a "<...>" span with at least one character between the
# brackets (not captured) or any single character, captured as group 1.
# re.S is given so that "." also covers newline characters.
re_chars = _lazy_re_compile(r'<[^>]+?>|(.)', re.S)
24
77
# Matches one "<...>" tag. Group 1 is an optional "/" right after "<", group 2
# is a lazily matched run of non-whitespace characters (presumably the tag
# name), and group 3 is an optional "/" (with any preceding whitespace) just
# before ">". Anything else after whitespace is matched but not captured.
re_tag = _lazy_re_compile(r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>', re.S)
25
78
# Matches a "\r\n" pair or a lone "\r".
re_newlines = _lazy_re_compile(r'\r\n|\r') # Used in normalize_newlines
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4