# paul wrote:
# > As an aside: I would be pumped about getting a generic lexer into the
# > Python distribution.
#
# how about this quick and dirty proposal:
#
# - add a new primitive to SRE: (?P#n), where n is a small integer.  this
#   primitive sets the match object's "index" variable to n when the engine
#   stumbles upon it.
# - given a list of "phrases", combine them into a single regular expression
#   like this: (?:phrase1(?P#1))|(?:phrase2(?P#2))|...
# - apply match repeatedly to the input string.  for each match, use the
#   index attribute to figure out what phrase we matched.
#
# see below for a slightly larger example.
#
# NOTE(review): the (?P#n) primitive never shipped in the re module.  The
# port below uses one capturing group per phrase and Match.lastindex to
# recover which phrase matched -- the same technique the standard library's
# re.Scanner ended up using.  Phrases must therefore not contain capturing
# groups of their own (use (?:...) inside a phrase if grouping is needed).

import re


class Scanner:
    """Combine a lexicon of (pattern, action) pairs into a simple tokenizer.

    Each lexicon entry is (regex_phrase, action) where action is either
    None (skip the matched text), a callable (called as action(scanner,
    matched_text), its non-None result is kept), or any other value that
    is appended verbatim.
    """

    def __init__(self, lexicon):
        self.lexicon = lexicon
        # Wrap each phrase in a capturing group; after a match,
        # m.lastindex is the 1-based index of the phrase that matched.
        parts = ["(%s)" % phrase for phrase, action in lexicon]
        self.scanner = re.compile("|".join(parts))

    def scan(self, string):
        """Tokenize *string*.

        Returns (tokens, tail) where tail is the unmatched remainder
        (empty on a full scan).
        """
        result = []
        append = result.append
        match = self.scanner.match
        i = 0
        while True:
            m = match(string, i)
            if not m:
                break
            j = m.end()
            if i == j:
                # zero-width match: stop rather than loop forever
                break
            action = self.lexicon[m.lastindex - 1][1]
            if callable(action):
                # expose the match object to the action (was a bug: the
                # bound method, not the match, was being stored)
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]


def s_ident(scanner, token):
    return token


def s_operator(scanner, token):
    return "operator%s" % token


def s_float(scanner, token):
    return float(token)


def s_int(scanner, token):
    return int(token)


scanner = Scanner([
    (r"[a-zA-Z_]\w*", s_ident),
    (r"\d+\.\d*", s_float),
    (r"\d+", s_int),
    (r"=|\+|-|\*|/", s_operator),
    (r"\s+", None),
])

tokens, tail = scanner.scan("sum = 3*foo + 312.50 + bar")

print(tokens)
if tail:
    print("syntax error at", tail)

## prints:
## ['sum', 'operator=', 3, 'operator*', 'foo', 'operator+',
##  312.5, 'operator+', 'bar']
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4