# [MAL]
# > ...
# > Here is a sample implementation of what I had in mind:

"""Demo for 'unicode-escape' encoding."""
import re
import string
import struct

# struct format for one code unit: big-endian unsigned 16-bit.
pack_format = '>H'


def convert_string(s):
    """Return a list with one packed 2-byte code unit per character of *s*.

    Each character is packed with ``pack_format``.  Characters whose ordinal
    does not fit in 16 bits make ``struct.pack`` raise ``struct.error``.
    """
    return [struct.pack(pack_format, ord(ch)) for ch in s]


# A backslash, 'u', and up to four hex digits.  Fewer than four digits still
# match so that unicode_unescape() can report the malformed escape itself.
u_escape = re.compile(r'\\u([0-9a-fA-F]{0,4})')


def unicode_unescape(s):
    """Decode ``\\uXXXX`` escapes in *s* into packed 16-bit code units.

    Text outside of escapes is converted character by character with
    convert_string(); every ``\\uXXXX`` escape becomes the single code unit
    XXXX.  The pieces are joined and returned as one bytes object.

    Known limitation: the regexp search does not track whether the backslash
    of an escape is itself escaped, so ``\\\\u0000`` (two backslashes) is
    decoded as a backslash followed by U+0000.

    Raises:
        SyntaxError: if ``\\u`` is followed by fewer than four hex digits.
    """
    chunks = []
    start = 0
    while start < len(s):
        m = u_escape.search(s, start)
        if not m:
            # No further escapes: the rest of the string is literal text.
            chunks.extend(convert_string(s[start:]))
            break
        m_start, m_end = m.span()
        if m_start > start:
            # Literal text between the previous position and this escape.
            chunks.extend(convert_string(s[start:m_start]))
        hexcode = m.group(1)
        if len(hexcode) != 4:
            raise SyntaxError('illegal \\uXXXX sequence: \\u%s' % hexcode)
        chunks.append(struct.pack(pack_format, int(hexcode, 16)))
        start = m_end
    return b''.join(chunks)


def hexstr(s, sep=''):
    """Return the two-digit lowercase hex codes of *s*, joined by *sep*.

    *s* may be a bytes-like object (each byte is formatted) or a str (each
    character's ordinal is formatted).
    """
    if isinstance(s, str):
        codes = [ord(ch) for ch in s]
    else:
        codes = bytearray(s)
    return sep.join('%02x' % code for code in codes)


# It looks like r'\\u0000' will get translated into a 2-character Unicode
# string. That's probably not good, if for no other reason than that Java
# would not do this (it would create the obvious 7-character Unicode string),
# and having something that looks like a Java escape that doesn't *work* like
# the Java escape will be confusing as heck for JPython users.
# Keeping track of even-vs-odd number of backslashes can't be done with a
# regexp search, but is easy if the code is simple <wink>:

def unicode_unescape(s):
    """Decode ``\\uXXXX`` escapes in *s* into native-order 16-bit code units.

    The string is scanned left to right.  A backslash followed by ``u`` and
    exactly four hex digits becomes the single code unit XXXX.  A backslash
    followed by any other character is kept as-is (both the backslash and
    that character are emitted), which means an escaped backslash pair is
    consumed together and a following ``u`` is ordinary text.  Every other
    character is emitted as its own ordinal.

    Returns:
        bytes: the code units as unsigned shorts in native byte order.

    Raises:
        ValueError: if the string ends with a lone backslash, if ``\\u`` is
            followed by fewer than four characters, or if any of those four
            characters is not a hex digit.
        OverflowError: if a character's ordinal does not fit in an unsigned
            short (raised by ``array.append``).
    """
    import array
    from string import hexdigits

    i, n = 0, len(s)
    result = array.array('H')  # unsigned short, native order
    while i < n:
        ch = s[i]
        i += 1
        if ch != "\\":
            result.append(ord(ch))
            continue
        if i == n:
            raise ValueError("string ends with lone backslash")
        # Consume the character after the backslash in the same step, so a
        # doubled backslash can never start an escape.
        ch = s[i]
        i += 1
        if ch != "u":
            result.append(ord("\\"))
            result.append(ord(ch))
            continue
        hexchars = s[i:i+4]
        if len(hexchars) != 4:
            raise ValueError("\\u escape at end not followed by "
                             "at least 4 characters")
        i += 4
        # Check digit by digit: int() alone would also accept signs,
        # whitespace and underscores.
        if any(digit not in hexdigits for digit in hexchars):
            raise ValueError("\\u" + hexchars + " contains "
                             "non-hex characters")
        result.append(int(hexchars, 16))
    return result.tobytes()
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4