StringEncoding.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. #
  2. # Cython -- encoding related tools
  3. #
  4. from __future__ import absolute_import
  5. import re
  6. import sys
  7. if sys.version_info[0] >= 3:
  8. _unicode, _str, _bytes, _unichr = str, str, bytes, chr
  9. IS_PYTHON3 = True
  10. else:
  11. _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
  12. IS_PYTHON3 = False
  13. empty_bytes = _bytes()
  14. empty_unicode = _unicode()
  15. join_bytes = empty_bytes.join
  16. class UnicodeLiteralBuilder(object):
  17. """Assemble a unicode string.
  18. """
  19. def __init__(self):
  20. self.chars = []
  21. def append(self, characters):
  22. if isinstance(characters, _bytes):
  23. # this came from a Py2 string literal in the parser code
  24. characters = characters.decode("ASCII")
  25. assert isinstance(characters, _unicode), str(type(characters))
  26. self.chars.append(characters)
  27. if sys.maxunicode == 65535:
  28. def append_charval(self, char_number):
  29. if char_number > 65535:
  30. # wide Unicode character on narrow platform => replace
  31. # by surrogate pair
  32. char_number -= 0x10000
  33. self.chars.append( _unichr((char_number // 1024) + 0xD800) )
  34. self.chars.append( _unichr((char_number % 1024) + 0xDC00) )
  35. else:
  36. self.chars.append( _unichr(char_number) )
  37. else:
  38. def append_charval(self, char_number):
  39. self.chars.append( _unichr(char_number) )
  40. def append_uescape(self, char_number, escape_string):
  41. self.append_charval(char_number)
  42. def getstring(self):
  43. return EncodedString(u''.join(self.chars))
  44. def getstrings(self):
  45. return (None, self.getstring())
  46. class BytesLiteralBuilder(object):
  47. """Assemble a byte string or char value.
  48. """
  49. def __init__(self, target_encoding):
  50. self.chars = []
  51. self.target_encoding = target_encoding
  52. def append(self, characters):
  53. if isinstance(characters, _unicode):
  54. characters = characters.encode(self.target_encoding)
  55. assert isinstance(characters, _bytes), str(type(characters))
  56. self.chars.append(characters)
  57. def append_charval(self, char_number):
  58. self.chars.append( _unichr(char_number).encode('ISO-8859-1') )
  59. def append_uescape(self, char_number, escape_string):
  60. self.append(escape_string)
  61. def getstring(self):
  62. # this *must* return a byte string!
  63. return bytes_literal(join_bytes(self.chars), self.target_encoding)
  64. def getchar(self):
  65. # this *must* return a byte string!
  66. return self.getstring()
  67. def getstrings(self):
  68. return (self.getstring(), None)
  69. class StrLiteralBuilder(object):
  70. """Assemble both a bytes and a unicode representation of a string.
  71. """
  72. def __init__(self, target_encoding):
  73. self._bytes = BytesLiteralBuilder(target_encoding)
  74. self._unicode = UnicodeLiteralBuilder()
  75. def append(self, characters):
  76. self._bytes.append(characters)
  77. self._unicode.append(characters)
  78. def append_charval(self, char_number):
  79. self._bytes.append_charval(char_number)
  80. self._unicode.append_charval(char_number)
  81. def append_uescape(self, char_number, escape_string):
  82. self._bytes.append(escape_string)
  83. self._unicode.append_charval(char_number)
  84. def getstrings(self):
  85. return (self._bytes.getstring(), self._unicode.getstring())
  86. class EncodedString(_unicode):
  87. # unicode string subclass to keep track of the original encoding.
  88. # 'encoding' is None for unicode strings and the source encoding
  89. # otherwise
  90. encoding = None
  91. def __deepcopy__(self, memo):
  92. return self
  93. def byteencode(self):
  94. assert self.encoding is not None
  95. return self.encode(self.encoding)
  96. def utf8encode(self):
  97. assert self.encoding is None
  98. return self.encode("UTF-8")
  99. @property
  100. def is_unicode(self):
  101. return self.encoding is None
  102. def contains_surrogates(self):
  103. return string_contains_surrogates(self)
  104. def as_utf8_string(self):
  105. return bytes_literal(self.utf8encode(), 'utf8')
  106. def string_contains_surrogates(ustring):
  107. """
  108. Check if the unicode string contains surrogate code points
  109. on a CPython platform with wide (UCS-4) or narrow (UTF-16)
  110. Unicode, i.e. characters that would be spelled as two
  111. separate code units on a narrow platform.
  112. """
  113. for c in map(ord, ustring):
  114. if c > 65535: # can only happen on wide platforms
  115. return True
  116. if 0xD800 <= c <= 0xDFFF:
  117. return True
  118. return False
  119. def string_contains_lone_surrogates(ustring):
  120. """
  121. Check if the unicode string contains lone surrogate code points
  122. on a CPython platform with wide (UCS-4) or narrow (UTF-16)
  123. Unicode, i.e. characters that would be spelled as two
  124. separate code units on a narrow platform, but that do not form a pair.
  125. """
  126. last_was_start = False
  127. unicode_uses_surrogate_encoding = sys.maxunicode == 65535
  128. for c in map(ord, ustring):
  129. # surrogates tend to be rare
  130. if c < 0xD800 or c > 0xDFFF:
  131. if last_was_start:
  132. return True
  133. elif not unicode_uses_surrogate_encoding:
  134. # on 32bit Unicode platforms, there is never a pair
  135. return True
  136. elif c <= 0xDBFF:
  137. if last_was_start:
  138. return True # lone start
  139. last_was_start = True
  140. else:
  141. if not last_was_start:
  142. return True # lone end
  143. last_was_start = False
  144. return last_was_start
  145. class BytesLiteral(_bytes):
  146. # bytes subclass that is compatible with EncodedString
  147. encoding = None
  148. def __deepcopy__(self, memo):
  149. return self
  150. def byteencode(self):
  151. if IS_PYTHON3:
  152. return _bytes(self)
  153. else:
  154. # fake-recode the string to make it a plain bytes object
  155. return self.decode('ISO-8859-1').encode('ISO-8859-1')
  156. def utf8encode(self):
  157. assert False, "this is not a unicode string: %r" % self
  158. def __str__(self):
  159. """Fake-decode the byte string to unicode to support %
  160. formatting of unicode strings.
  161. """
  162. return self.decode('ISO-8859-1')
  163. is_unicode = False
  164. def as_c_string_literal(self):
  165. value = split_string_literal(escape_byte_string(self))
  166. return '"%s"' % value
  167. def bytes_literal(s, encoding):
  168. assert isinstance(s, bytes)
  169. s = BytesLiteral(s)
  170. s.encoding = encoding
  171. return s
  172. def encoded_string(s, encoding):
  173. assert isinstance(s, (_unicode, bytes))
  174. s = EncodedString(s)
  175. if encoding is not None:
  176. s.encoding = encoding
  177. return s
  178. char_from_escape_sequence = {
  179. r'\a' : u'\a',
  180. r'\b' : u'\b',
  181. r'\f' : u'\f',
  182. r'\n' : u'\n',
  183. r'\r' : u'\r',
  184. r'\t' : u'\t',
  185. r'\v' : u'\v',
  186. }.get
  187. _c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
  188. def _to_escape_sequence(s):
  189. if s in '\n\r\t':
  190. return repr(s)[1:-1]
  191. elif s == '"':
  192. return r'\"'
  193. elif s == '\\':
  194. return r'\\'
  195. else:
  196. # within a character sequence, oct passes much better than hex
  197. return ''.join(['\\%03o' % ord(c) for c in s])
  198. def _build_specials_replacer():
  199. subexps = []
  200. replacements = {}
  201. for special in _c_special:
  202. regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
  203. subexps.append(regexp)
  204. replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
  205. sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
  206. def replace_specials(m):
  207. return replacements[m.group(1)]
  208. def replace(s):
  209. return sub(replace_specials, s)
  210. return replace
  211. _replace_specials = _build_specials_replacer()
  212. def escape_char(c):
  213. if IS_PYTHON3:
  214. c = c.decode('ISO-8859-1')
  215. if c in '\n\r\t\\':
  216. return repr(c)[1:-1]
  217. elif c == "'":
  218. return "\\'"
  219. n = ord(c)
  220. if n < 32 or n > 127:
  221. # hex works well for characters
  222. return "\\x%02X" % n
  223. else:
  224. return c
  225. def escape_byte_string(s):
  226. """Escape a byte string so that it can be written into C code.
  227. Note that this returns a Unicode string instead which, when
  228. encoded as ISO-8859-1, will result in the correct byte sequence
  229. being written.
  230. """
  231. s = _replace_specials(s)
  232. try:
  233. return s.decode("ASCII") # trial decoding: plain ASCII => done
  234. except UnicodeDecodeError:
  235. pass
  236. if IS_PYTHON3:
  237. s_new = bytearray()
  238. append, extend = s_new.append, s_new.extend
  239. for b in s:
  240. if b >= 128:
  241. extend(('\\%3o' % b).encode('ASCII'))
  242. else:
  243. append(b)
  244. return s_new.decode('ISO-8859-1')
  245. else:
  246. l = []
  247. append = l.append
  248. for c in s:
  249. o = ord(c)
  250. if o >= 128:
  251. append('\\%3o' % o)
  252. else:
  253. append(c)
  254. return join_bytes(l).decode('ISO-8859-1')
  255. def split_string_literal(s, limit=2000):
  256. # MSVC can't handle long string literals.
  257. if len(s) < limit:
  258. return s
  259. else:
  260. start = 0
  261. chunks = []
  262. while start < len(s):
  263. end = start + limit
  264. if len(s) > end-4 and '\\' in s[end-4:end]:
  265. end -= 4 - s[end-4:end].find('\\') # just before the backslash
  266. while s[end-1] == '\\':
  267. end -= 1
  268. if end == start:
  269. # must have been a long line of backslashes
  270. end = start + limit - (limit % 2) - 4
  271. break
  272. chunks.append(s[start:end])
  273. start = end
  274. return '""'.join(chunks)
  275. def encode_pyunicode_string(s):
  276. """Create Py_UNICODE[] representation of a given unicode string.
  277. """
  278. s = list(map(ord, s)) + [0]
  279. if sys.maxunicode >= 0x10000: # Wide build or Py3.3
  280. utf16, utf32 = [], s
  281. for code_point in s:
  282. if code_point >= 0x10000: # outside of BMP
  283. high, low = divmod(code_point - 0x10000, 1024)
  284. utf16.append(high + 0xD800)
  285. utf16.append(low + 0xDC00)
  286. else:
  287. utf16.append(code_point)
  288. else:
  289. utf16, utf32 = s, []
  290. for code_unit in s:
  291. if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
  292. high, low = utf32[-1], code_unit
  293. utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
  294. else:
  295. utf32.append(code_unit)
  296. if utf16 == utf32:
  297. utf16 = []
  298. return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))