punycode.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
  4. import codecs
  5. ##################### Encoding #####################################
  6. def segregate(str):
  7. """3.1 Basic code point segregation"""
  8. base = bytearray()
  9. extended = set()
  10. for c in str:
  11. if ord(c) < 128:
  12. base.append(ord(c))
  13. else:
  14. extended.add(c)
  15. extended = sorted(extended)
  16. return bytes(base), extended
  17. def selective_len(str, max):
  18. """Return the length of str, considering only characters below max."""
  19. res = 0
  20. for c in str:
  21. if ord(c) < max:
  22. res += 1
  23. return res
  24. def selective_find(str, char, index, pos):
  25. """Return a pair (index, pos), indicating the next occurrence of
  26. char in str. index is the position of the character considering
  27. only ordinals up to and including char, and pos is the position in
  28. the full string. index/pos is the starting position in the full
  29. string."""
  30. l = len(str)
  31. while 1:
  32. pos += 1
  33. if pos == l:
  34. return (-1, -1)
  35. c = str[pos]
  36. if c == char:
  37. return index+1, pos
  38. elif c < char:
  39. index += 1
  40. def insertion_unsort(str, extended):
  41. """3.2 Insertion unsort coding"""
  42. oldchar = 0x80
  43. result = []
  44. oldindex = -1
  45. for c in extended:
  46. index = pos = -1
  47. char = ord(c)
  48. curlen = selective_len(str, char)
  49. delta = (curlen+1) * (char - oldchar)
  50. while 1:
  51. index,pos = selective_find(str,c,index,pos)
  52. if index == -1:
  53. break
  54. delta += index - oldindex
  55. result.append(delta-1)
  56. oldindex = index
  57. delta = 0
  58. oldchar = char
  59. return result
  60. def T(j, bias):
  61. # Punycode parameters: tmin = 1, tmax = 26, base = 36
  62. res = 36 * (j + 1) - bias
  63. if res < 1: return 1
  64. if res > 26: return 26
  65. return res
# Digit alphabet used by the encoder: the byte at index v represents the
# digit value v (letters first, then the decimal digits).
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
  67. def generate_generalized_integer(N, bias):
  68. """3.3 Generalized variable-length integers"""
  69. result = bytearray()
  70. j = 0
  71. while 1:
  72. t = T(j, bias)
  73. if N < t:
  74. result.append(digits[N])
  75. return bytes(result)
  76. result.append(digits[t + ((N - t) % (36 - t))])
  77. N = (N - t) // (36 - t)
  78. j += 1
  79. def adapt(delta, first, numchars):
  80. if first:
  81. delta //= 700
  82. else:
  83. delta //= 2
  84. delta += delta // numchars
  85. # ((base - tmin) * tmax) // 2 == 455
  86. divisions = 0
  87. while delta > 455:
  88. delta = delta // 35 # base - tmin
  89. divisions += 36
  90. bias = divisions + (36 * delta // (delta + 38))
  91. return bias
  92. def generate_integers(baselen, deltas):
  93. """3.4 Bias adaptation"""
  94. # Punycode parameters: initial bias = 72, damp = 700, skew = 38
  95. result = bytearray()
  96. bias = 72
  97. for points, delta in enumerate(deltas):
  98. s = generate_generalized_integer(delta, bias)
  99. result.extend(s)
  100. bias = adapt(delta, points==0, baselen+points+1)
  101. return bytes(result)
  102. def punycode_encode(text):
  103. base, extended = segregate(text)
  104. deltas = insertion_unsort(text, extended)
  105. extended = generate_integers(len(base), deltas)
  106. if base:
  107. return base + b"-" + extended
  108. return extended
  109. ##################### Decoding #####################################
  110. def decode_generalized_number(extended, extpos, bias, errors):
  111. """3.3 Generalized variable-length integers"""
  112. result = 0
  113. w = 1
  114. j = 0
  115. while 1:
  116. try:
  117. char = ord(extended[extpos])
  118. except IndexError:
  119. if errors == "strict":
  120. raise UnicodeError("incomplete punicode string")
  121. return extpos + 1, None
  122. extpos += 1
  123. if 0x41 <= char <= 0x5A: # A-Z
  124. digit = char - 0x41
  125. elif 0x30 <= char <= 0x39:
  126. digit = char - 22 # 0x30-26
  127. elif errors == "strict":
  128. raise UnicodeError("Invalid extended code point '%s'"
  129. % extended[extpos-1])
  130. else:
  131. return extpos, None
  132. t = T(j, bias)
  133. result += digit * w
  134. if digit < t:
  135. return extpos, result
  136. w = w * (36 - t)
  137. j += 1
  138. def insertion_sort(base, extended, errors):
  139. """3.2 Insertion unsort coding"""
  140. char = 0x80
  141. pos = -1
  142. bias = 72
  143. extpos = 0
  144. while extpos < len(extended):
  145. newpos, delta = decode_generalized_number(extended, extpos,
  146. bias, errors)
  147. if delta is None:
  148. # There was an error in decoding. We can't continue because
  149. # synchronization is lost.
  150. return base
  151. pos += delta+1
  152. char += pos // (len(base) + 1)
  153. if char > 0x10FFFF:
  154. if errors == "strict":
  155. raise UnicodeError("Invalid character U+%x" % char)
  156. char = ord('?')
  157. pos = pos % (len(base) + 1)
  158. base = base[:pos] + chr(char) + base[pos:]
  159. bias = adapt(delta, (extpos == 0), len(base))
  160. extpos = newpos
  161. return base
  162. def punycode_decode(text, errors):
  163. if isinstance(text, str):
  164. text = text.encode("ascii")
  165. if isinstance(text, memoryview):
  166. text = bytes(text)
  167. pos = text.rfind(b"-")
  168. if pos == -1:
  169. base = ""
  170. extended = str(text, "ascii").upper()
  171. else:
  172. base = str(text[:pos], "ascii", errors)
  173. extended = str(text[pos+1:], "ascii").upper()
  174. return insertion_sort(base, extended, errors)
  175. ### Codec APIs
  176. class Codec(codecs.Codec):
  177. def encode(self, input, errors='strict'):
  178. res = punycode_encode(input)
  179. return res, len(input)
  180. def decode(self, input, errors='strict'):
  181. if errors not in ('strict', 'replace', 'ignore'):
  182. raise UnicodeError("Unsupported error handling "+errors)
  183. res = punycode_decode(input, errors)
  184. return res, len(input)
  185. class IncrementalEncoder(codecs.IncrementalEncoder):
  186. def encode(self, input, final=False):
  187. return punycode_encode(input)
  188. class IncrementalDecoder(codecs.IncrementalDecoder):
  189. def decode(self, input, final=False):
  190. if self.errors not in ('strict', 'replace', 'ignore'):
  191. raise UnicodeError("Unsupported error handling "+self.errors)
  192. return punycode_decode(input, self.errors)
class StreamWriter(Codec,codecs.StreamWriter):
    # Combines this module's Codec with codecs.StreamWriter; no members
    # are added or overridden here.
    pass
class StreamReader(Codec,codecs.StreamReader):
    # Combines this module's Codec with codecs.StreamReader; no members
    # are added or overridden here.
    pass
  197. ### encodings module API
  198. def getregentry():
  199. return codecs.CodecInfo(
  200. name='punycode',
  201. encode=Codec().encode,
  202. decode=Codec().decode,
  203. incrementalencoder=IncrementalEncoder,
  204. incrementaldecoder=IncrementalDecoder,
  205. streamwriter=StreamWriter,
  206. streamreader=StreamReader,
  207. )