# idna.py
#
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
  2. import stringprep, re, codecs
  3. from unicodedata import ucd_3_2_0 as unicodedata
# IDNA section 3.1
# Characters accepted as label separators: full stop (U+002E),
# ideographic full stop (U+3002), fullwidth full stop (U+FF0E) and
# halfwidth ideographic full stop (U+FF61).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix that marks a Punycode-encoded label.  It is kept both as
# bytes (prepended to / matched against encoded labels) and as str
# (matched against labels that are still text).
ace_prefix = b"xn--"
sace_prefix = "xn--"
  9. # This assumes query strings, so AllowUnassigned is true
  10. def nameprep(label):
  11. # Map
  12. newlabel = []
  13. for c in label:
  14. if stringprep.in_table_b1(c):
  15. # Map to nothing
  16. continue
  17. newlabel.append(stringprep.map_table_b2(c))
  18. label = "".join(newlabel)
  19. # Normalize
  20. label = unicodedata.normalize("NFKC", label)
  21. # Prohibit
  22. for c in label:
  23. if stringprep.in_table_c12(c) or \
  24. stringprep.in_table_c22(c) or \
  25. stringprep.in_table_c3(c) or \
  26. stringprep.in_table_c4(c) or \
  27. stringprep.in_table_c5(c) or \
  28. stringprep.in_table_c6(c) or \
  29. stringprep.in_table_c7(c) or \
  30. stringprep.in_table_c8(c) or \
  31. stringprep.in_table_c9(c):
  32. raise UnicodeError("Invalid character %r" % c)
  33. # Check bidi
  34. RandAL = [stringprep.in_table_d1(x) for x in label]
  35. for c in RandAL:
  36. if c:
  37. # There is a RandAL char in the string. Must perform further
  38. # tests:
  39. # 1) The characters in section 5.8 MUST be prohibited.
  40. # This is table C.8, which was already checked
  41. # 2) If a string contains any RandALCat character, the string
  42. # MUST NOT contain any LCat character.
  43. if any(stringprep.in_table_d2(x) for x in label):
  44. raise UnicodeError("Violation of BIDI requirement 2")
  45. # 3) If a string contains any RandALCat character, a
  46. # RandALCat character MUST be the first character of the
  47. # string, and a RandALCat character MUST be the last
  48. # character of the string.
  49. if not RandAL[0] or not RandAL[-1]:
  50. raise UnicodeError("Violation of BIDI requirement 3")
  51. return label
  52. def ToASCII(label):
  53. try:
  54. # Step 1: try ASCII
  55. label = label.encode("ascii")
  56. except UnicodeError:
  57. pass
  58. else:
  59. # Skip to step 3: UseSTD3ASCIIRules is false, so
  60. # Skip to step 8.
  61. if 0 < len(label) < 64:
  62. return label
  63. raise UnicodeError("label empty or too long")
  64. # Step 2: nameprep
  65. label = nameprep(label)
  66. # Step 3: UseSTD3ASCIIRules is false
  67. # Step 4: try ASCII
  68. try:
  69. label = label.encode("ascii")
  70. except UnicodeError:
  71. pass
  72. else:
  73. # Skip to step 8.
  74. if 0 < len(label) < 64:
  75. return label
  76. raise UnicodeError("label empty or too long")
  77. # Step 5: Check ACE prefix
  78. if label.startswith(sace_prefix):
  79. raise UnicodeError("Label starts with ACE prefix")
  80. # Step 6: Encode with PUNYCODE
  81. label = label.encode("punycode")
  82. # Step 7: Prepend ACE prefix
  83. label = ace_prefix + label
  84. # Step 8: Check size
  85. if 0 < len(label) < 64:
  86. return label
  87. raise UnicodeError("label empty or too long")
  88. def ToUnicode(label):
  89. # Step 1: Check for ASCII
  90. if isinstance(label, bytes):
  91. pure_ascii = True
  92. else:
  93. try:
  94. label = label.encode("ascii")
  95. pure_ascii = True
  96. except UnicodeError:
  97. pure_ascii = False
  98. if not pure_ascii:
  99. # Step 2: Perform nameprep
  100. label = nameprep(label)
  101. # It doesn't say this, but apparently, it should be ASCII now
  102. try:
  103. label = label.encode("ascii")
  104. except UnicodeError:
  105. raise UnicodeError("Invalid character in IDN label")
  106. # Step 3: Check for ACE prefix
  107. if not label.startswith(ace_prefix):
  108. return str(label, "ascii")
  109. # Step 4: Remove ACE prefix
  110. label1 = label[len(ace_prefix):]
  111. # Step 5: Decode using PUNYCODE
  112. result = label1.decode("punycode")
  113. # Step 6: Apply ToASCII
  114. label2 = ToASCII(result)
  115. # Step 7: Compare the result of step 6 with the one of step 3
  116. # label2 will already be in lower case.
  117. if str(label, "ascii").lower() != str(label2, "ascii"):
  118. raise UnicodeError("IDNA does not round-trip", label, label2)
  119. # Step 8: return the result of step 5
  120. return result
  121. ### Codec APIs
  122. class Codec(codecs.Codec):
  123. def encode(self, input, errors='strict'):
  124. if errors != 'strict':
  125. # IDNA is quite clear that implementations must be strict
  126. raise UnicodeError("unsupported error handling "+errors)
  127. if not input:
  128. return b'', 0
  129. try:
  130. result = input.encode('ascii')
  131. except UnicodeEncodeError:
  132. pass
  133. else:
  134. # ASCII name: fast path
  135. labels = result.split(b'.')
  136. for label in labels[:-1]:
  137. if not (0 < len(label) < 64):
  138. raise UnicodeError("label empty or too long")
  139. if len(labels[-1]) >= 64:
  140. raise UnicodeError("label too long")
  141. return result, len(input)
  142. result = bytearray()
  143. labels = dots.split(input)
  144. if labels and not labels[-1]:
  145. trailing_dot = b'.'
  146. del labels[-1]
  147. else:
  148. trailing_dot = b''
  149. for label in labels:
  150. if result:
  151. # Join with U+002E
  152. result.extend(b'.')
  153. result.extend(ToASCII(label))
  154. return bytes(result+trailing_dot), len(input)
  155. def decode(self, input, errors='strict'):
  156. if errors != 'strict':
  157. raise UnicodeError("Unsupported error handling "+errors)
  158. if not input:
  159. return "", 0
  160. # IDNA allows decoding to operate on Unicode strings, too.
  161. if not isinstance(input, bytes):
  162. # XXX obviously wrong, see #3232
  163. input = bytes(input)
  164. if ace_prefix not in input:
  165. # Fast path
  166. try:
  167. return input.decode('ascii'), len(input)
  168. except UnicodeDecodeError:
  169. pass
  170. labels = input.split(b".")
  171. if labels and len(labels[-1]) == 0:
  172. trailing_dot = '.'
  173. del labels[-1]
  174. else:
  175. trailing_dot = ''
  176. result = []
  177. for label in labels:
  178. result.append(ToUnicode(label))
  179. return ".".join(result)+trailing_dot, len(input)
  180. class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
  181. def _buffer_encode(self, input, errors, final):
  182. if errors != 'strict':
  183. # IDNA is quite clear that implementations must be strict
  184. raise UnicodeError("unsupported error handling "+errors)
  185. if not input:
  186. return (b'', 0)
  187. labels = dots.split(input)
  188. trailing_dot = b''
  189. if labels:
  190. if not labels[-1]:
  191. trailing_dot = b'.'
  192. del labels[-1]
  193. elif not final:
  194. # Keep potentially unfinished label until the next call
  195. del labels[-1]
  196. if labels:
  197. trailing_dot = b'.'
  198. result = bytearray()
  199. size = 0
  200. for label in labels:
  201. if size:
  202. # Join with U+002E
  203. result.extend(b'.')
  204. size += 1
  205. result.extend(ToASCII(label))
  206. size += len(label)
  207. result += trailing_dot
  208. size += len(trailing_dot)
  209. return (bytes(result), size)
  210. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  211. def _buffer_decode(self, input, errors, final):
  212. if errors != 'strict':
  213. raise UnicodeError("Unsupported error handling "+errors)
  214. if not input:
  215. return ("", 0)
  216. # IDNA allows decoding to operate on Unicode strings, too.
  217. if isinstance(input, str):
  218. labels = dots.split(input)
  219. else:
  220. # Must be ASCII string
  221. input = str(input, "ascii")
  222. labels = input.split(".")
  223. trailing_dot = ''
  224. if labels:
  225. if not labels[-1]:
  226. trailing_dot = '.'
  227. del labels[-1]
  228. elif not final:
  229. # Keep potentially unfinished label until the next call
  230. del labels[-1]
  231. if labels:
  232. trailing_dot = '.'
  233. result = []
  234. size = 0
  235. for label in labels:
  236. result.append(ToUnicode(label))
  237. if size:
  238. size += 1
  239. size += len(label)
  240. result = ".".join(result) + trailing_dot
  241. size += len(trailing_dot)
  242. return (result, size)
# Stream writer for the codec: combines Codec with codecs.StreamWriter
# and adds nothing of its own.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
# Stream reader for the codec: combines Codec with codecs.StreamReader
# and adds nothing of its own.
class StreamReader(Codec,codecs.StreamReader):
    pass
  247. ### encodings module API
  248. def getregentry():
  249. return codecs.CodecInfo(
  250. name='idna',
  251. encode=Codec().encode,
  252. decode=Codec().decode,
  253. incrementalencoder=IncrementalEncoder,
  254. incrementaldecoder=IncrementalDecoder,
  255. streamwriter=StreamWriter,
  256. streamreader=StreamReader,
  257. )