_quoting_c.pyx 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. # cython: language_level=3
  2. from libc.stdint cimport uint8_t, uint64_t
  3. from libc.string cimport memcpy, memset
  4. from cpython.exc cimport PyErr_NoMemory
  5. from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
  6. from cpython.unicode cimport PyUnicode_DecodeASCII, PyUnicode_DecodeUTF8Stateful
  7. from string import ascii_letters, digits
  # Character classes used to decide what must be percent-encoded.
  # The names follow RFC 3986, sections 2.2 (reserved) and 2.3 (unreserved).
  cdef str GEN_DELIMS = ":/?#[]@"
  # Sub-delimiters without the ones listed separately in QS below.
  cdef str SUB_DELIMS_WITHOUT_QS = "!$'()*,"
  # RFC 3986 sub-delims are "!$&'()*+,;=".  "?" is a gen-delim (it is
  # already part of GEN_DELIMS), so the missing character here is "&".
  cdef str SUB_DELIMS = SUB_DELIMS_WITHOUT_QS + '+&=;'
  cdef str RESERVED = GEN_DELIMS + SUB_DELIMS
  cdef str UNRESERVED = ascii_letters + digits + '-._~'
  # Base set of characters that both lookup tables mark as allowed.
  cdef str ALLOWED = UNRESERVED + SUB_DELIMS_WITHOUT_QS
  # Characters that are additionally allowed when NOT quoting a query string.
  cdef str QS = '+&=;'

  DEF BUF_SIZE = 8 * 1024  # 8KiB
  # Module-level scratch buffer every Writer starts with; a heap block is
  # only allocated once the output outgrows it.
  cdef char BUFFER[BUF_SIZE]
  cdef inline Py_UCS4 _to_hex(uint8_t v):
      """Return the upper-case hexadecimal digit for ``v``.

      Values below 10 map to '0'..'9', everything else is offset from 'A'.
      Callers are presumably expected to pass a nibble (0..15); the
      function itself does not check the range.
      """
      # 0x30 == ord('0'); 0x37 == ord('A') - 10
      cdef uint8_t offset = 0x30 if v < 10 else 0x37
      return <Py_UCS4>(v + offset)
  cdef inline int _from_hex(Py_UCS4 v):
      """Return the numeric value (0..15) of hex digit ``v``, or -1.

      Both upper-case and lower-case letters are accepted; any other
      character yields -1.
      """
      if '0' <= v <= '9':
          return <int>(v) - 0x30  # ord('0') == 0x30
      if 'A' <= v <= 'F':
          return <int>(v) - 0x37  # ord('A') - 10 == 0x37
      if 'a' <= v <= 'f':
          return <int>(v) - 0x57  # ord('a') - 10 == 0x57
      return -1
  cdef inline int _is_lower_hex(Py_UCS4 v):
      """Return non-zero when ``v`` is one of the letters 'a'..'f'."""
      return v >= 'a' and v <= 'f'
  cdef inline Py_UCS4 _restore_ch(Py_UCS4 d1, Py_UCS4 d2):
      """Combine two hex digits into the byte value they encode.

      ``d1`` is the high nibble and ``d2`` the low nibble.  If either
      character is not a hexadecimal digit, ``<Py_UCS4>-1`` is returned as
      the failure marker.
      """
      cdef int high = _from_hex(d1)
      cdef int low = _from_hex(d2)
      if high < 0 or low < 0:
          return <Py_UCS4>-1
      return <Py_UCS4>((high << 4) | low)
  # 128-bit bitmaps (16 bytes * 8 bits), one bit per ASCII code point.
  cdef uint8_t ALLOWED_TABLE[16]
  cdef uint8_t ALLOWED_NOTQS_TABLE[16]


  cdef inline bint bit_at(uint8_t array[], uint64_t ch):
      """Test the bit for code point ``ch`` in the bitmap ``array``."""
      # byte index is ch // 8, bit index inside that byte is ch % 8
      return (array[ch >> 3] & (1 << (ch & 7))) != 0


  cdef inline void set_bit(uint8_t array[], uint64_t ch):
      """Switch on the bit for code point ``ch`` in the bitmap ``array``."""
      array[ch >> 3] |= (1 << (ch & 7))


  # Build both bitmaps once at import time.
  memset(ALLOWED_TABLE, 0, sizeof(ALLOWED_TABLE))
  memset(ALLOWED_NOTQS_TABLE, 0, sizeof(ALLOWED_NOTQS_TABLE))

  for i in range(128):
      # ALLOWED characters go into ALLOWED_TABLE ...
      if chr(i) in ALLOWED:
          set_bit(ALLOWED_TABLE, i)
      # ... while ALLOWED_NOTQS_TABLE gets ALLOWED plus the QS characters.
      if chr(i) in ALLOWED or chr(i) in QS:
          set_bit(ALLOWED_NOTQS_TABLE, i)
  55. # ----------------- writer ---------------------------
  cdef struct Writer:
      # Growable output buffer used while quoting.
      char *buf            # current storage: BUFFER or a PyMem block
      Py_ssize_t size      # capacity of ``buf`` in bytes
      Py_ssize_t pos       # number of bytes written so far
      bint changed         # set once any written char was flagged as changed


  cdef inline void _init_writer(Writer* writer):
      """Reset ``writer`` so it writes into the static BUFFER from offset 0."""
      writer.changed = 0
      writer.pos = 0
      writer.size = BUF_SIZE
      writer.buf = BUFFER
  cdef inline void _release_writer(Writer* writer):
      """Free the writer's storage unless it is still the static BUFFER."""
      if writer.buf == BUFFER:
          # nothing was allocated, the static buffer must not be freed
          return
      PyMem_Free(writer.buf)
  cdef inline int _write_char(Writer* writer, Py_UCS4 ch, bint changed):
      """Append ``ch`` (truncated to ``char``) to the writer's buffer.

      The buffer grows by BUF_SIZE bytes whenever it is full.  ``changed``
      is OR-ed into ``writer.changed``.

      Returns 0 on success.  On allocation failure PyErr_NoMemory() is
      called and -1 is returned; the writer keeps its previous buffer.
      """
      cdef char *grown
      cdef Py_ssize_t new_size

      if writer.pos == writer.size:
          new_size = writer.size + BUF_SIZE
          if writer.buf != BUFFER:
              # already on the heap: let the allocator extend the block
              grown = <char*>PyMem_Realloc(writer.buf, new_size)
              if grown == NULL:
                  PyErr_NoMemory()
                  return -1
          else:
              # first overflow: move the static buffer's content to the heap
              grown = <char*>PyMem_Malloc(new_size)
              if grown == NULL:
                  PyErr_NoMemory()
                  return -1
              memcpy(grown, writer.buf, writer.size)
          writer.buf = grown
          writer.size = new_size

      writer.buf[writer.pos] = <char>ch
      writer.pos += 1
      writer.changed |= changed
      return 0
  cdef inline int _write_pct(Writer* writer, uint8_t ch, bint changed):
      """Write byte ``ch`` as a ``%XX`` triplet with upper-case hex digits.

      ``changed`` is forwarded to every _write_char() call.  Returns 0 on
      success and -1 as soon as one of the writes fails.
      """
      cdef Py_UCS4 high_digit = _to_hex(<uint8_t>ch >> 4)
      cdef Py_UCS4 low_digit = _to_hex(<uint8_t>ch & 0x0f)

      if _write_char(writer, '%', changed) < 0:
          return -1
      if _write_char(writer, high_digit, changed) < 0:
          return -1
      return _write_char(writer, low_digit, changed)
  cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol):
      """Percent-encode the UTF-8 byte sequence of ``symbol``.

      Code points in the surrogate range (0xD800..0xDFFF) and code points
      above 0x10FFFF are skipped: nothing is written and 0 is returned.
      Every byte that is written is flagged as a change.

      Returns 0 on success, -1 if a write failed.
      """
      cdef uint64_t code = <uint64_t> symbol
      # the last byte of every multi-byte sequence carries the low 6 bits
      cdef uint8_t last = <uint8_t>(0x80 | (code & 0x3f))

      if 0xD800 <= code <= 0xDFFF:
          # surrogate code point, ignored
          return 0
      if code > 0x10FFFF:
          # symbol is too large
          return 0

      if code < 0x80:
          # single byte
          return _write_pct(writer, <uint8_t>code, True)

      if code < 0x800:
          # two bytes: lead byte only, the tail follows below
          if _write_pct(writer, <uint8_t>(0xc0 | (code >> 6)), True) < 0:
              return -1
      elif code < 0x10000:
          # three bytes
          if _write_pct(writer, <uint8_t>(0xe0 | (code >> 12)), True) < 0:
              return -1
          if _write_pct(writer, <uint8_t>(0x80 | ((code >> 6) & 0x3f)),
                        True) < 0:
              return -1
      else:
          # four bytes
          if _write_pct(writer, <uint8_t>(0xf0 | (code >> 18)), True) < 0:
              return -1
          if _write_pct(writer, <uint8_t>(0x80 | ((code >> 12) & 0x3f)),
                        True) < 0:
              return -1
          if _write_pct(writer, <uint8_t>(0x80 | ((code >> 6) & 0x3f)),
                        True) < 0:
              return -1

      return _write_pct(writer, last, True)
  129. # --------------------- end writer --------------------------
  cdef class _Quoter:
      """Callable that percent-encodes a string.

      Keyword arguments of the constructor:

      * ``safe`` -- extra ASCII characters that are written unescaped.
      * ``protected`` -- ASCII characters that are written unescaped when
        they appear literally, but stay percent-encoded when the input
        already has them in ``%XX`` form.
      * ``qs`` -- query-string mode: a space is written as ``+`` and the
        characters ``+&=;`` are not in the default safe set.
      * ``requote`` -- when true, existing ``%XX`` sequences in the input
        are recognised and normalised instead of having their ``%``
        escaped again.
      """

      cdef bint _qs
      cdef bint _requote
      # 128-bit bitmaps indexed by ASCII code point
      cdef uint8_t _safe_table[16]
      cdef uint8_t _protected_table[16]

      def __init__(
              self, *, str safe='', str protected='', bint qs=False, bint requote=True,
      ):
          """Build the lookup tables.

          Raises ValueError if ``safe`` or ``protected`` contains a
          character with a code point above 127.
          """
          cdef Py_UCS4 ch

          self._qs = qs
          self._requote = requote

          # Start from the module-level default table for the chosen mode.
          if not self._qs:
              memcpy(self._safe_table,
                     ALLOWED_NOTQS_TABLE,
                     sizeof(self._safe_table))
          else:
              memcpy(self._safe_table,
                     ALLOWED_TABLE,
                     sizeof(self._safe_table))
          for ch in safe:
              if ord(ch) > 127:
                  raise ValueError("Only safe symbols with ORD < 128 are allowed")
              set_bit(self._safe_table, ch)

          memset(self._protected_table, 0, sizeof(self._protected_table))
          for ch in protected:
              if ord(ch) > 127:
                  raise ValueError("Only safe symbols with ORD < 128 are allowed")
              # protected characters are safe as literals as well
              set_bit(self._safe_table, ch)
              set_bit(self._protected_table, ch)

      def __call__(self, val):
          """Quote ``val``.

          ``None`` is passed through.  ``str`` subclasses are converted to
          plain ``str`` first.  Any other type raises TypeError.
          """
          cdef Writer writer
          if val is None:
              return None
          if type(val) is not str:
              if isinstance(val, str):
                  # derived from str
                  val = str(val)
              else:
                  raise TypeError("Argument should be str")
          _init_writer(&writer)
          try:
              return self._do_quote(<str>val, &writer)
          finally:
              # frees the heap buffer if the writer had to grow
              _release_writer(&writer)

      cdef str _do_quote(self, str val, Writer *writer):
          """Quote ``val`` into ``writer`` and return the resulting string.

          The original ``val`` object is returned when no written character
          was flagged as a change; otherwise a new string is decoded from
          the writer's buffer.
          """
          cdef Py_UCS4 ch
          cdef int changed
          # Py_ssize_t matches what len() returns; a C int would narrow
          # the length of very long strings.
          cdef Py_ssize_t idx = 0
          cdef Py_ssize_t length = len(val)

          while idx < length:
              ch = val[idx]
              idx += 1
              # A '%' followed by at least two more characters may be an
              # existing escape that has to be normalised.
              if ch == '%' and self._requote and idx <= length - 2:
                  ch = _restore_ch(val[idx], val[idx + 1])
                  if ch != <Py_UCS4>-1:
                      idx += 2
                      if ch < 128:
                          if bit_at(self._protected_table, ch):
                              # protected: keep it percent-encoded
                              if _write_pct(writer, ch, True) < 0:
                                  raise
                              continue

                          if bit_at(self._safe_table, ch):
                              # safe: write the decoded character itself
                              if _write_char(writer, ch, True) < 0:
                                  raise
                              continue

                      # Re-emit the escape with upper-case digits; this
                      # counts as a change only if a digit was lower-case.
                      changed = (_is_lower_hex(val[idx - 2]) or
                                 _is_lower_hex(val[idx - 1]))
                      if _write_pct(writer, ch, changed) < 0:
                          raise
                      continue
                  else:
                      # not a valid escape: treat the '%' as a plain char
                      ch = '%'

              if self._write(writer, ch) < 0:
                  raise

          if not writer.changed:
              return val
          else:
              return PyUnicode_DecodeASCII(writer.buf, writer.pos, "strict")

      cdef inline int _write(self, Writer *writer, Py_UCS4 ch):
          """Write one literal input character, escaping it if required.

          Returns 0 on success and -1 if the underlying write failed.
          """
          if self._qs:
              if ch == ' ':
                  return _write_char(writer, '+', True)

          if ch < 128 and bit_at(self._safe_table, ch):
              return _write_char(writer, ch, False)

          return _write_utf8(writer, ch)
  cdef class _Unquoter:
      """Callable that decodes percent-encoded sequences in a string.

      Keyword arguments of the constructor:

      * ``unsafe`` -- characters that must stay (or become) percent-encoded
        in the result.
      * ``qs`` -- query-string mode: a literal ``+`` is decoded to a space
        and decoded ``+=&;`` characters are re-quoted.
      """

      cdef str _unsafe
      cdef bint _qs
      cdef _Quoter _quoter
      cdef _Quoter _qs_quoter

      def __init__(self, *, unsafe='', qs=False):
          self._unsafe = unsafe
          self._qs = qs
          # quoters used to re-encode characters that must stay escaped
          self._quoter = _Quoter()
          self._qs_quoter = _Quoter(qs=True)

      def __call__(self, val):
          """Unquote ``val``.

          ``None`` is passed through.  ``str`` subclasses are converted to
          plain ``str`` first.  Any other type raises TypeError.
          """
          if val is None:
              return None
          if type(val) is not str:
              if isinstance(val, str):
                  # derived from str
                  val = str(val)
              else:
                  raise TypeError("Argument should be str")
          return self._do_unquote(<str>val)

      cdef str _do_unquote(self, str val):
          """Return ``val`` with its ``%XX`` sequences decoded as UTF-8.

          Escapes that do not form valid UTF-8 are copied to the output
          unchanged.
          """
          if len(val) == 0:
              return val
          cdef list ret = []
          # bytes of the UTF-8 sequence collected so far (at most 4)
          cdef char buffer[4]
          cdef Py_ssize_t buflen = 0
          cdef Py_ssize_t consumed
          cdef str unquoted
          cdef Py_UCS4 ch = 0
          cdef Py_ssize_t idx = 0
          cdef Py_ssize_t length = len(val)
          cdef Py_ssize_t start_pct

          while idx < length:
              ch = val[idx]
              idx += 1
              if ch == '%' and idx <= length - 2:
                  ch = _restore_ch(val[idx], val[idx + 1])
                  if ch != <Py_UCS4>-1:
                      idx += 2
                      assert buflen < 4
                      buffer[buflen] = ch
                      buflen += 1
                      try:
                          unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                  NULL, &consumed)
                      except UnicodeDecodeError:
                          # The bytes collected before this one are not
                          # valid UTF-8: emit their escapes verbatim and
                          # retry with the current byte alone.
                          start_pct = idx - buflen * 3
                          buffer[0] = ch
                          buflen = 1
                          ret.append(val[start_pct : idx - 3])
                          try:
                              unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                      NULL, &consumed)
                          except UnicodeDecodeError:
                              # the current byte cannot start a sequence
                              buflen = 0
                              ret.append(val[idx - 3 : idx])
                              continue
                      if not unquoted:
                          # incomplete sequence, wait for more bytes
                          assert consumed == 0
                          continue
                      assert consumed == buflen
                      buflen = 0
                      if self._qs and unquoted in '+=&;':
                          ret.append(self._qs_quoter(unquoted))
                      elif unquoted in self._unsafe:
                          ret.append(self._quoter(unquoted))
                      else:
                          ret.append(unquoted)
                      continue
                  else:
                      # not a valid escape: treat the '%' as a plain char
                      ch = '%'

              if buflen:
                  # An unfinished UTF-8 sequence is followed by a literal
                  # character: emit the pending escapes unchanged.
                  start_pct = idx - 1 - buflen * 3
                  ret.append(val[start_pct : idx - 1])
                  buflen = 0

              if ch == '+':
                  if not self._qs or ch in self._unsafe:
                      ret.append('+')
                  else:
                      ret.append(' ')
                  continue

              if ch in self._unsafe:
                  # A percent-encoded octet is always "%" plus exactly two
                  # hex digits, so code points below 0x10 need a leading 0.
                  # NOTE(review): for code points above 0xFF this still
                  # emits the code point's hex digits, not its UTF-8 bytes.
                  ret.append('%{:02X}'.format(ord(ch)))
                  continue

              ret.append(ch)

          if buflen:
              # input ended in the middle of a UTF-8 sequence
              ret.append(val[length - buflen * 3 : length])

          return ''.join(ret)
  305. return ''.join(ret)