sre_compile.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert template to internal format
  5. #
  6. # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. import _sre
  12. import sre_parse
  13. from sre_constants import *
  14. assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  15. _LITERAL_CODES = {LITERAL, NOT_LITERAL}
  16. _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
  17. _SUCCESS_CODES = {SUCCESS, FAILURE}
  18. _ASSERT_CODES = {ASSERT, ASSERT_NOT}
  19. _UNIT_CODES = _LITERAL_CODES | {ANY, IN}
  20. # Sets of lowercase characters which have the same uppercase.
  21. _equivalences = (
  22. # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
  23. (0x69, 0x131), # iı
  24. # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
  25. (0x73, 0x17f), # sſ
  26. # MICRO SIGN, GREEK SMALL LETTER MU
  27. (0xb5, 0x3bc), # µμ
  28. # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
  29. (0x345, 0x3b9, 0x1fbe), # \u0345ιι
  30. # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
  31. (0x390, 0x1fd3), # ΐΐ
  32. # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
  33. (0x3b0, 0x1fe3), # ΰΰ
  34. # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
  35. (0x3b2, 0x3d0), # βϐ
  36. # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
  37. (0x3b5, 0x3f5), # εϵ
  38. # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
  39. (0x3b8, 0x3d1), # θϑ
  40. # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
  41. (0x3ba, 0x3f0), # κϰ
  42. # GREEK SMALL LETTER PI, GREEK PI SYMBOL
  43. (0x3c0, 0x3d6), # πϖ
  44. # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
  45. (0x3c1, 0x3f1), # ρϱ
  46. # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
  47. (0x3c2, 0x3c3), # ςσ
  48. # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
  49. (0x3c6, 0x3d5), # φϕ
  50. # CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
  51. (0x432, 0x1c80), # вᲀ
  52. # CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
  53. (0x434, 0x1c81), # дᲁ
  54. # CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
  55. (0x43e, 0x1c82), # оᲂ
  56. # CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
  57. (0x441, 0x1c83), # сᲃ
  58. # CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
  59. (0x442, 0x1c84, 0x1c85), # тᲄᲅ
  60. # CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
  61. (0x44a, 0x1c86), # ъᲆ
  62. # CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
  63. (0x463, 0x1c87), # ѣᲇ
  64. # CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
  65. (0x1c88, 0xa64b), # ᲈꙋ
  66. # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
  67. (0x1e61, 0x1e9b), # ṡẛ
  68. # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
  69. (0xfb05, 0xfb06), # ſtst
  70. )
  71. # Maps the lowercase code to lowercase codes which have the same uppercase.
  72. _ignorecase_fixes = {i: tuple(j for j in t if i != j)
  73. for t in _equivalences for i in t}
  74. def _combine_flags(flags, add_flags, del_flags,
  75. TYPE_FLAGS=sre_parse.TYPE_FLAGS):
  76. if add_flags & TYPE_FLAGS:
  77. flags &= ~TYPE_FLAGS
  78. return (flags | add_flags) & ~del_flags
  79. def _compile(code, pattern, flags):
  80. # internal: compile a (sub)pattern
  81. emit = code.append
  82. _len = len
  83. LITERAL_CODES = _LITERAL_CODES
  84. REPEATING_CODES = _REPEATING_CODES
  85. SUCCESS_CODES = _SUCCESS_CODES
  86. ASSERT_CODES = _ASSERT_CODES
  87. iscased = None
  88. tolower = None
  89. fixes = None
  90. if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
  91. if flags & SRE_FLAG_UNICODE:
  92. iscased = _sre.unicode_iscased
  93. tolower = _sre.unicode_tolower
  94. fixes = _ignorecase_fixes
  95. else:
  96. iscased = _sre.ascii_iscased
  97. tolower = _sre.ascii_tolower
  98. for op, av in pattern:
  99. if op in LITERAL_CODES:
  100. if not flags & SRE_FLAG_IGNORECASE:
  101. emit(op)
  102. emit(av)
  103. elif flags & SRE_FLAG_LOCALE:
  104. emit(OP_LOCALE_IGNORE[op])
  105. emit(av)
  106. elif not iscased(av):
  107. emit(op)
  108. emit(av)
  109. else:
  110. lo = tolower(av)
  111. if not fixes: # ascii
  112. emit(OP_IGNORE[op])
  113. emit(lo)
  114. elif lo not in fixes:
  115. emit(OP_UNICODE_IGNORE[op])
  116. emit(lo)
  117. else:
  118. emit(IN_UNI_IGNORE)
  119. skip = _len(code); emit(0)
  120. if op is NOT_LITERAL:
  121. emit(NEGATE)
  122. for k in (lo,) + fixes[lo]:
  123. emit(LITERAL)
  124. emit(k)
  125. emit(FAILURE)
  126. code[skip] = _len(code) - skip
  127. elif op is IN:
  128. charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
  129. if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
  130. emit(IN_LOC_IGNORE)
  131. elif not hascased:
  132. emit(IN)
  133. elif not fixes: # ascii
  134. emit(IN_IGNORE)
  135. else:
  136. emit(IN_UNI_IGNORE)
  137. skip = _len(code); emit(0)
  138. _compile_charset(charset, flags, code)
  139. code[skip] = _len(code) - skip
  140. elif op is ANY:
  141. if flags & SRE_FLAG_DOTALL:
  142. emit(ANY_ALL)
  143. else:
  144. emit(ANY)
  145. elif op in REPEATING_CODES:
  146. if flags & SRE_FLAG_TEMPLATE:
  147. raise error("internal: unsupported template operator %r" % (op,))
  148. if _simple(av[2]):
  149. if op is MAX_REPEAT:
  150. emit(REPEAT_ONE)
  151. else:
  152. emit(MIN_REPEAT_ONE)
  153. skip = _len(code); emit(0)
  154. emit(av[0])
  155. emit(av[1])
  156. _compile(code, av[2], flags)
  157. emit(SUCCESS)
  158. code[skip] = _len(code) - skip
  159. else:
  160. emit(REPEAT)
  161. skip = _len(code); emit(0)
  162. emit(av[0])
  163. emit(av[1])
  164. _compile(code, av[2], flags)
  165. code[skip] = _len(code) - skip
  166. if op is MAX_REPEAT:
  167. emit(MAX_UNTIL)
  168. else:
  169. emit(MIN_UNTIL)
  170. elif op is SUBPATTERN:
  171. group, add_flags, del_flags, p = av
  172. if group:
  173. emit(MARK)
  174. emit((group-1)*2)
  175. # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
  176. _compile(code, p, _combine_flags(flags, add_flags, del_flags))
  177. if group:
  178. emit(MARK)
  179. emit((group-1)*2+1)
  180. elif op in SUCCESS_CODES:
  181. emit(op)
  182. elif op in ASSERT_CODES:
  183. emit(op)
  184. skip = _len(code); emit(0)
  185. if av[0] >= 0:
  186. emit(0) # look ahead
  187. else:
  188. lo, hi = av[1].getwidth()
  189. if lo != hi:
  190. raise error("look-behind requires fixed-width pattern")
  191. emit(lo) # look behind
  192. _compile(code, av[1], flags)
  193. emit(SUCCESS)
  194. code[skip] = _len(code) - skip
  195. elif op is CALL:
  196. emit(op)
  197. skip = _len(code); emit(0)
  198. _compile(code, av, flags)
  199. emit(SUCCESS)
  200. code[skip] = _len(code) - skip
  201. elif op is AT:
  202. emit(op)
  203. if flags & SRE_FLAG_MULTILINE:
  204. av = AT_MULTILINE.get(av, av)
  205. if flags & SRE_FLAG_LOCALE:
  206. av = AT_LOCALE.get(av, av)
  207. elif flags & SRE_FLAG_UNICODE:
  208. av = AT_UNICODE.get(av, av)
  209. emit(av)
  210. elif op is BRANCH:
  211. emit(op)
  212. tail = []
  213. tailappend = tail.append
  214. for av in av[1]:
  215. skip = _len(code); emit(0)
  216. # _compile_info(code, av, flags)
  217. _compile(code, av, flags)
  218. emit(JUMP)
  219. tailappend(_len(code)); emit(0)
  220. code[skip] = _len(code) - skip
  221. emit(FAILURE) # end of branch
  222. for tail in tail:
  223. code[tail] = _len(code) - tail
  224. elif op is CATEGORY:
  225. emit(op)
  226. if flags & SRE_FLAG_LOCALE:
  227. av = CH_LOCALE[av]
  228. elif flags & SRE_FLAG_UNICODE:
  229. av = CH_UNICODE[av]
  230. emit(av)
  231. elif op is GROUPREF:
  232. if not flags & SRE_FLAG_IGNORECASE:
  233. emit(op)
  234. elif flags & SRE_FLAG_LOCALE:
  235. emit(GROUPREF_LOC_IGNORE)
  236. elif not fixes: # ascii
  237. emit(GROUPREF_IGNORE)
  238. else:
  239. emit(GROUPREF_UNI_IGNORE)
  240. emit(av-1)
  241. elif op is GROUPREF_EXISTS:
  242. emit(op)
  243. emit(av[0]-1)
  244. skipyes = _len(code); emit(0)
  245. _compile(code, av[1], flags)
  246. if av[2]:
  247. emit(JUMP)
  248. skipno = _len(code); emit(0)
  249. code[skipyes] = _len(code) - skipyes + 1
  250. _compile(code, av[2], flags)
  251. code[skipno] = _len(code) - skipno
  252. else:
  253. code[skipyes] = _len(code) - skipyes + 1
  254. else:
  255. raise error("internal: unsupported operand type %r" % (op,))
  256. def _compile_charset(charset, flags, code):
  257. # compile charset subprogram
  258. emit = code.append
  259. for op, av in charset:
  260. emit(op)
  261. if op is NEGATE:
  262. pass
  263. elif op is LITERAL:
  264. emit(av)
  265. elif op is RANGE or op is RANGE_UNI_IGNORE:
  266. emit(av[0])
  267. emit(av[1])
  268. elif op is CHARSET:
  269. code.extend(av)
  270. elif op is BIGCHARSET:
  271. code.extend(av)
  272. elif op is CATEGORY:
  273. if flags & SRE_FLAG_LOCALE:
  274. emit(CH_LOCALE[av])
  275. elif flags & SRE_FLAG_UNICODE:
  276. emit(CH_UNICODE[av])
  277. else:
  278. emit(av)
  279. else:
  280. raise error("internal: unsupported set operator %r" % (op,))
  281. emit(FAILURE)
  282. def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
  283. # internal: optimize character set
  284. out = []
  285. tail = []
  286. charmap = bytearray(256)
  287. hascased = False
  288. for op, av in charset:
  289. while True:
  290. try:
  291. if op is LITERAL:
  292. if fixup:
  293. lo = fixup(av)
  294. charmap[lo] = 1
  295. if fixes and lo in fixes:
  296. for k in fixes[lo]:
  297. charmap[k] = 1
  298. if not hascased and iscased(av):
  299. hascased = True
  300. else:
  301. charmap[av] = 1
  302. elif op is RANGE:
  303. r = range(av[0], av[1]+1)
  304. if fixup:
  305. if fixes:
  306. for i in map(fixup, r):
  307. charmap[i] = 1
  308. if i in fixes:
  309. for k in fixes[i]:
  310. charmap[k] = 1
  311. else:
  312. for i in map(fixup, r):
  313. charmap[i] = 1
  314. if not hascased:
  315. hascased = any(map(iscased, r))
  316. else:
  317. for i in r:
  318. charmap[i] = 1
  319. elif op is NEGATE:
  320. out.append((op, av))
  321. else:
  322. tail.append((op, av))
  323. except IndexError:
  324. if len(charmap) == 256:
  325. # character set contains non-UCS1 character codes
  326. charmap += b'\0' * 0xff00
  327. continue
  328. # Character set contains non-BMP character codes.
  329. # For range, all BMP characters in the range are already
  330. # proceeded.
  331. if fixup:
  332. hascased = True
  333. # For now, IN_UNI_IGNORE+LITERAL and
  334. # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
  335. # characters, because two characters (at least one of
  336. # which is not in the BMP) match case-insensitively
  337. # if and only if:
  338. # 1) c1.lower() == c2.lower()
  339. # 2) c1.lower() == c2 or c1.lower().upper() == c2
  340. # Also, both c.lower() and c.lower().upper() are single
  341. # characters for every non-BMP character.
  342. if op is RANGE:
  343. op = RANGE_UNI_IGNORE
  344. tail.append((op, av))
  345. break
  346. # compress character map
  347. runs = []
  348. q = 0
  349. while True:
  350. p = charmap.find(1, q)
  351. if p < 0:
  352. break
  353. if len(runs) >= 2:
  354. runs = None
  355. break
  356. q = charmap.find(0, p)
  357. if q < 0:
  358. runs.append((p, len(charmap)))
  359. break
  360. runs.append((p, q))
  361. if runs is not None:
  362. # use literal/range
  363. for p, q in runs:
  364. if q - p == 1:
  365. out.append((LITERAL, p))
  366. else:
  367. out.append((RANGE, (p, q - 1)))
  368. out += tail
  369. # if the case was changed or new representation is more compact
  370. if hascased or len(out) < len(charset):
  371. return out, hascased
  372. # else original character set is good enough
  373. return charset, hascased
  374. # use bitmap
  375. if len(charmap) == 256:
  376. data = _mk_bitmap(charmap)
  377. out.append((CHARSET, data))
  378. out += tail
  379. return out, hascased
  380. # To represent a big charset, first a bitmap of all characters in the
  381. # set is constructed. Then, this bitmap is sliced into chunks of 256
  382. # characters, duplicate chunks are eliminated, and each chunk is
  383. # given a number. In the compiled expression, the charset is
  384. # represented by a 32-bit word sequence, consisting of one word for
  385. # the number of different chunks, a sequence of 256 bytes (64 words)
  386. # of chunk numbers indexed by their original chunk position, and a
  387. # sequence of 256-bit chunks (8 words each).
  388. # Compression is normally good: in a typical charset, large ranges of
  389. # Unicode will be either completely excluded (e.g. if only cyrillic
  390. # letters are to be matched), or completely included (e.g. if large
  391. # subranges of Kanji match). These ranges will be represented by
  392. # chunks of all one-bits or all zero-bits.
  393. # Matching can be also done efficiently: the more significant byte of
  394. # the Unicode character is an index into the chunk number, and the
  395. # less significant byte is a bit index in the chunk (just like the
  396. # CHARSET matching).
  397. charmap = bytes(charmap) # should be hashable
  398. comps = {}
  399. mapping = bytearray(256)
  400. block = 0
  401. data = bytearray()
  402. for i in range(0, 65536, 256):
  403. chunk = charmap[i: i + 256]
  404. if chunk in comps:
  405. mapping[i // 256] = comps[chunk]
  406. else:
  407. mapping[i // 256] = comps[chunk] = block
  408. block += 1
  409. data += chunk
  410. data = _mk_bitmap(data)
  411. data[0:0] = [block] + _bytes_to_codes(mapping)
  412. out.append((BIGCHARSET, data))
  413. out += tail
  414. return out, hascased
  415. _CODEBITS = _sre.CODESIZE * 8
  416. MAXCODE = (1 << _CODEBITS) - 1
  417. _BITS_TRANS = b'0' + b'1' * 255
  418. def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
  419. s = bits.translate(_BITS_TRANS)[::-1]
  420. return [_int(s[i - _CODEBITS: i], 2)
  421. for i in range(len(s), 0, -_CODEBITS)]
  422. def _bytes_to_codes(b):
  423. # Convert block indices to word array
  424. a = memoryview(b).cast('I')
  425. assert a.itemsize == _sre.CODESIZE
  426. assert len(a) * a.itemsize == len(b)
  427. return a.tolist()
  428. def _simple(p):
  429. # check if this subpattern is a "simple" operator
  430. if len(p) != 1:
  431. return False
  432. op, av = p[0]
  433. if op is SUBPATTERN:
  434. return av[0] is None and _simple(av[-1])
  435. return op in _UNIT_CODES
  436. def _generate_overlap_table(prefix):
  437. """
  438. Generate an overlap table for the following prefix.
  439. An overlap table is a table of the same size as the prefix which
  440. informs about the potential self-overlap for each index in the prefix:
  441. - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
  442. - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
  443. prefix[0:k]
  444. """
  445. table = [0] * len(prefix)
  446. for i in range(1, len(prefix)):
  447. idx = table[i - 1]
  448. while prefix[i] != prefix[idx]:
  449. if idx == 0:
  450. table[i] = 0
  451. break
  452. idx = table[idx - 1]
  453. else:
  454. table[i] = idx + 1
  455. return table
  456. def _get_iscased(flags):
  457. if not flags & SRE_FLAG_IGNORECASE:
  458. return None
  459. elif flags & SRE_FLAG_UNICODE:
  460. return _sre.unicode_iscased
  461. else:
  462. return _sre.ascii_iscased
  463. def _get_literal_prefix(pattern, flags):
  464. # look for literal prefix
  465. prefix = []
  466. prefixappend = prefix.append
  467. prefix_skip = None
  468. iscased = _get_iscased(flags)
  469. for op, av in pattern.data:
  470. if op is LITERAL:
  471. if iscased and iscased(av):
  472. break
  473. prefixappend(av)
  474. elif op is SUBPATTERN:
  475. group, add_flags, del_flags, p = av
  476. flags1 = _combine_flags(flags, add_flags, del_flags)
  477. if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
  478. break
  479. prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
  480. if prefix_skip is None:
  481. if group is not None:
  482. prefix_skip = len(prefix)
  483. elif prefix_skip1 is not None:
  484. prefix_skip = len(prefix) + prefix_skip1
  485. prefix.extend(prefix1)
  486. if not got_all:
  487. break
  488. else:
  489. break
  490. else:
  491. return prefix, prefix_skip, True
  492. return prefix, prefix_skip, False
  493. def _get_charset_prefix(pattern, flags):
  494. while True:
  495. if not pattern.data:
  496. return None
  497. op, av = pattern.data[0]
  498. if op is not SUBPATTERN:
  499. break
  500. group, add_flags, del_flags, pattern = av
  501. flags = _combine_flags(flags, add_flags, del_flags)
  502. if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
  503. return None
  504. iscased = _get_iscased(flags)
  505. if op is LITERAL:
  506. if iscased and iscased(av):
  507. return None
  508. return [(op, av)]
  509. elif op is BRANCH:
  510. charset = []
  511. charsetappend = charset.append
  512. for p in av[1]:
  513. if not p:
  514. return None
  515. op, av = p[0]
  516. if op is LITERAL and not (iscased and iscased(av)):
  517. charsetappend((op, av))
  518. else:
  519. return None
  520. return charset
  521. elif op is IN:
  522. charset = av
  523. if iscased:
  524. for op, av in charset:
  525. if op is LITERAL:
  526. if iscased(av):
  527. return None
  528. elif op is RANGE:
  529. if av[1] > 0xffff:
  530. return None
  531. if any(map(iscased, range(av[0], av[1]+1))):
  532. return None
  533. return charset
  534. return None
  535. def _compile_info(code, pattern, flags):
  536. # internal: compile an info block. in the current version,
  537. # this contains min/max pattern width, and an optional literal
  538. # prefix or a character map
  539. lo, hi = pattern.getwidth()
  540. if hi > MAXCODE:
  541. hi = MAXCODE
  542. if lo == 0:
  543. code.extend([INFO, 4, 0, lo, hi])
  544. return
  545. # look for a literal prefix
  546. prefix = []
  547. prefix_skip = 0
  548. charset = [] # not used
  549. if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
  550. # look for literal prefix
  551. prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
  552. # if no prefix, look for charset prefix
  553. if not prefix:
  554. charset = _get_charset_prefix(pattern, flags)
  555. ## if prefix:
  556. ## print("*** PREFIX", prefix, prefix_skip)
  557. ## if charset:
  558. ## print("*** CHARSET", charset)
  559. # add an info block
  560. emit = code.append
  561. emit(INFO)
  562. skip = len(code); emit(0)
  563. # literal flag
  564. mask = 0
  565. if prefix:
  566. mask = SRE_INFO_PREFIX
  567. if prefix_skip is None and got_all:
  568. mask = mask | SRE_INFO_LITERAL
  569. elif charset:
  570. mask = mask | SRE_INFO_CHARSET
  571. emit(mask)
  572. # pattern length
  573. if lo < MAXCODE:
  574. emit(lo)
  575. else:
  576. emit(MAXCODE)
  577. prefix = prefix[:MAXCODE]
  578. emit(min(hi, MAXCODE))
  579. # add literal prefix
  580. if prefix:
  581. emit(len(prefix)) # length
  582. if prefix_skip is None:
  583. prefix_skip = len(prefix)
  584. emit(prefix_skip) # skip
  585. code.extend(prefix)
  586. # generate overlap table
  587. code.extend(_generate_overlap_table(prefix))
  588. elif charset:
  589. charset, hascased = _optimize_charset(charset)
  590. assert not hascased
  591. _compile_charset(charset, flags, code)
  592. code[skip] = len(code) - skip
  593. def isstring(obj):
  594. return isinstance(obj, (str, bytes))
  595. def _code(p, flags):
  596. flags = p.state.flags | flags
  597. code = []
  598. # compile info block
  599. _compile_info(code, p, flags)
  600. # compile the pattern
  601. _compile(code, p.data, flags)
  602. code.append(SUCCESS)
  603. return code
  604. def _hex_code(code):
  605. return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
  606. def dis(code):
  607. import sys
  608. labels = set()
  609. level = 0
  610. offset_width = len(str(len(code) - 1))
  611. def dis_(start, end):
  612. def print_(*args, to=None):
  613. if to is not None:
  614. labels.add(to)
  615. args += ('(to %d)' % (to,),)
  616. print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
  617. end=' '*(level-1))
  618. print(*args)
  619. def print_2(*args):
  620. print(end=' '*(offset_width + 2*level))
  621. print(*args)
  622. nonlocal level
  623. level += 1
  624. i = start
  625. while i < end:
  626. start = i
  627. op = code[i]
  628. i += 1
  629. op = OPCODES[op]
  630. if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
  631. MAX_UNTIL, MIN_UNTIL, NEGATE):
  632. print_(op)
  633. elif op in (LITERAL, NOT_LITERAL,
  634. LITERAL_IGNORE, NOT_LITERAL_IGNORE,
  635. LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
  636. LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
  637. arg = code[i]
  638. i += 1
  639. print_(op, '%#02x (%r)' % (arg, chr(arg)))
  640. elif op is AT:
  641. arg = code[i]
  642. i += 1
  643. arg = str(ATCODES[arg])
  644. assert arg[:3] == 'AT_'
  645. print_(op, arg[3:])
  646. elif op is CATEGORY:
  647. arg = code[i]
  648. i += 1
  649. arg = str(CHCODES[arg])
  650. assert arg[:9] == 'CATEGORY_'
  651. print_(op, arg[9:])
  652. elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
  653. skip = code[i]
  654. print_(op, skip, to=i+skip)
  655. dis_(i+1, i+skip)
  656. i += skip
  657. elif op in (RANGE, RANGE_UNI_IGNORE):
  658. lo, hi = code[i: i+2]
  659. i += 2
  660. print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
  661. elif op is CHARSET:
  662. print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
  663. i += 256//_CODEBITS
  664. elif op is BIGCHARSET:
  665. arg = code[i]
  666. i += 1
  667. mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
  668. for x in code[i: i + 256//_sre.CODESIZE]))
  669. print_(op, arg, mapping)
  670. i += 256//_sre.CODESIZE
  671. level += 1
  672. for j in range(arg):
  673. print_2(_hex_code(code[i: i + 256//_CODEBITS]))
  674. i += 256//_CODEBITS
  675. level -= 1
  676. elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
  677. GROUPREF_LOC_IGNORE):
  678. arg = code[i]
  679. i += 1
  680. print_(op, arg)
  681. elif op is JUMP:
  682. skip = code[i]
  683. print_(op, skip, to=i+skip)
  684. i += 1
  685. elif op is BRANCH:
  686. skip = code[i]
  687. print_(op, skip, to=i+skip)
  688. while skip:
  689. dis_(i+1, i+skip)
  690. i += skip
  691. start = i
  692. skip = code[i]
  693. if skip:
  694. print_('branch', skip, to=i+skip)
  695. else:
  696. print_(FAILURE)
  697. i += 1
  698. elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
  699. skip, min, max = code[i: i+3]
  700. if max == MAXREPEAT:
  701. max = 'MAXREPEAT'
  702. print_(op, skip, min, max, to=i+skip)
  703. dis_(i+3, i+skip)
  704. i += skip
  705. elif op is GROUPREF_EXISTS:
  706. arg, skip = code[i: i+2]
  707. print_(op, arg, skip, to=i+skip)
  708. i += 2
  709. elif op in (ASSERT, ASSERT_NOT):
  710. skip, arg = code[i: i+2]
  711. print_(op, skip, arg, to=i+skip)
  712. dis_(i+2, i+skip)
  713. i += skip
  714. elif op is INFO:
  715. skip, flags, min, max = code[i: i+4]
  716. if max == MAXREPEAT:
  717. max = 'MAXREPEAT'
  718. print_(op, skip, bin(flags), min, max, to=i+skip)
  719. start = i+4
  720. if flags & SRE_INFO_PREFIX:
  721. prefix_len, prefix_skip = code[i+4: i+6]
  722. print_2(' prefix_skip', prefix_skip)
  723. start = i + 6
  724. prefix = code[start: start+prefix_len]
  725. print_2(' prefix',
  726. '[%s]' % ', '.join('%#02x' % x for x in prefix),
  727. '(%r)' % ''.join(map(chr, prefix)))
  728. start += prefix_len
  729. print_2(' overlap', code[start: start+prefix_len])
  730. start += prefix_len
  731. if flags & SRE_INFO_CHARSET:
  732. level += 1
  733. print_2('in')
  734. dis_(start, i+skip)
  735. level -= 1
  736. i += skip
  737. else:
  738. raise ValueError(op)
  739. level -= 1
  740. dis_(0, len(code))
  741. def compile(p, flags=0):
  742. # internal: convert pattern list to internal format
  743. if isstring(p):
  744. pattern = p
  745. p = sre_parse.parse(p, flags)
  746. else:
  747. pattern = None
  748. code = _code(p, flags)
  749. if flags & SRE_FLAG_DEBUG:
  750. print()
  751. dis(code)
  752. # map in either direction
  753. groupindex = p.state.groupdict
  754. indexgroup = [None] * p.state.groups
  755. for k, i in groupindex.items():
  756. indexgroup[i] = k
  757. return _sre.compile(
  758. pattern, flags | p.state.flags, code,
  759. p.state.groups-1,
  760. groupindex, tuple(indexgroup)
  761. )