Regexps.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. #=======================================================================
  2. #
  3. # Python Lexical Analyser
  4. #
  5. # Regular Expressions
  6. #
  7. #=======================================================================
  8. from __future__ import absolute_import
  9. import types
  10. try:
  11. from sys import maxsize as maxint
  12. except ImportError:
  13. from sys import maxint
  14. from . import Errors
  15. #
  16. # Constants
  17. #
  18. BOL = 'bol'
  19. EOL = 'eol'
  20. EOF = 'eof'
  21. nl_code = ord('\n')
  22. #
  23. # Helper functions
  24. #
  25. def chars_to_ranges(s):
  26. """
  27. Return a list of character codes consisting of pairs
  28. [code1a, code1b, code2a, code2b,...] which cover all
  29. the characters in |s|.
  30. """
  31. char_list = list(s)
  32. char_list.sort()
  33. i = 0
  34. n = len(char_list)
  35. result = []
  36. while i < n:
  37. code1 = ord(char_list[i])
  38. code2 = code1 + 1
  39. i += 1
  40. while i < n and code2 >= ord(char_list[i]):
  41. code2 += 1
  42. i += 1
  43. result.append(code1)
  44. result.append(code2)
  45. return result
  46. def uppercase_range(code1, code2):
  47. """
  48. If the range of characters from code1 to code2-1 includes any
  49. lower case letters, return the corresponding upper case range.
  50. """
  51. code3 = max(code1, ord('a'))
  52. code4 = min(code2, ord('z') + 1)
  53. if code3 < code4:
  54. d = ord('A') - ord('a')
  55. return (code3 + d, code4 + d)
  56. else:
  57. return None
  58. def lowercase_range(code1, code2):
  59. """
  60. If the range of characters from code1 to code2-1 includes any
  61. upper case letters, return the corresponding lower case range.
  62. """
  63. code3 = max(code1, ord('A'))
  64. code4 = min(code2, ord('Z') + 1)
  65. if code3 < code4:
  66. d = ord('a') - ord('A')
  67. return (code3 + d, code4 + d)
  68. else:
  69. return None
  70. def CodeRanges(code_list):
  71. """
  72. Given a list of codes as returned by chars_to_ranges, return
  73. an RE which will match a character in any of the ranges.
  74. """
  75. re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)]
  76. return Alt(*re_list)
  77. def CodeRange(code1, code2):
  78. """
  79. CodeRange(code1, code2) is an RE which matches any character
  80. with a code |c| in the range |code1| <= |c| < |code2|.
  81. """
  82. if code1 <= nl_code < code2:
  83. return Alt(RawCodeRange(code1, nl_code),
  84. RawNewline,
  85. RawCodeRange(nl_code + 1, code2))
  86. else:
  87. return RawCodeRange(code1, code2)
  88. #
  89. # Abstract classes
  90. #
  91. class RE(object):
  92. """RE is the base class for regular expression constructors.
  93. The following operators are defined on REs:
  94. re1 + re2 is an RE which matches |re1| followed by |re2|
  95. re1 | re2 is an RE which matches either |re1| or |re2|
  96. """
  97. nullable = 1 # True if this RE can match 0 input symbols
  98. match_nl = 1 # True if this RE can match a string ending with '\n'
  99. str = None # Set to a string to override the class's __str__ result
  100. def build_machine(self, machine, initial_state, final_state,
  101. match_bol, nocase):
  102. """
  103. This method should add states to |machine| to implement this
  104. RE, starting at |initial_state| and ending at |final_state|.
  105. If |match_bol| is true, the RE must be able to match at the
  106. beginning of a line. If nocase is true, upper and lower case
  107. letters should be treated as equivalent.
  108. """
  109. raise NotImplementedError("%s.build_machine not implemented" %
  110. self.__class__.__name__)
  111. def build_opt(self, m, initial_state, c):
  112. """
  113. Given a state |s| of machine |m|, return a new state
  114. reachable from |s| on character |c| or epsilon.
  115. """
  116. s = m.new_state()
  117. initial_state.link_to(s)
  118. initial_state.add_transition(c, s)
  119. return s
  120. def __add__(self, other):
  121. return Seq(self, other)
  122. def __or__(self, other):
  123. return Alt(self, other)
  124. def __str__(self):
  125. if self.str:
  126. return self.str
  127. else:
  128. return self.calc_str()
  129. def check_re(self, num, value):
  130. if not isinstance(value, RE):
  131. self.wrong_type(num, value, "Plex.RE instance")
  132. def check_string(self, num, value):
  133. if type(value) != type(''):
  134. self.wrong_type(num, value, "string")
  135. def check_char(self, num, value):
  136. self.check_string(num, value)
  137. if len(value) != 1:
  138. raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
  139. "Expected a string of length 1, got: %s" % (
  140. num, self.__class__.__name__, repr(value)))
  141. def wrong_type(self, num, value, expected):
  142. if type(value) == types.InstanceType:
  143. got = "%s.%s instance" % (
  144. value.__class__.__module__, value.__class__.__name__)
  145. else:
  146. got = type(value).__name__
  147. raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
  148. "(expected %s, got %s" % (
  149. num, self.__class__.__name__, expected, got))
  150. #
  151. # Primitive RE constructors
  152. # -------------------------
  153. #
  154. # These are the basic REs from which all others are built.
  155. #
  156. ## class Char(RE):
  157. ## """
  158. ## Char(c) is an RE which matches the character |c|.
  159. ## """
  160. ## nullable = 0
  161. ## def __init__(self, char):
  162. ## self.char = char
  163. ## self.match_nl = char == '\n'
  164. ## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  165. ## c = self.char
  166. ## if match_bol and c != BOL:
  167. ## s1 = self.build_opt(m, initial_state, BOL)
  168. ## else:
  169. ## s1 = initial_state
  170. ## if c == '\n' or c == EOF:
  171. ## s1 = self.build_opt(m, s1, EOL)
  172. ## if len(c) == 1:
  173. ## code = ord(self.char)
  174. ## s1.add_transition((code, code+1), final_state)
  175. ## if nocase and is_letter_code(code):
  176. ## code2 = other_case_code(code)
  177. ## s1.add_transition((code2, code2+1), final_state)
  178. ## else:
  179. ## s1.add_transition(c, final_state)
  180. ## def calc_str(self):
  181. ## return "Char(%s)" % repr(self.char)
  182. def Char(c):
  183. """
  184. Char(c) is an RE which matches the character |c|.
  185. """
  186. if len(c) == 1:
  187. result = CodeRange(ord(c), ord(c) + 1)
  188. else:
  189. result = SpecialSymbol(c)
  190. result.str = "Char(%s)" % repr(c)
  191. return result
  192. class RawCodeRange(RE):
  193. """
  194. RawCodeRange(code1, code2) is a low-level RE which matches any character
  195. with a code |c| in the range |code1| <= |c| < |code2|, where the range
  196. does not include newline. For internal use only.
  197. """
  198. nullable = 0
  199. match_nl = 0
  200. range = None # (code, code)
  201. uppercase_range = None # (code, code) or None
  202. lowercase_range = None # (code, code) or None
  203. def __init__(self, code1, code2):
  204. self.range = (code1, code2)
  205. self.uppercase_range = uppercase_range(code1, code2)
  206. self.lowercase_range = lowercase_range(code1, code2)
  207. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  208. if match_bol:
  209. initial_state = self.build_opt(m, initial_state, BOL)
  210. initial_state.add_transition(self.range, final_state)
  211. if nocase:
  212. if self.uppercase_range:
  213. initial_state.add_transition(self.uppercase_range, final_state)
  214. if self.lowercase_range:
  215. initial_state.add_transition(self.lowercase_range, final_state)
  216. def calc_str(self):
  217. return "CodeRange(%d,%d)" % (self.code1, self.code2)
  218. class _RawNewline(RE):
  219. """
  220. RawNewline is a low-level RE which matches a newline character.
  221. For internal use only.
  222. """
  223. nullable = 0
  224. match_nl = 1
  225. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  226. if match_bol:
  227. initial_state = self.build_opt(m, initial_state, BOL)
  228. s = self.build_opt(m, initial_state, EOL)
  229. s.add_transition((nl_code, nl_code + 1), final_state)
  230. RawNewline = _RawNewline()
  231. class SpecialSymbol(RE):
  232. """
  233. SpecialSymbol(sym) is an RE which matches the special input
  234. symbol |sym|, which is one of BOL, EOL or EOF.
  235. """
  236. nullable = 0
  237. match_nl = 0
  238. sym = None
  239. def __init__(self, sym):
  240. self.sym = sym
  241. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  242. # Sequences 'bol bol' and 'bol eof' are impossible, so only need
  243. # to allow for bol if sym is eol
  244. if match_bol and self.sym == EOL:
  245. initial_state = self.build_opt(m, initial_state, BOL)
  246. initial_state.add_transition(self.sym, final_state)
  247. class Seq(RE):
  248. """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
  249. |re2| followed by |re3|..."""
  250. def __init__(self, *re_list):
  251. nullable = 1
  252. for i, re in enumerate(re_list):
  253. self.check_re(i, re)
  254. nullable = nullable and re.nullable
  255. self.re_list = re_list
  256. self.nullable = nullable
  257. i = len(re_list)
  258. match_nl = 0
  259. while i:
  260. i -= 1
  261. re = re_list[i]
  262. if re.match_nl:
  263. match_nl = 1
  264. break
  265. if not re.nullable:
  266. break
  267. self.match_nl = match_nl
  268. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  269. re_list = self.re_list
  270. if len(re_list) == 0:
  271. initial_state.link_to(final_state)
  272. else:
  273. s1 = initial_state
  274. n = len(re_list)
  275. for i, re in enumerate(re_list):
  276. if i < n - 1:
  277. s2 = m.new_state()
  278. else:
  279. s2 = final_state
  280. re.build_machine(m, s1, s2, match_bol, nocase)
  281. s1 = s2
  282. match_bol = re.match_nl or (match_bol and re.nullable)
  283. def calc_str(self):
  284. return "Seq(%s)" % ','.join(map(str, self.re_list))
  285. class Alt(RE):
  286. """Alt(re1, re2, re3...) is an RE which matches either |re1| or
  287. |re2| or |re3|..."""
  288. def __init__(self, *re_list):
  289. self.re_list = re_list
  290. nullable = 0
  291. match_nl = 0
  292. nullable_res = []
  293. non_nullable_res = []
  294. i = 1
  295. for re in re_list:
  296. self.check_re(i, re)
  297. if re.nullable:
  298. nullable_res.append(re)
  299. nullable = 1
  300. else:
  301. non_nullable_res.append(re)
  302. if re.match_nl:
  303. match_nl = 1
  304. i += 1
  305. self.nullable_res = nullable_res
  306. self.non_nullable_res = non_nullable_res
  307. self.nullable = nullable
  308. self.match_nl = match_nl
  309. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  310. for re in self.nullable_res:
  311. re.build_machine(m, initial_state, final_state, match_bol, nocase)
  312. if self.non_nullable_res:
  313. if match_bol:
  314. initial_state = self.build_opt(m, initial_state, BOL)
  315. for re in self.non_nullable_res:
  316. re.build_machine(m, initial_state, final_state, 0, nocase)
  317. def calc_str(self):
  318. return "Alt(%s)" % ','.join(map(str, self.re_list))
  319. class Rep1(RE):
  320. """Rep1(re) is an RE which matches one or more repetitions of |re|."""
  321. def __init__(self, re):
  322. self.check_re(1, re)
  323. self.re = re
  324. self.nullable = re.nullable
  325. self.match_nl = re.match_nl
  326. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  327. s1 = m.new_state()
  328. s2 = m.new_state()
  329. initial_state.link_to(s1)
  330. self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
  331. s2.link_to(s1)
  332. s2.link_to(final_state)
  333. def calc_str(self):
  334. return "Rep1(%s)" % self.re
  335. class SwitchCase(RE):
  336. """
  337. SwitchCase(re, nocase) is an RE which matches the same strings as RE,
  338. but treating upper and lower case letters according to |nocase|. If
  339. |nocase| is true, case is ignored, otherwise it is not.
  340. """
  341. re = None
  342. nocase = None
  343. def __init__(self, re, nocase):
  344. self.re = re
  345. self.nocase = nocase
  346. self.nullable = re.nullable
  347. self.match_nl = re.match_nl
  348. def build_machine(self, m, initial_state, final_state, match_bol, nocase):
  349. self.re.build_machine(m, initial_state, final_state, match_bol,
  350. self.nocase)
  351. def calc_str(self):
  352. if self.nocase:
  353. name = "NoCase"
  354. else:
  355. name = "Case"
  356. return "%s(%s)" % (name, self.re)
  357. #
  358. # Composite RE constructors
  359. # -------------------------
  360. #
  361. # These REs are defined in terms of the primitive REs.
  362. #
  363. Empty = Seq()
  364. Empty.__doc__ = \
  365. """
  366. Empty is an RE which matches the empty string.
  367. """
  368. Empty.str = "Empty"
  369. def Str1(s):
  370. """
  371. Str1(s) is an RE which matches the literal string |s|.
  372. """
  373. result = Seq(*tuple(map(Char, s)))
  374. result.str = "Str(%s)" % repr(s)
  375. return result
  376. def Str(*strs):
  377. """
  378. Str(s) is an RE which matches the literal string |s|.
  379. Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
  380. """
  381. if len(strs) == 1:
  382. return Str1(strs[0])
  383. else:
  384. result = Alt(*tuple(map(Str1, strs)))
  385. result.str = "Str(%s)" % ','.join(map(repr, strs))
  386. return result
  387. def Any(s):
  388. """
  389. Any(s) is an RE which matches any character in the string |s|.
  390. """
  391. #result = apply(Alt, tuple(map(Char, s)))
  392. result = CodeRanges(chars_to_ranges(s))
  393. result.str = "Any(%s)" % repr(s)
  394. return result
  395. def AnyBut(s):
  396. """
  397. AnyBut(s) is an RE which matches any character (including
  398. newline) which is not in the string |s|.
  399. """
  400. ranges = chars_to_ranges(s)
  401. ranges.insert(0, -maxint)
  402. ranges.append(maxint)
  403. result = CodeRanges(ranges)
  404. result.str = "AnyBut(%s)" % repr(s)
  405. return result
  406. AnyChar = AnyBut("")
  407. AnyChar.__doc__ = \
  408. """
  409. AnyChar is an RE which matches any single character (including a newline).
  410. """
  411. AnyChar.str = "AnyChar"
  412. def Range(s1, s2=None):
  413. """
  414. Range(c1, c2) is an RE which matches any single character in the range
  415. |c1| to |c2| inclusive.
  416. Range(s) where |s| is a string of even length is an RE which matches
  417. any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
  418. """
  419. if s2:
  420. result = CodeRange(ord(s1), ord(s2) + 1)
  421. result.str = "Range(%s,%s)" % (s1, s2)
  422. else:
  423. ranges = []
  424. for i in range(0, len(s1), 2):
  425. ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1))
  426. result = Alt(*ranges)
  427. result.str = "Range(%s)" % repr(s1)
  428. return result
  429. def Opt(re):
  430. """
  431. Opt(re) is an RE which matches either |re| or the empty string.
  432. """
  433. result = Alt(re, Empty)
  434. result.str = "Opt(%s)" % re
  435. return result
  436. def Rep(re):
  437. """
  438. Rep(re) is an RE which matches zero or more repetitions of |re|.
  439. """
  440. result = Opt(Rep1(re))
  441. result.str = "Rep(%s)" % re
  442. return result
  443. def NoCase(re):
  444. """
  445. NoCase(re) is an RE which matches the same strings as RE, but treating
  446. upper and lower case letters as equivalent.
  447. """
  448. return SwitchCase(re, nocase=1)
  449. def Case(re):
  450. """
  451. Case(re) is an RE which matches the same strings as RE, but treating
  452. upper and lower case letters as distinct, i.e. it cancels the effect
  453. of any enclosing NoCase().
  454. """
  455. return SwitchCase(re, nocase=0)
  456. #
  457. # RE Constants
  458. #
  459. Bol = Char(BOL)
  460. Bol.__doc__ = \
  461. """
  462. Bol is an RE which matches the beginning of a line.
  463. """
  464. Bol.str = "Bol"
  465. Eol = Char(EOL)
  466. Eol.__doc__ = \
  467. """
  468. Eol is an RE which matches the end of a line.
  469. """
  470. Eol.str = "Eol"
  471. Eof = Char(EOF)
  472. Eof.__doc__ = \
  473. """
  474. Eof is an RE which matches the end of the file.
  475. """
  476. Eof.str = "Eof"