#=======================================================================
#
# Python Lexical Analyser
#
# Traditional Regular Expression Syntax
#
#=======================================================================
  8. from __future__ import absolute_import
  9. from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
  10. from .Errors import PlexError
  11. class RegexpSyntaxError(PlexError):
  12. pass
  13. def re(s):
  14. """
  15. Convert traditional string representation of regular expression |s|
  16. into Plex representation.
  17. """
  18. return REParser(s).parse_re()
  19. class REParser(object):
  20. def __init__(self, s):
  21. self.s = s
  22. self.i = -1
  23. self.end = 0
  24. self.next()
  25. def parse_re(self):
  26. re = self.parse_alt()
  27. if not self.end:
  28. self.error("Unexpected %s" % repr(self.c))
  29. return re
  30. def parse_alt(self):
  31. """Parse a set of alternative regexps."""
  32. re = self.parse_seq()
  33. if self.c == '|':
  34. re_list = [re]
  35. while self.c == '|':
  36. self.next()
  37. re_list.append(self.parse_seq())
  38. re = Alt(*re_list)
  39. return re
  40. def parse_seq(self):
  41. """Parse a sequence of regexps."""
  42. re_list = []
  43. while not self.end and not self.c in "|)":
  44. re_list.append(self.parse_mod())
  45. return Seq(*re_list)
  46. def parse_mod(self):
  47. """Parse a primitive regexp followed by *, +, ? modifiers."""
  48. re = self.parse_prim()
  49. while not self.end and self.c in "*+?":
  50. if self.c == '*':
  51. re = Rep(re)
  52. elif self.c == '+':
  53. re = Rep1(re)
  54. else: # self.c == '?'
  55. re = Opt(re)
  56. self.next()
  57. return re
  58. def parse_prim(self):
  59. """Parse a primitive regexp."""
  60. c = self.get()
  61. if c == '.':
  62. re = AnyBut("\n")
  63. elif c == '^':
  64. re = Bol
  65. elif c == '$':
  66. re = Eol
  67. elif c == '(':
  68. re = self.parse_alt()
  69. self.expect(')')
  70. elif c == '[':
  71. re = self.parse_charset()
  72. self.expect(']')
  73. else:
  74. if c == '\\':
  75. c = self.get()
  76. re = Char(c)
  77. return re
  78. def parse_charset(self):
  79. """Parse a charset. Does not include the surrounding []."""
  80. char_list = []
  81. invert = 0
  82. if self.c == '^':
  83. invert = 1
  84. self.next()
  85. if self.c == ']':
  86. char_list.append(']')
  87. self.next()
  88. while not self.end and self.c != ']':
  89. c1 = self.get()
  90. if self.c == '-' and self.lookahead(1) != ']':
  91. self.next()
  92. c2 = self.get()
  93. for a in range(ord(c1), ord(c2) + 1):
  94. char_list.append(chr(a))
  95. else:
  96. char_list.append(c1)
  97. chars = ''.join(char_list)
  98. if invert:
  99. return AnyBut(chars)
  100. else:
  101. return Any(chars)
  102. def next(self):
  103. """Advance to the next char."""
  104. s = self.s
  105. i = self.i = self.i + 1
  106. if i < len(s):
  107. self.c = s[i]
  108. else:
  109. self.c = ''
  110. self.end = 1
  111. def get(self):
  112. if self.end:
  113. self.error("Premature end of string")
  114. c = self.c
  115. self.next()
  116. return c
  117. def lookahead(self, n):
  118. """Look ahead n chars."""
  119. j = self.i + n
  120. if j < len(self.s):
  121. return self.s[j]
  122. else:
  123. return ''
  124. def expect(self, c):
  125. """
  126. Expect to find character |c| at current position.
  127. Raises an exception otherwise.
  128. """
  129. if self.c == c:
  130. self.next()
  131. else:
  132. self.error("Missing %s" % repr(c))
  133. def error(self, mess):
  134. """Raise exception to signal syntax error in regexp."""
  135. raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
  136. repr(self.s), self.i, mess))