#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================
- from __future__ import absolute_import
- from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
- from .Errors import PlexError
class RegexpSyntaxError(PlexError):
    """Raised when a traditional-syntax regexp string cannot be parsed."""
def re(s):
    """
    Translate the traditional regular expression string |s| into its
    Plex representation.

    The whole string must be consumed; REParser raises a
    RegexpSyntaxError when it cannot parse |s|.
    """
    parser = REParser(s)
    return parser.parse_re()
class REParser(object):
    """
    Recursive-descent parser for traditional regular expression syntax.

    Scanner state:
      s   -- the regexp string being parsed
      i   -- index of the current character in |s|
      c   -- the current character, or '' once the input is exhausted
      end -- 0 while characters remain, 1 after the end has been reached
    """

    def __init__(self, s):
        """Set up the scanner on |s| and load its first character."""
        self.end = 0
        self.i = -1
        self.s = s
        self.next()

    def parse_re(self):
        """Parse the whole string; anything left unconsumed is an error."""
        result = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse one or more sequences separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # No alternation present: hand back the lone sequence as is.
            return first
        branches = [first]
        while self.c == '|':
            self.next()
            branches.append(self.parse_seq())
        return Alt(*branches)

    def parse_seq(self):
        """Parse consecutive modified primitives up to '|', ')' or the end."""
        items = []
        while not (self.end or self.c in "|)"):
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive and apply any trailing *, + or ? modifiers."""
        node = self.parse_prim()
        while not self.end and self.c in "*+?":
            modifier = self.c
            # Anything that is neither '*' nor '+' here must be '?'.
            wrap = Rep if modifier == '*' else Rep1 if modifier == '+' else Opt
            node = wrap(node)
            self.next()
        return node

    def parse_prim(self):
        """Parse a single primitive: . ^ $ (group) [charset] or a character."""
        ch = self.get()
        if ch == '.':
            return AnyBut("\n")
        if ch == '^':
            return Bol
        if ch == '$':
            return Eol
        if ch == '(':
            group = self.parse_alt()
            self.expect(')')
            return group
        if ch == '[':
            charset = self.parse_charset()
            self.expect(']')
            return charset
        if ch == '\\':
            # A backslash makes the following character literal.
            ch = self.get()
        return Char(ch)

    def parse_charset(self):
        """
        Parse the inside of a character set; the enclosing [] are handled
        by the caller.
        """
        negated = 0
        if self.c == '^':
            negated = 1
            self.next()
        members = []
        if self.c == ']':
            # A ']' in first position is a member, not the terminator.
            members.append(']')
            self.next()
        while not (self.end or self.c == ']'):
            low = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                # "low-high" range; a '-' right before ']' is a plain member.
                self.next()
                high = self.get()
                members.extend(
                    chr(code) for code in range(ord(low), ord(high) + 1))
            else:
                members.append(low)
        charset = ''.join(members)
        return AnyBut(charset) if negated else Any(charset)

    def next(self):
        """Move to the following character, flagging the end of input."""
        self.i += 1
        if self.i < len(self.s):
            self.c = self.s[self.i]
        else:
            self.c = ''
            self.end = 1

    def get(self):
        """
        Return the current character and advance past it.
        Signals a syntax error if the input is already exhausted.
        """
        if self.end:
            self.error("Premature end of string")
        current = self.c
        self.next()
        return current

    def lookahead(self, n):
        """Return the character |n| places ahead, or '' if past the end."""
        pos = self.i + n
        return self.s[pos] if pos < len(self.s) else ''

    def expect(self, c):
        """
        Consume character |c| at the current position, or signal a
        syntax error if something else is found there.
        """
        if self.c != c:
            self.error("Missing %s" % repr(c))
        else:
            self.next()

    def error(self, mess):
        """Raise RegexpSyntaxError for |mess| at the current position."""
        details = (repr(self.s), self.i, mess)
        raise RegexpSyntaxError(
            "Syntax error in regexp %s at position %d: %s" % details)
|