123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204 |
- # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
- # Licensed to PSF under a Contributor Agreement.
- """Parser engine for the grammar tables generated by pgen.
- The grammar table must be loaded first.
- See Parser/parser.c in the Python distribution for additional info on
- how this parsing engine works.
- """
- # Local imports
- from . import token
- class ParseError(Exception):
- """Exception to signal the parser is stuck."""
- def __init__(self, msg, type, value, context):
- Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
- (msg, type, value, context))
- self.msg = msg
- self.type = type
- self.value = value
- self.context = context
- def __reduce__(self):
- return type(self), (self.msg, self.type, self.value, self.context)
- class Parser(object):
- """Parser engine.
- The proper usage sequence is:
- p = Parser(grammar, [converter]) # create instance
- p.setup([start]) # prepare for parsing
- <for each input token>:
- if p.addtoken(...): # parse a token; may raise ParseError
- break
- root = p.rootnode # root of abstract syntax tree
- A Parser instance may be reused by calling setup() repeatedly.
- A Parser instance contains state pertaining to the current token
- sequence, and should not be used concurrently by different threads
- to parse separate token sequences.
- See driver.py for how to get input tokens by tokenizing a file or
- string.
- Parsing is complete when addtoken() returns True; the root of the
- abstract syntax tree can then be retrieved from the rootnode
- instance variable. When a syntax error occurs, addtoken() raises
- the ParseError exception. There is no error recovery; the parser
- cannot be used after a syntax error was reported (but it can be
- reinitialized by calling setup()).
- """
- def __init__(self, grammar, convert=None):
- """Constructor.
- The grammar argument is a grammar.Grammar instance; see the
- grammar module for more information.
- The parser is not ready yet for parsing; you must call the
- setup() method to get it started.
- The optional convert argument is a function mapping concrete
- syntax tree nodes to abstract syntax tree nodes. If not
- given, no conversion is done and the syntax tree produced is
- the concrete syntax tree. If given, it must be a function of
- two arguments, the first being the grammar (a grammar.Grammar
- instance), and the second being the concrete syntax tree node
- to be converted. The syntax tree is converted from the bottom
- up.
- A concrete syntax tree node is a (type, value, context, nodes)
- tuple, where type is the node type (a token or symbol number),
- value is None for symbols and a string for tokens, context is
- None or an opaque value used for error reporting (typically a
- (lineno, offset) pair), and nodes is a list of children for
- symbols, and None for tokens.
- An abstract syntax tree node may be anything; this is entirely
- up to the converter function.
- """
- self.grammar = grammar
- self.convert = convert or (lambda grammar, node: node)
- def setup(self, start=None):
- """Prepare for parsing.
- This *must* be called before starting to parse.
- The optional argument is an alternative start symbol; it
- defaults to the grammar's start symbol.
- You can use a Parser instance to parse any number of programs;
- each time you call setup() the parser is reset to an initial
- state determined by the (implicit or explicit) start symbol.
- """
- if start is None:
- start = self.grammar.start
- # Each stack entry is a tuple: (dfa, state, node).
- # A node is a tuple: (type, value, context, children),
- # where children is a list of nodes or None, and context may be None.
- newnode = (start, None, None, [])
- stackentry = (self.grammar.dfas[start], 0, newnode)
- self.stack = [stackentry]
- self.rootnode = None
- self.used_names = set() # Aliased to self.rootnode.used_names in pop()
- def addtoken(self, type, value, context):
- """Add a token; return True iff this is the end of the program."""
- # Map from token to label
- ilabel = self.classify(type, value, context)
- # Loop until the token is shifted; may raise exceptions
- while True:
- dfa, state, node = self.stack[-1]
- states, first = dfa
- arcs = states[state]
- # Look for a state with this label
- for i, newstate in arcs:
- t, v = self.grammar.labels[i]
- if ilabel == i:
- # Look it up in the list of labels
- assert t < 256
- # Shift a token; we're done with it
- self.shift(type, value, newstate, context)
- # Pop while we are in an accept-only state
- state = newstate
- while states[state] == [(0, state)]:
- self.pop()
- if not self.stack:
- # Done parsing!
- return True
- dfa, state, node = self.stack[-1]
- states, first = dfa
- # Done with this token
- return False
- elif t >= 256:
- # See if it's a symbol and if we're in its first set
- itsdfa = self.grammar.dfas[t]
- itsstates, itsfirst = itsdfa
- if ilabel in itsfirst:
- # Push a symbol
- self.push(t, self.grammar.dfas[t], newstate, context)
- break # To continue the outer while loop
- else:
- if (0, state) in arcs:
- # An accepting state, pop it and try something else
- self.pop()
- if not self.stack:
- # Done parsing, but another token is input
- raise ParseError("too much input",
- type, value, context)
- else:
- # No success finding a transition
- raise ParseError("bad input", type, value, context)
- def classify(self, type, value, context):
- """Turn a token into a label. (Internal)"""
- if type == token.NAME:
- # Keep a listing of all used names
- self.used_names.add(value)
- # Check for reserved words
- ilabel = self.grammar.keywords.get(value)
- if ilabel is not None:
- return ilabel
- ilabel = self.grammar.tokens.get(type)
- if ilabel is None:
- raise ParseError("bad token", type, value, context)
- return ilabel
- def shift(self, type, value, newstate, context):
- """Shift a token. (Internal)"""
- dfa, state, node = self.stack[-1]
- newnode = (type, value, context, None)
- newnode = self.convert(self.grammar, newnode)
- if newnode is not None:
- node[-1].append(newnode)
- self.stack[-1] = (dfa, newstate, node)
- def push(self, type, newdfa, newstate, context):
- """Push a nonterminal. (Internal)"""
- dfa, state, node = self.stack[-1]
- newnode = (type, None, context, [])
- self.stack[-1] = (dfa, newstate, node)
- self.stack.append((newdfa, 0, newnode))
- def pop(self):
- """Pop a nonterminal. (Internal)"""
- popdfa, popstate, popnode = self.stack.pop()
- newnode = self.convert(self.grammar, popnode)
- if newnode is not None:
- if self.stack:
- dfa, state, node = self.stack[-1]
- node[-1].append(newnode)
- else:
- self.rootnode = newnode
- self.rootnode.used_names = self.used_names
|