#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================
from __future__ import absolute_import

import types

from . import Actions
from . import DFA
from . import Errors
from . import Machines
from . import Regexps
# Bit flags for the ``debug_flags`` argument of the Lexicon constructor.
# They may be OR-ed together.
DUMP_NFA = 1
DUMP_DFA = 2
class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)

    Attributes:

       name    the name given to the state
       tokens  the token specifications that belong to the state
    """

    # Class-level defaults; both are overwritten per instance in __init__.
    name = None
    tokens = None

    def __init__(self, name, tokens):
        self.name = name
        self.tokens = tokens

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.name, self.tokens)
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # Machine
    tables = None  # StateTableMachine

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """
        Build the analyser's machine from |specifications|.

        specifications  list of token definition tuples and/or State
                        instances (see the class documentation).
        debug           optional object with a write() method. When given,
                        the machines selected by |debug_flags| are dumped
                        to it.
        debug_flags     bitwise OR of DUMP_NFA and DUMP_DFA selecting what
                        is dumped to |debug|. When both are set, |debug| is
                        also handed to the NFA-to-DFA conversion.
        timings         optional object with a write() method. When given,
                        the time spent constructing the NFA and converting
                        it to a DFA is written to it.

        Raises Errors.InvalidScanner if |specifications| is not a list, and
        Errors.InvalidToken if an item is neither a tuple nor a State.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            from .Timing import time

            total_time = 0.0
            time1 = time()

        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        # Tokens are numbered from 1 in order of appearance, counting the
        # tokens nested inside State definitions as well.
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")

        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            # The second interval starts before the optional NFA dump, so
            # the dump is counted as part of the DFA conversion time.
            time3 = time()
        if debug and (debug_flags & DUMP_NFA):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)

        # Pass the debug stream on to the conversion only when both dump
        # flags are set; otherwise pass False.
        dump_both = DUMP_NFA | DUMP_DFA
        dfa = DFA.nfa_to_dfa(
            nfa, debug=(debug_flags & dump_both) == dump_both and debug)

        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & DUMP_DFA):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Add the token described by |token_spec| to |machine|, reachable
        from |initial_state|.

        The action part of the specification is used as is when it is an
        Actions.Action instance, wrapped in Actions.Call when it is
        callable, and wrapped in Actions.Return otherwise.

        Any Errors.PlexError raised along the way is re-raised as the same
        exception class with the token number prefixed to its message.
        """
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Patterns that can match zero input symbols are deliberately
            # accepted: matching empty strings can be useful.
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            elif callable(action_spec):
                action = Actions.Call(action_spec)
            else:
                action = Actions.Return(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            # Priority decreases with the token number, presumably so that
            # earlier definitions win ties -- confirm against Machines.
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            # NOTE(review): this builds the exception from a single message
            # argument; verify that every PlexError subclass raised here
            # accepts that (InvalidToken is given two arguments in __init__).
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """
        Validate |token_spec| and return it as a (pattern, action) tuple.

        Raises Errors.InvalidToken if |token_spec| is not a tuple of exactly
        two items or if its first item is not a Regexps.RE instance.
        """
        # NOTE(review): InvalidToken is constructed with a message only
        # here, but with (token_number, message) in __init__ -- confirm
        # which signature Errors.InvalidToken actually has.
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the machine's initial state for the state called |name|."""
        return self.machine.get_initial_state(name)