  1. #=======================================================================
  2. #
  3. # Python Lexical Analyser
  4. #
  5. # Lexical Analyser Specification
  6. #
  7. #=======================================================================
  8. from __future__ import absolute_import
  9. import types
  10. from . import Actions
  11. from . import DFA
  12. from . import Errors
  13. from . import Machines
  14. from . import Regexps
# debug_flags for the Lexicon constructor: bit mask selecting which
# intermediate machines are dumped to the |debug| stream.
DUMP_NFA = 1  # dump the NFA built from the specification
DUMP_DFA = 2  # dump the DFA produced by subset construction
  18. class State(object):
  19. """
  20. This class is used as part of a Plex.Lexicon specification to
  21. introduce a user-defined state.
  22. Constructor:
  23. State(name, token_specifications)
  24. """
  25. name = None
  26. tokens = None
  27. def __init__(self, name, tokens):
  28. self.name = name
  29. self.tokens = tokens
  30. class Lexicon(object):
  31. """
  32. Lexicon(specification) builds a lexical analyser from the given
  33. |specification|. The specification consists of a list of
  34. specification items. Each specification item may be either:
  35. 1) A token definition, which is a tuple:
  36. (pattern, action)
  37. The |pattern| is a regular axpression built using the
  38. constructors defined in the Plex module.
  39. The |action| is the action to be performed when this pattern
  40. is recognised (see below).
  41. 2) A state definition:
  42. State(name, tokens)
  43. where |name| is a character string naming the state,
  44. and |tokens| is a list of token definitions as
  45. above. The meaning and usage of states is described
  46. below.
  47. Actions
  48. -------
  49. The |action| in a token specication may be one of three things:
  50. 1) A function, which is called as follows:
  51. function(scanner, text)
  52. where |scanner| is the relevant Scanner instance, and |text|
  53. is the matched text. If the function returns anything
  54. other than None, that value is returned as the value of the
  55. token. If it returns None, scanning continues as if the IGNORE
  56. action were specified (see below).
  57. 2) One of the following special actions:
  58. IGNORE means that the recognised characters will be treated as
  59. white space and ignored. Scanning will continue until
  60. the next non-ignored token is recognised before returning.
  61. TEXT causes the scanned text itself to be returned as the
  62. value of the token.
  63. 3) Any other value, which is returned as the value of the token.
  64. States
  65. ------
  66. At any given time, the scanner is in one of a number of states.
  67. Associated with each state is a set of possible tokens. When scanning,
  68. only tokens associated with the current state are recognised.
  69. There is a default state, whose name is the empty string. Token
  70. definitions which are not inside any State definition belong to
  71. the default state.
  72. The initial state of the scanner is the default state. The state can
  73. be changed in one of two ways:
  74. 1) Using Begin(state_name) as the action of a token.
  75. 2) Calling the begin(state_name) method of the Scanner.
  76. To change back to the default state, use '' as the state name.
  77. """
  78. machine = None # Machine
  79. tables = None # StateTableMachine
  80. def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
  81. if not isinstance(specifications, list):
  82. raise Errors.InvalidScanner("Scanner definition is not a list")
  83. if timings:
  84. from .Timing import time
  85. total_time = 0.0
  86. time1 = time()
  87. nfa = Machines.Machine()
  88. default_initial_state = nfa.new_initial_state('')
  89. token_number = 1
  90. for spec in specifications:
  91. if isinstance(spec, State):
  92. user_initial_state = nfa.new_initial_state(spec.name)
  93. for token in spec.tokens:
  94. self.add_token_to_machine(
  95. nfa, user_initial_state, token, token_number)
  96. token_number += 1
  97. elif isinstance(spec, tuple):
  98. self.add_token_to_machine(
  99. nfa, default_initial_state, spec, token_number)
  100. token_number += 1
  101. else:
  102. raise Errors.InvalidToken(
  103. token_number,
  104. "Expected a token definition (tuple) or State instance")
  105. if timings:
  106. time2 = time()
  107. total_time = total_time + (time2 - time1)
  108. time3 = time()
  109. if debug and (debug_flags & 1):
  110. debug.write("\n============= NFA ===========\n")
  111. nfa.dump(debug)
  112. dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
  113. if timings:
  114. time4 = time()
  115. total_time = total_time + (time4 - time3)
  116. if debug and (debug_flags & 2):
  117. debug.write("\n============= DFA ===========\n")
  118. dfa.dump(debug)
  119. if timings:
  120. timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
  121. timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
  122. timings.write("TOTAL : %5.2f\n" % total_time)
  123. self.machine = dfa
  124. def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
  125. try:
  126. (re, action_spec) = self.parse_token_definition(token_spec)
  127. # Disabled this -- matching empty strings can be useful
  128. #if re.nullable:
  129. # raise Errors.InvalidToken(
  130. # token_number, "Pattern can match 0 input symbols")
  131. if isinstance(action_spec, Actions.Action):
  132. action = action_spec
  133. else:
  134. try:
  135. action_spec.__call__
  136. except AttributeError:
  137. action = Actions.Return(action_spec)
  138. else:
  139. action = Actions.Call(action_spec)
  140. final_state = machine.new_state()
  141. re.build_machine(machine, initial_state, final_state,
  142. match_bol=1, nocase=0)
  143. final_state.set_action(action, priority=-token_number)
  144. except Errors.PlexError as e:
  145. raise e.__class__("Token number %d: %s" % (token_number, e))
  146. def parse_token_definition(self, token_spec):
  147. if not isinstance(token_spec, tuple):
  148. raise Errors.InvalidToken("Token definition is not a tuple")
  149. if len(token_spec) != 2:
  150. raise Errors.InvalidToken("Wrong number of items in token definition")
  151. pattern, action = token_spec
  152. if not isinstance(pattern, Regexps.RE):
  153. raise Errors.InvalidToken("Pattern is not an RE instance")
  154. return (pattern, action)
  155. def get_initial_state(self, name):
  156. return self.machine.get_initial_state(name)