# grammar.py
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""

# Python imports
import pickle

# Local imports
from . import token
  14. class Grammar(object):
  15. """Pgen parsing tables conversion class.
  16. Once initialized, this class supplies the grammar tables for the
  17. parsing engine implemented by parse.py. The parsing engine
  18. accesses the instance variables directly. The class here does not
  19. provide initialization of the tables; several subclasses exist to
  20. do this (see the conv and pgen modules).
  21. The load() method reads the tables from a pickle file, which is
  22. much faster than the other ways offered by subclasses. The pickle
  23. file is written by calling dump() (after loading the grammar
  24. tables using a subclass). The report() method prints a readable
  25. representation of the tables to stdout, for debugging.
  26. The instance variables are as follows:
  27. symbol2number -- a dict mapping symbol names to numbers. Symbol
  28. numbers are always 256 or higher, to distinguish
  29. them from token numbers, which are between 0 and
  30. 255 (inclusive).
  31. number2symbol -- a dict mapping numbers to symbol names;
  32. these two are each other's inverse.
  33. states -- a list of DFAs, where each DFA is a list of
  34. states, each state is a list of arcs, and each
  35. arc is a (i, j) pair where i is a label and j is
  36. a state number. The DFA number is the index into
  37. this list. (This name is slightly confusing.)
  38. Final states are represented by a special arc of
  39. the form (0, j) where j is its own state number.
  40. dfas -- a dict mapping symbol numbers to (DFA, first)
  41. pairs, where DFA is an item from the states list
  42. above, and first is a set of tokens that can
  43. begin this grammar rule (represented by a dict
  44. whose values are always 1).
  45. labels -- a list of (x, y) pairs where x is either a token
  46. number or a symbol number, and y is either None
  47. or a string; the strings are keywords. The label
  48. number is the index in this list; label numbers
  49. are used to mark state transitions (arcs) in the
  50. DFAs.
  51. start -- the number of the grammar's start symbol.
  52. keywords -- a dict mapping keyword strings to arc labels.
  53. tokens -- a dict mapping token numbers to arc labels.
  54. """
  55. def __init__(self):
  56. self.symbol2number = {}
  57. self.number2symbol = {}
  58. self.states = []
  59. self.dfas = {}
  60. self.labels = [(0, "EMPTY")]
  61. self.keywords = {}
  62. self.tokens = {}
  63. self.symbol2label = {}
  64. self.start = 256
  65. def dump(self, filename):
  66. """Dump the grammar tables to a pickle file."""
  67. with open(filename, "wb") as f:
  68. pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
  69. def load(self, filename):
  70. """Load the grammar tables from a pickle file."""
  71. with open(filename, "rb") as f:
  72. d = pickle.load(f)
  73. self.__dict__.update(d)
  74. def loads(self, pkl):
  75. """Load the grammar tables from a pickle bytes object."""
  76. self.__dict__.update(pickle.loads(pkl))
  77. def copy(self):
  78. """
  79. Copy the grammar.
  80. """
  81. new = self.__class__()
  82. for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
  83. "tokens", "symbol2label"):
  84. setattr(new, dict_attr, getattr(self, dict_attr).copy())
  85. new.labels = self.labels[:]
  86. new.states = self.states[:]
  87. new.start = self.start
  88. return new
  89. def report(self):
  90. """Dump the grammar tables to standard output, for debugging."""
  91. from pprint import pprint
  92. print("s2n")
  93. pprint(self.symbol2number)
  94. print("n2s")
  95. pprint(self.number2symbol)
  96. print("states")
  97. pprint(self.states)
  98. print("dfas")
  99. pprint(self.dfas)
  100. print("labels")
  101. pprint(self.labels)
  102. print("start", self.start)
  103. # Map from operator to number (since tokenize doesn't do this)
  104. opmap_raw = """
  105. ( LPAR
  106. ) RPAR
  107. [ LSQB
  108. ] RSQB
  109. : COLON
  110. , COMMA
  111. ; SEMI
  112. + PLUS
  113. - MINUS
  114. * STAR
  115. / SLASH
  116. | VBAR
  117. & AMPER
  118. < LESS
  119. > GREATER
  120. = EQUAL
  121. . DOT
  122. % PERCENT
  123. ` BACKQUOTE
  124. { LBRACE
  125. } RBRACE
  126. @ AT
  127. @= ATEQUAL
  128. == EQEQUAL
  129. != NOTEQUAL
  130. <> NOTEQUAL
  131. <= LESSEQUAL
  132. >= GREATEREQUAL
  133. ~ TILDE
  134. ^ CIRCUMFLEX
  135. << LEFTSHIFT
  136. >> RIGHTSHIFT
  137. ** DOUBLESTAR
  138. += PLUSEQUAL
  139. -= MINEQUAL
  140. *= STAREQUAL
  141. /= SLASHEQUAL
  142. %= PERCENTEQUAL
  143. &= AMPEREQUAL
  144. |= VBAREQUAL
  145. ^= CIRCUMFLEXEQUAL
  146. <<= LEFTSHIFTEQUAL
  147. >>= RIGHTSHIFTEQUAL
  148. **= DOUBLESTAREQUAL
  149. // DOUBLESLASH
  150. //= DOUBLESLASHEQUAL
  151. -> RARROW
  152. := COLONEQUAL
  153. """
  154. opmap = {}
  155. for line in opmap_raw.splitlines():
  156. if line:
  157. op, name = line.split()
  158. opmap[op] = getattr(token, name)