btm_utils.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. "Utility functions used by the btm_matcher module"
  2. from . import pytree
  3. from .pgen2 import grammar, token
  4. from .pygram import pattern_symbols, python_symbols
  5. syms = pattern_symbols
  6. pysyms = python_symbols
  7. tokens = grammar.opmap
  8. token_labels = token
  9. TYPE_ANY = -1
  10. TYPE_ALTERNATIVES = -2
  11. TYPE_GROUP = -3
  12. class MinNode(object):
  13. """This class serves as an intermediate representation of the
  14. pattern tree during the conversion to sets of leaf-to-root
  15. subpatterns"""
  16. def __init__(self, type=None, name=None):
  17. self.type = type
  18. self.name = name
  19. self.children = []
  20. self.leaf = False
  21. self.parent = None
  22. self.alternatives = []
  23. self.group = []
  24. def __repr__(self):
  25. return str(self.type) + ' ' + str(self.name)
  26. def leaf_to_root(self):
  27. """Internal method. Returns a characteristic path of the
  28. pattern tree. This method must be run for all leaves until the
  29. linear subpatterns are merged into a single"""
  30. node = self
  31. subp = []
  32. while node:
  33. if node.type == TYPE_ALTERNATIVES:
  34. node.alternatives.append(subp)
  35. if len(node.alternatives) == len(node.children):
  36. #last alternative
  37. subp = [tuple(node.alternatives)]
  38. node.alternatives = []
  39. node = node.parent
  40. continue
  41. else:
  42. node = node.parent
  43. subp = None
  44. break
  45. if node.type == TYPE_GROUP:
  46. node.group.append(subp)
  47. #probably should check the number of leaves
  48. if len(node.group) == len(node.children):
  49. subp = get_characteristic_subpattern(node.group)
  50. node.group = []
  51. node = node.parent
  52. continue
  53. else:
  54. node = node.parent
  55. subp = None
  56. break
  57. if node.type == token_labels.NAME and node.name:
  58. #in case of type=name, use the name instead
  59. subp.append(node.name)
  60. else:
  61. subp.append(node.type)
  62. node = node.parent
  63. return subp
  64. def get_linear_subpattern(self):
  65. """Drives the leaf_to_root method. The reason that
  66. leaf_to_root must be run multiple times is because we need to
  67. reject 'group' matches; for example the alternative form
  68. (a | b c) creates a group [b c] that needs to be matched. Since
  69. matching multiple linear patterns overcomes the automaton's
  70. capabilities, leaf_to_root merges each group into a single
  71. choice based on 'characteristic'ity,
  72. i.e. (a|b c) -> (a|b) if b more characteristic than c
  73. Returns: The most 'characteristic'(as defined by
  74. get_characteristic_subpattern) path for the compiled pattern
  75. tree.
  76. """
  77. for l in self.leaves():
  78. subp = l.leaf_to_root()
  79. if subp:
  80. return subp
  81. def leaves(self):
  82. "Generator that returns the leaves of the tree"
  83. for child in self.children:
  84. yield from child.leaves()
  85. if not self.children:
  86. yield self
  87. def reduce_tree(node, parent=None):
  88. """
  89. Internal function. Reduces a compiled pattern tree to an
  90. intermediate representation suitable for feeding the
  91. automaton. This also trims off any optional pattern elements(like
  92. [a], a*).
  93. """
  94. new_node = None
  95. #switch on the node type
  96. if node.type == syms.Matcher:
  97. #skip
  98. node = node.children[0]
  99. if node.type == syms.Alternatives :
  100. #2 cases
  101. if len(node.children) <= 2:
  102. #just a single 'Alternative', skip this node
  103. new_node = reduce_tree(node.children[0], parent)
  104. else:
  105. #real alternatives
  106. new_node = MinNode(type=TYPE_ALTERNATIVES)
  107. #skip odd children('|' tokens)
  108. for child in node.children:
  109. if node.children.index(child)%2:
  110. continue
  111. reduced = reduce_tree(child, new_node)
  112. if reduced is not None:
  113. new_node.children.append(reduced)
  114. elif node.type == syms.Alternative:
  115. if len(node.children) > 1:
  116. new_node = MinNode(type=TYPE_GROUP)
  117. for child in node.children:
  118. reduced = reduce_tree(child, new_node)
  119. if reduced:
  120. new_node.children.append(reduced)
  121. if not new_node.children:
  122. # delete the group if all of the children were reduced to None
  123. new_node = None
  124. else:
  125. new_node = reduce_tree(node.children[0], parent)
  126. elif node.type == syms.Unit:
  127. if (isinstance(node.children[0], pytree.Leaf) and
  128. node.children[0].value == '('):
  129. #skip parentheses
  130. return reduce_tree(node.children[1], parent)
  131. if ((isinstance(node.children[0], pytree.Leaf) and
  132. node.children[0].value == '[')
  133. or
  134. (len(node.children)>1 and
  135. hasattr(node.children[1], "value") and
  136. node.children[1].value == '[')):
  137. #skip whole unit if its optional
  138. return None
  139. leaf = True
  140. details_node = None
  141. alternatives_node = None
  142. has_repeater = False
  143. repeater_node = None
  144. has_variable_name = False
  145. for child in node.children:
  146. if child.type == syms.Details:
  147. leaf = False
  148. details_node = child
  149. elif child.type == syms.Repeater:
  150. has_repeater = True
  151. repeater_node = child
  152. elif child.type == syms.Alternatives:
  153. alternatives_node = child
  154. if hasattr(child, 'value') and child.value == '=': # variable name
  155. has_variable_name = True
  156. #skip variable name
  157. if has_variable_name:
  158. #skip variable name, '='
  159. name_leaf = node.children[2]
  160. if hasattr(name_leaf, 'value') and name_leaf.value == '(':
  161. # skip parenthesis
  162. name_leaf = node.children[3]
  163. else:
  164. name_leaf = node.children[0]
  165. #set node type
  166. if name_leaf.type == token_labels.NAME:
  167. #(python) non-name or wildcard
  168. if name_leaf.value == 'any':
  169. new_node = MinNode(type=TYPE_ANY)
  170. else:
  171. if hasattr(token_labels, name_leaf.value):
  172. new_node = MinNode(type=getattr(token_labels, name_leaf.value))
  173. else:
  174. new_node = MinNode(type=getattr(pysyms, name_leaf.value))
  175. elif name_leaf.type == token_labels.STRING:
  176. #(python) name or character; remove the apostrophes from
  177. #the string value
  178. name = name_leaf.value.strip("'")
  179. if name in tokens:
  180. new_node = MinNode(type=tokens[name])
  181. else:
  182. new_node = MinNode(type=token_labels.NAME, name=name)
  183. elif name_leaf.type == syms.Alternatives:
  184. new_node = reduce_tree(alternatives_node, parent)
  185. #handle repeaters
  186. if has_repeater:
  187. if repeater_node.children[0].value == '*':
  188. #reduce to None
  189. new_node = None
  190. elif repeater_node.children[0].value == '+':
  191. #reduce to a single occurrence i.e. do nothing
  192. pass
  193. else:
  194. #TODO: handle {min, max} repeaters
  195. raise NotImplementedError
  196. pass
  197. #add children
  198. if details_node and new_node is not None:
  199. for child in details_node.children[1:-1]:
  200. #skip '<', '>' markers
  201. reduced = reduce_tree(child, new_node)
  202. if reduced is not None:
  203. new_node.children.append(reduced)
  204. if new_node:
  205. new_node.parent = parent
  206. return new_node
  207. def get_characteristic_subpattern(subpatterns):
  208. """Picks the most characteristic from a list of linear patterns
  209. Current order used is:
  210. names > common_names > common_chars
  211. """
  212. if not isinstance(subpatterns, list):
  213. return subpatterns
  214. if len(subpatterns)==1:
  215. return subpatterns[0]
  216. # first pick out the ones containing variable names
  217. subpatterns_with_names = []
  218. subpatterns_with_common_names = []
  219. common_names = ['in', 'for', 'if' , 'not', 'None']
  220. subpatterns_with_common_chars = []
  221. common_chars = "[]().,:"
  222. for subpattern in subpatterns:
  223. if any(rec_test(subpattern, lambda x: type(x) is str)):
  224. if any(rec_test(subpattern,
  225. lambda x: isinstance(x, str) and x in common_chars)):
  226. subpatterns_with_common_chars.append(subpattern)
  227. elif any(rec_test(subpattern,
  228. lambda x: isinstance(x, str) and x in common_names)):
  229. subpatterns_with_common_names.append(subpattern)
  230. else:
  231. subpatterns_with_names.append(subpattern)
  232. if subpatterns_with_names:
  233. subpatterns = subpatterns_with_names
  234. elif subpatterns_with_common_names:
  235. subpatterns = subpatterns_with_common_names
  236. elif subpatterns_with_common_chars:
  237. subpatterns = subpatterns_with_common_chars
  238. # of the remaining subpatterns pick out the longest one
  239. return max(subpatterns, key=len)
  240. def rec_test(sequence, test_func):
  241. """Tests test_func on all items of sequence and items of included
  242. sub-iterables"""
  243. for x in sequence:
  244. if isinstance(x, (list, tuple)):
  245. yield from rec_test(x, test_func)
  246. else:
  247. yield test_func(x)