123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- "Utility functions used by the btm_matcher module"
- from . import pytree
- from .pgen2 import grammar, token
- from .pygram import pattern_symbols, python_symbols
- syms = pattern_symbols
- pysyms = python_symbols
- tokens = grammar.opmap
- token_labels = token
- TYPE_ANY = -1
- TYPE_ALTERNATIVES = -2
- TYPE_GROUP = -3
- class MinNode(object):
- """This class serves as an intermediate representation of the
- pattern tree during the conversion to sets of leaf-to-root
- subpatterns"""
- def __init__(self, type=None, name=None):
- self.type = type
- self.name = name
- self.children = []
- self.leaf = False
- self.parent = None
- self.alternatives = []
- self.group = []
- def __repr__(self):
- return str(self.type) + ' ' + str(self.name)
- def leaf_to_root(self):
- """Internal method. Returns a characteristic path of the
- pattern tree. This method must be run for all leaves until the
- linear subpatterns are merged into a single"""
- node = self
- subp = []
- while node:
- if node.type == TYPE_ALTERNATIVES:
- node.alternatives.append(subp)
- if len(node.alternatives) == len(node.children):
- #last alternative
- subp = [tuple(node.alternatives)]
- node.alternatives = []
- node = node.parent
- continue
- else:
- node = node.parent
- subp = None
- break
- if node.type == TYPE_GROUP:
- node.group.append(subp)
- #probably should check the number of leaves
- if len(node.group) == len(node.children):
- subp = get_characteristic_subpattern(node.group)
- node.group = []
- node = node.parent
- continue
- else:
- node = node.parent
- subp = None
- break
- if node.type == token_labels.NAME and node.name:
- #in case of type=name, use the name instead
- subp.append(node.name)
- else:
- subp.append(node.type)
- node = node.parent
- return subp
- def get_linear_subpattern(self):
- """Drives the leaf_to_root method. The reason that
- leaf_to_root must be run multiple times is because we need to
- reject 'group' matches; for example the alternative form
- (a | b c) creates a group [b c] that needs to be matched. Since
- matching multiple linear patterns overcomes the automaton's
- capabilities, leaf_to_root merges each group into a single
- choice based on 'characteristic'ity,
- i.e. (a|b c) -> (a|b) if b more characteristic than c
- Returns: The most 'characteristic'(as defined by
- get_characteristic_subpattern) path for the compiled pattern
- tree.
- """
- for l in self.leaves():
- subp = l.leaf_to_root()
- if subp:
- return subp
- def leaves(self):
- "Generator that returns the leaves of the tree"
- for child in self.children:
- yield from child.leaves()
- if not self.children:
- yield self
- def reduce_tree(node, parent=None):
- """
- Internal function. Reduces a compiled pattern tree to an
- intermediate representation suitable for feeding the
- automaton. This also trims off any optional pattern elements(like
- [a], a*).
- """
- new_node = None
- #switch on the node type
- if node.type == syms.Matcher:
- #skip
- node = node.children[0]
- if node.type == syms.Alternatives :
- #2 cases
- if len(node.children) <= 2:
- #just a single 'Alternative', skip this node
- new_node = reduce_tree(node.children[0], parent)
- else:
- #real alternatives
- new_node = MinNode(type=TYPE_ALTERNATIVES)
- #skip odd children('|' tokens)
- for child in node.children:
- if node.children.index(child)%2:
- continue
- reduced = reduce_tree(child, new_node)
- if reduced is not None:
- new_node.children.append(reduced)
- elif node.type == syms.Alternative:
- if len(node.children) > 1:
- new_node = MinNode(type=TYPE_GROUP)
- for child in node.children:
- reduced = reduce_tree(child, new_node)
- if reduced:
- new_node.children.append(reduced)
- if not new_node.children:
- # delete the group if all of the children were reduced to None
- new_node = None
- else:
- new_node = reduce_tree(node.children[0], parent)
- elif node.type == syms.Unit:
- if (isinstance(node.children[0], pytree.Leaf) and
- node.children[0].value == '('):
- #skip parentheses
- return reduce_tree(node.children[1], parent)
- if ((isinstance(node.children[0], pytree.Leaf) and
- node.children[0].value == '[')
- or
- (len(node.children)>1 and
- hasattr(node.children[1], "value") and
- node.children[1].value == '[')):
- #skip whole unit if its optional
- return None
- leaf = True
- details_node = None
- alternatives_node = None
- has_repeater = False
- repeater_node = None
- has_variable_name = False
- for child in node.children:
- if child.type == syms.Details:
- leaf = False
- details_node = child
- elif child.type == syms.Repeater:
- has_repeater = True
- repeater_node = child
- elif child.type == syms.Alternatives:
- alternatives_node = child
- if hasattr(child, 'value') and child.value == '=': # variable name
- has_variable_name = True
- #skip variable name
- if has_variable_name:
- #skip variable name, '='
- name_leaf = node.children[2]
- if hasattr(name_leaf, 'value') and name_leaf.value == '(':
- # skip parenthesis
- name_leaf = node.children[3]
- else:
- name_leaf = node.children[0]
- #set node type
- if name_leaf.type == token_labels.NAME:
- #(python) non-name or wildcard
- if name_leaf.value == 'any':
- new_node = MinNode(type=TYPE_ANY)
- else:
- if hasattr(token_labels, name_leaf.value):
- new_node = MinNode(type=getattr(token_labels, name_leaf.value))
- else:
- new_node = MinNode(type=getattr(pysyms, name_leaf.value))
- elif name_leaf.type == token_labels.STRING:
- #(python) name or character; remove the apostrophes from
- #the string value
- name = name_leaf.value.strip("'")
- if name in tokens:
- new_node = MinNode(type=tokens[name])
- else:
- new_node = MinNode(type=token_labels.NAME, name=name)
- elif name_leaf.type == syms.Alternatives:
- new_node = reduce_tree(alternatives_node, parent)
- #handle repeaters
- if has_repeater:
- if repeater_node.children[0].value == '*':
- #reduce to None
- new_node = None
- elif repeater_node.children[0].value == '+':
- #reduce to a single occurrence i.e. do nothing
- pass
- else:
- #TODO: handle {min, max} repeaters
- raise NotImplementedError
- #add children
- if details_node and new_node is not None:
- for child in details_node.children[1:-1]:
- #skip '<', '>' markers
- reduced = reduce_tree(child, new_node)
- if reduced is not None:
- new_node.children.append(reduced)
- if new_node:
- new_node.parent = parent
- return new_node
- def get_characteristic_subpattern(subpatterns):
- """Picks the most characteristic from a list of linear patterns
- Current order used is:
- names > common_names > common_chars
- """
- if not isinstance(subpatterns, list):
- return subpatterns
- if len(subpatterns)==1:
- return subpatterns[0]
- # first pick out the ones containing variable names
- subpatterns_with_names = []
- subpatterns_with_common_names = []
- common_names = ['in', 'for', 'if' , 'not', 'None']
- subpatterns_with_common_chars = []
- common_chars = "[]().,:"
- for subpattern in subpatterns:
- if any(rec_test(subpattern, lambda x: type(x) is str)):
- if any(rec_test(subpattern,
- lambda x: isinstance(x, str) and x in common_chars)):
- subpatterns_with_common_chars.append(subpattern)
- elif any(rec_test(subpattern,
- lambda x: isinstance(x, str) and x in common_names)):
- subpatterns_with_common_names.append(subpattern)
- else:
- subpatterns_with_names.append(subpattern)
- if subpatterns_with_names:
- subpatterns = subpatterns_with_names
- elif subpatterns_with_common_names:
- subpatterns = subpatterns_with_common_names
- elif subpatterns_with_common_chars:
- subpatterns = subpatterns_with_common_chars
- # of the remaining subpatterns pick out the longest one
- return max(subpatterns, key=len)
- def rec_test(sequence, test_func):
- """Tests test_func on all items of sequence and items of included
- sub-iterables"""
- for x in sequence:
- if isinstance(x, (list, tuple)):
- yield from rec_test(x, test_func)
- else:
- yield test_func(x)
|