TreePath.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. """
  2. A simple XPath-like language for tree traversal.
  3. This works by creating a filter chain of generator functions. Each
  4. function selects a part of the expression, e.g. a child node, a
  5. specific descendant or a node that holds an attribute.
  6. """
  7. from __future__ import absolute_import
  8. import re
  9. import operator
  10. import sys
  11. if sys.version_info[0] >= 3:
  12. _unicode = str
  13. else:
  14. _unicode = unicode
  15. path_tokenizer = re.compile(
  16. r"("
  17. r"'[^']*'|\"[^\"]*\"|"
  18. r"//?|"
  19. r"\(\)|"
  20. r"==?|"
  21. r"[/.*\[\]()@])|"
  22. r"([^/\[\]()@=\s]+)|"
  23. r"\s+"
  24. ).findall
  25. def iterchildren(node, attr_name):
  26. # returns an iterable of all child nodes of that name
  27. child = getattr(node, attr_name)
  28. if child is not None:
  29. if type(child) is list:
  30. return child
  31. else:
  32. return [child]
  33. else:
  34. return ()
  35. def _get_first_or_none(it):
  36. try:
  37. try:
  38. _next = it.next
  39. except AttributeError:
  40. return next(it)
  41. else:
  42. return _next()
  43. except StopIteration:
  44. return None
  45. def type_name(node):
  46. return node.__class__.__name__.split('.')[-1]
  47. def parse_func(next, token):
  48. name = token[1]
  49. token = next()
  50. if token[0] != '(':
  51. raise ValueError("Expected '(' after function name '%s'" % name)
  52. predicate = handle_predicate(next, token)
  53. return name, predicate
  54. def handle_func_not(next, token):
  55. """
  56. not(...)
  57. """
  58. name, predicate = parse_func(next, token)
  59. def select(result):
  60. for node in result:
  61. if _get_first_or_none(predicate([node])) is None:
  62. yield node
  63. return select
  64. def handle_name(next, token):
  65. """
  66. /NodeName/
  67. or
  68. func(...)
  69. """
  70. name = token[1]
  71. if name in functions:
  72. return functions[name](next, token)
  73. def select(result):
  74. for node in result:
  75. for attr_name in node.child_attrs:
  76. for child in iterchildren(node, attr_name):
  77. if type_name(child) == name:
  78. yield child
  79. return select
  80. def handle_star(next, token):
  81. """
  82. /*/
  83. """
  84. def select(result):
  85. for node in result:
  86. for name in node.child_attrs:
  87. for child in iterchildren(node, name):
  88. yield child
  89. return select
  90. def handle_dot(next, token):
  91. """
  92. /./
  93. """
  94. def select(result):
  95. return result
  96. return select
  97. def handle_descendants(next, token):
  98. """
  99. //...
  100. """
  101. token = next()
  102. if token[0] == "*":
  103. def iter_recursive(node):
  104. for name in node.child_attrs:
  105. for child in iterchildren(node, name):
  106. yield child
  107. for c in iter_recursive(child):
  108. yield c
  109. elif not token[0]:
  110. node_name = token[1]
  111. def iter_recursive(node):
  112. for name in node.child_attrs:
  113. for child in iterchildren(node, name):
  114. if type_name(child) == node_name:
  115. yield child
  116. for c in iter_recursive(child):
  117. yield c
  118. else:
  119. raise ValueError("Expected node name after '//'")
  120. def select(result):
  121. for node in result:
  122. for child in iter_recursive(node):
  123. yield child
  124. return select
  125. def handle_attribute(next, token):
  126. token = next()
  127. if token[0]:
  128. raise ValueError("Expected attribute name")
  129. name = token[1]
  130. value = None
  131. try:
  132. token = next()
  133. except StopIteration:
  134. pass
  135. else:
  136. if token[0] == '=':
  137. value = parse_path_value(next)
  138. readattr = operator.attrgetter(name)
  139. if value is None:
  140. def select(result):
  141. for node in result:
  142. try:
  143. attr_value = readattr(node)
  144. except AttributeError:
  145. continue
  146. if attr_value is not None:
  147. yield attr_value
  148. else:
  149. def select(result):
  150. for node in result:
  151. try:
  152. attr_value = readattr(node)
  153. except AttributeError:
  154. continue
  155. if attr_value == value:
  156. yield attr_value
  157. elif (isinstance(attr_value, bytes) and isinstance(value, _unicode) and
  158. attr_value == value.encode()):
  159. # allow a bytes-to-string comparison too
  160. yield attr_value
  161. return select
  162. def parse_path_value(next):
  163. token = next()
  164. value = token[0]
  165. if value:
  166. if value[:1] == "'" or value[:1] == '"':
  167. return value[1:-1]
  168. try:
  169. return int(value)
  170. except ValueError:
  171. pass
  172. elif token[1].isdigit():
  173. return int(token[1])
  174. else:
  175. name = token[1].lower()
  176. if name == 'true':
  177. return True
  178. elif name == 'false':
  179. return False
  180. raise ValueError("Invalid attribute predicate: '%s'" % value)
  181. def handle_predicate(next, token):
  182. token = next()
  183. selector = []
  184. while token[0] != ']':
  185. selector.append( operations[token[0]](next, token) )
  186. try:
  187. token = next()
  188. except StopIteration:
  189. break
  190. else:
  191. if token[0] == "/":
  192. token = next()
  193. if not token[0] and token[1] == 'and':
  194. return logical_and(selector, handle_predicate(next, token))
  195. def select(result):
  196. for node in result:
  197. subresult = iter((node,))
  198. for select in selector:
  199. subresult = select(subresult)
  200. predicate_result = _get_first_or_none(subresult)
  201. if predicate_result is not None:
  202. yield node
  203. return select
  204. def logical_and(lhs_selects, rhs_select):
  205. def select(result):
  206. for node in result:
  207. subresult = iter((node,))
  208. for select in lhs_selects:
  209. subresult = select(subresult)
  210. predicate_result = _get_first_or_none(subresult)
  211. subresult = iter((node,))
  212. if predicate_result is not None:
  213. for result_node in rhs_select(subresult):
  214. yield node
  215. return select
  216. operations = {
  217. "@": handle_attribute,
  218. "": handle_name,
  219. "*": handle_star,
  220. ".": handle_dot,
  221. "//": handle_descendants,
  222. "[": handle_predicate,
  223. }
  224. functions = {
  225. 'not' : handle_func_not
  226. }
  227. def _build_path_iterator(path):
  228. # parse pattern
  229. stream = iter([ (special,text)
  230. for (special,text) in path_tokenizer(path)
  231. if special or text ])
  232. try:
  233. _next = stream.next
  234. except AttributeError:
  235. # Python 3
  236. def _next():
  237. return next(stream)
  238. token = _next()
  239. selector = []
  240. while 1:
  241. try:
  242. selector.append(operations[token[0]](_next, token))
  243. except StopIteration:
  244. raise ValueError("invalid path")
  245. try:
  246. token = _next()
  247. if token[0] == "/":
  248. token = _next()
  249. except StopIteration:
  250. break
  251. return selector
  252. # main module API
  253. def iterfind(node, path):
  254. selector_chain = _build_path_iterator(path)
  255. result = iter((node,))
  256. for select in selector_chain:
  257. result = select(result)
  258. return result
  259. def find_first(node, path):
  260. return _get_first_or_none(iterfind(node, path))
  261. def find_all(node, path):
  262. return list(iterfind(node, path))