  1. # helpers.py
  2. import html.entities
  3. import re
  4. import sys
  5. import typing
  6. from . import __diag__
  7. from .core import *
  8. from .util import (
  9. _bslash,
  10. _flatten,
  11. _escape_regex_range_chars,
  12. replaced_by_pep8,
  13. )
  14. #
  15. # global helpers
  16. #
def counted_array(
    expr: ParserElement,
    int_expr: typing.Optional[ParserElement] = None,
    *,
    intExpr: typing.Optional[ParserElement] = None,
) -> ParserElement:
    """Helper to define a counted list of expressions.

    This helper defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens returns the array of expr tokens as a list - the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value.

    Example::

        counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']

        # in this parser, the leading integer value is given in binary,
        # '10' indicating that 2 values are in the array
        binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']

        # if other fields must be parsed after the count but before the
        # list items, give the fields results names and they will
        # be preserved in the returned ParseResults:
        count_with_metadata = integer + Word(alphas)("type")
        typed_array = counted_array(Word(alphanums), int_expr=count_with_metadata)("items")
        result = typed_array.parse_string("3 bool True True False")
        print(result.dump())

        # prints
        # ['True', 'True', 'False']
        # - items: ['True', 'True', 'False']
        # - type: 'bool'
    """
    # pre-PEP8 keyword takes precedence for backward compatibility
    intExpr = intExpr or int_expr
    array_expr = Forward()

    def count_field_parse_action(s, l, t):
        # once the count has been parsed, rebind array_expr to match exactly
        # that many repetitions of expr (or nothing, for a zero count)
        nonlocal array_expr
        n = t[0]
        array_expr <<= (expr * n) if n else Empty()
        # clear list contents, but keep any named results
        del t[:]

    if intExpr is None:
        # default count expression: a decimal integer
        intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        # copy so that the caller's expression is not mutated below
        intExpr = intExpr.copy()
    intExpr.set_name("arrayLen")
    # call_during_try=True so lookahead/backtracking attempts also rebind array_expr
    intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
    return (intExpr + array_expr).set_name("(len) " + str(expr) + "...")
def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches a previous literal, will also match the leading
    ``"1:1"`` in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
    """
    rep = Forward()

    def copy_token_to_repeater(s, l, t):
        # rebind `rep` to literally match whatever text expr just matched
        if t:
            if len(t) == 1:
                rep << t[0]
            else:
                # flatten t tokens
                tflat = _flatten(t.as_list())
                rep << And(Literal(tt) for tt in tflat)
        else:
            # expr produced no tokens - the repeat matches nothing
            rep << Empty()

    # callDuringTry=True so lookahead/backtracking attempts also rebind rep
    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    rep.set_name("(prev) " + str(expr))
    return rep
  91. def match_previous_expr(expr: ParserElement) -> ParserElement:
  92. """Helper to define an expression that is indirectly defined from
  93. the tokens matched in a previous expression, that is, it looks for
  94. a 'repeat' of a previous expression. For example::
  95. first = Word(nums)
  96. second = match_previous_expr(first)
  97. match_expr = first + ":" + second
  98. will match ``"1:1"``, but not ``"1:2"``. Because this
  99. matches by expressions, will *not* match the leading ``"1:1"``
  100. in ``"1:10"``; the expressions are evaluated first, and then
  101. compared, so ``"1"`` is compared with ``"10"``. Do *not* use
  102. with packrat parsing enabled.
  103. """
  104. rep = Forward()
  105. e2 = expr.copy()
  106. rep <<= e2
  107. def copy_token_to_repeater(s, l, t):
  108. matchTokens = _flatten(t.as_list())
  109. def must_match_these_tokens(s, l, t):
  110. theseTokens = _flatten(t.as_list())
  111. if theseTokens != matchTokens:
  112. raise ParseException(
  113. s, l, f"Expected {matchTokens}, found{theseTokens}"
  114. )
  115. rep.set_parse_action(must_match_these_tokens, callDuringTry=True)
  116. expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
  117. rep.set_name("(prev) " + str(expr))
  118. return rep
def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    Parameters:

    - ``strs`` - a string of space-delimited literals, or a collection of
      string literals
    - ``caseless`` - treat all literals as caseless - (default= ``False``)
    - ``use_regex`` - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or ``as_keyword=True``, or if
      creating a :class:`Regex` raises an exception) - (default= ``True``)
    - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
      generated expressions - (default= ``False``)
    - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
      but will be removed in a future release

    Example::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))

    prints::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # merge pre-PEP8 compatibility args with the current names
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a string passed for `caseless` usually means several strings were
    # passed positionally by mistake - warn if diagnostics are enabled
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    if caseless:
        isequal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
        parseElementClass = CaselessKeyword if asKeyword else CaselessLiteral
    else:
        isequal = lambda a, b: a == b
        masks = lambda a, b: b.startswith(a)
        parseElementClass = Keyword if asKeyword else Literal

    symbols: List[str] = []
    if isinstance(strs, str_type):
        strs = typing.cast(str, strs)
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    if any(len(sym) > 1 for sym in symbols):
        i = 0
        while i < len(symbols) - 1:
            cur = symbols[i]
            for j, other in enumerate(symbols[i + 1 :]):
                if isequal(other, cur):
                    # exact duplicate - drop the later occurrence
                    del symbols[i + j + 1]
                    break
                elif masks(cur, other):
                    # `other` is masked by the shorter `cur`: move the longer
                    # symbol ahead of it so it is tried first
                    del symbols[i + j + 1]
                    symbols.insert(i, other)
                    break
            else:
                i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0
        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = rf"\b(?:{patt})\b"

            ret = Regex(patt, flags=re_flags).set_name(" | ".join(symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst
    return MatchFirst(parseElementClass(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )
  225. def dict_of(key: ParserElement, value: ParserElement) -> ParserElement:
  226. """Helper to easily and clearly define a dictionary by specifying
  227. the respective patterns for the key and value. Takes care of
  228. defining the :class:`Dict`, :class:`ZeroOrMore`, and
  229. :class:`Group` tokens in the proper order. The key pattern
  230. can include delimiting markers or punctuation, as long as they are
  231. suppressed, thereby leaving the significant key text. The value
  232. pattern can include named results, so that the :class:`Dict` results
  233. can include named token fields.
  234. Example::
  235. text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
  236. attr_expr = (label + Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join))
  237. print(attr_expr[1, ...].parse_string(text).dump())
  238. attr_label = label
  239. attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)
  240. # similar to Dict, but simpler call format
  241. result = dict_of(attr_label, attr_value).parse_string(text)
  242. print(result.dump())
  243. print(result['shape'])
  244. print(result.shape) # object attribute access works too
  245. print(result.as_dict())
  246. prints::
  247. [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
  248. - color: 'light blue'
  249. - posn: 'upper left'
  250. - shape: 'SQUARE'
  251. - texture: 'burlap'
  252. SQUARE
  253. SQUARE
  254. {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
  255. """
  256. return Dict(OneOrMore(Group(key + value)))
def original_text_for(
    expr: ParserElement, as_string: bool = True, *, asString: bool = True
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as
    ``False``, then the return value is
    a :class:`ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string. So if the expression passed to
    :class:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you
    want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    # merge pre-PEP8 asString with as_string
    asString = asString and as_string

    # zero-width markers that record the parse location before and after expr
    locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
    endlocMarker = locMarker.copy()
    # do not skip leading whitespace before recording the end location
    endlocMarker.callPreparse = False
    matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
    if asString:
        extractText = lambda s, l, t: s[t._original_start : t._original_end]
    else:

        def extractText(s, l, t):
            # replace tokens with the original text slice, popping the
            # marker names so they do not leak into the results
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    matchExpr.set_parse_action(extractText)
    matchExpr.ignoreExprs = expr.ignoreExprs
    matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return matchExpr
  299. def ungroup(expr: ParserElement) -> ParserElement:
  300. """Helper to undo pyparsing's default grouping of And expressions,
  301. even if all but one are non-empty.
  302. """
  303. return TokenConverter(expr).add_parse_action(lambda t: t[0])
def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    (DEPRECATED - future code should use the :class:`Located` class)
    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :class:`ParserElement.parse_with_tabs`

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    # zero-width expression whose parse action yields the current location
    locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
    return Group(
        locator("locn_start")
        + expr("value")
        # leaveWhitespace so the end location is recorded immediately after
        # the matched text, not after any following whitespace
        + locator.copy().leaveWhitespace()("locn_end")
    )
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: ParserElement = quoted_string(),
    *,
    ignoreExpr: ParserElement = quoted_string(),
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    Parameters:

    - ``opener`` - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression
    - ``closer`` - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression
    - ``content`` - expression for items within the nested lists
      (default= ``None``)
    - ``ignore_expr`` - expression for ignoring opening and closing delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
      but will be removed in a future release

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)

    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile pre-PEP8 ignoreExpr with ignore_expr
    # NOTE(review): this relies on ParserElement equality semantics - a
    # freshly constructed quoted_string() is a distinct object, so these
    # comparisons are effectively identity tests; confirm against core's
    # __eq__ before changing
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr == quoted_string() else ignoreExpr
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        # no content expression given - build a default one that captures
        # whitespace-delimited runs that are not delimiters (nor ignored)
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-character delimiters can simply be excluded from the
                # character set of the content words
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    ).set_parse_action(lambda t: t[0].strip())
                else:
                    content = empty.copy() + CharsNotIn(
                        opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                    ).set_parse_action(lambda t: t[0].strip())
            else:
                # multi-character delimiters must be excluded by negative lookahead
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    ).set_parse_action(lambda t: t[0].strip())
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    ).set_parse_action(lambda t: t[0].strip())
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )
    # recursive grammar: opener + zero-or-more (ignored | nested | content) + closer
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_name("nested %s%s expression" % (opener, closer))
    return ret
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name"""
    if isinstance(tagStr, str_type):
        resname = tagStr
        # HTML tag names match caselessly; XML tag names are case-sensitive
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    # attribute names may contain namespace-style ':' plus '-' and '_'
    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        # XML attribute values must be double-quoted
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # "empty" result is True for self-closing tags like <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        # HTML allows quoted or unquoted attribute values; attribute names
        # are normalized to lower case
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name("<%s>" % resname)
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name("</%s>" % resname)
    openTag.tag = resname
    closeTag.tag = resname
    # convenience expression matching everything up to the closing tag
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
  500. def make_html_tags(
  501. tag_str: Union[str, ParserElement]
  502. ) -> Tuple[ParserElement, ParserElement]:
  503. """Helper to construct opening and closing tag expressions for HTML,
  504. given a tag name. Matches tags in either upper or lower case,
  505. attributes with namespaces and with quoted or unquoted values.
  506. Example::
  507. text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
  508. # make_html_tags returns pyparsing expressions for the opening and
  509. # closing tags as a 2-tuple
  510. a, a_end = make_html_tags("A")
  511. link_expr = a + SkipTo(a_end)("link_text") + a_end
  512. for link in link_expr.search_string(text):
  513. # attributes in the <A> tag (like "href" shown here) are
  514. # also accessible as named results
  515. print(link.link_text, '->', link.href)
  516. prints::
  517. pyparsing -> https://github.com/pyparsing/pyparsing/wiki
  518. """
  519. return _makeTags(tag_str, False)
  520. def make_xml_tags(
  521. tag_str: Union[str, ParserElement]
  522. ) -> Tuple[ParserElement, ParserElement]:
  523. """Helper to construct opening and closing tag expressions for XML,
  524. given a tag name. Matches tags only in the given upper/lower case.
  525. Example: similar to :class:`make_html_tags`
  526. """
  527. return _makeTags(tag_str, True)
# module-level convenience expressions matching any HTML open/close tag
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# map HTML5 entity names (sans trailing ';') to their replacement characters
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
# matches "&name;" for any known entity, capturing the name as "entity"
common_html_entity = Regex("&(?P<entity>" + "|".join(_htmlEntityMap) + ");").set_name(
    "common HTML entity"
)
  537. def replace_html_entity(s, l, t):
  538. """Helper parser action to replace common HTML entities with their special characters"""
  539. return _htmlEntityMap.get(t.entity)
class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    LEFT = 1   # operators at this level group left-to-right
    RIGHT = 2  # operators at this level group right-to-left
# operator-expression member of an infix_notation operator spec: a single
# expression or string, or (for ternary operators) a pair of them
InfixNotationOperatorArgType = Union[
    ParserElement, str, Tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# one precedence level for infix_notation:
# (op_expr, num_operands, associativity[, parse_action])
InfixNotationOperatorSpec = Union[
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
  561. def infix_notation(
  562. base_expr: ParserElement,
  563. op_list: List[InfixNotationOperatorSpec],
  564. lpar: Union[str, ParserElement] = Suppress("("),
  565. rpar: Union[str, ParserElement] = Suppress(")"),
  566. ) -> ParserElement:
  567. """Helper method for constructing grammars of expressions made up of
  568. operators working in a precedence hierarchy. Operators may be unary
  569. or binary, left- or right-associative. Parse actions can also be
  570. attached to operator expressions. The generated parser will also
  571. recognize the use of parentheses to override operator precedences
  572. (see example below).
  573. Note: if you define a deep operator list, you may see performance
  574. issues when using infix_notation. See
  575. :class:`ParserElement.enable_packrat` for a mechanism to potentially
  576. improve your parser performance.
  577. Parameters:
  578. - ``base_expr`` - expression representing the most basic operand to
  579. be used in the expression
  580. - ``op_list`` - list of tuples, one for each operator precedence level
  581. in the expression grammar; each tuple is of the form ``(op_expr,
  582. num_operands, right_left_assoc, (optional)parse_action)``, where:
  583. - ``op_expr`` is the pyparsing expression for the operator; may also
  584. be a string, which will be converted to a Literal; if ``num_operands``
  585. is 3, ``op_expr`` is a tuple of two expressions, for the two
  586. operators separating the 3 terms
  587. - ``num_operands`` is the number of terms for this operator (must be 1,
  588. 2, or 3)
  589. - ``right_left_assoc`` is the indicator whether the operator is right
  590. or left associative, using the pyparsing-defined constants
  591. ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
  592. - ``parse_action`` is the parse action to be associated with
  593. expressions matching this operator expression (the parse action
  594. tuple member may be omitted); if the parse action is passed
  595. a tuple or list of functions, this is equivalent to calling
  596. ``set_parse_action(*fn)``
  597. (:class:`ParserElement.set_parse_action`)
  598. - ``lpar`` - expression for matching left-parentheses; if passed as a
  599. str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
  600. an expression (such as ``Literal('(')``), then it will be kept in
  601. the parsed results, and grouped with them. (default= ``Suppress('(')``)
  602. - ``rpar`` - expression for matching right-parentheses; if passed as a
  603. str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
  604. an expression (such as ``Literal(')')``), then it will be kept in
  605. the parsed results, and grouped with them. (default= ``Suppress(')')``)
  606. Example::
  607. # simple example of four-function arithmetic with ints and
  608. # variable names
  609. integer = pyparsing_common.signed_integer
  610. varname = pyparsing_common.identifier
  611. arith_expr = infix_notation(integer | varname,
  612. [
  613. ('-', 1, OpAssoc.RIGHT),
  614. (one_of('* /'), 2, OpAssoc.LEFT),
  615. (one_of('+ -'), 2, OpAssoc.LEFT),
  616. ])
  617. arith_expr.run_tests('''
  618. 5+3*6
  619. (5+3)*6
  620. -2--11
  621. ''', full_dump=False)
  622. prints::
  623. 5+3*6
  624. [[5, '+', [3, '*', 6]]]
  625. (5+3)*6
  626. [[[5, '+', 3], '*', 6]]
  627. (5+x)*y
  628. [[[5, '+', 'x'], '*', 'y']]
  629. -2--11
  630. [[['-', 2], '-', ['-', 11]]]
  631. """
  632. # captive version of FollowedBy that does not do parse actions or capture results names
  633. class _FB(FollowedBy):
  634. def parseImpl(self, instring, loc, doActions=True):
  635. self.expr.try_parse(instring, loc)
  636. return loc, []
  637. _FB.__name__ = "FollowedBy>"
  638. ret = Forward()
  639. if isinstance(lpar, str):
  640. lpar = Suppress(lpar)
  641. if isinstance(rpar, str):
  642. rpar = Suppress(rpar)
  643. # if lpar and rpar are not suppressed, wrap in group
  644. if not (isinstance(rpar, Suppress) and isinstance(rpar, Suppress)):
  645. lastExpr = base_expr | Group(lpar + ret + rpar)
  646. else:
  647. lastExpr = base_expr | (lpar + ret + rpar)
  648. arity: int
  649. rightLeftAssoc: opAssoc
  650. pa: typing.Optional[ParseAction]
  651. opExpr1: ParserElement
  652. opExpr2: ParserElement
  653. for i, operDef in enumerate(op_list):
  654. opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4] # type: ignore[assignment]
  655. if isinstance(opExpr, str_type):
  656. opExpr = ParserElement._literalStringClass(opExpr)
  657. opExpr = typing.cast(ParserElement, opExpr)
  658. if arity == 3:
  659. if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
  660. raise ValueError(
  661. "if numterms=3, opExpr must be a tuple or list of two expressions"
  662. )
  663. opExpr1, opExpr2 = opExpr
  664. term_name = f"{opExpr1}{opExpr2} term"
  665. else:
  666. term_name = f"{opExpr} term"
  667. if not 1 <= arity <= 3:
  668. raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
  669. if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
  670. raise ValueError("operator must indicate right or left associativity")
  671. thisExpr: ParserElement = Forward().set_name(term_name)
  672. thisExpr = typing.cast(Forward, thisExpr)
  673. if rightLeftAssoc is OpAssoc.LEFT:
  674. if arity == 1:
  675. matchExpr = _FB(lastExpr + opExpr) + Group(lastExpr + opExpr[1, ...])
  676. elif arity == 2:
  677. if opExpr is not None:
  678. matchExpr = _FB(lastExpr + opExpr + lastExpr) + Group(
  679. lastExpr + (opExpr + lastExpr)[1, ...]
  680. )
  681. else:
  682. matchExpr = _FB(lastExpr + lastExpr) + Group(lastExpr[2, ...])
  683. elif arity == 3:
  684. matchExpr = _FB(
  685. lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
  686. ) + Group(lastExpr + OneOrMore(opExpr1 + lastExpr + opExpr2 + lastExpr))
  687. elif rightLeftAssoc is OpAssoc.RIGHT:
  688. if arity == 1:
  689. # try to avoid LR with this extra test
  690. if not isinstance(opExpr, Opt):
  691. opExpr = Opt(opExpr)
  692. matchExpr = _FB(opExpr.expr + thisExpr) + Group(opExpr + thisExpr)
  693. elif arity == 2:
  694. if opExpr is not None:
  695. matchExpr = _FB(lastExpr + opExpr + thisExpr) + Group(
  696. lastExpr + (opExpr + thisExpr)[1, ...]
  697. )
  698. else:
  699. matchExpr = _FB(lastExpr + thisExpr) + Group(
  700. lastExpr + thisExpr[1, ...]
  701. )
  702. elif arity == 3:
  703. matchExpr = _FB(
  704. lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
  705. ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
  706. if pa:
  707. if isinstance(pa, (tuple, list)):
  708. matchExpr.set_parse_action(*pa)
  709. else:
  710. matchExpr.set_parse_action(pa)
  711. thisExpr <<= (matchExpr | lastExpr).setName(term_name)
  712. lastExpr = thisExpr
  713. ret <<= lastExpr
  714. return ret
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    (DEPRECATED - use :class:`IndentedBlock` class instead)
    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE: the mutable default for ``backup_stacks`` is deliberate shared
    # state across calls - each call appends a snapshot of the caller's
    # indentStack, and the parse/fail actions below pop or restore from it.
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # Fail action: restore the caller's indentStack to the snapshot
        # taken at definition time, undoing any partial indent tracking.
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # Accept end-of-input; otherwise the column must equal the current
        # indentation level exactly - deeper is "illegal nesting", shallower
        # means this statement is not a peer of the block.
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # A sub-block must start strictly deeper than the current level;
        # push the new level onto the stack when it does.
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        # Accept end-of-input; otherwise the column must match some
        # previously-seen indentation level, and if it is shallower than the
        # current level, pop back out one level.
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # Newlines are matched explicitly (tabs/spaces only as whitespace) so the
    # column-checking parse actions above fire at true line starts.
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # Left-most block: no INDENT required, trailing UNDENT optional.
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # Allow backslash-continued lines inside block statements.
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common, so may as well make them available
# C comment body: any non-'*' char, or a '*' not followed by '/', then the closing '*/'.
c_style_comment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/").set_name(
    "C style comment"
)
"Comment of the form ``/* ... */``"

# Non-greedy [\s\S] spans newlines without needing re.DOTALL.
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# leave_whitespace so leading spaces are kept as part of the rest of the line.
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")

# Allows backslash-newline continuation inside a // comment.
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

cpp_style_comment = Combine(
    Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/" | dbl_slash_comment
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"

# build list of built-in expressions, for future reference if a global default value
# gets updated
# (snapshots every module-level ParserElement defined so far, in definition order)
_builtin_exprs: List[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
  859. # compatibility function, superseded by DelimitedList class
  860. def delimited_list(
  861. expr: Union[str, ParserElement],
  862. delim: Union[str, ParserElement] = ",",
  863. combine: bool = False,
  864. min: typing.Optional[int] = None,
  865. max: typing.Optional[int] = None,
  866. *,
  867. allow_trailing_delim: bool = False,
  868. ) -> ParserElement:
  869. """(DEPRECATED - use :class:`DelimitedList` class)"""
  870. return DelimitedList(
  871. expr, delim, combine, min, max, allow_trailing_delim=allow_trailing_delim
  872. )
# pre-PEP8 compatible names
# Legacy camelCase aliases kept for backward compatibility; new code should
# use the snake_case names defined above.
# fmt: off
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment

# Deprecated callable stubs: replaced_by_pep8 wires each name to the modern
# replacement (emitting a deprecation warning); the ``...`` bodies never run.
@replaced_by_pep8(DelimitedList)
def delimitedList(): ...

# NOTE(review): this stub shadows the ``delimited_list`` compatibility
# function defined earlier in this module; both ultimately resolve to
# DelimitedList, so callers see the same behavior - confirm intended.
@replaced_by_pep8(DelimitedList)
def delimited_list(): ...

@replaced_by_pep8(counted_array)
def countedArray(): ...

@replaced_by_pep8(match_previous_literal)
def matchPreviousLiteral(): ...

@replaced_by_pep8(match_previous_expr)
def matchPreviousExpr(): ...

@replaced_by_pep8(one_of)
def oneOf(): ...

@replaced_by_pep8(dict_of)
def dictOf(): ...

@replaced_by_pep8(original_text_for)
def originalTextFor(): ...

@replaced_by_pep8(nested_expr)
def nestedExpr(): ...

@replaced_by_pep8(make_html_tags)
def makeHTMLTags(): ...

@replaced_by_pep8(make_xml_tags)
def makeXMLTags(): ...

@replaced_by_pep8(replace_html_entity)
def replaceHTMLEntity(): ...

@replaced_by_pep8(infix_notation)
def infixNotation(): ...
# fmt: on