Lexicon.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
# cython: language_level=3, py2_import=True
#
#   Cython Scanner - Lexical Definitions
#
  5. from __future__ import absolute_import, unicode_literals
  6. raw_prefixes = "rR"
  7. bytes_prefixes = "bB"
  8. string_prefixes = "fFuU" + bytes_prefixes
  9. char_prefixes = "cC"
  10. any_string_prefix = raw_prefixes + string_prefixes + char_prefixes
  11. IDENT = 'IDENT'
  12. def make_lexicon():
  13. from ..Plex import \
  14. Str, Any, AnyBut, AnyChar, Rep, Rep1, Opt, Bol, Eol, Eof, \
  15. TEXT, IGNORE, State, Lexicon
  16. from .Scanning import Method
  17. letter = Any("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_")
  18. digit = Any("0123456789")
  19. bindigit = Any("01")
  20. octdigit = Any("01234567")
  21. hexdigit = Any("0123456789ABCDEFabcdef")
  22. indentation = Bol + Rep(Any(" \t"))
  23. def underscore_digits(d):
  24. return Rep1(d) + Rep(Str("_") + Rep1(d))
  25. decimal = underscore_digits(digit)
  26. dot = Str(".")
  27. exponent = Any("Ee") + Opt(Any("+-")) + decimal
  28. decimal_fract = (decimal + dot + Opt(decimal)) | (dot + decimal)
  29. name = letter + Rep(letter | digit)
  30. intconst = decimal | (Str("0") + ((Any("Xx") + underscore_digits(hexdigit)) |
  31. (Any("Oo") + underscore_digits(octdigit)) |
  32. (Any("Bb") + underscore_digits(bindigit)) ))
  33. intsuffix = (Opt(Any("Uu")) + Opt(Any("Ll")) + Opt(Any("Ll"))) | (Opt(Any("Ll")) + Opt(Any("Ll")) + Opt(Any("Uu")))
  34. intliteral = intconst + intsuffix
  35. fltconst = (decimal_fract + Opt(exponent)) | (decimal + exponent)
  36. imagconst = (intconst | fltconst) + Any("jJ")
  37. # invalid combinations of prefixes are caught in p_string_literal
  38. beginstring = Opt(Rep(Any(string_prefixes + raw_prefixes)) |
  39. Any(char_prefixes)
  40. ) + (Str("'") | Str('"') | Str("'''") | Str('"""'))
  41. two_oct = octdigit + octdigit
  42. three_oct = octdigit + octdigit + octdigit
  43. two_hex = hexdigit + hexdigit
  44. four_hex = two_hex + two_hex
  45. escapeseq = Str("\\") + (two_oct | three_oct |
  46. Str('N{') + Rep(AnyBut('}')) + Str('}') |
  47. Str('u') + four_hex | Str('x') + two_hex |
  48. Str('U') + four_hex + four_hex | AnyChar)
  49. bra = Any("([{")
  50. ket = Any(")]}")
  51. punct = Any(":,;+-*/|&<>=.%`~^?!@")
  52. diphthong = Str("==", "<>", "!=", "<=", ">=", "<<", ">>", "**", "//",
  53. "+=", "-=", "*=", "/=", "%=", "|=", "^=", "&=",
  54. "<<=", ">>=", "**=", "//=", "->", "@=")
  55. spaces = Rep1(Any(" \t\f"))
  56. escaped_newline = Str("\\\n")
  57. lineterm = Eol + Opt(Str("\n"))
  58. comment = Str("#") + Rep(AnyBut("\n"))
  59. return Lexicon([
  60. (name, IDENT),
  61. (intliteral, Method('strip_underscores', symbol='INT')),
  62. (fltconst, Method('strip_underscores', symbol='FLOAT')),
  63. (imagconst, Method('strip_underscores', symbol='IMAG')),
  64. (punct | diphthong, TEXT),
  65. (bra, Method('open_bracket_action')),
  66. (ket, Method('close_bracket_action')),
  67. (lineterm, Method('newline_action')),
  68. (beginstring, Method('begin_string_action')),
  69. (comment, IGNORE),
  70. (spaces, IGNORE),
  71. (escaped_newline, IGNORE),
  72. State('INDENT', [
  73. (comment + lineterm, Method('commentline')),
  74. (Opt(spaces) + Opt(comment) + lineterm, IGNORE),
  75. (indentation, Method('indentation_action')),
  76. (Eof, Method('eof_action'))
  77. ]),
  78. State('SQ_STRING', [
  79. (escapeseq, 'ESCAPE'),
  80. (Rep1(AnyBut("'\"\n\\")), 'CHARS'),
  81. (Str('"'), 'CHARS'),
  82. (Str("\n"), Method('unclosed_string_action')),
  83. (Str("'"), Method('end_string_action')),
  84. (Eof, 'EOF')
  85. ]),
  86. State('DQ_STRING', [
  87. (escapeseq, 'ESCAPE'),
  88. (Rep1(AnyBut('"\n\\')), 'CHARS'),
  89. (Str("'"), 'CHARS'),
  90. (Str("\n"), Method('unclosed_string_action')),
  91. (Str('"'), Method('end_string_action')),
  92. (Eof, 'EOF')
  93. ]),
  94. State('TSQ_STRING', [
  95. (escapeseq, 'ESCAPE'),
  96. (Rep1(AnyBut("'\"\n\\")), 'CHARS'),
  97. (Any("'\""), 'CHARS'),
  98. (Str("\n"), 'NEWLINE'),
  99. (Str("'''"), Method('end_string_action')),
  100. (Eof, 'EOF')
  101. ]),
  102. State('TDQ_STRING', [
  103. (escapeseq, 'ESCAPE'),
  104. (Rep1(AnyBut('"\'\n\\')), 'CHARS'),
  105. (Any("'\""), 'CHARS'),
  106. (Str("\n"), 'NEWLINE'),
  107. (Str('"""'), Method('end_string_action')),
  108. (Eof, 'EOF')
  109. ]),
  110. (Eof, Method('eof_action'))
  111. ],
  112. # FIXME: Plex 1.9 needs different args here from Plex 1.1.4
  113. #debug_flags = scanner_debug_flags,
  114. #debug_file = scanner_dump_file
  115. )