"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser.  PyParser mostly gives information on the
proper indentation of code.  HyperParser gives additional information on
the structure of code.
"""
from keyword import iskeyword
import string

from idlelib import pyparse
  9. # all ASCII chars that may be in an identifier
  10. _ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
  11. # all ASCII chars that may be the first char of an identifier
  12. _ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
  13. # lookup table for whether 7-bit ASCII chars are valid in a Python identifier
  14. _IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
  15. # lookup table for whether 7-bit ASCII chars are valid as the first
  16. # char in a Python identifier
  17. _IS_ASCII_ID_FIRST_CHAR = \
  18. [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
  19. class HyperParser:
  20. def __init__(self, editwin, index):
  21. "To initialize, analyze the surroundings of the given index."
  22. self.editwin = editwin
  23. self.text = text = editwin.text
  24. parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)
  25. def index2line(index):
  26. return int(float(index))
  27. lno = index2line(text.index(index))
  28. if not editwin.prompt_last_line:
  29. for context in editwin.num_context_lines:
  30. startat = max(lno - context, 1)
  31. startatindex = repr(startat) + ".0"
  32. stopatindex = "%d.end" % lno
  33. # We add the newline because PyParse requires a newline
  34. # at end. We add a space so that index won't be at end
  35. # of line, so that its status will be the same as the
  36. # char before it, if should.
  37. parser.set_code(text.get(startatindex, stopatindex)+' \n')
  38. bod = parser.find_good_parse_start(
  39. editwin._build_char_in_string_func(startatindex))
  40. if bod is not None or startat == 1:
  41. break
  42. parser.set_lo(bod or 0)
  43. else:
  44. r = text.tag_prevrange("console", index)
  45. if r:
  46. startatindex = r[1]
  47. else:
  48. startatindex = "1.0"
  49. stopatindex = "%d.end" % lno
  50. # We add the newline because PyParse requires it. We add a
  51. # space so that index won't be at end of line, so that its
  52. # status will be the same as the char before it, if should.
  53. parser.set_code(text.get(startatindex, stopatindex)+' \n')
  54. parser.set_lo(0)
  55. # We want what the parser has, minus the last newline and space.
  56. self.rawtext = parser.code[:-2]
  57. # Parser.code apparently preserves the statement we are in, so
  58. # that stopatindex can be used to synchronize the string with
  59. # the text box indices.
  60. self.stopatindex = stopatindex
  61. self.bracketing = parser.get_last_stmt_bracketing()
  62. # find which pairs of bracketing are openers. These always
  63. # correspond to a character of rawtext.
  64. self.isopener = [i>0 and self.bracketing[i][1] >
  65. self.bracketing[i-1][1]
  66. for i in range(len(self.bracketing))]
  67. self.set_index(index)
  68. def set_index(self, index):
  69. """Set the index to which the functions relate.
  70. The index must be in the same statement.
  71. """
  72. indexinrawtext = (len(self.rawtext) -
  73. len(self.text.get(index, self.stopatindex)))
  74. if indexinrawtext < 0:
  75. raise ValueError("Index %s precedes the analyzed statement"
  76. % index)
  77. self.indexinrawtext = indexinrawtext
  78. # find the rightmost bracket to which index belongs
  79. self.indexbracket = 0
  80. while (self.indexbracket < len(self.bracketing)-1 and
  81. self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
  82. self.indexbracket += 1
  83. if (self.indexbracket < len(self.bracketing)-1 and
  84. self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
  85. not self.isopener[self.indexbracket+1]):
  86. self.indexbracket += 1
  87. def is_in_string(self):
  88. """Is the index given to the HyperParser in a string?"""
  89. # The bracket to which we belong should be an opener.
  90. # If it's an opener, it has to have a character.
  91. return (self.isopener[self.indexbracket] and
  92. self.rawtext[self.bracketing[self.indexbracket][0]]
  93. in ('"', "'"))
  94. def is_in_code(self):
  95. """Is the index given to the HyperParser in normal code?"""
  96. return (not self.isopener[self.indexbracket] or
  97. self.rawtext[self.bracketing[self.indexbracket][0]]
  98. not in ('#', '"', "'"))
  99. def get_surrounding_brackets(self, openers='([{', mustclose=False):
  100. """Return bracket indexes or None.
  101. If the index given to the HyperParser is surrounded by a
  102. bracket defined in openers (or at least has one before it),
  103. return the indices of the opening bracket and the closing
  104. bracket (or the end of line, whichever comes first).
  105. If it is not surrounded by brackets, or the end of line comes
  106. before the closing bracket and mustclose is True, returns None.
  107. """
  108. bracketinglevel = self.bracketing[self.indexbracket][1]
  109. before = self.indexbracket
  110. while (not self.isopener[before] or
  111. self.rawtext[self.bracketing[before][0]] not in openers or
  112. self.bracketing[before][1] > bracketinglevel):
  113. before -= 1
  114. if before < 0:
  115. return None
  116. bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
  117. after = self.indexbracket + 1
  118. while (after < len(self.bracketing) and
  119. self.bracketing[after][1] >= bracketinglevel):
  120. after += 1
  121. beforeindex = self.text.index("%s-%dc" %
  122. (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
  123. if (after >= len(self.bracketing) or
  124. self.bracketing[after][0] > len(self.rawtext)):
  125. if mustclose:
  126. return None
  127. afterindex = self.stopatindex
  128. else:
  129. # We are after a real char, so it is a ')' and we give the
  130. # index before it.
  131. afterindex = self.text.index(
  132. "%s-%dc" % (self.stopatindex,
  133. len(self.rawtext)-(self.bracketing[after][0]-1)))
  134. return beforeindex, afterindex
  135. # the set of built-in identifiers which are also keywords,
  136. # i.e. keyword.iskeyword() returns True for them
  137. _ID_KEYWORDS = frozenset({"True", "False", "None"})
  138. @classmethod
  139. def _eat_identifier(cls, str, limit, pos):
  140. """Given a string and pos, return the number of chars in the
  141. identifier which ends at pos, or 0 if there is no such one.
  142. This ignores non-identifier eywords are not identifiers.
  143. """
  144. is_ascii_id_char = _IS_ASCII_ID_CHAR
  145. # Start at the end (pos) and work backwards.
  146. i = pos
  147. # Go backwards as long as the characters are valid ASCII
  148. # identifier characters. This is an optimization, since it
  149. # is faster in the common case where most of the characters
  150. # are ASCII.
  151. while i > limit and (
  152. ord(str[i - 1]) < 128 and
  153. is_ascii_id_char[ord(str[i - 1])]
  154. ):
  155. i -= 1
  156. # If the above loop ended due to reaching a non-ASCII
  157. # character, continue going backwards using the most generic
  158. # test for whether a string contains only valid identifier
  159. # characters.
  160. if i > limit and ord(str[i - 1]) >= 128:
  161. while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
  162. i -= 4
  163. if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
  164. i -= 2
  165. if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
  166. i -= 1
  167. # The identifier candidate starts here. If it isn't a valid
  168. # identifier, don't eat anything. At this point that is only
  169. # possible if the first character isn't a valid first
  170. # character for an identifier.
  171. if not str[i:pos].isidentifier():
  172. return 0
  173. elif i < pos:
  174. # All characters in str[i:pos] are valid ASCII identifier
  175. # characters, so it is enough to check that the first is
  176. # valid as the first character of an identifier.
  177. if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
  178. return 0
  179. # All keywords are valid identifiers, but should not be
  180. # considered identifiers here, except for True, False and None.
  181. if i < pos and (
  182. iskeyword(str[i:pos]) and
  183. str[i:pos] not in cls._ID_KEYWORDS
  184. ):
  185. return 0
  186. return pos - i
  187. # This string includes all chars that may be in a white space
  188. _whitespace_chars = " \t\n\\"
  189. def get_expression(self):
  190. """Return a string with the Python expression which ends at the
  191. given index, which is empty if there is no real one.
  192. """
  193. if not self.is_in_code():
  194. raise ValueError("get_expression should only be called "
  195. "if index is inside a code.")
  196. rawtext = self.rawtext
  197. bracketing = self.bracketing
  198. brck_index = self.indexbracket
  199. brck_limit = bracketing[brck_index][0]
  200. pos = self.indexinrawtext
  201. last_identifier_pos = pos
  202. postdot_phase = True
  203. while True:
  204. # Eat whitespaces, comments, and if postdot_phase is False - a dot
  205. while True:
  206. if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
  207. # Eat a whitespace
  208. pos -= 1
  209. elif (not postdot_phase and
  210. pos > brck_limit and rawtext[pos-1] == '.'):
  211. # Eat a dot
  212. pos -= 1
  213. postdot_phase = True
  214. # The next line will fail if we are *inside* a comment,
  215. # but we shouldn't be.
  216. elif (pos == brck_limit and brck_index > 0 and
  217. rawtext[bracketing[brck_index-1][0]] == '#'):
  218. # Eat a comment
  219. brck_index -= 2
  220. brck_limit = bracketing[brck_index][0]
  221. pos = bracketing[brck_index+1][0]
  222. else:
  223. # If we didn't eat anything, quit.
  224. break
  225. if not postdot_phase:
  226. # We didn't find a dot, so the expression end at the
  227. # last identifier pos.
  228. break
  229. ret = self._eat_identifier(rawtext, brck_limit, pos)
  230. if ret:
  231. # There is an identifier to eat
  232. pos = pos - ret
  233. last_identifier_pos = pos
  234. # Now, to continue the search, we must find a dot.
  235. postdot_phase = False
  236. # (the loop continues now)
  237. elif pos == brck_limit:
  238. # We are at a bracketing limit. If it is a closing
  239. # bracket, eat the bracket, otherwise, stop the search.
  240. level = bracketing[brck_index][1]
  241. while brck_index > 0 and bracketing[brck_index-1][1] > level:
  242. brck_index -= 1
  243. if bracketing[brck_index][0] == brck_limit:
  244. # We were not at the end of a closing bracket
  245. break
  246. pos = bracketing[brck_index][0]
  247. brck_index -= 1
  248. brck_limit = bracketing[brck_index][0]
  249. last_identifier_pos = pos
  250. if rawtext[pos] in "([":
  251. # [] and () may be used after an identifier, so we
  252. # continue. postdot_phase is True, so we don't allow a dot.
  253. pass
  254. else:
  255. # We can't continue after other types of brackets
  256. if rawtext[pos] in "'\"":
  257. # Scan a string prefix
  258. while pos > 0 and rawtext[pos - 1] in "rRbBuU":
  259. pos -= 1
  260. last_identifier_pos = pos
  261. break
  262. else:
  263. # We've found an operator or something.
  264. break
  265. return rawtext[last_identifier_pos:self.indexinrawtext]
  266. if __name__ == '__main__':
  267. from unittest import main
  268. main('idlelib.idle_test.test_hyperparser', verbosity=2)