tokenize.py

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)
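
# Example (illustrative sketch): how the compiled PseudoToken pattern is
# typically used to pull successive token candidates off one physical line,
# the way generate_tokens() below does. The sample line is an arbitrary
# assumption chosen for demonstration.
def _example_pseudoprog_scan(line="x = 'hi'  # comment\n"):
    pos, found = 0, []
    while pos < len(line):
        m = pseudoprog.match(line, pos)
        if not m:
            break
        start, end = m.span(1)      # group 1 is the token after any whitespace
        found.append(line[start:end])
        pos = end
    return found                    # ["x", "=", "'hi'", "# comment", "\n"]
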
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
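
# Example (illustrative sketch): driving tokenize() from an in-memory string
# rather than a file; the source text and the use of the default printtoken
# tokeneater are assumptions made only for demonstration.
def _example_tokenize_string(source="x = 1\n"):
    import io
    # Each token is passed to printtoken(), which writes one line per token.
    tokenize(io.StringIO(source).readline, printtoken)
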
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
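
# Example (illustrative sketch): detect_encoding() expects a readline that
# yields bytes, e.g. from a file opened in binary mode. The path used here is
# a placeholder assumption.
def _example_detect_encoding(path="setup.py"):
    with open(path, "rb") as f:
        encoding, consumed_lines = detect_encoding(f.readline)
    # consumed_lines holds the byte lines already read while sniffing.
    return encoding, consumed_lines
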
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
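
# Example (illustrative sketch): a round trip through generate_tokens() and
# untokenize() with full 5-tuples, which the docstring above says reproduces
# the input exactly. The source string is an arbitrary assumption.
def _example_untokenize_roundtrip(source="def f(a, b):\n    return a + b\n"):
    import io
    tokens = list(generate_tokens(io.StringIO(source).readline))
    restored = untokenize(tokens)
    assert restored == source
    return restored
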
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.
    """
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
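
# Example (illustrative sketch): iterating the 5-tuples produced by
# generate_tokens() for an in-memory source string; the source text is an
# arbitrary assumption chosen for demonstration.
def _example_generate_tokens(source="total = 1 + 2  # sum\n"):
    import io
    results = []
    for tok_type, tok_str, (srow, scol), (erow, ecol), _line in \
            generate_tokens(io.StringIO(source).readline):
        # e.g. ('NAME', 'total', (1, 0), (1, 5)), ('OP', '=', (1, 6), (1, 7)), ...
        results.append((tok_name[tok_type], tok_str, (srow, scol), (erow, ecol)))
    return results
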
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)