  1. """A lexical analyzer class for simple shell-like syntaxes."""
  2. # Module and documentation by Eric S. Raymond, 21 Dec 1998
  3. # Input stacking and error message cleanup added by ESR, March 2000
  4. # push_source() and pop_source() made explicit by ESR, January 2001.
  5. # Posix compliance, split(), string arguments, and
  6. # iterator interface by Gustavo Niemeyer, April 2003.
  7. # changes to tokenize more like Posix shells by Vinay Sajip, July 2016.
  8. import os
  9. import re
  10. import sys
  11. from collections import deque
  12. from io import StringIO
  13. __all__ = ["shlex", "split", "quote", "join"]


class shlex:
    "A lexical analyzer class for simple shell-like syntaxes."
    def __init__(self, instream=None, infile=None, posix=False,
                 punctuation_chars=False):
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        self.filestack = deque()
        self.source = None
        if not punctuation_chars:
            punctuation_chars = ''
        elif punctuation_chars is True:
            punctuation_chars = '();<>|&'
        self._punctuation_chars = punctuation_chars
        if punctuation_chars:
            # _pushback_chars is a push back queue used by lookahead logic
            self._pushback_chars = deque()
            # these chars added because allowed in file names, args, wildcards
            self.wordchars += '~-./*?='
            # remove any punctuation chars from wordchars
            t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
            self.wordchars = self.wordchars.translate(t)

    @property
    def punctuation_chars(self):
        return self._punctuation_chars
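
    # Sketch of what punctuation_chars changes (illustrative sample input;
    # with punctuation_chars=True, runs of '();<>|&' come back as separate
    # tokens instead of being glued to adjacent words):
    #
    #     >>> list(shlex("a && b; c", punctuation_chars=True))
    #     ['a', '&&', 'b', ';', 'c']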

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)
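
    # Sketch of the one-token lookahead this enables (illustrative input):
    #
    #     >>> lexer = shlex("a b")
    #     >>> lexer.push_token("x")
    #     >>> lexer.get_token(), lexer.get_token()
    #     ('x', 'a')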

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw
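
    # Sketch of get_token() in non-POSIX mode (illustrative input; quote
    # characters are kept in the returned token, and the eof sentinel is ''):
    #
    #     >>> lexer = shlex('a "b c"')
    #     >>> lexer.get_token(), lexer.get_token(), lexer.get_token()
    #     ('a', '"b c"', '')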

    def read_token(self):
        quoted = False
        escapedstate = ' '
        while True:
            if self.punctuation_chars and self._pushback_chars:
                nextchar = self._pushback_chars.pop()
            else:
                nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r" % (self.state,
                                                                  nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.punctuation_chars:
                    self.token = nextchar
                    self.state = 'c'
                elif nextchar in self.quotes:
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape and self.state
                      in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes and
                        nextchar != self.state and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state in ('a', 'c'):
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.state == 'c':
                    if nextchar in self.punctuation_chars:
                        self.token += nextchar
                    else:
                        if nextchar not in self.whitespace:
                            self._pushback_chars.append(nextchar)
                        self.state = ' '
                        break
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                        or (self.whitespace_split and
                            nextchar not in self.punctuation_chars)):
                    self.token += nextchar
                else:
                    if self.punctuation_chars:
                        self._pushback_chars.append(nextchar)
                    else:
                        self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result
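
    # The same illustrative input read in POSIX and non-POSIX mode, to show
    # how read_token() treats quotes differently:
    #
    #     >>> list(shlex('a "b c"', posix=True))
    #     ['a', 'b c']
    #     >>> list(shlex('a "b c"'))
    #     ['a', '"b c"']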

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        return self

    def __next__(self):
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
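

# A minimal usage sketch for the shlex class (illustrative input; in POSIX
# mode quotes are stripped and iteration stops at the None eof sentinel):
#
#     >>> lexer = shlex('echo "hello world"', posix=True)
#     >>> lexer.whitespace_split = True
#     >>> list(lexer)
#     ['echo', 'hello world']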


def split(s, comments=False, posix=True):
    """Split the string *s* using shell-like syntax."""
    if s is None:
        import warnings
        warnings.warn("Passing None for 's' to shlex.split() is deprecated.",
                      DeprecationWarning, stacklevel=2)
    lex = shlex(s, posix=posix)
    lex.whitespace_split = True
    if not comments:
        lex.commenters = ''
    return list(lex)
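

# A sketch of split() on an illustrative command line:
#
#     >>> split("ls -l 'my file.txt'")
#     ['ls', '-l', 'my file.txt']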


def join(split_command):
    """Return a shell-escaped string from *split_command*."""
    return ' '.join(quote(arg) for arg in split_command)
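

# A sketch of join() as the rough inverse of split() (illustrative input):
#
#     >>> join(['ls', '-l', 'my file.txt'])
#     "ls -l 'my file.txt'"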


_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search


def quote(s):
    """Return a shell-escaped version of the string *s*."""
    if not s:
        return "''"
    if _find_unsafe(s) is None:
        return s
    # use single quotes, and put single quotes into double quotes
    # the string $'b is then quoted as '$'"'"'b'
    return "'" + s.replace("'", "'\"'\"'") + "'"


def _print_tokens(lexer):
    while 1:
        tt = lexer.get_token()
        if not tt:
            break
        print("Token: " + repr(tt))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        _print_tokens(shlex())
    else:
        fn = sys.argv[1]
        with open(fn) as f:
            _print_tokens(shlex(f, fn))
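
# Command-line sketch (illustrative file name and contents): run directly,
# the module tokenizes stdin, or the file named as the first argument, and
# prints one "Token: ..." line per token:
#
#     $ python shlex.py sample.cfg     # where sample.cfg contains: a 'b c'
#     Token: 'a'
#     Token: "'b c'"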