feedparser.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. # Copyright (C) 2004-2006 Python Software Foundation
  2. # Authors: Baxter, Wouters and Warsaw
  3. # Contact: email-sig@python.org
  4. """FeedParser - An email feed parser.
  5. The feed parser implements an interface for incrementally parsing an email
  6. message, line by line. This has advantages for certain applications, such as
  7. those reading email messages off a socket.
  8. FeedParser.feed() is the primary interface for pushing new data into the
  9. parser. It returns when there's nothing more it can do with the available
  10. data. When you have no more data to push into the parser, call .close().
  11. This completes the parsing and returns the root message object.
  12. The other advantage of this parser is that it will never raise a parsing
  13. exception. Instead, when it finds something unexpected, it adds a 'defect' to
  14. the current message. Defects are just instances that live on the message
  15. object's .defects attribute.
  16. """
  17. __all__ = ['FeedParser', 'BytesFeedParser']
  18. import re
  19. from email import errors
  20. from email._policybase import compat32
  21. from collections import deque
  22. from io import StringIO
# Matches one line terminator in any of the three conventions (CRLF, CR, LF).
# Used with .match() to recognise a blank header/body separator line.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same terminators, captured as a group.  FeedParser applies it with .match()
# to strip a newline from the front of a multipart epilogue.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A line terminator anchored to the very end of the string (\Z); used to strip
# the trailing newline from a line, payload, preamble or epilogue.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Same pattern as NLCRE_bol.  Not referenced by the code visible in this file.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
# The pattern accepts three kinds of header-section lines: a unix-from
# envelope line ("From "), a field name followed by a colon, or a line that
# starts with whitespace (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
# Separator used when joining collected lines back into a single string.
EMPTYSTRING = ''
NL = '\n'
# Unique sentinel: returned by BufferedSubFile.readline() when no complete line
# is buffered and the input has not been closed, and yielded by the parser
# generator to hand control back to the caller until more data is fed.
NeedMoreData = object()
  33. class BufferedSubFile(object):
  34. """A file-ish object that can have new data loaded into it.
  35. You can also push and pop line-matching predicates onto a stack. When the
  36. current predicate matches the current line, a false EOF response
  37. (i.e. empty string) is returned instead. This lets the parser adhere to a
  38. simple abstraction -- it parses until EOF closes the current message.
  39. """
  40. def __init__(self):
  41. # Text stream of the last partial line pushed into this object.
  42. # See issue 22233 for why this is a text stream and not a list.
  43. self._partial = StringIO(newline='')
  44. # A deque of full, pushed lines
  45. self._lines = deque()
  46. # The stack of false-EOF checking predicates.
  47. self._eofstack = []
  48. # A flag indicating whether the file has been closed or not.
  49. self._closed = False
  50. def push_eof_matcher(self, pred):
  51. self._eofstack.append(pred)
  52. def pop_eof_matcher(self):
  53. return self._eofstack.pop()
  54. def close(self):
  55. # Don't forget any trailing partial line.
  56. self._partial.seek(0)
  57. self.pushlines(self._partial.readlines())
  58. self._partial.seek(0)
  59. self._partial.truncate()
  60. self._closed = True
  61. def readline(self):
  62. if not self._lines:
  63. if self._closed:
  64. return ''
  65. return NeedMoreData
  66. # Pop the line off the stack and see if it matches the current
  67. # false-EOF predicate.
  68. line = self._lines.popleft()
  69. # RFC 2046, section 5.1.2 requires us to recognize outer level
  70. # boundaries at any level of inner nesting. Do this, but be sure it's
  71. # in the order of most to least nested.
  72. for ateof in reversed(self._eofstack):
  73. if ateof(line):
  74. # We're at the false EOF. But push the last line back first.
  75. self._lines.appendleft(line)
  76. return ''
  77. return line
  78. def unreadline(self, line):
  79. # Let the consumer push a line back into the buffer.
  80. assert line is not NeedMoreData
  81. self._lines.appendleft(line)
  82. def push(self, data):
  83. """Push some new data into this object."""
  84. self._partial.write(data)
  85. if '\n' not in data and '\r' not in data:
  86. # No new complete lines, wait for more.
  87. return
  88. # Crack into lines, preserving the linesep characters.
  89. self._partial.seek(0)
  90. parts = self._partial.readlines()
  91. self._partial.seek(0)
  92. self._partial.truncate()
  93. # If the last element of the list does not end in a newline, then treat
  94. # it as a partial line. We only check for '\n' here because a line
  95. # ending with '\r' might be a line that was split in the middle of a
  96. # '\r\n' sequence (see bugs 1555570 and 1721862).
  97. if not parts[-1].endswith('\n'):
  98. self._partial.write(parts.pop())
  99. self.pushlines(parts)
  100. def pushlines(self, lines):
  101. self._lines.extend(lines)
  102. def __iter__(self):
  103. return self
  104. def __next__(self):
  105. line = self.readline()
  106. if line == '':
  107. raise StopIteration
  108. return line
  109. class FeedParser:
  110. """A feed-style parser of email."""
  111. def __init__(self, _factory=None, *, policy=compat32):
  112. """_factory is called with no arguments to create a new message obj
  113. The policy keyword specifies a policy object that controls a number of
  114. aspects of the parser's operation. The default policy maintains
  115. backward compatibility.
  116. """
  117. self.policy = policy
  118. self._old_style_factory = False
  119. if _factory is None:
  120. if policy.message_factory is None:
  121. from email.message import Message
  122. self._factory = Message
  123. else:
  124. self._factory = policy.message_factory
  125. else:
  126. self._factory = _factory
  127. try:
  128. _factory(policy=self.policy)
  129. except TypeError:
  130. # Assume this is an old-style factory
  131. self._old_style_factory = True
  132. self._input = BufferedSubFile()
  133. self._msgstack = []
  134. self._parse = self._parsegen().__next__
  135. self._cur = None
  136. self._last = None
  137. self._headersonly = False
  138. # Non-public interface for supporting Parser's headersonly flag
  139. def _set_headersonly(self):
  140. self._headersonly = True
  141. def feed(self, data):
  142. """Push more data into the parser."""
  143. self._input.push(data)
  144. self._call_parse()
  145. def _call_parse(self):
  146. try:
  147. self._parse()
  148. except StopIteration:
  149. pass
  150. def close(self):
  151. """Parse all remaining data and return the root message object."""
  152. self._input.close()
  153. self._call_parse()
  154. root = self._pop_message()
  155. assert not self._msgstack
  156. # Look for final set of defects
  157. if root.get_content_maintype() == 'multipart' \
  158. and not root.is_multipart() and not self._headersonly:
  159. defect = errors.MultipartInvariantViolationDefect()
  160. self.policy.handle_defect(root, defect)
  161. return root
  162. def _new_message(self):
  163. if self._old_style_factory:
  164. msg = self._factory()
  165. else:
  166. msg = self._factory(policy=self.policy)
  167. if self._cur and self._cur.get_content_type() == 'multipart/digest':
  168. msg.set_default_type('message/rfc822')
  169. if self._msgstack:
  170. self._msgstack[-1].attach(msg)
  171. self._msgstack.append(msg)
  172. self._cur = msg
  173. self._last = msg
  174. def _pop_message(self):
  175. retval = self._msgstack.pop()
  176. if self._msgstack:
  177. self._cur = self._msgstack[-1]
  178. else:
  179. self._cur = None
  180. return retval
  181. def _parsegen(self):
  182. # Create a new message and start by parsing headers.
  183. self._new_message()
  184. headers = []
  185. # Collect the headers, searching for a line that doesn't match the RFC
  186. # 2822 header or continuation pattern (including an empty line).
  187. for line in self._input:
  188. if line is NeedMoreData:
  189. yield NeedMoreData
  190. continue
  191. if not headerRE.match(line):
  192. # If we saw the RFC defined header/body separator
  193. # (i.e. newline), just throw it away. Otherwise the line is
  194. # part of the body so push it back.
  195. if not NLCRE.match(line):
  196. defect = errors.MissingHeaderBodySeparatorDefect()
  197. self.policy.handle_defect(self._cur, defect)
  198. self._input.unreadline(line)
  199. break
  200. headers.append(line)
  201. # Done with the headers, so parse them and figure out what we're
  202. # supposed to see in the body of the message.
  203. self._parse_headers(headers)
  204. # Headers-only parsing is a backwards compatibility hack, which was
  205. # necessary in the older parser, which could raise errors. All
  206. # remaining lines in the input are thrown into the message body.
  207. if self._headersonly:
  208. lines = []
  209. while True:
  210. line = self._input.readline()
  211. if line is NeedMoreData:
  212. yield NeedMoreData
  213. continue
  214. if line == '':
  215. break
  216. lines.append(line)
  217. self._cur.set_payload(EMPTYSTRING.join(lines))
  218. return
  219. if self._cur.get_content_type() == 'message/delivery-status':
  220. # message/delivery-status contains blocks of headers separated by
  221. # a blank line. We'll represent each header block as a separate
  222. # nested message object, but the processing is a bit different
  223. # than standard message/* types because there is no body for the
  224. # nested messages. A blank line separates the subparts.
  225. while True:
  226. self._input.push_eof_matcher(NLCRE.match)
  227. for retval in self._parsegen():
  228. if retval is NeedMoreData:
  229. yield NeedMoreData
  230. continue
  231. break
  232. self._pop_message()
  233. # We need to pop the EOF matcher in order to tell if we're at
  234. # the end of the current file, not the end of the last block
  235. # of message headers.
  236. self._input.pop_eof_matcher()
  237. # The input stream must be sitting at the newline or at the
  238. # EOF. We want to see if we're at the end of this subpart, so
  239. # first consume the blank line, then test the next line to see
  240. # if we're at this subpart's EOF.
  241. while True:
  242. line = self._input.readline()
  243. if line is NeedMoreData:
  244. yield NeedMoreData
  245. continue
  246. break
  247. while True:
  248. line = self._input.readline()
  249. if line is NeedMoreData:
  250. yield NeedMoreData
  251. continue
  252. break
  253. if line == '':
  254. break
  255. # Not at EOF so this is a line we're going to need.
  256. self._input.unreadline(line)
  257. return
  258. if self._cur.get_content_maintype() == 'message':
  259. # The message claims to be a message/* type, then what follows is
  260. # another RFC 2822 message.
  261. for retval in self._parsegen():
  262. if retval is NeedMoreData:
  263. yield NeedMoreData
  264. continue
  265. break
  266. self._pop_message()
  267. return
  268. if self._cur.get_content_maintype() == 'multipart':
  269. boundary = self._cur.get_boundary()
  270. if boundary is None:
  271. # The message /claims/ to be a multipart but it has not
  272. # defined a boundary. That's a problem which we'll handle by
  273. # reading everything until the EOF and marking the message as
  274. # defective.
  275. defect = errors.NoBoundaryInMultipartDefect()
  276. self.policy.handle_defect(self._cur, defect)
  277. lines = []
  278. for line in self._input:
  279. if line is NeedMoreData:
  280. yield NeedMoreData
  281. continue
  282. lines.append(line)
  283. self._cur.set_payload(EMPTYSTRING.join(lines))
  284. return
  285. # Make sure a valid content type was specified per RFC 2045:6.4.
  286. if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
  287. not in ('7bit', '8bit', 'binary')):
  288. defect = errors.InvalidMultipartContentTransferEncodingDefect()
  289. self.policy.handle_defect(self._cur, defect)
  290. # Create a line match predicate which matches the inter-part
  291. # boundary as well as the end-of-multipart boundary. Don't push
  292. # this onto the input stream until we've scanned past the
  293. # preamble.
  294. separator = '--' + boundary
  295. boundaryre = re.compile(
  296. '(?P<sep>' + re.escape(separator) +
  297. r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
  298. capturing_preamble = True
  299. preamble = []
  300. linesep = False
  301. close_boundary_seen = False
  302. while True:
  303. line = self._input.readline()
  304. if line is NeedMoreData:
  305. yield NeedMoreData
  306. continue
  307. if line == '':
  308. break
  309. mo = boundaryre.match(line)
  310. if mo:
  311. # If we're looking at the end boundary, we're done with
  312. # this multipart. If there was a newline at the end of
  313. # the closing boundary, then we need to initialize the
  314. # epilogue with the empty string (see below).
  315. if mo.group('end'):
  316. close_boundary_seen = True
  317. linesep = mo.group('linesep')
  318. break
  319. # We saw an inter-part boundary. Were we in the preamble?
  320. if capturing_preamble:
  321. if preamble:
  322. # According to RFC 2046, the last newline belongs
  323. # to the boundary.
  324. lastline = preamble[-1]
  325. eolmo = NLCRE_eol.search(lastline)
  326. if eolmo:
  327. preamble[-1] = lastline[:-len(eolmo.group(0))]
  328. self._cur.preamble = EMPTYSTRING.join(preamble)
  329. capturing_preamble = False
  330. self._input.unreadline(line)
  331. continue
  332. # We saw a boundary separating two parts. Consume any
  333. # multiple boundary lines that may be following. Our
  334. # interpretation of RFC 2046 BNF grammar does not produce
  335. # body parts within such double boundaries.
  336. while True:
  337. line = self._input.readline()
  338. if line is NeedMoreData:
  339. yield NeedMoreData
  340. continue
  341. mo = boundaryre.match(line)
  342. if not mo:
  343. self._input.unreadline(line)
  344. break
  345. # Recurse to parse this subpart; the input stream points
  346. # at the subpart's first line.
  347. self._input.push_eof_matcher(boundaryre.match)
  348. for retval in self._parsegen():
  349. if retval is NeedMoreData:
  350. yield NeedMoreData
  351. continue
  352. break
  353. # Because of RFC 2046, the newline preceding the boundary
  354. # separator actually belongs to the boundary, not the
  355. # previous subpart's payload (or epilogue if the previous
  356. # part is a multipart).
  357. if self._last.get_content_maintype() == 'multipart':
  358. epilogue = self._last.epilogue
  359. if epilogue == '':
  360. self._last.epilogue = None
  361. elif epilogue is not None:
  362. mo = NLCRE_eol.search(epilogue)
  363. if mo:
  364. end = len(mo.group(0))
  365. self._last.epilogue = epilogue[:-end]
  366. else:
  367. payload = self._last._payload
  368. if isinstance(payload, str):
  369. mo = NLCRE_eol.search(payload)
  370. if mo:
  371. payload = payload[:-len(mo.group(0))]
  372. self._last._payload = payload
  373. self._input.pop_eof_matcher()
  374. self._pop_message()
  375. # Set the multipart up for newline cleansing, which will
  376. # happen if we're in a nested multipart.
  377. self._last = self._cur
  378. else:
  379. # I think we must be in the preamble
  380. assert capturing_preamble
  381. preamble.append(line)
  382. # We've seen either the EOF or the end boundary. If we're still
  383. # capturing the preamble, we never saw the start boundary. Note
  384. # that as a defect and store the captured text as the payload.
  385. if capturing_preamble:
  386. defect = errors.StartBoundaryNotFoundDefect()
  387. self.policy.handle_defect(self._cur, defect)
  388. self._cur.set_payload(EMPTYSTRING.join(preamble))
  389. epilogue = []
  390. for line in self._input:
  391. if line is NeedMoreData:
  392. yield NeedMoreData
  393. continue
  394. self._cur.epilogue = EMPTYSTRING.join(epilogue)
  395. return
  396. # If we're not processing the preamble, then we might have seen
  397. # EOF without seeing that end boundary...that is also a defect.
  398. if not close_boundary_seen:
  399. defect = errors.CloseBoundaryNotFoundDefect()
  400. self.policy.handle_defect(self._cur, defect)
  401. return
  402. # Everything from here to the EOF is epilogue. If the end boundary
  403. # ended in a newline, we'll need to make sure the epilogue isn't
  404. # None
  405. if linesep:
  406. epilogue = ['']
  407. else:
  408. epilogue = []
  409. for line in self._input:
  410. if line is NeedMoreData:
  411. yield NeedMoreData
  412. continue
  413. epilogue.append(line)
  414. # Any CRLF at the front of the epilogue is not technically part of
  415. # the epilogue. Also, watch out for an empty string epilogue,
  416. # which means a single newline.
  417. if epilogue:
  418. firstline = epilogue[0]
  419. bolmo = NLCRE_bol.match(firstline)
  420. if bolmo:
  421. epilogue[0] = firstline[len(bolmo.group(0)):]
  422. self._cur.epilogue = EMPTYSTRING.join(epilogue)
  423. return
  424. # Otherwise, it's some non-multipart type, so the entire rest of the
  425. # file contents becomes the payload.
  426. lines = []
  427. for line in self._input:
  428. if line is NeedMoreData:
  429. yield NeedMoreData
  430. continue
  431. lines.append(line)
  432. self._cur.set_payload(EMPTYSTRING.join(lines))
  433. def _parse_headers(self, lines):
  434. # Passed a list of lines that make up the headers for the current msg
  435. lastheader = ''
  436. lastvalue = []
  437. for lineno, line in enumerate(lines):
  438. # Check for continuation
  439. if line[0] in ' \t':
  440. if not lastheader:
  441. # The first line of the headers was a continuation. This
  442. # is illegal, so let's note the defect, store the illegal
  443. # line, and ignore it for purposes of headers.
  444. defect = errors.FirstHeaderLineIsContinuationDefect(line)
  445. self.policy.handle_defect(self._cur, defect)
  446. continue
  447. lastvalue.append(line)
  448. continue
  449. if lastheader:
  450. self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
  451. lastheader, lastvalue = '', []
  452. # Check for envelope header, i.e. unix-from
  453. if line.startswith('From '):
  454. if lineno == 0:
  455. # Strip off the trailing newline
  456. mo = NLCRE_eol.search(line)
  457. if mo:
  458. line = line[:-len(mo.group(0))]
  459. self._cur.set_unixfrom(line)
  460. continue
  461. elif lineno == len(lines) - 1:
  462. # Something looking like a unix-from at the end - it's
  463. # probably the first line of the body, so push back the
  464. # line and stop.
  465. self._input.unreadline(line)
  466. return
  467. else:
  468. # Weirdly placed unix-from line. Note this as a defect
  469. # and ignore it.
  470. defect = errors.MisplacedEnvelopeHeaderDefect(line)
  471. self._cur.defects.append(defect)
  472. continue
  473. # Split the line on the colon separating field name from value.
  474. # There will always be a colon, because if there wasn't the part of
  475. # the parser that calls us would have started parsing the body.
  476. i = line.find(':')
  477. # If the colon is on the start of the line the header is clearly
  478. # malformed, but we might be able to salvage the rest of the
  479. # message. Track the error but keep going.
  480. if i == 0:
  481. defect = errors.InvalidHeaderDefect("Missing header name.")
  482. self._cur.defects.append(defect)
  483. continue
  484. assert i>0, "_parse_headers fed line with no : and no leading WS"
  485. lastheader = line[:i]
  486. lastvalue = [line]
  487. # Done with all the lines, so handle the last header.
  488. if lastheader:
  489. self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
  490. class BytesFeedParser(FeedParser):
  491. """Like FeedParser, but feed accepts bytes."""
  492. def feed(self, data):
  493. super().feed(data.decode('ascii', 'surrogateescape'))