header.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. # Copyright (C) 2002-2007 Python Software Foundation
  2. # Author: Ben Gertzfield, Barry Warsaw
  3. # Contact: email-sig@python.org
  4. """Header encoding and decoding functionality."""
# Public names exported by a star-import of this module.
__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]
  10. import re
  11. import binascii
  12. import email.quoprimime
  13. import email.base64mime
  14. from email.errors import HeaderParseError
  15. from email import charset as _charset
# Local alias so the rest of this module can refer to Charset directly.
Charset = _charset.Charset

# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length; Header uses it when maxlinelen is None.
MAXLINELEN = 78
# Characters treated as folding white space when splitting ASCII header text.
FWS = ' \t'

# USASCII is Header's default charset; UTF8 is what Header.append() switches
# to when a string cannot be encoded as us-ascii.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
=\? # literal =?
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
\? # literal ?
(?P<encoding>[qQbB]) # either a "q" or a "b", case insensitive
\? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?=
''', re.VERBOSE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack.
_embedded_header = re.compile(r'\n[^ \t]+:')

# Helpers
_max_append = email.quoprimime._max_append
  45. def decode_header(header):
  46. """Decode a message header value without converting charset.
  47. Returns a list of (string, charset) pairs containing each of the decoded
  48. parts of the header. Charset is None for non-encoded parts of the header,
  49. otherwise a lower-case string containing the name of the character set
  50. specified in the encoded string.
  51. header may be a string that may or may not contain RFC2047 encoded words,
  52. or it may be a Header object.
  53. An email.errors.HeaderParseError may be raised when certain decoding error
  54. occurs (e.g. a base64 decoding exception).
  55. """
  56. # If it is a Header object, we can just return the encoded chunks.
  57. if hasattr(header, '_chunks'):
  58. return [(_charset._encode(string, str(charset)), str(charset))
  59. for string, charset in header._chunks]
  60. # If no encoding, just return the header with no charset.
  61. if not ecre.search(header):
  62. return [(header, None)]
  63. # First step is to parse all the encoded parts into triplets of the form
  64. # (encoded_string, encoding, charset). For unencoded strings, the last
  65. # two parts will be None.
  66. words = []
  67. for line in header.splitlines():
  68. parts = ecre.split(line)
  69. first = True
  70. while parts:
  71. unencoded = parts.pop(0)
  72. if first:
  73. unencoded = unencoded.lstrip()
  74. first = False
  75. if unencoded:
  76. words.append((unencoded, None, None))
  77. if parts:
  78. charset = parts.pop(0).lower()
  79. encoding = parts.pop(0).lower()
  80. encoded = parts.pop(0)
  81. words.append((encoded, encoding, charset))
  82. # Now loop over words and remove words that consist of whitespace
  83. # between two encoded strings.
  84. droplist = []
  85. for n, w in enumerate(words):
  86. if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
  87. droplist.append(n-1)
  88. for d in reversed(droplist):
  89. del words[d]
  90. # The next step is to decode each encoded word by applying the reverse
  91. # base64 or quopri transformation. decoded_words is now a list of the
  92. # form (decoded_word, charset).
  93. decoded_words = []
  94. for encoded_string, encoding, charset in words:
  95. if encoding is None:
  96. # This is an unencoded word.
  97. decoded_words.append((encoded_string, charset))
  98. elif encoding == 'q':
  99. word = email.quoprimime.header_decode(encoded_string)
  100. decoded_words.append((word, charset))
  101. elif encoding == 'b':
  102. paderr = len(encoded_string) % 4 # Postel's law: add missing padding
  103. if paderr:
  104. encoded_string += '==='[:4 - paderr]
  105. try:
  106. word = email.base64mime.decode(encoded_string)
  107. except binascii.Error:
  108. raise HeaderParseError('Base64 decoding error')
  109. else:
  110. decoded_words.append((word, charset))
  111. else:
  112. raise AssertionError('Unexpected encoding: ' + encoding)
  113. # Now convert all words to bytes and collapse consecutive runs of
  114. # similarly encoded words.
  115. collapsed = []
  116. last_word = last_charset = None
  117. for word, charset in decoded_words:
  118. if isinstance(word, str):
  119. word = bytes(word, 'raw-unicode-escape')
  120. if last_word is None:
  121. last_word = word
  122. last_charset = charset
  123. elif charset != last_charset:
  124. collapsed.append((last_word, last_charset))
  125. last_word = word
  126. last_charset = charset
  127. elif last_charset is None:
  128. last_word += BSPACE + word
  129. else:
  130. last_word += word
  131. collapsed.append((last_word, last_charset))
  132. return collapsed
  133. def make_header(decoded_seq, maxlinelen=None, header_name=None,
  134. continuation_ws=' '):
  135. """Create a Header from a sequence of pairs as returned by decode_header()
  136. decode_header() takes a header value string and returns a sequence of
  137. pairs of the format (decoded_string, charset) where charset is the string
  138. name of the character set.
  139. This function takes one of those sequence of pairs and returns a Header
  140. instance. Optional maxlinelen, header_name, and continuation_ws are as in
  141. the Header constructor.
  142. """
  143. h = Header(maxlinelen=maxlinelen, header_name=header_name,
  144. continuation_ws=continuation_ws)
  145. for s, charset in decoded_seq:
  146. # None means us-ascii but we can simply pass it on to h.append()
  147. if charset is not None and not isinstance(charset, Charset):
  148. charset = Charset(charset)
  149. h.append(s, charset)
  150. return h
  151. class Header:
  152. def __init__(self, s=None, charset=None,
  153. maxlinelen=None, header_name=None,
  154. continuation_ws=' ', errors='strict'):
  155. """Create a MIME-compliant header that can contain many character sets.
  156. Optional s is the initial header value. If None, the initial header
  157. value is not set. You can later append to the header with .append()
  158. method calls. s may be a byte string or a Unicode string, but see the
  159. .append() documentation for semantics.
  160. Optional charset serves two purposes: it has the same meaning as the
  161. charset argument to the .append() method. It also sets the default
  162. character set for all subsequent .append() calls that omit the charset
  163. argument. If charset is not provided in the constructor, the us-ascii
  164. charset is used both as s's initial charset and as the default for
  165. subsequent .append() calls.
  166. The maximum line length can be specified explicitly via maxlinelen. For
  167. splitting the first line to a shorter value (to account for the field
  168. header which isn't included in s, e.g. `Subject') pass in the name of
  169. the field in header_name. The default maxlinelen is 78 as recommended
  170. by RFC 2822.
  171. continuation_ws must be RFC 2822 compliant folding whitespace (usually
  172. either a space or a hard tab) which will be prepended to continuation
  173. lines.
  174. errors is passed through to the .append() call.
  175. """
  176. if charset is None:
  177. charset = USASCII
  178. elif not isinstance(charset, Charset):
  179. charset = Charset(charset)
  180. self._charset = charset
  181. self._continuation_ws = continuation_ws
  182. self._chunks = []
  183. if s is not None:
  184. self.append(s, charset, errors)
  185. if maxlinelen is None:
  186. maxlinelen = MAXLINELEN
  187. self._maxlinelen = maxlinelen
  188. if header_name is None:
  189. self._headerlen = 0
  190. else:
  191. # Take the separating colon and space into account.
  192. self._headerlen = len(header_name) + 2
  193. def __str__(self):
  194. """Return the string value of the header."""
  195. self._normalize()
  196. uchunks = []
  197. lastcs = None
  198. lastspace = None
  199. for string, charset in self._chunks:
  200. # We must preserve spaces between encoded and non-encoded word
  201. # boundaries, which means for us we need to add a space when we go
  202. # from a charset to None/us-ascii, or from None/us-ascii to a
  203. # charset. Only do this for the second and subsequent chunks.
  204. # Don't add a space if the None/us-ascii string already has
  205. # a space (trailing or leading depending on transition)
  206. nextcs = charset
  207. if nextcs == _charset.UNKNOWN8BIT:
  208. original_bytes = string.encode('ascii', 'surrogateescape')
  209. string = original_bytes.decode('ascii', 'replace')
  210. if uchunks:
  211. hasspace = string and self._nonctext(string[0])
  212. if lastcs not in (None, 'us-ascii'):
  213. if nextcs in (None, 'us-ascii') and not hasspace:
  214. uchunks.append(SPACE)
  215. nextcs = None
  216. elif nextcs not in (None, 'us-ascii') and not lastspace:
  217. uchunks.append(SPACE)
  218. lastspace = string and self._nonctext(string[-1])
  219. lastcs = nextcs
  220. uchunks.append(string)
  221. return EMPTYSTRING.join(uchunks)
  222. # Rich comparison operators for equality only. BAW: does it make sense to
  223. # have or explicitly disable <, <=, >, >= operators?
  224. def __eq__(self, other):
  225. # other may be a Header or a string. Both are fine so coerce
  226. # ourselves to a unicode (of the unencoded header value), swap the
  227. # args and do another comparison.
  228. return other == str(self)
  229. def append(self, s, charset=None, errors='strict'):
  230. """Append a string to the MIME header.
  231. Optional charset, if given, should be a Charset instance or the name
  232. of a character set (which will be converted to a Charset instance). A
  233. value of None (the default) means that the charset given in the
  234. constructor is used.
  235. s may be a byte string or a Unicode string. If it is a byte string
  236. (i.e. isinstance(s, str) is false), then charset is the encoding of
  237. that byte string, and a UnicodeError will be raised if the string
  238. cannot be decoded with that charset. If s is a Unicode string, then
  239. charset is a hint specifying the character set of the characters in
  240. the string. In either case, when producing an RFC 2822 compliant
  241. header using RFC 2047 rules, the string will be encoded using the
  242. output codec of the charset. If the string cannot be encoded to the
  243. output codec, a UnicodeError will be raised.
  244. Optional `errors' is passed as the errors argument to the decode
  245. call if s is a byte string.
  246. """
  247. if charset is None:
  248. charset = self._charset
  249. elif not isinstance(charset, Charset):
  250. charset = Charset(charset)
  251. if not isinstance(s, str):
  252. input_charset = charset.input_codec or 'us-ascii'
  253. if input_charset == _charset.UNKNOWN8BIT:
  254. s = s.decode('us-ascii', 'surrogateescape')
  255. else:
  256. s = s.decode(input_charset, errors)
  257. # Ensure that the bytes we're storing can be decoded to the output
  258. # character set, otherwise an early error is raised.
  259. output_charset = charset.output_codec or 'us-ascii'
  260. if output_charset != _charset.UNKNOWN8BIT:
  261. try:
  262. s.encode(output_charset, errors)
  263. except UnicodeEncodeError:
  264. if output_charset!='us-ascii':
  265. raise
  266. charset = UTF8
  267. self._chunks.append((s, charset))
  268. def _nonctext(self, s):
  269. """True if string s is not a ctext character of RFC822.
  270. """
  271. return s.isspace() or s in ('(', ')', '\\')
  272. def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
  273. r"""Encode a message header into an RFC-compliant format.
  274. There are many issues involved in converting a given string for use in
  275. an email header. Only certain character sets are readable in most
  276. email clients, and as header strings can only contain a subset of
  277. 7-bit ASCII, care must be taken to properly convert and encode (with
  278. Base64 or quoted-printable) header strings. In addition, there is a
  279. 75-character length limit on any given encoded header field, so
  280. line-wrapping must be performed, even with double-byte character sets.
  281. Optional maxlinelen specifies the maximum length of each generated
  282. line, exclusive of the linesep string. Individual lines may be longer
  283. than maxlinelen if a folding point cannot be found. The first line
  284. will be shorter by the length of the header name plus ": " if a header
  285. name was specified at Header construction time. The default value for
  286. maxlinelen is determined at header construction time.
  287. Optional splitchars is a string containing characters which should be
  288. given extra weight by the splitting algorithm during normal header
  289. wrapping. This is in very rough support of RFC 2822's `higher level
  290. syntactic breaks': split points preceded by a splitchar are preferred
  291. during line splitting, with the characters preferred in the order in
  292. which they appear in the string. Space and tab may be included in the
  293. string to indicate whether preference should be given to one over the
  294. other as a split point when other split chars do not appear in the line
  295. being split. Splitchars does not affect RFC 2047 encoded lines.
  296. Optional linesep is a string to be used to separate the lines of
  297. the value. The default value is the most useful for typical
  298. Python applications, but it can be set to \r\n to produce RFC-compliant
  299. line separators when needed.
  300. """
  301. self._normalize()
  302. if maxlinelen is None:
  303. maxlinelen = self._maxlinelen
  304. # A maxlinelen of 0 means don't wrap. For all practical purposes,
  305. # choosing a huge number here accomplishes that and makes the
  306. # _ValueFormatter algorithm much simpler.
  307. if maxlinelen == 0:
  308. maxlinelen = 1000000
  309. formatter = _ValueFormatter(self._headerlen, maxlinelen,
  310. self._continuation_ws, splitchars)
  311. lastcs = None
  312. hasspace = lastspace = None
  313. for string, charset in self._chunks:
  314. if hasspace is not None:
  315. hasspace = string and self._nonctext(string[0])
  316. if lastcs not in (None, 'us-ascii'):
  317. if not hasspace or charset not in (None, 'us-ascii'):
  318. formatter.add_transition()
  319. elif charset not in (None, 'us-ascii') and not lastspace:
  320. formatter.add_transition()
  321. lastspace = string and self._nonctext(string[-1])
  322. lastcs = charset
  323. hasspace = False
  324. lines = string.splitlines()
  325. if lines:
  326. formatter.feed('', lines[0], charset)
  327. else:
  328. formatter.feed('', '', charset)
  329. for line in lines[1:]:
  330. formatter.newline()
  331. if charset.header_encoding is not None:
  332. formatter.feed(self._continuation_ws, ' ' + line.lstrip(),
  333. charset)
  334. else:
  335. sline = line.lstrip()
  336. fws = line[:len(line)-len(sline)]
  337. formatter.feed(fws, sline, charset)
  338. if len(lines) > 1:
  339. formatter.newline()
  340. if self._chunks:
  341. formatter.add_transition()
  342. value = formatter._str(linesep)
  343. if _embedded_header.search(value):
  344. raise HeaderParseError("header value appears to contain "
  345. "an embedded header: {!r}".format(value))
  346. return value
  347. def _normalize(self):
  348. # Step 1: Normalize the chunks so that all runs of identical charsets
  349. # get collapsed into a single unicode string.
  350. chunks = []
  351. last_charset = None
  352. last_chunk = []
  353. for string, charset in self._chunks:
  354. if charset == last_charset:
  355. last_chunk.append(string)
  356. else:
  357. if last_charset is not None:
  358. chunks.append((SPACE.join(last_chunk), last_charset))
  359. last_chunk = [string]
  360. last_charset = charset
  361. if last_chunk:
  362. chunks.append((SPACE.join(last_chunk), last_charset))
  363. self._chunks = chunks
  364. class _ValueFormatter:
  365. def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
  366. self._maxlen = maxlen
  367. self._continuation_ws = continuation_ws
  368. self._continuation_ws_len = len(continuation_ws)
  369. self._splitchars = splitchars
  370. self._lines = []
  371. self._current_line = _Accumulator(headerlen)
  372. def _str(self, linesep):
  373. self.newline()
  374. return linesep.join(self._lines)
  375. def __str__(self):
  376. return self._str(NL)
  377. def newline(self):
  378. end_of_line = self._current_line.pop()
  379. if end_of_line != (' ', ''):
  380. self._current_line.push(*end_of_line)
  381. if len(self._current_line) > 0:
  382. if self._current_line.is_onlyws() and self._lines:
  383. self._lines[-1] += str(self._current_line)
  384. else:
  385. self._lines.append(str(self._current_line))
  386. self._current_line.reset()
  387. def add_transition(self):
  388. self._current_line.push(' ', '')
  389. def feed(self, fws, string, charset):
  390. # If the charset has no header encoding (i.e. it is an ASCII encoding)
  391. # then we must split the header at the "highest level syntactic break"
  392. # possible. Note that we don't have a lot of smarts about field
  393. # syntax; we just try to break on semi-colons, then commas, then
  394. # whitespace. Eventually, this should be pluggable.
  395. if charset.header_encoding is None:
  396. self._ascii_split(fws, string, self._splitchars)
  397. return
  398. # Otherwise, we're doing either a Base64 or a quoted-printable
  399. # encoding which means we don't need to split the line on syntactic
  400. # breaks. We can basically just find enough characters to fit on the
  401. # current line, minus the RFC 2047 chrome. What makes this trickier
  402. # though is that we have to split at octet boundaries, not character
  403. # boundaries but it's only safe to split at character boundaries so at
  404. # best we can only get close.
  405. encoded_lines = charset.header_encode_lines(string, self._maxlengths())
  406. # The first element extends the current line, but if it's None then
  407. # nothing more fit on the current line so start a new line.
  408. try:
  409. first_line = encoded_lines.pop(0)
  410. except IndexError:
  411. # There are no encoded lines, so we're done.
  412. return
  413. if first_line is not None:
  414. self._append_chunk(fws, first_line)
  415. try:
  416. last_line = encoded_lines.pop()
  417. except IndexError:
  418. # There was only one line.
  419. return
  420. self.newline()
  421. self._current_line.push(self._continuation_ws, last_line)
  422. # Everything else are full lines in themselves.
  423. for line in encoded_lines:
  424. self._lines.append(self._continuation_ws + line)
  425. def _maxlengths(self):
  426. # The first line's length.
  427. yield self._maxlen - len(self._current_line)
  428. while True:
  429. yield self._maxlen - self._continuation_ws_len
  430. def _ascii_split(self, fws, string, splitchars):
  431. # The RFC 2822 header folding algorithm is simple in principle but
  432. # complex in practice. Lines may be folded any place where "folding
  433. # white space" appears by inserting a linesep character in front of the
  434. # FWS. The complication is that not all spaces or tabs qualify as FWS,
  435. # and we are also supposed to prefer to break at "higher level
  436. # syntactic breaks". We can't do either of these without intimate
  437. # knowledge of the structure of structured headers, which we don't have
  438. # here. So the best we can do here is prefer to break at the specified
  439. # splitchars, and hope that we don't choose any spaces or tabs that
  440. # aren't legal FWS. (This is at least better than the old algorithm,
  441. # where we would sometimes *introduce* FWS after a splitchar, or the
  442. # algorithm before that, where we would turn all white space runs into
  443. # single spaces or tabs.)
  444. parts = re.split("(["+FWS+"]+)", fws+string)
  445. if parts[0]:
  446. parts[:0] = ['']
  447. else:
  448. parts.pop(0)
  449. for fws, part in zip(*[iter(parts)]*2):
  450. self._append_chunk(fws, part)
  451. def _append_chunk(self, fws, string):
  452. self._current_line.push(fws, string)
  453. if len(self._current_line) > self._maxlen:
  454. # Find the best split point, working backward from the end.
  455. # There might be none, on a long first line.
  456. for ch in self._splitchars:
  457. for i in range(self._current_line.part_count()-1, 0, -1):
  458. if ch.isspace():
  459. fws = self._current_line[i][0]
  460. if fws and fws[0]==ch:
  461. break
  462. prevpart = self._current_line[i-1][1]
  463. if prevpart and prevpart[-1]==ch:
  464. break
  465. else:
  466. continue
  467. break
  468. else:
  469. fws, part = self._current_line.pop()
  470. if self._current_line._initial_size > 0:
  471. # There will be a header, so leave it on a line by itself.
  472. self.newline()
  473. if not fws:
  474. # We don't use continuation_ws here because the whitespace
  475. # after a header should always be a space.
  476. fws = ' '
  477. self._current_line.push(fws, part)
  478. return
  479. remainder = self._current_line.pop_from(i)
  480. self._lines.append(str(self._current_line))
  481. self._current_line.reset(remainder)
  482. class _Accumulator(list):
  483. def __init__(self, initial_size=0):
  484. self._initial_size = initial_size
  485. super().__init__()
  486. def push(self, fws, string):
  487. self.append((fws, string))
  488. def pop_from(self, i=0):
  489. popped = self[i:]
  490. self[i:] = []
  491. return popped
  492. def pop(self):
  493. if self.part_count()==0:
  494. return ('', '')
  495. return super().pop()
  496. def __len__(self):
  497. return sum((len(fws)+len(part) for fws, part in self),
  498. self._initial_size)
  499. def __str__(self):
  500. return EMPTYSTRING.join((EMPTYSTRING.join((fws, part))
  501. for fws, part in self))
  502. def reset(self, startval=None):
  503. if startval is None:
  504. startval = []
  505. self[:] = startval
  506. self._initial_size = 0
  507. def is_onlyws(self):
  508. return self._initial_size==0 and (not self or str(self).isspace())
  509. def part_count(self):
  510. return super().__len__()