# parser.py
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

import re
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

# Characters that interrupt a plain-text run: start of a tag or a reference.
interesting_normal = re.compile('[&<]')
# A '&' followed by a letter or '#' may be the start of a (possibly
# truncated) entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference, e.g. '&amp;' -- the trailing character class captures
# the (non-name) character that terminates the reference.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal ('&#65') or hexadecimal ('&#x41').
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         \s*                         # possibly followed by a space
        )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as CDATA: inside these, only the
    # matching end tag is special (see set_cdata_mode below).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''          # buffered, not-yet-parsed input
        self.lasttag = '???'       # most recent start-tag name seen
        self.interesting = interesting_normal
        self.cdata_elem = None     # set while inside <script>/<style>
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        # Enter CDATA mode: until the matching end tag, nothing is
        # "interesting" except '</elem>' (case-insensitive).
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        # Leave CDATA mode and resume normal '<'/'&' scanning.
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    # (34 chars back is enough to cover the longest named
                    # character reference -- presumably; confirm against
                    # the html.entities table.)
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # in CDATA mode, hold partial content until the
                        # closing tag shows up
                        break
                    j = n
            if i < j:
                # emit the plain text between i and the next markup
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # dispatch on what follows the '<'; each parse_* returns
                # the index just past the construct, or -1 if incomplete
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # lone '<' followed by something non-markup: literal data
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # construct is incomplete in the buffer
                    if not end:
                        break
                    # EOF: salvage what we can as data, up to the next
                    # '>' (or '<', or a single char)
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                # numeric character reference (only reached when
                # convert_charrefs is off -- '&' isn't interesting otherwise)
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # terminator wasn't ';'; leave it for the next round
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                # named entity reference
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # forced EOF: flush whatever remains as data
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep only the unprocessed tail for the next feed()
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            # anything else after '<!' is treated as a bogus comment
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute with no '=value' part
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip matching quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk between the last attribute and '>': report the whole
            # tag as data, after fixing up the reported position
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # malformed end tag inside <script>/<style> stays literal
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for a different element doesn't close CDATA mode
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle unknown declaration (called by _markupbase)
    def unknown_decl(self, data):
        pass