# __init__.py
"""
General functions for HTML manipulation.
"""
import re as _re
from html.entities import html5 as _html5
  6. __all__ = ['escape', 'unescape']
  7. def escape(s, quote=True):
  8. """
  9. Replace special characters "&", "<" and ">" to HTML-safe sequences.
  10. If the optional flag quote is true (the default), the quotation mark
  11. characters, both double quote (") and single quote (') characters are also
  12. translated.
  13. """
  14. s = s.replace("&", "&amp;") # Must be done first!
  15. s = s.replace("<", "&lt;")
  16. s = s.replace(">", "&gt;")
  17. if quote:
  18. s = s.replace('"', "&quot;")
  19. s = s.replace('\'', "&#x27;")
  20. return s
  21. # see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  22. _invalid_charrefs = {
  23. 0x00: '\ufffd', # REPLACEMENT CHARACTER
  24. 0x0d: '\r', # CARRIAGE RETURN
  25. 0x80: '\u20ac', # EURO SIGN
  26. 0x81: '\x81', # <control>
  27. 0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK
  28. 0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK
  29. 0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK
  30. 0x85: '\u2026', # HORIZONTAL ELLIPSIS
  31. 0x86: '\u2020', # DAGGER
  32. 0x87: '\u2021', # DOUBLE DAGGER
  33. 0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT
  34. 0x89: '\u2030', # PER MILLE SIGN
  35. 0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON
  36. 0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  37. 0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE
  38. 0x8d: '\x8d', # <control>
  39. 0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON
  40. 0x8f: '\x8f', # <control>
  41. 0x90: '\x90', # <control>
  42. 0x91: '\u2018', # LEFT SINGLE QUOTATION MARK
  43. 0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK
  44. 0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK
  45. 0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK
  46. 0x95: '\u2022', # BULLET
  47. 0x96: '\u2013', # EN DASH
  48. 0x97: '\u2014', # EM DASH
  49. 0x98: '\u02dc', # SMALL TILDE
  50. 0x99: '\u2122', # TRADE MARK SIGN
  51. 0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON
  52. 0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  53. 0x9c: '\u0153', # LATIN SMALL LIGATURE OE
  54. 0x9d: '\x9d', # <control>
  55. 0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON
  56. 0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS
  57. }
  58. _invalid_codepoints = {
  59. # 0x0001 to 0x0008
  60. 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
  61. # 0x000E to 0x001F
  62. 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  63. 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  64. # 0x007F to 0x009F
  65. 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
  66. 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
  67. 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  68. # 0xFDD0 to 0xFDEF
  69. 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
  70. 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
  71. 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
  72. 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
  73. # others
  74. 0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
  75. 0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
  76. 0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
  77. 0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
  78. 0x10fffe, 0x10ffff
  79. }
  80. def _replace_charref(s):
  81. s = s.group(1)
  82. if s[0] == '#':
  83. # numeric charref
  84. if s[1] in 'xX':
  85. num = int(s[2:].rstrip(';'), 16)
  86. else:
  87. num = int(s[1:].rstrip(';'))
  88. if num in _invalid_charrefs:
  89. return _invalid_charrefs[num]
  90. if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
  91. return '\uFFFD'
  92. if num in _invalid_codepoints:
  93. return ''
  94. return chr(num)
  95. else:
  96. # named charref
  97. if s in _html5:
  98. return _html5[s]
  99. # find the longest matching name (as defined by the standard)
  100. for x in range(len(s)-1, 1, -1):
  101. if s[:x] in _html5:
  102. return _html5[s[:x]] + s[x:]
  103. else:
  104. return '&' + s
  105. _charref = _re.compile(r'&(#[0-9]+;?'
  106. r'|#[xX][0-9a-fA-F]+;?'
  107. r'|[^\t\n\f <&#;]{1,32};?)')
  108. def unescape(s):
  109. """
  110. Convert all named and numeric character references (e.g. &gt;, &#62;,
  111. &x3e;) in the string s to the corresponding unicode characters.
  112. This function uses the rules defined by the HTML 5 standard
  113. for both valid and invalid character references, and the list of
  114. HTML 5 named character references defined in html.entities.html5.
  115. """
  116. if '&' not in s:
  117. return s
  118. return _charref.sub(_replace_charref, s)