codecs.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. """Extend the Python codecs module with a few encodings that are used in OpenType (name table)
  2. but missing from Python. See https://github.com/fonttools/fonttools/issues/236 for details."""
  3. import codecs
  4. import encodings
  5. class ExtendCodec(codecs.Codec):
  6. def __init__(self, name, base_encoding, mapping):
  7. self.name = name
  8. self.base_encoding = base_encoding
  9. self.mapping = mapping
  10. self.reverse = {v: k for k, v in mapping.items()}
  11. self.max_len = max(len(v) for v in mapping.values())
  12. self.info = codecs.CodecInfo(
  13. name=self.name, encode=self.encode, decode=self.decode
  14. )
  15. codecs.register_error(name, self.error)
  16. def _map(self, mapper, output_type, exc_type, input, errors):
  17. base_error_handler = codecs.lookup_error(errors)
  18. length = len(input)
  19. out = output_type()
  20. while input:
  21. # first try to use self.error as the error handler
  22. try:
  23. part = mapper(input, self.base_encoding, errors=self.name)
  24. out += part
  25. break # All converted
  26. except exc_type as e:
  27. # else convert the correct part, handle error as requested and continue
  28. out += mapper(input[: e.start], self.base_encoding, self.name)
  29. replacement, pos = base_error_handler(e)
  30. out += replacement
  31. input = input[pos:]
  32. return out, length
  33. def encode(self, input, errors="strict"):
  34. return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)
  35. def decode(self, input, errors="strict"):
  36. return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)
  37. def error(self, e):
  38. if isinstance(e, UnicodeDecodeError):
  39. for end in range(e.start + 1, e.end + 1):
  40. s = e.object[e.start : end]
  41. if s in self.mapping:
  42. return self.mapping[s], end
  43. elif isinstance(e, UnicodeEncodeError):
  44. for end in range(e.start + 1, e.start + self.max_len + 1):
  45. s = e.object[e.start : end]
  46. if s in self.reverse:
  47. return self.reverse[s], end
  48. e.encoding = self.name
  49. raise e
  50. _extended_encodings = {
  51. "x_mac_japanese_ttx": (
  52. "shift_jis",
  53. {
  54. b"\xFC": chr(0x007C),
  55. b"\x7E": chr(0x007E),
  56. b"\x80": chr(0x005C),
  57. b"\xA0": chr(0x00A0),
  58. b"\xFD": chr(0x00A9),
  59. b"\xFE": chr(0x2122),
  60. b"\xFF": chr(0x2026),
  61. },
  62. ),
  63. "x_mac_trad_chinese_ttx": (
  64. "big5",
  65. {
  66. b"\x80": chr(0x005C),
  67. b"\xA0": chr(0x00A0),
  68. b"\xFD": chr(0x00A9),
  69. b"\xFE": chr(0x2122),
  70. b"\xFF": chr(0x2026),
  71. },
  72. ),
  73. "x_mac_korean_ttx": (
  74. "euc_kr",
  75. {
  76. b"\x80": chr(0x00A0),
  77. b"\x81": chr(0x20A9),
  78. b"\x82": chr(0x2014),
  79. b"\x83": chr(0x00A9),
  80. b"\xFE": chr(0x2122),
  81. b"\xFF": chr(0x2026),
  82. },
  83. ),
  84. "x_mac_simp_chinese_ttx": (
  85. "gb2312",
  86. {
  87. b"\x80": chr(0x00FC),
  88. b"\xA0": chr(0x00A0),
  89. b"\xFD": chr(0x00A9),
  90. b"\xFE": chr(0x2122),
  91. b"\xFF": chr(0x2026),
  92. },
  93. ),
  94. }
  95. _cache = {}
  96. def search_function(name):
  97. name = encodings.normalize_encoding(name) # Rather undocumented...
  98. if name in _extended_encodings:
  99. if name not in _cache:
  100. base_encoding, mapping = _extended_encodings[name]
  101. assert name[-4:] == "_ttx"
  102. # Python 2 didn't have any of the encodings that we are implementing
  103. # in this file. Python 3 added aliases for the East Asian ones, mapping
  104. # them "temporarily" to the same base encoding as us, with a comment
  105. # suggesting that full implementation will appear some time later.
  106. # As such, try the Python version of the x_mac_... first, if that is found,
  107. # use *that* as our base encoding. This would make our encoding upgrade
  108. # to the full encoding when and if Python finally implements that.
  109. # http://bugs.python.org/issue24041
  110. base_encodings = [name[:-4], base_encoding]
  111. for base_encoding in base_encodings:
  112. try:
  113. codecs.lookup(base_encoding)
  114. except LookupError:
  115. continue
  116. _cache[name] = ExtendCodec(name, base_encoding, mapping)
  117. break
  118. return _cache[name].info
  119. return None
  120. codecs.register(search_function)