__init__.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. """ Standard "encodings" Package
  2. Standard Python encoding modules are stored in this package
  3. directory.
  4. Codec modules must have names corresponding to normalized encoding
  5. names as defined in the normalize_encoding() function below, e.g.
  6. 'utf-8' must be implemented by the module 'utf_8.py'.
  7. Each codec module must export the following interface:
  8. * getregentry() -> codecs.CodecInfo object
  9. The getregentry() API must return a CodecInfo object with encoder, decoder,
  10. incrementalencoder, incrementaldecoder, streamwriter and streamreader
  11. attributes which adhere to the Python Codec Interface Standard.
  12. In addition, a module may optionally also define the following
  13. APIs which are then used by the package's codec search function:
  14. * getaliases() -> sequence of encoding name strings to use as aliases
  15. Alias names returned by getaliases() must be normalized encoding
  16. names as defined by normalize_encoding().
  17. Written by Marc-Andre Lemburg (mal@lemburg.com).
  18. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  19. """#"
  20. import codecs
  21. import sys
  22. from . import aliases
# Cache of search results, keyed by the encoding name exactly as it was
# passed to search_function(); failed lookups are cached as None.
_cache = {}
# Sentinel default for _cache.get(), so that a cached None (a known miss)
# can be told apart from a name that has never been looked up.
_unknown = '--unknown--'
# Passed as ``fromlist`` to __import__() when loading a codec module.
_import_tail = ['*']
# Alias table (alias name -> module name) taken from the sibling
# ``aliases`` module; search_function() adds entries reported by a codec
# module's getaliases().
_aliases = aliases.aliases
  27. class CodecRegistryError(LookupError, SystemError):
  28. pass
  29. def normalize_encoding(encoding):
  30. """ Normalize an encoding name.
  31. Normalization works as follows: all non-alphanumeric
  32. characters except the dot used for Python package names are
  33. collapsed and replaced with a single underscore, e.g. ' -;#'
  34. becomes '_'. Leading and trailing underscores are removed.
  35. Note that encoding names should be ASCII only.
  36. """
  37. if isinstance(encoding, bytes):
  38. encoding = str(encoding, "ascii")
  39. chars = []
  40. punct = False
  41. for c in encoding:
  42. if c.isalnum() or c == '.':
  43. if punct and chars:
  44. chars.append('_')
  45. chars.append(c)
  46. punct = False
  47. else:
  48. punct = True
  49. return ''.join(chars)
  50. def search_function(encoding):
  51. # Cache lookup
  52. entry = _cache.get(encoding, _unknown)
  53. if entry is not _unknown:
  54. return entry
  55. # Import the module:
  56. #
  57. # First try to find an alias for the normalized encoding
  58. # name and lookup the module using the aliased name, then try to
  59. # lookup the module using the standard import scheme, i.e. first
  60. # try in the encodings package, then at top-level.
  61. #
  62. norm_encoding = normalize_encoding(encoding)
  63. aliased_encoding = _aliases.get(norm_encoding) or \
  64. _aliases.get(norm_encoding.replace('.', '_'))
  65. if aliased_encoding is not None:
  66. modnames = [aliased_encoding,
  67. norm_encoding]
  68. else:
  69. modnames = [norm_encoding]
  70. for modname in modnames:
  71. if not modname or '.' in modname:
  72. continue
  73. try:
  74. # Import is absolute to prevent the possibly malicious import of a
  75. # module with side-effects that is not in the 'encodings' package.
  76. mod = __import__('encodings.' + modname, fromlist=_import_tail,
  77. level=0)
  78. except ImportError:
  79. # ImportError may occur because 'encodings.(modname)' does not exist,
  80. # or because it imports a name that does not exist (see mbcs and oem)
  81. pass
  82. else:
  83. break
  84. else:
  85. mod = None
  86. try:
  87. getregentry = mod.getregentry
  88. except AttributeError:
  89. # Not a codec module
  90. mod = None
  91. if mod is None:
  92. # Cache misses
  93. _cache[encoding] = None
  94. return None
  95. # Now ask the module for the registry entry
  96. entry = getregentry()
  97. if not isinstance(entry, codecs.CodecInfo):
  98. if not 4 <= len(entry) <= 7:
  99. raise CodecRegistryError('module "%s" (%s) failed to register'
  100. % (mod.__name__, mod.__file__))
  101. if not callable(entry[0]) or not callable(entry[1]) or \
  102. (entry[2] is not None and not callable(entry[2])) or \
  103. (entry[3] is not None and not callable(entry[3])) or \
  104. (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
  105. (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
  106. raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
  107. % (mod.__name__, mod.__file__))
  108. if len(entry)<7 or entry[6] is None:
  109. entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
  110. entry = codecs.CodecInfo(*entry)
  111. # Cache the codec registry entry
  112. _cache[encoding] = entry
  113. # Register its aliases (without overwriting previously registered
  114. # aliases)
  115. try:
  116. codecaliases = mod.getaliases()
  117. except AttributeError:
  118. pass
  119. else:
  120. for alias in codecaliases:
  121. if alias not in _aliases:
  122. _aliases[alias] = modname
  123. # Return the registry entry
  124. return entry
# Register search_function with the Python codec registry so that it is
# called to resolve encoding names.
codecs.register(search_function)
  127. if sys.platform == 'win32':
  128. def _alias_mbcs(encoding):
  129. try:
  130. import _winapi
  131. ansi_code_page = "cp%s" % _winapi.GetACP()
  132. if encoding == ansi_code_page:
  133. import encodings.mbcs
  134. return encodings.mbcs.getregentry()
  135. except ImportError:
  136. # Imports may fail while we are shutting down
  137. pass
  138. codecs.register(_alias_mbcs)