sbcharsetprober.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. ######################## BEGIN LICENSE BLOCK ########################
  2. # The Original Code is Mozilla Universal charset detector code.
  3. #
  4. # The Initial Developer of the Original Code is
  5. # Netscape Communications Corporation.
  6. # Portions created by the Initial Developer are Copyright (C) 2001
  7. # the Initial Developer. All Rights Reserved.
  8. #
  9. # Contributor(s):
  10. # Mark Pilgrim - port to Python
  11. # Shy Shalom - original C code
  12. #
  13. # This library is free software; you can redistribute it and/or
  14. # modify it under the terms of the GNU Lesser General Public
  15. # License as published by the Free Software Foundation; either
  16. # version 2.1 of the License, or (at your option) any later version.
  17. #
  18. # This library is distributed in the hope that it will be useful,
  19. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. # Lesser General Public License for more details.
  22. #
  23. # You should have received a copy of the GNU Lesser General Public
  24. # License along with this library; if not, write to the Free Software
  25. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  26. # 02110-1301 USA
  27. ######################### END LICENSE BLOCK #########################
  28. from collections import namedtuple
  29. from .charsetprober import CharSetProber
  30. from .enums import CharacterCategory, ProbingState, SequenceLikelihood
  31. SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
  32. ['charset_name',
  33. 'language',
  34. 'char_to_order_map',
  35. 'language_model',
  36. 'typical_positive_ratio',
  37. 'keep_ascii_letters',
  38. 'alphabet'])
  39. class SingleByteCharSetProber(CharSetProber):
  40. SAMPLE_SIZE = 64
  41. SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
  42. POSITIVE_SHORTCUT_THRESHOLD = 0.95
  43. NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  44. def __init__(self, model, reversed=False, name_prober=None):
  45. super(SingleByteCharSetProber, self).__init__()
  46. self._model = model
  47. # TRUE if we need to reverse every pair in the model lookup
  48. self._reversed = reversed
  49. # Optional auxiliary prober for name decision
  50. self._name_prober = name_prober
  51. self._last_order = None
  52. self._seq_counters = None
  53. self._total_seqs = None
  54. self._total_char = None
  55. self._freq_char = None
  56. self.reset()
  57. def reset(self):
  58. super(SingleByteCharSetProber, self).reset()
  59. # char order of last character
  60. self._last_order = 255
  61. self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
  62. self._total_seqs = 0
  63. self._total_char = 0
  64. # characters that fall in our sampling range
  65. self._freq_char = 0
  66. @property
  67. def charset_name(self):
  68. if self._name_prober:
  69. return self._name_prober.charset_name
  70. else:
  71. return self._model.charset_name
  72. @property
  73. def language(self):
  74. if self._name_prober:
  75. return self._name_prober.language
  76. else:
  77. return self._model.language
  78. def feed(self, byte_str):
  79. # TODO: Make filter_international_words keep things in self.alphabet
  80. if not self._model.keep_ascii_letters:
  81. byte_str = self.filter_international_words(byte_str)
  82. if not byte_str:
  83. return self.state
  84. char_to_order_map = self._model.char_to_order_map
  85. language_model = self._model.language_model
  86. for char in byte_str:
  87. order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
  88. # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
  89. # CharacterCategory.SYMBOL is actually 253, so we use CONTROL
  90. # to make it closer to the original intent. The only difference
  91. # is whether or not we count digits and control characters for
  92. # _total_char purposes.
  93. if order < CharacterCategory.CONTROL:
  94. self._total_char += 1
  95. # TODO: Follow uchardet's lead and discount confidence for frequent
  96. # control characters.
  97. # See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
  98. if order < self.SAMPLE_SIZE:
  99. self._freq_char += 1
  100. if self._last_order < self.SAMPLE_SIZE:
  101. self._total_seqs += 1
  102. if not self._reversed:
  103. lm_cat = language_model[self._last_order][order]
  104. else:
  105. lm_cat = language_model[order][self._last_order]
  106. self._seq_counters[lm_cat] += 1
  107. self._last_order = order
  108. charset_name = self._model.charset_name
  109. if self.state == ProbingState.DETECTING:
  110. if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
  111. confidence = self.get_confidence()
  112. if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
  113. self.logger.debug('%s confidence = %s, we have a winner',
  114. charset_name, confidence)
  115. self._state = ProbingState.FOUND_IT
  116. elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
  117. self.logger.debug('%s confidence = %s, below negative '
  118. 'shortcut threshhold %s', charset_name,
  119. confidence,
  120. self.NEGATIVE_SHORTCUT_THRESHOLD)
  121. self._state = ProbingState.NOT_ME
  122. return self.state
  123. def get_confidence(self):
  124. r = 0.01
  125. if self._total_seqs > 0:
  126. r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
  127. self._total_seqs / self._model.typical_positive_ratio)
  128. r = r * self._freq_char / self._total_char
  129. if r >= 1.0:
  130. r = 0.99
  131. return r