sbcsgroupprober.py 4.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. ######################## BEGIN LICENSE BLOCK ########################
  2. # The Original Code is Mozilla Universal charset detector code.
  3. #
  4. # The Initial Developer of the Original Code is
  5. # Netscape Communications Corporation.
  6. # Portions created by the Initial Developer are Copyright (C) 2001
  7. # the Initial Developer. All Rights Reserved.
  8. #
  9. # Contributor(s):
  10. # Mark Pilgrim - port to Python
  11. # Shy Shalom - original C code
  12. #
  13. # This library is free software; you can redistribute it and/or
  14. # modify it under the terms of the GNU Lesser General Public
  15. # License as published by the Free Software Foundation; either
  16. # version 2.1 of the License, or (at your option) any later version.
  17. #
  18. # This library is distributed in the hope that it will be useful,
  19. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. # Lesser General Public License for more details.
  22. #
  23. # You should have received a copy of the GNU Lesser General Public
  24. # License along with this library; if not, write to the Free Software
  25. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  26. # 02110-1301 USA
  27. ######################### END LICENSE BLOCK #########################
  28. from .charsetgroupprober import CharSetGroupProber
  29. from .hebrewprober import HebrewProber
  30. from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
  31. WINDOWS_1251_BULGARIAN_MODEL)
  32. from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
  33. from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
  34. # from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
  35. # WINDOWS_1250_HUNGARIAN_MODEL)
  36. from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
  37. ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
  38. MACCYRILLIC_RUSSIAN_MODEL,
  39. WINDOWS_1251_RUSSIAN_MODEL)
  40. from .langthaimodel import TIS_620_THAI_MODEL
  41. from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
  42. from .sbcharsetprober import SingleByteCharSetProber
  class SBCSGroupProber(CharSetGroupProber):
      """Group prober that bundles the single-byte charset probers.

      On construction it creates one ``SingleByteCharSetProber`` per language
      model imported by this module (Russian, Greek, Bulgarian, Thai and
      Turkish), plus a ``HebrewProber`` together with the two
      Windows-1255 model probers that are registered with it, and stores
      them all in ``self.probers``.
      """

      def __init__(self):
          """Create every member prober, store them in order, then reset."""
          super(SBCSGroupProber, self).__init__()

          # Hebrew needs three cooperating objects: the HebrewProber itself
          # and two model probers built from the same Windows-1255 model.
          # The second positional argument separates the "logical" (False)
          # from the "visual" (True) variant -- presumably a reversed-text
          # flag; confirm against SingleByteCharSetProber's signature.
          hebrew = HebrewProber()
          logical = SingleByteCharSetProber(
              WINDOWS_1255_HEBREW_MODEL, False, hebrew
          )
          # TODO: See if using ISO-8859-8 Hebrew model works better here,
          #       since it's actually the visual one
          visual = SingleByteCharSetProber(
              WINDOWS_1255_HEBREW_MODEL, True, hebrew
          )
          hebrew.set_model_probers(logical, visual)

          # TODO: ORDER MATTERS HERE. I changed the order vs what was in
          #       master and several tests failed that did not before. Some
          #       thought should be put into the ordering, and we should
          #       consider making order not matter here, because that is
          #       very counter-intuitive.
          ordered_models = (
              WINDOWS_1251_RUSSIAN_MODEL,
              KOI8_R_RUSSIAN_MODEL,
              ISO_8859_5_RUSSIAN_MODEL,
              MACCYRILLIC_RUSSIAN_MODEL,
              IBM866_RUSSIAN_MODEL,
              IBM855_RUSSIAN_MODEL,
              ISO_8859_7_GREEK_MODEL,
              WINDOWS_1253_GREEK_MODEL,
              ISO_8859_5_BULGARIAN_MODEL,
              WINDOWS_1251_BULGARIAN_MODEL,
              # TODO: Restore Hungarian encodings (iso-8859-2 and
              #       windows-1250) after we retrain model:
              #       ISO_8859_2_HUNGARIAN_MODEL,
              #       WINDOWS_1250_HUNGARIAN_MODEL,
              TIS_620_THAI_MODEL,
              ISO_8859_9_TURKISH_MODEL,
          )

          # One prober per model, in the order above; the Hebrew trio stays
          # at the tail of the list, exactly as before.
          self.probers = [
              SingleByteCharSetProber(model) for model in ordered_models
          ] + [hebrew, logical, visual]
          self.reset()