chardetect.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
"""
Script which takes one or more file paths and reports on their detected
encodings.

Example::

    % chardetect somefile someotherfile
    somefile: windows-1252 with confidence 0.5
    someotherfile: ascii with confidence 1.0

If no paths are provided, it takes its input from stdin.
"""
  10. from __future__ import absolute_import, print_function, unicode_literals
  11. import argparse
  12. import sys
  13. from chardet import __version__
  14. from chardet.compat import PY2
  15. from chardet.universaldetector import UniversalDetector
  16. def description_of(lines, name='stdin'):
  17. """
  18. Return a string describing the probable encoding of a file or
  19. list of strings.
  20. :param lines: The lines to get the encoding of.
  21. :type lines: Iterable of bytes
  22. :param name: Name of file or collection of lines
  23. :type name: str
  24. """
  25. u = UniversalDetector()
  26. for line in lines:
  27. line = bytearray(line)
  28. u.feed(line)
  29. # shortcut out of the loop to save reading further - particularly useful if we read a BOM.
  30. if u.done:
  31. break
  32. u.close()
  33. result = u.result
  34. if PY2:
  35. name = name.decode(sys.getfilesystemencoding(), 'ignore')
  36. if result['encoding']:
  37. return '{}: {} with confidence {}'.format(name, result['encoding'],
  38. result['confidence'])
  39. else:
  40. return '{}: no result'.format(name)
  41. def main(argv=None):
  42. """
  43. Handles command line arguments and gets things started.
  44. :param argv: List of arguments, as if specified on the command-line.
  45. If None, ``sys.argv[1:]`` is used instead.
  46. :type argv: list of str
  47. """
  48. # Get command line arguments
  49. parser = argparse.ArgumentParser(
  50. description="Takes one or more file paths and reports their detected \
  51. encodings")
  52. parser.add_argument('input',
  53. help='File whose encoding we would like to determine. \
  54. (default: stdin)',
  55. type=argparse.FileType('rb'), nargs='*',
  56. default=[sys.stdin if PY2 else sys.stdin.buffer])
  57. parser.add_argument('--version', action='version',
  58. version='%(prog)s {}'.format(__version__))
  59. args = parser.parse_args(argv)
  60. for f in args.input:
  61. if f.isatty():
  62. print("You are running chardetect interactively. Press " +
  63. "CTRL-D twice at the start of a blank line to signal the " +
  64. "end of your input. If you want help, run chardetect " +
  65. "--help\n", file=sys.stderr)
  66. print(description_of(f, f.name))
  67. if __name__ == '__main__':
  68. main()