Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect encoding per yaml spec (fix #238) #240

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
use chardet for encoding detection
spyoungtech committed Apr 4, 2020
commit a68a80143a449ceb2045163e3c3284b02c2f6eb7
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -51,6 +51,6 @@
packages=find_packages(exclude=['tests', 'tests.*']),
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
package_data={'yamllint': ['conf/*.yaml']},
install_requires=['pathspec >=0.5.3', 'pyyaml'],
install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
test_suite='tests',
)
19 changes: 11 additions & 8 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -92,9 +92,9 @@ def setUpClass(cls):
'dos.yml': '---\r\n'
'dos: true',
# UTF-16 Little Endian BOM
'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n',
'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
# UTF-16 Big Endian
'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n',
'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
# UTF-8 BOM
'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n',
})
@@ -532,15 +532,18 @@ def test_run_non_universal_newline(self):

def test_encoding_detection_utf16le(self):
    """Lint a UTF-16-LE (BOM-prefixed) file end to end.

    Encoding detection now happens inside cli.run() when the file is
    opened, so there is no separate detection API left to assert on;
    a clean run (exit 0, no output) proves the bytes were decoded.
    """
    path = os.path.join(self.wd, 'non-ascii/utf16le')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))

def test_encoding_detection_utf16be(self):
    """Lint a UTF-16-BE (BOM-prefixed) file end to end.

    Detection is internal to cli.run()'s file opener now, so the test
    only checks the lint run succeeds with no diagnostics.
    """
    path = os.path.join(self.wd, 'non-ascii/utf16be')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))

def test_encoding_detection_utf8(self):
    """Lint a UTF-8 (BOM-prefixed) file end to end.

    As with the UTF-16 cases, encoding detection is exercised
    implicitly through cli.run(); a clean exit is the assertion.
    """
    path = os.path.join(self.wd, 'non-ascii/utf8')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))
26 changes: 13 additions & 13 deletions yamllint/cli.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,8 @@
from __future__ import print_function

import argparse
import codecs
import chardet
import contextlib
import io
import os
import platform
@@ -29,16 +30,16 @@
from yamllint.linter import PROBLEM_LEVELS


def determine_encoding(file):
    """Guess the text encoding of *file* from its byte-order mark.

    Only the first bytes are inspected: a UTF-16 little- or big-endian
    BOM selects the matching codec; every other prefix (including a
    UTF-8 BOM or no BOM at all) falls back to ``'utf-8'``.
    """
    with io.open(file, 'rb') as handle:
        head = handle.read(4)
    # Order matters only for documentation; the two BOMs are disjoint.
    for bom, codec in ((codecs.BOM_UTF16_LE, 'utf-16-le'),
                       (codecs.BOM_UTF16_BE, 'utf-16-be')):
        if head.startswith(bom):
            return codec
    return 'utf-8'
@contextlib.contextmanager
def yamlopen(fp, **iowrapper_kwargs):
    """Context manager yielding *fp* opened as a decoded text stream.

    When no explicit ``encoding`` keyword is supplied, the raw bytes
    are sniffed with :mod:`chardet`; if detection yields nothing,
    UTF-8 is assumed. All remaining keyword arguments are forwarded
    to ``io.TextIOWrapper``.
    """
    encoding = iowrapper_kwargs.pop('encoding', None)
    with io.open(fp, mode='rb') as raw_file:
        if encoding is None:
            # Sniff the whole file, then rewind so the text wrapper
            # decodes from the start.
            guess = chardet.detect(raw_file.read()).get('encoding')
            encoding = guess if guess else 'utf-8'
            raw_file.seek(0)
        with io.TextIOWrapper(raw_file, encoding=encoding,
                              **iowrapper_kwargs) as decoded:
            yield decoded


def find_files_recursively(items, conf):
@@ -190,8 +191,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
encoding = determine_encoding(file)
with io.open(file, newline='', encoding=encoding) as f:
with yamlopen(file, newline='') as f:
problems = linter.run(f, conf, filepath)
except EnvironmentError as e:
print(e, file=sys.stderr)