Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Detect encoding per yaml spec (fix #238) #240

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@
packages=find_packages(exclude=['tests', 'tests.*']),
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
package_data={'yamllint': ['conf/*.yaml']},
install_requires=['pathspec >=0.5.3', 'pyyaml'],
install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
test_suite='tests',
)
73 changes: 72 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def setUpClass(cls):
# dos line endings yaml
'dos.yml': '---\r\n'
'dos: true',
# UTF-16 Little Endian BOM
'non-ascii/utf-16-le':
b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
# UTF-16 Big Endian
'non-ascii/utf-16-be':
b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
# UTF-8 BOM
'non-ascii/utf-8': b'\xef\xbb\xbf---\nutf8: true\n',
# Random bytes that have no possible encoding
'non-ascii/undetectable': b'\x05\xfc\x17A\xb6\x15\x15\x90>9'
})

@classmethod
Expand Down Expand Up @@ -171,6 +181,10 @@ def test_find_files_recursively(self):
os.path.join(self.wd, 'dos.yml'),
os.path.join(self.wd, 'empty.yml'),
os.path.join(self.wd, 'no-yaml.json'),
os.path.join(self.wd, 'non-ascii/undetectable'),
os.path.join(self.wd, 'non-ascii/utf-16-be'),
os.path.join(self.wd, 'non-ascii/utf-16-le'),
os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
os.path.join(self.wd, 'sub/ok.yaml'),
Expand All @@ -188,6 +202,10 @@ def test_find_files_recursively(self):
os.path.join(self.wd, 'dos.yml'),
os.path.join(self.wd, 'empty.yml'),
os.path.join(self.wd, 'no-yaml.json'),
os.path.join(self.wd, 'non-ascii/undetectable'),
os.path.join(self.wd, 'non-ascii/utf-16-be'),
os.path.join(self.wd, 'non-ascii/utf-16-le'),
os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
os.path.join(self.wd, 'sub/ok.yaml'),
Expand All @@ -200,7 +218,8 @@ def test_find_files_recursively(self):
' - \'**/utf-8\'\n')
self.assertEqual(
sorted(cli.find_files_recursively([self.wd], conf)),
[os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
[os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
)

def test_run_with_bad_arguments(self):
Expand Down Expand Up @@ -517,3 +536,55 @@ def test_run_non_universal_newline(self):
'\n' % path)
self.assertEqual(
(ctx.returncode, ctx.stdout, ctx.stderr), (1, expected_out, ''))

def test_encoding_detection_utf16le(self):
    """Linting a UTF-16-LE file (with BOM) succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-16-le')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_encoding_detection_utf16be(self):
    """Linting a UTF-16-BE file (with BOM) succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-16-be')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_encoding_detection_utf8(self):
    """Linting a UTF-8 file with a BOM succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-8')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_detected_encoding_utf8(self):
    """A BOM-less UTF-8 file is reported under the 'utf-8' codec name."""
    target = os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')
    with cli.yamlopen(target) as yaml_file:
        yaml_file.read()
        detected = yaml_file.encoding
    self.assertEqual(detected, 'utf-8')

def test_detected_encoding_utf8_sig(self):
    """A UTF-8 byte-order mark yields the 'UTF-8-SIG' codec name."""
    target = os.path.join(self.wd, 'non-ascii/utf-8')
    with cli.yamlopen(target) as yaml_file:
        yaml_file.read()
        detected = yaml_file.encoding
    self.assertEqual(detected, 'UTF-8-SIG')

def test_detected_encoding_utf16(self):
    """Both UTF-16 byte orders are reported as the generic 'UTF-16' codec.

    The BOM in each file disambiguates the byte order, so a single codec
    name covers both fixtures.
    """
    for name in ('non-ascii/utf-16-le', 'non-ascii/utf-16-be'):
        with cli.yamlopen(os.path.join(self.wd, name)) as yaml_file:
            yaml_file.read()
            self.assertEqual(yaml_file.encoding, 'UTF-16')

def test_explicit_encoding(self):
    """An explicit encoding= argument bypasses detection entirely."""
    target = os.path.join(self.wd, 'a.yaml')
    with cli.yamlopen(target, encoding='windows-1252') as yaml_file:
        yaml_file.read()
        self.assertEqual(yaml_file.encoding, 'windows-1252')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not test anything because it was already opened with encoding='windows-1252', does it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It tests that yamlopen correctly accepts an explicit encoding, instead of detecting the encoding with chardet.

There are tests that run the linter on all the new non-ascii files, except for the file that is made up of random bytes that cannot be decoded and exists just to be sure UTF-8 is used as a default if chardet can't detect any encoding.

I understand not wanting to add dependencies. If it's any consolation, chardet itself is pure Python with no external dependencies. I can also go back and revert to just checking the BOM, per the YAML spec, and expect failure or malformed data in any other case.

The motivation for using chardet was the comment from bz2 about making a best-effort to parse files even if they're not an acceptable encoding.

The yamlopen context manager was also implemented per request on review. I can remove that, too, if you prefer not to use it.

Copy link
Owner

@adrienverge adrienverge Apr 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It tests that yamlopen correctly accepts an explicit encoding, instead of detecting the encoding with chardet.

OK, thanks for the explanation!

There are tests that run the linter on all the new non-ascii files, except for the file that is made up of random bytes that cannot be decoded and exists just to be sure UTF-8 is used as a default if chardet can't detect any encoding.

Sure. But still no tests for ISO-8859-1-encoded file, unless I missed something?

I understand not wanting to add dependencies. If it's any consolation chardet itself is pure python with no external dependencies. I can also go back and revert to just checking the BOM, per the yaml spec and expecting failure or malformed data in any other case.

The motivation for using chardet was the comment from bz2 about making a best-effort to parse files even if they're not an acceptable encoding.

Sure, I understood that. But I'm very against adding new dependencies (pure Python or not).
Have you seen my proposal about a temporary Python-2-only solution? What do you think of it?
PS: I opened this related PR: #249

The yamlopen context manager was also implemented per request on review. I can remove that, too, if you prefer not to use it.

Yes I've seen it, but I don't think it helps readability.


def test_default_encoding(self):
    """Bytes with no detectable encoding fall back to the 'utf-8' default."""
    with cli.yamlopen(
            os.path.join(self.wd, 'non-ascii/undetectable')) as yaml_file:
        self.assertEqual(yaml_file.encoding, 'utf-8')
20 changes: 19 additions & 1 deletion yamllint/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,35 @@
from __future__ import print_function

import argparse
import codecs
import contextlib
import io
import os
import platform
import sys

import chardet

from yamllint import APP_DESCRIPTION, APP_NAME, APP_VERSION
from yamllint import linter
from yamllint.config import YamlLintConfig, YamlLintConfigError
from yamllint.linter import PROBLEM_LEVELS


@contextlib.contextmanager
def yamlopen(fp, **iowrapper_kwargs):
    """Open the YAML file at path ``fp`` for reading as text.

    Per the YAML 1.2 spec (section 5.2), the character encoding is deduced
    from a leading byte-order mark when one is present, and defaults to
    UTF-8 otherwise.  This is deterministic and needs no third-party
    heuristic (the original used chardet, a dependency the project does
    not want).  Passing an explicit ``encoding=`` keyword bypasses
    detection; any other keyword arguments (e.g. ``newline=''``) are
    forwarded to ``io.TextIOWrapper``.

    Yields a text-mode file object whose ``.encoding`` attribute reports
    the encoding that was used.
    """
    with io.open(fp, mode='rb') as raw_file:
        if iowrapper_kwargs.get('encoding'):
            # Caller chose an encoding explicitly: trust it, stream the file.
            with io.TextIOWrapper(raw_file, **iowrapper_kwargs) as decoded:
                yield decoded
        else:
            raw_data = raw_file.read()
            # BOM sniffing.  UTF-32 BOMs must be tested before UTF-16,
            # because BOM_UTF32_LE starts with the bytes of BOM_UTF16_LE.
            if raw_data.startswith(codecs.BOM_UTF8):
                encoding = 'UTF-8-SIG'
            elif raw_data.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                encoding = 'UTF-32'
            elif raw_data.startswith((codecs.BOM_UTF16_LE,
                                      codecs.BOM_UTF16_BE)):
                encoding = 'UTF-16'
            else:
                # No BOM: the YAML spec mandates UTF-8 as the default.
                encoding = 'utf-8'
            iowrapper_kwargs['encoding'] = encoding
            buff = io.BytesIO(raw_data)
            with io.TextIOWrapper(buff, **iowrapper_kwargs) as decoded:
                yield decoded


def find_files_recursively(items, conf):
for item in items:
if os.path.isdir(item):
Expand Down Expand Up @@ -177,7 +195,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
with io.open(file, newline='') as f:
with yamlopen(file, newline='') as f:
problems = linter.run(f, conf, filepath)
except EnvironmentError as e:
print(e, file=sys.stderr)
Expand Down