Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect encoding per yaml spec (fix #238) #240

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
use chardet for encoding detection
spyoungtech committed Apr 4, 2020
commit a68a80143a449ceb2045163e3c3284b02c2f6eb7
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -51,6 +51,6 @@
packages=find_packages(exclude=['tests', 'tests.*']),
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
package_data={'yamllint': ['conf/*.yaml']},
install_requires=['pathspec >=0.5.3', 'pyyaml'],
install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
test_suite='tests',
)
19 changes: 11 additions & 8 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -92,9 +92,9 @@ def setUpClass(cls):
'dos.yml': '---\r\n'
'dos: true',
# UTF-16 Little Endian BOM
'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n',
'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
# UTF-16 Big Endian
'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n',
'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
# UTF-8 BOM
'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n',
})
@@ -532,15 +532,18 @@ def test_run_non_universal_newline(self):

def test_encoding_detection_utf16le(self):
    """Lint a UTF-16-LE (BOM-prefixed) file end to end.

    Encoding detection now happens inside cli.run() when the file is
    opened, so there is no separate detection API left to assert on;
    a clean run (exit 0, no output) proves the bytes were decoded.
    """
    path = os.path.join(self.wd, 'non-ascii/utf16le')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))

def test_encoding_detection_utf16be(self):
    """Lint a UTF-16-BE (BOM-prefixed) file end to end.

    Detection is internal to cli.run()'s file opener now, so the test
    only checks the lint run succeeds with no diagnostics.
    """
    path = os.path.join(self.wd, 'non-ascii/utf16be')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))

def test_encoding_detection_utf8(self):
    """Lint a UTF-8 (BOM-prefixed) file end to end.

    As with the UTF-16 cases, encoding detection is exercised
    implicitly through cli.run(); a clean exit is the assertion.
    """
    path = os.path.join(self.wd, 'non-ascii/utf8')
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable', path))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                     (0, '', ''))
26 changes: 13 additions & 13 deletions yamllint/cli.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,8 @@
from __future__ import print_function

import argparse
import codecs
import chardet
import contextlib
import io
import os
import platform
@@ -29,16 +30,16 @@
from yamllint.linter import PROBLEM_LEVELS


def determine_encoding(file):
    """Guess the text encoding of *file* from its byte-order mark.

    Only the first bytes are inspected: a UTF-16 little- or big-endian
    BOM selects the matching codec; every other prefix (including a
    UTF-8 BOM or no BOM at all) falls back to ``'utf-8'``.
    """
    with io.open(file, 'rb') as handle:
        head = handle.read(4)
    # Order matters only for documentation; the two BOMs are disjoint.
    for bom, codec in ((codecs.BOM_UTF16_LE, 'utf-16-le'),
                       (codecs.BOM_UTF16_BE, 'utf-16-be')):
        if head.startswith(bom):
            return codec
    return 'utf-8'
@contextlib.contextmanager
def yamlopen(fp, **iowrapper_kwargs):
    """Context manager yielding *fp* opened as a decoded text stream.

    When no explicit ``encoding`` keyword is supplied, the raw bytes
    are sniffed with :mod:`chardet`; if detection yields nothing,
    UTF-8 is assumed. All remaining keyword arguments are forwarded
    to ``io.TextIOWrapper``.
    """
    encoding = iowrapper_kwargs.pop('encoding', None)
    with io.open(fp, mode='rb') as raw_file:
        if encoding is None:
            # Sniff the whole file, then rewind so the text wrapper
            # decodes from the start.
            guess = chardet.detect(raw_file.read()).get('encoding')
            encoding = guess if guess else 'utf-8'
            raw_file.seek(0)
        with io.TextIOWrapper(raw_file, encoding=encoding,
                              **iowrapper_kwargs) as decoded:
            yield decoded


def find_files_recursively(items, conf):
@@ -190,8 +191,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
encoding = determine_encoding(file)
with io.open(file, newline='', encoding=encoding) as f:
with yamlopen(file, newline='') as f:
problems = linter.run(f, conf, filepath)
except EnvironmentError as e:
print(e, file=sys.stderr)