Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Detect encoding per yaml spec (fix #238) #240

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@
packages=find_packages(exclude=['tests', 'tests.*']),
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
package_data={'yamllint': ['conf/*.yaml']},
install_requires=['pathspec >=0.5.3', 'pyyaml'],
install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
test_suite='tests',
)
73 changes: 72 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def setUpClass(cls):
# dos line endings yaml
'dos.yml': '---\r\n'
'dos: true',
# UTF-16 Little Endian BOM
'non-ascii/utf-16-le':
b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
# UTF-16 Big Endian
'non-ascii/utf-16-be':
b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
# UTF-8 BOM
'non-ascii/utf-8': b'\xef\xbb\xbf---\nutf8: true\n',
# Random bytes that have no possible encoding
'non-ascii/undetectable': b'\x05\xfc\x17A\xb6\x15\x15\x90>9'
})

@classmethod
Expand Down Expand Up @@ -171,6 +181,10 @@ def test_find_files_recursively(self):
os.path.join(self.wd, 'dos.yml'),
os.path.join(self.wd, 'empty.yml'),
os.path.join(self.wd, 'no-yaml.json'),
os.path.join(self.wd, 'non-ascii/undetectable'),
os.path.join(self.wd, 'non-ascii/utf-16-be'),
os.path.join(self.wd, 'non-ascii/utf-16-le'),
os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
os.path.join(self.wd, 'sub/ok.yaml'),
Expand All @@ -188,6 +202,10 @@ def test_find_files_recursively(self):
os.path.join(self.wd, 'dos.yml'),
os.path.join(self.wd, 'empty.yml'),
os.path.join(self.wd, 'no-yaml.json'),
os.path.join(self.wd, 'non-ascii/undetectable'),
os.path.join(self.wd, 'non-ascii/utf-16-be'),
os.path.join(self.wd, 'non-ascii/utf-16-le'),
os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
os.path.join(self.wd, 'sub/ok.yaml'),
Expand All @@ -200,7 +218,8 @@ def test_find_files_recursively(self):
' - \'**/utf-8\'\n')
self.assertEqual(
sorted(cli.find_files_recursively([self.wd], conf)),
[os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
[os.path.join(self.wd, 'non-ascii/utf-8'),
os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
)

def test_run_with_bad_arguments(self):
Expand Down Expand Up @@ -517,3 +536,55 @@ def test_run_non_universal_newline(self):
'\n' % path)
self.assertEqual(
(ctx.returncode, ctx.stdout, ctx.stderr), (1, expected_out, ''))

def test_encoding_detection_utf16le(self):
    """Linting a UTF-16-LE file (with BOM) succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-16-le')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_encoding_detection_utf16be(self):
    """Linting a UTF-16-BE file (with BOM) succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-16-be')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_encoding_detection_utf8(self):
    """Linting a UTF-8 file with a BOM succeeds with no output."""
    with RunContext(self) as ctx:
        cli.run(('-f', 'parsable',
                 os.path.join(self.wd, 'non-ascii/utf-8')))
    self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))

def test_detected_encoding_utf8(self):
    """A BOM-less UTF-8 file is reported under the 'utf-8' codec name."""
    target = os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')
    with cli.yamlopen(target) as yaml_file:
        yaml_file.read()
        detected = yaml_file.encoding
    self.assertEqual(detected, 'utf-8')

def test_detected_encoding_utf8_sig(self):
    """A UTF-8 byte-order mark yields the 'UTF-8-SIG' codec name."""
    target = os.path.join(self.wd, 'non-ascii/utf-8')
    with cli.yamlopen(target) as yaml_file:
        yaml_file.read()
        detected = yaml_file.encoding
    self.assertEqual(detected, 'UTF-8-SIG')

def test_detected_encoding_utf16(self):
    """Both UTF-16 byte orders are reported as the generic 'UTF-16' codec.

    The BOM in each file disambiguates the byte order, so a single codec
    name covers both fixtures.
    """
    for name in ('non-ascii/utf-16-le', 'non-ascii/utf-16-be'):
        with cli.yamlopen(os.path.join(self.wd, name)) as yaml_file:
            yaml_file.read()
            self.assertEqual(yaml_file.encoding, 'UTF-16')

def test_explicit_encoding(self):
    """An explicit encoding= argument bypasses detection entirely."""
    target = os.path.join(self.wd, 'a.yaml')
    with cli.yamlopen(target, encoding='windows-1252') as yaml_file:
        yaml_file.read()
        self.assertEqual(yaml_file.encoding, 'windows-1252')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not test anything because it was already opened with encoding='windows-1252', does it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It tests that yamlopen correctly accepts an explicit encoding, instead of detecting the encoding with chardet.

There are tests that run the linter on all the new non-ascii files, except for the file that is made up of random bytes that cannot be decoded and exists just to be sure UTF-8 is used as a default if chardet can't detect any encoding.

I understand not wanting to add dependencies. If it's any consolation, chardet itself is pure Python with no external dependencies. I can also go back and revert to just checking the BOM, per the YAML spec, and expect failure or malformed data in any other case.

The motivation for using chardet was the comment from bz2 about making a best-effort to parse files even if they're not an acceptable encoding.

The yamlopen context manager was also implemented per request on review. I can remove that, too, if you prefer not to use it.

Copy link
Owner

@adrienverge adrienverge Apr 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It tests that yamlopen correctly accepts an explicit encoding, instead of detecting the encoding with chardet.

OK, thanks for the explanation!

There are tests that run the linter on all the new non-ascii files, except for the file that is made up of random bytes that cannot be decoded and exists just to be sure UTF-8 is used as a default if chardet can't detect any encoding.

Sure. But still no tests for ISO-8859-1-encoded file, unless I missed something?

I understand not wanting to add dependencies. If it's any consolation chardet itself is pure python with no external dependencies. I can also go back and revert to just checking the BOM, per the yaml spec and expecting failure or malformed data in any other case.

The motivation for using chardet was the comment from bz2 about making a best-effort to parse files even if they're not an acceptable encoding.

Sure, I understood that. But I'm very against adding new dependencies (pure Python or not).
Have you seen my proposal about a temporary Python-2-only solution? What do you think of it?
PS: I opened this related PR: #249

The yamlopen context manager was also implemented per request on review. I can remove that, too, if you prefer not to use it.

Yes I've seen it, but I don't think it helps readability.


def test_default_encoding(self):
    """Bytes with no detectable encoding fall back to the 'utf-8' default."""
    with cli.yamlopen(
            os.path.join(self.wd, 'non-ascii/undetectable')) as yaml_file:
        self.assertEqual(yaml_file.encoding, 'utf-8')
20 changes: 19 additions & 1 deletion yamllint/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,35 @@
from __future__ import print_function

import argparse
import codecs
import contextlib
import io
import os
import platform
import sys

import chardet

from yamllint import APP_DESCRIPTION, APP_NAME, APP_VERSION
from yamllint import linter
from yamllint.config import YamlLintConfig, YamlLintConfigError
from yamllint.linter import PROBLEM_LEVELS


@contextlib.contextmanager
def yamlopen(fp, **iowrapper_kwargs):
    """Open the YAML file at path ``fp`` for reading as text.

    Per the YAML 1.2 spec (section 5.2), the character encoding is deduced
    from a leading byte-order mark when one is present, and defaults to
    UTF-8 otherwise.  This is deterministic and needs no third-party
    heuristic (the original used chardet, a dependency the project does
    not want).  Passing an explicit ``encoding=`` keyword bypasses
    detection; any other keyword arguments (e.g. ``newline=''``) are
    forwarded to ``io.TextIOWrapper``.

    Yields a text-mode file object whose ``.encoding`` attribute reports
    the encoding that was used.
    """
    with io.open(fp, mode='rb') as raw_file:
        if iowrapper_kwargs.get('encoding'):
            # Caller chose an encoding explicitly: trust it, stream the file.
            with io.TextIOWrapper(raw_file, **iowrapper_kwargs) as decoded:
                yield decoded
        else:
            raw_data = raw_file.read()
            # BOM sniffing.  UTF-32 BOMs must be tested before UTF-16,
            # because BOM_UTF32_LE starts with the bytes of BOM_UTF16_LE.
            if raw_data.startswith(codecs.BOM_UTF8):
                encoding = 'UTF-8-SIG'
            elif raw_data.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                encoding = 'UTF-32'
            elif raw_data.startswith((codecs.BOM_UTF16_LE,
                                      codecs.BOM_UTF16_BE)):
                encoding = 'UTF-16'
            else:
                # No BOM: the YAML spec mandates UTF-8 as the default.
                encoding = 'utf-8'
            iowrapper_kwargs['encoding'] = encoding
            buff = io.BytesIO(raw_data)
            with io.TextIOWrapper(buff, **iowrapper_kwargs) as decoded:
                yield decoded


def find_files_recursively(items, conf):
for item in items:
if os.path.isdir(item):
Expand Down Expand Up @@ -177,7 +195,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
with io.open(file, newline='') as f:
with yamlopen(file, newline='') as f:
problems = linter.run(f, conf, filepath)
except EnvironmentError as e:
print(e, file=sys.stderr)
Expand Down