Merge pull request #20 from todofixthis/develop

Filters v3.1.0
todofixthis · Feb 3, 2023 · 5843483 · 5843483
2 parents 191466e + de9d540
commit 5843483
Show file tree

Hide file tree

Showing 15 changed files with 386 additions and 109 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,3 @@
 include LICENCE.txt
-recursive-include test *.py
+graft test
+prune **/__pycache__
diff --git a/README.rst b/README.rst
@@ -130,7 +130,11 @@ Install the package with the ``test-runner`` extra to set up the necessary
 dependencies, and then you can run the tests with the ``tox`` command::
 
    pip install -e .[test-runner]
-   tox -p all
+   tox -p
+
+To run tests in the current virtualenv::
+
+   python -m unittest
 
 Documentation
 -------------

diff --git a/docs/complex_filters.rst b/docs/complex_filters.rst
@@ -301,7 +301,7 @@ the incoming value's ``name`` item:
 .. _filterception:
 
 Filterception
-^^^^^^^^^^^^^
+=============
 Just like any other filter, complex filters can be chained with other filters.
 
 For example, to decode a JSON string that describes an address book card, the

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -101,7 +101,7 @@ If you want to reject ``None``, add the ``Required`` filter to your chain:
 Next Steps
 ==========
 See :doc:`/simple_filters` for a list of all the filters that come bundled with
-the Filters library (and its official extensions).
+the Filters library.
 
 Be sure to pay special attention to :doc:`/complex_filters`, which lists filters
 designed exclusively to work with other filters, allowing you to construct

diff --git a/docs/simple_filters.rst b/docs/simple_filters.rst
@@ -600,41 +600,118 @@ max value, set ``exclusive=True`` in the filter's initialiser:
 
 MaxBytes
 --------
-Truncates a unicode string to a max number of bytes.  When converting to a
-multibyte encoding (e.g., UTF-8), the filter will truncate additional bytes as
-needed to avoid orphaned sequences (see example below).
+Checks that a string will fit into a max number of bytes when encoded (using
+UTF-8 by default).
 
 .. important::
 
    The resulting value will be a byte string (``bytes`` type), not a unicode
    string!
 
+.. code-block:: python
+
+      import filters as f
+
+      runner = f.FilterRunner(f.MaxBytes(25), 'Γειάσου Κόσμε')
+      assert runner.is_valid() is True
+      assert runner.cleaned_data ==\
+        b'\xce\x93\xce\xb5\xce\xb9\xce\xac\xcf\x83\xce\xbf' \
+        b'\xcf\x85 \xce\x9a\xcf\x8c\xcf\x83\xce\xbc\xce\xb5'
+
+      runner = f.FilterRunner(f.MaxBytes(24), 'Γειάσου Κόσμε')
+      assert runner.is_valid() is False
+      assert runner.cleaned_data is None
+
+Rather than treating too-long values as invalid, you can configure the filter to
+truncate them:
+
 .. code-block:: python
 
    import filters as f
 
-   runner = f.FilterRunner(f.MaxBytes(24), 'Γειάσου Κόσμε')
-   # Value is too long, so ``is_valid()`` returns ``False``.
-   assert runner.is_valid() is False
-   # Note that the resulting value is truncated to 23 bytes instead of 24, so
-   # as not to orphan a multibyte sequence.
+   runner = f.FilterRunner(f.MaxBytes(22, truncate=True), 'हैलो वर्ल्ड')
+   # Truncated values are considered valid.
+   assert runner.is_valid() is True
    assert runner.cleaned_data ==\
-       b'\xce\x93\xce\xb5\xce\xb9\xce\xac\xcf\x83\xce\xbf' \
-       b'\xcf\x85 \xce\x9a\xcf\x8c\xcf\x83\xce\xbc'
+       b'\xe0\xa4\xb9\xe0\xa5\x88\xe0\xa4\xb2\xe0' \
+       b'\xa5\x8b \xe0\xa4\xb5\xe0\xa4\xb0\xe0\xa5\x8d'
 
-.. tip::
-
-   If you just want to validate the length of the input and don't need to
-   waste CPU cycles truncating too-long values, you can provide `truncate=False`
-   to the filter's initialiser:
+.. note::
+   When truncating with a multibyte encoding (e.g., UTF-8), the filter may
+   remove additional bytes as needed to avoid orphaned sequences:
 
    .. code-block:: python
 
       import filters as f
 
-      runner = f.FilterRunner(f.MaxBytes(24, truncate=False), 'Γειάσου Κόσμε')
-      assert runner.is_valid() is False
-      assert runner.cleaned_data is None
+      runner = f.FilterRunner(f.MaxBytes(21, truncate=True), 'हैलो वर्ल्ड')
+      assert runner.is_valid() is True
+      # Result is truncated to 19 bytes instead of 21, so as not to orphan a
+      # multibyte sequence.
+      assert len(runner.cleaned_data) == 19
+
+You can configure the filter to apply a prefix and/or suffix to the value when
+truncating:
+
+.. code-block:: python
+
+   import filters as f
+
+   # Apply a prefix to truncated values:
+   runner = f.FilterRunner(
+       f.MaxBytes(12, truncate=True, prefix='(more) '),
+       'Hello, world!'
+   )
+   assert runner.is_valid() is True
+   # The length of the prefix is taken into account, so that the result is still
+   # 12 bytes long.
+   assert runner.cleaned_data == b'(more) Hello'
+
+   # Apply a suffix to truncated values:
+   runner = f.FilterRunner(
+       f.MaxBytes(12, truncate=True, suffix='...'),
+       'Hello, world!',
+   )
+   assert runner.is_valid() is True
+   assert runner.cleaned_data == b'Hello, wo...'
+
+   # Apply both a prefix and a suffix:
+   runner = f.FilterRunner(
+       f.MaxBytes(12, truncate=True, prefix='->', suffix='<-'),
+       'Hello, world!',
+   )
+   assert runner.is_valid() is True
+   assert runner.cleaned_data == b'->Hello, w<-'
+
+By default, the filter uses UTF-8; if you need to use a different encoding, you
+can specify it when initialising the filter:
+
+.. code-block:: python
+
+   import filters as f
+
+   runner = f.FilterRunner(
+       f.MaxBytes(32, truncate=True, encoding='utf-16'),
+       'kia ora e te ao whānui',
+   )
+   assert runner.is_valid() is True
+   assert runner.cleaned_data ==\
+       b'\xff\xfek\x00i\x00a\x00 \x00o\x00r\x00a\x00' \
+       b' \x00e\x00 \x00t\x00e\x00 \x00a\x00o\x00'
+
+   # Prefix and suffix also work with alternate encodings.
+   runner = f.FilterRunner(
+       f.MaxBytes(40, truncate=True, prefix='[अधिक] ', suffix=' (अधिक)', encoding='utf-16'),
+       'मैं अपने आप से ऐसा क्यों करता हूं?',
+   )
+   assert runner.is_valid() is True
+   assert runner.cleaned_data == (
+       b"\xff\xfe"                           # BOM
+       b"[\x00\x05\t'\t?\t\x15\t]\x00 \x00"  # Prefix
+       b'.\tH\t\x02\t \x00\x05\t'            # Truncated string
+       b" \x00(\x00\x05\t'\t?\t\x15\t)\x00"  # Suffix
+   )
+   assert len(runner.cleaned_data) == 40
 
 MaxLength
 ---------

diff --git a/filters/string.py b/filters/string.py
@@ -231,8 +231,9 @@ class MaxBytes(BaseFilter):
     Ensures that an incoming string value is small enough to fit into a
     specified number of bytes when encoded.
 
-    Note:  The resulting value is a byte string, even if you provide a
-    unicode.
+    .. note::
+
+        The resulting value is always a byte string.
     """
     CODE_TOO_LONG = 'too_long'
 
@@ -245,25 +246,40 @@ class MaxBytes(BaseFilter):
     def __init__(
             self,
             max_bytes: int,
-            truncate: bool = True,
+            truncate: bool = False,
             prefix: str = '',
+            suffix: str = '',
             encoding: str = 'utf-8',
     ) -> None:
         """
         :param max_bytes:
             Max number of bytes to allow.
 
         :param truncate:
-            Whether to truncate values that are too long.
+            How to handle values that are too long:
 
-            Set this to ``False`` to save system resources when you
-            know that you will reject values that are too long.
+            - ``truncate is True``:  Return truncated string.
+            - ``truncate is False``:  Treat as invalid value.
 
         :param prefix:
             Prefix to apply to truncated values.
 
+            The prefix will count towards the number of bytes, so even with a
+            prefix the resulting string will not exceed ``max_bytes`` in
+            length.
+
+            Ignored when the incoming value is short enough, or when
+            ``truncate is False``.
+
+        :param suffix:
+            Suffix to apply to truncated values.
+
+            The suffix will count towards the number of bytes, so even with a
+            suffix the resulting string will not exceed ``max_bytes`` in
+            length.
+
             Ignored when the incoming value is short enough, or when
-            ``truncate`` is ``False``.
+            ``truncate is False``.
 
         :param encoding:
             The character encoding to check against.
@@ -275,6 +291,7 @@ def __init__(
         self.encoding = encoding
         self.max_bytes = max_bytes
         self.prefix = prefix
+        self.suffix = suffix
         self.truncate = truncate
 
     def __str__(self):
@@ -304,57 +321,72 @@ def _apply(self, value):
         if self._has_errors:
             return None
 
-        str_value = value.encode(self.encoding)
+        bytes_value = value.encode(self.encoding)
 
-        if len(str_value) > self.max_bytes:
-            replacement = (
-                self.truncate_string(
-                    # Ensure that we convert back to unicode before
-                    # adding the prefix, just in case `self.encoding`
-                    # indicates a codec that uses a BOM.
-                    value=self.prefix + value,
-
-                    max_bytes=self.max_bytes,
-                    encoding=self.encoding,
-                )
-                if self.truncate
-                else None
-            )
+        if self.truncate:
+            # Truncated values are considered valid.
+            return self.truncate_bytes(bytes_value)
 
+        if len(bytes_value) > self.max_bytes:
+            # Else, too-long values are invalid.
             return self._invalid_value(
                 value=value,
                 reason=self.CODE_TOO_LONG,
-                replacement=replacement,
-
                 context={
                     'encoding': self.encoding,
                     'max_bytes': self.max_bytes,
-                    'prefix': self.prefix,
                     'truncate': self.truncate,
                 },
             )
 
-        return str_value
+        return bytes_value
 
-    @staticmethod
-    def truncate_string(value: str, max_bytes: int, encoding: str) -> bytes:
+    def truncate_bytes(self, bytes_value: bytes) -> bytes:
         """
-        Truncates a string value to the specified number of bytes.
+        Truncates a too-long bytes value to the specified number of bytes,
+        using the filter's current configuration.
 
         :return:
             Returns bytes, truncated to the correct length.
 
-            Note: Might be a bit shorter than `max_bytes`, to avoid
+            Note: Might be a bit shorter than ``self.max_bytes``, to avoid
             orphaning a multibyte sequence.
         """
-        # Convert to bytearray so that we get the same handling in
-        # Python 2 and Python 3.
-        bytes_ = bytearray(value.encode(encoding))
+        if len(bytes_value) <= self.max_bytes:
+            return bytes_value
+
+        # Note that ``self.encoding`` may indicate a codec that
+        # uses a BOM, so we have to do a little extra work to make
+        # sure we don't insert extra BOMs in the resulting value.
+        bom = len(''.encode(self.encoding))
+
+        # Prefix can be prepended right away.
+        if self.prefix:
+            bytes_value = (
+                    bytes_value[0:bom] +
+                    self.prefix.encode(self.encoding)[bom:] +
+                    bytes_value[bom:]
+            )
+
+        # Suffix has to be tucked away for later (otherwise, the first thing
+        # we'd truncate would be the suffix!).
+        encoded_suffix = (
+            self.suffix.encode(self.encoding)[bom:]
+            if self.suffix
+            else b''
+        )
+
+        # Ensure we leave enough space for the suffix.
+        target_bytes = self.max_bytes - len(encoded_suffix)
+
+        # Edge case where ``self.max_bytes`` is so tiny that the suffix alone
+        # would fill (or overflow) it, leaving no room for any of the value.
+        if target_bytes < 1:
+            return b''
 
         # Truncating the value is a bit tricky, as we have to be
         # careful not to leave an unterminated multibyte sequence.
-
-        if encoding.lower() in ['utf-8', 'utf8']:
+        if self.encoding.lower() in {'utf-8', 'utf8'}:
             #
             # This code works a bit faster than the generic routine
             # (see below) because we only have to inspect up to 4
@@ -363,7 +395,7 @@ def truncate_string(value: str, max_bytes: int, encoding: str) -> bytes:
             #
             # But, it only works for UTF-8.
             #
-            truncated = bytes_[0:max_bytes]
+            truncated = bytes_value[0:target_bytes]
 
             # Walk backwards through the string until we hit certain
             # sequences.
@@ -399,34 +431,34 @@ def truncate_string(value: str, max_bytes: int, encoding: str) -> bytes:
                 # Else, we have a continuation byte.  Continue walking
                 # backwards through the string.
 
-            return truncated
+            return truncated + encoded_suffix
 
         else:
             trim = 0
             while True:
                 # Progressively chop bytes off the end of the string
                 # until we have something that can be successfully
                 # decoded using the specified encoding.
-                truncated = bytes_[0:max_bytes - trim]
+                truncated = bytes_value[0:target_bytes - trim]
 
                 try:
-                    truncated.decode(encoding)
+                    truncated.decode(self.encoding)
                 except UnicodeDecodeError:
                     trim += 1
                 else:
-                    return bytes(truncated)
+                    return bytes(truncated) + encoded_suffix
 
                 # We should never get here, but just in case, we need
                 # to ensure the loop eventually terminates (Python
                 # won't error if ``max_bytes - trim`` goes negative,
                 # since the slice operator accepts negative values).
-                if trim >= max_bytes:
+                if trim >= target_bytes:
                     raise ValueError(
-                        'Unable to truncate {bytes_!r} to {max_bytes} '
+                        'Unable to truncate {bytes_value!r} to {target_bytes} '
                         'bytes when encoded using {encoding}.'.format(
-                            bytes_=bytes_,
-                            max_bytes=max_bytes,
-                            encoding=encoding,
+                            bytes_value=bytes_value,
+                            target_bytes=target_bytes,
+                            encoding=self.encoding,
                         ),
                     )