🔧 improve the detector general reliability (#532)

Issues (#520) (#509) (#498) (#407)
jawah · Sep 25, 2024 · 39b6f5c · 39b6f5c
1 parent 0d694f0
commit 39b6f5c
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,13 +2,11 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-03-??)
+## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)
 
 ### Fixed
 - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
-
-### Changed
-- Optional mypyc compilation upgraded to version 1.9.0 for Python >= 3.8
+- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)
 
 ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
 

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
@@ -221,16 +221,20 @@ def from_bytes(
         try:
             if is_too_large_sequence and is_multi_byte_decoder is False:
                 str(
-                    sequences[: int(50e4)]
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) : int(50e4)],
+                    (
+                        sequences[: int(50e4)]
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
                     encoding=encoding_iana,
                 )
             else:
                 decoded_payload = str(
-                    sequences
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) :],
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
                     encoding=encoding_iana,
                 )
         except (UnicodeDecodeError, LookupError) as e:

diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py
@@ -544,6 +544,8 @@
     "|",
     '"',
     "-",
+    "(",
+    ")",
 }
 
 

diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
@@ -236,7 +236,7 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count <= 24:
+        if self._character_count <= 13:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
@@ -260,6 +260,7 @@ def __init__(self) -> None:
 
         self._buffer: str = ""
         self._buffer_accent_count: int = 0
+        self._buffer_glyph_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
@@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
                 and is_thai(character) is False
             ):
                 self._foreign_long_watch = True
+            if (
+                is_cjk(character)
+                or is_hangul(character)
+                or is_katakana(character)
+                or is_hiragana(character)
+                or is_thai(character)
+            ):
+                self._buffer_glyph_count += 1
             return
         if not self._buffer:
             return
@@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
             self._character_count += buffer_length
 
             if buffer_length >= 4:
-                if self._buffer_accent_count / buffer_length > 0.34:
+                if self._buffer_accent_count / buffer_length >= 0.5:
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if (
+                elif (
                     is_accentuated(self._buffer[-1])
                     and self._buffer[-1].isupper()
                     and all(_.isupper() for _ in self._buffer) is False
                 ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
+                elif self._buffer_glyph_count == 1:
+                    self._is_current_word_bad = True
+                    self._foreign_long_count += 1
             if buffer_length >= 24 and self._foreign_long_watch:
                 camel_case_dst = [
                     i
@@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
             self._foreign_long_watch = False
             self._buffer = ""
             self._buffer_accent_count = 0
+            self._buffer_glyph_count = 0
         elif (
             character not in {"<", ">", "-", "=", "~", "|", "_"}
             and character.isdigit() is False

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.3.2"
+__version__ = "3.3.3"
 VERSION = __version__.split(".")