Skip to content

Commit

Permalink
fix: correctly handle multibyte strings in Precompiled Normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
CodeWithKyrian committed Sep 14, 2024
1 parent 6ec3e3e commit bee47e0
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
14 changes: 7 additions & 7 deletions src/Normalizers/Precompiled.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,16 @@ private function parsePrecompiledCharsmap(string $charsMap): void
$trieSize = $data[1];

$this->trie = new CharTrie();
$this->normalized = substr($charsMap, 4 + $trieSize);
$this->normalized = mb_substr($charsMap, 4 + $trieSize);

$offset = 0;
while ($offset < strlen($this->normalized)) {
$end = strpos($this->normalized, "\0", $offset);
while ($offset < mb_strlen($this->normalized)) {
$end = mb_strpos($this->normalized, "\0", $offset);
if ($end === false) {
break;
}
$replacement = substr($this->normalized, $offset, $end - $offset);
$this->trie->push(chr($offset) . $replacement);
$replacement = mb_substr($this->normalized, $offset, $end - $offset);
$this->trie->push(mb_chr($offset) . $replacement);
$offset = $end + 1;
}
}
Expand Down Expand Up @@ -101,7 +101,7 @@ private function transform(string $chunk): ?string
return null;
}

return substr($longestMatch, 1);
return mb_substr($longestMatch, 1);
}

/**
Expand All @@ -115,7 +115,7 @@ private function findLongestMatch(Generator $results): ?string
{
$longestMatch = null;
foreach ($results as $result) {
if ($longestMatch === null || strlen($result) > strlen($longestMatch)) {
if ($longestMatch === null || mb_strlen($result) > mb_strlen($longestMatch)) {
$longestMatch = $result;
}
}
Expand Down
5 changes: 2 additions & 3 deletions src/PreTokenizers/WhitespaceSplit.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ public function __construct(protected array $config)

/**
 * Breaks the given text into whitespace-delimited pieces.
 *
 * NOTE(review): this listing looks like a rendered diff in which the removed
 * and the added lines are shown together without +/- markers. Read literally,
 * the preg_match_all() path returns first and the preg_split() statement
 * after it is never reached - confirm which variant the real file contains.
 *
 * @param string|array $text    Text to split. Both regex calls below are given
 *                              $text as their subject; TODO confirm how an
 *                              array argument is meant to be handled.
 * @param array        $options Not read anywhere in the visible body.
 *
 * @return array The pieces extracted from $text.
 */
public function preTokenizeText(string|array $text, array $options): array
{
// Collect every maximal run of non-whitespace characters into $matches.
preg_match_all('/\S+/', $text, $matches);

// $matches[0] holds the full-pattern matches; fall back to an empty list when it is not set.
return $matches[0] ?? [];
// $words = preg_split('/\s+/', $text, flags: PREG_SPLIT_NO_EMPTY);
// Split on runs of whitespace and/or U+FFFD in UTF-8 mode (/u), discarding empty pieces.
return preg_split('/[\s\x{FFFD}]+/u', $text, flags: PREG_SPLIT_NO_EMPTY);
}
}

0 comments on commit bee47e0

Please sign in to comment.