Skip to content

Commit 7e880dd

Browse files
feat: Correctly parse and use the Precompiled Normalizer
1 parent ca6fc3b commit 7e880dd

File tree

1 file changed

+93
-34
lines changed

1 file changed

+93
-34
lines changed

src/Normalizers/Precompiled.php

+93-34
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,53 @@
22

33
declare(strict_types=1);
44

5-
65
namespace Codewithkyrian\Transformers\Normalizers;
76

7+
use Codewithkyrian\Transformers\DataStructures\CharTrie;
8+
use Generator;
9+
810
class Precompiled extends Normalizer
{
    /**
     * Bit set in a trie unit when the node has a leaf (value) child.
     */
    private const HAS_LEAF_BIT = 1 << 8;

    /**
     * Bit set in a trie unit when its offset is stored shifted by 8 bits.
     */
    private const EXTENDED_OFFSET_BIT = 1 << 9;

    /**
     * Mask isolating the label of a unit (low byte plus the "is value" top bit).
     */
    private const LABEL_MASK = 0x800000FF;

    /**
     * Mask isolating the 31-bit value stored in a leaf unit.
     */
    private const VALUE_MASK = 0x7FFFFFFF;

    /**
     * Blob of NUL-terminated replacement strings. Trie values are byte
     * offsets into this blob.
     */
    private string $normalized = '';

    /**
     * Units of the serialized double-array trie, decoded as little-endian
     * unsigned 32-bit integers. Empty when the charsmap is empty, in which
     * case the normalizer is the identity.
     *
     * @var list<int>
     */
    private array $trie = [];

    /**
     * @param array $config Normalizer configuration. `precompiled_charsmap`
     *                      is expected to hold the base64-encoded
     *                      SentencePiece precompiled charsmap; a missing,
     *                      null or empty value yields an identity normalizer.
     *
     * @throws \InvalidArgumentException If the charsmap is not valid base64
     *                                   or is structurally malformed.
     */
    public function __construct(array $config)
    {
        parent::__construct($config);

        $encoded = $config['precompiled_charsmap'] ?? '';
        if (!is_string($encoded)) {
            throw new \InvalidArgumentException('The "precompiled_charsmap" option must be a base64-encoded string.');
        }

        // Strict mode rejects characters outside the base64 alphabet instead
        // of silently dropping them and decoding garbage.
        $decoded = base64_decode($encoded, true);
        if ($decoded === false) {
            throw new \InvalidArgumentException('The "precompiled_charsmap" option is not valid base64.');
        }

        $this->parsePrecompiledCharsmap($decoded);
    }

    /**
     * Parses the precompiled charsmap.
     *
     * Layout of the binary blob:
     *   - 4 bytes: little-endian uint32, byte size of the trie section
     *   - N bytes: double-array trie, a sequence of little-endian uint32 units
     *   - rest:    NUL-terminated replacement strings
     *
     * @param string $charsMap The precompiled charsmap (raw bytes).
     *
     * @throws \InvalidArgumentException If the blob is truncated or the
     *                                   declared trie size is inconsistent.
     */
    private function parsePrecompiledCharsmap(string $charsMap): void
    {
        $this->trie = [];
        $this->normalized = '';

        // An empty charsmap means "no normalization rules".
        if ($charsMap === '') {
            return;
        }

        $totalBytes = strlen($charsMap);
        if ($totalBytes < 4) {
            throw new \InvalidArgumentException('Precompiled charsmap is truncated: missing trie size header.');
        }

        $trieSize = unpack('V', $charsMap)[1];
        if ($trieSize < 0 || $trieSize % 4 !== 0 || $trieSize > $totalBytes - 4) {
            throw new \InvalidArgumentException('Precompiled charsmap declares an invalid trie size.');
        }

        if ($trieSize > 0) {
            // unpack() returns a 1-indexed array; re-index so that unit
            // positions match the node positions used by the trie walk.
            $this->trie = array_values(unpack('V*', substr($charsMap, 4, $trieSize)));
        }

        $this->normalized = substr($charsMap, 4 + $trieSize);
    }

    /**
     * Normalizes the given text using the precompiled charsmap.
     *
     * The text is processed one extended grapheme cluster at a time. A short
     * cluster (fewer than 6 bytes) is first looked up as a whole; if that
     * yields no replacement, or the cluster is longer, each code point of the
     * cluster is looked up individually and kept as-is when unmapped.
     *
     * @param string $text The text to normalize.
     *
     * @return string The normalized text. Text that is not valid UTF-8 is
     *                returned unchanged.
     */
    public function normalize(string $text): string
    {
        if ($text === '' || $this->trie === []) {
            return $text;
        }

        // \X matches one extended grapheme cluster. A false/zero result here
        // means the input is not valid UTF-8; leave it untouched rather than
        // silently dropping it.
        if (!preg_match_all('/\X/u', $text, $matches)) {
            return $text;
        }

        $normalized = '';

        foreach ($matches[0] as $grapheme) {
            // Byte length, not character count: mirrors the reference
            // implementation's threshold for whole-cluster lookups.
            if (strlen($grapheme) < 6) {
                $replacement = $this->transform($grapheme);
                if ($replacement !== null) {
                    $normalized .= $replacement;
                    continue;
                }
            }

            $chars = preg_split('//u', $grapheme, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($chars as $char) {
                $normalized .= $this->transform($char) ?? $char;
            }
        }

        return $normalized;
    }

    /**
     * Transforms the given chunk by finding the longest match in the trie.
     *
     * @param string $chunk The chunk to transform.
     *
     * @return string|null The transformed chunk or null if no match is found.
     *                     An empty string is a valid replacement (the chunk
     *                     is removed).
     */
    private function transform(string $chunk): ?string
    {
        $start = $this->findLongestMatch($this->commonPrefixSearch($chunk));

        if ($start === null || $start >= strlen($this->normalized)) {
            return null;
        }

        // Replacements are NUL-terminated; tolerate a missing terminator on
        // the very last entry.
        $end = strpos($this->normalized, "\0", $start);

        return $end === false
            ? substr($this->normalized, $start)
            : substr($this->normalized, $start, $end - $start);
    }

    /**
     * Finds the longest match in the given results.
     *
     * Results are produced in order of increasing prefix length, so the last
     * one belongs to the longest matching prefix.
     *
     * @param Generator<int> $results Offsets yielded by the prefix search.
     *
     * @return int|null Offset of the longest match's replacement in the
     *                  normalized blob, or null if no match is found.
     */
    private function findLongestMatch(Generator $results): ?int
    {
        $longestMatch = null;
        foreach ($results as $result) {
            $longestMatch = $result;
        }

        return $longestMatch;
    }

    /**
     * Walks the double-array trie along the bytes of the key and yields the
     * value of every prefix of the key that is stored in the trie.
     *
     * @param string $key The bytes to look up.
     *
     * @return Generator<int> Offsets into the normalized blob, shortest
     *                        matching prefix first.
     */
    private function commonPrefixSearch(string $key): Generator
    {
        $unitCount = count($this->trie);
        if ($unitCount === 0) {
            return;
        }

        $nodePos = self::unitOffset($this->trie[0]);
        $keyLength = strlen($key);

        for ($i = 0; $i < $keyLength; $i++) {
            $byte = ord($key[$i]);
            if ($byte === 0) {
                return;
            }

            $nodePos ^= $byte;
            if ($nodePos >= $unitCount) {
                return;
            }

            $unit = $this->trie[$nodePos];
            // A label mismatch means this slot belongs to another branch.
            if (($unit & self::LABEL_MASK) !== $byte) {
                return;
            }

            $nodePos ^= self::unitOffset($unit);

            if (($unit & self::HAS_LEAF_BIT) !== 0) {
                if ($nodePos >= $unitCount) {
                    return;
                }

                yield $this->trie[$nodePos] & self::VALUE_MASK;
            }
        }
    }

    /**
     * Extracts the child offset encoded in a trie unit.
     *
     * The offset lives in the bits above bit 10 and is shifted left by a
     * further 8 bits when the extended-offset flag is set.
     */
    private static function unitOffset(int $unit): int
    {
        $shift = ($unit & self::EXTENDED_OFFSET_BIT) >> 6;

        return ($unit >> 10) << $shift;
    }
}

0 commit comments

Comments
 (0)