2
2
3
3
declare (strict_types=1 );
4
4
5
-
6
5
namespace Codewithkyrian \Transformers \Normalizers ;
7
6
7
+ use Codewithkyrian \Transformers \DataStructures \CharTrie ;
8
+ use Generator ;
9
+
8
10
/**
 * Normalizer driven by a SentencePiece-style "precompiled charsmap".
 *
 * The base64-encoded charsmap from the tokenizer config is decoded once in the
 * constructor; normalize() then rewrites the text piece by piece using the
 * entries stored in the trie.
 */
class Precompiled extends Normalizer
{
    /**
     * Replacement blob of the charsmap: everything that follows the 4-byte
     * size header and the trie blob. It is scanned as a sequence of
     * NUL-terminated strings.
     */
    private string $normalized;

    /**
     * Trie for fast prefix search.
     */
    private CharTrie $trie;

    /**
     * @param array $config Normalizer configuration; must contain the
     *                      base64-encoded `precompiled_charsmap` entry.
     *
     * @throws \InvalidArgumentException If the decoded charsmap is too short to hold its header.
     */
    public function __construct(array $config)
    {
        parent::__construct($config);

        $this->parsePrecompiledCharsmap(base64_decode($config['precompiled_charsmap']));
    }

    /**
     * Parses the precompiled charsmap.
     *
     * Layout as read here: a 32-bit little-endian unsigned integer holding the
     * size of the trie blob, the trie blob itself (skipped, not decoded), and
     * finally the NUL-terminated replacement strings.
     *
     * @param string $charsMap The precompiled charsmap (raw bytes).
     *
     * @throws \InvalidArgumentException If the charsmap is shorter than its 4-byte header.
     */
    private function parsePrecompiledCharsmap(string $charsMap): void
    {
        // unpack() only warns and returns false on short input, which would
        // silently produce an empty mapping; fail loudly instead.
        if (strlen($charsMap) < 4) {
            throw new \InvalidArgumentException(
                'Precompiled charsmap is too short: expected at least a 4-byte size header.'
            );
        }

        $data = unpack('V', $charsMap, 0);
        $trieSize = $data[1];

        $this->trie = new CharTrie();
        $this->normalized = substr($charsMap, 4 + $trieSize);

        $length = strlen($this->normalized);
        $offset = 0;
        while ($offset < $length) {
            $end = strpos($this->normalized, "\0", $offset);
            if ($end === false) {
                // Trailing bytes without a NUL terminator are ignored.
                break;
            }

            $replacement = substr($this->normalized, $offset, $end - $offset);

            // NOTE(review): chr() wraps modulo 256, so offsets >= 256 share a
            // prefix byte with smaller offsets, and the trie blob that maps
            // source sequences to these offsets is never decoded. Confirm the
            // intended key format before relying on these entries.
            $this->trie->push(chr($offset) . $replacement);

            $offset = $end + 1;
        }
    }

    /**
     * Normalizes the given text using the precompiled charsmap.
     *
     * The text is walked one extended grapheme cluster at a time. A cluster
     * shorter than 6 bytes is first looked up as a whole; if that fails (or the
     * cluster is longer), each of its code points is looked up individually and
     * kept unchanged when no entry matches.
     *
     * @param string $text The text to normalize.
     *
     * @return string The normalized text, or the input unchanged when it is
     *                not valid UTF-8 and therefore cannot be split.
     */
    public function normalize(string $text): string
    {
        // \X matches one extended grapheme cluster. Splitting on '//u' would
        // yield single code points, which makes the whole-cluster lookup below
        // pointless for multi-code-point clusters.
        if (preg_match_all('/\X/u', $text, $matches) === false) {
            // Malformed UTF-8: pass the text through rather than dropping it.
            return $text;
        }

        $normalized = '';

        foreach ($matches[0] as $grapheme) {
            // Byte length (not character count), as in the reference implementation.
            if (strlen($grapheme) < 6) {
                $norm = $this->transform($grapheme);
                if ($norm !== null) {
                    $normalized .= $norm;
                    continue;
                }
            }

            foreach (preg_split('//u', $grapheme, -1, PREG_SPLIT_NO_EMPTY) as $char) {
                $normalized .= $this->transform($char) ?? $char;
            }
        }

        return $normalized;
    }

    /**
     * Transforms the given chunk by finding the longest match in the trie.
     *
     * The first byte of the matched entry is stripped before it is returned;
     * parsePrecompiledCharsmap() stores every entry with a one-byte prefix.
     *
     * @param string $chunk The chunk to transform.
     *
     * @return string|null The transformed chunk or null if no match is found.
     */
    private function transform(string $chunk): ?string
    {
        $results = $this->trie->commonPrefixSearch($chunk);
        $longestMatch = $this->findLongestMatch($results);

        if ($longestMatch === null) {
            return null;
        }

        return substr($longestMatch, 1);
    }

    /**
     * Finds the longest match in the given results.
     *
     * Length is measured in bytes; on ties the first result wins.
     *
     * @param Generator $results The results to find the longest match in.
     *
     * @return string|null The longest match or null if no match is found.
     */
    private function findLongestMatch(Generator $results): ?string
    {
        $longestMatch = null;

        foreach ($results as $result) {
            if ($longestMatch === null || strlen($result) > strlen($longestMatch)) {
                $longestMatch = $result;
            }
        }

        return $longestMatch;
    }
}
0 commit comments