Skip to content

Commit 75f5d9c

Browse files
feat: Use a static list for byte-unicode and unicode-byte conversion
Using this static list is much faster than decoding the UTF-8 every time a character is encountered.
1 parent 258a90b commit 75f5d9c

File tree

5 files changed

+520
-21
lines changed

5 files changed

+520
-21
lines changed

examples/composer.json

+1-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
"require": {
1515
"php": "^8.1",
1616
"symfony/console": "^7.0",
17-
"codewithkyrian/transformers": "*",
18-
"rokka/imagine-vips": "dev-master"
17+
"codewithkyrian/transformers": "*"
1918
},
2019
"require-dev": {
2120
"symfony/var-dumper": "^7.0"

src/Decoders/ByteLevelDecoder.php

+259-9
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,264 @@
1111

1212
class ByteLevelDecoder extends Decoder
1313
{
14-
protected array $byteDecoder = [];
15-
16-
public function __construct(array $config)
17-
{
18-
parent::__construct($config);
19-
20-
$this->byteDecoder = Tokenizer::unicodeToBytes();
21-
}
14+
protected const UNICODE_TO_BYTES = [
15+
'Ā' => 0,
16+
'ā' => 1,
17+
'Ă' => 2,
18+
'ă' => 3,
19+
'Ą' => 4,
20+
'ą' => 5,
21+
'Ć' => 6,
22+
'ć' => 7,
23+
'Ĉ' => 8,
24+
'ĉ' => 9,
25+
'Ċ' => 10,
26+
'ċ' => 11,
27+
'Č' => 12,
28+
'č' => 13,
29+
'Ď' => 14,
30+
'ď' => 15,
31+
'Đ' => 16,
32+
'đ' => 17,
33+
'Ē' => 18,
34+
'ē' => 19,
35+
'Ĕ' => 20,
36+
'ĕ' => 21,
37+
'Ė' => 22,
38+
'ė' => 23,
39+
'Ę' => 24,
40+
'ę' => 25,
41+
'Ě' => 26,
42+
'ě' => 27,
43+
'Ĝ' => 28,
44+
'ĝ' => 29,
45+
'Ğ' => 30,
46+
'ğ' => 31,
47+
'Ġ' => 32,
48+
'!' => 33,
49+
'"' => 34,
50+
'#' => 35,
51+
'$' => 36,
52+
'%' => 37,
53+
'&' => 38,
54+
'\'' => 39,
55+
'(' => 40,
56+
')' => 41,
57+
'*' => 42,
58+
'+' => 43,
59+
',' => 44,
60+
'-' => 45,
61+
'.' => 46,
62+
'/' => 47,
63+
'0' => 48,
64+
'1' => 49,
65+
'2' => 50,
66+
'3' => 51,
67+
'4' => 52,
68+
'5' => 53,
69+
'6' => 54,
70+
'7' => 55,
71+
'8' => 56,
72+
'9' => 57,
73+
':' => 58,
74+
';' => 59,
75+
'<' => 60,
76+
'=' => 61,
77+
'>' => 62,
78+
'?' => 63,
79+
'@' => 64,
80+
'A' => 65,
81+
'B' => 66,
82+
'C' => 67,
83+
'D' => 68,
84+
'E' => 69,
85+
'F' => 70,
86+
'G' => 71,
87+
'H' => 72,
88+
'I' => 73,
89+
'J' => 74,
90+
'K' => 75,
91+
'L' => 76,
92+
'M' => 77,
93+
'N' => 78,
94+
'O' => 79,
95+
'P' => 80,
96+
'Q' => 81,
97+
'R' => 82,
98+
'S' => 83,
99+
'T' => 84,
100+
'U' => 85,
101+
'V' => 86,
102+
'W' => 87,
103+
'X' => 88,
104+
'Y' => 89,
105+
'Z' => 90,
106+
'[' => 91,
107+
'\\' => 92,
108+
']' => 93,
109+
'^' => 94,
110+
'_' => 95,
111+
'`' => 96,
112+
'a' => 97,
113+
'b' => 98,
114+
'c' => 99,
115+
'd' => 100,
116+
'e' => 101,
117+
'f' => 102,
118+
'g' => 103,
119+
'h' => 104,
120+
'i' => 105,
121+
'j' => 106,
122+
'k' => 107,
123+
'l' => 108,
124+
'm' => 109,
125+
'n' => 110,
126+
'o' => 111,
127+
'p' => 112,
128+
'q' => 113,
129+
'r' => 114,
130+
's' => 115,
131+
't' => 116,
132+
'u' => 117,
133+
'v' => 118,
134+
'w' => 119,
135+
'x' => 120,
136+
'y' => 121,
137+
'z' => 122,
138+
'{' => 123,
139+
'|' => 124,
140+
'}' => 125,
141+
'~' => 126,
142+
'ġ' => 127,
143+
'Ģ' => 128,
144+
'ģ' => 129,
145+
'Ĥ' => 130,
146+
'ĥ' => 131,
147+
'Ħ' => 132,
148+
'ħ' => 133,
149+
'Ĩ' => 134,
150+
'ĩ' => 135,
151+
'Ī' => 136,
152+
'ī' => 137,
153+
'Ĭ' => 138,
154+
'ĭ' => 139,
155+
'Į' => 140,
156+
'į' => 141,
157+
'İ' => 142,
158+
'ı' => 143,
159+
'Ĳ' => 144,
160+
'ĳ' => 145,
161+
'Ĵ' => 146,
162+
'ĵ' => 147,
163+
'Ķ' => 148,
164+
'ķ' => 149,
165+
'ĸ' => 150,
166+
'Ĺ' => 151,
167+
'ĺ' => 152,
168+
'Ļ' => 153,
169+
'ļ' => 154,
170+
'Ľ' => 155,
171+
'ľ' => 156,
172+
'Ŀ' => 157,
173+
'ŀ' => 158,
174+
'Ł' => 159,
175+
'ł' => 160,
176+
'¡' => 161,
177+
'¢' => 162,
178+
'£' => 163,
179+
'¤' => 164,
180+
'¥' => 165,
181+
'¦' => 166,
182+
'§' => 167,
183+
'¨' => 168,
184+
'©' => 169,
185+
'ª' => 170,
186+
'«' => 171,
187+
'¬' => 172,
188+
'Ń' => 173,
189+
'®' => 174,
190+
'¯' => 175,
191+
'°' => 176,
192+
'±' => 177,
193+
'²' => 178,
194+
'³' => 179,
195+
'´' => 180,
196+
'µ' => 181,
197+
'¶' => 182,
198+
'·' => 183,
199+
'¸' => 184,
200+
'¹' => 185,
201+
'º' => 186,
202+
'»' => 187,
203+
'¼' => 188,
204+
'½' => 189,
205+
'¾' => 190,
206+
'¿' => 191,
207+
'À' => 192,
208+
'Á' => 193,
209+
'Â' => 194,
210+
'Ã' => 195,
211+
'Ä' => 196,
212+
'Å' => 197,
213+
'Æ' => 198,
214+
'Ç' => 199,
215+
'È' => 200,
216+
'É' => 201,
217+
'Ê' => 202,
218+
'Ë' => 203,
219+
'Ì' => 204,
220+
'Í' => 205,
221+
'Î' => 206,
222+
'Ï' => 207,
223+
'Ð' => 208,
224+
'Ñ' => 209,
225+
'Ò' => 210,
226+
'Ó' => 211,
227+
'Ô' => 212,
228+
'Õ' => 213,
229+
'Ö' => 214,
230+
'×' => 215,
231+
'Ø' => 216,
232+
'Ù' => 217,
233+
'Ú' => 218,
234+
'Û' => 219,
235+
'Ü' => 220,
236+
'Ý' => 221,
237+
'Þ' => 222,
238+
'ß' => 223,
239+
'à' => 224,
240+
'á' => 225,
241+
'â' => 226,
242+
'ã' => 227,
243+
'ä' => 228,
244+
'å' => 229,
245+
'æ' => 230,
246+
'ç' => 231,
247+
'è' => 232,
248+
'é' => 233,
249+
'ê' => 234,
250+
'ë' => 235,
251+
'ì' => 236,
252+
'í' => 237,
253+
'î' => 238,
254+
'ï' => 239,
255+
'ð' => 240,
256+
'ñ' => 241,
257+
'ò' => 242,
258+
'ó' => 243,
259+
'ô' => 244,
260+
'õ' => 245,
261+
'ö' => 246,
262+
'÷' => 247,
263+
'ø' => 248,
264+
'ù' => 249,
265+
'ú' => 250,
266+
'û' => 251,
267+
'ü' => 252,
268+
'ý' => 253,
269+
'þ' => 254,
270+
'ÿ' => 255,
271+
];
22272

23273

24274
/**
@@ -33,7 +283,7 @@ public function convertTokensToString(array $tokens): string
33283

34284
$textArray = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
35285

36-
$byteArray = array_map(fn($x) => $this->byteDecoder[$x], $textArray);
286+
$byteArray = array_map(fn($x) => self::UNICODE_TO_BYTES[$x], $textArray);
37287

38288
$binaryString = pack('C*', ...$byteArray);
39289

0 commit comments

Comments
 (0)