CodeWithKyrian
diff --git a/‎examples/bootstrap.php
+3-2 b/‎examples/bootstrap.php
+3-2
diff --git a/‎examples/pipelines/asr.php
+7-4 b/‎examples/pipelines/asr.php
+7-4
diff --git a/‎examples/pipelines/text-classification.php
+3-7 b/‎examples/pipelines/text-classification.php
+3-7
diff --git a/‎examples/pipelines/text-generation.php
+2-1 b/‎examples/pipelines/text-generation.php
+2-1
diff --git a/‎examples/pipelines/text2text-generation.php
+3-3 b/‎examples/pipelines/text2text-generation.php
+3-3
diff --git a/‎examples/pipelines/token-classification.php
+2-2 b/‎examples/pipelines/token-classification.php
+2-2
diff --git a/‎src/Decoders/ByteFallback.php
+19-8 b/‎src/Decoders/ByteFallback.php
+19-8
diff --git a/‎src/Decoders/ByteLevelDecoder.php
+4-7 b/‎src/Decoders/ByteLevelDecoder.php
+4-7
diff --git a/‎src/Decoders/CTCDecoder.php
+3-3 b/‎src/Decoders/CTCDecoder.php
+3-3
diff --git a/‎src/Decoders/ReplaceDecoder.php
+1-2 b/‎src/Decoders/ReplaceDecoder.php
+1-2
diff --git a/‎src/Decoders/StripDecoder.php
+6-5 b/‎src/Decoders/StripDecoder.php
+6-5
diff --git a/‎src/Decoders/VitsDecoder.php
+20 b/‎src/Decoders/VitsDecoder.php
+20
diff --git a/‎src/Decoders/WordPieceDecoder.php
+3-3 b/‎src/Decoders/WordPieceDecoder.php
+3-3
diff --git a/‎src/Generation/Samplers/Sampler.php
+1-11 b/‎src/Generation/Samplers/Sampler.php
+1-11
diff --git a/‎src/Generation/Streamers/TextStreamer.php
+7-4 b/‎src/Generation/Streamers/TextStreamer.php
+7-4
@@ -7,5 +7,6 @@
 
 require_once './vendor/autoload.php';
 
-Transformers::setup()->setImageDriver(ImageDriver::VIPS);
-
+Transformers::setup()
+    ->setCacheDir('/Users/Kyrian/.transformers')
+    ->setImageDriver(ImageDriver::VIPS);
@@ -22,14 +22,17 @@
 //$audioUrl = __DIR__ . '/../sounds/taunt.wav';
 //$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
 //$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
-//$audioUrl = __DIR__ . '/../sounds/ted_60.wav';
-$audioUrl = __DIR__ . '/../sounds/sample-1.mp3';
+$audioUrl = __DIR__ . '/../sounds/ted_60.wav';
+//$audioUrl = __DIR__ . '/../sounds/sample-1.mp3';
 
+$streamer = WhisperTextStreamer::make()
+//->onTimestampStart(fn($timestamp) => dump($timestamp));
+->onStream(fn($text) => print($text));
 
 $output = $transcriber($audioUrl,
     maxNewTokens: 256,
     chunkLengthSecs: 24,
-//    returnTimestamps: 'word',
+    streamer: $streamer,
 );
 
-dd($output, timeUsage(), memoryUsage());
+dd($output, timeUsage(), memoryUsage());
@@ -7,17 +7,13 @@
 require_once './bootstrap.php';
 
 
-//$classifier = pipeline('text-classification', 'Xenova/toxic-bert');
+$classifier = pipeline('text-classification', 'Xenova/toxic-bert');
 //
 //$result = $classifier("I hate you! You gave me life but in misery", topK: -1);
 
 
-$classifier = pipeline('text-classification', 'Xenova/distilbert-base-uncased-mnli');
-
-$result = $classifier('I love you!, You frustrated my life');
+// $classifier = pipeline('text-classification', 'Xenova/distilbert-base-uncased-mnli');
 
+$result = $classifier('I want to beat him to pulp', topK: -1);
 
 dd($result);
-
-
-
@@ -14,7 +14,8 @@
 
 //$generator = pipeline('text-generation', 'Xenova/gpt2');
 //$generator = pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat');
-$generator = pipeline('text-generation', 'Xenova/TinyLlama-1.1B-Chat-v1.0');
+//$generator = pipeline('text-generation', 'Xenova/TinyLlama-1.1B-Chat-v1.0');
+$generator = pipeline('text-generation', 'onnx-community/Llama-3.2-1B-Instruct', modelFilename: 'model_q4');
 
 $streamer = TextStreamer::make()->shouldSkipPrompt();
 
 
@@ -9,8 +9,8 @@
 
 ini_set('memory_limit', -1);
 
-$generator = pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M');
-//$generator = pipeline('text2text-generation', 'Xenova/flan-t5-small', quantized: true);
+//$generator = pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M');
+$generator = pipeline('text2text-generation', 'Xenova/flan-t5-small', quantized: true);
 
 $streamer = TextStreamer::make();
 
@@ -22,4 +22,4 @@
 $output = $generator($query, streamer: $streamer, maxNewTokens: 256, doSample: true, repetitionPenalty: 1.1, temperature: 0.7);
 
 //dd($output);
-dd('Done', timeUsage(), memoryUsage());
+dd('Done', timeUsage(), memoryUsage());
@@ -11,8 +11,8 @@
 
 ini_set('memory_limit', -1);
 
-//$classifier = pipeline('token-classification', 'Xenova/bert-base-NER');
-$classifier = pipeline('token-classification', 'codewithkyrian/bert-english-uncased-finetuned-pos');
+ $classifier = pipeline('token-classification', 'Xenova/bert-base-NER');
+//$classifier = pipeline('token-classification', 'codewithkyrian/bert-english-uncased-finetuned-pos');
 
 $output = $classifier(
     'My name is Kyrian and I live in Nigeria',
 
@@ -18,31 +18,42 @@ public function __construct(array $config)
 
     protected function decodeChain(array $tokens): array
     {
-        $newTokens = [];
         $previousByteTokens = [];
+        $newTokens = [];
 
         foreach ($tokens as $token) {
             $bytes = null;
+
+            // Check if the token is of the form <0xXX>
             if (strlen($token) === 6 && str_starts_with($token, '<0x') && str_ends_with($token, '>')) {
+                // Extract the hexadecimal value from the token
                 $byte = hexdec(substr($token, 3, 2));
                 if (!is_nan($byte)) {
                     $bytes = $byte;
                 }
             }
+
             if ($bytes !== null) {
+                // Add byte to previousByteTokens
                 $previousByteTokens[] = $bytes;
             } else {
-                if (count($previousByteTokens) > 0) {
-                    $string = $this->bytesToString($previousByteTokens);
-                    $newTokens[] = $string;
-                    $previousByteTokens = [];
+                // If we have accumulated byte tokens, decode them to a string
+                if (!empty($previousByteTokens)) {
+                    $string = pack('C*', ...$previousByteTokens);  // Convert bytes back to string
+                    $newTokens[] = $string;  // Add decoded string to newTokens
+                    $previousByteTokens = [];  // Reset byte accumulator
                 }
+                // Add the non-byte token to newTokens
                 $newTokens[] = $token;
             }
         }
-        if (count($previousByteTokens) > 0) {
-            $string = $this->bytesToString($previousByteTokens);
+
+
+        // After the loop, if there are still byte tokens, decode them
+        if (!empty($previousByteTokens)) {
+            $string = pack('C*', ...$previousByteTokens);  // Convert remaining bytes to string
             $newTokens[] = $string;
+            $previousByteTokens = [];  // Reset byte accumulator
         }
 
         return $newTokens;
@@ -59,4 +70,4 @@ protected function bytesToString(array $bytes): string
         $binaryString = pack('C*', ...$bytes);
         return mb_convert_encoding($binaryString, 'ISO-8859-1');
     }
-}
+}
@@ -6,7 +6,7 @@
 namespace Codewithkyrian\Transformers\Decoders;
 
 use Codewithkyrian\Transformers\Tokenizers\AddedToken;
-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
 use SplFixedArray;
 
 class ByteLevelDecoder extends Decoder
@@ -287,7 +287,7 @@ public function convertTokensToString(array $tokens): string
 
         $binaryString = pack('C*', ...$byteArray);
 
-        return mb_convert_encoding($binaryString, 'ISO-8859-1');
+        return mb_convert_encoding($binaryString, 'UTF-8');
     }
 
     protected function decodeChain(array $tokens): array
@@ -298,9 +298,7 @@ protected function decodeChain(array $tokens): array
         foreach ($tokens as $token) {
             // No need to check skip_special_tokens since the tokens are already filtered
 
-            $addedToken = array_filter($this->addedTokens, function (AddedToken $x) use ($token) {
-                return $x->content === $token;
-            });
+            $addedToken = array_filter($this->addedTokens, fn (AddedToken $x) => $x->content === $token);
 
             if (!empty($addedToken)) {
                 if (!empty($currentSubText)) {
@@ -319,7 +317,6 @@ protected function decodeChain(array $tokens): array
         }
 
         // TODO: add spaces_between_special_tokens and clean_up_tokenization_spaces options
-
         return $subTexts;
     }
-}
+}
@@ -5,7 +5,7 @@
 
 namespace Codewithkyrian\Transformers\Decoders;
 
-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
 
 /**
  * The CTC (Connectionist Temporal Classification) decoder.
@@ -65,7 +65,7 @@ function convertTokensToString(array $tokens): string
         $text = implode('', $filteredTokens);
         if ($this->cleanup) {
             // Cleanup and replace delimiter token
-            $text = trim(str_replace($this->wordDelimiterToken, ' ', Tokenizer::cleanUpTokenization($text)));
+            $text = trim(str_replace($this->wordDelimiterToken, ' ', TokenizerModel::cleanUpTokenization($text)));
         }
 
         return $text;
@@ -75,4 +75,4 @@ protected function decodeChain(array $tokens): array
     {
         return [$this->convertTokensToString($tokens)];
     }
-}
+}
@@ -2,7 +2,6 @@
 
 declare(strict_types=1);
 
-
 namespace Codewithkyrian\Transformers\Decoders;
 
 class ReplaceDecoder extends Decoder
@@ -38,4 +37,4 @@ protected function decodeChain(array $tokens): array
             return $token;
         }, $tokens);
     }
-}
+}
@@ -28,17 +28,18 @@ protected function decodeChain(array $tokens): array
         return array_map(function ($token) {
             $startCut = 0;
             for ($i = 0; $i < $this->start; ++$i) {
-                if ($token[$i] ?? null === $this->content) {
+                $char = mb_substr($token, $i, 1);
+                if ($char === $this->content) {
                     $startCut = $i + 1;
                     continue;
                 } else {
                     break;
                 }
             }
 
-            $stopCut = strlen($token);
+            $stopCut = mb_strlen($token);
             for ($i = 0; $i < $this->stop; ++$i) {
-                $index = strlen($token) - $i - 1;
+                $index = mb_strlen($token) - $i - 1;
                 if ($token[$index] ?? null === $this->content) {
                     $stopCut = $index;
                     continue;
@@ -47,7 +48,7 @@ protected function decodeChain(array $tokens): array
                 }
             }
 
-            return substr($token, $startCut, $stopCut - $startCut);
+            return mb_substr($token, $startCut, $stopCut - $startCut);
         }, $tokens);
     }
-}
+}
@@ -0,0 +1,20 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Codewithkyrian\Transformers\Decoders;
+/** Decoder whose decodeChain keeps only the tokens at odd positions and joins them into one string. */
+class VitsDecoder extends Decoder
+{
+    /** Concatenates $tokens[1], $tokens[3], ... and returns the result as a single-element array. */
+    protected function decodeChain(array $tokens): array
+    {
+        $decoded = '';
+        // Start at index 1, step by 2: even-indexed tokens are skipped (presumably filler; confirm with tokenizer).
+        for ($i = 1; $i < count($tokens); $i += 2) {
+            $decoded .= $tokens[$i];
+        }
+        // With zero or one input token the loop body never runs, so the result is [''].
+        return [$decoded];
+    }
+}
@@ -5,7 +5,7 @@
 
 namespace Codewithkyrian\Transformers\Decoders;
 
-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
 
 class WordPieceDecoder extends Decoder
 {
@@ -31,12 +31,12 @@ protected function decodeChain(array $tokens): array
                 }
             }
             if ($this->cleanup) {
-                $token = Tokenizer::cleanUpTokenization($token);
+                $token = TokenizerModel::cleanUpTokenization($token);
             }
 
             $decodedTokens[] = $token;
         }
 
         return $decodedTokens;
     }
-}
+}
@@ -46,22 +46,12 @@ abstract public function sample(Tensor $logits, int $index);
      */
     public function getLogits(Tensor $logits, int $index): Tensor
     {
-//        $vocabSize = $logits->shape()[$logits->ndim() - 1];
-
-//        $start = array_fill(0, $logits->ndim(), 0);
-//        $size = array_fill(0, $logits->ndim(), 1);
-//
-//        array_splice($start, -2, replacement: [$index, 0]);
-//        array_splice($size, -2, replacement: [1, $vocabSize]);
-//
-//        $logs = $logits->sliceWithBounds($start, $size);
         $logits = $logits->slice($index);
 
         if ($this->generationConfig->temperature > 0) {
             $logits = $logits->multiply(1 / $this->generationConfig->temperature);
         }
 
-        // Remove all dimensions of 1, leaving a flat 1D array of vocab_size
         return $logits->squeeze();
     }
 
@@ -116,4 +106,4 @@ public static function getSampler(GenerationConfig $generationConfig): Sampler
             return new GreedySampler($generationConfig);
         }
     }
-}
+}
@@ -41,10 +41,14 @@ public function put(mixed $value): void
 
         if ($this->skipPrompt && $this->nextTokensArePrompt) {
             $this->nextTokensArePrompt = false;
-            $this->printedText = $this->tokenizer->decode($this->promptTokens, skipSpecialTokens: true);
-            $this->printedLength = mb_strlen($this->printedText);
+//            $this->printedText = $this->tokenizer->decode($this->promptTokens, skipSpecialTokens: true);
+//            $this->printedLength = mb_strlen($this->printedText);
+//            $this->lastDecodedCheckpointForToken = count($this->promptTokens) - 1;
+//            $this->lastDecodedCheckpointForText = mb_strlen($this->printedText);
+//            return;
+            $prompt = $this->tokenizer->decode($this->promptTokens, skipSpecialTokens: true);
+            $this->printedLength = mb_strlen($prompt);
             $this->lastDecodedCheckpointForToken = count($this->promptTokens) - 1;
-            $this->lastDecodedCheckpointForText = mb_strlen($this->printedText);
             return;
         }
 
@@ -90,4 +94,3 @@ public function end(): void
         $this->lastDecodedCheckpointForText = 0;
     }
 }
-
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`
`6`	`6`	`namespace Codewithkyrian\Transformers\Decoders;`
`7`	`7`
`8`		`-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;`
	`8`	`+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;`
`9`	`9`
`10`	`10`	`/**`
`11`	`11`	`* The CTC (Connectionist Temporal Classification) decoder.`
`@@ -65,7 +65,7 @@ function convertTokensToString(array $tokens): string`
`65`	`65`	`$text = implode('', $filteredTokens);`
`66`	`66`	`if ($this->cleanup) {`
`67`	`67`	`// Cleanup and replace delimiter token`
`68`		`- $text = trim(str_replace($this->wordDelimiterToken, ' ', Tokenizer::cleanUpTokenization($text)));`
	`68`	`+ $text = trim(str_replace($this->wordDelimiterToken, ' ', TokenizerModel::cleanUpTokenization($text)));`
`69`	`69`	`}`
`70`	`70`
`71`	`71`	`return $text;`
`@@ -75,4 +75,4 @@ protected function decodeChain(array $tokens): array`
`75`	`75`	`{`
`76`	`76`	`return [$this->convertTokensToString($tokens)];`
`77`	`77`	`}`
`78`		`-}`
	`78`	`+}`
Original file line number	Diff line number	Diff line change
`@@ -28,17 +28,18 @@ protected function decodeChain(array $tokens): array`
`28`	`28`	`return array_map(function ($token) {`
`29`	`29`	`$startCut = 0;`
`30`	`30`	`for ($i = 0; $i < $this->start; ++$i) {`
`31`		`- if ($token[$i] ?? null === $this->content) {`
	`31`	`+ $char = mb_substr($token, $i, 1);`
	`32`	`+ if ($char === $this->content) {`
`32`	`33`	`$startCut = $i + 1;`
`33`	`34`	`continue;`
`34`	`35`	`} else {`
`35`	`36`	`break;`
`36`	`37`	`}`
`37`	`38`	`}`
`38`	`39`
`39`		`- $stopCut = strlen($token);`
	`40`	`+ $stopCut = mb_strlen($token);`
`40`	`41`	`for ($i = 0; $i < $this->stop; ++$i) {`
`41`		`- $index = strlen($token) - $i - 1;`
	`42`	`+ $index = mb_strlen($token) - $i - 1;`
`42`	`43`	`if ($token[$index] ?? null === $this->content) {`
`43`	`44`	`$stopCut = $index;`
`44`	`45`	`continue;`
`@@ -47,7 +48,7 @@ protected function decodeChain(array $tokens): array`
`47`	`48`	`}`
`48`	`49`	`}`
`49`	`50`
`50`		`- return substr($token, $startCut, $stopCut - $startCut);`
	`51`	`+ return mb_substr($token, $startCut, $stopCut - $startCut);`
`51`	`52`	`}, $tokens);`
`52`	`53`	`}`
`53`		`-}`
	`54`	`+}`
Original file line number	Diff line number	Diff line change
`@@ -46,22 +46,12 @@ abstract public function sample(Tensor $logits, int $index);`
`46`	`46`	`*/`
`47`	`47`	`public function getLogits(Tensor $logits, int $index): Tensor`
`48`	`48`	`{`
`49`		`-// $vocabSize = $logits->shape()[$logits->ndim() - 1];`
`50`		`-`
`51`		`-// $start = array_fill(0, $logits->ndim(), 0);`
`52`		`-// $size = array_fill(0, $logits->ndim(), 1);`
`53`		`-//`
`54`		`-// array_splice($start, -2, replacement: [$index, 0]);`
`55`		`-// array_splice($size, -2, replacement: [1, $vocabSize]);`
`56`		`-//`
`57`		`-// $logs = $logits->sliceWithBounds($start, $size);`
`58`	`49`	`$logits = $logits->slice($index);`
`59`	`50`
`60`	`51`	`if ($this->generationConfig->temperature > 0) {`
`61`	`52`	`$logits = $logits->multiply(1 / $this->generationConfig->temperature);`
`62`	`53`	`}`
`63`	`54`
`64`		`- // Remove all dimensions of 1, leaving a flat 1D array of vocab_size`
`65`	`55`	`return $logits->squeeze();`
`66`	`56`	`}`
`67`	`57`
`@@ -116,4 +106,4 @@ public static function getSampler(GenerationConfig $generationConfig): Sampler`
`116`	`106`	`return new GreedySampler($generationConfig);`
`117`	`107`	`}`
`118`	`108`	`}`
`119`		`-}`
	`109`	`+}`