Refactor arguments and fix wrong attention mask type in generation

CodeWithKyrian · CodeWithKyrian · commit 715e2f7fe150 · 2024-03-20T17:57:52.000+01:00
diff --git a/examples/pipelines/text-generation.php b/examples/pipelines/text-generation.php
@@ -10,28 +10,34 @@
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 ini_set('memory_limit', -1);
-
-//$generator = pipeline('text-generation', 'Xenova/gpt2');
-//
-//$streamer = StdOutStreamer::make($generator->tokenizer);
 //
-//$output = $generator('The Black man worked as a',
-//    streamer: $streamer,
-//    maxNewTokens: 128,
-//    doSample: true,
-//    temperature: 0.7,
-//    repetitionPenalty: 1.3,
-//    earlyStopping: true
-//);
+$generator = pipeline('text-generation', 'Xenova/gpt2');
 
-$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
 $streamer = StdOutStreamer::make($generator->tokenizer);
 
-$output = $generator(
-    'def fib(n):',
+$messages = [
+    ['role' => 'user', 'content' => 'Hello!'],
+    ['role' => 'assistant', 'content' => 'Hi! How are you?'],
+    ['role' => 'user', 'content' => 'I am doing great. What about you?'],
+];
+
+$output = $generator("I love going to school but I don't",
     streamer: $streamer,
-    maxNewTokens: 50,
-    doSample: true
+    maxNewTokens: 128,
+    doSample: true,
+    temperature: 0.7,
+    repetitionPenalty: 1.3,
+    earlyStopping: true
 );
 
+//$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
+//$streamer = StdOutStreamer::make($generator->tokenizer);
+//
+//$output = $generator(
+//    'def fib(n):',
+//    streamer: $streamer,
+//    maxNewTokens: 100,
+//    doSample: true
+//);
+
 dd("done", timeUsage(), memoryUsage());
diff --git a/src/Models/Pretrained/PreTrainedModel.php b/src/Models/Pretrained/PreTrainedModel.php
@@ -684,7 +684,7 @@ public function generate(
         Tensor               $inputs,
         ?GenerationConfig    $generationConfig = null,
         ?LogitsProcessorList $logitsProcessor = null,
-        array                $inputsAttentionMask = null,
+        Tensor                $inputsAttentionMask = null,
         ?Streamer            $streamer = null,
     ): array
     {
diff --git a/src/Pipelines/TextGenerationPipeline.php b/src/Pipelines/TextGenerationPipeline.php
@@ -64,7 +64,7 @@ public function __invoke(array|string $texts, ...$args): array
             $snakeCasedArgs[$this->camelCaseToSnakeCase($key)] = $value;
         }
 
-        $generateKwargs = new GenerationConfig($snakeCasedArgs);
+        $generationConfig = new GenerationConfig($snakeCasedArgs);
 
         $isBatched = is_array($texts);
         if (!$isBatched) {
@@ -75,14 +75,14 @@ public function __invoke(array|string $texts, ...$args): array
         $addSpecialTokens = $this->model->config['add_special_tokens'] ?? false;
 
         $this->tokenizer->paddingSide = 'left';
-        ['input_ids' => $inputIds, 'attention_mask' => $attentionMask] = $this->tokenizer->__invoke(
+        ['input_ids' => $inputIds, 'attention_mask' => $attentionMask] = $this->tokenizer->tokenize(
             $texts,
             padding: true,
             addSpecialTokens: $addSpecialTokens,
             truncation: true
         );
 
-        $outputTokenIds = $this->model->generate($inputIds, generationConfig: $generateKwargs, streamer: $streamer);
+        $outputTokenIds = $this->model->generate($inputIds, generationConfig: $generationConfig, streamer: $streamer);
 
         $decoded = $this->tokenizer->batchDecode($outputTokenIds, skipSpecialTokens: true);
 
diff --git a/src/PretrainedTokenizers/PretrainedTokenizer.php b/src/PretrainedTokenizers/PretrainedTokenizer.php
@@ -368,6 +368,31 @@ function ($key) {
         return $result;
     }
 
+    /**
+     * Tokenize the given text(s). Named alias of __invoke(): every argument is forwarded to it positionally, unchanged.
+     *
+     * @param string|array $text The text to tokenize.
+     * @param string|array|null $textPair Optional second sequence to be encoded. If set, must be the same type as text.
+     * @param bool|string $padding Whether to pad the input sequences.
+     * @param bool $addSpecialTokens Whether to add the special tokens associated with the corresponding model.
+     * @param bool $truncation Whether to truncate the input sequences.
+     * @param int|null $maxLength Maximum length of the returned list and optionally padding length.
+     *
+     * @return array{input_ids: Tensor, attention_mask: Tensor, token_type_ids: Tensor|null} Exactly what __invoke() returns for the same arguments.
+     */
+    public function tokenize(
+        string|array      $text,
+        string|array|null $textPair = null,
+        bool|string       $padding = false,
+        bool              $addSpecialTokens = true,
+        bool              $truncation = false,
+        ?int              $maxLength = null,
+    ): array
+    {
+        return $this->__invoke($text, $textPair, $padding, $addSpecialTokens, $truncation, $maxLength);
+    }
+
+
     /**
      * Encodes a single text using the preprocessor pipeline of the tokenizer.
      *

Original file line number	Diff line number	Diff line change
`@@ -684,7 +684,7 @@ public function generate(`
`684`	`684`	`Tensor $inputs,`
`685`	`685`	`?GenerationConfig $generationConfig = null,`
`686`	`686`	`?LogitsProcessorList $logitsProcessor = null,`
`687`		`- array $inputsAttentionMask = null,`
	`687`	`+ Tensor $inputsAttentionMask = null,`
`688`	`688`	`?Streamer $streamer = null,`
`689`	`689`	`): array`
`690`	`690`	`{`