Skip to content

Commit

Permalink
Cache tokenizer output to improve speed of tasks like Zero Shot Class…
Browse files Browse the repository at this point in the history
…ification
  • Loading branch information
CodeWithKyrian committed Mar 25, 2024
1 parent 05e5588 commit b115c28
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 5 deletions.
18 changes: 14 additions & 4 deletions examples/pipelines/zero-shot-classification.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,27 @@

use function Codewithkyrian\Transformers\Pipelines\pipeline;
use function Codewithkyrian\Transformers\Utils\memoryUsage;
use function Codewithkyrian\Transformers\Utils\timeUsage;

require_once './bootstrap.php';


//$classifier = pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli');
//$result = $classifier('Who are you voting for in 2020?', ['politics', 'public health', 'economics', 'elections']);

ini_set('memory_limit', '160M');
ini_set('memory_limit', -1);
$classifier = pipeline('zero-shot-classification', 'Xenova/nli-deberta-v3-xsmall');

$input = "The tension was thick as fog in the arena tonight as the underdogs, the Nets, clawed their way back from a significant deficit to steal a victory from the heavily favored The BUlls in a final score of 120 - Nets to 80 - Bulls
The game was a nail-biter from the start. The Bulls jumped out to an early lead, showcasing their signature fast-paced offense. Net's defense struggled to contain their star player, Frank, who racked up points in the first half.
However, just before halftime, the tide began to turn. The NEts's forward - James hit a series of clutch three-pointers, igniting a spark in the home crowd. The team rallied behind his energy, tightening up their defense and chipping away at the lead.
The second half was a back-and-forth affair, with neither team able to establish a clear advantage. Both sides traded baskets, steals, and blocks, keeping the fans on the edge of their seats. With seconds remaining on the clock, the score was tied.";
$result = $classifier(
'I have a problem with my iphone that needs to be resolved asap!',
['urgent', 'not urgent', 'phone', 'tablet', 'computer'],
$input,
['politics', 'public health', 'economics', 'elections', 'sports', 'entertainment', 'technology', 'business', 'finance', 'education', 'science', 'religion', 'history', 'culture', 'environment', 'weather'],
multiLabel: true
);

Expand All @@ -29,5 +38,6 @@
//
//$result = $classifier('Apple just announced the newest iPhone 13', ["technology", "sports", "politics"]);

dd(memoryUsage(), $result);
dd( $result, timeUsage(), memoryUsage());

// Improved from 11.7687s to 2.9687s, ~4x faster (75% improvement)
16 changes: 15 additions & 1 deletion src/PretrainedTokenizers/PretrainedTokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use Codewithkyrian\Transformers\Tokenizers\AddedToken;
use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
use Codewithkyrian\Transformers\Utils\Tensor;
use function Codewithkyrian\Transformers\Utils\timeUsage;

class PretrainedTokenizer
{
Expand Down Expand Up @@ -70,6 +71,7 @@ class PretrainedTokenizer

protected mixed $chatTemplate;
protected array $compiledTemplateCache = [];
protected array $tokenizationCache = [];

/**
* @param array $tokenizerJSON The JSON of the tokenizer.
Expand Down Expand Up @@ -404,6 +406,13 @@ protected function encodeText(?string $text): ?array
return null;
}

// Hash the text and check if it is in the cache
$hash = hash('sha256', $text);

if (isset($this->tokenizationCache[$hash])) {
return $this->tokenizationCache[$hash];
}

// Actual function which does encoding, for a single text
// First, we take care of special tokens. Needed to avoid issues arising from
// normalization and/or pretokenization (which may not preserve special tokens)
Expand Down Expand Up @@ -442,7 +451,12 @@ protected function encodeText(?string $text): ?array
}
}, $sections, array_keys($sections));

return array_merge(...$tokens);
$result = array_merge(...$tokens);

// Cache the result
$this->tokenizationCache[$hash] = $result;

return $result;
}


Expand Down

0 comments on commit b115c28

Please sign in to comment.