Skip to content

Commit

Permalink
fix: improve Unigram Tokenizer handling of multibyte strings
Browse files Browse the repository at this point in the history
  • Loading branch information
CodeWithKyrian committed Sep 13, 2024
1 parent 7e880dd commit e8a8a9a
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 89 deletions.
12 changes: 8 additions & 4 deletions src/DataStructures/CharTrie.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ public function extend(array $texts): void
public function push(string $text): void
{
$node = $this->root;
+ $length = mb_strlen($text);

- for ($i = 0, $length = strlen($text); $i < $length; $i++) {
- $ch = $text[$i];
+ for ($i = 0; $i < $length; $i++) {
+ $ch = mb_substr($text, $i, 1);
$node = $node->getChild($ch);
}

Expand All @@ -59,10 +60,13 @@ public function commonPrefixSearch(string $text): Generator
{
$node = $this->root;
$prefix = "";
- for ($i = 0; $i < strlen($text) && $node != null; $i++) {
- $ch = $text[$i];
+ $length = mb_strlen($text);
+
+ for ($i = 0; $i < $length && $node != null; $i++) {
+ $ch = mb_substr($text, $i, 1);
$prefix .= $ch;
$node = $node->getChild($ch);
+
if ($node?->isLeaf) {
yield $prefix;
}
Expand Down
4 changes: 2 additions & 2 deletions src/DataStructures/TokenLattice.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public function __construct(
public ?int $bosTokenId,
public ?int $eosTokenId)
{
- $this->len = strlen($sentence);
+ $this->len = mb_strlen($sentence);
$this->beginNodes = array_fill(0, $this->len + 1, []);
$this->endNodes = array_fill(0, $this->len + 1, []);

Expand Down Expand Up @@ -124,7 +124,7 @@ public function viterbi(): array
*/
public function piece(TokenLatticeNode $node): string
{
- return substr($this->sentence, $node->pos, $node->length);
+ return mb_substr($this->sentence, $node->pos, $node->length);
}

/**
Expand Down
Loading

0 comments on commit e8a8a9a

Please sign in to comment.