Skip to content

Commit 9aa12f9

Browse files
Merge pull request #49 from CodeWithKyrian/43-regex-for-detecting-language-codes-incorrect
fix: improve regex for detecting language codes in NllbTokenizer
2 parents dd2481f + 3a261da commit 9aa12f9

File tree

3 files changed

+5
-4
lines changed

3 files changed

+5
-4
lines changed

examples/pipelines/translation.php

+4-2
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,13 @@
1111

1212
ini_set('memory_limit', -1);
1313

14-
$translator = pipeline('translation', 'Xenova/m2m100_418M');
14+
//$translator = pipeline('translation', 'Xenova/m2m100_418M');
15+
$translator = pipeline('translation', 'Xenova/nllb-200-distilled-600M');
1516

1617
$streamer = StdOutStreamer::make();
1718

18-
$output = $translator('生活就像一盒巧克力。', streamer: $streamer, tgtLang: 'en');
19+
//$output = $translator('生活就像一盒巧克力。', streamer: $streamer, tgtLang: 'en');
20+
$output = $translator('जीवन एक चॉकलेट बॉक्स की तरह है।', streamer: $streamer, tgtLang: 'fra_Latn');
1921
//$output = $translator('जीवन एक चॉकलेट बॉक्स की तरह है।', streamer: $streamer, tgtLang: 'fr');
2022
//$output = $translator('संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है', streamer: $streamer, tgtLang: 'fr', maxNewTokens: 256);
2123

src/Normalizers/Replace.php

-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
declare(strict_types=1);
44

5-
65
namespace Codewithkyrian\Transformers\Normalizers;
76

87
/**

src/PretrainedTokenizers/NllbTokenizer.php

+1-1
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
class NllbTokenizer extends PretrainedTokenizer
1212
{
13-
protected string $languageRegex = '/^[a-z]{3}_[A-Z]{3}$/';
13+
protected string $languageRegex = '/^[a-z]{3}_[a-zA-Z]{3,4}$/';
1414

1515
protected array $languageCodes = [];
1616
protected \Closure $langToToken;

0 commit comments

Comments
 (0)