Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"any_locale_word_tokenize",
"english_word_tokenize",
"LATIN_CHARS_ALL",
"INDIC_CHARS_ALL",
"normalize_unicode_text",
"japanese_text_preprocessing",
]
Expand All @@ -52,11 +53,23 @@
LATIN_ALPHABET_BASIC = "A-Za-z"
ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"

# Indic characters based on https://www.unicode.org/charts/
# Each constant is a regex character-class fragment (no surrounding brackets), so the
# fragments can be concatenated and dropped inside a single `[...]`.
# The Devanagari range deliberately skips U+0964 (danda) and U+0965 (double danda):
# they are sentence punctuation, and leaving them in a "word character" class glues
# them onto the preceding word (e.g. "दुनिया।"), which then misses dictionary lookups.
DEVANAGARI_CHARS = (
    r'\u0900-\u0963\u0966-\u097F'  # Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
)
BENGALI_CHARS = r'\u0980-\u09FF'  # Bengali, Assamese
TAMIL_CHARS = r'\u0B80-\u0BFF'  # Tamil
TELUGU_CHARS = r'\u0C00-\u0C7F'  # Telugu
KANNADA_CHARS = r'\u0C80-\u0CFF'  # Kannada
GUJARATI_CHARS = r'\u0A80-\u0AFF'  # Gujarati
INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"
Comment on lines +57 to +66
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

INDIC_CHARS_ALL uses entire Unicode blocks (e.g., \u0900-\u097F), which include punctuation such as the Devanagari danda (U+0964). Because _WORDS_RE_ANY_LOCALE treats everything in INDIC_CHARS_ALL as part of a “word”, strings like दुनिया। will be tokenized as a single word and won't match phoneme-dict entries for दुनिया. Consider narrowing these ranges to letters/marks (and optionally digits) and explicitly excluding script punctuation like ।/॥ (danda U+0964 and double danda U+0965) so they tokenize as punctuation separators.

Copilot uses AI. Check for mistakes.

# English tokenization pattern. It has three capture groups, tried in this order:
#   1. a word made of basic Latin letters, which may contain hyphens or apostrophes
#      in the middle but always starts and ends with a letter;
#   2. a span enclosed in vertical bars, i.e. "|...|" with no bar inside;
#   3. a run of characters that are neither basic Latin letters nor a vertical bar.
_WORDS_RE_EN = re.compile(
fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
)
_WORDS_RE_ANY_LOCALE = re.compile(
fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
fr"([{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}|]+)"
)


Expand Down
9 changes: 7 additions & 2 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
INDIC_CHARS_ALL,
LATIN_CHARS_ALL,
any_locale_word_tokenize,
english_word_tokenize,
Expand All @@ -29,13 +30,16 @@
from nemo.collections.tts.g2p.utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case
from nemo.utils import logging

# Compiled regex pattern for Indic scripts (used in dictionary parsing).
# It matches a single character from INDIC_CHARS_ALL at the start of the input;
# _parse_phoneme_dict applies it to the first character of a dictionary line to decide
# whether that line is parsed as an entry.
_INDIC_PATTERN = re.compile(f'^[{INDIC_CHARS_ALL}]')


class IpaG2p(BaseG2p):
# fmt: off
STRESS_SYMBOLS = ["ˈ", "ˌ"]
# Regex for roman characters, accented characters, and locale-agnostic numbers/digits
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]")
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}\d]")
Comment on lines 39 to +42
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHAR_REGEX/PUNCT_REGEX are expanded to include INDIC_CHARS_ALL, but since INDIC_CHARS_ALL currently includes Devanagari punctuation (e.g., the danda ।), parse_one_word() will no longer recognize those symbols as punctuation-only tokens (CHAR_REGEX.search() will match). This can cause Hindi punctuation to be treated as part of a word and lead to OOV fallbacks instead of dictionary lookups. Recommend switching to an Indic letter/mark set (excluding danda/double danda) for CHAR_REGEX and treating those punctuation marks via the punctuation path.

Copilot uses AI. Check for mistakes.
# fmt: on

def __init__(
Expand Down Expand Up @@ -190,6 +194,7 @@ def _parse_phoneme_dict(
or 'À' <= line[0] <= 'Ö'
or 'Ø' <= line[0] <= 'ö'
or 'ø' <= line[0] <= 'ÿ'
or _INDIC_PATTERN.match(line[0])
or line[0] == "'"
):
parts = line.strip().split(maxsplit=1)
Expand Down
Loading
Loading