Skip to content

Commit 7cab90f

Browse files
authored
Fix nltk bug in multi-threaded environments (run-llama#8668)
wip
1 parent d494329 commit 7cab90f

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

llama_index/text_splitter/utils.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,10 @@
1+
import logging
12
from typing import Callable, List
23

34
from llama_index.text_splitter.types import TextSplitter
45

6+
logger = logging.getLogger(__name__)
7+
58

69
def truncate_text(text: str, text_splitter: TextSplitter) -> str:
710
"""Truncate text to fit within the chunk size."""
@@ -46,7 +49,14 @@ def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
4649
try:
4750
nltk.data.find("tokenizers/punkt")
4851
except LookupError:
49-
nltk.download("punkt", download_dir=nltk_data_dir)
52+
try:
53+
nltk.download("punkt", download_dir=nltk_data_dir)
54+
except FileExistsError:
55+
logger.info(
56+
"Tried to re-download NLTK files but already exists. "
57+
"This could happen in multi-theaded deployments, "
58+
"should be benign"
59+
)
5060

5161
tokenizer = nltk.tokenize.PunktSentenceTokenizer()
5262

0 commit comments

Comments
 (0)