Skip to content

Commit 5b07ec6

Browse files
committed
Adds a loading check to the CSV and GSheets loaders so we don't load the same content twice
1 parent 1a1b775 commit 5b07ec6

File tree

2 files changed

+28
-13
lines changed

2 files changed

+28
-13
lines changed

dialog_lib/loaders/csv.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
from dialog_lib.db import get_session
23
from dialog_lib.db.models import CompanyContent
34
from dialog_lib.embeddings.generate import generate_embedding
@@ -6,6 +7,9 @@
67
from langchain_community.document_loaders.csv_loader import CSVLoader
78

89

10+
logger = logging.getLogger(__name__)
11+
12+
913
def load_csv(
1014
file_path, dbsession=get_session, embeddings_model_instance=None,
1115
embedding_llm_model=None, embedding_llm_api_key=None, company_id=None
@@ -28,6 +32,10 @@ def load_csv(
2832
values = line.split(": ")
2933
content[values[0]] = values[1]
3034

35+
36+
if not dbsession.query(CompanyContent).filter(
37+
CompanyContent.question == content["question"], CompanyContent.content == content["content"]
38+
).first():
3139
company_content = CompanyContent(
3240
category="csv",
3341
subcategory="csv-content",
@@ -37,5 +45,5 @@ def load_csv(
3745
embedding=generate_embedding(csv_content.page_content, embeddings_model_instance)
3846
)
3947
session.add(company_content)
40-
41-
return company_content
48+
else:
49+
logger.warning(f"Question: {content['question']} already exists in the database. Skipping.")

dialog_lib/loaders/gsheets.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import gspread
2+
import logging
23

34
from dialog_lib.db.models import CompanyContent
45
from dialog_lib.embeddings.generate import generate_embedding
@@ -11,6 +12,8 @@
1112
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
1213

1314

15+
logger = logging.getLogger(__name__)
16+
1417
class GoogleSheetsLoader(BaseLoader):
1518
def __init__(self, credentials_path: Union[str, Path], spreadsheet_url: str, sheet_name: str):
1619
self.sheet_name = sheet_name
@@ -59,15 +62,19 @@ def load_google_sheets(
5962
values = line.split(": ")
6063
content[values[0]] = values[1]
6164

62-
company_content = CompanyContent(
63-
category="csv",
64-
subcategory="csv-content",
65-
question=content["question"],
66-
content=content["content"],
67-
dataset=company_id,
68-
embedding=generate_embedding(csv_content.page_content, embeddings_model_instance)
69-
)
70-
dbsession.add(company_content)
65+
if not dbsession.query(CompanyContent).filter(
66+
CompanyContent.question == content["question"], CompanyContent.content == content["content"]
67+
).first():
68+
company_content = CompanyContent(
69+
category="csv",
70+
subcategory="csv-content",
71+
question=content["question"],
72+
content=content["content"],
73+
dataset=company_id,
74+
embedding=generate_embedding(csv_content.page_content, embeddings_model_instance)
75+
)
76+
dbsession.add(company_content)
77+
else:
78+
logger.warning(f"Question: {content['question']} already exists in the database. Skipping.")
7179

72-
dbsession.commit()
73-
return company_content
80+
dbsession.commit()

0 commit comments

Comments
 (0)