1
1
import gspread
2
+ import logging
2
3
3
4
from dialog_lib .db .models import CompanyContent
4
5
from dialog_lib .embeddings .generate import generate_embedding
11
12
from typing import Any , Dict , Iterator , List , Optional , Sequence , Union
12
13
13
14
15
+ logger = logging .getLogger (__name__ )
16
+
14
17
class GoogleSheetsLoader (BaseLoader ):
15
18
def __init__ (self , credentials_path : Union [str , Path ], spreadsheet_url : str , sheet_name : str ):
16
19
self .sheet_name = sheet_name
@@ -59,15 +62,19 @@ def load_google_sheets(
59
62
values = line .split (": " )
60
63
content [values [0 ]] = values [1 ]
61
64
62
- company_content = CompanyContent (
63
- category = "csv" ,
64
- subcategory = "csv-content" ,
65
- question = content ["question" ],
66
- content = content ["content" ],
67
- dataset = company_id ,
68
- embedding = generate_embedding (csv_content .page_content , embeddings_model_instance )
69
- )
70
- dbsession .add (company_content )
65
+ if not dbsession .query (CompanyContent ).filter (
66
+ CompanyContent .question == content ["question" ], CompanyContent .content == content ["content" ]
67
+ ).first ():
68
+ company_content = CompanyContent (
69
+ category = "csv" ,
70
+ subcategory = "csv-content" ,
71
+ question = content ["question" ],
72
+ content = content ["content" ],
73
+ dataset = company_id ,
74
+ embedding = generate_embedding (csv_content .page_content , embeddings_model_instance )
75
+ )
76
+ dbsession .add (company_content )
77
+ else :
78
+ logger .warning (f"Question: { content ['question' ]} already exists in the database. Skipping." )
71
79
72
- dbsession .commit ()
73
- return company_content
80
+ dbsession .commit ()
0 commit comments