forked from hwchase17/chat-your-data
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathingest_data.py
44 lines (37 loc) · 1.25 KB
/
ingest_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
from langchain.llms import OpenAI
import pickle
from glob import glob
import os
from langchain.docstore.document import Document
from config import settings
def read_text(path):
    """Load the file at *path* as a single ``Document``.

    Args:
        path: Filesystem path of the candidate text file.

    Returns:
        A ``Document`` whose ``page_content`` is the full file text and whose
        metadata records the path under the ``"source"`` key, or ``None`` when
        the path cannot be read as text (missing file, directory, permission
        problem, or bytes that do not decode with the default encoding).
    """
    try:
        with open(path) as f:
            text = f.read()
    except (OSError, UnicodeDecodeError):
        # Best-effort ingest: unreadable candidates are skipped by the caller.
        # Only I/O and decoding failures are swallowed, so Ctrl-C and genuine
        # programming errors still surface instead of being silently dropped.
        return None
    metadata = {"source": path}
    return Document(page_content=text, metadata=metadata)
# Load and split documents.
# Every entry exactly two levels below data/ is a candidate; read_text returns
# None for anything it cannot load, and those entries are dropped.
text_path_candidates = glob("data/*/*")
# Read each candidate exactly once: the generator feeds the filter below, so a
# file is never opened a second time just to re-create the value that was
# already tested.
loaded_documents = (
    read_text(text_path_candidate)
    for text_path_candidate in text_path_candidates
)
documents = [document for document in loaded_documents if document is not None]
# Split the loaded documents into chunks using the splitter's default settings.
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(documents)

# Load data into the vectorstore.
# The embedder wraps the plain OpenAI embeddings with a
# HypotheticalDocumentEmbedder driven by an OpenAI LLM; the LLM's `n` and
# `best_of` values come from the project settings.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=OpenAI(n=settings.HyDE_n, best_of=settings.HyDE_best_of),
    base_embeddings=OpenAIEmbeddings(),
    prompt_key="web_search",
)
vectorstore = FAISS.from_documents(documents, embeddings)

# Save the vectorstore.
# NOTE: the index is persisted with pickle; only unpickle this file when it
# comes from a trusted source.
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)