forked from sahil-sagwekar2652/semantic-search-demo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
55 lines (40 loc) · 1.32 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import sys
import os
# Semantic-search demo script: load every PDF under ./consume, split it into
# chunks, embed the chunks into a Chroma store kept in ./chroma_db, then print
# the chunks most similar to the query passed as the first CLI argument.

# Validate the query before any loading/embedding happens, so a missing
# argument produces a usage message immediately instead of an IndexError
# after the whole indexing pass has already run.
if len(sys.argv) < 2 or not sys.argv[1].strip():
    sys.exit(f'usage: python {sys.argv[0]} "<query>"')
query = sys.argv[1]  # argv entries are already str; no conversion needed

# Load environment variables
load_dotenv()

# Pick up every PDF directly inside ./consume and parse each with PyPDFLoader.
loader = DirectoryLoader(
    "./consume",
    glob="*.pdf",
    use_multithreading=True,
    loader_cls=PyPDFLoader,
    show_progress=True,
)

# Split the loaded pages into chunks (chunk_size=1000, chunk_overlap=10).
docs = loader.load_and_split(
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
)

# Dump the raw chunk text for inspection. The encoding is explicit because
# text extracted from PDFs frequently contains non-ASCII characters, which the
# platform default encoding may not be able to represent.
with open("docs.txt", "w", encoding="utf-8") as dump_file:
    dump_file.writelines(doc.page_content for doc in docs)

print("-" * 20 + "DOCUMENT LOAD AND SPLIT COMPLETE" + "-" * 20)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# save to disk
# NOTE(review): the index is rebuilt on every run. If ./chroma_db already holds
# a collection, from_documents presumably adds to it rather than replacing it,
# which would accumulate duplicate chunks -- confirm against the Chroma docs.
db = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")

print("-" * 20 + "EMBEDDINGS CREATED" + "-" * 20)

# To reuse an existing index instead of rebuilding it, load from disk:
#   db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

# Embed the query with the same model used for the documents and look up the
# nearest stored chunks.
query_embedding = embeddings.embed_query(query)
fdocs = db.similarity_search_by_vector(query_embedding)

# Print each match: its metadata first, then the chunk text.
for match in fdocs:
    print(match.metadata)
    print("\n\n")
    print(match.page_content)
    print("\n\n")