-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathupload.py
43 lines (35 loc) · 1.14 KB
/
upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import chromadb
import os
import dotenv
from langchain_community.document_loaders import PyPDFLoader
# Load variables from a .env file into the process environment so that the
# os.getenv() lookup below can see HUGGINGFACE_API_KEY. This must run before
# the embedding function is constructed.
dotenv.load_dotenv()
# Chroma client created with the relative path 'db' as its storage location.
client = chromadb.PersistentClient(path='db')
# Collection named "minilm"; db() writes to it and query_search() reads from it.
collection = client.get_or_create_collection(name="minilm")
import chromadb.utils.embedding_functions as embedding_functions
# Embedding function used by embed() and query_search(). os.getenv() returns
# None when HUGGINGFACE_API_KEY is not set, so a missing key is not detected
# on this line.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
api_key=os.getenv('HUGGINGFACE_API_KEY'),
model_name="sentence-transformers/all-MiniLM-L6-v2"
)
def db(text, embed, ids):
    """Add one document and its embedding to the module-level collection.

    Args:
        text: Document text; stored as the document and echoed in the
            confirmation line printed afterwards.
        embed: Embedding vector associated with ``text``.
        ids: Identifier for the record; converted with ``str()`` before it
            is handed to the collection.
    """
    # A single record is stored per call, so every field is a one-item list.
    record = {
        "documents": [text],
        "embeddings": [embed],
        "ids": [str(ids)],
    }
    collection.add(**record)

    message = text + " added to database"
    print(message)
def embed(pdf, id_prefix=""):
    """Load a PDF, embed each document it is split into, and store the results.

    Args:
        pdf: Path of the PDF file, passed to ``PyPDFLoader``.
        id_prefix: Optional string prepended to every record ID. The default
            ``""`` keeps the bare positional IDs ("0", "1", ...). Pass a
            distinct prefix per file so that IDs from different PDFs do not
            collide in the shared collection.
    """
    loader = PyPDFLoader(pdf)
    pages = loader.load_and_split()

    # enumerate() supplies each document's position directly. Looking the
    # position up with pages.index(page) rescans the list on every iteration
    # and returns the first match, so two equal documents would share an ID.
    for position, page in enumerate(pages):
        content = page.page_content
        # The embedding function takes a list of documents and returns one
        # vector per document, so wrap the single text and take the first.
        vectors = huggingface_ef([content])
        db(content, vectors[0], f"{id_prefix}{position}")
def query_search(input, n=5):
    """Look up stored documents using the embedding of a query string.

    Args:
        input: Query text. The name shadows the ``input`` builtin inside this
            function, but it is kept so existing keyword callers still work.
        n: Number of results requested from the collection (default 5).

    Returns:
        The ``documents`` entry for the single query, i.e.
        ``res['documents'][0]`` from the collection's query result.
    """
    # The embedding function takes a list of documents and returns one vector
    # per document, so wrap the single query and use the first vector.
    query_vectors = huggingface_ef([input])
    res = collection.query(
        query_embeddings=[query_vectors[0]],
        n_results=n,
    )
    # One query was sent, so the first (only) entry holds its documents.
    return res['documents'][0]
# embed("eBook-How-to-Build-a-Career-in-AI.pdf")
# print(query_search("who is the author of the book?"))