@@ -1,16 +1,17 @@
-from langchain_core.globals import set_verbose, set_debug
-from langchain_ollama import ChatOllama, OllamaEmbeddings
+import logging
+from typing import Optional
+
+import yaml
 from langchain.schema.output_parser import StrOutputParser
-from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
-from pymongo import MongoClient
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.runnable import RunnablePassthrough
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
+from langchain_core.globals import set_debug, set_verbose
 from langchain_core.prompts import ChatPromptTemplate
-import logging
-import yaml
-
+from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
+from langchain_ollama import ChatOllama, OllamaEmbeddings
+from pymongo import MongoClient

 # Enable verbose debugging
 set_debug(True)
@@ -20,11 +21,13 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 def load_config(config_file: str = "config.yaml"):
     """Load configuration from a YAML file."""
-    with open(config_file, "r") as file:
+    with open(config_file) as file:
         return yaml.safe_load(file)

+
 class ChatPDF:
     """A class designed for PDF ingestion and question answering using RAG with detailed debugging logs."""

@@ -40,29 +43,33 @@ def __init__(self, config_file: str = "config.yaml"):
         mongo_connection_str = config["mongo_connection_str"]
         database_name = config["database_name"]
         collection_name = config["collection_name"]
-
+
         self.model = ChatOllama(model=llm_model)
         self.embeddings = OllamaEmbeddings(model=embedding_model)
-        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1024, chunk_overlap=100
+        )
         self.prompt = ChatPromptTemplate.from_template(
             """
             You are a helpful assistant answering questions based on the uploaded document and the conversation.
-
+
             Conversation History:
             {conversation_history}
-
+
             Context from Documents:
             {context}
-
+
             Question:
             {question}
-
+
             Provide a concise, accurate answer (preferably within three sentences), ensuring it directly addresses the question.
             """
         )
-
+
         # Setup MongoDB connection
-        self.client = MongoClient(mongo_connection_str)
+        self.client = MongoClient(
+            mongo_connection_str, appname="devrel.showcase.local_rag_pdf_app"
+        )
         self.collection = self.client[database_name][collection_name]

         # Verbose connection check
@@ -74,7 +81,7 @@ def __init__(self, config_file: str = "config.yaml"):
             collection=self.collection,
             embedding=self.embeddings,
             index_name="vector_index",
-            relevance_score_fn="cosine"
+            relevance_score_fn="cosine",
         )

         # Create vector search index on the collection
@@ -107,7 +114,13 @@ def upload_and_index_pdf(self, pdf_file_path: str):
         self.vector_store.add_documents(documents=chunks)
         logger.info("Document embeddings stored successfully in MongoDB Atlas.")

-    def query_with_context(self, query: str, conversation_history: list = None, k: int = 5, score_threshold: float = 0.2):
+    def query_with_context(
+        self,
+        query: str,
+        conversation_history: Optional[list] = None,
+        k: int = 5,
+        score_threshold: float = 0.2,
+    ):
         """
         Answer a query using the RAG pipeline with verbose debugging and conversation history.

@@ -132,7 +145,9 @@ def query_with_context(self, query: str, conversation_history: list = None, k: i
         # Generate and log query embeddings
         query_embedding = self.embeddings.embed_query(query)
         logger.info(f"User Query: {query}")
-        logger.debug(f"Query Embedding (sample values): {query_embedding[:10]}... [Total Length: {len(query_embedding)}]")
+        logger.debug(
+            f"Query Embedding (sample values): {query_embedding[:10]}... [Total Length: {len(query_embedding)}]"
+        )

         logger.info(f"Retrieving context for query: {query}")
         retrieved_docs = self.retriever.invoke(query)
@@ -147,17 +162,19 @@ def query_with_context(self, query: str, conversation_history: list = None, k: i

         # Format the input for the LLM, including conversation history
         formatted_input = {
-            "conversation_history": "\n".join(conversation_history) if conversation_history else "",
+            "conversation_history": (
+                "\n".join(conversation_history) if conversation_history else ""
+            ),
             "context": "\n\n".join(doc.page_content for doc in retrieved_docs),
             "question": query,
         }

         # Build the RAG chain
         chain = (
             RunnablePassthrough()  # Passes the input as-is
-            | self.prompt # Formats the input for the LLM
-            | self.model # Queries the LLM
-            | StrOutputParser() # Parses the LLM's output
+            | self.prompt  # Formats the input for the LLM
+            | self.model  # Queries the LLM
+            | StrOutputParser()  # Parses the LLM's output
         )

         logger.info("Generating response using the LLM.")
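For anyone who wants to exercise the refactored class end to end, below is a minimal, hypothetical driver script. It assumes the code in this diff is saved as chat_pdf.py, that a local Ollama server has the named models pulled, and that the Atlas connection string is valid; the model names, database/collection names, and file paths are placeholders, not values taken from this commit.

import yaml

from chat_pdf import ChatPDF  # assumed module name for the code in this diff

# Placeholder values -- swap in models actually pulled into Ollama and a
# real Atlas connection string before running.
config = {
    "llm_model": "llama3",
    "embedding_model": "nomic-embed-text",
    "mongo_connection_str": "mongodb+srv://<user>:<password>@<cluster>/",
    "database_name": "rag_db",
    "collection_name": "pdf_chunks",
}

# Write the config in the shape load_config() reads back with yaml.safe_load().
with open("config.yaml", "w") as file:
    yaml.safe_dump(config, file)

chat = ChatPDF(config_file="config.yaml")
chat.upload_and_index_pdf("example.pdf")  # any local PDF path

# The chain ends in StrOutputParser, so the answer is assumed to come back
# as a plain string.
answer = chat.query_with_context(
    "What is the document about?", conversation_history=[]
)
print(answer)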
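The chain assembled in the last hunk is ordinary LCEL composition: each pipe (|) feeds one runnable's output into the next. Here is a standalone sketch of the same pattern that runs with nothing but a local Ollama server, no MongoDB required; the model name is again a placeholder.

from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

prompt = ChatPromptTemplate.from_template(
    "Answer using only this context:\n{context}\n\nQuestion: {question}"
)
chain = (
    RunnablePassthrough()  # passes the input dict through unchanged
    | prompt  # fills the {context} and {question} placeholders
    | ChatOllama(model="llama3")  # placeholder model name
    | StrOutputParser()  # extracts the reply text as a str
)

print(chain.invoke({"context": "The sky is blue.", "question": "What color is the sky?"}))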