diff --git a/.env.template b/.env.template index 48fe88a..77bfb1f 100644 --- a/.env.template +++ b/.env.template @@ -1,26 +1,19 @@ -GOOGLE_API_KEY= - -# To use vertexai keep it true and false to use gemini -GEMINI_USE_VERTEX=true +GOOGLE_API_KEY= +GEMINI_USE_VERTEX=false +# You can leave other variables blank for local dev if not using Vertex AI GCP_PROJECT_ID= -GCP_REGION= - -# BigQuery +GCP_REGION= BQ_DATASET_ID= BQ_TABLE_ID= -BQ_LOCATION= - +BQ_LOCATION= INDEX_ENDPOINT_ID= DEPLOYED_INDEX_ID= - EMBED_MODEL_NAME=nomic-ai/nomic-embed-text-v1.5 INDEX_ENDPOINT_ID_FULL= VECTOR_API_EP= - -# For Scraping data PUBLIC_API_URL=https://api.knowledge-space.org ELASTIC_BASE_URL= ELASTIC_USERNAME= diff --git a/README.md b/README.md index 00f71e0..73c2915 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,7 @@ The backend requires specific environment variables to connect to **Google Cloud In one terminal, from the project root with the virtual environment active: ```bash +cd backend uv run main.py ``` diff --git a/backend/retrieval.py b/backend/retrieval.py index 866d6c9..f89a0ce 100644 --- a/backend/retrieval.py +++ b/backend/retrieval.py @@ -233,3 +233,85 @@ def search( except Exception as e: logger.error(f"Matching Engine search failed: {e}") return [] + + + def hybrid_search( + self, + query: str, + top_k: int = 20, + context: Optional[Dict[str, Any]] = None + ) -> List[RetrievedItem]: + """ + Hybrid retrieval: Vector Search + Keyword Search with RRF + """ + + vector_results = self.search(query, top_k, context) + keyword_results = self.keyword_search(query, top_k) + + if not keyword_results: + logger.info( + "Hybrid search fallback: returning vector results only" + ) + return vector_results + + fused_results = reciprocal_rank_fusion( + vector_results, + keyword_results + ) + + logger.info( + + "Hybrid search results | vector=%d keyword=%d fused=%d", + len(vector_results), + len(keyword_results), + len(fused_results), + ) + + return 
fused_results[:top_k]  # completes the `return` that hybrid_search begins at the end of the previous line


# NOTE(review): takes `self`, so this is presumably a method of the class that
# also owns `search` / `hybrid_search`; the class header is not visible here --
# confirm and re-indent accordingly when placing it.
def keyword_search(self, query: str, top_k: int = 20) -> List[RetrievedItem]:
    """Keyword-based search (BM25 / Elastic) -- placeholder.

    Nothing is queried yet: the call is logged and an empty list is returned.
    Because ``hybrid_search`` treats an empty keyword result as "fall back to
    vector results only", hybrid retrieval is effectively vector-only until
    this is implemented.

    Args:
        query: Search text (currently unused).
        top_k: Maximum number of hits wanted (currently unused).

    Returns:
        Always an empty list.
    """
    logger.info("Keyword search called (placeholder)")
    return []


def reciprocal_rank_fusion(
    vector_results: List[RetrievedItem],
    keyword_results: List[RetrievedItem],
    k: int = 60,
) -> List[RetrievedItem]:
    """Merge two ranked result lists with Reciprocal Rank Fusion (RRF).

    Every list contributes ``1 / (k + rank)`` (``rank`` is 1-based) for each
    distinct item id it contains; the contributions are summed per id and the
    items are returned best-first.

    Args:
        vector_results: Items ordered best-first; each must expose a hashable
            ``id`` attribute.
        keyword_results: Items ordered best-first; same requirement.
        k: Smoothing constant added to every rank. Must be non-negative.

    Returns:
        One item per distinct id, ordered by descending fused score. Ties keep
        first-seen order (``vector_results`` before ``keyword_results``). When
        an id occurs in both lists, the object from ``vector_results`` is the
        one returned.

    Raises:
        ValueError: If ``k`` is negative -- the denominator could reach zero
            or flip sign, which would crash or invert the ranking.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    first_seen: Dict[Any, RetrievedItem] = {}
    fused_score: Dict[Any, float] = {}

    for ranking in (vector_results, keyword_results):
        # A ranker votes at most once per id: only the best (first) position
        # counts, so duplicates inside one list cannot inflate the score.
        counted = set()
        for rank, item in enumerate(ranking, start=1):
            item_id = item.id
            if item_id in counted:
                continue
            counted.add(item_id)
            first_seen.setdefault(item_id, item)
            fused_score[item_id] = fused_score.get(item_id, 0.0) + 1.0 / (k + rank)

    # sorted() is stable even with reverse=True, so equal scores keep the
    # insertion (first-seen) order of the dict.
    ordered_ids = sorted(fused_score, key=fused_score.__getitem__, reverse=True)
    return [first_seen[item_id] for item_id in ordered_ids]