From 7cc254bc322ee5373ca526bbcef99dcc5627a645 Mon Sep 17 00:00:00 2001 From: Zohaib Shahid Date: Wed, 21 Jan 2026 02:19:37 +0500 Subject: [PATCH 1/3] Fix backend run instructions in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 00f71e0..73c2915 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,7 @@ The backend requires specific environment variables to connect to **Google Cloud In one terminal, from the project root with the virtual environment active: ```bash +cd backend uv run main.py ``` From 38d0c51377a4a1e9fdf4926b34a4c452d1b192f7 Mon Sep 17 00:00:00 2001 From: Zohaib Shahid Date: Wed, 21 Jan 2026 22:56:12 +0500 Subject: [PATCH 2/3] Issue #81: Implement hybrid search with RRF --- backend/retrieval.py | 81 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/backend/retrieval.py b/backend/retrieval.py index 866d6c9..6c95afc 100644 --- a/backend/retrieval.py +++ b/backend/retrieval.py @@ -233,3 +233,84 @@ def search( except Exception as e: logger.error(f"Matching Engine search failed: {e}") return [] + + + def hybrid_search( + self, + query: str, + top_k: int = 20, + context: Optional[Dict[str, Any]] = None + ) -> List[RetrievedItem]: + """ + Hybrid retrieval: Vector Search + Keyword Search with RRF + """ + + vector_results = self.search(query, top_k, context) + keyword_results = self.keyword_search(query, top_k) + + if not keyword_results: + logger.info( + "Hybrid search fallback: returning vector results only" + ) + return vector_results + + fused_results = reciprocal_rank_fusion( + vector_results, + keyword_results + ) + + logger.info( + "Hybrid search results | vector=%d keyword=%d fused=%d", + len(vector_results), + len(keyword_results), + len(fused_results), + ) + + return fused_results[:top_k] + + + + + + def keyword_search( + self, query: str, top_k: int = 20 + ) -> List[RetrievedItem]: + """ + Keyword-based search (BM25 / Elastic). 
+ Placeholder implementation for hybrid search. + """ + logger.info("Keyword search called (placeholder)") + return [] + + + +def reciprocal_rank_fusion( + vector_results: List[RetrievedItem], + keyword_results: List[RetrievedItem], + k: int = 60 +) -> List[RetrievedItem]: + """ + Combine vector and keyword results using Reciprocal Rank Fusion (RRF) + """ + + scores: Dict[str, Dict[str, Any]] = {} + + def add_scores(results: List[RetrievedItem]): + for rank, item in enumerate(results): + if item.id not in scores: + scores[item.id] = { + "item": item, + "score": 0.0 + } + scores[item.id]["score"] += 1 / (k + rank + 1) + + add_scores(vector_results) + add_scores(keyword_results) + + fused = sorted( + scores.values(), + key=lambda x: x["score"], + reverse=True + ) + + return [x["item"] for x in fused] From c78096808a36bc16a043811656c38c28864ac763 Mon Sep 17 00:00:00 2001 From: Zohaib Shahid Date: Wed, 21 Jan 2026 23:05:00 +0500 Subject: [PATCH 3/3] Issue #81: Add hybrid search with RRF in retrieval --- .env.template | 17 +++++------------ backend/retrieval.py | 1 + 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/.env.template b/.env.template index 48fe88a..77bfb1f 100644 --- a/.env.template +++ b/.env.template @@ -1,26 +1,19 @@ -GOOGLE_API_KEY= - -# To use vertexai keep it true and false to use gemini -GEMINI_USE_VERTEX=true +GOOGLE_API_KEY= +GEMINI_USE_VERTEX=false +# You can leave other variables blank for local dev if not using Vertex AI GCP_PROJECT_ID= -GCP_REGION= - -# BigQuery +GCP_REGION= BQ_DATASET_ID= BQ_TABLE_ID= -BQ_LOCATION= - +BQ_LOCATION= INDEX_ENDPOINT_ID= DEPLOYED_INDEX_ID= - EMBED_MODEL_NAME=nomic-ai/nomic-embed-text-v1.5 INDEX_ENDPOINT_ID_FULL= VECTOR_API_EP= - -# For Scraping data PUBLIC_API_URL=https://api.knowledge-space.org ELASTIC_BASE_URL= ELASTIC_USERNAME= diff --git a/backend/retrieval.py b/backend/retrieval.py index 6c95afc..f89a0ce 100644 --- a/backend/retrieval.py +++ 
b/backend/retrieval.py @@ -260,6 +260,7 @@ def hybrid_search( ) logger.info( + "Hybrid search results | vector=%d keyword=%d fused=%d", len(vector_results), len(keyword_results),