Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
GOOGLE_API_KEY=

# Set to true to use Vertex AI, or false to use the Gemini API directly
GEMINI_USE_VERTEX=true
GOOGLE_API_KEY=<your-google-api-key>
GEMINI_USE_VERTEX=false

# You can leave other variables blank for local dev if not using Vertex AI
GCP_PROJECT_ID=
GCP_REGION=

# BigQuery
GCP_REGION=
BQ_DATASET_ID=
BQ_TABLE_ID=
BQ_LOCATION=

BQ_LOCATION=
INDEX_ENDPOINT_ID=
DEPLOYED_INDEX_ID=


EMBED_MODEL_NAME=nomic-ai/nomic-embed-text-v1.5
INDEX_ENDPOINT_ID_FULL=
VECTOR_API_EP=


# For Scraping data
PUBLIC_API_URL=https://api.knowledge-space.org
ELASTIC_BASE_URL=
ELASTIC_USERNAME=
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ The backend requires specific environment variables to connect to **Google Cloud
In one terminal, from the project root with the virtual environment active:

```bash
cd backend
uv run main.py
```

Expand Down
82 changes: 82 additions & 0 deletions backend/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,85 @@ def search(
except Exception as e:
logger.error(f"Matching Engine search failed: {e}")
return []


def hybrid_search(
    self,
    query: str,
    top_k: int = 20,
    context: Optional[Dict[str, Any]] = None,
) -> List[RetrievedItem]:
    """Retrieve items by combining vector and keyword search.

    Runs ``self.search`` and ``self.keyword_search`` for the same query and
    merges the two rankings with ``reciprocal_rank_fusion``.

    Args:
        query: The search text, forwarded to both retrievers.
        top_k: Result count requested from each retriever, and the maximum
            number of fused results returned.
        context: Optional extra data forwarded to ``self.search`` only.

    Returns:
        The first ``top_k`` fused items, or the vector results exactly as
        returned by ``self.search`` when keyword search yields nothing.
    """
    dense_hits = self.search(query, top_k, context)
    lexical_hits = self.keyword_search(query, top_k)

    if lexical_hits:
        merged = reciprocal_rank_fusion(dense_hits, lexical_hits)
        logger.info(
            "Hybrid search results | vector=%d keyword=%d fused=%d",
            len(dense_hits),
            len(lexical_hits),
            len(merged),
        )
        return merged[:top_k]

    # Nothing to fuse with: hand back the vector ranking untouched.
    logger.info("Hybrid search fallback: returning vector results only")
    return dense_hits





def keyword_search(
    self,
    query: str,
    top_k: int = 20,
) -> List[RetrievedItem]:
    """Keyword-based (BM25 / Elastic) retrieval hook for hybrid search.

    This is a placeholder: it ignores both arguments, logs that it was
    called, and reports no matches.

    Args:
        query: The search text (currently unused).
        top_k: Maximum number of matches wanted (currently unused).

    Returns:
        Always an empty list.
    """
    no_hits: List[RetrievedItem] = []
    logger.info("Keyword search called (placeholder)")
    return no_hits



def reciprocal_rank_fusion(
    vector_results: List[RetrievedItem],
    keyword_results: List[RetrievedItem],
    k: int = 60
) -> List[RetrievedItem]:
    """Combine vector and keyword results using Reciprocal Rank Fusion (RRF).

    Each ranking contributes ``1 / (k + rank)`` to an item's fused score,
    where ``rank`` is the item's 1-based position in that ranking. Items are
    matched across rankings by their ``id`` attribute.

    An id that occurs more than once inside a single ranking is credited only
    for its best (first) position there, so duplicate hits from one retriever
    cannot inflate a document's score. Later items keep their original
    positions; ranks are not compacted after a duplicate is skipped.

    Args:
        vector_results: Items ordered best-first by vector search.
        keyword_results: Items ordered best-first by keyword search.
        k: Smoothing constant added to every rank; larger values flatten the
            gap between top and lower positions.

    Returns:
        One item per distinct id, ordered by descending fused score. Ties
        keep first-seen order (vector results before keyword results), and
        the object returned for an id is the first one encountered.
    """
    fused_scores: Dict[str, float] = {}
    first_seen: Dict[str, RetrievedItem] = {}

    for ranking in (vector_results, keyword_results):
        credited = set()  # ids already scored within this ranking
        for rank, item in enumerate(ranking, start=1):
            item_id = item.id
            if item_id in credited:
                # Repeat of an id in the same list: keep its best rank only.
                continue
            credited.add(item_id)
            first_seen.setdefault(item_id, item)
            fused_scores[item_id] = (
                fused_scores.get(item_id, 0.0) + 1 / (k + rank)
            )

    # sorted() is stable, so equal scores stay in first-seen order.
    ordered_ids = sorted(
        fused_scores,
        key=fused_scores.__getitem__,
        reverse=True,
    )
    return [first_seen[item_id] for item_id in ordered_ids]