Update Opik notebook example and integration (#1295)
# Opik integration update

Updated the Opik integration:
1. Updated notebooks following the changes to expected inputs for metric
functions
2. Updated the OpikTracer following some changes in the Opik library

---------

Co-authored-by: Abby Morgan <[email protected]>
jverre and anmorgan24 authored Sep 17, 2024
1 parent 78b54c6 commit c40891b
Showing 3 changed files with 68 additions and 59 deletions.
2 changes: 1 addition & 1 deletion docs/howtos/applications/tracing.md
@@ -12,7 +12,7 @@ export LANGCHAIN_API_KEY=<your-api-key>
export LANGCHAIN_PROJECT=<your-project> # if not specified, defaults to "default"
```

Now we have to import the required tracer from LangChain. Here we are using `LangChainTracer`, but you can similarly use any tracer supported by LangChain, such as [WandbTracer](https://python.langchain.com/docs/integrations/providers/wandb_tracing) or [OpikTracer](https://comet.com/docs/opik/tracing/integrations/ragas?utm_source=ragas&utm_medium=github&utm_campaign=opik&utm_content=tracing_how_to)
Now we have to import the required tracer from LangChain. Here we are using `LangChainTracer`, but you can similarly use any tracer supported by LangChain, such as [WandbTracer](https://python.langchain.com/docs/integrations/providers/wandb_tracing) or [OpikTracer](https://comet.com/docs/opik/tracing/integrations/ragas?utm_source=ragas&utm_medium=docs&utm_campaign=opik&utm_content=tracing_how_to)

```{code-block} python
# langsmith
```
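The `{code-block}` above is truncated in this view. As a minimal sketch of the pattern this section describes, assuming a Ragas evaluation dataset named `dataset` is already prepared as in the rest of the guide (the project name below is a placeholder):

```python
# Pass any LangChain-supported tracer to Ragas via the callbacks argument of
# evaluate(); OpikTracer or WandbTracer slot into the same list.
from langchain.callbacks.tracers import LangChainTracer
from ragas import evaluate
from ragas.metrics import faithfulness

tracer = LangChainTracer(project_name="ragas-tracing-demo")  # placeholder name

result = evaluate(dataset, metrics=[faithfulness], callbacks=[tracer])
```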
84 changes: 48 additions & 36 deletions docs/howtos/integrations/opik.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Opik by Comet\n",
"# Comet Opik\n",
"\n",
"In this notebook, we will showcase how to use Opik with Ragas for monitoring and evaluation of RAG (Retrieval-Augmented Generation) pipelines.\n",
"\n",
@@ -13,16 +13,18 @@
"1. Using Ragas metrics to score traces\n",
"2. Using the Ragas `evaluate` function to score a dataset\n",
"\n",
"<center><img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-project-dashboard.png\" alt=\"Comet Opik project dashboard screenshot with list of traces and spans\" width=\"600\" style=\"border: 0.5px solid #ddd;\"/></center>\n",
"\n",
"## Setup\n",
"\n",
"[Comet](https://www.comet.com/site?utm_medium=github&utm_source=ragas&utm_campaign=opik) provides a hosted version of the Opik platform, [simply create an account](https://www.comet.com/signup?from=llm&utm_medium=github&utm_source=ragas&utm_campaign=opik) and grab you API Key.\n",
"[Comet](https://www.comet.com/site?utm_medium=docs&utm_source=ragas&utm_campaign=opik) provides a hosted version of the Opik platform, [simply create an account](https://www.comet.com/signup?from=llm&utm_medium=docs&utm_source=ragas&utm_campaign=opik) and grab you API Key.\n",
"\n",
"> You can also run the Opik platform locally, see the [installation guide](https://www.comet.com/docs/opik/self-host/self_hosting_opik?utm_medium=github&utm_source=ragas&utm_campaign=opik/) for more information."
"> You can also run the Opik platform locally, see the [installation guide](https://www.comet.com/docs/opik/self-host/self_hosting_opik?utm_medium=docs&utm_source=ragas&utm_campaign=opik/) for more information."
]
},
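The configuration cell that follows is collapsed in this view. A minimal setup sketch, assuming the Opik SDK's `opik.configure()` helper and an OpenAI key for the models used later (both names are assumptions, not copied from the hidden cell):

```python
import getpass
import os

import opik

# Prompts for (or verifies) your Comet API key and workspace
opik.configure()

# The Ragas metrics in this notebook call an OpenAI model under the hood
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
```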
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -44,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -63,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -97,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -126,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -138,43 +140,48 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Answer Relevancy score: 0.9616931041269692\n"
"Answer Relevancy score: 1.0\n"
]
}
],
"source": [
"import asyncio\n",
"from ragas.integrations.opik import OpikTracer\n",
"from ragas.dataset_schema import SingleTurnSample\n",
"\n",
"\n",
"# Define the scoring function\n",
"def compute_metric(opik_tracer, metric, row):\n",
"def compute_metric(metric, row):\n",
" row = SingleTurnSample(**row)\n",
"\n",
" opik_tracer = OpikTracer()\n",
"\n",
" async def get_score(opik_tracer, metric, row):\n",
" score = await metric.ascore(row, callbacks=[opik_tracer])\n",
" score = await metric.single_turn_ascore(row, callbacks=[OpikTracer()])\n",
" return score\n",
"\n",
" # Run the async function using the current event loop\n",
" loop = asyncio.get_event_loop()\n",
"\n",
" result = loop.run_until_complete(get_score(opik_tracer, metric, row))\n",
" return result\n",
"\n",
"\n",
"# Score a simple example\n",
"row = {\n",
" \"question\": \"What is the capital of France?\",\n",
" \"answer\": \"Paris\",\n",
" \"contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
" \"user_input\": \"What is the capital of France?\",\n",
" \"response\": \"Paris\",\n",
" \"retrieved_contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
"}\n",
"\n",
"opik_tracer = OpikTracer()\n",
"score = compute_metric(opik_tracer, answer_relevancy_metric, row)\n",
"score = compute_metric(answer_relevancy_metric, row)\n",
"print(\"Answer Relevancy score:\", score)"
]
},
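Because the old and new notebook lines are interleaved above, the updated scoring cell is easier to follow read straight through. Reconstructed from the added lines (a sketch, not a verbatim copy; `answer_relevancy_metric` comes from an earlier, collapsed cell):

```python
import asyncio

from ragas.dataset_schema import SingleTurnSample
from ragas.integrations.opik import OpikTracer


# Define the scoring function
def compute_metric(metric, row):
    # Metrics now expect a SingleTurnSample rather than a plain dict
    row = SingleTurnSample(**row)

    opik_tracer = OpikTracer()

    async def get_score(opik_tracer, metric, row):
        # single_turn_ascore replaces the older ascore entry point; note that
        # the added lines attach a fresh OpikTracer as the callback here
        score = await metric.single_turn_ascore(row, callbacks=[OpikTracer()])
        return score

    # Run the async function using the current event loop
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(get_score(opik_tracer, metric, row))
    return result


# Score a simple example, using the renamed input fields
row = {
    "user_input": "What is the capital of France?",
    "response": "Paris",
    "retrieved_contexts": ["Paris is the capital of France.", "Paris is in France."],
}

score = compute_metric(answer_relevancy_metric, row)
print("Answer Relevancy score:", score)
```

Running this inside a Jupyter kernel relies on the notebook's event loop; a plain script would more idiomatically use `asyncio.run`.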
@@ -186,14 +193,14 @@
"\n",
"#### Score traces\n",
"\n",
"You can score traces by using the `get_current_trace` function to get the current trace and then calling the `log_feedback_score` function.\n",
"You can score traces by using the `update_current_trace` function to get the current trace and passing the feedback scores to that function.\n",
"\n",
"The advantage of this approach is that the scoring span is added to the trace allowing for a more fine-grained analysis of the RAG pipeline. It will however run the Ragas metric calculation synchronously and so might not be suitable for production use-cases."
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -202,14 +209,14 @@
"'Paris'"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from opik import track\n",
"from opik.opik_context import get_current_trace\n",
"from opik.opik_context import update_current_trace\n",
"\n",
"\n",
"@track\n",
@@ -227,7 +234,7 @@
"@track(name=\"Compute Ragas metric score\", capture_input=False)\n",
"def compute_rag_score(answer_relevancy_metric, question, answer, contexts):\n",
" # Define the score function\n",
" row = {\"question\": question, \"answer\": answer, \"contexts\": contexts}\n",
" row = {\"user_input\": question, \"response\": answer, \"retrieved_contexts\": contexts}\n",
" score = compute_metric(answer_relevancy_metric, row)\n",
" return score\n",
"\n",
@@ -238,9 +245,10 @@
" contexts = retrieve_contexts(question)\n",
" answer = answer_question(question, contexts)\n",
"\n",
" trace = get_current_trace()\n",
" score = compute_rag_score(answer_relevancy_metric, question, answer, contexts)\n",
" trace.log_feedback_score(\"answer_relevancy\", round(score, 4), category_name=\"ragas\")\n",
" update_current_trace(\n",
" feedback_scores=[{\"name\": \"answer_relevancy\", \"value\": round(score, 4)}]\n",
" )\n",
"\n",
" return answer\n",
"\n",
@@ -261,25 +269,18 @@
},
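The same interleaving applies to the traced pipeline above. Reconstructed from the added lines (`retrieve_contexts`, `answer_question`, and `compute_metric` are defined in earlier cells, and the enclosing pipeline function is collapsed in this view, so its signature is assumed):

```python
from opik import track
from opik.opik_context import update_current_trace


@track(name="Compute Ragas metric score", capture_input=False)
def compute_rag_score(answer_relevancy_metric, question, answer, contexts):
    # Build the row with the renamed input fields
    row = {"user_input": question, "response": answer, "retrieved_contexts": contexts}
    score = compute_metric(answer_relevancy_metric, row)
    return score


@track  # assumed signature; the decorated pipeline function is collapsed above
def rag_pipeline(question):
    contexts = retrieve_contexts(question)
    answer = answer_question(question, contexts)

    score = compute_rag_score(answer_relevancy_metric, question, answer, contexts)
    # Attach the metric result to the currently active trace as a feedback score
    update_current_trace(
        feedback_scores=[{"name": "answer_relevancy", "value": round(score, 4)}]
    )

    return answer


rag_pipeline("What is the capital of France?")
```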
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "985d2e27ce8a48daad673666e6e6e953",
"model_id": "07abcf96a39b4fd183756d5dc3b617c9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/9 [00:00<?, ?it/s]"
"Evaluating: 0%| | 0/6 [00:00<?, ?it/s]"
]
},
"metadata": {},
@@ -289,7 +290,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'context_precision': 1.0000, 'faithfulness': 0.8250, 'answer_relevancy': 0.9755}\n"
"{'context_precision': 1.0000, 'faithfulness': 0.7375, 'answer_relevancy': 0.9889}\n"
]
}
],
@@ -301,10 +302,21 @@
"\n",
"fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
"\n",
"# Reformat the dataset to match the schema expected by the Ragas evaluate function\n",
"dataset = fiqa_eval[\"baseline\"].select(range(3))\n",
"\n",
"dataset = dataset.map(\n",
" lambda x: {\n",
" \"user_input\": x[\"question\"],\n",
" \"reference\": x[\"ground_truths\"][0],\n",
" \"retrieved_contexts\": x[\"contexts\"],\n",
" }\n",
")\n",
"\n",
"opik_tracer_eval = OpikTracer(tags=[\"ragas_eval\"], metadata={\"evaluation_run\": True})\n",
"\n",
"result = evaluate(\n",
" fiqa_eval[\"baseline\"].select(range(3)),\n",
" dataset,\n",
" metrics=[context_precision, faithfulness, answer_relevancy],\n",
" callbacks=[opik_tracer_eval],\n",
")\n",
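Reconstructed from the added lines, the updated evaluation cell reads roughly as follows (the imports are assumed from standard Ragas usage, as they are collapsed in this view):

```python
from datasets import load_dataset

from ragas import evaluate
from ragas.integrations.opik import OpikTracer
from ragas.metrics import answer_relevancy, context_precision, faithfulness

fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")

# Reformat the dataset to match the schema expected by the Ragas evaluate function
dataset = fiqa_eval["baseline"].select(range(3))
dataset = dataset.map(
    lambda x: {
        "user_input": x["question"],
        "reference": x["ground_truths"][0],
        "retrieved_contexts": x["contexts"],
    }
)

# Tag the run so the evaluation traces are easy to find in the Opik UI
opik_tracer_eval = OpikTracer(tags=["ragas_eval"], metadata={"evaluation_run": True})

result = evaluate(
    dataset,
    metrics=[context_precision, faithfulness, answer_relevancy],
    callbacks=[opik_tracer_eval],
)

print(result)
```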
41 changes: 19 additions & 22 deletions src/ragas/integrations/opik.py
@@ -1,9 +1,9 @@
import typing as t

try:
from opik.integrations.langchain import ( # type: ignore
from opik.integrations.langchain import (
OpikTracer as LangchainOpikTracer,
)
) # type: ignore

from ragas.evaluation import RAGAS_EVALUATION_CHAIN_NAME
except ImportError:
@@ -29,37 +29,34 @@ class OpikTracer(LangchainOpikTracer):

_evaluation_run_id: t.Optional[str] = None

def _persist_run(self, run: "Run"):
# The _persist_run function is called by LangChain if it is a root run,
# we update it so that we don't log the root run if we are running an evaluation.
if run.id != self._evaluation_run_id:
super()._persist_run(run)

def _on_chain_start(self, run: "Run"):
def _process_start_trace(self, run: "Run"):
if (run.parent_run_id is None) and (run.name == RAGAS_EVALUATION_CHAIN_NAME):
# Store the evaluation run id so we can flag the child traces and log them independently
self._evaluation_run_id = str(run.id)
self._evaluation_run_id = run.id
else:
# Each child trace of the "ragas evaluation" chain should be a new trace
if run.parent_run_id == self._evaluation_run_id:
run.parent_run_id = None

super()._on_chain_start(run)
super()._process_start_trace(run)

def _on_chain_end(self, run: "Run"):
if run.id == self._evaluation_run_id:
pass
else:
# We want to log the output row chain as feedback scores as these align with the Opik terminology of "feedback scores"
if run.name.startswith("row ") and (self._evaluation_run_id is not None):
span = self._span_map[run.id]
trace_id = span.trace_id
def _process_end_trace(self, run: "Run"):
if run.id != self._evaluation_run_id:
if run.name.startswith("row "):
trace_data = self._created_traces_data_map[run.id]
if run.outputs:
self._opik_client.log_traces_feedback_scores(
[
{"id": trace_id, "name": name, "value": round(value, 4)}
{
"id": trace_data.id,
"name": name,
"value": round(value, 4),
}
for name, value in run.outputs.items()
]
)

self._persist_run(run)
super()._process_end_trace(run)

def _persist_run(self, run: "Run"):
if run.id != self._evaluation_run_id:
super()._persist_run(run)
