diff --git a/README.md b/README.md
index 31e9fc282..f0bf40940 100644
--- a/README.md
+++ b/README.md
@@ -91,9 +91,11 @@ Ragas measures your pipeline's performance against different dimensions
2. **Context Relevancy**: measures how relevant retrieved contexts are to the question. Ideally, the context should only contain information necessary to answer the question. The presence of redundant information in the context is penalized.
-3. **Answer Relevancy**: refers to the degree to which a response directly addresses and is appropriate for a given question or context. This does not take the factuality of the answer into consideration but rather penalizes the present of redundant information or incomplete answers given a question.
+3. **Context Recall**: measures the recall of the retrieved context using the annotated answer as ground truth. The annotated answer is taken as a proxy for the ground-truth context.
-4. **Aspect Critiques**: Designed to judge the submission against defined aspects like harmlessness, correctness, etc. You can also define your own aspect and validate the submission against your desired aspect. The output of aspect critiques is always binary.
+4. **Answer Relevancy**: refers to the degree to which a response directly addresses and is appropriate for a given question or context. This does not take the factuality of the answer into consideration but rather penalizes the presence of redundant information or incomplete answers given a question.
+
+5. **Aspect Critiques**: designed to judge the submission against defined aspects such as harmlessness and correctness. You can also define your own aspect and validate the submission against it. The output of aspect critiques is always binary.
The final `ragas_score` is the harmonic mean of individual metric scores.
diff --git a/docs/integrations/langchain.ipynb b/docs/integrations/langchain.ipynb
index 40232d010..2e1b2f0d5 100644
--- a/docs/integrations/langchain.ipynb
+++ b/docs/integrations/langchain.ipynb
@@ -25,6 +25,17 @@
"nest_asyncio.apply()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8333f65e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
{
"cell_type": "markdown",
"id": "842e32dc",
@@ -35,7 +46,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"id": "4aa9a986",
"metadata": {},
"outputs": [],
@@ -51,23 +62,23 @@
"\n",
"llm = ChatOpenAI()\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
- " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True\n",
+ " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True,\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"id": "b0ebdf8d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'New York City was named in honor of the Duke of York, who would become King James II of England. King Charles II appointed the Duke as proprietor of the former territory of New Netherland, including the city of New Amsterdam, when England seized it from Dutch control.'"
+ "'New York City got its name in 1664 when it was renamed after the Duke of York, who later became King James II of England. The city was originally called New Amsterdam by Dutch colonists and was renamed New York when it came under British control.'"
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -90,7 +101,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"id": "e67ce0e0",
"metadata": {},
"outputs": [],
@@ -103,7 +114,16 @@
" \"What is the significance of the Statue of Liberty in New York City?\",\n",
"]\n",
"\n",
- "queries = [{\"query\": q} for q in eval_questions]"
+ "eval_answers = [\n",
+ " \"8,804,000\", # incorrect answer\n",
+ " \"Queens\", # incorrect answer\n",
+ " \"New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.\",\n",
+ " \"New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.\",\n",
+ " 'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.',\n",
+ "]\n",
+ "\n",
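+    "# pair each question with its annotated answer; `ground_truths` is required by the context_recall metric\n",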
+ "examples = [{\"query\": q, \"ground_truths\": [eval_answers[i]]} \n",
+ " for i, q in enumerate(eval_questions)]"
]
},
{
@@ -126,18 +146,63 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 10,
+ "id": "8f89d719",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.'"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result = qa_chain({\"query\": eval_questions[4]})\n",
+ "result[\"result\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "81fa9c47",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The borough of Brooklyn (Kings County) has the highest population in New York City.'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result = qa_chain(examples[1])\n",
+ "result[\"result\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"id": "1d9266d4",
"metadata": {},
"outputs": [],
"source": [
"from ragas.langchain.evalchain import RagasEvaluatorChain\n",
- "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy\n",
+ "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall\n",
"\n",
"# create evaluation chains\n",
"faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)\n",
"answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)\n",
- "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)"
+ "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)\n",
+ "context_recall_chain = RagasEvaluatorChain(metric=context_recall)"
]
},
{
@@ -152,17 +217,17 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 17,
"id": "5ede32cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "1.0"
+ "0.5"
]
},
- "execution_count": 6,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -172,6 +237,28 @@
"eval_result[\"faithfulness_score\"]"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "94b5544e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eval_result = context_recall_chain(result)\n",
+ "eval_result[\"context_recall_score\"]"
+ ]
+ },
{
"cell_type": "markdown",
"id": "f11295b5",
@@ -184,7 +271,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 24,
"id": "1ce7bff1",
"metadata": {},
"outputs": [
@@ -199,7 +286,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|█████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.77s/it]\n"
+ "100%|█████████████████████████████████████████████████████████████| 1/1 [00:57<00:00, 57.41s/it]\n"
]
},
{
@@ -207,23 +294,65 @@
"text/plain": [
"[{'faithfulness_score': 1.0},\n",
" {'faithfulness_score': 0.5},\n",
- " {'faithfulness_score': 0.75},\n",
+ " {'faithfulness_score': 1.0},\n",
" {'faithfulness_score': 1.0},\n",
" {'faithfulness_score': 1.0}]"
]
},
- "execution_count": 7,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# run the queries as a batch for efficiency\n",
- "predictions = qa_chain.batch(queries)\n",
+ "predictions = qa_chain.batch(examples)\n",
"\n",
"# evaluate\n",
"print(\"evaluating...\")\n",
- "r = faithfulness_chain.evaluate(queries, predictions)\n",
+ "r = faithfulness_chain.evaluate(examples, predictions)\n",
+ "r"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "55299f14",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████| 1/1 [00:54<00:00, 54.21s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[{'context_recall_score': 0.9333333333333333},\n",
+ " {'context_recall_score': 0.0},\n",
+ " {'context_recall_score': 1.0},\n",
+ " {'context_recall_score': 1.0},\n",
+ " {'context_recall_score': 1.0}]"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# evaluate context recall\n",
+ "print(\"evaluating...\")\n",
+ "r = context_recall_chain.evaluate(examples, predictions)\n",
"r"
]
},
@@ -244,7 +373,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 48,
"id": "e75144c5",
"metadata": {},
"outputs": [
@@ -252,7 +381,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "using existing dataset: NYC test\n"
+ "Created a new dataset: NYC test\n"
]
}
],
@@ -274,9 +403,10 @@
" dataset = client.create_dataset(\n",
" dataset_name=dataset_name, description=\"NYC test dataset\"\n",
" )\n",
- " for q in eval_questions:\n",
+ " for e in examples:\n",
" client.create_example(\n",
- " inputs={\"query\": q},\n",
+ " inputs={\"query\": e[\"query\"]},\n",
+ " outputs={\"ground_truths\": e[\"ground_truths\"]},\n",
" dataset_id=dataset.id,\n",
" )\n",
"\n",
@@ -297,7 +427,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 27,
"id": "3a6decc6",
"metadata": {},
"outputs": [],
@@ -322,7 +452,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 49,
"id": "25f7992f",
"metadata": {},
"outputs": [
@@ -330,8 +460,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "View the evaluation results for project '2023-08-22-19-28-17-RetrievalQA' at:\n",
- "https://smith.langchain.com/projects/p/2133d672-b69a-4091-bc96-a4e39d150db5?eval=true\n"
+ "View the evaluation results for project '2023-08-24-03-36-45-RetrievalQA' at:\n",
+ "https://smith.langchain.com/projects/p/9fb78371-150e-49cc-a927-b1247fdb9e8d?eval=true\n"
]
}
],
@@ -339,10 +469,10 @@
"from langchain.smith import RunEvalConfig, run_on_dataset\n",
"\n",
"evaluation_config = RunEvalConfig(\n",
- " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain],\n",
+ " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain, context_recall_chain],\n",
" prediction_key=\"result\",\n",
")\n",
- "\n",
+ " \n",
"result = run_on_dataset(\n",
" client,\n",
" dataset_name,\n",
diff --git a/docs/metrics.md b/docs/metrics.md
index 8f3616a04..1c16c5156 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -30,6 +30,22 @@ dataset: Dataset
results = context_rel.score(dataset)
```
+### `ContextRecall`
+This measures the recall of the retrieved context using the annotated answer as ground truth. The annotated answer is taken as a proxy for the ground-truth context. It is computed as the fraction of sentences in the ground-truth answer that can be attributed to the retrieved context. Values range (0,1), higher the better.
+
+```python
+from ragas.metrics.context_recall import ContextRecall
+context_recall = ContextRecall()
+# Dataset({
+# features: ['contexts','ground_truths'],
+# num_rows: 25
+# })
+dataset: Dataset
+
+results = context_recall.score(dataset)
+```
+
+
### `AnswerRelevancy`
This measures how relevant is the generated answer to the prompt. If the generated answer is incomplete or contains redundant information the score will be low. This is quantified by working out the chance of an LLM generating the given question using the generated answer. Values range (0,1), higher the better.
diff --git a/docs/quickstart.ipynb b/docs/quickstart.ipynb
index 9e9411953..89804059f 100644
--- a/docs/quickstart.ipynb
+++ b/docs/quickstart.ipynb
@@ -1,419 +1,473 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "2e63f667",
- "metadata": {},
- "source": [
-    "# Quickstart\n",
- "\n",
- "\n",
- "welcome to the ragas quickstart. We're going to get you up and running with ragas as qickly as you can so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.\n",
- "\n",
- "to kick things of lets start with the data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "57585b55",
- "metadata": {},
- "outputs": [],
- "source": [
- "# if using colab uncomment this\n",
- "#!pip install ragas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c77789bb",
- "metadata": {},
- "source": [
- "Ragas also uses OpenAI for running some metrics so make sure you have your openai key ready and available in your environment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "0b7179f7",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "06c9fc7d",
- "metadata": {},
- "source": [
- "## The Data\n",
- "\n",
- "Ragas performs a `ground_truth` free evaluation of your RAG pipelines. This is because for most people building a gold labeled dataset which represents in the distribution they get in production is a very expensive process.\n",
- "\n",
- "Hence to work with ragas all you need are the following data\n",
- "- question: `list[str]` - These are the questions you RAG pipeline will be evaluated on. \n",
- "- answer: `list[str]` - The answer generated from the RAG pipeline and give to the user.\n",
- "- contexts: `list[list[str]]` - The contexts which where passed into the LLM to answer the question.\n",
- "\n",
- "Ideally your list of questions should reflect the questions your users give, including those that you have been problamatic in the past.\n",
- "\n",
- "Here we're using an example dataset from on of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) we created. If you want to want to know more about the baseline, feel free to check the `experiements/baseline` section"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "b658e02f",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "986d2c6f72354b10b32d0458fe00a749",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/1 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "DatasetDict({\n",
- " baseline: Dataset({\n",
- " features: ['question', 'ground_truths', 'answer', 'contexts'],\n",
- " num_rows: 30\n",
- " })\n",
- "})"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# data\n",
- "from datasets import load_dataset\n",
- "\n",
- "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
- "fiqa_eval"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "84aa640f",
- "metadata": {},
- "source": [
- "## Metrics\n",
- "\n",
- "Ragas measures your pipeline's performance against two dimensions\n",
- "\n",
- "1. Faithfulness: measures the factual consistency of the generated answer against the given context.\n",
- "2. Relevancy: measures how relevant retrieved contexts and the generated answer are to the question.\n",
- "\n",
- "Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors.\n",
- "\n",
- "now lets import these metrics and understand more about what they denote"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "f17bcf9d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ragas.metrics import context_relevancy, answer_relevancy, faithfulness\n",
- "from ragas.metrics.critique import harmfulness"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ef8c5e60",
- "metadata": {},
- "source": [
- "here you can see that we are using 4 metrics, but what do the represent?\n",
- "\n",
- "1. context_relevancy - a measure of how relevent the retrieved context is to the question. Conveys quality of the retrieval pipeline.\n",
- "2. answer_relevancy - a measure of how relevent the answer is to the question\n",
- "3. faithfulness - the factual consistancy of the answer to the context base on the question.\n",
- "4. harmfulness (AspectCritique) - in general, `AspectCritique` is a metric that can be used to quantify various aspects of the answer. Aspects like harmfulness, maliciousness, coherence, correctness, concisenes are available by default but you can easily define your own. Check the [docs](./metrics.md) for more info.\n",
- "\n",
- "**Note:** *by default these metrics are using OpenAI's API to compute the score. If you using this metric make sure you set the environment key `OPENAI_API_KEY` with your API key. You can also try other LLMs for evaluation, check the [llm guide](./guides/llms.ipynb) to learn more*\n",
- "\n",
- "If you're interested in learning more, feel free to check the [docs](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8d6ecd5a",
- "metadata": {},
- "source": [
- "## Evaluation\n",
- "\n",
- "Running the evalutation is as simple as calling evaluate on the `Dataset` with the metrics of your choice."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "22eb6f97",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "evaluating with [context_ relevancy]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████████████████████████████████████| 1/1 [00:06<00:00, 6.05s/it]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "evaluating with [faithfulness]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████████████████████████████████████| 1/1 [00:22<00:00, 22.11s/it]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "evaluating with [answer_relevancy]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.20s/it]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "evaluating with [harmfulness]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.75s/it]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "{'ragas_score': 0.1787, 'context_ relevancy': 0.0689, 'faithfulness': 0.8333, 'answer_relevancy': 0.9347, 'harmfulness': 0.0000}"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ragas import evaluate\n",
- "\n",
- "result = evaluate(\n",
- " fiqa_eval[\"baseline\"],\n",
- " metrics=[context_relevancy, faithfulness, answer_relevancy, harmfulness],\n",
- ")\n",
- "\n",
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a2dc0ec2",
- "metadata": {},
- "source": [
- "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
- "\n",
- "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "8686bf53",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- " question \\\n",
- "0 How to deposit a cheque issued to an associate... \n",
- "1 Can I send a money order from USPS as a business? \n",
- "2 1 EIN doing business under multiple business n... \n",
- "\n",
- " ground_truths \\\n",
- "0 [Have the check reissued to the proper payee.J... \n",
- "1 [Sure you can. You can fill in whatever you w... \n",
- "2 [You're confusing a lot of things here. Compan... \n",
- "\n",
- " answer \\\n",
- "0 \\nThe best way to deposit a cheque issued to a... \n",
- "1 \\nYes, you can send a money order from USPS as... \n",
- "2 \\nYes, it is possible to have one EIN doing bu... \n",
- "\n",
- " contexts context_ relevancy \\\n",
- "0 [Just have the associate sign the back and the... 0.132468 \n",
- "1 [Sure you can. You can fill in whatever you w... 0.074175 \n",
- "2 [You're confusing a lot of things here. Compan... 0.000000 \n",
- "\n",
- " faithfulness answer_relevancy harmfulness \n",
- "0 1.0 0.978180 0 \n",
- "1 1.0 0.909481 0 \n",
- "2 0.5 0.916480 0 "
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df = result.to_pandas()\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f668fce1",
- "metadata": {},
- "source": [
- "And thats it!\n",
- "\n",
- "You can check out the [ragas in action] notebook to get a feel of what is like to use it while trying to improve your pipelines.\n",
- "\n",
- "if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2e63f667",
+ "metadata": {},
+ "source": [
+    "# Quickstart\n",
+ "\n",
+ "\n",
+    "Welcome to the ragas quickstart. We're going to get you up and running with ragas as quickly as possible so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.\n",
+    "\n",
+    "To kick things off, let's start with the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "57585b55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# if using colab uncomment this\n",
+ "#!pip install ragas"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c77789bb",
+ "metadata": {},
+ "source": [
+    "Ragas also uses OpenAI for running some metrics, so make sure you have your OpenAI key ready and available in your environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0b7179f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06c9fc7d",
+ "metadata": {},
+ "source": [
+ "## The Data\n",
+ "\n",
+    "Ragas performs a `ground_truth` free evaluation of your RAG pipelines. This is because, for most people, building a gold-labeled dataset that represents the distribution they see in production is a very expensive process.\n",
+    "\n",
+    "Hence, to work with ragas, all you need is the following data:\n",
+    "- question: `list[str]` - These are the questions your RAG pipeline will be evaluated on.\n",
+    "- answer: `list[str]` - The answer generated from the RAG pipeline and given to the user.\n",
+    "- contexts: `list[list[str]]` - The contexts which were passed into the LLM to answer the question.\n",
+    "- ground_truths: `list[list[str]]` - The ground-truth answers to the questions (only required for the `context_recall` metric).\n",
+    "\n",
+    "Ideally your list of questions should reflect the questions your users ask, including those that have been problematic in the past.\n",
+    "\n",
+    "Here we're using an example dataset from one of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/). If you want to know more about the baseline, feel free to check the `experiements/baseline` section"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b658e02f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n"
+ ]
},
- "nbformat": 4,
- "nbformat_minor": 5
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6c415f76ed4f4c969f87986ee05f2fb1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " baseline: Dataset({\n",
+ " features: ['question', 'ground_truths', 'answer', 'contexts'],\n",
+ " num_rows: 30\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# data\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
+ "fiqa_eval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "84aa640f",
+ "metadata": {},
+ "source": [
+ "## Metrics\n",
+ "\n",
+    "Ragas provides you with a few metrics to evaluate the different aspects of your RAG systems, namely:\n",
+    "\n",
+    "1. metrics to evaluate retrieval: offers `context_relevancy` and `context_recall`, which measure the performance of your retrieval system.\n",
+    "2. metrics to evaluate generation: offers `faithfulness`, which measures hallucinations, and `answer_relevancy`, which measures how to-the-point the answers are to the question.\n",
+    "\n",
+    "The harmonic mean of these 4 aspects gives you the **ragas score**, which is a single measure of the performance of your QA system across all the important aspects.\n",
+    "\n",
+    "Now let's import these metrics and understand more about what they denote."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f17bcf9d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from ragas.metrics import context_relevancy, answer_relevancy, faithfulness, context_recall\n",
+ "from ragas.metrics.critique import harmfulness"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef8c5e60",
+ "metadata": {},
+ "source": [
+    "Here you can see that we are using 5 metrics, but what do they represent?\n",
+    "\n",
+    "1. context_relevancy - a measure of how relevant the retrieved context is to the question. Conveys the quality of the retrieval pipeline.\n",
+    "2. answer_relevancy - a measure of how relevant the answer is to the question.\n",
+    "3. faithfulness - the factual consistency of the answer to the context, based on the question.\n",
+    "4. context_recall - measures the ability of the retriever to retrieve all the information needed to answer the question.\n",
+    "5. harmfulness (AspectCritique) - in general, `AspectCritique` is a metric that can be used to quantify various aspects of the answer. Aspects like harmfulness, maliciousness, coherence, correctness and conciseness are available by default, but you can easily define your own. Check the [docs](./metrics.md) for more info.\n",
+    "\n",
+    "**Note:** *by default these metrics use OpenAI's API to compute the score. If you are using these metrics, make sure you set the environment variable `OPENAI_API_KEY` with your API key. You can also try other LLMs for evaluation; check the [llm guide](./guides/llms.ipynb) to learn more*\n",
+ "\n",
+ "If you're interested in learning more, feel free to check the [docs](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d6ecd5a",
+ "metadata": {},
+ "source": [
+ "## Evaluation\n",
+ "\n",
+    "Running the evaluation is as simple as calling `evaluate` on the `Dataset` with the metrics of your choice."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "22eb6f97",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating with [context_ relevancy]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████| 2/2 [05:28<00:00, 164.33s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating with [faithfulness]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████| 2/2 [09:24<00:00, 282.03s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating with [answer_relevancy]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████| 2/2 [01:22<00:00, 41.37s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating with [context_recall]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████| 2/2 [13:02<00:00, 391.15s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "evaluating with [harmfulness]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████| 2/2 [02:10<00:00, 65.37s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'ragas_score': 0.4400, 'context_ relevancy': 0.2339, 'faithfulness': 0.7689, 'answer_relevancy': 0.9260, 'context_recall': 0.4107, 'harmfulness': 0.0000}"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from ragas import evaluate\n",
+ "\n",
+ "result = evaluate(\n",
+ " fiqa_eval[\"baseline\"],\n",
+ " metrics=[context_relevancy, faithfulness, answer_relevancy, context_recall, harmfulness],\n",
+ ")\n",
+ "\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2dc0ec2",
+ "metadata": {},
+ "source": [
+    "And there you have it: all the scores you need. `ragas_score` gives you a single metric that you can use, while the others measure the different parts of your pipeline.\n",
+    "\n",
+    "Now, if you want to dig into the results and figure out examples where your pipeline performed poorly or really well, you can easily convert the result into a pandas DataFrame and use your standard analytics tools too!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "8686bf53",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " question \\\n",
+ "0 How to deposit a cheque issued to an associate... \n",
+ "1 Can I send a money order from USPS as a business? \n",
+ "2 1 EIN doing business under multiple business n... \n",
+ "3 Applying for and receiving business credit \n",
+ "4 401k Transfer After Business Closure \n",
+ "\n",
+ " ground_truths \\\n",
+ "0 [Have the check reissued to the proper payee.J... \n",
+ "1 [Sure you can. You can fill in whatever you w... \n",
+ "2 [You're confusing a lot of things here. Compan... \n",
+ "3 [\"I'm afraid the great myth of limited liabili... \n",
+ "4 [You should probably consult an attorney. Howe... \n",
+ "\n",
+ " answer \\\n",
+ "0 \\nThe best way to deposit a cheque issued to a... \n",
+ "1 \\nYes, you can send a money order from USPS as... \n",
+ "2 \\nYes, it is possible to have one EIN doing bu... \n",
+ "3 \\nApplying for and receiving business credit c... \n",
+ "4 \\nIf your employer has closed and you need to ... \n",
+ "\n",
+ " contexts context_ relevancy \\\n",
+ "0 [Just have the associate sign the back and the... 0.088301 \n",
+ "1 [Sure you can. You can fill in whatever you w... 0.191611 \n",
+ "2 [You're confusing a lot of things here. Compan... 0.069420 \n",
+ "3 [Set up a meeting with the bank that handles y... 0.408924 \n",
+ "4 [The time horizon for your 401K/IRA is essenti... 0.064802 \n",
+ "\n",
+ " faithfulness answer_relevancy context_recall harmfulness \n",
+ "0 0.666667 0.976247 0.111111 0 \n",
+ "1 1.000000 0.883586 0.800000 0 \n",
+ "2 1.000000 0.928548 1.000000 0 \n",
+ "3 1.000000 0.906223 0.187500 0 \n",
+ "4 0.666667 0.889312 0.000000 0 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = result.to_pandas()\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f668fce1",
+ "metadata": {},
+ "source": [
+    "And that's it!\n",
+    "\n",
+    "You can check out the [ragas in action] notebook to get a feel for what it's like to use it while trying to improve your pipelines.\n",
+    "\n",
+    "If you have any suggestions/feedback/things you're not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 40f677af9..6d0c80896 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -76,13 +76,15 @@ def evaluate(
metrics = [answer_relevancy, context_relevancy, faithfulness]
- # select columns from the dataset
- dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()})
-
# validation
validate_evaluation_modes(dataset, metrics)
validate_column_dtypes(dataset)
+ # select columns from the dataset
+ dataset = dataset.from_dict(
+ {column_map[name]: dataset[column_map[name]] for name in dataset.column_names}
+ )
+
# run the evaluation on dataset with different metrics
# initialize all the models in the metrics
[m.init_model() for m in metrics]
diff --git a/src/ragas/langchain/__init__.py b/src/ragas/langchain/__init__.py
index e69de29bb..07570a8fd 100644
--- a/src/ragas/langchain/__init__.py
+++ b/src/ragas/langchain/__init__.py
@@ -0,0 +1 @@
+from ragas.langchain.evalchain import RagasEvaluatorChain
diff --git a/src/ragas/langchain/evalchain.py b/src/ragas/langchain/evalchain.py
index fd5af1210..43d4ad3c9 100644
--- a/src/ragas/langchain/evalchain.py
+++ b/src/ragas/langchain/evalchain.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import typing as t
+from collections import defaultdict
from datasets import Dataset
from langchain.callbacks.manager import CallbackManagerForChainRun
@@ -32,6 +33,8 @@ def input_keys(self) -> list[str]:
keys = ["query", "result"]
if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]:
keys += ["source_documents"]
+ if self.metric.evaluation_mode in [EvaluationMode.gc]:
+ keys += ["ground_truths"]
return keys
@property
@@ -58,6 +61,9 @@ def _call(
contexts.append(document["page_content"])
else:
contexts.append(document.page_content)
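+        # ground truths are optional; only metrics that need them (e.g. context_recall) consume this field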
+ ground_truths = []
+ if "ground_truths" in inputs:
+ ground_truths = inputs["ground_truths"]
question = inputs["query"]
answer = inputs["result"]
@@ -66,6 +72,7 @@ def _call(
"question": question,
"answer": answer,
"contexts": contexts,
+ "ground_truths": ground_truths,
},
callbacks=callbacks,
)
@@ -96,6 +103,11 @@ def _validate(
f'"{context_key}" is required in each prediction for the '
f"metric[{self.metric.name}] you have chosen."
)
+ if "ground_truths" in required_columns and "ground_truths" not in input:
+ raise ValueError(
+ f'"ground_truths" is required in each prediction for the '
+ f"metric[{self.metric.name}] you have chosen."
+ )
def evaluate(
self,
@@ -104,11 +116,12 @@ def evaluate(
question_key: str = "query",
prediction_key: str = "result",
context_key: str = "source_documents",
+ ground_truths_key: str = "ground_truths",
*,
callbacks: Callbacks = None,
) -> list[dict]:
"""Evaluate question answering examples and predictions."""
- question, answer, contexts = [], [], []
+ dataset_dict = defaultdict(list)
# validation
if len(examples) != len(predictions):
@@ -122,13 +135,32 @@ def evaluate(
{**example, **predictions[i]}, question_key, prediction_key, context_key
)
# transform into Dataset that is supported by ragas
- question.append(example[question_key])
- answer.append(predictions[i][prediction_key])
- if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]:
- contexts.append([d.page_content for d in predictions[i][context_key]])
- dataset = Dataset.from_dict(
- {"question": question, "answer": answer, "contexts": contexts}
- )
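+            # add only the columns required by this metric's evaluation mode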
+ if self.metric.evaluation_mode in [
+ EvaluationMode.qac,
+ EvaluationMode.qc,
+ EvaluationMode.qa,
+ ]:
+ dataset_dict["question"].append(example[question_key])
+
+ if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qa]:
+ dataset_dict["answer"].append(predictions[i][prediction_key])
+
+ if self.metric.evaluation_mode in [
+ EvaluationMode.qac,
+ EvaluationMode.qc,
+ EvaluationMode.gc,
+ ]:
+ dataset_dict["contexts"].append(
+ [d.page_content for d in predictions[i][context_key]]
+ )
+
+ if self.metric.evaluation_mode == EvaluationMode.gc:
+ if isinstance(example["ground_truths"], list):
+ dataset_dict["ground_truths"].append(example["ground_truths"])
+ else:
+ dataset_dict["ground_truths"].append([example["ground_truths"]])
+
+ dataset = Dataset.from_dict(dataset_dict)
# evaluate
dataset_with_scores = self.metric.score(dataset, callbacks=callbacks)
@@ -145,9 +177,16 @@ def evaluate_run(
Evaluate a langsmith run
"""
if run.outputs is None:
- raise ValueError("Run outputs cannot be None")
- run.outputs["query"] = run.inputs["query"]
- eval_output = self(run.outputs, include_run_info=True)
+            raise ValueError("The chain should return result and source_documents.")
+ if example is None:
+ raise ValueError("Examples have to be provided.")
+ chain_eval = run.outputs
+ chain_eval["query"] = run.inputs["query"]
+ if self.metric.evaluation_mode == EvaluationMode.gc:
+ if example.outputs is None or "ground_truths" not in example.outputs:
+ raise ValueError("expected `ground_truths` in example outputs.")
+ chain_eval["ground_truths"] = example.outputs["ground_truths"]
+ eval_output = self(chain_eval, include_run_info=True)
score_name = f"{self.metric.name}_score"
evaluation_result = EvaluationResult(
diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 826e8fb25..b385e9e26 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,5 +1,6 @@
from ragas.metrics.answer_relevance import AnswerRelevancy, answer_relevancy
from ragas.metrics.context_relevance import ContextRelevancy, context_relevancy
+from ragas.metrics.context_recall import ContextRecall, context_recall
from ragas.metrics.critique import AspectCritique
from ragas.metrics.faithfulnes import Faithfulness, faithfulness
@@ -11,4 +12,6 @@
"ContextRelevancy",
"context_relevancy",
"AspectCritique",
+ "ContextRecall",
+ "context_recall"
]
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 605b38547..19868c3b4 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -38,7 +38,7 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
return batches
-EvaluationMode = Enum("EvaluationMode", "qac qa qc ga")
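+# q: question, a: answer, c: contexts, g: ground_truths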
+EvaluationMode = Enum("EvaluationMode", "qac qa qc gc")
@dataclass
diff --git a/src/ragas/metrics/context_recall.py b/src/ragas/metrics/context_recall.py
new file mode 100644
index 000000000..6e5977ca9
--- /dev/null
+++ b/src/ragas/metrics/context_recall.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import typing as t
+from dataclasses import dataclass
+
+from datasets import Dataset
+from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
+
+from ragas.metrics.base import EvaluationMode, MetricWithLLM
+from ragas.metrics.llms import generate
+
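+# few-shot prompt: the LLM is asked to label each sentence of the annotated
+# answer as [Attributed] or [Not Attributed] to the retrieved context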
+CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template(
+ """
+Given a context and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
+Think in steps and reason before coming to a conclusion.
+
+context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
+answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
+classification
+1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. The date of birth of Einstein is mentioned clearly in the context. So [Attributed]
+2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
+3. He published 4 papers in 1905. There is no mention of the papers he wrote in the given context. So [Not Attributed]
+4. Einstein moved to Switzerland in 1895. There is no supporting evidence for this in the given context. So [Not Attributed]
+
+context:{context}
+answer:{ground_truth}
+classification:
+""" # noqa: E501
+)
+
+
+@dataclass
+class ContextRecall(MetricWithLLM):
+
+ """
+ Estimates context recall by estimating TP and FN using annotated answer and
+ retrieved context.
+
+ Attributes
+ ----------
+ name : str
+ batch_size : int
+ Batch size for openai completion.
+ """
+
+ name: str = "context_recall"
+ evaluation_mode: EvaluationMode = EvaluationMode.gc
+ batch_size: int = 15
+
+ def init_model(self: t.Self):
+ ...
+
+ def _score_batch(
+ self: t.Self,
+ dataset: Dataset,
+ callbacks: t.Optional[CallbackManager] = None,
+ callback_group_name: str = "batch",
+ ) -> list:
+ verdict_token = "[Attributed]"
+ prompts = []
+ ground_truths, contexts = dataset["ground_truths"], dataset["contexts"]
+
+ with trace_as_chain_group(
+ callback_group_name, callback_manager=callbacks
+ ) as batch_group:
+ for gt, ctx in zip(ground_truths, contexts):
+ gt = "\n".join(gt) if isinstance(gt, list) else gt
+ ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx
+ human_prompt = CONTEXT_RECALL_RA.format(context=ctx, ground_truth=gt)
+ prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
+
+ responses: list[list[str]] = []
+ results = generate(
+ prompts,
+ self.llm,
+ n=1,
+ callbacks=batch_group,
+ )
+ responses = [[i.text for i in r] for r in results.generations]
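+            # context recall = fraction of answer sentences the LLM marked with the
+            # verdict token, i.e. sentences supported by the retrieved context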
+ scores = []
+ for response in responses:
+ sentences = response[0].split("\n")
+ denom = len(sentences)
+ numerator = sum(
+ bool(sentence.find(verdict_token) != -1) for sentence in sentences
+ )
+ scores.append(numerator / denom)
+
+ return scores
+
+
+context_recall = ContextRecall()
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index 98c39250c..50c1487d8 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -29,7 +29,7 @@ def validate_column_dtypes(ds: Dataset):
EvaluationMode.qac: ["question", "answer", "contexts"],
EvaluationMode.qa: ["question", "answer"],
EvaluationMode.qc: ["question", "contexts"],
- EvaluationMode.ga: ["ground_truths", "answer"],
+ EvaluationMode.gc: ["ground_truths", "contexts"],
}