diff --git a/Makefile b/Makefile
index ec769aeb2..279e942c1 100644
--- a/Makefile
+++ b/Makefile
@@ -31,3 +31,6 @@ run-benchmarks: ## Run benchmarks
 test: ## Run tests
	@echo "Running tests..."
	@pytest tests/unit $(shell if [ -n "$(k)" ]; then echo "-k $(k)"; fi)
+test-e2e: ## Run end-to-end tests
+	@echo "Running end-to-end tests..."
+	@pytest tests/e2e -s
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 26d4cb307..96e48e8cb 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -34,6 +34,12 @@ def evaluate(
     later. If the top 3 metrics are provided then it also returns the
     `ragas_score` for the entire pipeline.
 
+    Raises
+    ------
+    ValueError
+        If validation fails because the columns required by the metrics are
+        missing from the dataset or are of the wrong format.
+
     Examples
     --------
     the basic usage is as follows:
diff --git a/tests/e2e/test_fullflow.py b/tests/e2e/test_fullflow.py
new file mode 100644
index 000000000..8061281af
--- /dev/null
+++ b/tests/e2e/test_fullflow.py
@@ -0,0 +1,15 @@
+from datasets import load_dataset
+
+from ragas import evaluate
+from ragas.metrics import answer_relevancy, context_relevancy, faithfulness
+from ragas.metrics.critique import harmfulness
+
+
+def test_evaluate_e2e():
+    # run a 5-row slice of the fiqa baseline to keep the e2e test fast
+    ds = load_dataset("explodinggradients/fiqa", "ragas_eval")["baseline"]
+    result = evaluate(
+        ds.select(range(5)),
+        metrics=[answer_relevancy, context_relevancy, faithfulness, harmfulness],
+    )
+    assert result is not None
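
Note: a minimal sketch of the `Raises` contract documented above (assuming, as the docstring implies, that column validation runs before any metric is computed, and that `faithfulness` validates for question/contexts/answer columns; the dataset below is a hypothetical illustration, not part of this change):

    from datasets import Dataset

    from ragas import evaluate
    from ragas.metrics import faithfulness

    # `contexts` is deliberately omitted, so validation should raise
    # ValueError before any metric runs
    ds = Dataset.from_dict(
        {
            "question": ["What does the test-e2e target run?"],
            "answer": ["pytest tests/e2e -s"],
        }
    )

    try:
        evaluate(ds, metrics=[faithfulness])
    except ValueError as err:
        print(f"validation failed as documented: {err}")

The new suite itself is run with `make test-e2e`, which invokes `pytest tests/e2e -s`.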