diff --git a/Makefile b/Makefile
index ec769aeb2..279e942c1 100644
--- a/Makefile
+++ b/Makefile
@@ -31,3 +31,6 @@ run-benchmarks: ## Run benchmarks
 test: ## Run tests
	@echo "Running tests..."
	@pytest tests/unit $(shell if [ -n "$(k)" ]; then echo "-k $(k)"; fi)
+test-e2e: ## Run end-to-end tests
+	@echo "Running end-to-end tests..."
+	@pytest tests/e2e -s
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 26d4cb307..96e48e8cb 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -34,6 +34,12 @@ def evaluate(
     later. If the top 3 metrics are provided then it also returns the
     `ragas_score` for the entire pipeline.
 
+    Raises
+    ------
+    ValueError
+        If validation fails because the columns required by the metrics are
+        missing from the dataset or are of the wrong format.
+
     Examples
     --------
     the basic usage is as follows:
diff --git a/tests/e2e/test_fullflow.py b/tests/e2e/test_fullflow.py
new file mode 100644
index 000000000..8061281af
--- /dev/null
+++ b/tests/e2e/test_fullflow.py
@@ -0,0 +1,15 @@
+from datasets import load_dataset
+
+from ragas import evaluate
+from ragas.metrics import answer_relevancy, context_relevancy, faithfulness
+from ragas.metrics.critique import harmfulness
+
+
+def test_evaluate_e2e():
+    # run a 5-row slice of the fiqa baseline to keep the e2e test fast
+    ds = load_dataset("explodinggradients/fiqa", "ragas_eval")["baseline"]
+    result = evaluate(
+        ds.select(range(5)),
+        metrics=[answer_relevancy, context_relevancy, faithfulness, harmfulness],
+    )
+    assert result is not None
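
Note: a minimal sketch of the `Raises` contract documented above (assuming, as the docstring implies, that column validation runs before any metric is computed, and that `faithfulness` validates for question/contexts/answer columns; the dataset below is a hypothetical illustration, not part of this change):

    from datasets import Dataset

    from ragas import evaluate
    from ragas.metrics import faithfulness

    # `contexts` is deliberately omitted, so validation should raise
    # ValueError before any metric runs
    ds = Dataset.from_dict(
        {
            "question": ["What does the test-e2e target run?"],
            "answer": ["pytest tests/e2e -s"],
        }
    )

    try:
        evaluate(ds, metrics=[faithfulness])
    except ValueError as err:
        print(f"validation failed as documented: {err}")

The new suite itself is run with `make test-e2e`, which invokes `pytest tests/e2e -s`.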