diff --git a/.gitignore b/.gitignore index 4779bb12e..ad3a19fb2 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,4 @@ cython_debug/ #.idea/ belar/_version.py **/tmp.ipynb +ragas/_version.py diff --git a/Makefile b/Makefile index 27a1eb38a..fc5ab0fa8 100644 --- a/Makefile +++ b/Makefile @@ -5,20 +5,20 @@ help: ## Show all Makefile targets .PHONY: format lint type style clean run-benchmarks format: ## Running code formatter: black and isort + @echo "(isort) Ordering imports..." + @isort . @echo "(black) Formatting codebase..." - @black --config pyproject.toml belar tests examples + @black --config pyproject.toml ragas tests examples @echo "(black) Formatting stubs..." - @find belar -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \; - @echo "(isort) Reordering imports..." - @isort . + @find ragas -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \; @echo "(ruff) Running fix only..." - @ruff check belar examples tests --fix-only + @ruff check ragas examples tests --fix-only lint: ## Running lint checker: ruff @echo "(ruff) Linting development project..." - @ruff check belar examples tests + @ruff check ragas examples tests type: ## Running type checker: pyright @echo "(pyright) Typechecking codebase..." - @pyright -p belar + @pyright -p ragas clean: ## Clean all generated files @echo "Cleaning all generated files..." @cd $(GIT_ROOT)/docs && make clean diff --git a/README.md b/README.md index 648f37e4b..3444dc88d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,36 @@ -# BeLAR -Benchmarking LLM Augmented Retrieval +
+ SOTA metrics for evaluating Retrieval Augmented Generation (RAG) +
+ ++ + + + + + + + + + + + + + + +
+ ++ Installation | + Quick Example | + Hugging Face +
+ + +## What + diff --git a/belar/metrics/__init__.py b/belar/metrics/__init__.py deleted file mode 100644 index 70fe4a3e5..000000000 --- a/belar/metrics/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from belar.metrics.base import Evaluation, Metric -from belar.metrics.factual import EntailmentScore -from belar.metrics.similarity import SBERTScore -from belar.metrics.simple import (BLUE, EditDistance, EditRatio, Rouge1, - Rouge2, RougeL) - -__all__ = [ - "Evaluation", - "Metric", - "EntailmentScore", - "SBERTScore", - "BLUE", - "EditDistance", - "EditRatio", - "RougeL", - "Rouge1", - "Rouge2", -] diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 000000000..22e1ffb2f Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index f726fcf23..3799e9190 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -10,24 +10,32 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 1, "id": "22c7dd25", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ + "# only run this if you have an editable install\n", "%load_ext autoreload\n", "%autoreload 2" ] }, + { + "cell_type": "markdown", + "id": "40258397", + "metadata": {}, + "source": [ + "## Load the Data\n", + "\n", + "For this quickstart we are going to be using a dataset that we prepared from the [eli5](https://huggingface.co/datasets/eli5) dataset with the model's responses. 
\n", + "\n", + "prompt: str\n", + "context: str\n", + "references: list[str]\n", + "ground_truth: list[str]\n", + "generated_text: str" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -85,7 +93,9 @@ "cell_type": "code", "execution_count": 28, "id": "a77c805d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", diff --git a/pyproject.toml b/pyproject.toml index 2c84c3a99..f7f6a0dc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "belar" +name = "ragas" dependencies = [ "Levenshtein", "rouge-score", @@ -19,4 +19,4 @@ readme = {file = ["README.md"], content-type = "text/plain"} requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] -write_to = "belar/_version.py" +write_to = "ragas/_version.py" diff --git a/belar/__init__.py b/ragas/__init__.py similarity index 100% rename from belar/__init__.py rename to ragas/__init__.py diff --git a/ragas/metrics/__init__.py b/ragas/metrics/__init__.py new file mode 100644 index 000000000..e2518e77f --- /dev/null +++ b/ragas/metrics/__init__.py @@ -0,0 +1,17 @@ +from ragas.metrics.base import Evaluation, Metric +from ragas.metrics.factual import EntailmentScore +from ragas.metrics.similarity import SBERTScore +from ragas.metrics.simple import BLUE, EditDistance, EditRatio, Rouge1, Rouge2, RougeL + +__all__ = [ + "Evaluation", + "Metric", + "EntailmentScore", + "SBERTScore", + "BLUE", + "EditDistance", + "EditRatio", + "RougeL", + "Rouge1", + "Rouge2", +] diff --git a/belar/metrics/base.py b/ragas/metrics/base.py similarity index 100% rename from belar/metrics/base.py rename to ragas/metrics/base.py diff --git a/belar/metrics/factual.py b/ragas/metrics/factual.py similarity index 98% rename from belar/metrics/factual.py rename to ragas/metrics/factual.py index 3a9769280..3928f5b25 100644 --- a/belar/metrics/factual.py +++ b/ragas/metrics/factual.py @@ -9,18 +9,23 @@ import numpy as np 
import spacy import transformers -from transformers import (AutoConfig, AutoModelForSequenceClassification, - AutoTokenizer, PreTrainedModel) +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + PreTrainedModel, +) -from belar.metrics import Metric -from belar.utils import device_check +from ragas.metrics import Metric +from ragas.utils import device_check if t.TYPE_CHECKING: from torch import device as Device from transformers.models.auto.modeling_auto import ( MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_WITH_LM_HEAD_MAPPING_NAMES) + MODEL_WITH_LM_HEAD_MAPPING_NAMES, +) MODEL_MAPPINGS_NAMES = [ MODEL_WITH_LM_HEAD_MAPPING_NAMES, diff --git a/belar/metrics/similarity.py b/ragas/metrics/similarity.py similarity index 97% rename from belar/metrics/similarity.py rename to ragas/metrics/similarity.py index 8c38137c9..f4435ce04 100644 --- a/belar/metrics/similarity.py +++ b/ragas/metrics/similarity.py @@ -7,7 +7,7 @@ from numpy.linalg import norm from sentence_transformers import SentenceTransformer -from belar.metrics.base import Metric +from ragas.metrics.base import Metric if t.TYPE_CHECKING: from torch import Tensor diff --git a/belar/metrics/simple.py b/ragas/metrics/simple.py similarity index 98% rename from belar/metrics/simple.py rename to ragas/metrics/simple.py index 116225776..81fdad42f 100644 --- a/belar/metrics/simple.py +++ b/ragas/metrics/simple.py @@ -8,7 +8,7 @@ from nltk.translate.bleu_score import sentence_bleu from rouge_score import rouge_scorer -from belar.metrics.base import Metric +from ragas.metrics.base import Metric ROUGE_TYPES = t.Literal["rouge1", "rouge2", "rougeL"] diff --git a/belar/utils.py b/ragas/utils.py similarity index 100% rename from belar/utils.py rename to ragas/utils.py diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py index 50d63beb5..dc8812f5c 100644 --- a/tests/benchmarks/benchmark.py +++ b/tests/benchmarks/benchmark.py @@ -5,8 +5,16 @@ from tqdm 
import tqdm from utils import print_table, timeit -from belar.metrics import (EditDistance, EditRatio, EntailmentScore, - Evaluation, Rouge1, Rouge2, RougeL, SBERTScore) +from ragas.metrics import ( + EditDistance, + EditRatio, + EntailmentScore, + Evaluation, + Rouge1, + Rouge2, + RougeL, + SBERTScore, +) DEVICE = "cuda" if is_available() else "cpu" BATCHES = [0, 1] diff --git a/tests/unit/test_simple.py b/tests/unit/test_simple.py index a247ee597..0ea853d21 100644 --- a/tests/unit/test_simple.py +++ b/tests/unit/test_simple.py @@ -1,4 +1,4 @@ def test_import(): - import belar + import ragas - assert belar is not None + assert ragas is not None