Adds Baseline workflow + fixes #363

Open · wants to merge 6 commits into main · changes shown from 3 commits
src/lighteval/__main__.py — 17 additions, 8 deletions
@@ -22,19 +22,20 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.


 import argparse
 import os
 from dataclasses import asdict
 from pprint import pformat

-from lighteval.parsers import parser_accelerate, parser_nanotron, parser_utils_tasks
+from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks
 from lighteval.tasks.registry import Registry, taskinfo_selector


 CACHE_DIR = os.getenv("HF_HOME")


-def cli_evaluate():
+def cli_evaluate():  # noqa: C901
     parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation")
     subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")

@@ -46,9 +47,12 @@ def cli_evaluate():
     parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.")
     parser_nanotron(parser_b)

+    parser_c = subparsers.add_parser("baseline", help="compute baseline for a task")
+    parser_baseline(parser_c)
+
     # Subparser for task utils functions
-    parser_c = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
-    parser_utils_tasks(parser_c)
+    parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
+    parser_utils_tasks(parser_d)

     args = parser.parse_args()

@@ -62,18 +66,24 @@ def cli_evaluate():

         main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir)

+    elif args.subcommand == "baseline":
+        from lighteval.main_baseline import main as main_baseline
+
+        main_baseline(args)
+
     elif args.subcommand == "tasks":
+        registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
         if args.list:
-            Registry(cache_dir="").print_all_tasks()
+            registry.print_all_tasks()

         if args.inspect:
             print(f"Loading the tasks dataset to cache folder: {args.cache_dir}")
             print(
                 "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. "
             )
             # Loading task
-            task_names_list, _ = taskinfo_selector(args.inspect)
-            task_dict = Registry(cache_dir=args.cache_dir).get_task_dict(task_names_list)
+            task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry)
+            task_dict = registry.get_task_dict(task_names_list)
             for name, task in task_dict.items():
                 print("-" * 10, name, "-" * 10)
                 if args.show_config:
@@ -84,7 +94,6 @@ def cli_evaluate():
                 print("-" * 10, "SAMPLES")
                 print(f"-- sample {ix} --")
                 print(pformat(asdict(sample), indent=1))
-
     else:
         print("You did not provide any argument. Exiting")

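Taken together, the changes above add a third dispatch target to `cli_evaluate`. As a rough sketch of how the new subcommand's arguments flow through `argparse` (this reuses `parser_baseline` from `src/lighteval/parsers.py` below; the example task spec and output path are made-up placeholders, not values from the PR):

```python
# Sketch: reproduce the subparser wiring from cli_evaluate above and parse a
# sample command line for the new "baseline" subcommand.
import argparse

from lighteval.parsers import parser_baseline  # added by this PR

parser = argparse.ArgumentParser(description="CLI tool for lighteval")
subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")
parser_baseline(subparsers.add_parser("baseline", help="compute baseline for a task"))

args = parser.parse_args(
    ["baseline", "--tasks", "leaderboard|arc:challenge|0|0", "--output_dir", "./baseline_out"]
)
assert args.subcommand == "baseline"  # cli_evaluate routes to main_baseline on this
print(args.tasks, args.output_dir, args.max_samples)  # max_samples defaults to None
```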
src/lighteval/main_baseline.py — 88 additions, 0 deletions (new file)
@@ -0,0 +1,88 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.metrics.utils.metric_utils import MetricCategory
from lighteval.models.abstract_model import ModelInfo
from lighteval.tasks.lighteval_task import LightevalTask
from lighteval.tasks.registry import Registry, taskinfo_selector
from lighteval.utils.utils import as_list


def main(args):
    """
    Compute baselines for given tasks.
    It has been tested with generative and accuracy tasks, but may not work correctly for other task types.
    The baseline is computed as follows:
    - For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices.
    - For other metrics: It assigns a score of 0, which may not be appropriate for all task types.
    Note:
        This baseline computation may not be suitable for all task types and should be used with caution.
    """
    task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
    task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry)
    task_dict = task_registry.get_task_dict(task_names_list)

    evaluation_tracker = EvaluationTracker(
        output_dir=args.output_dir,
        save_details=False,
        push_to_hub=False,
        push_to_tensorboard=False,
        public=False,
        hub_results_org=None,
    )
    evaluation_tracker.general_config_logger.log_model_info(
        ModelInfo(
            model_name="lighteval/baseline",
            model_sha=None,
            model_dtype=None,
            model_size=None,
        )
    )
    evaluation_tracker.task_config_logger.log(task_dict)

    LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes)

    for task_name, task in task_dict.items():
        task_docs = list(task.eval_docs())
        n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs)

        p_correct_score = [
            len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples]
        ]

        metric_results = {
            metric.metric_name: p_correct_score
            if metric.category
            in [MetricCategory.MULTICHOICE, MetricCategory.MULTICHOICE_PMI, MetricCategory.MULTICHOICE_ONE_TOKEN]
            else 0
            for metric in task.metrics
        }

        for fewshots, _ in fewshots_dict[task_name]:
            evaluation_tracker.metrics_logger.log(f"{task_name}|{fewshots}", metric_results)

    evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
    evaluation_tracker.save()
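As a sanity check on the list comprehension above: the per-document baseline for a multiple-choice doc is `len(gold_indices) / len(choices)`, i.e. the probability that a uniform random guess lands on an accepted answer. A self-contained sketch of the same computation (plain dicts stand in for lighteval's `Doc` objects, and this local `as_list` only mirrors the helper imported above):

```python
# Standalone illustration of the random-guess baseline computed above:
# P(correct) = number_of_gold_answers / number_of_choices, per document.

def as_list(x):
    # Mirrors lighteval's as_list helper: wrap a scalar gold_index in a list.
    return list(x) if isinstance(x, (list, tuple)) else [x]

docs = [
    {"choices": ["A", "B", "C", "D"], "gold_index": 2},       # 1 gold of 4 -> 0.25
    {"choices": ["yes", "no"], "gold_index": 0},              # 1 gold of 2 -> 0.50
    {"choices": ["A", "B", "C", "D"], "gold_index": [0, 3]},  # 2 golds of 4 -> 0.50
]

p_correct_score = [len(as_list(d["gold_index"])) / len(d["choices"]) for d in docs]
print(p_correct_score)  # [0.25, 0.5, 0.5]
```

Non-multichoice metrics fall through to the `else 0` branch, which matches the docstring's caveat: a zero baseline is a sensible floor for accuracy-style metrics but not necessarily for other metric types.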
src/lighteval/parsers.py — 39 additions, 0 deletions
@@ -104,6 +104,44 @@ def parser_accelerate(parser=None):
     return parser


+def parser_baseline(parser=None):
+    if parser is None:
+        parser = argparse.ArgumentParser(
+            description="CLI tool for lighteval, a lightweight framework for LLM evaluation"
+        )
+
+    parser.add_argument(
+        "--custom_tasks",
+        type=str,
+        default=None,
+        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
+    )
+
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        required=True,
+        help="Task to compute the baseline for",
+    )
+    parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on")
+    parser.add_argument(
+        "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets"
+    )
+
+    parser.add_argument(
+        "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models"
+    )
+    # Output related
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
+    )
+
+    return parser
+
+
 def parser_nanotron(parser=None):
     if parser is None:
         parser = argparse.ArgumentParser(
@@ -142,6 +180,7 @@ def parser_utils_tasks(parser=None):
         default=None,
         help="Id of tasks or path to a text file with a list of tasks (e.g. 'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.",
     )
+    parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks")
     parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display")
     parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config")
     parser.add_argument(
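Since `parser_baseline` follows the same `parser=None` convention as the other builders in this file, it can also be exercised on its own, which makes the new argument wiring easy to smoke-test. A minimal sketch (the `--tasks` value is a placeholder borrowed from the inspect help string above, and the `--output_dir` path is made up):

```python
from lighteval.parsers import parser_baseline

# parser_baseline builds its own ArgumentParser when none is passed in.
parser = parser_baseline()
args = parser.parse_args(["--tasks", "original|mmlu:abstract_algebra|5", "--output_dir", "./out"])

print(args.tasks)                      # original|mmlu:abstract_algebra|5
print(args.max_samples)                # None (default)
print(args.dataset_loading_processes)  # 1 (default)
print(args.cache_dir)                  # CACHE_DIR default (HF_HOME in __main__.py)
```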