Adds Baseline workflow + fixes #363

Open · wants to merge 6 commits into main · changes shown from 3 commits
src/lighteval/__main__.py — 17 additions, 8 deletions
@@ -22,19 +22,20 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.


 import argparse
 import os
 from dataclasses import asdict
 from pprint import pformat

-from lighteval.parsers import parser_accelerate, parser_nanotron, parser_utils_tasks
+from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks
 from lighteval.tasks.registry import Registry, taskinfo_selector


 CACHE_DIR = os.getenv("HF_HOME")


-def cli_evaluate():
+def cli_evaluate():  # noqa: C901
     parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation")
     subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")

@@ -46,9 +47,12 @@ def cli_evaluate():
     parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.")
     parser_nanotron(parser_b)

+    parser_c = subparsers.add_parser("baseline", help="compute baseline for a task")
+    parser_baseline(parser_c)
+
     # Subparser for task utils functions
-    parser_c = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
-    parser_utils_tasks(parser_c)
+    parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
+    parser_utils_tasks(parser_d)

     args = parser.parse_args()

@@ -62,18 +66,24 @@ def cli_evaluate():

         main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir)

+    elif args.subcommand == "baseline":
+        from lighteval.main_baseline import main as main_baseline
+
+        main_baseline(args)
+
     elif args.subcommand == "tasks":
+        registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
         if args.list:
-            Registry(cache_dir="").print_all_tasks()
+            registry.print_all_tasks()

         if args.inspect:
             print(f"Loading the tasks dataset to cache folder: {args.cache_dir}")
             print(
                 "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. "
             )
             # Loading task
-            task_names_list, _ = taskinfo_selector(args.inspect)
-            task_dict = Registry(cache_dir=args.cache_dir).get_task_dict(task_names_list)
+            task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry)
+            task_dict = registry.get_task_dict(task_names_list)
             for name, task in task_dict.items():
                 print("-" * 10, name, "-" * 10)
                 if args.show_config:
@@ -84,7 +94,6 @@ def cli_evaluate():
                 print("-" * 10, "SAMPLES")
                 print(f"-- sample {ix} --")
                 print(pformat(asdict(sample), indent=1))
-
     else:
         print("You did not provide any argument. Exiting")

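Taken together, the changes above add a third dispatch target to `cli_evaluate`. As a rough sketch of how the new subcommand's arguments flow through `argparse` (this reuses `parser_baseline` from `src/lighteval/parsers.py` below; the example task spec and output path are made-up placeholders, not values from the PR):

```python
# Sketch: reproduce the subparser wiring from cli_evaluate above and parse a
# sample command line for the new "baseline" subcommand.
import argparse

from lighteval.parsers import parser_baseline  # added by this PR

parser = argparse.ArgumentParser(description="CLI tool for lighteval")
subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand")
parser_baseline(subparsers.add_parser("baseline", help="compute baseline for a task"))

args = parser.parse_args(
    ["baseline", "--tasks", "leaderboard|arc:challenge|0|0", "--output_dir", "./baseline_out"]
)
assert args.subcommand == "baseline"  # cli_evaluate routes to main_baseline on this
print(args.tasks, args.output_dir, args.max_samples)  # max_samples defaults to None
```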
src/lighteval/main_baseline.py — 88 additions, 0 deletions (new file)
@@ -0,0 +1,88 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.metrics.utils.metric_utils import MetricCategory
from lighteval.models.abstract_model import ModelInfo
from lighteval.tasks.lighteval_task import LightevalTask
from lighteval.tasks.registry import Registry, taskinfo_selector
from lighteval.utils.utils import as_list


def main(args):
    """
    Compute baselines for given tasks.
    It has been tested with generative and accuracy tasks, but may not work correctly for other task types.
    The baseline is computed as follows:
    - For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices.
    - For other metrics: It assigns a score of 0, which may not be appropriate for all task types.
    Note:
        This baseline computation may not be suitable for all task types and should be used with caution.
    """
    task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks)
    task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry)
    task_dict = task_registry.get_task_dict(task_names_list)

    evaluation_tracker = EvaluationTracker(
        output_dir=args.output_dir,
        save_details=False,
        push_to_hub=False,
        push_to_tensorboard=False,
        public=False,
        hub_results_org=None,
    )
    evaluation_tracker.general_config_logger.log_model_info(
        ModelInfo(
            model_name="lighteval/baseline",
            model_sha=None,
            model_dtype=None,
            model_size=None,
        )
    )
    evaluation_tracker.task_config_logger.log(task_dict)

    LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes)

    for task_name, task in task_dict.items():
        task_docs = list(task.eval_docs())
        n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs)

        p_correct_score = [
            len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples]
        ]

        metric_results = {
            metric.metric_name: p_correct_score
            if metric.category
            in [MetricCategory.MULTICHOICE, MetricCategory.MULTICHOICE_PMI, MetricCategory.MULTICHOICE_ONE_TOKEN]
            else 0
            for metric in task.metrics
        }

        for fewshots, _ in fewshots_dict[task_name]:
            evaluation_tracker.metrics_logger.log(f"{task_name}|{fewshots}", metric_results)

    evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
    evaluation_tracker.save()
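As a sanity check on the list comprehension above: the per-document baseline for a multiple-choice doc is `len(gold_indices) / len(choices)`, i.e. the probability that a uniform random guess lands on an accepted answer. A self-contained sketch of the same computation (plain dicts stand in for lighteval's `Doc` objects, and this local `as_list` only mirrors the helper imported above):

```python
# Standalone illustration of the random-guess baseline computed above:
# P(correct) = number_of_gold_answers / number_of_choices, per document.

def as_list(x):
    # Mirrors lighteval's as_list helper: wrap a scalar gold_index in a list.
    return list(x) if isinstance(x, (list, tuple)) else [x]

docs = [
    {"choices": ["A", "B", "C", "D"], "gold_index": 2},       # 1 gold of 4 -> 0.25
    {"choices": ["yes", "no"], "gold_index": 0},              # 1 gold of 2 -> 0.50
    {"choices": ["A", "B", "C", "D"], "gold_index": [0, 3]},  # 2 golds of 4 -> 0.50
]

p_correct_score = [len(as_list(d["gold_index"])) / len(d["choices"]) for d in docs]
print(p_correct_score)  # [0.25, 0.5, 0.5]
```

Non-multichoice metrics fall through to the `else 0` branch, which matches the docstring's caveat: a zero baseline is a sensible floor for accuracy-style metrics but not necessarily for other metric types.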
src/lighteval/parsers.py — 39 additions, 0 deletions
@@ -104,6 +104,44 @@ def parser_accelerate(parser=None):
     return parser


+def parser_baseline(parser=None):
+    if parser is None:
+        parser = argparse.ArgumentParser(
+            description="CLI tool for lighteval, a lightweight framework for LLM evaluation"
+        )
+
+    parser.add_argument(
+        "--custom_tasks",
+        type=str,
+        default=None,
+        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
+    )
+
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        required=True,
+        help="Task to compute the baseline for",
+    )
+    parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on")
+    parser.add_argument(
+        "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets"
+    )
+
+    parser.add_argument(
+        "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models"
+    )
+    # Output related
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
+    )
+
+    return parser
+
+
 def parser_nanotron(parser=None):
     if parser is None:
         parser = argparse.ArgumentParser(
@@ -142,6 +180,7 @@ def parser_utils_tasks(parser=None):
         default=None,
         help="Id of tasks or path to a text file with a list of tasks (e.g. 'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.",
     )
+    parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks")
     parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display")
     parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config")
     parser.add_argument(
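Since `parser_baseline` follows the same `parser=None` convention as the other builders in this file, it can also be exercised on its own, which makes the new argument wiring easy to smoke-test. A minimal sketch (the `--tasks` value is a placeholder borrowed from the inspect help string above, and the `--output_dir` path is made up):

```python
from lighteval.parsers import parser_baseline

# parser_baseline builds its own ArgumentParser when none is passed in.
parser = parser_baseline()
args = parser.parse_args(["--tasks", "original|mmlu:abstract_algebra|5", "--output_dir", "./out"])

print(args.tasks)                      # original|mmlu:abstract_algebra|5
print(args.max_samples)                # None (default)
print(args.dataset_loading_processes)  # 1 (default)
print(args.cache_dir)                  # CACHE_DIR default (HF_HOME in __main__.py)
```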