diff --git a/examples/landmark_detection/report.ipynb b/examples/landmark_detection/report.ipynb
new file mode 100644
index 00000000..0b38d5ff
--- /dev/null
+++ b/examples/landmark_detection/report.ipynb
@@ -0,0 +1,596 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from face_alignment import FaceAlignment, LandmarksType\n",
+ "\n",
+ "from giskard_vision.landmark_detection.dataloaders.loaders import DataLoaderFFHQ, DataLoader300W\n",
+ "from giskard_vision.landmark_detection.dataloaders.wrappers import (\n",
+ " CroppedDataLoader,\n",
+ " ResizedDataLoader,\n",
+ " ColoredDataLoader,\n",
+ " BlurredDataLoader,\n",
+ " FilteredDataLoader,\n",
+ " HeadPoseDataLoader,\n",
+ " EthnicityDataLoader,\n",
+ " CachedDataLoader,\n",
+ ")\n",
+ "\n",
+ "from giskard_vision.landmark_detection.models.wrappers import OpenCVWrapper, FaceAlignmentWrapper\n",
+ "from giskard_vision.landmark_detection.tests.performance import NMEMean\n",
+ "from giskard_vision.landmark_detection.marks.facial_parts import FacialParts\n",
+ "from giskard_vision.landmark_detection.tests.report import Report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dl_ref = DataLoader300W(dir_path=\"300W/sample\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-01-17 12:01:44.698306: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# cropping\n",
+ "dl_cropped_left = CroppedDataLoader(dl_ref, part=FacialParts.LEFT_HALF.value)\n",
+ "dl_cropped_upper = CroppedDataLoader(dl_ref, part=FacialParts.UPPER_HALF.value)\n",
+ "\n",
+ "# resizing\n",
+ "dl_resized = ResizedDataLoader(dl_ref, scales=0.5)\n",
+ "\n",
+ "# coloring\n",
+ "dl_colored = ColoredDataLoader(dl_ref)\n",
+ "\n",
+ "# blurring\n",
+ "dl_blurred = BlurredDataLoader(dl_ref)\n",
+ "\n",
+ "\n",
+ "# head pose filtering\n",
+ "def positive_roll(elt):\n",
+ " return elt[2][\"headPose\"][\"roll\"] > 0\n",
+ "\n",
+ "\n",
+ "def negative_roll(elt):\n",
+ " return elt[2][\"headPose\"][\"roll\"] < 0\n",
+ "\n",
+ "\n",
+ "cached_dl = CachedDataLoader(HeadPoseDataLoader(dl_ref), cache_size=None, cache_img=False, cache_marks=False)\n",
+ "dl_positive_roll = FilteredDataLoader(cached_dl, positive_roll)\n",
+ "dl_negative_roll = FilteredDataLoader(cached_dl, negative_roll)\n",
+ "\n",
+ "\n",
+ "# ethnicity filtering\n",
+ "def white_ethnicity(elt):\n",
+ " return elt[2][\"ethnicity\"] == \"white\"\n",
+ "\n",
+ "\n",
+ "def latino_ethnicity(elt):\n",
+ " return elt[2][\"ethnicity\"] == \"latino hispanic\"\n",
+ "\n",
+ "\n",
+ "cached_dl = CachedDataLoader(\n",
+ " EthnicityDataLoader(dl_ref, ethnicity_map={\"indian\": \"asian\"}), cache_size=None, cache_img=False, cache_marks=False\n",
+ ")\n",
+ "dl_white = FilteredDataLoader(cached_dl, white_ethnicity)\n",
+ "dl_latino = FilteredDataLoader(cached_dl, latino_ethnicity)\n",
+ "\n",
+ "dataloaders_list = [\n",
+ " dl_cropped_left,\n",
+ " dl_cropped_upper,\n",
+ " dl_resized,\n",
+ " dl_colored,\n",
+ " dl_blurred,\n",
+ " dl_positive_roll,\n",
+ " dl_negative_roll,\n",
+ " dl_white,\n",
+ " dl_latino,\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading data from : lbfmodel.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "models_list = [\n",
+ " FaceAlignmentWrapper(model=FaceAlignment(LandmarksType.TWO_D, device=\"cpu\", flip_input=False)),\n",
+ " OpenCVWrapper(),\n",
+ "]\n",
+ "\n",
+ "models_list = [models_list[1]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n",
+ "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n",
+ "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n",
+ "OpenCVWrapper: Face not detected in processed image of batch 2 and index 0.\n",
+ "OpenCVWrapper: Face not detected in processed image of batch 3 and index 0.\n",
+ "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n"
+ ]
+ }
+ ],
+ "source": [
+ "report = Report(models_list, dataloaders_list, dataloader_ref=dl_ref)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " model | \n",
+ " facial_part | \n",
+ " dataloader | \n",
+ " prediction_time | \n",
+ " prediction_fail_rate | \n",
+ " test | \n",
+ " metric | \n",
+ " metric_value | \n",
+ " threshold | \n",
+ " passed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " OpenCV | \n",
+ " left half | \n",
+ " 300W cropped on left half | \n",
+ " 0.929891 | \n",
+ " 0.564706 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.644057 | \n",
+ " -0.1 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " OpenCV | \n",
+ " upper half | \n",
+ " 300W cropped on upper half | \n",
+ " 0.943296 | \n",
+ " 0.682353 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.040216 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W resizing with ratios: 0.5 | \n",
+ " 0.995143 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.079876 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W altered with color mode 7 | \n",
+ " 1.316452 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.001347 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W blurred | \n",
+ " 1.483105 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.103017 | \n",
+ " -0.1 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with head-pose) filtered using ... | \n",
+ " 0.974038 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.077927 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with head-pose) filtered using ... | \n",
+ " 2.076661 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.019482 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with ethnicity) filtered using ... | \n",
+ " 2.389328 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.168421 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with ethnicity) filtered using ... | \n",
+ " 1.945159 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.784538 | \n",
+ " -0.1 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " model facial_part dataloader \\\n",
+ "0 OpenCV left half 300W cropped on left half \n",
+ "1 OpenCV upper half 300W cropped on upper half \n",
+ "2 OpenCV entire face 300W resizing with ratios: 0.5 \n",
+ "3 OpenCV entire face 300W altered with color mode 7 \n",
+ "4 OpenCV entire face 300W blurred \n",
+ "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n",
+ "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n",
+ "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n",
+ "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n",
+ "\n",
+ " prediction_time prediction_fail_rate test metric metric_value \\\n",
+ "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n",
+ "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n",
+ "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n",
+ "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n",
+ "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n",
+ "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n",
+ "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n",
+ "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n",
+ "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n",
+ "\n",
+ " threshold passed \n",
+ "0 -0.1 True \n",
+ "1 -0.1 False \n",
+ "2 -0.1 False \n",
+ "3 -0.1 False \n",
+ "4 -0.1 True \n",
+ "5 -0.1 False \n",
+ "6 -0.1 False \n",
+ "7 -0.1 False \n",
+ "8 -0.1 True "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "report.to_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " model | \n",
+ " facial_part | \n",
+ " dataloader | \n",
+ " prediction_time | \n",
+ " prediction_fail_rate | \n",
+ " test | \n",
+ " metric | \n",
+ " metric_value | \n",
+ " threshold | \n",
+ " passed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " OpenCV | \n",
+ " left half | \n",
+ " 300W cropped on left half | \n",
+ " 0.929891 | \n",
+ " 0.564706 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.644057 | \n",
+ " -0.9 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " OpenCV | \n",
+ " upper half | \n",
+ " 300W cropped on upper half | \n",
+ " 0.943296 | \n",
+ " 0.682353 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.040216 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W resizing with ratios: 0.5 | \n",
+ " 0.995143 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.079876 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W altered with color mode 7 | \n",
+ " 1.316452 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.001347 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " 300W blurred | \n",
+ " 1.483105 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.103017 | \n",
+ " -0.1 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with head-pose) filtered using ... | \n",
+ " 0.974038 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.077927 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with head-pose) filtered using ... | \n",
+ " 2.076661 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.019482 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with ethnicity) filtered using ... | \n",
+ " 2.389328 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " 0.168421 | \n",
+ " -0.1 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " OpenCV | \n",
+ " entire face | \n",
+ " (Cached (300W) with ethnicity) filtered using ... | \n",
+ " 1.945159 | \n",
+ " 0.000000 | \n",
+ " TestDiff | \n",
+ " NME_mean | \n",
+ " -0.784538 | \n",
+ " -0.1 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " model facial_part dataloader \\\n",
+ "0 OpenCV left half 300W cropped on left half \n",
+ "1 OpenCV upper half 300W cropped on upper half \n",
+ "2 OpenCV entire face 300W resizing with ratios: 0.5 \n",
+ "3 OpenCV entire face 300W altered with color mode 7 \n",
+ "4 OpenCV entire face 300W blurred \n",
+ "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n",
+ "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n",
+ "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n",
+ "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n",
+ "\n",
+ " prediction_time prediction_fail_rate test metric metric_value \\\n",
+ "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n",
+ "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n",
+ "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n",
+ "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n",
+ "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n",
+ "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n",
+ "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n",
+ "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n",
+ "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n",
+ "\n",
+ " threshold passed \n",
+ "0 -0.9 False \n",
+ "1 -0.1 False \n",
+ "2 -0.1 False \n",
+ "3 -0.1 False \n",
+ "4 -0.1 True \n",
+ "5 -0.1 False \n",
+ "6 -0.1 False \n",
+ "7 -0.1 False \n",
+ "8 -0.1 True "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "report.adjust_thresholds({0: -0.9})\n",
+ "report.to_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tempfile\n",
+ "\n",
+ "with tempfile.NamedTemporaryFile() as f:\n",
+ " report.to_json(filename=f.name)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/giskard_vision/landmark_detection/tests/base.py b/giskard_vision/landmark_detection/tests/base.py
index 27b13c29..78ae8309 100644
--- a/giskard_vision/landmark_detection/tests/base.py
+++ b/giskard_vision/landmark_detection/tests/base.py
@@ -233,7 +233,7 @@ def run(
metric_value=metric_value,
threshold=self.threshold,
prediction_results=[prediction_result],
- passed=metric_value <= self.threshold,
+            passed=bool(metric_value <= self.threshold),  # cast to plain bool so the result is JSON-serializable
prediction_time=prediction_result.prediction_time,
prediction_fail_rate=prediction_result.prediction_fail_rate,
facial_part=facial_part,
@@ -305,7 +305,7 @@ def run(
metric_value=metric_value,
threshold=self.threshold,
prediction_results=prediction_results,
- passed=abs(metric_value) <= self.threshold,
+            passed=bool(metric_value <= self.threshold),  # cast to plain bool so the result is JSON-serializable
prediction_time=prediction_time,
prediction_fail_rate=prediction_fail_rate,
facial_part=facial_part,
diff --git a/giskard_vision/landmark_detection/tests/report.py b/giskard_vision/landmark_detection/tests/report.py
new file mode 100644
index 00000000..3eb69b3a
--- /dev/null
+++ b/giskard_vision/landmark_detection/tests/report.py
@@ -0,0 +1,121 @@
+from typing import Dict, List, Optional, Union
+
+import pandas as pd
+
+from giskard_vision.landmark_detection.dataloaders.base import DataIteratorBase
+from giskard_vision.landmark_detection.models.base import FaceLandmarksModelBase
+from giskard_vision.landmark_detection.tests.base import Metric, Test, TestDiff
+from giskard_vision.landmark_detection.tests.performance import NMEMean
+
+
+class Report:
+ """
+ A class for generating and managing test reports for landmark detection models.
+
+ Attributes:
+        default_rel_threshold (float): Default threshold for relative (TestDiff) comparisons against a reference dataloader.
+        default_abs_threshold (float): Default threshold for absolute (Test) runs.
+
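+    Example:
+        A report can be built from wrapped models and (optionally transformed) dataloaders,
+        with a reference dataloader enabling comparative (TestDiff) runs:
+
+        >>> report = Report(models_list, dataloaders_list, dataloader_ref=dl_ref)
+        >>> report.to_dataframe()
+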
+ """
+
+ default_rel_threshold = -0.1
+    default_abs_threshold = 1.0
+
+ def __init__(
+ self,
+ models: List[FaceLandmarksModelBase],
+ dataloaders: List[DataIteratorBase],
+ metrics: Optional[List[Metric]] = None,
+ dataloader_ref: Optional[DataIteratorBase] = None,
+ ):
+ """
+ Initializes a Report instance.
+
+ Args:
+ models (List[FaceLandmarksModelBase]): List of face landmarks models.
+ dataloaders (List[DataIteratorBase]): List of data loaders for testing.
+ metrics (Optional[List[Metric]]): List of metrics to evaluate (default is NMEMean).
+ dataloader_ref (Optional[DataIteratorBase]): Reference data loader for comparative tests.
+
+ """
+ test = Test if dataloader_ref is None else TestDiff
+ threshold = self.default_abs_threshold if dataloader_ref is None else self.default_rel_threshold
+ metrics = [NMEMean] if metrics is None else metrics
+
+ self.results = []
+ for model in models:
+ for dataloader in dataloaders:
+ run_kwargs = {"model": model, "dataloader": dataloader}
+ if dataloader_ref is not None:
+ run_kwargs["dataloader_ref"] = dataloader_ref
+ for metric in metrics:
+ self.results.append(test(metric=metric, threshold=threshold).run(**run_kwargs).to_dict())
+
+ def to_dataframe(self):
+ """
+ Converts the test results to a pandas DataFrame.
+
+ Returns:
+ pd.DataFrame: A DataFrame containing the test results.
+
+ """
+        # reorder columns for readability
+ return pd.DataFrame(self.results)[
+ [
+ "model",
+ "facial_part",
+ "dataloader",
+ "prediction_time",
+ "prediction_fail_rate",
+ "test",
+ "metric",
+ "metric_value",
+ "threshold",
+ "passed",
+ ]
+ ]
+
+ def to_json(self, filename: Optional[str] = None):
+ """
+        Writes the test results to a JSON Lines file, one JSON object per result.
+
+        Args:
+            filename (Optional[str]): Name of the output file. Defaults to "report-<uuid>.jsonl".
+
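+        Example:
+            >>> report.to_json("report.jsonl")
+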
+ """
+ import json
+
+ if filename is None:
+ import uuid
+
+ _uuid = str(uuid.uuid4())
+ filename = "report-{}.jsonl".format(_uuid)
+
+ with open(filename, "w") as jsonl_file:
+ for result in self.results:
+ jsonl_file.write(json.dumps(result) + "\n")
+
+ def adjust_thresholds(self, thresholds: Union[List[float], Dict[int, float]]):
+ """
+ Adjusts the thresholds for the tests.
+
+ Args:
+            thresholds (Union[List[float], Dict[int, float]]): New thresholds, either one per test
+                result or a mapping from result index to threshold.
+
+        Raises:
+            ValueError: If the length of a thresholds list does not match the number of test results.
+
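+        Example:
+            >>> report.adjust_thresholds({0: -0.9})  # tighten only the first test
+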
+ """
+        if isinstance(thresholds, list) and len(thresholds) != len(self.results):
+            raise ValueError(
+                f"{self.__class__.__name__}: adjust_thresholds expects either a List[float] with one threshold per "
+                f"test result (len(self.results) = {len(self.results)}), or a Dict[int, float] mapping test indices "
+                "to new thresholds."
+            )
+
+        if not isinstance(thresholds, dict):
+            thresholds = dict(enumerate(thresholds))
+
+ for idx, threshold in thresholds.items():
+ self.results[idx]["threshold"] = float(threshold)
+ self.results[idx]["passed"] = bool(self.results[idx]["metric_value"] <= threshold)
diff --git a/tests/landmark_detection/tests_and_metrics/test_report.py b/tests/landmark_detection/tests_and_metrics/test_report.py
new file mode 100644
index 00000000..476e132f
--- /dev/null
+++ b/tests/landmark_detection/tests_and_metrics/test_report.py
@@ -0,0 +1,30 @@
+import tempfile
+
+import numpy as np
+
+from giskard_vision.landmark_detection.tests.report import Report
+
+
+def test_report(opencv_model, dataset_300w):
+ models = [opencv_model]
+ dls = [dataset_300w]
+ dl_ref = dataset_300w
+
+ report = Report(models=models, dataloaders=dls)
+ assert report.results[0]["test"] == "Test"
+ assert report.results[0]["passed"]
+ assert np.allclose(report.results[0]["metric_value"], 0.04136279942)
+
+ report.adjust_thresholds({0: 0.03})
+ assert not report.results[0]["passed"]
+
+ report2 = Report(models=models, dataloaders=dls, dataloader_ref=dl_ref)
+ assert report2.results[0]["test"] == "TestDiff"
+
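+    # smoke-test the JSON Lines export; both calls reuse (and overwrite) the same temporary file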
+ with tempfile.NamedTemporaryFile() as f:
+ report.to_json(filename=f.name)
+ report2.to_json(filename=f.name)
+
+ df = report.to_dataframe()
+ df2 = report2.to_dataframe()
+ assert len(df) == len(df2) == 1