diff --git a/examples/landmark_detection/report.ipynb b/examples/landmark_detection/report.ipynb new file mode 100644 index 00000000..0b38d5ff --- /dev/null +++ b/examples/landmark_detection/report.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from face_alignment import FaceAlignment, LandmarksType\n", + "\n", + "from giskard_vision.landmark_detection.dataloaders.loaders import DataLoaderFFHQ, DataLoader300W\n", + "from giskard_vision.landmark_detection.dataloaders.wrappers import (\n", + " CroppedDataLoader,\n", + " ResizedDataLoader,\n", + " ColoredDataLoader,\n", + " BlurredDataLoader,\n", + " FilteredDataLoader,\n", + " HeadPoseDataLoader,\n", + " EthnicityDataLoader,\n", + " CachedDataLoader,\n", + ")\n", + "\n", + "from giskard_vision.landmark_detection.models.wrappers import OpenCVWrapper, FaceAlignmentWrapper\n", + "from giskard_vision.landmark_detection.tests.performance import NMEMean\n", + "from giskard_vision.landmark_detection.marks.facial_parts import FacialParts\n", + "from giskard_vision.landmark_detection.tests.report import Report" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dl_ref = DataLoader300W(dir_path=\"300W/sample\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-17 12:01:44.698306: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "# cropping\n", + "dl_cropped_left = CroppedDataLoader(dl_ref, part=FacialParts.LEFT_HALF.value)\n", + "dl_cropped_upper = CroppedDataLoader(dl_ref, part=FacialParts.UPPER_HALF.value)\n", + "\n", + "# resizing\n", + "dl_resized = ResizedDataLoader(dl_ref, scales=0.5)\n", + "\n", + "# coloring\n", + "dl_colored = ColoredDataLoader(dl_ref)\n", + "\n", + "# blurring\n", + "dl_blurred = BlurredDataLoader(dl_ref)\n", + "\n", + "\n", + "# head pose filtering\n", + "def positive_roll(elt):\n", + " return elt[2][\"headPose\"][\"roll\"] > 0\n", + "\n", + "\n", + "def negative_roll(elt):\n", + " return elt[2][\"headPose\"][\"roll\"] < 0\n", + "\n", + "\n", + "cached_dl = CachedDataLoader(HeadPoseDataLoader(dl_ref), cache_size=None, cache_img=False, cache_marks=False)\n", + "dl_positive_roll = FilteredDataLoader(cached_dl, positive_roll)\n", + "dl_negative_roll = FilteredDataLoader(cached_dl, negative_roll)\n", + "\n", + "\n", + "# ethnicity filtering\n", + "def white_ethnicity(elt):\n", + " return elt[2][\"ethnicity\"] == \"white\"\n", + "\n", + "\n", + "def latino_ethnicity(elt):\n", + " return elt[2][\"ethnicity\"] == \"latino hispanic\"\n", + "\n", + "\n", + "cached_dl = CachedDataLoader(\n", + " EthnicityDataLoader(dl_ref, ethnicity_map={\"indian\": \"asian\"}), cache_size=None, cache_img=False, cache_marks=False\n", + ")\n", + "dl_white = FilteredDataLoader(cached_dl, white_ethnicity)\n", + "dl_latino = FilteredDataLoader(cached_dl, latino_ethnicity)\n", + "\n", + "dataloaders_list = [\n", + " dl_cropped_left,\n", + " dl_cropped_upper,\n", + " dl_resized,\n", + " dl_colored,\n", + " dl_blurred,\n", + " dl_positive_roll,\n", + " dl_negative_roll,\n", + " dl_white,\n", + " dl_latino,\n", + "]" + 
] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading data from : lbfmodel.yaml\n" + ] + } + ], + "source": [ + "models_list = [\n", + " FaceAlignmentWrapper(model=FaceAlignment(LandmarksType.TWO_D, device=\"cpu\", flip_input=False)),\n", + " OpenCVWrapper(),\n", + "]\n", + "\n", + "models_list = [models_list[1]]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 2 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 3 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n" + ] + } + ], + "source": [ + "report = Report(models_list, dataloaders_list, dataloader_ref=dl_ref)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelfacial_partdataloaderprediction_timeprediction_fail_ratetestmetricmetric_valuethresholdpassed
0OpenCVleft half300W cropped on left half0.9298910.564706TestDiffNME_mean-0.644057-0.1True
1OpenCVupper half300W cropped on upper half0.9432960.682353TestDiffNME_mean0.040216-0.1False
2OpenCVentire face300W resizing with ratios: 0.50.9951430.000000TestDiffNME_mean-0.079876-0.1False
3OpenCVentire face300W altered with color mode 71.3164520.000000TestDiffNME_mean0.001347-0.1False
4OpenCVentire face300W blurred1.4831050.000000TestDiffNME_mean-0.103017-0.1True
5OpenCVentire face(Cached (300W) with head-pose) filtered using ...0.9740380.000000TestDiffNME_mean0.077927-0.1False
6OpenCVentire face(Cached (300W) with head-pose) filtered using ...2.0766610.000000TestDiffNME_mean-0.019482-0.1False
7OpenCVentire face(Cached (300W) with ethnicity) filtered using ...2.3893280.000000TestDiffNME_mean0.168421-0.1False
8OpenCVentire face(Cached (300W) with ethnicity) filtered using ...1.9451590.000000TestDiffNME_mean-0.784538-0.1True
\n", + "
" + ], + "text/plain": [ + " model facial_part dataloader \\\n", + "0 OpenCV left half 300W cropped on left half \n", + "1 OpenCV upper half 300W cropped on upper half \n", + "2 OpenCV entire face 300W resizing with ratios: 0.5 \n", + "3 OpenCV entire face 300W altered with color mode 7 \n", + "4 OpenCV entire face 300W blurred \n", + "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "\n", + " prediction_time prediction_fail_rate test metric metric_value \\\n", + "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n", + "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n", + "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n", + "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n", + "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n", + "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n", + "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n", + "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n", + "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n", + "\n", + " threshold passed \n", + "0 -0.1 True \n", + "1 -0.1 False \n", + "2 -0.1 False \n", + "3 -0.1 False \n", + "4 -0.1 True \n", + "5 -0.1 False \n", + "6 -0.1 False \n", + "7 -0.1 False \n", + "8 -0.1 True " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelfacial_partdataloaderprediction_timeprediction_fail_ratetestmetricmetric_valuethresholdpassed
0OpenCVleft half300W cropped on left half0.9298910.564706TestDiffNME_mean-0.644057-0.9False
1OpenCVupper half300W cropped on upper half0.9432960.682353TestDiffNME_mean0.040216-0.1False
2OpenCVentire face300W resizing with ratios: 0.50.9951430.000000TestDiffNME_mean-0.079876-0.1False
3OpenCVentire face300W altered with color mode 71.3164520.000000TestDiffNME_mean0.001347-0.1False
4OpenCVentire face300W blurred1.4831050.000000TestDiffNME_mean-0.103017-0.1True
5OpenCVentire face(Cached (300W) with head-pose) filtered using ...0.9740380.000000TestDiffNME_mean0.077927-0.1False
6OpenCVentire face(Cached (300W) with head-pose) filtered using ...2.0766610.000000TestDiffNME_mean-0.019482-0.1False
7OpenCVentire face(Cached (300W) with ethnicity) filtered using ...2.3893280.000000TestDiffNME_mean0.168421-0.1False
8OpenCVentire face(Cached (300W) with ethnicity) filtered using ...1.9451590.000000TestDiffNME_mean-0.784538-0.1True
\n", + "
" + ], + "text/plain": [ + " model facial_part dataloader \\\n", + "0 OpenCV left half 300W cropped on left half \n", + "1 OpenCV upper half 300W cropped on upper half \n", + "2 OpenCV entire face 300W resizing with ratios: 0.5 \n", + "3 OpenCV entire face 300W altered with color mode 7 \n", + "4 OpenCV entire face 300W blurred \n", + "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "\n", + " prediction_time prediction_fail_rate test metric metric_value \\\n", + "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n", + "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n", + "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n", + "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n", + "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n", + "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n", + "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n", + "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n", + "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n", + "\n", + " threshold passed \n", + "0 -0.9 False \n", + "1 -0.1 False \n", + "2 -0.1 False \n", + "3 -0.1 False \n", + "4 -0.1 True \n", + "5 -0.1 False \n", + "6 -0.1 False \n", + "7 -0.1 False \n", + "8 -0.1 True " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report.adjust_thresholds({0: -0.9})\n", + "report.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "\n", + "with tempfile.NamedTemporaryFile() as f:\n", + " report.to_json(filename=f.name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/giskard_vision/landmark_detection/tests/base.py b/giskard_vision/landmark_detection/tests/base.py index 27b13c29..78ae8309 100644 --- a/giskard_vision/landmark_detection/tests/base.py +++ b/giskard_vision/landmark_detection/tests/base.py @@ -233,7 +233,7 @@ def run( metric_value=metric_value, threshold=self.threshold, prediction_results=[prediction_result], - passed=metric_value <= self.threshold, + passed=bool(metric_value <= self.threshold), # casting is important for json dumping prediction_time=prediction_result.prediction_time, prediction_fail_rate=prediction_result.prediction_fail_rate, facial_part=facial_part, @@ -305,7 +305,7 @@ def run( metric_value=metric_value, threshold=self.threshold, prediction_results=prediction_results, - passed=abs(metric_value) <= self.threshold, + passed=bool(metric_value <= self.threshold), # casting is important for json dumping prediction_time=prediction_time, prediction_fail_rate=prediction_fail_rate, facial_part=facial_part, diff --git a/giskard_vision/landmark_detection/tests/report.py b/giskard_vision/landmark_detection/tests/report.py new file mode 100644 index 00000000..3eb69b3a --- /dev/null +++ b/giskard_vision/landmark_detection/tests/report.py @@ -0,0 +1,121 @@ +from typing import Dict, List, 
Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from giskard_vision.landmark_detection.dataloaders.base import DataIteratorBase
+from giskard_vision.landmark_detection.models.base import FaceLandmarksModelBase
+from giskard_vision.landmark_detection.tests.base import Metric, Test, TestDiff
+from giskard_vision.landmark_detection.tests.performance import NMEMean
+
+
+class Report:
+    """
+    A class for generating and managing test reports for landmark detection models.
+
+    Attributes:
+        default_rel_threshold (float): Default relative threshold.
+        default_abs_threshold (float): Default absolute threshold.
+
+    """
+
+    default_rel_threshold = -0.1
+    default_abs_threshold = 1
+
+    def __init__(
+        self,
+        models: List[FaceLandmarksModelBase],
+        dataloaders: List[DataIteratorBase],
+        metrics: Optional[List[Metric]] = None,
+        dataloader_ref: Optional[DataIteratorBase] = None,
+    ):
+        """
+        Initializes a Report instance.
+
+        Args:
+            models (List[FaceLandmarksModelBase]): List of face landmarks models.
+            dataloaders (List[DataIteratorBase]): List of data loaders for testing.
+            metrics (Optional[List[Metric]]): List of metrics to evaluate (default is NMEMean).
+            dataloader_ref (Optional[DataIteratorBase]): Reference data loader for comparative tests.
+
+        """
+        test = Test if dataloader_ref is None else TestDiff
+        threshold = self.default_abs_threshold if dataloader_ref is None else self.default_rel_threshold
+        metrics = [NMEMean] if metrics is None else metrics
+
+        self.results = []
+        for model in models:
+            for dataloader in dataloaders:
+                run_kwargs = {"model": model, "dataloader": dataloader}
+                if dataloader_ref is not None:
+                    run_kwargs["dataloader_ref"] = dataloader_ref
+                for metric in metrics:
+                    self.results.append(test(metric=metric, threshold=threshold).run(**run_kwargs).to_dict())
+
+    def to_dataframe(self):
+        """
+        Converts the test results to a pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing the test results.
+
+        """
+        # columns reordering
+        return pd.DataFrame(self.results)[
+            [
+                "model",
+                "facial_part",
+                "dataloader",
+                "prediction_time",
+                "prediction_fail_rate",
+                "test",
+                "metric",
+                "metric_value",
+                "threshold",
+                "passed",
+            ]
+        ]
+
+    def to_json(self, filename: Optional[str] = None):
+        """
+        Writes the test results to a JSON Lines file, one JSON object per result.
+
+        Args:
+            filename (Optional[str]): Name of the output file (default is generated with a unique identifier).
+
+        """
+        import json
+
+        if filename is None:
+            import uuid
+
+            _uuid = str(uuid.uuid4())
+            filename = "report-{}.jsonl".format(_uuid)
+
+        with open(filename, "w") as jsonl_file:
+            for result in self.results:
+                jsonl_file.write(json.dumps(result) + "\n")
+
+    def adjust_thresholds(self, thresholds: Union[List[float], Dict[int, float]]):
+        """
+        Adjusts the thresholds for the tests.
+
+        Args:
+            thresholds (Union[List[float], Dict[int, float]]): Threshold values for the tests.
+
+        Raises:
+            ValueError: If the length of the thresholds list does not match the number of test results.
+
+        """
+        if isinstance(thresholds, list) and len(thresholds) != len(self.results):
+            raise ValueError(
+                f"{self.__class__.__name__}.adjust_thresholds accepts either a List[float] with one threshold per test result (len(self.results) = {len(self.results)}) or a Dict[int, float] mapping test indices to thresholds."
+ ) + + if not isinstance(thresholds, dict): + thresholds = list(thresholds) + thresholds = dict(zip(np.arange(len(thresholds)), thresholds)) + + for idx, threshold in thresholds.items(): + self.results[idx]["threshold"] = float(threshold) + self.results[idx]["passed"] = bool(self.results[idx]["metric_value"] <= threshold) diff --git a/tests/landmark_detection/tests_and_metrics/test_report.py b/tests/landmark_detection/tests_and_metrics/test_report.py new file mode 100644 index 00000000..476e132f --- /dev/null +++ b/tests/landmark_detection/tests_and_metrics/test_report.py @@ -0,0 +1,30 @@ +import tempfile + +import numpy as np + +from giskard_vision.landmark_detection.tests.report import Report + + +def test_report(opencv_model, dataset_300w): + models = [opencv_model] + dls = [dataset_300w] + dl_ref = dataset_300w + + report = Report(models=models, dataloaders=dls) + assert report.results[0]["test"] == "Test" + assert report.results[0]["passed"] + assert np.allclose(report.results[0]["metric_value"], 0.04136279942) + + report.adjust_thresholds({0: 0.03}) + assert not report.results[0]["passed"] + + report2 = Report(models=models, dataloaders=dls, dataloader_ref=dl_ref) + assert report2.results[0]["test"] == "TestDiff" + + with tempfile.NamedTemporaryFile() as f: + report.to_json(filename=f.name) + report2.to_json(filename=f.name) + + df = report.to_dataframe() + df2 = report2.to_dataframe() + assert len(df) == len(df2) == 1