diff --git a/examples/landmark_detection/report.ipynb b/examples/landmark_detection/report.ipynb new file mode 100644 index 00000000..0b38d5ff --- /dev/null +++ b/examples/landmark_detection/report.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from face_alignment import FaceAlignment, LandmarksType\n", + "\n", + "from giskard_vision.landmark_detection.dataloaders.loaders import DataLoaderFFHQ, DataLoader300W\n", + "from giskard_vision.landmark_detection.dataloaders.wrappers import (\n", + " CroppedDataLoader,\n", + " ResizedDataLoader,\n", + " ColoredDataLoader,\n", + " BlurredDataLoader,\n", + " FilteredDataLoader,\n", + " HeadPoseDataLoader,\n", + " EthnicityDataLoader,\n", + " CachedDataLoader,\n", + ")\n", + "\n", + "from giskard_vision.landmark_detection.models.wrappers import OpenCVWrapper, FaceAlignmentWrapper\n", + "from giskard_vision.landmark_detection.tests.performance import NMEMean\n", + "from giskard_vision.landmark_detection.marks.facial_parts import FacialParts\n", + "from giskard_vision.landmark_detection.tests.report import Report" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dl_ref = DataLoader300W(dir_path=\"300W/sample\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-17 12:01:44.698306: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "# cropping\n", + "dl_cropped_left = CroppedDataLoader(dl_ref, part=FacialParts.LEFT_HALF.value)\n", + "dl_cropped_upper = CroppedDataLoader(dl_ref, part=FacialParts.UPPER_HALF.value)\n", + "\n", + "# resizing\n", + "dl_resized = ResizedDataLoader(dl_ref, scales=0.5)\n", + "\n", + "# coloring\n", + "dl_colored = ColoredDataLoader(dl_ref)\n", + "\n", + "# blurring\n", + "dl_blurred = BlurredDataLoader(dl_ref)\n", + "\n", + "\n", + "# head pose filtering\n", + "def positive_roll(elt):\n", + " return elt[2][\"headPose\"][\"roll\"] > 0\n", + "\n", + "\n", + "def negative_roll(elt):\n", + " return elt[2][\"headPose\"][\"roll\"] < 0\n", + "\n", + "\n", + "cached_dl = CachedDataLoader(HeadPoseDataLoader(dl_ref), cache_size=None, cache_img=False, cache_marks=False)\n", + "dl_positive_roll = FilteredDataLoader(cached_dl, positive_roll)\n", + "dl_negative_roll = FilteredDataLoader(cached_dl, negative_roll)\n", + "\n", + "\n", + "# ethnicity filtering\n", + "def white_ethnicity(elt):\n", + " return elt[2][\"ethnicity\"] == \"white\"\n", + "\n", + "\n", + "def latino_ethnicity(elt):\n", + " return elt[2][\"ethnicity\"] == \"latino hispanic\"\n", + "\n", + "\n", + "cached_dl = CachedDataLoader(\n", + " EthnicityDataLoader(dl_ref, ethnicity_map={\"indian\": \"asian\"}), cache_size=None, cache_img=False, cache_marks=False\n", + ")\n", + "dl_white = FilteredDataLoader(cached_dl, white_ethnicity)\n", + "dl_latino = FilteredDataLoader(cached_dl, latino_ethnicity)\n", + "\n", + "dataloaders_list = [\n", + " dl_cropped_left,\n", + " dl_cropped_upper,\n", + " dl_resized,\n", + " dl_colored,\n", + " dl_blurred,\n", + " dl_positive_roll,\n", + " dl_negative_roll,\n", + " dl_white,\n", + " dl_latino,\n", + "]" + 
] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading data from : lbfmodel.yaml\n" + ] + } + ], + "source": [ + "models_list = [\n", + " FaceAlignmentWrapper(model=FaceAlignment(LandmarksType.TWO_D, device=\"cpu\", flip_input=False)),\n", + " OpenCVWrapper(),\n", + "]\n", + "\n", + "models_list = [models_list[1]]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 1 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 2 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 3 and index 0.\n", + "OpenCVWrapper: Face not detected in processed image of batch 5 and index 0.\n" + ] + } + ], + "source": [ + "report = Report(models_list, dataloaders_list, dataloader_ref=dl_ref)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelfacial_partdataloaderprediction_timeprediction_fail_ratetestmetricmetric_valuethresholdpassed
0OpenCVleft half300W cropped on left half0.9298910.564706TestDiffNME_mean-0.644057-0.1True
1OpenCVupper half300W cropped on upper half0.9432960.682353TestDiffNME_mean0.040216-0.1False
2OpenCVentire face300W resizing with ratios: 0.50.9951430.000000TestDiffNME_mean-0.079876-0.1False
3OpenCVentire face300W altered with color mode 71.3164520.000000TestDiffNME_mean0.001347-0.1False
4OpenCVentire face300W blurred1.4831050.000000TestDiffNME_mean-0.103017-0.1True
5OpenCVentire face(Cached (300W) with head-pose) filtered using ...0.9740380.000000TestDiffNME_mean0.077927-0.1False
6OpenCVentire face(Cached (300W) with head-pose) filtered using ...2.0766610.000000TestDiffNME_mean-0.019482-0.1False
7OpenCVentire face(Cached (300W) with ethnicity) filtered using ...2.3893280.000000TestDiffNME_mean0.168421-0.1False
8OpenCVentire face(Cached (300W) with ethnicity) filtered using ...1.9451590.000000TestDiffNME_mean-0.784538-0.1True
\n", + "
" + ], + "text/plain": [ + " model facial_part dataloader \\\n", + "0 OpenCV left half 300W cropped on left half \n", + "1 OpenCV upper half 300W cropped on upper half \n", + "2 OpenCV entire face 300W resizing with ratios: 0.5 \n", + "3 OpenCV entire face 300W altered with color mode 7 \n", + "4 OpenCV entire face 300W blurred \n", + "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "\n", + " prediction_time prediction_fail_rate test metric metric_value \\\n", + "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n", + "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n", + "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n", + "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n", + "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n", + "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n", + "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n", + "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n", + "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n", + "\n", + " threshold passed \n", + "0 -0.1 True \n", + "1 -0.1 False \n", + "2 -0.1 False \n", + "3 -0.1 False \n", + "4 -0.1 True \n", + "5 -0.1 False \n", + "6 -0.1 False \n", + "7 -0.1 False \n", + "8 -0.1 True " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelfacial_partdataloaderprediction_timeprediction_fail_ratetestmetricmetric_valuethresholdpassed
0OpenCVleft half300W cropped on left half0.9298910.564706TestDiffNME_mean-0.644057-0.9False
1OpenCVupper half300W cropped on upper half0.9432960.682353TestDiffNME_mean0.040216-0.1False
2OpenCVentire face300W resizing with ratios: 0.50.9951430.000000TestDiffNME_mean-0.079876-0.1False
3OpenCVentire face300W altered with color mode 71.3164520.000000TestDiffNME_mean0.001347-0.1False
4OpenCVentire face300W blurred1.4831050.000000TestDiffNME_mean-0.103017-0.1True
5OpenCVentire face(Cached (300W) with head-pose) filtered using ...0.9740380.000000TestDiffNME_mean0.077927-0.1False
6OpenCVentire face(Cached (300W) with head-pose) filtered using ...2.0766610.000000TestDiffNME_mean-0.019482-0.1False
7OpenCVentire face(Cached (300W) with ethnicity) filtered using ...2.3893280.000000TestDiffNME_mean0.168421-0.1False
8OpenCVentire face(Cached (300W) with ethnicity) filtered using ...1.9451590.000000TestDiffNME_mean-0.784538-0.1True
\n", + "
" + ], + "text/plain": [ + " model facial_part dataloader \\\n", + "0 OpenCV left half 300W cropped on left half \n", + "1 OpenCV upper half 300W cropped on upper half \n", + "2 OpenCV entire face 300W resizing with ratios: 0.5 \n", + "3 OpenCV entire face 300W altered with color mode 7 \n", + "4 OpenCV entire face 300W blurred \n", + "5 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "6 OpenCV entire face (Cached (300W) with head-pose) filtered using ... \n", + "7 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "8 OpenCV entire face (Cached (300W) with ethnicity) filtered using ... \n", + "\n", + " prediction_time prediction_fail_rate test metric metric_value \\\n", + "0 0.929891 0.564706 TestDiff NME_mean -0.644057 \n", + "1 0.943296 0.682353 TestDiff NME_mean 0.040216 \n", + "2 0.995143 0.000000 TestDiff NME_mean -0.079876 \n", + "3 1.316452 0.000000 TestDiff NME_mean 0.001347 \n", + "4 1.483105 0.000000 TestDiff NME_mean -0.103017 \n", + "5 0.974038 0.000000 TestDiff NME_mean 0.077927 \n", + "6 2.076661 0.000000 TestDiff NME_mean -0.019482 \n", + "7 2.389328 0.000000 TestDiff NME_mean 0.168421 \n", + "8 1.945159 0.000000 TestDiff NME_mean -0.784538 \n", + "\n", + " threshold passed \n", + "0 -0.9 False \n", + "1 -0.1 False \n", + "2 -0.1 False \n", + "3 -0.1 False \n", + "4 -0.1 True \n", + "5 -0.1 False \n", + "6 -0.1 False \n", + "7 -0.1 False \n", + "8 -0.1 True " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report.adjust_thresholds({0: -0.9})\n", + "report.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "\n", + "with tempfile.NamedTemporaryFile() as f:\n", + " report.to_json(filename=f.name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/giskard_vision/landmark_detection/tests/base.py b/giskard_vision/landmark_detection/tests/base.py index 27b13c29..78ae8309 100644 --- a/giskard_vision/landmark_detection/tests/base.py +++ b/giskard_vision/landmark_detection/tests/base.py @@ -233,7 +233,7 @@ def run( metric_value=metric_value, threshold=self.threshold, prediction_results=[prediction_result], - passed=metric_value <= self.threshold, + passed=bool(metric_value <= self.threshold), # casting is important for json dumping prediction_time=prediction_result.prediction_time, prediction_fail_rate=prediction_result.prediction_fail_rate, facial_part=facial_part, @@ -305,7 +305,7 @@ def run( metric_value=metric_value, threshold=self.threshold, prediction_results=prediction_results, - passed=abs(metric_value) <= self.threshold, + passed=bool(metric_value <= self.threshold), # casting is important for json dumping prediction_time=prediction_time, prediction_fail_rate=prediction_fail_rate, facial_part=facial_part, diff --git a/giskard_vision/landmark_detection/tests/report.py b/giskard_vision/landmark_detection/tests/report.py new file mode 100644 index 00000000..3eb69b3a --- /dev/null +++ b/giskard_vision/landmark_detection/tests/report.py @@ -0,0 +1,121 @@ +from typing import Dict, List, 
Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from giskard_vision.landmark_detection.dataloaders.base import DataIteratorBase
+from giskard_vision.landmark_detection.models.base import FaceLandmarksModelBase
+from giskard_vision.landmark_detection.tests.base import Metric, Test, TestDiff
+from giskard_vision.landmark_detection.tests.performance import NMEMean
+
+
+class Report:
+    """
+    A class for generating and managing test reports for landmark detection models.
+
+    Attributes:
+        default_rel_threshold (float): Default relative threshold.
+        default_abs_threshold (float): Default absolute threshold.
+
+    """
+
+    default_rel_threshold = -0.1
+    default_abs_threshold = 1
+
+    def __init__(
+        self,
+        models: List[FaceLandmarksModelBase],
+        dataloaders: List[DataIteratorBase],
+        metrics: Optional[List[Metric]] = None,
+        dataloader_ref: Optional[DataIteratorBase] = None,
+    ):
+        """
+        Initializes a Report instance.
+
+        Args:
+            models (List[FaceLandmarksModelBase]): List of face landmarks models.
+            dataloaders (List[DataIteratorBase]): List of data loaders for testing.
+            metrics (Optional[List[Metric]]): List of metrics to evaluate (default is NMEMean).
+            dataloader_ref (Optional[DataIteratorBase]): Reference data loader for comparative tests.
+
+        """
+        test = Test if dataloader_ref is None else TestDiff
+        threshold = self.default_abs_threshold if dataloader_ref is None else self.default_rel_threshold
+        metrics = [NMEMean] if metrics is None else metrics
+
+        self.results = []
+        for model in models:
+            for dataloader in dataloaders:
+                run_kwargs = {"model": model, "dataloader": dataloader}
+                if dataloader_ref is not None:
+                    run_kwargs["dataloader_ref"] = dataloader_ref
+                for metric in metrics:
+                    self.results.append(test(metric=metric, threshold=threshold).run(**run_kwargs).to_dict())
+
+    def to_dataframe(self):
+        """
+        Converts the test results to a pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing the test results.
+
+        """
+        # columns reordering
+        return pd.DataFrame(self.results)[
+            [
+                "model",
+                "facial_part",
+                "dataloader",
+                "prediction_time",
+                "prediction_fail_rate",
+                "test",
+                "metric",
+                "metric_value",
+                "threshold",
+                "passed",
+            ]
+        ]
+
+    def to_json(self, filename: Optional[str] = None):
+        """
+        Writes the test results to a JSON Lines file, one JSON object per result.
+
+        Args:
+            filename (Optional[str]): Name of the output file (default is generated with a unique identifier).
+
+        """
+        import json
+
+        if filename is None:
+            import uuid
+
+            _uuid = str(uuid.uuid4())
+            filename = "report-{}.jsonl".format(_uuid)
+
+        with open(filename, "w") as jsonl_file:
+            for result in self.results:
+                jsonl_file.write(json.dumps(result) + "\n")
+
+    def adjust_thresholds(self, thresholds: Union[List[float], Dict[int, float]]):
+        """
+        Adjusts the thresholds for the tests.
+
+        Args:
+            thresholds (Union[List[float], Dict[int, float]]): Threshold values for the tests.
+
+        Raises:
+            ValueError: If the length of the thresholds list does not match the number of test results.
+
+        """
+        if isinstance(thresholds, list) and len(thresholds) != len(self.results):
+            raise ValueError(
+                f"{self.__class__.__name__}.adjust_thresholds accepts either a List[float] with one threshold per test result (len(self.results) = {len(self.results)}) or a Dict[int, float] mapping test indices to thresholds."
+ ) + + if not isinstance(thresholds, dict): + thresholds = list(thresholds) + thresholds = dict(zip(np.arange(len(thresholds)), thresholds)) + + for idx, threshold in thresholds.items(): + self.results[idx]["threshold"] = float(threshold) + self.results[idx]["passed"] = bool(self.results[idx]["metric_value"] <= threshold) diff --git a/tests/landmark_detection/tests_and_metrics/test_report.py b/tests/landmark_detection/tests_and_metrics/test_report.py new file mode 100644 index 00000000..476e132f --- /dev/null +++ b/tests/landmark_detection/tests_and_metrics/test_report.py @@ -0,0 +1,30 @@ +import tempfile + +import numpy as np + +from giskard_vision.landmark_detection.tests.report import Report + + +def test_report(opencv_model, dataset_300w): + models = [opencv_model] + dls = [dataset_300w] + dl_ref = dataset_300w + + report = Report(models=models, dataloaders=dls) + assert report.results[0]["test"] == "Test" + assert report.results[0]["passed"] + assert np.allclose(report.results[0]["metric_value"], 0.04136279942) + + report.adjust_thresholds({0: 0.03}) + assert not report.results[0]["passed"] + + report2 = Report(models=models, dataloaders=dls, dataloader_ref=dl_ref) + assert report2.results[0]["test"] == "TestDiff" + + with tempfile.NamedTemporaryFile() as f: + report.to_json(filename=f.name) + report2.to_json(filename=f.name) + + df = report.to_dataframe() + df2 = report2.to_dataframe() + assert len(df) == len(df2) == 1