diff --git a/analytics/app/data/transform.py b/analytics/app/data/transform.py
index d71179606..72c317b8c 100644
--- a/analytics/app/data/transform.py
+++ b/analytics/app/data/transform.py
@@ -23,6 +23,33 @@
 # -------------------------------------------------------------------------------------------------------------------- #
 
 
+def pipeline_leaf_times_df(
+    logs: PipelineLogs,
+    *,
+    use_traintime_patch_at_trainer: bool,
+    pipeline_id: str = "pipeline",
+) -> pd.DataFrame:
+    """Return the leaf-stage rows of the pipeline's log dataframe.
+
+    If `use_traintime_patch_at_trainer` is set, the `duration` of every TRAIN row is replaced
+    by the `train_time_at_trainer` value (ms, converted to s) of the same `trigger_idx`.
+    """
+    leaf_ids = leaf_stages(logs)
+    df_all = logs_dataframe(logs, f"pipeline_{pipeline_id}")
+    df_leaves = df_all[df_all["id"].isin(leaf_ids)]
+    if not use_traintime_patch_at_trainer:
+        return df_leaves
+
+    is_train = df_leaves["id"] == PipelineStage.TRAIN.name
+    df_train, df_rest = df_leaves[is_train], df_leaves[~is_train]
+    train_runs = (run for run in logs.supervisor_logs.stage_runs if run.id == PipelineStage.TRAIN.name)
+    df_trainings = StageLog.df(train_runs, extended=True)
+    df_patched = df_train.merge(df_trainings, on="trigger_idx", how="inner", suffixes=("", "_training"))
+    assert df_patched.shape[0] == df_train.shape[0] == df_trainings.shape[0]
+    df_patched["duration"] = df_patched["train_time_at_trainer"] / 1000.0  # ms to s
+    return pd.concat([df_patched[df_train.columns], df_rest])
+
+
 def logs_dataframe(logs: PipelineLogs, pipeline_ref: str = "pipeline") -> pd.DataFrame:
     df = logs.supervisor_logs.df
     df["pipeline_ref"] = pipeline_ref
diff --git a/analytics/plotting/common/color.py b/analytics/plotting/common/color.py
new file mode 100644
index 000000000..14b864026
--- /dev/null
+++ b/analytics/plotting/common/color.py
@@ -0,0 +1,72 @@
+from typing import Any
+
+import numpy as np
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.colors import LinearSegmentedColormap
+
+
+def get_rdbu_wo_white(
+    palette: str = "RdBu",
+    strip: tuple[float, float] | None = (0.35, 0.65),
+    nvalues: int = 100,
+) -> LinearSegmentedColormap | str:
+    """Return `palette` without the band between `strip[0]` and `strip[1]` (just its name if `strip` is None)."""
+    if strip is None:
+        return palette
+
+    # sample nvalues // 2 colors on either side of the removed band and join them into one colormap
+    half, base_cmap = nvalues // 2, plt.get_cmap(palette)
+    below = base_cmap(np.linspace(0.0, strip[0], half))
+    above = base_cmap(np.linspace(strip[1], 1.0, half))
+    return LinearSegmentedColormap.from_list("truncated", np.concatenate([below, above]))
+
+
+def gen_categorical_map(categories: list) -> dict[Any, tuple[float, float, float]]:
+    """Assign each category one color.
+
+    Colors are taken, in order, from seaborn's "bright", "dark", "colorblind" and "pastel"
+    palettes; "Paired" is appended 100 times so that long category lists still get a color.
+    """
+    leading_palettes = ("bright", "dark", "colorblind", "pastel")
+    colors = [rgb for name in leading_palettes for rgb in sns.color_palette(name)]
+    colors += sns.color_palette("Paired") * 100
+    return dict(zip(categories, colors[: len(categories)]))
+
+
+def discrete_colors(n: int = 10) -> list[tuple[float, float, float]]:
+    return sns.color_palette("RdBu", n)  # seaborn's "RdBu" palette with n colors
+
+
+def discrete_color(i: int, n: int = 10) -> tuple[float, float, float]:
+    """Return color `i` of the `n`-color palette from `discrete_colors`; `i` wraps around modulo `n`."""
+    return discrete_colors(n)[i % n]
+
+
+def main_colors(light: bool = False) -> list[tuple[float, float, float]]:
+    """Return the seven main plot colors.
+
+    The first two entries come from the 10-color "RdBu" palette, the remaining five
+    from seaborn's 10-color "colorblind" palette.
+
+    Args:
+        light: pick the alternative ("light") set of palette positions.
+
+    Returns:
+        List of seven RGB tuples.
+    """
+    rdbu = discrete_colors(10)
+    colorblind = sns.color_palette("colorblind", 10)
+
+    # positions inside the two 10-color palettes, per variant
+    if light:
+        rdbu_picks, colorblind_picks = (-2, 2), (-2, 1, 2, 3, 4)
+    else:
+        rdbu_picks, colorblind_picks = (-1, 1), (-2, 1, 2, 4, 5)
+
+    picked = [rdbu[k] for k in rdbu_picks]
+    return picked + [colorblind[k] for k in colorblind_picks]
+
+
+def main_color(i: int, light: bool = False) -> tuple[float, float, float]:
+    return main_colors(light=light)[i]  # plain indexing into the 7 main colors, no wrap-around
diff --git a/analytics/plotting/common/const.py b/analytics/plotting/common/const.py
new file mode 100644
index 000000000..d6aa2cbf1
--- /dev/null
+++ b/analytics/plotting/common/const.py
@@ -0,0 +1,2 @@
+DOUBLE_FIG_WIDTH = 10  # base figure width; multiplied by width_factor and passed to figsize
+DOUBLE_FIG_HEIGHT = 3.5  # base figure height; the plot helpers pass 2x this (times height_factor) to figsize
diff --git a/analytics/plotting/common/cost_matrix.py b/analytics/plotting/common/cost_matrix.py
new file mode 100644
index 000000000..c7feb0d59
--- /dev/null
+++ b/analytics/plotting/common/cost_matrix.py
@@ -0,0 +1,185 @@
+import matplotlib.dates as mdates
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from matplotlib.ticker import MaxNLocator
+
+# project-local plotting helpers
+from analytics.plotting.common.common import init_plot
+from analytics.plotting.common.const import DOUBLE_FIG_HEIGHT, DOUBLE_FIG_WIDTH
+from analytics.plotting.common.font import setup_font
+
+
+def plot_cost_matrix(
+    df_costs: pd.DataFrame,
+    pipeline_ids: list[int],
+    grid_alpha: float = 0.0,
+    title_map: dict[int, str] | None = None,
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    duration_ylabel: str = "Duration (sec.)",
+    cumulative_ylabel: str = "Cumulative Duration (sec.)",
+    x_label: str = "Sample Timestamp",
+    x_lim: tuple[int, int] = (1930, 2013),
+    x_ticks: list[int] | None = None,
+    x_date_locator: mdates.DateLocator | None = None,
+    x_date_formatter: mdates.DateFormatter | None = None,
+    y_lim: tuple[int, int] = (0, 4000),
+    y_lim_cumulative: tuple[int, int] = (0, 4000),
+    y_ticks: list[int] | None = None,
+    y_ticks_cumulative: list[int] | None = None,
+    y_minutes: bool = False,
+    y_minutes_cumulative: bool = False,
+) -> Figure | Axes:
+    """Plot stacked cost histograms per pipeline: durations (left) and cumulative durations (right).
+
+    One subplot row is drawn per entry of `pipeline_ids`. Bars are stacked by stage `id` and
+    weighted with `duration`; the stacking order of the stages is shared by all subplots.
+
+    DataFrame columns:
+        pipeline_ref: f"pipeline_{pipeline_id}" of the pipeline a row belongs to
+        id: supervisor leaf stage id
+        sample_time_year: sample year when this cost was recorded
+        duration: cost of the pipeline at that time
+
+    Args:
+        df_costs: cost records of all pipelines (columns see above).
+        pipeline_ids: pipelines to plot, one subplot row each; must not be empty.
+        grid_alpha: opacity of the dashed grid lines (0.0 hides them).
+        title_map: optional subplot title per pipeline id; if given (and non-empty) it must
+            contain every id in `pipeline_ids`.
+        height_factor: scaling factor for the figure height.
+        width_factor: scaling factor for the figure width.
+        duration_ylabel: y-axis label of the left (non-cumulative) column.
+        cumulative_ylabel: y-axis label of the right (cumulative) column.
+        x_label: x-axis label of every subplot.
+        x_lim: x-axis limits of every subplot.
+        x_ticks: explicit x tick positions (also used as labels); ignored if
+            `x_date_locator` is given.
+        x_date_locator: major locator for the x-axis.
+        x_date_formatter: major formatter for the x-axis; only used with `x_date_locator`.
+        y_lim: y-axis limits of the left column (skipped if falsy).
+        y_lim_cumulative: y-axis limits of the right column (skipped if falsy).
+        y_ticks: explicit y ticks of the left column; otherwise `MaxNLocator(nbins=4)` is used.
+        y_ticks_cumulative: same as `y_ticks`, for the right column.
+        y_minutes: divide the left column's durations by 60.
+        y_minutes_cumulative: divide the right column's durations by 60.
+
+    Note:
+        The number of histogram bins (85) and the stage-to-color mapping are hard-coded.
+        `y_lim` used to be ignored; it is now applied to the left column.
+
+    Returns:
+        The figure holding all subplots.
+    """
+    sns.set_theme(style="whitegrid")
+    init_plot()
+    setup_font(small_label=True, small_title=True, small_ticks=True)
+
+    # squeeze=False keeps `axs` two-dimensional even for a single pipeline
+    fig, axs = plt.subplots(
+        nrows=len(pipeline_ids),
+        ncols=2,
+        squeeze=False,
+        edgecolor="black",
+        frameon=True,
+        figsize=(
+            DOUBLE_FIG_WIDTH * width_factor,
+            2 * DOUBLE_FIG_HEIGHT * height_factor,
+        ),
+        dpi=600,
+    )
+
+    x_col = "sample_time_year"
+    y_col = "duration"
+    hue_col = "id"
+
+    palette = sns.color_palette("RdBu", 10)
+    stage_palette = {
+        "train": palette[0],
+        "inform remaining data": palette[-2],
+        "evaluate trigger policy": palette[2],
+        "inform trigger": palette[-1],
+        "store trained model": palette[1],
+    }
+
+    # use the summed cost over all pipelines to get a stage order that is consistent across subplots
+    df_agg = df_costs.groupby([hue_col]).agg({y_col: "sum"}).reset_index()
+    df_agg = df_agg.sort_values(y_col, ascending=False)
+    categories = df_agg[hue_col].unique()
+
+    # only the cumulative subplot of the first pipeline carries the legend
+    legend_tuple = (pipeline_ids[0], True)
+
+    for row, pipeline_id in enumerate(pipeline_ids):
+        df_costs_pipeline = df_costs[df_costs["pipeline_ref"] == f"pipeline_{pipeline_id}"]
+
+        for cumulative in [False, True]:
+            df_final = df_costs_pipeline.copy()
+            if y_minutes_cumulative if cumulative else y_minutes:
+                df_final[y_col] = df_final[y_col] / 60  # seconds to minutes
+
+            show_legend = legend_tuple == (pipeline_id, cumulative)
+            h = sns.histplot(
+                df_final,
+                x=x_col,
+                weights=y_col,
+                bins=2014 - 1930 + 1,  # NOTE: fixed bin count, not derived from x_lim
+                cumulative=cumulative,
+                multiple="stack",
+                linewidth=0,  # remove white edges between bars
+                shrink=1.0,  # ensure bars touch each other
+                alpha=1.0,  # remove transparency
+                hue=hue_col,
+                hue_order=categories,
+                palette=stage_palette,
+                ax=axs[row, int(cumulative)],
+                legend=show_legend,
+                zorder=-2,
+            )
+
+            # rasterize the bars to avoid anti-aliasing artifacts between them
+            for bar in h.patches:
+                bar.set_rasterized(True)
+
+            h.grid(axis="y", linestyle="--", alpha=grid_alpha, zorder=3, color="lightgray")
+            h.grid(axis="x", linestyle="--", alpha=grid_alpha, zorder=3, color="lightgray")
+
+            if title_map:
+                h.set_title(title_map[pipeline_id])
+
+            h.set(xlim=x_lim)
+            h.set_xlabel(x_label, labelpad=10)
+
+            if x_date_locator:
+                h.xaxis.set_major_locator(x_date_locator)
+                if x_date_formatter is not None:
+                    h.xaxis.set_major_formatter(x_date_formatter)
+                # target this subplot; plt.xticks() would only touch pyplot's current axes
+                h.tick_params(axis="x", rotation=0)
+            elif x_ticks is not None:
+                h.set_xticks(ticks=x_ticks, labels=x_ticks, rotation=0)
+
+            if cumulative:
+                h.set_ylabel(cumulative_ylabel, labelpad=20)
+                lim, ticks = y_lim_cumulative, y_ticks_cumulative
+            else:
+                h.set_ylabel(duration_ylabel, labelpad=20)
+                lim, ticks = y_lim, y_ticks  # y_lim used to be silently ignored
+            if lim:
+                h.set(ylim=lim)
+            if ticks:
+                h.set_yticks(ticks=ticks, labels=ticks, rotation=0)
+            else:
+                h.yaxis.set_major_locator(MaxNLocator(nbins=4))
+
+            # get_legend() returns None when no legend was drawn on this subplot
+            legend = h.get_legend() if show_legend else None
+            if legend is not None:
+                legend.set_title("")  # remove the hue title
+
+    plt.tight_layout()
+
+    return fig
diff --git a/analytics/plotting/common/dataset_histogram.py b/analytics/plotting/common/dataset_histogram.py
new file mode 100644
index 000000000..90960d58d
--- /dev/null
+++ b/analytics/plotting/common/dataset_histogram.py
@@ -0,0 +1,443 @@
+import matplotlib.dates as mdates
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.figure import Figure
+from matplotlib.ticker import MaxNLocator
+
+from analytics.plotting.common.color import (
+    gen_categorical_map,
+    main_color,
+    main_colors,
+)
+from analytics.plotting.common.common import DOUBLE_FIG_HEIGHT, init_plot
+from analytics.plotting.common.const import DOUBLE_FIG_WIDTH
+from analytics.plotting.common.font import setup_font
+
+
+def build_countplot(
+    histogram_data: pd.DataFrame,
+    x: str,
+    y_ticks: list[int] | None = None,
+    y_ticks_bins: int | None = None,
+    x_ticks: list[int] | None = None,
+    y_label: str = "Number of Samples",
+    x_label: str = "Year",
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    palette: str = "RdBu",
+    palette_strip: tuple[float, float] | None = (0.35, 0.65),
+) -> Figure:
+    """Draw a bar chart with the number of rows of `histogram_data` per distinct value of column `x`.
+
+    Args:
+        histogram_data: one row per sample.
+        x: column to count by (one bar per distinct value).
+        y_ticks: explicit y tick positions, also used as labels.
+        y_ticks_bins: maximum number of y intervals; only used if `y_ticks` is None.
+        x_ticks: values of `x` to place ticks at. They are shifted by the smallest `x` value
+            to get bar positions, i.e. this assumes one bar per consecutive integer value.
+        y_label: y-axis label.
+        x_label: x-axis label.
+        height_factor: scaling factor for the figure height.
+        width_factor: scaling factor for the figure width.
+        palette: currently unused.
+        palette_strip: currently unused.
+
+    Returns:
+        The figure containing the bar chart.
+    """
+    init_plot()
+    setup_font()
+
+    fig = plt.figure(
+        edgecolor="black",
+        frameon=True,
+        figsize=(
+            DOUBLE_FIG_WIDTH * width_factor,
+            2 * DOUBLE_FIG_HEIGHT * height_factor,
+        ),
+        dpi=600,
+    )
+    ax = fig.add_subplot(111)
+
+    agg_by_year = histogram_data.groupby(x).size().reset_index(name="count")
+
+    # pass the axes explicitly instead of relying on pyplot's current axes
+    ax = sns.barplot(data=agg_by_year, x=x, y="count", color=main_color(0), width=1, legend=False, ax=ax)
+
+    # avoid fine white lines between cells
+    for artist in ax.patches:  # ax.patches contains the bars in the plot
+        artist.set_rasterized(True)
+
+    ax.grid(axis="x", linestyle="--", alpha=1.0)
+    ax.grid(axis="y", linestyle="--", alpha=1.0)
+
+    plt.xlabel(x_label)
+    if x_ticks is not None:
+        # bars sit at positions 0, 1, 2, ...: translate the requested values into positions
+        x_min = min(histogram_data[x])
+        plt.xticks(ticks=[xtick - x_min for xtick in x_ticks], labels=x_ticks, rotation=0)
+
+    plt.ylabel(y_label)
+    if y_ticks is not None:
+        plt.yticks(ticks=y_ticks, labels=y_ticks, rotation=0)
+    elif y_ticks_bins is not None:
+        ax.yaxis.set_major_locator(MaxNLocator(nbins=y_ticks_bins))
+        ax.set_yticklabels([int(i) for i in ax.get_yticks()], rotation=0)
+
+    plt.tight_layout()
+    return fig
+
+
+def build_histogram_multicategory_facets(
+    histogram_data: pd.DataFrame,
+    x: str,
+    label: str,
+    sorted_categories: pd.Series,
+    y_ticks: list[int | float] | None = None,
+    y_ticks_bins: int | None = None,
+    x_ticks: list[pd.Timestamp] | None = None,
+    y_label: str = "Number of Samples",
+    x_label: str = "Year",
+    sharey: bool = False,
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    legend_labels: list[str] | None = None,
+) -> sns.FacetGrid:
+    """Draw one histogram facet per category of column `label`.
+
+    The facets are wrapped after 6 columns, share the x-axis and use 40 bins each.
+
+    Args:
+        histogram_data: one row per sample; a copy is plotted.
+        x: column holding the histogram's x values.
+        label: categorical column; every category gets its own facet and color.
+        sorted_categories: categories in the order in which facets and colors are assigned.
+        y_ticks: currently unused.
+        y_ticks_bins: currently unused.
+        x_ticks: explicit x ticks for all facets; if given, tick labels are rendered with a
+            month/year `DateFormatter`.
+        y_label: common y-axis label, placed once at the left edge of the figure.
+        x_label: common x-axis label, placed once at the bottom edge of the figure.
+        sharey: whether all facets share one y-axis.
+        height_factor: scaling factor for the figure height.
+        width_factor: scaling factor for the figure width.
+        legend_labels: currently unused.
+
+    Returns:
+        The seaborn `FacetGrid` (not a matplotlib `Figure`); the figure is its `.figure`.
+    """
+    color_map = gen_categorical_map(sorted_categories)
+    histogram_data = histogram_data.copy()
+
+    init_plot()
+    setup_font()
+
+    # one facet per category of `label`
+    g = sns.FacetGrid(
+        histogram_data,
+        col=label,
+        margin_titles=False,
+        col_wrap=6,
+        sharey=sharey,  # sharey=False allows independent y-axis
+        sharex=True,
+        col_order=sorted_categories,
+        subplot_kws={},
+        despine=True,
+    )
+
+    g.figure.set_dpi(300)
+    g.figure.set_figwidth(DOUBLE_FIG_WIDTH * width_factor)
+    g.figure.set_figheight(2 * DOUBLE_FIG_HEIGHT * height_factor)
+
+    g.map_dataframe(
+        sns.histplot,  # `data` is supplied by map_dataframe
+        x=x,
+        hue=label,
+        palette=color_map,
+        edgecolor=None,  # Disable black borders
+        element="bars",
+        multiple="dodge",
+        bins=40,
+    )
+
+    g.set_titles("{col_name}")  # only the value in the facet name
+
+    if x_ticks is not None:
+        g.set(xticks=x_ticks)
+        for ax in g.axes.flat:
+            ax.xaxis.set_major_formatter(mdates.DateFormatter("%b\n%Y"))
+        # figure-level call, so once is enough (it used to be repeated for every facet)
+        g.figure.autofmt_xdate(ha="center", rotation=0)
+
+    for ax in g.axes.flat:
+        ax.grid(axis="x", alpha=1.0, linestyle="--")
+        ax.grid(axis="y", alpha=1.0, linestyle="--")
+
+        # per-facet axis labels are replaced by the common figure-level labels below
+        ax.set_ylabel(None)
+        ax.set_xlabel(None)
+
+        # center the x-axis labels
+        ax.tick_params(axis="x", rotation=0, pad=6)
+        ax.tick_params(axis="y", pad=10)
+
+        # avoid fine white lines between cells
+        for artist in ax.patches:  # ax.patches contains the bars in the plot
+            artist.set_rasterized(True)
+
+    # Add common x and y labels with custom placement
+    g.figure.text(0.5, 0.0, x_label, ha="center", va="center", fontsize="large")
+    g.figure.text(0.0, 0.5, y_label, ha="center", va="center", rotation="vertical", fontsize="large")
+
+    plt.tight_layout()
+    g.figure.subplots_adjust(wspace=0.4)  # Reduce horizontal space between subplots
+    return g
+
+
+def build_histogram_multicategory_barnorm(
+    histogram_data: pd.DataFrame,
+    x: str,
+    label: str,
+    sorted_coloring_categories: pd.Series,
+    sorted_ordering_categories: pd.Series | None = None,
+    y_ticks: list[int | float] | None = None,
+    y_ticks_bins: int | None = None,
+    x_ticks: list[pd.Timestamp] | None = None,
+    y_label: str = "Number of Samples",
+    x_label: str = "Year",
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    legend: bool = True,
+    legend_labels: list[str] | None = None,
+    legend_title: str | None = None,
+    nbins: int | None = None,
+    manual_color_map: dict[str, tuple[float, float, float]] | None = None,
+    grid_opacity: float = 1.0,
+    col_alpha: float | None = None,
+) -> Figure:
+    """Draw a histogram of `x` split by `label` (seaborn `multiple="fill"`) on an inverted y-axis.
+
+    Args:
+        histogram_data: one row per sample; copied before the `label` column is rewritten.
+        x: column holding the histogram's x values.
+        label: categorical column used as hue.
+        sorted_coloring_categories: categories in the order in which colors are assigned.
+        sorted_ordering_categories: hue order (default: coloring order); missing ones are appended.
+        y_ticks: explicit y tick positions; they are labelled in reversed order.
+        y_ticks_bins: maximum number of y intervals; only used if `y_ticks` is None.
+        x_ticks: explicit x ticks, rendered with a month/year `DateFormatter`.
+        y_label: y-axis label.
+        x_label: x-axis label.
+        height_factor: scaling factor for the figure height.
+        width_factor: scaling factor for the figure width.
+        legend: whether seaborn should draw a legend.
+        legend_labels: categories kept in the legend; all others get a "_" prefix to hide them.
+        legend_title: legend title; only applied if `legend_labels` is non-empty.
+        nbins: number of histogram bins; seaborn's default if None.
+        manual_color_map: color mapping that replaces the generated one (if non-empty).
+        grid_opacity: opacity of the white dashed grid lines.
+        col_alpha: opacity of the bars; seaborn's default if None.
+
+    Returns:
+        The figure containing the histogram.
+    """
+    if legend_labels is None:
+        legend_labels = []
+    if sorted_ordering_categories is None:
+        sorted_ordering_categories = sorted_coloring_categories
+
+    def _legend_name(category):  # "_" prefix hides a category from the legend
+        return category if category in legend_labels else f"_{category}"
+
+    histogram_data = histogram_data.copy()
+    histogram_data[label] = histogram_data[label].apply(_legend_name)
+    coloring_names = [_legend_name(cat) for cat in sorted_coloring_categories]
+    ordering_names = [_legend_name(cat) for cat in sorted_ordering_categories]
+    # add any missing categories to the end of the list
+    ordering_names += [name for name in coloring_names if name not in ordering_names]
+    color_map = gen_categorical_map(coloring_names)
+
+    init_plot()
+    setup_font()
+
+    fig = plt.figure(
+        edgecolor="black",
+        frameon=True,
+        figsize=(DOUBLE_FIG_WIDTH * width_factor, 2 * DOUBLE_FIG_HEIGHT * height_factor),
+        dpi=600,
+    )
+    ax = fig.add_subplot(111)
+
+    # only forward bins / alpha if the caller set them
+    optional_kwargs = {"bins": nbins, "alpha": col_alpha}
+    optional_kwargs = {key: value for key, value in optional_kwargs.items() if value is not None}
+
+    ax = sns.histplot(
+        data=histogram_data,
+        x=x,
+        hue=label,
+        palette=manual_color_map if manual_color_map else color_map,
+        hue_order=ordering_names,
+        linewidth=0,  # avoid fine white lines between cells
+        edgecolor=None,  # Disable black borders
+        legend=legend,
+        element="bars",
+        multiple="fill",
+        ax=ax,
+        **optional_kwargs,
+    )
+    ax.invert_yaxis()
+
+    # avoid fine white lines between cells
+    for artist in ax.patches:  # ax.patches contains the bars in the plot
+        artist.set_rasterized(True)
+
+    # position legend outside of plot
+    if legend and len(legend_labels) > 0:
+        ax.get_legend().set_bbox_to_anchor((1.05, 1.05))
+        if legend_title is not None:
+            ax.get_legend().set_title(legend_title)
+
+    ax.grid(axis="x", linestyle="--", alpha=grid_opacity, color="white")
+    ax.grid(axis="y", linestyle="--", alpha=grid_opacity, color="white")
+
+    plt.xlabel(x_label)
+    if x_ticks is not None:
+        plt.xticks(x_ticks)
+        ax.xaxis.set_major_formatter(mdates.DateFormatter("%b\n%Y"))
+
+    plt.ylabel(y_label)
+    if y_ticks is not None:
+        plt.yticks(ticks=y_ticks, labels=list(reversed(y_ticks)), rotation=0)
+    elif y_ticks_bins is not None:
+        ax.yaxis.set_major_locator(MaxNLocator(nbins=y_ticks_bins))
+
+    plt.tight_layout()
+    return fig
+
+
+def build_cum_barplot(
+    histogram_data: pd.DataFrame,
+    x: str,
+    y: str,
+    y_ticks: list[int] | None = None,
+    y_ticks_bins: int | None = None,
+    x_ticks: list[int] | None = None,
+    x_ticks_bins: int | None = None,
+    y_label: str = "Number of Samples",
+    x_label: str = "Year",
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    palette: str = "RdBu",
+    palette_strip: tuple[float, float] | None = (0.35, 0.65),
+) -> Figure:
+    """Draw column `y` over column `x` of `histogram_data` as a line plot.
+
+    Args:
+        histogram_data: data frame providing the `x` and `y` columns.
+        x, y: names of the columns plotted on the x- and y-axis.
+        y_ticks: explicit y ticks (also used as labels); otherwise `y_ticks_bins` is used.
+        y_ticks_bins: `nbins` for the y-axis `MaxNLocator`.
+        x_ticks: x tick labels; tick positions are the labels minus the smallest `x` value.
+        x_ticks_bins: `nbins` for the x-axis `MaxNLocator`; only used if `x_ticks` is None.
+        y_label, x_label: axis labels.
+        height_factor, width_factor: scaling factors for the figure size.
+        palette, palette_strip: not used by this function.
+    """
+    init_plot()
+    setup_font()
+
+    fig = plt.figure(
+        edgecolor="black",
+        frameon=True,
+        figsize=(
+            DOUBLE_FIG_WIDTH * width_factor,
+            2 * DOUBLE_FIG_HEIGHT * height_factor,
+        ),
+        dpi=600,
+    )
+    ax = fig.add_subplot(111)
+
+    ax = sns.lineplot(
+        data=histogram_data,
+        x=x,
+        y=y,
+        color=main_color(0),
+        ax=ax,
+    )
+    # TODO: check gap, dodged elements --> if pdf shows white lines
+
+    # dashed grid lines on both axes
+    ax.grid(axis="x", linestyle="--", alpha=1.0)
+    ax.grid(axis="y", linestyle="--", alpha=1.0)
+
+    # NOTE(review): ticks are shifted by min(x) although this is a line plot, not a bar plot; confirm intended
+    plt.xlabel(x_label)
+    if x_ticks is not None:
+        plt.xticks(
+            ticks=[xtick - min(histogram_data[x]) for xtick in x_ticks],
+            labels=x_ticks,
+            rotation=0,
+        )
+    elif x_ticks_bins is not None:
+        ax.xaxis.set_major_locator(MaxNLocator(nbins=x_ticks_bins))
+
+    ax.yaxis.set_major_locator(MaxNLocator(nbins=y_ticks_bins))
+    # NOTE(review): the locator above is set even if y_ticks_bins is None; the branches below may replace it
+
+    plt.ylabel(y_label)
+    if y_ticks is not None:
+        plt.yticks(ticks=y_ticks, labels=y_ticks, rotation=0)
+    elif y_ticks_bins is not None:
+        ax.yaxis.set_major_locator(MaxNLocator(nbins=y_ticks_bins))
+
+    # Display the plot
+    plt.tight_layout()
+
+    return fig
+
+
+def build_pieplot(
+    x: list[int],
+    labels: list[str],
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+) -> Figure:
+    """Draw a pie chart; each wedge is annotated with its percentage and absolute value.
+
+    `x` holds the wedge sizes and `labels` one label per wedge. Only the first wedge is
+    pulled out; any number of wedges is supported (the offsets used to be hard-coded for two).
+    `height_factor` / `width_factor` scale the figure size.
+    """
+    init_plot()
+    setup_font()
+
+    fig = plt.figure(
+        edgecolor="black",
+        frameon=True,
+        figsize=(DOUBLE_FIG_WIDTH * width_factor, 2 * DOUBLE_FIG_HEIGHT * height_factor),
+        dpi=600,
+    )
+    total = np.sum(x)
+
+    def wedge_text(pct: float) -> str:
+        absolute = int(np.round(pct / 100.0 * total))
+        return f"{pct:.1f}%\n({absolute:d})"
+
+    _, _, autotexts = plt.pie(
+        x=x,
+        labels=labels,
+        autopct=wedge_text,
+        textprops=dict(color="w"),
+        colors=main_colors(),
+        startangle=90,
+        # `explode` needs exactly one offset per wedge
+        explode=([0.1] + [0.0] * len(x))[: len(x)],
+    )
+    plt.setp(autotexts, size=8, weight="bold")
+
+    plt.tight_layout()
+    return fig
diff --git a/analytics/plotting/common/font.py b/analytics/plotting/common/font.py
new file mode 100644
index 000000000..8c31d4ced
--- /dev/null
+++ b/analytics/plotting/common/font.py
@@ -0,0 +1,35 @@
+import matplotlib.font_manager as fm
+from matplotlib import pyplot as plt
+
+__loaded = False  # module-level guard that load_font checks before registering fonts
+
+
+def load_font() -> None:
+    """Register the CMU / P052 font files with matplotlib's font manager (only once per process).
+
+    NOTE(review): the font directory is a hard-coded, user-specific path.
+    """
+    global __loaded
+    if __loaded:
+        return
+
+    font_dir = "/Users/robinholzinger/Library/Fonts/"
+    for font in fm.findSystemFonts(fontpaths=[font_dir]):
+        if "cmu" in font or "p052" in font.lower():
+            fm.fontManager.addfont(font)
+
+    assert len([f.name for f in fm.fontManager.ttflist if "cmu" in f.name.lower()]) >= 2
+    # remember the successful registration; without this the guard above never fires
+    __loaded = True
+
+
+def setup_font(small_label: bool = False, small_title: bool | None = None, small_ticks: bool = True) -> None:
+    """Load the fonts and set matplotlib's font rcParams; `small_title=None` keeps the title size."""
+    load_font()
+    tick_size = "small" if small_ticks else "medium"  # also used for the legend
+    sizes = {"legend.fontsize": tick_size, "xtick.labelsize": tick_size, "ytick.labelsize": tick_size}
+    sizes["axes.labelsize"] = "small" if small_label else "medium"
+    if small_title is not None:
+        sizes["axes.titlesize"] = "small" if small_title else "medium"
+    # latex default: "CMU Serif", robin thesis: P052
+    plt.rcParams.update({"svg.fonttype": "none", "font.family": "P052", **sizes})
diff --git a/analytics/plotting/common/heatmap.py b/analytics/plotting/common/heatmap.py
index 1a5333369..5014b3a39 100644
--- a/analytics/plotting/common/heatmap.py
+++ b/analytics/plotting/common/heatmap.py
@@ -1,100 +1,276 @@
-from pathlib import Path
+from typing import Any, Literal
 
+import matplotlib.patches as patches
 import pandas as pd
 import seaborn as sns
 from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
 from matplotlib.figure import Figure
 from matplotlib.ticker import MaxNLocator
 
 # Create the heatmap
 from analytics.plotting.common.common import init_plot
+from analytics.plotting.common.const import DOUBLE_FIG_HEIGHT, DOUBLE_FIG_WIDTH
+from analytics.plotting.common.font import setup_font
+
+
+def get_fractional_index(
+    dates: pd.Index | pd.Series, query_date: pd.Timestamp, fractional: bool = True
+) -> float:
+    """Locate ``query_date`` on an axis made of consecutive periods.
+
+    Args:
+        dates: Non-empty sequence of ``pd.Period`` objects, sorted ascending
+            (e.g. the ``PeriodIndex`` used as heatmap columns).
+        query_date: Timestamp to locate; it is compared against each period's
+            ``start_time``.
+        fractional: If True, linearly interpolate between the start times of
+            the two enclosing periods; if False, return the index of the
+            period whose start is the latest one not after ``query_date``.
+
+    Returns:
+        ``-1`` if ``query_date`` lies before the first period start,
+        ``len(dates)`` if it lies after the last period start, otherwise the
+        (possibly fractional) position.
+    """
+    starts = [period.start_time for period in dates]
+
+    if query_date < starts[0]:
+        return -1  # before the first index
+
+    if query_date > starts[-1]:
+        return len(starts)  # one past the last index
+
+    # Half-open intervals [start_i, start_{i+1}): a query sitting exactly on a
+    # period start belongs to that period, also in the non-fractional case.
+    for idx, (lower, upper) in enumerate(zip(starts, starts[1:])):
+        if lower <= query_date < upper:
+            if not fractional:
+                return idx
+            # Linear interpolation between the two neighbouring period starts
+            return idx + (query_date - lower) / (upper - lower)
+
+    # Only remaining case for sorted input: query_date equals the last period start
+    return len(starts) - 1
 
 
 def build_heatmap(
     heatmap_data: pd.DataFrame,
-    y_ticks: list[int] | None = None,
+    y_ticks: list[int] | list[str] | None = None,
     y_ticks_bins: int | None = None,
+    x_ticks: list[int] | None = None,
+    x_custom_ticks: list[tuple[int, str]] | None = None,  # (position, label)
+    y_custom_ticks: list[tuple[int, str]] | None = None,  # (position, label)
     reverse_col: bool = False,
     y_label: str = "Reference Year",
     x_label: str = "Current Year",
     color_label: str = "Accuracy %",
-) -> Figure:
+    title_label: str = "",
+    target_ax: Axes | None = None,
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    square: bool = False,
+    cbar: bool = True,
+    vmin: float | None = None,
+    vmax: float | None = None,
+    policy: list[tuple[int, int, int]] | None = None,
+    cmap: Any | None = None,
+    linewidth: int = 2,
+    grid_alpha: float = 0.0,
+    disable_horizontal_grid: bool = False,
+    df_logs_models: pd.DataFrame | None = None,
+    triggers: dict[int, pd.DataFrame] | None = None,
+    x_axis: Literal["year", "int", "period"] = "year",
+) -> Figure | Axes:
+    """Draw a seaborn heatmap of ``heatmap_data`` with optional overlays.
+
+    Args:
+        heatmap_data: Matrix to plot; its min/max are the color range unless
+            ``vmin`` / ``vmax`` are given.
+        y_ticks: Tick values placed at ``value - 1930 + 0.5``.
+            NOTE(review): the arithmetic only works for numeric values even
+            though ``list[str]`` is accepted by the annotation.
+        y_ticks_bins: Alternative to ``y_ticks``; number of bins for a
+            ``MaxNLocator`` on the y axis.
+        x_ticks: Tick values placed at ``value - 1930 + 0.5``. Without
+            ``x_ticks`` and ``x_custom_ticks`` the ticks 1930..2010 in steps
+            of 20 are used.
+        x_custom_ticks: ``(position, label)`` pairs; takes precedence over
+            ``x_ticks``.
+        y_custom_ticks: ``(position, label)`` pairs; only used when neither
+            ``y_ticks`` nor ``y_ticks_bins`` is given.
+        reverse_col: Use "RdBu_r" instead of "RdBu" when ``cmap`` is not set.
+        y_label, x_label, color_label, title_label: Axis, colorbar and title texts.
+        target_ax: Axes to draw into; if omitted a new figure is created.
+        height_factor, width_factor: Scale factors for the new figure's size.
+        square, cbar, vmin, vmax, cmap: Forwarded to ``sns.heatmap``.
+        policy: ``(x_start, x_end, y)`` boxes drawn as white hatched
+            rectangles, each joined to the previous box by a vertical connector.
+        linewidth: Line width of the policy boxes and connectors.
+        grid_alpha: Alpha of the white dashed grid.
+        disable_horizontal_grid: Force alpha 0 for the y-axis grid lines.
+        df_logs_models: Rows with ``train_start``, ``train_end``,
+            ``usage_start``, ``usage_end`` and a 1-based ``model_idx``; one
+            rectangle is drawn per row and interval type.
+        triggers: Maps a y position to a frame with ``usage_start`` /
+            ``usage_end``; each row is drawn as a thin black rectangle.
+        x_axis: "year" converts interval bounds via ``.year - 1930``; any
+            other value looks them up in ``heatmap_data.columns`` with
+            ``get_fractional_index(..., fractional=False)``.
+
+    Returns:
+        The newly created figure, or the heatmap's Axes when ``target_ax``
+        was supplied.
+    """
     init_plot()
-    # sns.set_theme(style="ticks")
-    plt.rcParams["svg.fonttype"] = "none"
-
-    double_fig_width = 10
-    double_fig_height = 3.5
+    setup_font(small_label=True, small_title=True)
 
-    fig = plt.figure(
-        edgecolor="black",
-        frameon=True,
-        figsize=(double_fig_width, 2.2 * double_fig_height),
-        dpi=300,
-    )
+    if not target_ax:
+        fig = plt.figure(
+            edgecolor="black",
+            frameon=True,
+            figsize=(
+                DOUBLE_FIG_WIDTH * width_factor,
+                2 * DOUBLE_FIG_HEIGHT * height_factor,
+            ),
+            dpi=600,
+        )
 
     ax = sns.heatmap(
         heatmap_data,
-        cmap="RdBu" + ("_r" if reverse_col else ""),
+        cmap=("RdBu" + ("_r" if reverse_col else "")) if not cmap else cmap,
         linewidths=0.0,
-        linecolor="black",
-        cbar=True,
+        linecolor="white",
         # color bar from 0 to 1
         cbar_kws={
             "label": color_label,
             # "ticks": [0, 25, 50, 75, 100],
             "orientation": "vertical",
         },
+        ax=target_ax,
+        square=square,
+        **{
+            "vmin": vmin if vmin is not None else heatmap_data.min().min(),
+            "vmax": vmax if vmax is not None else heatmap_data.max().max(),
+            "cbar": cbar,
+        },
     )
+
+    # Rasterize the heatmap background to avoid anti-aliasing artifacts
     ax.collections[0].set_rasterized(True)
 
-    # Adjust x-axis tick labels
-    plt.xlabel(x_label)
-    plt.xticks(
-        ticks=[x + 0.5 for x in range(0, 2010 - 1930 + 1, 20)],
-        labels=[x for x in range(1930, 2010 + 1, 20)],
-        rotation=0,
-        # ha='right'
+    rect = patches.Rectangle(
+        (0, 0),
+        heatmap_data.shape[1],
+        heatmap_data.shape[0],
+        linewidth=2,
+        edgecolor="black",
+        facecolor="none",
     )
+    ax.add_patch(rect)
+
+    # Adjust x-axis tick labels
+    ax.set_xlabel(x_label)
+    if not x_ticks and not x_custom_ticks:
+        ax.set_xticks(
+            ticks=[x + 0.5 for x in range(0, 2010 - 1930 + 1, 20)],  # TODO: check 0.5
+            labels=[x for x in range(1930, 2010 + 1, 20)],
+            rotation=0,
+            # ha='right'
+        )
+    else:
+        if x_custom_ticks:
+            ax.set_xticks(
+                ticks=[x[0] for x in x_custom_ticks],
+                labels=[x[1] for x in x_custom_ticks],
+                rotation=0,
+                # ha='right'
+            )
+        else:
+            assert x_ticks is not None
+            ax.set_xticks(
+                ticks=[x - 1930 + 0.5 for x in x_ticks],
+                labels=[x for x in x_ticks],
+                rotation=0,
+                # ha='right'
+            )
     ax.invert_yaxis()
 
+    ax.grid(
+        axis="y",
+        linestyle="--",
+        alpha=0 if disable_horizontal_grid else grid_alpha,
+        color="white",
+    )
+    ax.grid(axis="x", linestyle="--", alpha=grid_alpha, color="white")
+
     if y_ticks is not None:
-        plt.yticks(ticks=[y + 0.5 - 1930 for y in y_ticks], labels=[y for y in y_ticks], rotation=0)
+        ax.set_yticks(
+            ticks=[y + 0.5 - 1930 for y in y_ticks],
+            labels=[y for y in y_ticks],
+            rotation=0,
+        )
     elif y_ticks_bins is not None:
         ax.yaxis.set_major_locator(MaxNLocator(nbins=y_ticks_bins))
         ax.set_yticklabels([int(i) + min(heatmap_data.index) for i in ax.get_yticks()], rotation=0)
+    else:
+        if y_custom_ticks:
+            ax.set_yticks(
+                ticks=[y[0] for y in y_custom_ticks],
+                labels=[y[1] for y in y_custom_ticks],
+                rotation=0,
+                # ha='right'
+            )
+
+    ax.set_ylabel(y_label)
+
+    if title_label:
+        ax.set_title(title_label)
 
-    plt.ylabel(y_label)
-
-    # # Draft training boxes
-    # if drift_pipeline:
-    #     for type_, dashed in [("train", False), ("usage", False), ("train", True)]:
-    #         for active_ in df_logs_models.iterrows():
-    #             x_start = active_[1][f"{type_}_start"].year - 1930
-    #             x_end = active_[1][f"{type_}_end"].year - 1930
-    #             y = active_[1]["model_idx"]
-    #             rect = plt.Rectangle(
-    #                 (x_start, y - 1),  # y: 0 based index, model_idx: 1 based index
-    #                 x_end - x_start,
-    #                 1,
-    #                 edgecolor="White" if type_ == "train" else "Black",
-    #                 facecolor="none",
-    #                 linewidth=3,
-    #                 linestyle="dotted" if dashed else "solid",
-    #                 hatch="/",
-    #                 joinstyle="bevel",
-    #                 # capstyle="round",
-    #             )
-    #             ax.add_patch(rect)
+    # mainly for offline exploration
+    previous_y = 0
+    # `policy` defaults to None (no shared mutable default); treat it as "no boxes"
+    for x_start, x_end, y in policy or []:
+        # main box
+        rect = plt.Rectangle(
+            (x_start, y),  # y: 0 based index, model_idx: 1 based index
+            x_end - x_start,
+            1,
+            edgecolor="White",
+            facecolor="none",
+            linewidth=linewidth,
+            linestyle="solid",
+            hatch="/",
+            joinstyle="bevel",
+            # capstyle="round",
+        )
+        ax.add_patch(rect)
+
+        # connector
+        connector = plt.Rectangle(
+            (x_start, previous_y),  # y: 0 based index, model_idx: 1 based index
+            0,
+            y - previous_y + 1,
+            edgecolor="White",
+            facecolor="none",
+            linewidth=linewidth,
+            linestyle="solid",
+            hatch="/",
+            joinstyle="bevel",
+            # capstyle="round",
+        )
+        ax.add_patch(connector)
+        previous_y = y
+
+    # for post factum evaluation
+    if df_logs_models is not None:
+        for type_, dashed in [("train", False), ("usage", False), ("train", True)]:
+            for active_ in df_logs_models.iterrows():
+                if x_axis == "year":
+                    x_start = active_[1][f"{type_}_start"].year - 1930
+                    x_end = active_[1][f"{type_}_end"].year - 1930
+                else:
+                    # start_idx = get_fractional_index(heatmap_data.columns, start_date)
+                    # end_idx = get_fractional_index(heatmap_data.columns, end_date)
+                    # x_start = heatmap_data.columns.get_loc(active_[1][f"{type_}_start"])
+                    # x_end = heatmap_data.columns.get_loc(active_[1][f"{type_}_end"])
+                    x_start = get_fractional_index(
+                        heatmap_data.columns,
+                        active_[1][f"{type_}_start"],
+                        fractional=False,
+                    )
+                    x_end = get_fractional_index(
+                        heatmap_data.columns,
+                        active_[1][f"{type_}_end"],
+                        fractional=False,
+                    )
+
+                y = active_[1]["model_idx"]
+                rect = plt.Rectangle(
+                    (
+                        x_start,
+                        y - 1,
+                    ),  # y: 0 based index, model_idx: 1 based index
+                    x_end - x_start,
+                    1,
+                    edgecolor="White" if type_ == "train" else "Black",
+                    facecolor="none",
+                    linewidth=1.5,
+                    linestyle="dotted" if dashed else "solid",
+                    hatch="/",
+                    joinstyle="bevel",
+                    # capstyle="round",
+                )
+                ax.add_patch(rect)
+
+    if triggers:
+        for y, triggers_df in triggers.items():
+            for row in triggers_df.iterrows():
+                type_ = "usage"
+                # for y, x_list in triggers.items():
+                x_start = row[1][f"{type_}_start"].year - 1930
+                x_end = row[1][f"{type_}_end"].year - 1930
+                # for x in x_list:
+                rect = plt.Rectangle(
+                    (x_start, y),  # y: 0 based index, model_idx: 1 based index
+                    x_end - x_start,
+                    1,
+                    edgecolor="black",
+                    facecolor="none",
+                    linewidth=1,
+                    # linestyle="dotted",
+                    # hatch="/",
+                    # joinstyle="bevel",
+                    # capstyle="round",
+                )
+                ax.add_patch(rect)
 
     # Display the plot
     plt.tight_layout()
     # plt.show()
 
-    return fig
-
-
-def save_plot(fig: Figure, name: str) -> None:
-    for img_type in ["png", "svg"]:
-        img_path = Path("/scratch/robinholzi/gh/modyn/.data/plots") / f"{name}.{img_type}"
-        fig.savefig(img_path, bbox_inches="tight", transparent=True)
+    return fig if not target_ax else ax
diff --git a/analytics/plotting/common/linear_regression_scatterplot.py b/analytics/plotting/common/linear_regression_scatterplot.py
new file mode 100644
index 000000000..a75f2a790
--- /dev/null
+++ b/analytics/plotting/common/linear_regression_scatterplot.py
@@ -0,0 +1,107 @@
+from typing import Any
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+
+from analytics.plotting.common.color import main_color
+from analytics.plotting.common.common import init_plot
+from analytics.plotting.common.font import setup_font
+
+# Scatter plot with a fitted linear-regression line
+
+
+def scatter_linear_regression(
+    data: pd.DataFrame,
+    x: str,
+    y: str,
+    hue: str,
+    y_ticks: list[int] | list[str] | None = None,
+    x_ticks: list[int] | None = None,
+    y_label: str = "Reference Year",
+    x_label: str = "Current Year",
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    legend_label: str = "Number Samples",
+    title_label: str = "",
+    target_ax: Axes | None = None,
+    palette: Any = None,
+    small_legend_fonts: bool = False,
+) -> Figure | tuple[Axes, Axes]:
+    """Plot a regression line over ``data`` plus a hue-colored scatter of the same points.
+
+    Args:
+        data: Frame holding the columns named by ``x``, ``y`` and ``hue``.
+        x, y: Column names for the two axes.
+        hue: Column used to color the scatter markers.
+        y_ticks, x_ticks: Explicit tick positions (also used as tick labels).
+        y_label, x_label, legend_label, title_label: Texts for axes, legend and title.
+        height_factor, width_factor: Scale factors for the size of a newly created figure.
+        target_ax: Axes to draw into; if omitted a new figure is created and
+            seaborn draws on its current axes.
+        palette: Forwarded to ``sns.scatterplot``.
+        small_legend_fonts: Render legend entries with font size "x-small".
+
+    Returns:
+        The new figure, or ``(regplot_axes, scatterplot_axes)`` when
+        ``target_ax`` was supplied.
+    """
+    sns.set_style("whitegrid")
+
+    init_plot()
+    setup_font(small_label=True, small_title=True)
+
+    DOUBLE_FIG_WIDTH = 10
+    DOUBLE_FIG_HEIGHT = 3.5
+
+    if target_ax is None:
+        fig = plt.figure(
+            edgecolor="black",
+            frameon=True,
+            figsize=(
+                DOUBLE_FIG_WIDTH * width_factor,
+                2 * DOUBLE_FIG_HEIGHT * height_factor,
+            ),
+            dpi=600,
+        )
+
+    # Pass the caller's axes explicitly; with ``ax=None`` seaborn falls back to
+    # the current axes, i.e. the figure created above.
+    ax1 = sns.regplot(
+        data,
+        x=x,
+        y=y,
+        color=main_color(0),
+        ax=target_ax,
+    )
+
+    ax2 = sns.scatterplot(
+        data,
+        x=x,
+        y=y,
+        hue=hue,
+        palette=palette,
+        s=200,
+        legend=True,
+        marker="X",
+        ax=target_ax,
+    )
+
+    legend_font = {"fontsize": "x-small"} if small_legend_fonts else {}
+    ax2.legend(
+        title=legend_label,
+        ncol=2,
+        handletextpad=0,
+        columnspacing=0.5,
+        **legend_font,
+    )
+
+    # Adjust axis labels and tick labels
+    ax2.set_xlabel(x_label)
+    if x_ticks is not None:
+        ax2.set_xticks(ticks=x_ticks, labels=x_ticks, rotation=0)
+
+    if y_ticks is not None:
+        ax2.set_yticks(ticks=y_ticks, labels=y_ticks, rotation=0)
+
+    ax2.set_ylabel(y_label)
+
+    if title_label:
+        ax2.set_title(title_label)
+
+    print("Number of plotted items", data.shape[0])
+
+    # Display the plot
+    plt.tight_layout()
+    # plt.show()
+
+    return fig if target_ax is None else (ax1, ax2)
diff --git a/analytics/plotting/common/metric_over_time.py b/analytics/plotting/common/metric_over_time.py
new file mode 100644
index 000000000..6aeba86c7
--- /dev/null
+++ b/analytics/plotting/common/metric_over_time.py
@@ -0,0 +1,125 @@
+import matplotlib.dates as mdates
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+
+from analytics.plotting.common.color import main_color
+from analytics.plotting.common.common import (
+    DOUBLE_FIG_HEIGHT,
+    DOUBLE_FIG_WIDTH,
+    init_plot,
+)
+from analytics.plotting.common.font import setup_font
+
+# Line plot of a metric over time
+
+
+def plot_metric_over_time(
+    data: pd.DataFrame,
+    x: str = "time",
+    y: str = "value",
+    hue: str = "pipeline_ref",
+    style: str = "pipeline_ref",
+    y_label: str = "Reference Year",
+    x_label: str = "Current Year",
+    title_label: str = "",
+    target_ax: Axes | None = None,
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    grid_alpha: float = 0.0,
+    legend_label: str = "TODO",
+    small_legend_fonts: bool = False,
+    x_date_locator: mdates.DateLocator | None = None,
+    x_date_formatter: mdates.DateFormatter | None = None,
+    y_ticks: list[int] | None = None,
+    xlim: tuple[int, int] | None = None,
+    ylim: tuple[int, int] | None = None,
+    markers: bool = True,
+) -> Figure | Axes:
+    """Draw one line per ``hue`` level of ``data`` with seaborn's ``lineplot``.
+
+    Args:
+        data: Frame holding the columns named by ``x``, ``y``, ``hue`` and ``style``.
+        x, y: Column names for the two axes.
+        hue, style: Columns that select line color and line/marker style.
+        y_label, x_label, legend_label: Texts for axes and legend title.
+        title_label: Currently not used by this function.
+        target_ax: Axes to draw into; if omitted a new figure is created.
+        height_factor, width_factor: Scale factors for the size of a newly created figure.
+        grid_alpha: Currently not used by this function.
+        small_legend_fonts: Render legend entries with font size "x-small".
+        x_date_locator: Major tick locator for the x axis; also resets the
+            x tick label rotation to 0.
+        x_date_formatter: Major tick formatter; only applied together with
+            ``x_date_locator``.
+        y_ticks: Explicit y tick positions.
+        xlim, ylim: Axis limits.
+        markers: Forwarded to ``sns.lineplot``.
+
+    Returns:
+        The new figure, or the plot's Axes when ``target_ax`` was supplied.
+    """
+    sns.set_style("whitegrid")
+    init_plot()
+    setup_font(small_label=False, small_title=False, small_ticks=False)
+
+    if target_ax is None:
+        fig = plt.figure(
+            edgecolor="black",
+            frameon=True,
+            figsize=(
+                DOUBLE_FIG_WIDTH * width_factor,
+                2 * DOUBLE_FIG_HEIGHT * height_factor,
+            ),
+            dpi=600,
+        )
+
+    ax = sns.lineplot(
+        data,
+        x=x,
+        y=y,
+        hue=hue,
+        markersize=7,
+        linewidth=2.5,
+        palette=[main_color(idx) for idx in (0, 1, 3, 4, 5, 6)],
+        style=style,
+        markers=markers,
+        # Draw into the caller's axes; None makes seaborn use the current axes.
+        ax=target_ax,
+    )
+
+    if xlim:
+        ax.set(xlim=xlim)
+
+    if ylim:
+        ax.set(ylim=ylim)
+
+    ax.set_xlabel(x_label)
+    ax.set_ylabel(y_label)
+
+    legend_font = {"fontsize": "x-small"} if small_legend_fonts else {}
+    ax.legend(
+        title=legend_label,
+        ncol=2,
+        handletextpad=1,
+        columnspacing=1.4,
+        **legend_font,
+    )
+
+    if x_date_locator:
+        ax.xaxis.set_major_locator(x_date_locator)
+        # A locator may be given without a formatter; keep the default formatter then.
+        if x_date_formatter is not None:
+            ax.xaxis.set_major_formatter(x_date_formatter)
+        # Act on this plot's axes rather than on pyplot's current axes.
+        ax.tick_params(axis="x", labelrotation=0)
+
+    if y_ticks:
+        ax.set_yticks(y_ticks)
+
+    # Display the plot
+    plt.tight_layout()
+    # plt.show()
+
+    return fig if target_ax is None else ax
diff --git a/analytics/plotting/common/save.py b/analytics/plotting/common/save.py
new file mode 100644
index 000000000..b6be7fcc4
--- /dev/null
+++ b/analytics/plotting/common/save.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+import pandas as pd
+from matplotlib.figure import Figure
+
+
+def save_plot(fig: Figure, name: str) -> None:
+    """Save ``fig`` as PNG, SVG and PDF under the relative folder ``.data/_plots``.
+
+    Missing parent directories are created. Every file is written with a
+    tight bounding box and a transparent background.
+    """
+    output_dir = Path(".data/_plots")
+    for extension in ["png", "svg", "pdf"]:
+        target = output_dir / f"{name}.{extension}"
+        target.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(target, bbox_inches="tight", transparent=True)
+
+
+def save_csv_df(df: pd.DataFrame, name: str) -> None:
+    """Write ``df`` without its index to ``.data/csv/<name>.csv``, creating folders as needed."""
+    destination = Path(".data/csv", f"{name}.csv")
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(destination, index=False)
diff --git a/analytics/plotting/common/tradeoff_scatterplot.py b/analytics/plotting/common/tradeoff_scatterplot.py
new file mode 100644
index 000000000..ddb5f30d5
--- /dev/null
+++ b/analytics/plotting/common/tradeoff_scatterplot.py
@@ -0,0 +1,104 @@
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+
+from analytics.plotting.common.color import main_color
+from analytics.plotting.common.common import init_plot
+from analytics.plotting.common.font import setup_font
+
+
+def plot_tradeoff_scatter(
+    data: pd.DataFrame,
+    x: str,
+    y: str,
+    hue: str,
+    style: str,
+    x_label: str = "Number of Triggers",
+    y_label: str = "Mean Accuracy %",
+    height_factor: float = 1.0,
+    width_factor: float = 1.0,
+    target_ax: Axes | None = None,
+    manual_legend_title: bool = True,
+    legend_ncol: int = 1,
+) -> Figure:
+    """Scatter ``y`` against ``x`` with markers colored by ``hue`` and shaped by ``style``.
+
+    Args:
+        data: Frame holding the columns named by ``x``, ``y``, ``hue`` and ``style``.
+        x, y: Column names for the two axes.
+        hue, style: Columns that select marker color and marker shape.
+        x_label, y_label: Axis label texts.
+        height_factor, width_factor: Scale factors for the size of a newly created figure.
+        target_ax: Axes to draw into; if omitted a new figure is created and shown.
+        manual_legend_title: Use the ``hue`` column name as legend title.
+        legend_ncol: Number of legend columns.
+
+    Returns:
+        The figure that contains the plot: the newly created one, or the
+        figure owning ``target_ax``.
+    """
+    sns.set_theme(style="whitegrid")
+    init_plot()
+    setup_font(small_label=True, small_title=True, small_ticks=True)
+
+    DOUBLE_FIG_WIDTH = 10
+    DOUBLE_FIG_HEIGHT = 3.5
+
+    owns_figure = target_ax is None
+    if owns_figure:
+        fig = plt.figure(
+            edgecolor="black",
+            frameon=True,
+            figsize=(
+                DOUBLE_FIG_WIDTH * width_factor,
+                2 * DOUBLE_FIG_HEIGHT * height_factor,
+            ),
+            dpi=600,
+        )
+
+    ax = sns.scatterplot(
+        data,
+        x=x,
+        y=y,
+        hue=hue,
+        style=style,
+        palette=[main_color(idx) for idx in (0, 1, 3, 4, 5)],
+        s=300,
+        # Draw into the caller's axes; None makes seaborn use the current axes.
+        ax=target_ax,
+    )
+
+    legend_title = {"title": hue} if manual_legend_title else {}
+    ax.legend(
+        fontsize="small",
+        title_fontsize="medium",
+        **legend_title,
+        ncol=legend_ncol,
+    )
+
+    # Label the plot's own axes instead of pyplot's current axes
+    ax.set_xlabel(x_label, labelpad=10)
+    ax.set_ylabel(y_label, labelpad=15)
+
+    # Display the plot
+    plt.tight_layout()
+
+    # TODO: same figure for time, arxiv and huffpost, use fixed train cost
+    if owns_figure:
+        plt.show()
+        return fig
+
+    # Drawing into a caller-provided axes: leave showing to the caller and
+    # hand back the figure that owns those axes.
+    return ax.figure
diff --git a/analytics/plotting/rh_thesis/arxiv_kaggle_analytics.ipynb b/analytics/plotting/rh_thesis/arxiv_kaggle_analytics.ipynb
new file mode 100644
index 000000000..2ce4af8c2
--- /dev/null
+++ b/analytics/plotting/rh_thesis/arxiv_kaggle_analytics.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "\n",
+    "from analytics.plotting.common.dataset_histogram import (\n",
+    "    build_countplot,\n",
+    "    build_cum_barplot,\n",
+    "    build_histogram_multicategory_barnorm,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from benchmark.arxiv_kaggle.data_generation import ArxivKaggleDataGenerator\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use interactive plotly\n",
+    "interactive = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_dataset = ArxivKaggleDataGenerator(\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/arxiv_kaggle/\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/arxiv_kaggle/raw/arxiv.zip\"),\n",
+    ")\n",
+    "arxiv_dataset.extract_data(\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/arxiv_kaggle/raw/arxiv.zip\")\n",
+    ")\n",
+    "arxiv_df = arxiv_dataset.load_into_dataframe(keep_true_category=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle_path = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/arxiv_kaggle/raw/arxiv_kaggle.pkl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump(arxiv_df, open(pickle_path, \"wb\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df = pickle.load(open(pickle_path, \"rb\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df = arxiv_df[arxiv_df[\"first_version_timestamp\"] >= \"1990-01-01\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df[\"category\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # number of samples over time\n",
+    "# px.histogram(\n",
+    "#     arxiv_df, x=\"first_version_timestamp\"\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df[\"year\"] = arxiv_df[\"first_version_timestamp\"].dt.year"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df[\"year\"].max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# number of samples over time\n",
+    "if interactive:\n",
+    "    fig = px.histogram(\n",
+    "        arxiv_df,\n",
+    "        x=\"first_version_timestamp\",\n",
+    "        color=\"category\",\n",
+    "        facet_col=\"category\",\n",
+    "        facet_col_wrap=6,\n",
+    "        height=5000,\n",
+    "        facet_row_spacing=0.001,\n",
+    "    )\n",
+    "    fig.update_yaxes(matches=None, showticklabels=True)\n",
+    "    fig.update_xaxes(showticklabels=True)\n",
+    "    fig.show()\n",
+    "\n",
+    "else:\n",
+    "    # polished\n",
+    "    fig1 = build_countplot(\n",
+    "        arxiv_df,\n",
+    "        x=\"year\",\n",
+    "        x_ticks=[y for y in range(1990, 2020 + 1, 5)],\n",
+    "        y_ticks_bins=4,\n",
+    "        height_factor=0.4,\n",
+    "        width_factor=1.0,\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Num Samples\",\n",
+    "    )\n",
+    "\n",
+    "    save_plot(fig1, \"arxiv_kaggle_samples_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "category_and_years = arxiv_df[[\"category\", \"first_version_timestamp\"]]\n",
+    "category_and_years[\"year\"] = category_and_years[\"first_version_timestamp\"].dt.year\n",
+    "category_and_years = category_and_years[[\"category\", \"year\"]].drop_duplicates()\n",
+    "category_and_years = category_and_years.groupby(\"category\").size().reset_index()\n",
+    "category_and_years.columns = [\"category\", \"num_years\"]\n",
+    "category_and_years[category_and_years[\"num_years\"] > 9]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arxiv_df_reduced = arxiv_df.merge(category_and_years, on=\"category\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# arxiv_df_reduced[\"category\"] = arxiv_df_reduced[\"category\"].str.split(\".\", expand=True)[0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_category_ratios(df: pd.DataFrame) -> pd.DataFrame:\n",
+    "    total_samples = df.shape[0]\n",
+    "    category_counts = df[\"category\"].value_counts().reset_index().sort_values(\"count\", ascending=False)\n",
+    "    category_counts[\"ratio\"] = category_counts[\"count\"] / total_samples\n",
+    "    return category_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyse ratio of categories\n",
+    "category_counts = find_category_ratios(arxiv_df_reduced)\n",
+    "category_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wordcloud: list[str] = []\n",
+    "for i, category in enumerate(list(category_counts[\"category\"].unique())):\n",
+    "    for _ in range(100 - i):\n",
+    "        wordcloud.append(category)\n",
+    "print(wordcloud)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Export for thesis table\n",
+    "from analytics.plotting.common.save import save_csv_df\n",
+    "\n",
+    "# select top 8 and bottom 2\n",
+    "export_csv = pd.concat([category_counts.head(8)])  # , category_counts.tail(2)\n",
+    "export_csv[\"ratio\"] = export_csv[\"ratio\"].apply(lambda x: round(x * 100, 1))\n",
+    "print(export_csv)\n",
+    "\n",
+    "save_csv_df(export_csv, \"arxiv_kaggle_category_ratios\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sorted_categories = (category_counts.sort_values(\"count\", ascending=False))[\"category\"]\n",
+    "sorted_categories\n",
+    "\n",
+    "\n",
+    "arxiv_df_reduced[\"sort_idx\"] = pd.Categorical(arxiv_df_reduced[\"category\"], categories=sorted_categories, ordered=True)\n",
+    "arxiv_df_reduced = arxiv_df_reduced.sort_values(\"sort_idx\", ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plotting_threshold = category_counts.reset_index()[[\"index\", \"ratio\"]]\n",
+    "plotting_threshold[\"index\"] = plotting_threshold[\"index\"] + 1\n",
+    "# add first row: 0\n",
+    "plotting_threshold = pd.concat([pd.DataFrame({\"index\": [0], \"ratio\": [0]}), plotting_threshold])\n",
+    "\n",
+    "# cumulative sum\n",
+    "plotting_threshold[\"ratio\"] = plotting_threshold[\"ratio\"].cumsum() * 100\n",
+    "plotting_threshold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot coverage of categories\n",
+    "label_hist = build_cum_barplot(\n",
+    "    plotting_threshold,\n",
+    "    x=\"index\",\n",
+    "    y=\"ratio\",\n",
+    "    x_label=\"Categories\",\n",
+    "    y_label=\"% of Dataset\",\n",
+    "    height_factor=0.4,\n",
+    "    width_factor=0.4,\n",
+    "    y_ticks_bins=3,\n",
+    "    x_ticks_bins=4,\n",
+    ")\n",
+    "save_plot(label_hist, \"arxiv_kaggle_category_coverage\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# we want to find out the ratio of the dataset that we cover when only\n",
+    "# showing the top 24 categories\n",
+    "category_counts.sort_values(\"ratio\", ascending=False).head(n=24)[\"ratio\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# legend: find the top 10 labels\n",
+    "labels = category_counts[\"category\"].head(n=10)\n",
+    "fig_labels_distribution = build_histogram_multicategory_barnorm(\n",
+    "    arxiv_df_reduced,\n",
+    "    x=\"first_version_timestamp\",\n",
+    "    label=\"category\",\n",
+    "    sorted_coloring_categories=sorted_categories,\n",
+    "    height_factor=0.65,\n",
+    "    width_factor=1.0,\n",
+    "    legend_labels=list(labels),\n",
+    "    x_label=\"Sample Time\",\n",
+    "    y_label=\"Label Distribution\",\n",
+    "    y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "    y_ticks_bins=4,\n",
+    "    x_ticks=[\n",
+    "        pd.to_datetime(d) for d in [\"1991-09-01\", \"1996-07-01\", \"2000-01-01\", \"2009-01-01\", \"2014-01-01\", \"2018-01-01\"]\n",
+    "    ],\n",
+    "    legend_title=\"Paper Category\",\n",
+    "    nbins=60,\n",
+    ")\n",
+    "save_plot(fig_labels_distribution, \"arxiv_kaggle_category_relative\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# legend: find the top 10 labels\n",
+    "labels = category_counts[\"category\"][category_counts[\"category\"].str.contains(\"astro-ph\")].head(n=10)\n",
+    "fig_labels_distribution = build_histogram_multicategory_barnorm(\n",
+    "    arxiv_df_reduced[arxiv_df_reduced[\"category\"].str.contains(\"astro-ph\")],\n",
+    "    x=\"first_version_timestamp\",\n",
+    "    label=\"category\",\n",
+    "    sorted_coloring_categories=sorted_categories,\n",
+    "    height_factor=0.65,\n",
+    "    width_factor=1.0,\n",
+    "    legend_labels=list(labels),\n",
+    "    x_label=\"Sample Time\",\n",
+    "    y_label=\"Label Distribution\",\n",
+    "    y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "    y_ticks_bins=4,\n",
+    "    x_ticks=[\n",
+    "        pd.to_datetime(d) for d in [\"1991-09-01\", \"1996-07-01\", \"2000-01-01\", \"2009-01-01\", \"2014-01-01\", \"2018-01-01\"]\n",
+    "    ],\n",
+    "    legend_title=\"Paper Category\",\n",
+    "    nbins=60,\n",
+    ")\n",
+    "save_plot(fig_labels_distribution, \"arxiv_kaggle_category_relative_astro_ph\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Presentation\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/compare_all/arxiv_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/compare_all/arxiv_perf_tradeoff.ipynb
new file mode 100644
index 000000000..033b7328a
--- /dev/null
+++ b/analytics/plotting/rh_thesis/compare_all/arxiv_perf_tradeoff.ipynb
@@ -0,0 +1,542 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/11_baselines_amount\"),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/21_datadrift_dynamic\"\n",
+    "    ),  # TODO\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/30_performance\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    print(\"Reading\", dir)\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    print(dir_pipelines)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# column used below as a row mask to keep only composite-model evaluations; the mask is applied regardless of include_composite_model\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = {p_id: (pname, p_path) for p_id, (pname, p_path) in pipelines.items()}\n",
+    "\n",
+    "pipeline_ids = pipelines.keys()\n",
+    "pipeline_ids = [\n",
+    "    y\n",
+    "    for y, _ in [\n",
+    "        (263, \"timetrigger_5y\"),\n",
+    "        (265, \"timetrigger_10y\"),\n",
+    "        # (267, 'timetrigger_26w'),\n",
+    "        (269, \"timetrigger_2y\"),\n",
+    "        (272, \"timetrigger_1y\"),\n",
+    "        # (264, 'dataamount_1000000'),\n",
+    "        (266, \"dataamount_50000\"),\n",
+    "        # (268, 'dataamount_500000'),\n",
+    "        (270, \"dataamount_25000\"),\n",
+    "        (271, \"dataamount_100000\"),\n",
+    "        (782, \"drifttrigger_mmd-quant-0.05-20_int20000_win1y\"),\n",
+    "        (783, \"drifttrigger_mmd-rollavg-0.5-20_int20000_win1y\"),\n",
+    "        (784, \"drifttrigger_mmd-rollavg-5.0-20_int20000_win1y\"),\n",
+    "        (785, \"drifttrigger_mmd-quant-0.15-20_int20000_win1y\"),\n",
+    "        (786, \"drifttrigger_mmd-rollavg-0.2-20_int20000_win1y\"),\n",
+    "        (787, \"drifttrigger_mmd-quant-0.1-20_int20000_win1y\"),\n",
+    "        (788, \"drifttrigger_mmd-rollavg-1.0-20_int20000_win1y\"),\n",
+    "        (789, \"drifttrigger_mmd-quant-0.3-20_int20000_win1y\"),\n",
+    "        (790, \"drifttrigger_mmd-rollavg-2.0-20_int20000_win1y\"),\n",
+    "        (674, \"performancetrigger_static-0.45-int20000\"),\n",
+    "        (675, \"performancetrigger_dynamic-quant-0.05-20-int20000\"),\n",
+    "        (676, \"performancetrigger_dynamic-rollavg-0.3-20-int20000\"),\n",
+    "        (677, \"performancetrigger_num_misclass-100000-exp-0.6-red-False--int20000\"),\n",
+    "        (678, \"performancetrigger_dynamic-rollavg-0.2-20-int20000\"),\n",
+    "        (679, \"performancetrigger_dynamic-rollavg-0.1-20-int20000\"),\n",
+    "        (680, \"performancetrigger_static-0.5-int20000\"),\n",
+    "        (681, \"performancetrigger_dynamic-quant-0.15-20-int20000\"),\n",
+    "        (682, \"performancetrigger_num_misclass-50000-exp-0.6-red-False--int20000\"),\n",
+    "        (723, \"performancetrigger_num_misclass-30000-exp-0.6-red-False--int20000\"),\n",
+    "        (756, \"performancetrigger_num_misclass-15000-exp-0.6-red-False--int20000\"),\n",
+    "        (762, \"performancetrigger_num_misclass-10000-exp-0.6-red-False--int20000\"),\n",
+    "    ]\n",
+    "]\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "\n",
+    "maximum_min = pd.to_datetime(min_active_eval_center_per_pipeline).max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.to_datetime(\"2006-01-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[pd.to_datetime(df_adjusted[\"interval_center\"]) >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count the triggers per pipeline that occur after maximum_min\n",
+    "\n",
+    "# one trigger happened before the cutoff (it corresponds to the start of our reduced dataset), hence the +1 below\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\", how=\"inner\")\n",
+    "assert num_triggers.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "renamed = merged.copy()\n",
+    "\n",
+    "# renamed = merged[\n",
+    "#     merged[\"pipeline_id\"].isin(\n",
+    "#         [\n",
+    "#             # # static thresholds\n",
+    "#             # 113,  # 0.03\n",
+    "#             # 112,  # 0.05\n",
+    "#             # 107,  # 0.07\n",
+    "#             # 109,  # 0.09\n",
+    "#             # 85,  # 0.12\n",
+    "#             # # dyn quantile\n",
+    "#             # 353,  # % 0.05\n",
+    "#             # 345,  # % 0.10\n",
+    "#             # 357,  # % 0.15\n",
+    "#             # # dyn roll. avg\n",
+    "#             # 372,  # Δ 2.0\n",
+    "#             # 370,  # Δ 1.0\n",
+    "#             # 369,  # Δ 0.5\n",
+    "#             # 363,  # Δ 0.05\n",
+    "#         ]\n",
+    "#     )\n",
+    "# ].copy()\n",
+    "renamed[\"Trigger SubType\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"DataAmount\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Time\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                (\n",
+    "                    \"Static\"\n",
+    "                    if \"_mmd-0\" in x\n",
+    "                    else (\"Quantile\" if \"quant\" in x else (\"Rolling Avg.\" if \"roll\" in x else (\"unknown\")))\n",
+    "                )\n",
+    "                if \"drift\" in x\n",
+    "                else (\n",
+    "                    (\n",
+    "                        \"Static\"\n",
+    "                        if \"static\" in x\n",
+    "                        else (\n",
+    "                            \"Quantile\"\n",
+    "                            if \"quant\" in x\n",
+    "                            else (\n",
+    "                                \"Rolling Avg.\"\n",
+    "                                if \"roll\" in x\n",
+    "                                else (\"AvoidableMisclass\" if \"num_misclass\" in x else (\"unknown\"))\n",
+    "                            )\n",
+    "                        )\n",
+    "                    )\n",
+    "                    if \"performancetrigger\" in x\n",
+    "                    else (\n",
+    "                        \"DataIncorporationLatency\"\n",
+    "                        if \"data_inc\" in x\n",
+    "                        else (\"AvoidableMisclass\" if \"avoidable\" in x else (\"unknown\"))\n",
+    "                    )\n",
+    "                )\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "renamed[\"Trigger Type\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"Simple\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Simple\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                \"DataDrift\"\n",
+    "                if \"drift\" in x\n",
+    "                else (\"Performance\" if \"performancetrigger\" in x else (\"Cost\" if \"costtrigger\" in x else (\"unknown\")))\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# assert that there are no unknown types or subtypes and no DataIncorporationLatency subtype\n",
+    "assert not renamed[\"Trigger Type\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"DataIncorporationLatency\").any()\n",
+    "\n",
+    "# assert no cost triggers\n",
+    "assert not renamed[\"Trigger Type\"].str.contains(\"Cost\").any()\n",
+    "\n",
+    "renamed[\"Trigger Type\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger Type\"], categories=[\"Simple\", \"DataDrift\", \"Performance\"], ordered=True\n",
+    ")\n",
+    "\n",
+    "renamed[\"Trigger SubType\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger SubType\"],\n",
+    "    categories=[\"DataAmount\", \"Time\", \"Static\", \"Quantile\", \"Rolling Avg.\", \"AvoidableMisclass\"],\n",
+    "    ordered=True,\n",
+    ")\n",
+    "\n",
+    "renamed = renamed.sort_values(by=[\"Trigger Type\", \"Trigger SubType\", \"pipeline_id\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.8,\n",
+    "    width_factor=0.8,\n",
+    "    manual_legend_title=False,\n",
+    "    legend_ncol=2,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"_all_tradeoff_arxiv_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=0.8,\n",
+    "    manual_legend_title=False,\n",
+    "    legend_ncol=2,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"tradeoff_drift_yearbook_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=1.5,\n",
+    "    width_factor=1.8,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"tradeoff_drift_yearbook_triggers_cost\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/compare_all/hp_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/compare_all/hp_perf_tradeoff.ipynb
new file mode 100644
index 000000000..ba2991250
--- /dev/null
+++ b/analytics/plotting/rh_thesis/compare_all/hp_perf_tradeoff.ipynb
@@ -0,0 +1,559 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/11_baselines_amount\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/21_datadrift_dynamic\"),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/30_performance/num_misclass\"\n",
+    "    ),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/30_performance/static_dyn\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    print(\"Reading\", dir)\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    print(dir_pipelines)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = {p_id: (pname, p_path) for p_id, (pname, p_path) in pipelines.items()}\n",
+    "\n",
+    "# pipeline_ids = pipelines.keys()\n",
+    "pipeline_ids = [\n",
+    "    y\n",
+    "    for y, _ in [\n",
+    "        (273, \"timetrigger_26w\"),\n",
+    "        (275, \"timetrigger_13w\"),\n",
+    "        (278, \"timetrigger_1y\"),\n",
+    "        # (280, 'timetrigger_4y'),\n",
+    "        # (282, 'timetrigger_2y'),\n",
+    "        (274, \"dataamount_10000\"),\n",
+    "        (276, \"dataamount_5000\"),\n",
+    "        (277, \"dataamount_20000\"),\n",
+    "        # (279, 'dataamount_80000'),\n",
+    "        # (281, 'dataamount_40000'),\n",
+    "        (745, \"dataamount_15000\"),\n",
+    "        # (750, 'dataamount_30000'),\n",
+    "        (763, \"drifttrigger_mmd-quant-0.05-20_int1500_win1y\"),\n",
+    "        (769, \"drifttrigger_mmd-quant-0.15-20_int1500_win1y\"),\n",
+    "        (770, \"drifttrigger_mmd-rollavg-5.0-20_int1500_win1y\"),\n",
+    "        (771, \"drifttrigger_mmd-rollavg-2.0-20_int1500_win1y\"),\n",
+    "        (772, \"drifttrigger_mmd-rollavg-1.0-20_int1500_win1y\"),\n",
+    "        (774, \"drifttrigger_mmd-rollavg-0.5-20_int1500_win1y\"),\n",
+    "        (689, \"performancetrigger_num_misclass-8000-exp-0.5-red-False--int1500y\"),\n",
+    "        (705, \"performancetrigger_num_misclass-8000-exp-0.6-red-False--int1500y\"),\n",
+    "        (722, \"performancetrigger_num_misclass-4000-exp-0.5-red-False--int1500y\"),\n",
+    "        (724, \"performancetrigger_num_misclass-4000-exp-0.6-red-False--int1500y\"),\n",
+    "        (725, \"performancetrigger_num_misclass-1000-exp-0.5-red-False--int1500y\"),\n",
+    "        (726, \"performancetrigger_num_misclass-1000-exp-0.6-red-False--int1500y\"),\n",
+    "        (773, \"performancetrigger_num_misclass-500-exp-0.5-red-False--int1500y\"),\n",
+    "        (775, \"performancetrigger_num_misclass-250-exp-0.6-red-False--int1500y\"),\n",
+    "        (776, \"performancetrigger_num_misclass-500-exp-0.6-red-False--int1500y\"),\n",
+    "        (778, \"performancetrigger_num_misclass-250-exp-0.5-red-False--int1500y\"),\n",
+    "        (635, \"performancetrigger_static-0.45-int1500y\"),\n",
+    "        (636, \"performancetrigger_dynamic-quant-0.05-15-int1500y\"),\n",
+    "        (637, \"performancetrigger_dynamic-rollavg-0.3-15-int1500y\"),\n",
+    "        (639, \"performancetrigger_static-0.5-int1500y\"),\n",
+    "        (640, \"performancetrigger_dynamic-rollavg-0.3-30-int1500y\"),\n",
+    "        (642, \"performancetrigger_dynamic-quant-0.05-30-int1500y\"),\n",
+    "        (643, \"performancetrigger_static-0.55-int1500y\"),\n",
+    "        (645, \"performancetrigger_dynamic-rollavg-0.2-15-int1500y\"),\n",
+    "        (646, \"performancetrigger_dynamic-quant-0.15-15-int1500y\"),\n",
+    "        (647, \"performancetrigger_static-0.6-int1500y\"),\n",
+    "        (649, \"performancetrigger_dynamic-rollavg-0.2-30-int1500y\"),\n",
+    "        (650, \"performancetrigger_dynamic-quant-0.15-30-int1500y\"),\n",
+    "        (651, \"performancetrigger_dynamic-rollavg-0.1-15-int1500y\"),\n",
+    "        (653, \"performancetrigger_dynamic-quant-0.3-15-int1500y\"),\n",
+    "        (654, \"performancetrigger_dynamic-rollavg-0.1-30-int1500y\"),\n",
+    "        (656, \"performancetrigger_dynamic-quant-0.3-30-int1500y\"),\n",
+    "    ]\n",
+    "]\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"2013-05-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count the triggers per pipeline that occur after maximum_min\n",
+    "\n",
+    "# one trigger happened before the cutoff (it corresponds to the start of our reduced dataset), hence the +1 below\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\", how=\"inner\")\n",
+    "assert num_triggers.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "renamed = merged.copy()\n",
+    "\n",
+    "# renamed = merged[\n",
+    "#     merged[\"pipeline_id\"].isin(\n",
+    "#         [\n",
+    "#             # # static thresholds\n",
+    "#             # 113,  # 0.03\n",
+    "#             # 112,  # 0.05\n",
+    "#             # 107,  # 0.07\n",
+    "#             # 109,  # 0.09\n",
+    "#             # 85,  # 0.12\n",
+    "#             # # dyn quantile\n",
+    "#             # 353,  # % 0.05\n",
+    "#             # 345,  # % 0.10\n",
+    "#             # 357,  # % 0.15\n",
+    "#             # # dyn roll. avg\n",
+    "#             # 372,  # Δ 2.0\n",
+    "#             # 370,  # Δ 1.0\n",
+    "#             # 369,  # Δ 0.5\n",
+    "#             # 363,  # Δ 0.05\n",
+    "#         ]\n",
+    "#     )\n",
+    "# ].copy()\n",
+    "renamed[\"Trigger SubType\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"DataAmount\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Time\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                (\n",
+    "                    \"Static\"\n",
+    "                    if \"_mmd-0\" in x\n",
+    "                    else (\"Quantile\" if \"quant\" in x else (\"Rolling Avg.\" if \"roll\" in x else (\"unknown\")))\n",
+    "                )\n",
+    "                if \"drift\" in x\n",
+    "                else (\n",
+    "                    (\n",
+    "                        \"Static\"\n",
+    "                        if \"static\" in x\n",
+    "                        else (\n",
+    "                            \"Quantile\"\n",
+    "                            if \"quant\" in x\n",
+    "                            else (\n",
+    "                                \"Rolling Avg.\"\n",
+    "                                if \"roll\" in x\n",
+    "                                else (\"AvoidableMisclass\" if \"num_misclass\" in x else (\"unknown\"))\n",
+    "                            )\n",
+    "                        )\n",
+    "                    )\n",
+    "                    if \"performancetrigger\" in x\n",
+    "                    else (\n",
+    "                        \"DataIncorporationLatency\"\n",
+    "                        if \"data_inc\" in x\n",
+    "                        else (\"AvoidableMisclass\" if \"avoidable\" in x else (\"unknown\"))\n",
+    "                    )\n",
+    "                )\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "renamed[\"Trigger Type\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"Simple\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Simple\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                \"DataDrift\"\n",
+    "                if \"drift\" in x\n",
+    "                else (\"Performance\" if \"performancetrigger\" in x else (\"Cost\" if \"costtrigger\" in x else (\"unknown\")))\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# assert that there are no unknown entries and no DataIncorporationLatency subtypes\n",
+    "assert not renamed[\"Trigger Type\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"DataIncorporationLatency\").any()\n",
+    "\n",
+    "# assert no cost triggers\n",
+    "assert not renamed[\"Trigger Type\"].str.contains(\"Cost\").any()\n",
+    "\n",
+    "renamed[\"Trigger Type\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger Type\"], categories=[\"Simple\", \"DataDrift\", \"Performance\"], ordered=True\n",
+    ")\n",
+    "\n",
+    "renamed[\"Trigger SubType\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger SubType\"],\n",
+    "    categories=[\"DataAmount\", \"Time\", \"Static\", \"Quantile\", \"Rolling Avg.\", \"AvoidableMisclass\"],\n",
+    "    ordered=True,\n",
+    ")\n",
+    "\n",
+    "renamed = renamed.sort_values(by=[\"Trigger Type\", \"Trigger SubType\", \"pipeline_id\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=1,\n",
+    "    width_factor=1,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"_all_tradeoff_yearbook_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=1,\n",
+    "    width_factor=1,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"tradeoff_drift_yearbook_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=1.5,\n",
+    "    width_factor=1.8,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"tradeoff_drift_yearbook_triggers_cost\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/compare_all/yb_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/compare_all/yb_perf_tradeoff.ipynb
new file mode 100644
index 000000000..811a5f083
--- /dev/null
+++ b/analytics/plotting/rh_thesis/compare_all/yb_perf_tradeoff.ipynb
@@ -0,0 +1,891 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    patch_yearbook_time,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/11_baselines_amount\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/21_datadrift_dynamic\"),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/num_misclass\"\n",
+    "    ),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/static_dyn\"\n",
+    "    ),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/40_cost_dataincorporation_latency\"\n",
+    "    ),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/41_avoidable_miclass_cost\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    print(\"Reading\", dir)\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    print(dir_pipelines)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only takes effect if include_composite_model is True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = {p_id: (pname, p_path) for p_id, (pname, p_path) in pipelines.items()}\n",
+    "pipeline_ids = pipelines.keys()\n",
+    "pipeline_ids = [\n",
+    "    y\n",
+    "    for y, _ in [\n",
+    "        # (20, 'yearbook_timetrigger_40y'),\n",
+    "        (23, \"yearbook_timetrigger_25y\"),\n",
+    "        (24, \"yearbook_timetrigger_15y\"),\n",
+    "        (25, \"yearbook_timetrigger_10y\"),\n",
+    "        (26, \"yearbook_timetrigger_5y\"),\n",
+    "        (27, \"yearbook_timetrigger_4y\"),\n",
+    "        (29, \"yearbook_timetrigger_3y\"),\n",
+    "        (31, \"yearbook_timetrigger_2y\"),\n",
+    "        (33, \"yearbook_timetrigger_1y\"),\n",
+    "        (21, \"yearbook_dataamount_250\"),\n",
+    "        (30, \"yearbook_dataamount_500\"),\n",
+    "        (32, \"yearbook_dataamount_1000\"),\n",
+    "        (35, \"yearbook_dataamount_2500\"),\n",
+    "        (36, \"yearbook_dataamount_5000\"),\n",
+    "        (37, \"yearbook_dataamount_10000\"),\n",
+    "        # (38, 'yearbook_dataamount_15000'),\n",
+    "        # (39, 'yearbook_dataamount_30000'),\n",
+    "        # duplicates\n",
+    "        # (666, 'yearbook_dataamount_1000'),\n",
+    "        # (667, 'yearbook_dataamount_250'),\n",
+    "        # (668, 'yearbook_dataamount_2500'),\n",
+    "        # (669, 'yearbook_dataamount_5000'),\n",
+    "        # (670, 'yearbook_dataamount_10000'),\n",
+    "        # (671, 'yearbook_dataamount_500'),\n",
+    "        # (672, 'yearbook_dataamount_15000'),\n",
+    "        # (673, 'yearbook_dataamount_30000'),\n",
+    "        # (63, 'yearbook_drifttrigger_mmd-0.09_int100_win1d'),\n",
+    "        # (64, 'yearbook_drifttrigger_mmd-0.07_int100_win1d'),\n",
+    "        # (65, 'yearbook_drifttrigger_mmd-0.12_int100_win1d'),\n",
+    "        # (66, 'yearbook_drifttrigger_mmd-0.15_int100_win1d'),\n",
+    "        # (67, 'yearbook_drifttrigger_mmd-0.03_int100_win1d'),\n",
+    "        # (68, 'yearbook_drifttrigger_mmd-0.05_int100_win1d'),\n",
+    "        # (69, 'yearbook_drifttrigger_mmd-0.12_int100_win4d'),\n",
+    "        # (70, 'yearbook_drifttrigger_mmd-0.2_int100_win1d'),\n",
+    "        # (71, 'yearbook_drifttrigger_mmd-0.4_int100_win1d'),\n",
+    "        # (72, 'yearbook_drifttrigger_mmd-0.15_int100_win4d'),\n",
+    "        # (73, 'yearbook_drifttrigger_mmd-0.09_int100_win4d'),\n",
+    "        # (74, 'yearbook_drifttrigger_mmd-0.07_int100_win4d'),\n",
+    "        # (75, 'yearbook_drifttrigger_mmd-0.12_int100_win10d'),\n",
+    "        # (76, 'yearbook_drifttrigger_mmd-0.03_int100_win4d'),\n",
+    "        # (77, 'yearbook_drifttrigger_mmd-0.05_int100_win4d'),\n",
+    "        # (78, 'yearbook_drifttrigger_mmd-0.15_int100_win10d'),\n",
+    "        # (79, 'yearbook_drifttrigger_mmd-0.4_int100_win4d'),\n",
+    "        # (80, 'yearbook_drifttrigger_mmd-0.2_int100_win4d'),\n",
+    "        (81, \"yearbook_drifttrigger_mmd-0.12_int250_win1d\"),\n",
+    "        # (82, 'yearbook_drifttrigger_mmd-0.09_int100_win10d'),\n",
+    "        # (83, 'yearbook_drifttrigger_mmd-0.07_int100_win10d'),\n",
+    "        (84, \"yearbook_drifttrigger_mmd-0.15_int250_win1d\"),\n",
+    "        (85, \"yearbook_drifttrigger_mmd-0.12_int250_win4d\"),\n",
+    "        # (86, 'yearbook_drifttrigger_mmd-0.03_int100_win10d'),\n",
+    "        # (87, 'yearbook_drifttrigger_mmd-0.05_int100_win10d'),\n",
+    "        (88, \"yearbook_drifttrigger_mmd-0.15_int250_win4d\"),\n",
+    "        (89, \"yearbook_drifttrigger_mmd-0.12_int250_win10d\"),\n",
+    "        # (90, 'yearbook_drifttrigger_mmd-0.2_int100_win10d'),\n",
+    "        (91, \"yearbook_drifttrigger_mmd-0.15_int250_win10d\"),\n",
+    "        # (92, 'yearbook_drifttrigger_mmd-0.4_int100_win10d'),\n",
+    "        # (93, 'yearbook_drifttrigger_mmd-0.12_int500_win1d'),\n",
+    "        # (94, 'yearbook_drifttrigger_mmd-0.15_int500_win1d'),\n",
+    "        (95, \"yearbook_drifttrigger_mmd-0.07_int250_win1d\"),\n",
+    "        # (96, 'yearbook_drifttrigger_mmd-0.12_int500_win4d'),\n",
+    "        (97, \"yearbook_drifttrigger_mmd-0.09_int250_win1d\"),\n",
+    "        # (98, 'yearbook_drifttrigger_mmd-0.15_int500_win4d'),\n",
+    "        (99, \"yearbook_drifttrigger_mmd-0.05_int250_win1d\"),\n",
+    "        (100, \"yearbook_drifttrigger_mmd-0.03_int250_win1d\"),\n",
+    "        # (101, 'yearbook_drifttrigger_mmd-0.12_int500_win10d'),\n",
+    "        # (102, 'yearbook_drifttrigger_mmd-0.15_int500_win10d'),\n",
+    "        (103, \"yearbook_drifttrigger_mmd-0.2_int250_win1d\"),\n",
+    "        (104, \"yearbook_drifttrigger_mmd-0.4_int250_win1d\"),\n",
+    "        # (105, 'yearbook_drifttrigger_mmd-0.12_int1000_win1d'),\n",
+    "        # (106, 'yearbook_drifttrigger_mmd-0.15_int1000_win1d'),\n",
+    "        (107, \"yearbook_drifttrigger_mmd-0.07_int250_win4d\"),\n",
+    "        # (108, 'yearbook_drifttrigger_mmd-0.12_int1000_win4d'),\n",
+    "        (109, \"yearbook_drifttrigger_mmd-0.09_int250_win4d\"),\n",
+    "        # (110, 'yearbook_drifttrigger_mmd-0.15_int1000_win4d'),\n",
+    "        # (111, 'yearbook_drifttrigger_mmd-0.12_int1000_win10d'),\n",
+    "        (112, \"yearbook_drifttrigger_mmd-0.05_int250_win4d\"),\n",
+    "        (113, \"yearbook_drifttrigger_mmd-0.03_int250_win4d\"),\n",
+    "        # (114, 'yearbook_drifttrigger_mmd-0.15_int1000_win10d'),\n",
+    "        (115, \"yearbook_drifttrigger_mmd-0.2_int250_win4d\"),\n",
+    "        (116, \"yearbook_drifttrigger_mmd-0.4_int250_win4d\"),\n",
+    "        (117, \"yearbook_drifttrigger_mmd-0.07_int250_win10d\"),\n",
+    "        (118, \"yearbook_drifttrigger_mmd-0.09_int250_win10d\"),\n",
+    "        (119, \"yearbook_drifttrigger_mmd-0.05_int250_win10d\"),\n",
+    "        # (122, 'yearbook_drifttrigger_mmd-0.09_int500_win1d'),\n",
+    "        (123, \"yearbook_drifttrigger_mmd-0.2_int250_win10d\"),\n",
+    "        # (126, 'yearbook_drifttrigger_mmd-0.09_int500_win4d'),\n",
+    "        # (127, 'yearbook_drifttrigger_mmd-0.07_int500_win1d'),\n",
+    "        # (132, 'yearbook_drifttrigger_mmd-0.05_int500_win1d'),\n",
+    "        # (133, 'yearbook_drifttrigger_mmd-0.4_int500_win10d'),\n",
+    "        # (136, 'yearbook_drifttrigger_mmd-0.4_int1000_win1d'),\n",
+    "        # (137, 'yearbook_drifttrigger_mmd-0.2_int500_win1d'),\n",
+    "        # (138, 'yearbook_drifttrigger_mmd-0.09_int1000_win4d'),\n",
+    "        # (139, 'yearbook_drifttrigger_mmd-0.07_int500_win4d'),\n",
+    "        # (144, 'yearbook_drifttrigger_mmd-0.4_int1000_win10d'),\n",
+    "        # (145, 'yearbook_drifttrigger_mmd-0.05_int500_win4d'),\n",
+    "        # (146, 'yearbook_drifttrigger_mmd-0.2_int500_win4d'),\n",
+    "        # (147, 'yearbook_drifttrigger_mmd-0.07_int500_win10d'),\n",
+    "        # (148, 'yearbook_drifttrigger_mmd-0.05_int500_win10d'),\n",
+    "        # (149, 'yearbook_drifttrigger_mmd-0.2_int500_win10d'),\n",
+    "        # (150, 'yearbook_drifttrigger_mmd-0.07_int1000_win1d'),\n",
+    "        # (151, 'yearbook_drifttrigger_mmd-0.05_int1000_win1d'),\n",
+    "        # (152, 'yearbook_drifttrigger_mmd-0.2_int1000_win1d'),\n",
+    "        # (153, 'yearbook_drifttrigger_mmd-0.07_int1000_win4d'),\n",
+    "        # (154, 'yearbook_drifttrigger_mmd-0.05_int1000_win4d'),\n",
+    "        # (155, 'yearbook_drifttrigger_mmd-0.2_int1000_win4d'),\n",
+    "        # (156, 'yearbook_drifttrigger_mmd-0.07_int1000_win10d'),\n",
+    "        # (157, 'yearbook_drifttrigger_mmd-0.05_int1000_win10d'),\n",
+    "        # (158, 'yearbook_drifttrigger_mmd-0.2_int1000_win10d'),\n",
+    "        (159, \"yearbook_drifttrigger_mmd-0.03_int250_win10d\"),\n",
+    "        # (160, 'yearbook_drifttrigger_mmd-0.03_int1000_win1d'),\n",
+    "        # (161, 'yearbook_drifttrigger_mmd-0.4_int500_win4d'),\n",
+    "        # (162, 'yearbook_drifttrigger_mmd-0.03_int500_win10d'),\n",
+    "        # (163, 'yearbook_drifttrigger_mmd-0.4_int1000_win4d'),\n",
+    "        # (164, 'yearbook_drifttrigger_mmd-0.09_int1000_win10d'),\n",
+    "        # (165, 'yearbook_drifttrigger_mmd-0.03_int500_win1d'),\n",
+    "        (166, \"yearbook_drifttrigger_mmd-0.4_int250_win10d\"),\n",
+    "        # (167, 'yearbook_drifttrigger_mmd-0.09_int1000_win1d'),\n",
+    "        # (168, 'yearbook_drifttrigger_mmd-0.09_int500_win10d'),\n",
+    "        # (169, 'yearbook_drifttrigger_mmd-0.03_int500_win4d'),\n",
+    "        # (170, 'yearbook_drifttrigger_mmd-0.4_int500_win1d'),\n",
+    "        # (171, 'yearbook_drifttrigger_mmd-0.03_int1000_win10d'),\n",
+    "        # (172, 'yearbook_drifttrigger_mmd-0.03_int1000_win4d'),\n",
+    "        # (329, 'yearbook_drifttrigger_mmd-quant-0.05-10_int500_win4d'),\n",
+    "        # (330, 'yearbook_drifttrigger_mmd-quant-0.05-20_int500_win4d\\n'),\n",
+    "        # (331, 'yearbook_drifttrigger_mmd-quant-0.05-30_int100_win4d'),\n",
+    "        # (332, 'yearbook_drifttrigger_mmd-quant-0.1-10_int500_win4d'),\n",
+    "        # (333, 'yearbook_drifttrigger_mmd-quant-0.15-20_int500_win4d'),\n",
+    "        # (334, 'yearbook_drifttrigger_mmd-quant-0.15-10_int500_win4d'),\n",
+    "        # (335, 'yearbook_drifttrigger_mmd-quant-0.1-20_int500_win4d'),\n",
+    "        # (336, 'yearbook_drifttrigger_mmd-quant-0.3-20_int500_win4d'),\n",
+    "        # (337, 'yearbook_drifttrigger_mmd-quant-0.1-30_int500_win4d'),\n",
+    "        # (338, 'yearbook_drifttrigger_mmd-quant-0.3-10_int500_win4d'),\n",
+    "        # (339, 'yearbook_drifttrigger_mmd-rollavg-0.05-20_int500_win4d'),\n",
+    "        (340, \"yearbook_drifttrigger_mmd-quant-0.1-10_int250_win4d\"),\n",
+    "        # (341, 'yearbook_drifttrigger_mmd-quant-0.15-30_int100_win4d'),\n",
+    "        # (342, 'yearbook_drifttrigger_mmd-rollavg-0.05-10_int500_win4d'),\n",
+    "        # (343, 'yearbook_drifttrigger_mmd-rollavg-0.2-20_int500_win4d'),\n",
+    "        # (344, 'yearbook_drifttrigger_mmd-rollavg-0.2-10_int500_win4d'),\n",
+    "        (345, \"yearbook_drifttrigger_mmd-quant-0.1-20_int250_win4d\"),\n",
+    "        # (346, 'yearbook_drifttrigger_mmd-rollavg-0.5-20_int500_win4d'),\n",
+    "        # (347, 'yearbook_drifttrigger_mmd-rollavg-0.5-10_int500_win4d'),\n",
+    "        # (348, 'yearbook_drifttrigger_mmd-rollavg-1.0-20_int500_win4d'),\n",
+    "        (349, \"yearbook_drifttrigger_mmd-quant-0.1-30_int250_win4d\"),\n",
+    "        # (350, 'yearbook_drifttrigger_mmd-rollavg-1.0-10_int500_win4d'),\n",
+    "        # (351, 'yearbook_drifttrigger_mmd-rollavg-2.0-20_int500_win4d'),\n",
+    "        # (352, 'yearbook_drifttrigger_mmd-quant-0.3-30_int100_win4d'),\n",
+    "        (353, \"yearbook_drifttrigger_mmd-quant-0.05-20_int250_win4d\"),\n",
+    "        # (354, 'yearbook_drifttrigger_mmd-rollavg-2.0-10_int500_win4d'),\n",
+    "        # (355, 'yearbook_drifttrigger_mmd-quant-0.1-10_int100_win4d'),\n",
+    "        (356, \"yearbook_drifttrigger_mmd-quant-0.05-10_int250_win4d\"),\n",
+    "        (357, \"yearbook_drifttrigger_mmd-quant-0.15-20_int250_win4d\"),\n",
+    "        (358, \"yearbook_drifttrigger_mmd-quant-0.15-10_int250_win4d\"),\n",
+    "        (359, \"yearbook_drifttrigger_mmd-quant-0.3-20_int250_win4d\"),\n",
+    "        # (360, 'yearbook_drifttrigger_mmd-rollavg-0.05-30_int100_win4d'),\n",
+    "        # (361, 'yearbook_drifttrigger_mmd-quant-0.1-20_int100_win4d'),\n",
+    "        (362, \"yearbook_drifttrigger_mmd-quant-0.3-10_int250_win4d\"),\n",
+    "        (363, \"yearbook_drifttrigger_mmd-rollavg-0.05-20_int250_win4d\"),\n",
+    "        (364, \"yearbook_drifttrigger_mmd-rollavg-0.05-10_int250_win4d\"),\n",
+    "        (365, \"yearbook_drifttrigger_mmd-rollavg-0.2-20_int250_win4d\"),\n",
+    "        # (366, 'yearbook_drifttrigger_mmd-quant-0.1-30_int100_win4d'),\n",
+    "        # (367, 'yearbook_drifttrigger_mmd-rollavg-0.2-30_int100_win4d'),\n",
+    "        (368, \"yearbook_drifttrigger_mmd-rollavg-0.2-10_int250_win4d\"),\n",
+    "        (369, \"yearbook_drifttrigger_mmd-rollavg-0.5-20_int250_win4d\"),\n",
+    "        (370, \"yearbook_drifttrigger_mmd-rollavg-1.0-20_int250_win4d\"),\n",
+    "        (371, \"yearbook_drifttrigger_mmd-rollavg-0.5-10_int250_win4d\"),\n",
+    "        (372, \"yearbook_drifttrigger_mmd-rollavg-2.0-20_int250_win4d\"),\n",
+    "        (373, \"yearbook_drifttrigger_mmd-rollavg-1.0-10_int250_win4d\"),\n",
+    "        # (374, 'yearbook_drifttrigger_mmd-rollavg-0.5-30_int100_win4d'),\n",
+    "        # (375, 'yearbook_drifttrigger_mmd-quant-0.05-20_int100_win4d'),\n",
+    "        (376, \"yearbook_drifttrigger_mmd-rollavg-2.0-10_int250_win4d\"),\n",
+    "        # (377, 'yearbook_drifttrigger_mmd-quant-0.05-10_int100_win4d'),\n",
+    "        # (378, 'yearbook_drifttrigger_mmd-rollavg-1.0-30_int100_win4d'),\n",
+    "        # (379, 'yearbook_drifttrigger_mmd-quant-0.15-20_int100_win4d'),\n",
+    "        # (380, 'yearbook_drifttrigger_mmd-quant-0.15-10_int100_win4d'),\n",
+    "        # (381, 'yearbook_drifttrigger_mmd-rollavg-2.0-30_int100_win4d'),\n",
+    "        # (382, 'yearbook_drifttrigger_mmd-quant-0.3-20_int100_win4d'),\n",
+    "        (383, \"yearbook_drifttrigger_mmd-quant-0.05-30_int250_win4d\"),\n",
+    "        # (384, 'yearbook_drifttrigger_mmd-quant-0.3-10_int100_win4d'),\n",
+    "        (385, \"yearbook_drifttrigger_mmd-quant-0.15-30_int250_win4d\"),\n",
+    "        (386, \"yearbook_drifttrigger_mmd-quant-0.3-30_int250_win4d\"),\n",
+    "        # (387, 'yearbook_drifttrigger_mmd-rollavg-0.05-20_int100_win4d'),\n",
+    "        (388, \"yearbook_drifttrigger_mmd-rollavg-0.05-30_int250_win4d\"),\n",
+    "        # (389, 'yearbook_drifttrigger_mmd-rollavg-0.05-10_int100_win4d'),\n",
+    "        (390, \"yearbook_drifttrigger_mmd-rollavg-0.2-30_int250_win4d\"),\n",
+    "        # (391, 'yearbook_drifttrigger_mmd-rollavg-0.2-20_int100_win4d'),\n",
+    "        (392, \"yearbook_drifttrigger_mmd-rollavg-0.5-30_int250_win4d\"),\n",
+    "        (393, \"yearbook_drifttrigger_mmd-rollavg-1.0-30_int250_win4d\"),\n",
+    "        # (394, 'yearbook_drifttrigger_mmd-rollavg-0.2-10_int100_win4d'),\n",
+    "        (395, \"yearbook_drifttrigger_mmd-rollavg-2.0-30_int250_win4d\"),\n",
+    "        # (396, 'yearbook_drifttrigger_mmd-rollavg-0.5-20_int100_win4d'),\n",
+    "        # (397, 'yearbook_drifttrigger_mmd-quant-0.05-30_int500_win4d'),\n",
+    "        # (398, 'yearbook_drifttrigger_mmd-quant-0.15-30_int500_win4d'),\n",
+    "        # (399, 'yearbook_drifttrigger_mmd-rollavg-0.5-10_int100_win4d'),\n",
+    "        # (400, 'yearbook_drifttrigger_mmd-quant-0.3-30_int500_win4d'),\n",
+    "        # (401, 'yearbook_drifttrigger_mmd-rollavg-1.0-20_int100_win4d'),\n",
+    "        # (402, 'yearbook_drifttrigger_mmd-rollavg-0.05-30_int500_win4d'),\n",
+    "        # (403, 'yearbook_drifttrigger_mmd-rollavg-0.2-30_int500_win4d'),\n",
+    "        # (404, 'yearbook_drifttrigger_mmd-rollavg-1.0-10_int100_win4d'),\n",
+    "        # (405, 'yearbook_drifttrigger_mmd-rollavg-0.5-30_int500_win4d'),\n",
+    "        # (406, 'yearbook_drifttrigger_mmd-rollavg-2.0-20_int100_win4d'),\n",
+    "        # (407, 'yearbook_drifttrigger_mmd-rollavg-1.0-30_int500_win4d'),\n",
+    "        # (408, 'yearbook_drifttrigger_mmd-rollavg-2.0-30_int500_win4d'),\n",
+    "        # (409, 'yearbook_drifttrigger_mmd-rollavg-2.0-10_int100_win4d'),\n",
+    "        # (683,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1500-exp-0.85-red-True--int250y'),\n",
+    "        # (685,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1500-exp-0.85-red-False--int250y'),\n",
+    "        (686, \"yearbook_performancetrigger_num_misclass-1500-exp-0.9-red-True--int250y\"),\n",
+    "        (687, \"yearbook_performancetrigger_num_misclass-1500-exp-0.9-red-False--int250y\"),\n",
+    "        # (688,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1500-exp-0.95-red-True--int250y'),\n",
+    "        # (704,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-200-exp-0.85-red-False--int250y'),\n",
+    "        # (727,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1500-exp-0.95-red-False--int250y'),\n",
+    "        # (728,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1000-exp-0.85-red-True--int250y'),\n",
+    "        # (729,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1000-exp-0.85-red-False--int250y'),\n",
+    "        # (730,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-50-exp-0.85-red-True--int250y'),\n",
+    "        (731, \"yearbook_performancetrigger_num_misclass-1000-exp-0.9-red-True--int250y\"),\n",
+    "        # (732,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-50-exp-0.85-red-False--int250y'),\n",
+    "        (733, \"yearbook_performancetrigger_num_misclass-1000-exp-0.9-red-False--int250y\"),\n",
+    "        (734, \"yearbook_performancetrigger_num_misclass-50-exp-0.9-red-True--int250y\"),\n",
+    "        # (735,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1000-exp-0.95-red-True--int250y'),\n",
+    "        (736, \"yearbook_performancetrigger_num_misclass-50-exp-0.9-red-False--int250y\"),\n",
+    "        # (737,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-1000-exp-0.95-red-False--int250y'),\n",
+    "        # (738,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-50-exp-0.95-red-True--int250y'),\n",
+    "        # (739,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-500-exp-0.85-red-True--int250y'),\n",
+    "        # (740,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-50-exp-0.95-red-False--int250y'),\n",
+    "        # (741,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-500-exp-0.85-red-False--int250y'),\n",
+    "        (742, \"yearbook_performancetrigger_num_misclass-500-exp-0.9-red-True--int250y\"),\n",
+    "        (743, \"yearbook_performancetrigger_num_misclass-500-exp-0.9-red-False--int250y\"),\n",
+    "        # (744,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-500-exp-0.95-red-True--int250y'),\n",
+    "        # (746,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-500-exp-0.95-red-False--int250y'),\n",
+    "        # (747,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-200-exp-0.85-red-True--int250y'),\n",
+    "        (749, \"yearbook_performancetrigger_num_misclass-200-exp-0.9-red-True--int250y\"),\n",
+    "        (751, \"yearbook_performancetrigger_num_misclass-200-exp-0.9-red-False--int250y\"),\n",
+    "        # (753,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-200-exp-0.95-red-True--int250y'),\n",
+    "        # (754,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-200-exp-0.95-red-False--int250y'),\n",
+    "        # (755,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-100-exp-0.85-red-True--int250y'),\n",
+    "        # (757,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-100-exp-0.85-red-False--int250y'),\n",
+    "        (758, \"yearbook_performancetrigger_num_misclass-100-exp-0.9-red-True--int250y\"),\n",
+    "        (759, \"yearbook_performancetrigger_num_misclass-100-exp-0.9-red-False--int250y\"),\n",
+    "        # (760,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-100-exp-0.95-red-True--int250y'),\n",
+    "        # (761,\n",
+    "        # 'yearbook_performancetrigger_num_misclass-100-exp-0.95-red-False--int250y'),\n",
+    "        # (410, 'yearbook_performancetrigger_static-0.7-int100y'),\n",
+    "        (411, \"yearbook_performancetrigger_static-0.7-int250y\"),\n",
+    "        # (412, 'yearbook_performancetrigger_static-0.7-int500y'),\n",
+    "        # (413, 'yearbook_performancetrigger_static-0.75-int500y'),\n",
+    "        (414, \"yearbook_performancetrigger_static-0.75-int250y\"),\n",
+    "        # (416, 'yearbook_performancetrigger_static-0.8-int500y'),\n",
+    "        # (417, 'yearbook_performancetrigger_static-0.75-int100y'),\n",
+    "        (418, \"yearbook_performancetrigger_static-0.8-int250y\"),\n",
+    "        # (419, 'yearbook_performancetrigger_static-0.85-int500y'),\n",
+    "        (421, \"yearbook_performancetrigger_static-0.85-int250y\"),\n",
+    "        # (422, 'yearbook_performancetrigger_static-0.875-int500y'),\n",
+    "        # (423, 'yearbook_performancetrigger_static-0.8-int100y'),\n",
+    "        # (424, 'yearbook_performancetrigger_static-0.9-int500y'),\n",
+    "        (425, \"yearbook_performancetrigger_static-0.875-int250y\"),\n",
+    "        # (427, 'yearbook_performancetrigger_static-0.925-int500y'),\n",
+    "        # (428, 'yearbook_performancetrigger_static-0.85-int100y'),\n",
+    "        (429, \"yearbook_performancetrigger_static-0.9-int250y\"),\n",
+    "        # (430, 'yearbook_performancetrigger_static-0.95-int500y'),\n",
+    "        (432, \"yearbook_performancetrigger_static-0.925-int250y\"),\n",
+    "        # (433, 'yearbook_performancetrigger_dynamic-quant-0.05-10-int500y'),\n",
+    "        # (434, 'yearbook_performancetrigger_static-0.875-int100y'),\n",
+    "        # (436, 'yearbook_performancetrigger_dynamic-quant-0.05-20-int500y'),\n",
+    "        (437, \"yearbook_performancetrigger_static-0.95-int250y\"),\n",
+    "        # (438, 'yearbook_performancetrigger_dynamic-quant-0.05-30-int500y'),\n",
+    "        # (440, 'yearbook_performancetrigger_dynamic-quant-0.15-10-int500y'),\n",
+    "        # (441, 'yearbook_performancetrigger_static-0.9-int100y'),\n",
+    "        # (442, 'yearbook_performancetrigger_dynamic-quant-0.05-10-int250y'),\n",
+    "        # (443, 'yearbook_performancetrigger_dynamic-quant-0.15-20-int500y'),\n",
+    "        (445, \"yearbook_performancetrigger_dynamic-quant-0.05-20-int250y\"),\n",
+    "        # (446, 'yearbook_performancetrigger_dynamic-quant-0.15-30-int500y'),\n",
+    "        # (447, 'yearbook_performancetrigger_dynamic-quant-0.3-10-int500y'),\n",
+    "        # (448, 'yearbook_performancetrigger_dynamic-quant-0.05-30-int250y'),\n",
+    "        # (450, 'yearbook_performancetrigger_static-0.925-int100y'),\n",
+    "        # (451, 'yearbook_performancetrigger_dynamic-quant-0.3-20-int500y'),\n",
+    "        # (452, 'yearbook_performancetrigger_dynamic-quant-0.15-10-int250y'),\n",
+    "        # (454, 'yearbook_performancetrigger_dynamic-quant-0.3-30-int500y'),\n",
+    "        (455, \"yearbook_performancetrigger_dynamic-quant-0.15-20-int250y\"),\n",
+    "        # (458, 'yearbook_performancetrigger_dynamic-quant-0.15-30-int250y'),\n",
+    "        # (459, 'yearbook_performancetrigger_static-0.95-int100y'),\n",
+    "        # (463, 'yearbook_performancetrigger_dynamic-quant-0.3-10-int250y'),\n",
+    "        # (464, 'yearbook_performancetrigger_dynamic-rollavg-0.05-10-int500y'),\n",
+    "        # (465, 'yearbook_performancetrigger_dynamic-rollavg-0.05-20-int500y'),\n",
+    "        (467, \"yearbook_performancetrigger_dynamic-quant-0.3-20-int250y\"),\n",
+    "        # (468, 'yearbook_performancetrigger_dynamic-rollavg-0.05-30-int500y'),\n",
+    "        # (469, 'yearbook_performancetrigger_dynamic-rollavg-0.1-10-int500y'),\n",
+    "        # (471, 'yearbook_performancetrigger_dynamic-quant-0.3-30-int250y'),\n",
+    "        # (472, 'yearbook_performancetrigger_dynamic-rollavg-0.1-20-int500y'),\n",
+    "        # (473, 'yearbook_performancetrigger_dynamic-quant-0.05-10-int100y'),\n",
+    "        # (474, 'yearbook_performancetrigger_dynamic-rollavg-0.1-30-int500y'),\n",
+    "        # (475, 'yearbook_performancetrigger_dynamic-rollavg-0.2-10-int500y'),\n",
+    "        # (478, 'yearbook_performancetrigger_dynamic-rollavg-0.2-20-int500y'),\n",
+    "        # (479, 'yearbook_performancetrigger_dynamic-rollavg-0.2-30-int500y'),\n",
+    "        (481, \"yearbook_performancetrigger_dynamic-quant-0.05-20-int100y\"),\n",
+    "        # (483, 'yearbook_performancetrigger_dynamic-rollavg-0.3-10-int500y'),\n",
+    "        # (484, 'yearbook_performancetrigger_dynamic-rollavg-0.3-20-int500y'),\n",
+    "        # (486, 'yearbook_performancetrigger_dynamic-rollavg-0.3-30-int500y'),\n",
+    "        # (489, 'yearbook_performancetrigger_dynamic-quant-0.05-30-int100y'),\n",
+    "        # (491, 'yearbook_performancetrigger_dynamic-rollavg-0.05-10-int250y'),\n",
+    "        (494, \"yearbook_performancetrigger_dynamic-rollavg-0.05-20-int250y\"),\n",
+    "        # (497, 'yearbook_performancetrigger_dynamic-quant-0.15-10-int100y'),\n",
+    "        # (499, 'yearbook_performancetrigger_dynamic-rollavg-0.05-30-int250y'),\n",
+    "        # (503, 'yearbook_performancetrigger_dynamic-rollavg-0.1-10-int250y'),\n",
+    "        (506, \"yearbook_performancetrigger_dynamic-rollavg-0.1-20-int250y\"),\n",
+    "        (507, \"yearbook_performancetrigger_dynamic-quant-0.15-20-int100y\"),\n",
+    "        # (509, 'yearbook_performancetrigger_dynamic-rollavg-0.1-30-int250y'),\n",
+    "        # (513, 'yearbook_performancetrigger_dynamic-rollavg-0.2-10-int250y'),\n",
+    "        (516, \"yearbook_performancetrigger_dynamic-rollavg-0.2-20-int250y\"),\n",
+    "        # (519, 'yearbook_performancetrigger_dynamic-quant-0.15-30-int100y'),\n",
+    "        # (521, 'yearbook_performancetrigger_dynamic-rollavg-0.2-30-int250y'),\n",
+    "        # (524, 'yearbook_performancetrigger_dynamic-rollavg-0.3-10-int250y'),\n",
+    "        # (527, 'yearbook_performancetrigger_dynamic-rollavg-0.3-20-int250y'),\n",
+    "        # (529, 'yearbook_performancetrigger_dynamic-rollavg-0.3-30-int250y'),\n",
+    "        # (530, 'yearbook_performancetrigger_dynamic-quant-0.3-10-int100y'),\n",
+    "        (539, \"yearbook_performancetrigger_dynamic-quant-0.3-20-int100y\"),\n",
+    "        # (548, 'yearbook_performancetrigger_dynamic-quant-0.3-30-int100y'),\n",
+    "        # (818, 'yearbook_costtrigger_data_inc_int250_exch15552000'),\n",
+    "        # (819, 'yearbook_costtrigger_data_inc_int250_exch13824000'),\n",
+    "        (821, \"yearbook_costtrigger_avoidable_misclass_int250_exch86400000_redFalse\"),\n",
+    "        # (822, 'yearbook_costtrigger_data_inc_int250_exch12096000'),\n",
+    "        # (825, 'yearbook_costtrigger_data_inc_int250_exch10368000'),\n",
+    "        # (835, 'yearbook_costtrigger_data_inc_int250_exch129600000'),\n",
+    "        # (836, 'yearbook_costtrigger_data_inc_int250_exch34560000'),\n",
+    "        # (837, 'yearbook_costtrigger_data_inc_int250_exch25920000'),\n",
+    "        # (838, 'yearbook_costtrigger_data_inc_int250_exch4320000'),\n",
+    "        #  (821, 'yearbook_costtrigger_avoidable_misclass_int250_exch86400000_redFalse'),\n",
+    "        (839, \"yearbook_costtrigger_avoidable_misclass_int250_exch86400000_redFalse\"),\n",
+    "        (840, \"yearbook_costtrigger_avoidable_misclass_int250_exch864000_redFalse\"),\n",
+    "        (841, \"yearbook_costtrigger_avoidable_misclass_int250_exch864000_redFalse\"),\n",
+    "        (842, \"yearbook_costtrigger_avoidable_misclass_int250_exch864.0_redFalse\"),\n",
+    "        #  (843, 'yearbook_costtrigger_avoidable_misclass_int250_exch8640000_redFalse'),\n",
+    "        (844, \"yearbook_costtrigger_avoidable_misclass_int250_exch8640.0_redFalse\"),\n",
+    "        #  (846, 'yearbook_costtrigger_avoidable_misclass_int250_exch8640000000_redFalse'),\n",
+    "        #  (847, 'yearbook_costtrigger_avoidable_misclass_int250_exch864000000_redFalse'),\n",
+    "        (848, \"yearbook_costtrigger_avoidable_misclass_int250_exch64800.0_redFalse\"),\n",
+    "        #  (849, 'yearbook_costtrigger_avoidable_misclass_int250_exch43200.0_redFalse'),\n",
+    "        (850, \"yearbook_costtrigger_avoidable_misclass_int250_exch21600.0_redFalse\"),\n",
+    "        #  (851, 'yearbook_costtrigger_avoidable_misclass_int250_exch4320000000_redFalse')]\n",
+    "    ]\n",
+    "]\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[  # keep only the configured dataset / eval handler / metric\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "].copy()  # own the filtered rows: the column write below must not hit a slice of the concatenated frame\n",
+    "\n",
+    "# in percent (0-100); NOTE: re-running this cell scales the values again\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    patch_yearbook_time(df_leaf, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce the evaluation range to the part where every pipeline already has evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()  # latest first evaluation = common start\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"1962-01-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min].copy()  # interval_center is rewritten later\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find number of triggers per pipeline that are after maximum_min (pipelines with none are re-added as zeros)\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers = num_triggers.reindex(pd.Index(pipeline_ids, name=\"pipeline_id\"), fill_value=0)\n",
+    "# before the cutoff there was one trigger (equivalent to start of our reduced dataset): +1\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\", how=\"inner\")\n",
+    "assert num_triggers.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str) -> str:\n",
+    "    \"\"\"Return the coarse trigger family of pipeline ref ``x`` (\"unknown\" if no marker matches).\"\"\"\n",
+    "    markers = {\"dataamount\": \"amount\", \"time\": \"time\", \"drift\": \"drift\"}\n",
+    "    markers |= {\"performancetrigger\": \"performance\", \"costtrigger\": \"cost\"}\n",
+    "    # first contained marker wins; the old \"year\" / \"d\" tests matched every \"yearbook_*\" ref\n",
+    "    for marker, label in markers.items():\n",
+    "        if marker in x:\n",
+    "            return label\n",
+    "    return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(create_type)\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "renamed = merged.copy()\n",
+    "\n",
+    "# renamed = merged[\n",
+    "#     merged[\"pipeline_id\"].isin(\n",
+    "#         [\n",
+    "#             # # static thresholds\n",
+    "#             # 113,  # 0.03\n",
+    "#             # 112,  # 0.05\n",
+    "#             # 107,  # 0.07\n",
+    "#             # 109,  # 0.09\n",
+    "#             # 85,  # 0.12\n",
+    "#             # # dyn quantile\n",
+    "#             # 353,  # % 0.05\n",
+    "#             # 345,  # % 0.10\n",
+    "#             # 357,  # % 0.15\n",
+    "#             # # dyn roll. avg\n",
+    "#             # 372,  # Δ 2.0\n",
+    "#             # 370,  # Δ 1.0\n",
+    "#             # 369,  # Δ 0.5\n",
+    "#             # 363,  # Δ 0.05\n",
+    "#         ]\n",
+    "#     )\n",
+    "# ].copy()\n",
+    "renamed[\"Trigger SubType\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"DataAmount\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Time\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                (\n",
+    "                    \"Static\"\n",
+    "                    if \"_mmd-0\" in x\n",
+    "                    else (\"Quantile\" if \"quant\" in x else (\"Rolling Avg.\" if \"roll\" in x else (\"unknown\")))\n",
+    "                )\n",
+    "                if \"drift\" in x\n",
+    "                else (\n",
+    "                    (\n",
+    "                        \"Static\"\n",
+    "                        if \"static\" in x\n",
+    "                        else (\n",
+    "                            \"Quantile\"\n",
+    "                            if \"quant\" in x\n",
+    "                            else (\n",
+    "                                \"Rolling Avg.\"\n",
+    "                                if \"roll\" in x\n",
+    "                                else (\"AvoidableMisclass\" if \"num_misclass\" in x else (\"unknown\"))\n",
+    "                            )\n",
+    "                        )\n",
+    "                    )\n",
+    "                    if \"performancetrigger\" in x\n",
+    "                    else (\n",
+    "                        \"DataIncorporationLatency\"\n",
+    "                        if \"data_inc\" in x\n",
+    "                        else (\"AvoidableMisclass\" if \"avoidable\" in x else (\"unknown\"))\n",
+    "                    )\n",
+    "                )\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "renamed[\"Trigger Type\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: (\n",
+    "        \"Simple\"\n",
+    "        if \"dataamount\" in x\n",
+    "        else (\n",
+    "            \"Simple\"\n",
+    "            if \"time\" in x\n",
+    "            else (\n",
+    "                \"DataDrift\"\n",
+    "                if \"drift\" in x\n",
+    "                else (\"Performance\" if \"performancetrigger\" in x else (\"Cost\" if \"costtrigger\" in x else (\"unknown\")))\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# assert that nothing was classified as \"unknown\" and that no DataIncorporationLatency pipeline is included\n",
+    "assert not renamed[\"Trigger Type\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"unknown\").any()\n",
+    "assert not renamed[\"Trigger SubType\"].str.contains(\"DataIncorporationLatency\").any()\n",
+    "\n",
+    "renamed[\"Trigger Type\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger Type\"], categories=[\"Simple\", \"DataDrift\", \"Performance\", \"Cost\"], ordered=True\n",
+    ")\n",
+    "\n",
+    "renamed[\"Trigger SubType\"] = pd.Categorical(\n",
+    "    renamed[\"Trigger SubType\"],\n",
+    "    categories=[\"DataAmount\", \"Time\", \"Static\", \"Quantile\", \"Rolling Avg.\", \"AvoidableMisclass\"],\n",
+    "    ordered=True,\n",
+    ")\n",
+    "\n",
+    "renamed = renamed.sort_values(by=[\"Trigger Type\", \"Trigger SubType\", \"pipeline_id\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.8,\n",
+    "    width_factor=0.9,\n",
+    "    manual_legend_title=False,\n",
+    "    legend_ncol=2,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"_all_tradeoff_yearbook_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.8,\n",
+    "    width_factor=0.9,\n",
+    "    manual_legend_title=False,\n",
+    "    legend_ncol=2,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"_all_tradeoff_yearbook_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger SubType\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=1.5,\n",
+    "    width_factor=1.8,\n",
+    "    manual_legend_title=False,\n",
+    "    legend_ncol=2,\n",
+    ")\n",
+    "\n",
+    "# save_plot(fig, \"tradeoff_drift_yearbook_triggers_cost\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/arxiv_cost.ipynb b/analytics/plotting/rh_thesis/drift/arxiv_cost.ipynb
new file mode 100644
index 000000000..efa21290b
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/arxiv_cost.ipynb
@@ -0,0 +1,282 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    # TODO: add the pipeline log directories (path objects, see the .exists() check below)\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for pipelines_dir in pipelines_dirs:\n",
+    "    # validate the directory before reading from it (the check used to run only after loading)\n",
+    "    assert pipelines_dir.exists()\n",
+    "    dir_pipelines = list_pipelines(pipelines_dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [267, 269, 265] + [268, 271, 270]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(  # cost matrix for the three time-trigger pipelines\n",
+    "    df_new,\n",
+    "    [265, 269, 267],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        265: \"TimeTrigger 10 years\",\n",
+    "        269: \"TimeTrigger 2 years\",\n",
+    "        267: \"TimeTrigger 26 weeks\",\n",
+    "    },\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]),  # one label per located tick\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=list(range(0, 1000, 200)),\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_time-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(  # cost matrix for the three amount-trigger pipelines\n",
+    "    df_new,\n",
+    "    [268, 271, 270],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        268: \"AmountTrigger 500k samples\",\n",
+    "        271: \"AmountTrigger 100k samples\",\n",
+    "        270: \"AmountTrigger 25k samples\",\n",
+    "    },\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]),  # one label per located tick\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=list(range(0, 1000, 200)),\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_amount-trigger-cost-matrix\")\n",
+    "# not interesting: note that for 250 samples we see multiple triggers at the same timestamp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plot 100k amount and 2y time trigger together"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(  # 2-year time trigger next to the 100k-sample amount trigger\n",
+    "    df_new,\n",
+    "    [269, 271],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        269: \"TimeTrigger 2 years\",\n",
+    "        271: \"AmountTrigger 100k samples\",\n",
+    "    },\n",
+    "    height_factor=1.2,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]),  # one label per located tick\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=list(range(0, 1000, 200)),\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_timeamount-trigger-cost-matrix\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/arxiv_heatmap_single.ipynb b/analytics/plotting/rh_thesis/drift/arxiv_heatmap_single.ipynb
new file mode 100644
index 000000000..b8c9e6fa6
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/arxiv_heatmap_single.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/21_datadrift_dynamic\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 782  # drifttrigger_mmd-quant-0.05-20_int20000_win1y\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"real_train_end\", \"interval_center\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"real_train_end\"] = df_merged[\"real_train_end\"].apply(lambda x: pd.Period(x, freq=\"M\"))\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.columns))[::1]\n",
+    "        if period in [pd.Period(\"Mar 2000\"), pd.Period(\"Mar 2009\"), pd.Period(\"Mar 2020\")]\n",
+    "    ],\n",
+    "    y_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\".replace(\" \", \"\\n\"))\n",
+    "        for i, period in list(enumerate(heatmap_data.index))[::1]\n",
+    "        if period in [pd.Period(\"Jun 2000\"), pd.Period(\"Jun 2009\"), pd.Period(\"May 2020\")]\n",
+    "    ],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Arxiv Dynamic Drift Threshold: MMD Quantile: 0.05\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.55,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    "    x_axis=\"period\",\n",
+    ")\n",
+    "save_plot(fig, \"arxiv_trigger_heatmap_drift_single_dynamic\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/hp_cost.ipynb b/analytics/plotting/rh_thesis/drift/hp_cost.ipynb
new file mode 100644
index 000000000..6fb266f85
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/hp_cost.ipynb
@@ -0,0 +1,219 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/21_datadrift_dynamic\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: data drift\n",
+    "pipeline_ids = [771]  # hp drifttrigger_mmd-rollavg-2.0-20_int1500_win1y\n",
+    "\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [771],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        771: \"HuffPost Dynamic Drift:  Δ +200%\",\n",
+    "    },\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2014-05-01\", \"2018-06-01\", \"2021-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"May\\n2014\", \"Jun\\n2018\", \"Jan\\n2021\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 110, 25)],\n",
+    "    y_lim_cumulative=(0, 100),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"huffpost_drift-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/hp_heatmap_single.ipynb b/analytics/plotting/rh_thesis/drift/hp_heatmap_single.ipynb
new file mode 100644
index 000000000..b0c43f313
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/hp_heatmap_single.ipynb
@@ -0,0 +1,286 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/21_datadrift_dynamic\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 771  # hp drifttrigger_mmd-rollavg-2.0-20_int1500_win1y\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"real_train_end\", \"interval_center\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"real_train_end\"] = df_merged[\"real_train_end\"].apply(lambda x: pd.Period(x, freq=\"M\"))\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.columns))[::1]\n",
+    "        if period in [pd.Period(\"Apr 2014\"), pd.Period(\"Jul 2018\"), pd.Period(\"Jan 2022\")]\n",
+    "    ],\n",
+    "    y_custom_ticks=[\n",
+    "        (i + 0.5, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.index))[::1]\n",
+    "    ],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"HuffPost Dynamic Drift Threshold: Rolling Average Δ +200%\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.5,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    "    x_axis=\"period\",\n",
+    ")\n",
+    "save_plot(fig, \"hp_trigger_heatmap_drift_single_dynamic\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/yb_cost.ipynb b/analytics/plotting/rh_thesis/drift/yb_cost.ipynb
new file mode 100644
index 000000000..a136ed341
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/yb_cost.ipynb
@@ -0,0 +1,217 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: data drift\n",
+    "pipeline_ids = [107]  # yb drift mmd 0.06 250 4d\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")\n",
+    "\n",
+    "# Yearbook has a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
+    "if patch_yearbook:\n",
+    "    patch_yearbook_time(df_adjusted, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [107],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        107: \"Static MMD Threshold=0.07\",\n",
+    "    },\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (sec)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
+    "    y_ticks_cumulative=[x for x in range(0, 9 + 1, 3)],\n",
+    "    y_lim_cumulative=(0, 10),\n",
+    "    y_minutes=False,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"yearbook_drift-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# warmup noticeable where no detection is launched"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/yb_cost_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/drift/yb_cost_perf_tradeoff.ipynb
new file mode 100644
index 000000000..c33a59917
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/yb_cost_perf_tradeoff.ipynb
@@ -0,0 +1,444 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    patch_yearbook_time,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/21_datadrift_dynamic\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from modyn.supervisor.internal.pipeline_executor.models import StageLog\n",
+    "\n",
+    "df = StageLog.df(\n",
+    "    [x for x in pipeline_logs.get(63).supervisor_logs.stage_runs if x.id == PipelineStage.TRAIN.name], extended=True\n",
+    ")\n",
+    "\n",
+    "max_trigger_idx = df[\"trigger_idx\"].idxmax()\n",
+    "time_at_trainer = df.loc[max_trigger_idx, \"train_time_at_trainer\"]\n",
+    "time_at_trainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = {p_id: (pname, p_path) for p_id, (pname, p_path) in pipelines.items()}\n",
+    "pipeline_ids = list(pipelines.keys())\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    patch_yearbook_time(df_leaf, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"1940-01-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find the number of triggers per pipeline that occur after maximum_min\n",
+    "\n",
+    "# before the cutoff there was one trigger (equivalent to start of our reduced dataset): +1\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\")\n",
+    "assert mean_accuracies.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "renamed = merged[\n",
+    "    merged[\"pipeline_id\"].isin(\n",
+    "        [\n",
+    "            # static thresholds\n",
+    "            113,  # 0.03\n",
+    "            112,  # 0.05\n",
+    "            107,  # 0.07\n",
+    "            109,  # 0.09\n",
+    "            85,  # 0.12\n",
+    "            # dyn quantile\n",
+    "            353,  # % 0.05\n",
+    "            345,  # % 0.10\n",
+    "            357,  # % 0.15\n",
+    "            # dyn roll. avg\n",
+    "            372,  # Δ 2.0\n",
+    "            370,  # Δ 1.0\n",
+    "            369,  # Δ 0.5\n",
+    "            363,  # Δ 0.05\n",
+    "        ]\n",
+    "    )\n",
+    "].copy()\n",
+    "renamed[\"Trigger Type\"] = renamed[\"pipeline_ref\"].apply(\n",
+    "    lambda x: \"Dyn. Quantile % [0.05/0.1/0.15]\"\n",
+    "    if \"quant\" in x\n",
+    "    else (\"Roll. Avg Δ [2.0/1.0/0.05/0.5]\" if \"roll\" in x else (\"static MMD threshold\\n[0.03/0.05/0.07/0.09/0.12]\"))\n",
+    ")\n",
+    "renamed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.7,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_drift_yearbook_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.7,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_drift_yearbook_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.8,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_drift_yearbook_triggers_cost\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/yearbook_heatmap_multi.ipynb b/analytics/plotting/rh_thesis/drift/yearbook_heatmap_multi.ipynb
new file mode 100644
index 000000000..aa3538f10
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/yearbook_heatmap_multi.ipynb
@@ -0,0 +1,423 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    dfs_models_and_evals,\n",
+    "    patch_yearbook_time,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/21_datadrift_dynamic\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_ids = list(pipelines.keys())\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_logs_models_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=False, pipeline_id=pipeline_id)\n",
+    "    df_logs_models_single, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_logs_models_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "    df_logs_models_list.append(df_logs_models_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_logs_models = pd.concat(df_logs_models_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "len(df_adjusted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()\n",
+    "len(df_adjusted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "\n",
+    "df_train_end_years_per_model = df_logs_models[[\"pipeline_id\", \"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted.groupby([\"pipeline_id\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Window Sizes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_pids = [117, 107, 95]\n",
+    "\n",
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=[\"pipeline_id\", \"model_idx\"], how=\"left\")\n",
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"pipeline_id\"] = df_merged[\"pipeline_id\"].astype(int)\n",
+    "df_merged = df_merged[df_merged[\"pipeline_id\"].isin(_pids)]\n",
+    "heatmap_data = df_merged.pivot(index=[\"pipeline_id\"], columns=\"interval_center\", values=\"value\")\n",
+    "\n",
+    "heatmap_data.index.min(), heatmap_data.index.max()\n",
+    "heatmap_data\n",
+    "\n",
+    "# order the rows to match the pipeline ids in _pids\n",
+    "heatmap_data = heatmap_data.reindex(_pids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "\n",
+    "pipelines_refs = {\n",
+    "    117: \"10y\",\n",
+    "    107: \"4y\",\n",
+    "    95: \"1y\",\n",
+    "}\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Pipeline with\\nWindow Size\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook Composite Models: Drift Window Sizes (MMD=0.07)\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.38,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    triggers={\n",
+    "        i: df_logs_models[df_logs_models[\"pipeline_id\"] == p_id][\n",
+    "            [\"train_start\", \"train_end\", \"usage_start\", \"usage_end\"]\n",
+    "        ]\n",
+    "        for i, p_id in enumerate(heatmap_data.index)\n",
+    "    },\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_drift_multi_static_window_size\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Static Thresholds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_pids = list(reversed([113, 112, 107, 109, 85]))\n",
+    "\n",
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=[\"pipeline_id\", \"model_idx\"], how=\"left\")\n",
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"pipeline_id\"] = df_merged[\"pipeline_id\"].astype(int)\n",
+    "df_merged = df_merged[df_merged[\"pipeline_id\"].isin(_pids)]\n",
+    "heatmap_data = df_merged.pivot(index=[\"pipeline_id\"], columns=\"interval_center\", values=\"value\")\n",
+    "\n",
+    "heatmap_data.index.min(), heatmap_data.index.max()\n",
+    "\n",
+    "# order the rows to match the pipeline ids in _pids\n",
+    "heatmap_data = heatmap_data.reindex(_pids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "pipelines_refs = {\n",
+    "    113: \"0.03\",\n",
+    "    112: \"0.05\",\n",
+    "    107: \"0.07\",\n",
+    "    109: \"0.09\",\n",
+    "    85: \"0.12\",\n",
+    "}\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"MMD Threshold\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook Composite Models: Static Drift Thresholds\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.45,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    triggers={\n",
+    "        i: df_logs_models[df_logs_models[\"pipeline_id\"] == p_id][\n",
+    "            [\"train_start\", \"train_end\", \"usage_start\", \"usage_end\"]\n",
+    "        ]\n",
+    "        for i, p_id in enumerate(heatmap_data.index)\n",
+    "    },\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_drift_multi_static_threshold\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dynamic Quantile / Roll Avg"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_pids = list([372, 370, 369, 363]) + list([353, 357])  # 345\n",
+    "\n",
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=[\"pipeline_id\", \"model_idx\"], how=\"left\")\n",
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"pipeline_id\"] = df_merged[\"pipeline_id\"].astype(int)\n",
+    "df_merged = df_merged[df_merged[\"pipeline_id\"].isin(_pids)]\n",
+    "heatmap_data = df_merged.pivot(index=[\"pipeline_id\"], columns=\"interval_center\", values=\"value\")\n",
+    "\n",
+    "heatmap_data.index.min(), heatmap_data.index.max()\n",
+    "\n",
+    "# order the rows to match the pipeline ids in _pids\n",
+    "heatmap_data = heatmap_data.reindex(_pids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "pipelines_refs = {\n",
+    "    # dyn quantile\n",
+    "    353: \"% 0.05\",\n",
+    "    # 345: \"% 0.10\",\n",
+    "    357: \"% 0.15\",\n",
+    "    # dyn roll. avg\n",
+    "    372: \"Δ 2.0\",\n",
+    "    370: \"Δ 1.0\",\n",
+    "    369: \"Δ 0.5\",\n",
+    "    363: \"Δ 0.05\",\n",
+    "}\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Criterion\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook Composite Models: Dynamic Drift Thresholds\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.47,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    triggers={\n",
+    "        i: df_logs_models[df_logs_models[\"pipeline_id\"] == p_id][\n",
+    "            [\"train_start\", \"train_end\", \"usage_start\", \"usage_end\"]\n",
+    "        ]\n",
+    "        for i, p_id in enumerate(heatmap_data.index)\n",
+    "    },\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_drift_multi_dynamic_thresholds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/drift/yearbook_heatmap_single.ipynb b/analytics/plotting/rh_thesis/drift/yearbook_heatmap_single.ipynb
new file mode 100644
index 000000000..54bdc59eb
--- /dev/null
+++ b/analytics/plotting/rh_thesis/drift/yearbook_heatmap_single.ipynb
@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS (NOTE(review): both branches below point to the same directory; confirm the intended path)\n",
+    "drift_pipeline = False\n",
+    "if drift_pipeline:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"\n",
+    "    )\n",
+    "else:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"\n",
+    "    )\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/plots/triggering\")\n",
+    "assert pipelines_dir.exists()\n",
+    "assert output_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 107  # yb drift mmd 0.06 250 4d\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# The composite model is the pipeline time series: the per-model rows flagged by the composite_model_variant\n",
+    "# column, stitched together. The explicit copy keeps the assignments below off a slice of df_adjusted.\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].copy()\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, _ in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "# Prepend the composite model rows only on request; without them df_adjusted is used unchanged,\n",
+    "# so no else-branch is needed.\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, str(y)) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook 4y Drift Windows: Static MMD Threshold=0.07\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.55,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_drift_single_static\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/evaluation_setup/arxiv_heatmap_metrics.ipynb b/analytics/plotting/rh_thesis/evaluation_setup/arxiv_heatmap_metrics.ipynb
new file mode 100644
index 000000000..5d0537799
--- /dev/null
+++ b/analytics/plotting/rh_thesis/evaluation_setup/arxiv_heatmap_metrics.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib.figure import Figure\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.common import init_plot\n",
+    "from analytics.plotting.common.font import setup_font\n",
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 267\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metrics = [\n",
+    "    \"Accuracy\",\n",
+    "    \"F1-micro\",\n",
+    "    \"F1-macro\",\n",
+    "    \"F1-weighted\",\n",
+    "    \"Top-10-Accuracy\",\n",
+    "    \"Top-5-Accuracy\",\n",
+    "    \"Top-2-Accuracy\",\n",
+    "]\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"].isin(metrics))\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# The composite model is the pipeline time series: the per-model rows flagged by the composite_model_variant\n",
+    "# column, stitched together. The explicit copy keeps the assignments below off a slice of df_adjusted.\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].copy()\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, _ in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "# Prepend the composite model rows only on request; without them df_adjusted is used unchanged,\n",
+    "# so no else-branch is needed.\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_heatmap_grid(\n",
+    "    data: dict[tuple[int, str], dict[tuple[int, str], tuple[str, pd.DataFrame]]],\n",
+    "    cbar_label: str,\n",
+    "    vmin: float = 0,\n",
+    "    vmax: float = 0.3,\n",
+    "    nrows: int = 4,\n",
+    "    ncols: int = 4,\n",
+    "    cbar: bool = False,\n",
+    "    single_cbar: bool = False,\n",
+    "    height_factor: float = 1.2,\n",
+    "    width_factor: float = 1.0,\n",
+    "    x_space_factor: float = 1,\n",
+    "    y_space_factor: float = 1,\n",
+    "    grid_alpha: float = 0.0,\n",
+    ") -> Figure:\n",
+    "    \"\"\"Plot a grid of heatmaps that all share the color scale given by vmin and vmax.\n",
+    "\n",
+    "    data maps (row idx, y label) -> (col idx, x label) -> (title, matrix); cbar adds one figure-level colorbar.\n",
+    "    \"\"\"\n",
+    "    init_plot()\n",
+    "    setup_font(small_label=True, small_title=True)\n",
+    "\n",
+    "    double_fig_width = 10\n",
+    "    double_fig_height = 3.5\n",
+    "    fig, axs = plt.subplots(\n",
+    "        nrows=nrows,\n",
+    "        ncols=ncols,\n",
+    "        edgecolor=\"black\",\n",
+    "        frameon=True,\n",
+    "        figsize=(double_fig_width * width_factor, 2.2 * double_fig_height * height_factor),\n",
+    "        dpi=450,\n",
+    "        squeeze=False,  # always a 2D axes array, also for a single row or column\n",
+    "    )\n",
+    "    x_tick_periods = [pd.Period(\"Mar 2000\"), pd.Period(\"Mar 2009\"), pd.Period(\"Mar 2020\")]\n",
+    "    y_tick_periods = [pd.Period(\"Jun 2000\"), pd.Period(\"Jun 2009\"), pd.Period(\"May 2020\")]\n",
+    "    ax = None  # last plotted axis; stays None when data is empty\n",
+    "    for x, ((_, row_title), row_cells) in enumerate(data.items()):\n",
+    "        for y, ((_, col_title), (title, cell_data)) in enumerate(row_cells.items()):\n",
+    "            ax = axs[x, y]\n",
+    "            # print([(i, period.to_timestamp().strftime('%b %Y')) for i, period in list(enumerate(cell_data.columns))[::1]])\n",
+    "            # available:\n",
+    "            # [(0, 'Apr 1995'), (1, 'Oct 1995'), (2, 'Mar 1996'), (3, 'Sep 1996'), (4, 'Mar 1997'), (5, 'Sep 1997'), (6, 'Mar 1998'), (7, 'Sep 1998'), (8, 'Mar 1999'), (9, 'Sep 1999'), (10, 'Mar 2000'), (11, 'Sep 2000'), (12, 'Mar 2001'), (13, 'Sep 2001'), (14, 'Mar 2002'), (15, 'Sep 2002'), (16, 'Mar 2003'), (17, 'Sep 2003'), (18, 'Mar 2004'), (19, 'Sep 2004'), (20, 'Mar 2005'), (21, 'Sep 2005'), (22, 'Mar 2006'), (23, 'Sep 2006'), (24, 'Mar 2007'), (25, 'Sep 2007'), (26, 'Mar 2008'), (27, 'Sep 2008'), (28, 'Mar 2009'), (29, 'Sep 2009'), (30, 'Mar 2010'), (31, 'Sep 2010'), (32, 'Mar 2011'), (33, 'Sep 2011'), (34, 'Mar 2012'), (35, 'Sep 2012'), (36, 'Mar 2013'), (37, 'Sep 2013'), (38, 'Mar 2014'), (39, 'Sep 2014'), (40, 'Mar 2015'), (41, 'Sep 2015'), (42, 'Mar 2016'), (43, 'Sep 2016'), (44, 'Mar 2017'), (45, 'Sep 2017'), (46, 'Mar 2018'), (47, 'Sep 2018'), (48, 'Mar 2019'), (49, 'Sep 2019'), (50, 'Mar 2020'), (51, 'Aug 2020'), (52, 'Feb 2021'), (53, 'Aug 2021'), (54, 'Feb 2022'), (55, 'Aug 2022'), (56, 'Feb 2023'), (57, 'Aug 2023'), (58, 'Feb 2024')]\n",
+    "\n",
+    "            # print([(i, period.to_timestamp().strftime('%b %Y')) for i, period in list(enumerate(cell_data.index))[::1]])\n",
+    "            # [(0, 'Jul 1995'), (1, 'Dec 1995'), (2, 'Jun 1996'), (3, 'Dec 1996'), (4, 'Jun 1997'), (5, 'Dec 1997'), (6, 'Jun 1998'), (7, 'Dec 1998'), (8, 'Jun 1999'), (9, 'Dec 1999'), (10, 'Jun 2000'), (11, 'Dec 2000'), (12, 'Jun 2001'), (13, 'Dec 2001'), (14, 'Jun 2002'), (15, 'Dec 2002'), (16, 'Jun 2003'), (17, 'Dec 2003'), (18, 'Jun 2004'), (19, 'Dec 2004'), (20, 'Jun 2005'), (21, 'Dec 2005'), (22, 'Jun 2006'), (23, 'Dec 2006'), (24, 'Jun 2007'), (25, 'Dec 2007'), (26, 'Jun 2008'), (27, 'Dec 2008'), (28, 'Jun 2009'), (29, 'Dec 2009'), (30, 'Jun 2010'), (31, 'Dec 2010'), (32, 'Jun 2011'), (33, 'Dec 2011'), (34, 'Jun 2012'), (35, 'Dec 2012'), (36, 'Jun 2013'), (37, 'Dec 2013'), (38, 'Jun 2014'), (39, 'Dec 2014'), (40, 'Jun 2015'), (41, 'Dec 2015'), (42, 'Jun 2016'), (43, 'Dec 2016'), (44, 'Jun 2017'), (45, 'Dec 2017'), (46, 'Jun 2018'), (47, 'Dec 2018'), (48, 'Jun 2019'), (49, 'Dec 2019'), (50, 'May 2020'), (51, 'Nov 2020'), (52, 'May 2021'), (53, 'Nov 2021'), (54, 'May 2022'), (55, 'Nov 2022'), (56, 'May 2023'), (57, 'Nov 2023'), (58, 'May 2024')]\n",
+    "            _ = build_heatmap(\n",
+    "                cell_data,\n",
+    "                x_custom_ticks=[\n",
+    "                    (i, period.to_timestamp().strftime('%b %Y').replace(\" \", \"\\n\"))\n",
+    "                    for i, period in enumerate(cell_data.columns)\n",
+    "                    if period in x_tick_periods\n",
+    "                ],\n",
+    "                y_custom_ticks=[\n",
+    "                    (i, period.to_timestamp().strftime('%b %Y').replace(\" \", \"\\n\"))\n",
+    "                    for i, period in enumerate(cell_data.index)\n",
+    "                    if period in y_tick_periods\n",
+    "                ],\n",
+    "                x_label=col_title,\n",
+    "                y_label=row_title,\n",
+    "                reverse_col=True,\n",
+    "                # color_label = \"MMD\",  # TODO\n",
+    "                title_label=title,\n",
+    "                target_ax=ax,\n",
+    "                square=False,\n",
+    "                width_factor=width_factor,\n",
+    "                height_factor=height_factor,\n",
+    "                cbar=single_cbar,  # per-cell colorbar, independent of the figure-level one below\n",
+    "                vmin=vmin,\n",
+    "                vmax=vmax,\n",
+    "                grid_alpha=grid_alpha,\n",
+    "            )\n",
+    "            ax.label_outer()  # Remove labels for inner plots, keep only on outer\n",
+    "\n",
+    "    if cbar and ax is not None:\n",
+    "        cbar_ax = fig.add_axes([1, 0.25, 0.02, 0.6])  # [left, bottom, width, height] in figure coordinates\n",
+    "        custom_cbar = fig.colorbar(ax.collections[0], cax=cbar_ax)  # use last plotted axis\n",
+    "        custom_cbar.set_label(cbar_label)\n",
+    "\n",
+    "    plt.subplots_adjust(wspace=0.1 * x_space_factor, hspace=0.1 * y_space_factor)\n",
+    "    return fig"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_heatmap_data_for_handler(data: pd.DataFrame, metric: str) -> pd.DataFrame:\n",
+    "    # build heatmap matrix dataframe:\n",
+    "    data_filtered = data[data[\"metric\"] == metric]\n",
+    "    pt_data = data_filtered.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")\n",
+    "    return pt_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_content = {\n",
+    "    (0, \"Trained up to\"): {\n",
+    "        (0, \"Evaluation Year\"): (\n",
+    "            \"Accuracy\",\n",
+    "            generate_heatmap_data_for_handler(\n",
+    "                df_merged, \"Accuracy\"\n",
+    "            ),  # almost identical to F1-micro and F1-weighted; macro is broken\n",
+    "        ),\n",
+    "        (1, \"Evaluation Year\"): (\"Top-2-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-2-Accuracy\")),\n",
+    "        (2, \"Evaluation Year\"): (\"Top-5-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-5-Accuracy\")),\n",
+    "        (3, \"Evaluation Year\"): (\"Top-10-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-10-Accuracy\")),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "# find the value range over all cells; values are percentages (0-100), so start from +/-inf\n",
+    "vmin = float(\"inf\")\n",
+    "vmax = float(\"-inf\")\n",
+    "for row_cells in plot_content.values():\n",
+    "    for _, cell_data in row_cells.values():\n",
+    "        # DataFrame.min()/max() reduce per column, the second call reduces over the columns\n",
+    "        vmin = min(vmin, cell_data.min().min())\n",
+    "        vmax = max(vmax, cell_data.max().max())\n",
+    "print(vmin, vmax)\n",
+    "\n",
+    "fig = plot_heatmap_grid(\n",
+    "    data=plot_content,\n",
+    "    cbar_label=\"[Metric] %\",\n",
+    "    nrows=1,\n",
+    "    ncols=4,\n",
+    "    vmin=vmin,\n",
+    "    vmax=vmax,\n",
+    "    cbar=True,\n",
+    "    single_cbar=False,\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.48,\n",
+    "    grid_alpha=0.5,\n",
+    ")\n",
+    "\n",
+    "# TODO: we have already seen the Accuracy plot in Offline eval chapter (with another color scale though)\n",
+    "save_plot(fig, \"evaluation_metrics_arxiv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/evaluation_setup/hp_heatmap_metrics.ipynb b/analytics/plotting/rh_thesis/evaluation_setup/hp_heatmap_metrics.ipynb
new file mode 100644
index 000000000..4c4c4e479
--- /dev/null
+++ b/analytics/plotting/rh_thesis/evaluation_setup/hp_heatmap_metrics.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib.figure import Figure\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.common import init_plot\n",
+    "from analytics.plotting.common.font import setup_font\n",
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 273\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metrics = [\n",
+    "    \"Accuracy\",\n",
+    "    \"F1-micro\",\n",
+    "    \"F1-macro\",\n",
+    "    \"F1-weighted\",\n",
+    "    \"Top-10-Accuracy\",\n",
+    "    \"Top-5-Accuracy\",\n",
+    "    \"Top-2-Accuracy\",\n",
+    "]\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"].isin(metrics))\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# The composite model is the pipeline time series: the per-model rows flagged by the composite_model_variant\n",
+    "# column, stitched together. The explicit copy keeps the assignments below off a slice of df_adjusted.\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].copy()\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, _ in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "# Prepend the composite model rows only on request; without them df_adjusted is used unchanged,\n",
+    "# so no else-branch is needed.\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_heatmap_grid(\n",
+    "    data: dict[tuple[int, str], dict[tuple[int, str], pd.DataFrame]],\n",
+    "    cbar_label: str,\n",
+    "    # Define vmin and vmax for the color scale to be consistent across heatmaps\n",
+    "    vmin: float = 0,\n",
+    "    vmax: float = 0.3,\n",
+    "    nrows: int = 4,\n",
+    "    ncols: int = 4,\n",
+    "    cbar: bool = False,\n",
+    "    single_cbar: bool = False,\n",
+    "    height_factor: float = 1.2,\n",
+    "    width_factor: float = 1.0,\n",
+    "    x_space_factor: float = 1,\n",
+    "    y_space_factor: float = 1,\n",
+    "    grid_alpha: float = 0.0,\n",
+    ") -> Figure:\n",
+    "    init_plot()\n",
+    "    setup_font(small_label=True, small_title=True)\n",
+    "\n",
+    "    double_fig_width = 10\n",
+    "    double_fig_height = 3.5\n",
+    "    fig, axs = plt.subplots(\n",
+    "        nrows=nrows,\n",
+    "        ncols=ncols,\n",
+    "        edgecolor=\"black\",\n",
+    "        frameon=True,\n",
+    "        figsize=(double_fig_width * width_factor, 2.2 * double_fig_height * height_factor),\n",
+    "        dpi=450,\n",
+    "        squeeze=True,\n",
+    "    )\n",
+    "\n",
+    "    for x, row_key in enumerate(data):\n",
+    "        (row, row_title) = row_key\n",
+    "        for y, col_key in enumerate(data[row_key]):\n",
+    "            (col, col_title) = col_key\n",
+    "            title, cell_data = data[row_key][col_key]\n",
+    "            ax = axs[x, y] if nrows > 1 else axs[y]\n",
+    "            # print([(i, period.to_timestamp().strftime('%b %Y')) for i, period in list(enumerate(cell_data.columns))[::1]])\n",
+    "            # available:\n",
+    "            # [(0, 'Jan 2012'), (1, 'Apr 2012'), (2, 'Jul 2012'), (3, 'Oct 2012'), (4, 'Jan 2013'), (5, 'Apr 2013'), (6, 'Jul 2013'), (7, 'Oct 2013'), (8, 'Jan 2014'), (9, 'Apr 2014'), (10, 'Jul 2014'), (11, 'Oct 2014'), (12, 'Jan 2015'), (13, 'Apr 2015'), (14, 'Jul 2015'), (15, 'Oct 2015'), (16, 'Jan 2016'), (17, 'Apr 2016'), (18, 'Jul 2016'), (19, 'Oct 2016'), (20, 'Jan 2017'), (21, 'Apr 2017'), (22, 'Jul 2017'), (23, 'Oct 2017'), (24, 'Jan 2018'), (25, 'Apr 2018'), (26, 'Jul 2018'), (27, 'Oct 2018'), (28, 'Jan 2019'), (29, 'Apr 2019'), (30, 'Jul 2019'), (31, 'Oct 2019'), (32, 'Jan 2020'), (33, 'Apr 2020'), (34, 'Jul 2020'), (35, 'Oct 2020'), (36, 'Jan 2021'), (37, 'Apr 2021'), (38, 'Jul 2021'), (39, 'Oct 2021'), (40, 'Jan 2022'), (41, 'Apr 2022'), (42, 'Jul 2022')]\n",
+    "\n",
+    "            # print([(i, period.to_timestamp().strftime('%b %Y')) for i, period in list(enumerate(cell_data.index))[::1]])\n",
+    "            # [(0, 'Jul 2012'), (1, 'Jan 2013'), (2, 'Jul 2013'), (3, 'Jan 2014'), (4, 'Jul 2014'), (5, 'Jan 2015'), (6, 'Jul 2015'), (7, 'Jan 2016'), (8, 'Jul 2016'), (9, 'Jan 2017'), (10, 'Jul 2017'), (11, 'Jan 2018'), (12, 'Jul 2018'), (13, 'Jan 2019'), (14, 'Jul 2019'), (15, 'Jan 2020'), (16, 'Jul 2020'), (17, 'Jan 2021'), (18, 'Jul 2021'), (19, 'Jan 2022'), (20, 'Jul 2022')]\n",
+    "\n",
+    "            _ = build_heatmap(\n",
+    "                cell_data,\n",
+    "                x_custom_ticks=[\n",
+    "                    (i, f\"{period.to_timestamp().strftime('%b %Y')}\".replace(\" \", \"\\n\"))\n",
+    "                    for i, period in list(enumerate(cell_data.columns))[::1]\n",
+    "                    if period in [pd.Period(\"Apr 2014\"), pd.Period(\"Jul 2018\"), pd.Period(\"Jan 2022\")]\n",
+    "                ],\n",
+    "                y_custom_ticks=[\n",
+    "                    (i, f\"{period.to_timestamp().strftime('%b %Y')}\".replace(\" \", \"\\n\"))\n",
+    "                    for i, period in list(enumerate(cell_data.index))[::1]\n",
+    "                    if period in [pd.Period(\"Jul 2014\"), pd.Period(\"Jul 2018\"), pd.Period(\"Jan 2022\")]\n",
+    "                ],\n",
+    "                x_label=col_title,\n",
+    "                y_label=row_title,\n",
+    "                reverse_col=True,\n",
+    "                # x_label,\n",
+    "                # color_label = \"MMD\",  # TODO\n",
+    "                title_label=title,\n",
+    "                target_ax=ax,\n",
+    "                square=False,\n",
+    "                width_factor=width_factor,\n",
+    "                height_factor=height_factor,\n",
+    "                cbar=single_cbar,\n",
+    "                vmin=vmin,\n",
+    "                vmax=vmax,\n",
+    "                grid_alpha=grid_alpha,\n",
+    "            )\n",
+    "            ax.label_outer()  # Remove labels for inner plots, keep only on outer\n",
+    "\n",
+    "    if cbar:\n",
+    "        cbar_ax = fig.add_axes([1, 0.25, 0.02, 0.6])  # Adjust the colorbar position\n",
+    "        custom_cbar = fig.colorbar(ax.collections[0], cax=cbar_ax)  # use the last plotted axis\n",
+    "        custom_cbar.set_label(cbar_label)  # Set your custom label here\n",
+    "\n",
+    "    plt.subplots_adjust(wspace=0.1 * x_space_factor, hspace=0.1 * y_space_factor)\n",
+    "    return fig"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_heatmap_data_for_handler(data: pd.DataFrame, metric: str) -> pd.DataFrame:\n",
+    "    # build heatmap matrix dataframe:\n",
+    "    data_filtered = data[data[\"metric\"] == metric]\n",
+    "    pt_data = data_filtered.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")\n",
+    "    return pt_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_content = {\n",
+    "    (0, \"Trained up to\"): {\n",
+    "        (0, \"Evaluation Year\"): (\n",
+    "            \"Accuracy\",\n",
+    "            generate_heatmap_data_for_handler(\n",
+    "                df_merged, \"Accuracy\"\n",
+    "            ),  # almost identical to F1-micro and F1-weighted; macro is broken\n",
+    "        ),\n",
+    "        (1, \"Evaluation Year\"): (\"Top-2-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-2-Accuracy\")),\n",
+    "        (2, \"Evaluation Year\"): (\"Top-5-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-5-Accuracy\")),\n",
+    "        (3, \"Evaluation Year\"): (\"Top-10-Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Top-10-Accuracy\")),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# find vmin and vmax\n",
+    "vmin = 1\n",
+    "vmax = 0\n",
+    "for row_key in plot_content:\n",
+    "    for col_key in plot_content[row_key]:\n",
+    "        (_, cell_data) = plot_content[row_key][col_key]\n",
+    "        vmin = min(vmin, cell_data.min().min())\n",
+    "        vmax = max(vmax, cell_data.max().max())\n",
+    "print(vmin, vmax)\n",
+    "\n",
+    "fig = plot_heatmap_grid(\n",
+    "    data=plot_content,\n",
+    "    cbar_label=\"[Metric] %\",\n",
+    "    nrows=1,\n",
+    "    ncols=4,\n",
+    "    vmin=vmin,\n",
+    "    vmax=vmax,\n",
+    "    cbar=True,\n",
+    "    single_cbar=False,\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.48,\n",
+    "    grid_alpha=0.5,\n",
+    ")\n",
+    "\n",
+    "# TODO: we have already seen the Accuracy plot in Offline eval chapter (with another color scale though)\n",
+    "save_plot(fig, \"evaluation_metrics_hp\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_metrics.ipynb b/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_metrics.ipynb
new file mode 100644
index 000000000..6cdc52db8
--- /dev/null
+++ b/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_metrics.ipynb
@@ -0,0 +1,396 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib.figure import Figure\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "from analytics.plotting.common.common import init_plot\n",
+    "from analytics.plotting.common.font import setup_font\n",
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/00_varying_periodic_intervals/baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 16\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metrics = [\n",
+    "    \"Accuracy\",\n",
+    "    \"F1-weighted\",\n",
+    "    \"F1-macro\",\n",
+    "    \"F1-micro\",\n",
+    "    \"ROC-AUC\",\n",
+    "]\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"].isin(metrics))\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # December (because of the -1 second in the timestamp format) -> start of the following year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_heatmap_grid(\n",
+    "    data: dict[tuple[int, str], dict[tuple[int, str], pd.DataFrame]],\n",
+    "    cbar_label: str,\n",
+    "    # Define vmin and vmax for the color scale to be consistent across heatmaps\n",
+    "    vmin: float = 0,\n",
+    "    vmax: float = 0.3,\n",
+    "    nrows: int = 4,\n",
+    "    ncols: int = 4,\n",
+    "    cbar: bool = False,\n",
+    "    single_cbar: bool = False,\n",
+    "    height_factor: float = 1.2,\n",
+    "    width_factor: float = 1.0,\n",
+    "    x_space_factor: float = 1,\n",
+    "    y_space_factor: float = 1,\n",
+    "    grid_alpha: float = 0.0,\n",
+    ") -> Figure:\n",
+    "    init_plot()\n",
+    "    setup_font(small_label=True, small_title=True)\n",
+    "\n",
+    "    double_fig_width = 10\n",
+    "    double_fig_height = 3.5\n",
+    "    fig, axs = plt.subplots(\n",
+    "        nrows=nrows,\n",
+    "        ncols=ncols,\n",
+    "        edgecolor=\"black\",\n",
+    "        frameon=True,\n",
+    "        figsize=(double_fig_width * width_factor, 2.2 * double_fig_height * height_factor),\n",
+    "        dpi=450,\n",
+    "        squeeze=True,\n",
+    "    )\n",
+    "\n",
+    "    for x, row_key in enumerate(data):\n",
+    "        (row, row_title) = row_key\n",
+    "        for y, col_key in enumerate(data[row_key]):\n",
+    "            (col, col_title) = col_key\n",
+    "            title, cell_data = data[row_key][col_key]\n",
+    "            ax = axs[x, y] if nrows > 1 else axs[y]\n",
+    "\n",
+    "            _ = build_heatmap(\n",
+    "                cell_data,\n",
+    "                # note that for some years we have two interval centers\n",
+    "                # This is because the evaluation epochs are yearly and the interval offsets are bound by the dataset\n",
+    "                # start, therefore the right interval end is asymmetrically far to the right compared to the left bound.\n",
+    "                # We can still act as if we have a value for every year\n",
+    "                x_ticks=[1950, 1975, 2000],\n",
+    "                y_ticks=[1950, 1975, 2000],\n",
+    "                x_label=col_title,\n",
+    "                y_label=row_title,\n",
+    "                reverse_col=True,\n",
+    "                # x_label,\n",
+    "                # color_label = \"MMD\",  # TODO\n",
+    "                title_label=title,\n",
+    "                target_ax=ax,\n",
+    "                square=False,\n",
+    "                width_factor=width_factor,\n",
+    "                height_factor=height_factor,\n",
+    "                cbar=single_cbar,\n",
+    "                vmin=vmin,\n",
+    "                vmax=vmax,\n",
+    "                grid_alpha=grid_alpha,\n",
+    "            )\n",
+    "            ax.label_outer()  # Remove labels for inner plots, keep only on outer\n",
+    "\n",
+    "    if cbar:\n",
+    "        cbar_ax = fig.add_axes([1, 0.25, 0.02, 0.6])  # Adjust the colorbar position\n",
+    "        custom_cbar = fig.colorbar(ax.collections[0], cax=cbar_ax)  # use the last plotted axis\n",
+    "        custom_cbar.set_label(cbar_label)  # Set your custom label here\n",
+    "\n",
+    "    plt.subplots_adjust(wspace=0.1 * x_space_factor, hspace=0.1 * y_space_factor)\n",
+    "    return fig"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_heatmap_data_for_handler(data: pd.DataFrame, metric: str) -> pd.DataFrame:\n",
+    "    # build heatmap matrix dataframe:\n",
+    "    data_filtered = data[data[\"metric\"] == metric]\n",
+    "    pt_data = data_filtered.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")\n",
+    "    return pt_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_content = {\n",
+    "    (0, \"Trained up to\"): {\n",
+    "        (0, \"Evaluation Year\"): (\"Accuracy\", generate_heatmap_data_for_handler(df_merged, \"Accuracy\")),\n",
+    "        (1, \"Evaluation Year\"): (\"ROC-AUC\", generate_heatmap_data_for_handler(df_merged, \"ROC-AUC\")),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "vmin = 40\n",
+    "vmax = 100\n",
+    "print(vmin, vmax)\n",
+    "\n",
+    "fig = plot_heatmap_grid(\n",
+    "    data=plot_content,\n",
+    "    cbar_label=\"[Metric] %\",\n",
+    "    nrows=1,\n",
+    "    ncols=2,\n",
+    "    vmin=vmin,\n",
+    "    vmax=vmax,\n",
+    "    cbar=True,\n",
+    "    single_cbar=False,\n",
+    "    width_factor=0.7,\n",
+    "    height_factor=0.53,\n",
+    "    grid_alpha=0.5,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"evaluation_metrics_yb_one\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_content = {\n",
+    "    (0, \"Trained up to\"): {\n",
+    "        (0, \"Evaluation Year\"): (\"F1-micro\", generate_heatmap_data_for_handler(df_merged, \"F1-micro\")),\n",
+    "        (1, \"Evaluation Year\"): (\"F1-macro\", generate_heatmap_data_for_handler(df_merged, \"F1-macro\")),\n",
+    "        (2, \"Evaluation Year\"): (\"F1-weighted\", generate_heatmap_data_for_handler(df_merged, \"F1-weighted\")),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "vmin = 40\n",
+    "vmax = 100\n",
+    "print(vmin, vmax)\n",
+    "\n",
+    "fig = plot_heatmap_grid(\n",
+    "    data=plot_content,\n",
+    "    cbar_label=\"[Metric] %\",\n",
+    "    nrows=1,\n",
+    "    ncols=3,\n",
+    "    vmin=vmin,\n",
+    "    vmax=vmax,\n",
+    "    cbar=True,\n",
+    "    single_cbar=False,\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.53,\n",
+    "    grid_alpha=0.5,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"evaluation_metrics_yb_two\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_window_size.ipynb b/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_window_size.ipynb
new file mode 100644
index 000000000..108303011
--- /dev/null
+++ b/analytics/plotting/rh_thesis/evaluation_setup/yb_heatmap_window_size.ipynb
@@ -0,0 +1,359 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib.figure import Figure\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "from analytics.plotting.common.common import init_plot\n",
+    "from analytics.plotting.common.font import setup_font\n",
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/00_varying_periodic_intervals/baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 16\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handlers = [\n",
+    "    f\"periodic-{x}\" for x in [\"current\", \"delta+-1y\", \"delta+-2y\", \"delta+-3y\", \"delta+-5y\", \"delta+-10y\", \"delta+-15y\"]\n",
+    "]\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"].isin(eval_handlers))\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # December (because of the -1 second in the timestamp format) -> start of the following year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_heatmap_grid(\n",
+    "    data: dict[tuple[int, str], dict[tuple[int, str], pd.DataFrame]],\n",
+    "    cbar_label: str,\n",
+    "    # Define vmin and vmax for the color scale to be consistent across heatmaps\n",
+    "    vmin: float = 0,\n",
+    "    vmax: float = 0.3,\n",
+    "    nrows: int = 4,\n",
+    "    ncols: int = 4,\n",
+    "    cbar: bool = False,\n",
+    "    single_cbar: bool = False,\n",
+    "    height_factor: float = 1.2,\n",
+    "    width_factor: float = 1.0,\n",
+    "    x_space_factor: float = 1,\n",
+    "    y_space_factor: float = 1,\n",
+    "    grid_alpha: float = 0.0,\n",
+    ") -> Figure:\n",
+    "    init_plot()\n",
+    "    setup_font(small_label=True, small_title=True)\n",
+    "\n",
+    "    double_fig_width = 10\n",
+    "    double_fig_height = 3.5\n",
+    "    fig, axs = plt.subplots(\n",
+    "        nrows=nrows,\n",
+    "        ncols=ncols,\n",
+    "        edgecolor=\"black\",\n",
+    "        frameon=True,\n",
+    "        figsize=(double_fig_width * width_factor, 2.2 * double_fig_height * height_factor),\n",
+    "        dpi=450,\n",
+    "        squeeze=True,\n",
+    "    )\n",
+    "\n",
+    "    for x, row_key in enumerate(data):\n",
+    "        (row, row_title) = row_key\n",
+    "        for y, col_key in enumerate(data[row_key]):\n",
+    "            (col, col_title) = col_key\n",
+    "            title, cell_data = data[row_key][col_key]\n",
+    "            ax = axs[x, y] if nrows > 1 else axs[y]\n",
+    "\n",
+    "            _ = build_heatmap(\n",
+    "                cell_data,\n",
+    "                # note that for some years we have two interval centers\n",
+    "                # This is because the evaluation epochs are yearly and the interval offsets are bound by the dataset\n",
+    "                # start, therefore the right interval end is asymmetrically far to the right compared to the left bound.\n",
+    "                # We can still act as if we have a value for every year\n",
+    "                x_ticks=[1950, 1975, 2000],\n",
+    "                y_ticks=[1950, 1975, 2000],\n",
+    "                x_label=col_title,\n",
+    "                y_label=row_title,\n",
+    "                reverse_col=True,\n",
+    "                # x_label,\n",
+    "                # color_label = \"MMD\",  # TODO\n",
+    "                title_label=title,\n",
+    "                target_ax=ax,\n",
+    "                square=False,\n",
+    "                width_factor=width_factor,\n",
+    "                height_factor=height_factor,\n",
+    "                cbar=single_cbar,\n",
+    "                vmin=vmin,\n",
+    "                vmax=vmax,\n",
+    "                grid_alpha=grid_alpha,\n",
+    "            )\n",
+    "            ax.label_outer()  # Remove labels for inner plots, keep only on outer\n",
+    "\n",
+    "    if cbar:\n",
+    "        cbar_ax = fig.add_axes([1, 0.25, 0.02, 0.6])  # Adjust the colorbar position\n",
+    "        custom_cbar = fig.colorbar(ax.collections[0], cax=cbar_ax)  # use the last plotted axis\n",
+    "        custom_cbar.set_label(cbar_label)  # Set your custom label here\n",
+    "\n",
+    "    plt.subplots_adjust(wspace=0.1 * x_space_factor, hspace=0.1 * y_space_factor)\n",
+    "    return fig"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_heatmap_data_for_handler(data: pd.DataFrame, handler: str) -> pd.DataFrame:\n",
+    "    # build heatmap matrix dataframe:\n",
+    "    data_filtered = data[data[\"eval_handler\"] == handler]\n",
+    "    pt_data = data_filtered.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")\n",
+    "    return pt_data\n",
+    "\n",
+    "\n",
+    "plot_content = {\n",
+    "    (0, \"Trained up to\"): {\n",
+    "        (0, \"Evaluation Year\"): (\"Same Year\", generate_heatmap_data_for_handler(df_merged, \"periodic-current\")),\n",
+    "        (1, \"Evaluation Year\"): (\n",
+    "            \"3 Year Window (±1 yr.)\",\n",
+    "            generate_heatmap_data_for_handler(df_merged, \"periodic-delta+-1y\"),\n",
+    "        ),\n",
+    "        (2, \"Evaluation Year\"): (\n",
+    "            \"11 Year Window (±5 yr.)\",\n",
+    "            generate_heatmap_data_for_handler(df_merged, \"periodic-delta+-5y\"),\n",
+    "        ),\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "# color scale bounds: the data-driven search below is disabled in favor of fixed values\n",
+    "vmin = 1\n",
+    "vmax = 0\n",
+    "# for row_key in plot_content:\n",
+    "#     for col_key in plot_content[row_key]:\n",
+    "#         (_, cell_data) = plot_content[row_key][col_key]\n",
+    "#         vmin = min(vmin, cell_data.min().min())\n",
+    "#         vmax = max(vmax, cell_data.max().max())\n",
+    "vmin = 40\n",
+    "vmax = 100\n",
+    "print(vmin, vmax)\n",
+    "\n",
+    "fig = plot_heatmap_grid(\n",
+    "    data=plot_content,\n",
+    "    cbar_label=\"Accuracy %\",\n",
+    "    nrows=1,\n",
+    "    ncols=3,\n",
+    "    vmin=vmin,\n",
+    "    vmax=vmax,\n",
+    "    cbar=True,\n",
+    "    single_cbar=False,\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.53,\n",
+    "    grid_alpha=0.5,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"evaluation_smoothing_yb\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/hp_kaggle_analytics.ipynb b/analytics/plotting/rh_thesis/hp_kaggle_analytics.ipynb
new file mode 100644
index 000000000..d5178b354
--- /dev/null
+++ b/analytics/plotting/rh_thesis/hp_kaggle_analytics.ipynb
@@ -0,0 +1,482 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import deque\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "\n",
+    "from analytics.plotting.common.dataset_histogram import (\n",
+    "    build_countplot,\n",
+    "    build_cum_barplot,\n",
+    "    build_histogram_multicategory_barnorm,\n",
+    "    build_histogram_multicategory_facets,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from benchmark.huffpost_kaggle.data_generation import HuffpostKaggleDataGenerator\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use interactive plotly\n",
+    "interactive = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "huffpost_dataset = HuffpostKaggleDataGenerator(\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/huffpost_kaggle/\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/huffpost_kaggle/raw/news-category-dataset.zip\"),\n",
+    ")\n",
+    "# huffpost_dataset.extract_data(Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/huffpost_kaggle/raw/news-category-dataset.zip\"))\n",
+    "hp_df = huffpost_dataset.load_into_dataframe(keep_true_category=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hp_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hp_df[\"category\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hp_df[\"month\"] = hp_df[\"date\"].dt.to_period(\"M\")\n",
+    "\n",
+    "# x = build_histogram(\n",
+    "#     hp_df,\n",
+    "#     x=\"month\",\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# number of samples over time\n",
+    "hp_df[\"year\"] = hp_df[\"date\"].dt.year\n",
+    "\n",
+    "if interactive:\n",
+    "    px.histogram(hp_df, x=\"date\")\n",
+    "else:\n",
+    "    # polished\n",
+    "    fig1 = build_countplot(\n",
+    "        hp_df,\n",
+    "        x=\"year\",\n",
+    "        x_ticks=[y for y in range(2012, 2022, 2)],\n",
+    "        y_ticks_bins=3,\n",
+    "        height_factor=0.4,\n",
+    "        width_factor=1.0,\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Num. Samples\",\n",
+    "        palette_strip=None,\n",
+    "    )\n",
+    "\n",
+    "    save_plot(fig1, \"huffpost_kaggle_samples_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "category_and_years = hp_df[[\"category\", \"date\"]]\n",
+    "category_and_years[\"year\"] = category_and_years[\"date\"].dt.year\n",
+    "category_and_years = category_and_years[[\"category\", \"year\"]].drop_duplicates()\n",
+    "category_and_years = category_and_years.groupby(\"category\").size().reset_index()\n",
+    "category_and_years\n",
+    "category_and_years.columns = [\"category\", \"num_years\"]\n",
+    "category_and_years[category_and_years[\"num_years\"] > 9]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hp_df_reduced = hp_df.merge(category_and_years, on=\"category\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cut at 2018\n",
+    "hp_df_reduced_before_2018 = hp_df_reduced[hp_df_reduced[\"date\"] < \"2018-01-01\"]\n",
+    "hp_df_reduced_after_2018 = hp_df_reduced[hp_df_reduced[\"date\"] >= \"2018-01-01\"]\n",
+    "\n",
+    "category_counts_before_2018 = (\n",
+    "    hp_df_reduced_before_2018[\"category\"].value_counts().reset_index().sort_values(\"count\", ascending=False)\n",
+    ")\n",
+    "category_counts_after_2018 = (\n",
+    "    hp_df_reduced_after_2018[\"category\"].value_counts().reset_index().sort_values(\"count\", ascending=False)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_category_ratios(df: pd.DataFrame) -> pd.DataFrame:\n",
+    "    total_samples = df.shape[0]\n",
+    "    category_counts = df[\"category\"].value_counts().reset_index().sort_values(\"count\", ascending=False)\n",
+    "    category_counts[\"ratio\"] = category_counts[\"count\"] / total_samples\n",
+    "    return category_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyse ratio of categories\n",
+    "category_counts = find_category_ratios(hp_df_reduced)\n",
+    "category_counts_before_2018 = find_category_ratios(hp_df_reduced_before_2018)\n",
+    "category_counts_after_2018 = find_category_ratios(hp_df_reduced_after_2018)\n",
+    "\n",
+    "category_counts = category_counts.merge(\n",
+    "    category_counts_before_2018, on=\"category\", suffixes=(\"\", \"_before_2018\"), how=\"left\"\n",
+    ").merge(category_counts_after_2018, on=\"category\", suffixes=(\"\", \"_after_2018\"), how=\"left\")\n",
+    "category_counts.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sorted_categories = (category_counts.sort_values(\"count\", ascending=False))[\"category\"]\n",
+    "sorted_categories\n",
+    "\n",
+    "\n",
+    "hp_df_reduced[\"sort_idx\"] = pd.Categorical(hp_df_reduced[\"category\"], categories=sorted_categories, ordered=True)\n",
+    "hp_df_reduced = hp_df_reduced.sort_values(\"sort_idx\", ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # we want to find out the ratio of the dataset (all, <2018, >=2018) that we cover when showing only\n",
+    "# # the top 12 categories from <2018 and the top 4 from >=2018 (some might overlap)\n",
+    "# top_12_before_2018 = category_counts_before_2018.head(12)\n",
+    "# df_top_12_before_2018 = hp_df_reduced_before_2018[hp_df_reduced_before_2018[\"category\"].isin(top_12_before_2018[\"category\"])]\n",
+    "\n",
+    "# top_4_after_2018 = category_counts_after_2018.head(4)\n",
+    "# df_top_4_after_2018 = hp_df_reduced_after_2018[hp_df_reduced_after_2018[\"category\"].isin(top_4_after_2018[\"category\"])]\n",
+    "\n",
+    "# percentage_before_2018 = df_top_12_before_2018.shape[0] / hp_df_reduced_before_2018.shape[0]\n",
+    "# percentage_after_2018 = df_top_4_after_2018.shape[0] / hp_df_reduced_after_2018.shape[0]\n",
+    "# percentage_total = (df_top_12_before_2018.shape[0] + df_top_4_after_2018.shape[0]) / hp_df_reduced.shape[0]\n",
+    "\n",
+    "# print(percentage_before_2018, percentage_after_2018, percentage_total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Export for thesis table\n",
+    "from analytics.plotting.common.save import save_csv_df\n",
+    "\n",
+    "# select the top 8 categories only (adding the bottom 2 is disabled, see the trailing comment on the next line)\n",
+    "export_csv = pd.concat([category_counts.head(8)])[[\"category\", \"count\", \"ratio\"]]  # , category_counts.tail(2)\n",
+    "export_csv[\"ratio\"] = export_csv[\"ratio\"].apply(lambda x: round(x * 100, 1))\n",
+    "print(export_csv)\n",
+    "\n",
+    "save_csv_df(export_csv, \"hp_kaggle_category_ratios\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plotting_threshold = category_counts.reset_index()[[\"index\", \"ratio\"]]\n",
+    "plotting_threshold[\"index\"] = plotting_threshold[\"index\"] + 1\n",
+    "# add first row: 0\n",
+    "plotting_threshold = pd.concat([pd.DataFrame({\"index\": [0], \"ratio\": [0]}), plotting_threshold])\n",
+    "\n",
+    "# cumulative sum\n",
+    "plotting_threshold[\"ratio\"] = plotting_threshold[\"ratio\"].cumsum() * 100\n",
+    "plotting_threshold.head(n=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot coverage of categories\n",
+    "label_hist = build_cum_barplot(\n",
+    "    plotting_threshold,\n",
+    "    x=\"index\",\n",
+    "    y=\"ratio\",\n",
+    "    x_label=\"Categories\",\n",
+    "    y_label=\"% of Dataset\",\n",
+    "    height_factor=0.4,\n",
+    "    width_factor=0.4,\n",
+    "    y_ticks_bins=3,\n",
+    "    x_ticks_bins=4,\n",
+    ")\n",
+    "save_plot(label_hist, \"huffpost_kaggle_category_coverage\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if interactive:\n",
+    "    px.histogram(hp_df_reduced, x=\"date\", color=\"category\")\n",
+    "    fig = px.histogram(\n",
+    "        hp_df_reduced,\n",
+    "        x=\"date\",\n",
+    "        color=\"category\",\n",
+    "        facet_col=\"category\",\n",
+    "        facet_col_wrap=4,\n",
+    "        height=2000,\n",
+    "        facet_row_spacing=0.05,\n",
+    "        category_orders={\"category\": (category_counts[\"category\"].tolist())},\n",
+    "        color_discrete_sequence=px.colors.qualitative.Safe,\n",
+    "    )\n",
+    "    fig.update_yaxes(matches=None, showticklabels=True)\n",
+    "    fig.update_xaxes(showticklabels=True)\n",
+    "    fig.show()\n",
+    "else:\n",
+    "    fig_all = build_histogram_multicategory_facets(\n",
+    "        hp_df_reduced,\n",
+    "        x=\"date\",\n",
+    "        label=\"category\",\n",
+    "        sorted_categories=sorted_categories,\n",
+    "        height_factor=2.25,\n",
+    "        width_factor=1.5,\n",
+    "        # legend_labels=list(merged),\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Number of Samples\",\n",
+    "        x_ticks=[pd.to_datetime(d) for d in [\"2014-05-01\", \"2018-06-01\"]],\n",
+    "        sharey=False,\n",
+    "    )\n",
+    "\n",
+    "    save_plot(fig_all, \"huffpost_kaggle_label_distribution_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if interactive:\n",
+    "    fig = px.histogram(\n",
+    "        hp_df_reduced,\n",
+    "        x=\"date\",\n",
+    "        color=\"category\",\n",
+    "        height=500,\n",
+    "        barnorm=\"percent\",\n",
+    "        category_orders={\"category\": (category_counts[\"category\"].tolist())},\n",
+    "        color_discrete_sequence=px.colors.qualitative.Safe,\n",
+    "    )\n",
+    "    fig.update_yaxes(matches=None, showticklabels=True)\n",
+    "    fig.update_xaxes(showticklabels=True)\n",
+    "    fig.show()\n",
+    "else:\n",
+    "    # legend:\n",
+    "    # alternately take the most frequent labels before 2018 and after 2018 until 8 distinct labels are collected; use them in the legend\n",
+    "    before_labels = deque(category_counts_before_2018[\"category\"].tolist())\n",
+    "    after_labels = deque(category_counts_after_2018[\"category\"].tolist())\n",
+    "\n",
+    "    # iteratively take first element from each list and append to merged SET until 8 distinct elements are in the set\n",
+    "    merged = set()\n",
+    "    while len(merged) < 8:\n",
+    "        if before_labels:\n",
+    "            merged.add(before_labels.popleft())\n",
+    "        if len(merged) == 8:\n",
+    "            break\n",
+    "        if after_labels:\n",
+    "            merged.add(after_labels.popleft())\n",
+    "    fig_labels_distribution = build_histogram_multicategory_barnorm(\n",
+    "        hp_df_reduced,\n",
+    "        x=\"date\",\n",
+    "        label=\"category\",\n",
+    "        sorted_coloring_categories=sorted_categories,\n",
+    "        height_factor=0.55,\n",
+    "        width_factor=1.0,\n",
+    "        legend_labels=list(merged),\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Label Distribution\",\n",
+    "        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "        y_ticks_bins=4,\n",
+    "        x_ticks=[pd.to_datetime(d) for d in [\"2014-05-01\", \"2015-07-01\", \"2018-06-01\", \"2021-01-01\"]],\n",
+    "        legend_title=\"Article Category\",\n",
+    "    )\n",
+    "\n",
+    "    save_plot(fig_labels_distribution, \"huffpost_kaggle_label_distribution_over_time_relative\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if interactive:\n",
+    "    fig = px.histogram(\n",
+    "        hp_df_reduced_before_2018,\n",
+    "        x=\"date\",\n",
+    "        color=\"category\",\n",
+    "        height=500,\n",
+    "        barnorm=\"percent\",\n",
+    "        category_orders={\"category\": (category_counts_before_2018[\"category\"].tolist())},\n",
+    "        color_discrete_sequence=px.colors.qualitative.Safe,\n",
+    "    )\n",
+    "    fig.update_yaxes(matches=None, showticklabels=True)\n",
+    "    fig.update_xaxes(showticklabels=True)\n",
+    "    fig.show()\n",
+    "\n",
+    "    fig = px.histogram(\n",
+    "        hp_df_reduced_after_2018,\n",
+    "        x=\"date\",\n",
+    "        color=\"category\",\n",
+    "        height=500,\n",
+    "        barnorm=\"percent\",\n",
+    "        category_orders={\"category\": (category_counts_after_2018[\"category\"].tolist())},\n",
+    "        # color palette\n",
+    "        color_discrete_sequence=px.colors.qualitative.Safe,\n",
+    "    )\n",
+    "    fig.update_yaxes(matches=None, showticklabels=True)\n",
+    "    fig.update_xaxes(showticklabels=True)\n",
+    "    fig.show()\n",
+    "\n",
+    "else:\n",
+    "    # -------------------------------------------------- Before 2018 ------------------------------------------------- #\n",
+    "    category_counts_before_2018[\"sort_idx\"] = pd.Categorical(\n",
+    "        category_counts_before_2018[\"category\"],\n",
+    "        categories=category_counts_before_2018[\"category\"].tolist(),\n",
+    "        ordered=True,\n",
+    "    )\n",
+    "    category_counts_before_2018.sort_values(\"sort_idx\", ascending=True, inplace=True)\n",
+    "\n",
+    "    fig_before_2018 = build_histogram_multicategory_barnorm(\n",
+    "        hp_df_reduced_before_2018,\n",
+    "        x=\"date\",\n",
+    "        label=\"category\",\n",
+    "        sorted_coloring_categories=sorted_categories,\n",
+    "        sorted_ordering_categories=category_counts_before_2018[\"category\"].tolist(),\n",
+    "        height_factor=0.55,\n",
+    "        width_factor=1.0,\n",
+    "        legend_labels=category_counts_before_2018[\"category\"].tolist()[:8],\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Label Distribution\",\n",
+    "        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "        legend_title=\"Article Category\",\n",
+    "        nbins=60,\n",
+    "    )\n",
+    "    save_plot(fig_before_2018, \"huffpost_kaggle_label_distribution_over_time_relative_before_2018\")\n",
+    "\n",
+    "    # -------------------------------------------------- After 2018 -------------------------------------------------- #\n",
+    "\n",
+    "    hp_df_reduced_after_2018[\"sort_idx\"] = pd.Categorical(\n",
+    "        hp_df_reduced_after_2018[\"category\"], categories=category_counts_after_2018[\"category\"].tolist(), ordered=True\n",
+    "    )\n",
+    "    hp_df_reduced_after_2018.sort_values(\"sort_idx\", ascending=True, inplace=True)\n",
+    "\n",
+    "    # we want the legend to have different sorting\n",
+    "    fig_after_2018 = build_histogram_multicategory_barnorm(\n",
+    "        hp_df_reduced_after_2018,\n",
+    "        x=\"date\",\n",
+    "        label=\"category\",\n",
+    "        sorted_coloring_categories=sorted_categories,\n",
+    "        sorted_ordering_categories=category_counts_after_2018[\"category\"].tolist(),\n",
+    "        height_factor=0.55,\n",
+    "        width_factor=1.0,\n",
+    "        legend_labels=category_counts_after_2018[\"category\"].tolist()[:8],\n",
+    "        x_ticks=[pd.to_datetime(d) for d in [\"2019-01-01\", \"2020-01-01\", \"2021-01-01\", \"2022-01-01\"]],\n",
+    "        x_label=\"Sample Time\",\n",
+    "        y_label=\"Label Distribution\",\n",
+    "        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "        legend_title=\"Article Category\",\n",
+    "        nbins=60,\n",
+    "    )\n",
+    "    save_plot(fig_after_2018, \"huffpost_kaggle_label_distribution_over_time_relative_after_2018\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/arxiv_cost.ipynb b/analytics/plotting/rh_thesis/performance/arxiv_cost.ipynb
new file mode 100644
index 000000000..03a3f16f0
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/arxiv_cost.ipynb
@@ -0,0 +1,227 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/30_performance\")\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [762]  # performancetrigger_num_misclass-10000-exp-0.6-red-False--int20000\n",
+    "\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [762],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        762: \"arXiv PerformanceTrigger (NumMisclass.)\",\n",
+    "    },\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 1000, 200)],\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "save_plot(fig, \"arxiv_performance-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lower/insignificant policy eval costs compared to drift"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/arxiv_heatmap_single.ipynb b/analytics/plotting/rh_thesis/performance/arxiv_heatmap_single.ipynb
new file mode 100644
index 000000000..44c4d2e7e
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/arxiv_heatmap_single.ipynb
@@ -0,0 +1,285 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/30_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 762  # performancetrigger_num_misclass-10000-exp-0.6-red-False--int20000\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"real_train_end\", \"interval_center\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"real_train_end\"] = df_merged[\"real_train_end\"].apply(lambda x: pd.Period(x, freq=\"M\"))\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.columns))[::1]\n",
+    "        if period in [pd.Period(\"Mar 2000\"), pd.Period(\"Mar 2009\"), pd.Period(\"Mar 2020\")]\n",
+    "    ],\n",
+    "    y_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\".replace(\" \", \"\\n\"))\n",
+    "        for i, period in list(enumerate(heatmap_data.index))[::1]\n",
+    "        if period in [pd.Period(\"Jun 2000\"), pd.Period(\"Jun 2009\"), pd.Period(\"May 2020\")]\n",
+    "    ],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Arxiv PerformanceTrigger\\nExp. Acc=60% | NumMiscl=10k | No Reduction\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.7,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    "    x_axis=\"period\",\n",
+    ")\n",
+    "save_plot(fig, \"arxiv_trigger_heatmap_performance_single_dynamic\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/hp_cost.ipynb b/analytics/plotting/rh_thesis/performance/hp_cost.ipynb
new file mode 100644
index 000000000..a4401c8df
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/hp_cost.ipynb
@@ -0,0 +1,221 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/30_performance/static_dyn\")\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [639]  # performancetrigger_static-0.5-int1500y\n",
+    "\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [639],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        639: \"HuffPost Static PerformanceTrigger\",\n",
+    "    },\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2014-05-01\", \"2018-06-01\", \"2021-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"May\\n2014\", \"Jun\\n2018\", \"Jan\\n2021\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 110, 25)],\n",
+    "    y_lim_cumulative=(0, 100),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"huffpost_performance-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lower policy eval costs compared to drift"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/hp_heatmap_single.ipynb b/analytics/plotting/rh_thesis/performance/hp_heatmap_single.ipynb
new file mode 100644
index 000000000..ad25baf58
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/hp_heatmap_single.ipynb
@@ -0,0 +1,286 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/30_performance/static_dyn\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 639  # performancetrigger_static-0.5-int1500y\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# according to which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].dt.to_period(\"M\")\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.to_period(\"M\")\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"real_train_end\", \"interval_center\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"real_train_end\"] = df_merged[\"real_train_end\"].apply(lambda x: pd.Period(x, freq=\"M\"))\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[\n",
+    "        (i, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.columns))[::1]\n",
+    "        if period in [pd.Period(\"Apr 2014\"), pd.Period(\"Jul 2018\"), pd.Period(\"Jan 2022\")]\n",
+    "    ],\n",
+    "    y_custom_ticks=[\n",
+    "        (i + 0.5, f\"{period.to_timestamp().strftime('%b %Y')}\")\n",
+    "        for i, period in list(enumerate(heatmap_data.index))[::1]\n",
+    "    ],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"HuffPost PerformanceTrigger: Static Accuracy Threshold=50%\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.5,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    "    x_axis=\"period\",\n",
+    ")\n",
+    "save_plot(fig, \"hp_trigger_heatmap_performance_single_static\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/yb_cost.ipynb b/analytics/plotting/rh_thesis/performance/yb_cost.ipynb
new file mode 100644
index 000000000..acf1a8608
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/yb_cost.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/num_misclass/\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [759]  # yearbook_performancetrigger_num_misclass-100-exp-0.9-red-False--int250y\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")\n",
+    "\n",
+    "# Yearbook has a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
+    "if patch_yearbook:\n",
+    "    patch_yearbook_time(df_adjusted, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [759],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        # title_label=\"Yearbook PerformanceTrigger:\\n Exp. Acc=90% | NumMiscl=100 | No Reduction\",\n",
+    "        759: \"Yearbook PerformanceTrigger (NumMiscl)\",\n",
+    "    },\n",
+    "    height_factor=0.7,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (sec)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
+    "    y_ticks_cumulative=[x for x in range(0, 9 + 1, 3)],\n",
+    "    y_lim_cumulative=(0, 10),\n",
+    "    y_minutes=False,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"yearbook_performance-trigger-cost-matrix\")\n",
+    "# Lower policy costs than in drift case"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/yearbook_heatmap_multi.ipynb b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_multi.ipynb
new file mode 100644
index 000000000..17e38c2b7
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_multi.ipynb
@@ -0,0 +1,372 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    dfs_models_and_evals,\n",
+    "    patch_yearbook_time,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/static_dyn/\"\n",
+    "    ),\n",
+    "    Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/num_misclass/\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_ids = list(pipelines.keys())\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_logs_models_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=False, pipeline_id=pipeline_id)\n",
+    "    df_logs_models_single, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_logs_models_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "    df_logs_models_list.append(df_logs_models_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_logs_models = pd.concat(df_logs_models_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "len(df_adjusted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()\n",
+    "len(df_adjusted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "\n",
+    "df_train_end_years_per_model = df_logs_models[[\"pipeline_id\", \"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted.groupby([\"pipeline_id\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Static Performance Thresholds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_pids = list(reversed([437, 432, 429, 425, 421, 418, 414, 411]))\n",
+    "\n",
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=[\"pipeline_id\", \"model_idx\"], how=\"left\")\n",
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"pipeline_id\"] = df_merged[\"pipeline_id\"].astype(int)\n",
+    "df_merged = df_merged[df_merged[\"pipeline_id\"].isin(_pids)]\n",
+    "heatmap_data = df_merged.pivot(index=[\"pipeline_id\"], columns=\"interval_center\", values=\"value\")\n",
+    "\n",
+    "heatmap_data.index.min(), heatmap_data.index.max()\n",
+    "heatmap_data\n",
+    "\n",
+    "# order the rows (pipelines) as listed in _pids\n",
+    "heatmap_data = heatmap_data.reindex(_pids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "\n",
+    "pipelines_refs = {437: \"95%\", 432: \"92.5%\", 429: \"90%\", 425: \"87.5%\", 421: \"85%\", 418: \"80%\", 414: \"75%\", 411: \"70%\"}\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Pipeline with\\nAccuracy Threshold\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook Composite Models: Static Accuracy Thresholds\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.58,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    triggers={\n",
+    "        i: df_logs_models[df_logs_models[\"pipeline_id\"] == p_id][\n",
+    "            [\"train_start\", \"train_end\", \"usage_start\", \"usage_end\"]\n",
+    "        ]\n",
+    "        for i, p_id in enumerate(heatmap_data.index)\n",
+    "    },\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_performance_multi_static_thresholds\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dynamic Thresholds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_pids = (\n",
+    "    # Num misclass: with reduction\n",
+    "    list(reversed([734, 758, 749]))\n",
+    "    +\n",
+    "    # Num misclass: without reduction\n",
+    "    list(reversed([736, 759, 751, 743]))\n",
+    "    +\n",
+    "    # roll avg\n",
+    "    list([527, 516, 506, 494])\n",
+    "    +\n",
+    "    # quantile\n",
+    "    [445]\n",
+    ")\n",
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=[\"pipeline_id\", \"model_idx\"], how=\"left\")\n",
+    "# build heatmap matrix dataframe:\n",
+    "df_merged[\"pipeline_id\"] = df_merged[\"pipeline_id\"].astype(int)\n",
+    "df_merged = df_merged[df_merged[\"pipeline_id\"].isin(_pids)]\n",
+    "heatmap_data = df_merged.pivot(index=[\"pipeline_id\"], columns=\"interval_center\", values=\"value\")\n",
+    "\n",
+    "heatmap_data.index.min(), heatmap_data.index.max()\n",
+    "\n",
+    "# order the rows (pipelines) as listed in _pids\n",
+    "heatmap_data = heatmap_data.reindex(_pids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "pipelines_refs = {\n",
+    "    # Roll Avg\n",
+    "    527: \"Δ 0.3\",\n",
+    "    516: \"Δ 0.2\",\n",
+    "    506: \"Δ 0.1\",\n",
+    "    494: \"Δ 0.05\",\n",
+    "    # Quantile\n",
+    "    445: \"% 0.05\",\n",
+    "    # Num misclass: without reduction\n",
+    "    736: \"X 50, noRed\",\n",
+    "    759: \"X 100, noRed\",\n",
+    "    751: \"X 200, noRed\",\n",
+    "    743: \"X 500, noRed\",\n",
+    "    # Num misclass: with reduction\n",
+    "    734: \"X 50, Red\",\n",
+    "    758: \"X 100, Red\",\n",
+    "    749: \"X 200, Red\",\n",
+    "}\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Criterion\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook Composite Models:\\nDynamic Performance Thresholds & Num. Misclassifications\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.75,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    triggers={\n",
+    "        i: df_logs_models[df_logs_models[\"pipeline_id\"] == p_id][\n",
+    "            [\"train_start\", \"train_end\", \"usage_start\", \"usage_end\"]\n",
+    "        ]\n",
+    "        for i, p_id in enumerate(heatmap_data.index)\n",
+    "    },\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_performance_multi_dyn_thresholds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single.ipynb b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single.ipynb
new file mode 100644
index 000000000..934586937
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single.ipynb
@@ -0,0 +1,307 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/static_dyn\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 418  # 250 0.8 static\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, str(y)) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"Yearbook PerformanceTrigger: Static Accuracy Threshold=70%\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.55,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_performance_single_static\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single_num_miclass.ipynb b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single_num_miclass.ipynb
new file mode 100644
index 000000000..e39e4764a
--- /dev/null
+++ b/analytics/plotting/rh_thesis/performance/yearbook_heatmap_single_num_miclass.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/30_performance/num_misclass/\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 759  # yearbook_performancetrigger_num_misclass-100-exp-0.9-red-False--int250y\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False\n",
+    "\n",
+    "print(f\"Pipeline ID: {pipeline_id}, name: {pipelines[pipeline_id][0]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_custom_ticks=[(i + 0.5, str(y)) for i, y in enumerate(heatmap_data.index)],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    # yearbook_performancetrigger_num_misclass-100-exp-0.9-red-False--int250y\n",
+    "    title_label=\"Yearbook PerformanceTrigger:\\n Exp. Acc=90% | NumMiscl=100 | No Reduction\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=1,\n",
+    "    height_factor=0.55,\n",
+    "    # grid_alpha=0.4,\n",
+    "    grid_alpha=0.0,\n",
+    "    # disable_horizontal_grid=True,\n",
+    "    # cbar=False,\n",
+    "    df_logs_models=df_logs_models,\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_performance_single_num_misclass\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb b/analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb
new file mode 100644
index 000000000..1d8c2914e
--- /dev/null
+++ b/analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb
@@ -0,0 +1,306 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.plotting.common.color import discrete_colors\n",
+    "from analytics.plotting.common.linear_regression_scatterplot import scatter_linear_regression\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import StageLog\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS\n",
+    "\n",
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/11_baselines_amount\"\n",
+    ")\n",
+    "# pipelines_dir = Path(\n",
+    "#     \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/11_baselines_amount\"\n",
+    "# )\n",
+    "# pipelines_dir = Path(\n",
+    "#     \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/11_baselines_amount\"\n",
+    "# )\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots\")\n",
+    "assert pipelines_dir.exists()\n",
+    "assert output_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extract number of epochs\n",
+    "num_epochs: int | None = None\n",
+    "\n",
+    "for p_id, logs in pipeline_logs.items():\n",
+    "    for log in logs:\n",
+    "        if num_epochs is None:\n",
+    "            num_epochs = logs.config.pipeline.training.epochs_per_trigger\n",
+    "        else:\n",
+    "            assert num_epochs == logs.config.pipeline.training.epochs_per_trigger\n",
+    "\n",
+    "assert num_epochs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_train: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipelines:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    train_logs = [record for record in logs.supervisor_logs.stage_runs if record.id == PipelineStage.TRAIN.name]\n",
+    "    df_train = StageLog.df(stage_logs=train_logs, extended=True)\n",
+    "    df_train[\"pipeline_id\"] = pipelines[pipeline_id][0]\n",
+    "    list_df_train.append(df_train)\n",
+    "\n",
+    "df_train = pd.concat(list_df_train)\n",
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clean pipeline name\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def pipeline_name_cleaner(name: str):\n",
+    "    return re.sub(r\".*dataamount_(\\d+)\", r\"\\1\", name)\n",
+    "\n",
+    "\n",
+    "df_train[\"pipeline_id\"] = df_train[\"pipeline_id\"].apply(pipeline_name_cleaner)\n",
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# to minutes\n",
+    "df_train[\"duration\"] = df_train[\"duration\"].dt.total_seconds() / 60\n",
+    "# df_train[\"duration\"] = df_train[\"duration\"].dt.total_seconds()\n",
+    "# df_train[\"train_time_at_trainer\"] = df_train[\"train_time_at_trainer\"] / 1_000  # millis to seconds\n",
+    "df_train[\"train_time_at_trainer\"] = df_train[\"train_time_at_trainer\"] / 1_000 / 60  # millis to minutes\n",
+    "\n",
+    "# input samples per epoch (vs. the total number of passed samples: num_samples)\n",
+    "df_train[\"num_input_samples\"] = df_train[\"num_samples\"] / num_epochs\n",
+    "\n",
+    "\n",
+    "dataset = pipelines_dir.parent.name\n",
+    "\n",
+    "if dataset != \"yearbook\":\n",
+    "    df_train[\"num_input_samples\"] = df_train[\"num_input_samples\"] / 1_000\n",
+    "    df_train[\"pipeline_id\"] = (df_train[\"pipeline_id\"].astype(int) // 1_000).astype(str) + \"k\"\n",
+    "\n",
+    "\n",
+    "df_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sort by number of samples\n",
+    "df_train = df_train.sort_values(by=\"num_samples\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = scatter_linear_regression(\n",
+    "    df_train,\n",
+    "    x=\"num_input_samples\",\n",
+    "    y=\"train_time_at_trainer\",  # duration is broken due to bug in grpc interface\n",
+    "    hue=\"pipeline_id\",\n",
+    "    palette=(\n",
+    "        discrete_colors(14)[0:4] + discrete_colors(14)[10:14]\n",
+    "        if \"yearbook\" in str(pipelines_dir)\n",
+    "        else (\n",
+    "            discrete_colors(12)[0:4] + discrete_colors(12)[9:12]\n",
+    "            if \"huffpost\" in str(pipelines_dir)\n",
+    "            else discrete_colors(8)[0:3] + discrete_colors(8)[6:8]\n",
+    "        )\n",
+    "    ),\n",
+    "    title_label=\"Training Size (Samples) vs. Cost (Time)\",\n",
+    "    x_label=\"#Trained Samples (k) / #Epochs\",\n",
+    "    y_label=\"Training Time (min)\",\n",
+    "    legend_label=\"Trigger every\",\n",
+    "    height_factor=0.5 if dataset != \"yearbook\" else 0.55,\n",
+    "    width_factor=0.575 if dataset != \"yearbook\" else 0.7,\n",
+    "    small_legend_fonts=dataset != \"yearbook\",\n",
+    "    # x_ticks=[],\n",
+    "    # y_ticks=[],\n",
+    ")\n",
+    "\n",
+    "save_plot(\n",
+    "    fig=fig,\n",
+    "    name=dataset + \"_training_size_vs_cost\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: run more variants in less dense areas\n",
+    "# TODO: plot / add number of datapoints to thesis so that the significance of the regression line is clear\n",
+    "# State in thesis that there are no outliers to be expected!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plotting faulty time at supervisor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = scatter_linear_regression(\n",
+    "    df_train,\n",
+    "    x=\"num_input_samples\",\n",
+    "    y=\"duration\",  # broken due to bug in grpc interface\n",
+    "    hue=\"pipeline_id\",\n",
+    "    palette=(\n",
+    "        discrete_colors(14)[0:4] + discrete_colors(14)[10:14]\n",
+    "        if \"yearbook\" in str(pipelines_dir)\n",
+    "        else (\n",
+    "            discrete_colors(12)[0:4] + discrete_colors(12)[9:12]\n",
+    "            if \"huffpost\" in str(pipelines_dir)\n",
+    "            else discrete_colors(8)[0:3] + discrete_colors(8)[6:8]\n",
+    "        )\n",
+    "    ),\n",
+    "    title_label=\"Training Size (Samples) vs. Cost (Time)\",\n",
+    "    x_label=\"#Trained Samples (k) / #Epochs\",\n",
+    "    y_label=\"Supervisor TRAIN\" if dataset != \"yearbook\" else \"Supervisor TRAIN Stage (min)\",\n",
+    "    legend_label=\"Trigger every\",\n",
+    "    height_factor=0.5 if dataset != \"yearbook\" else 0.7,\n",
+    "    width_factor=0.575 if dataset != \"yearbook\" else 0.7,\n",
+    "    small_legend_fonts=dataset != \"yearbook\",\n",
+    "    # x_ticks=[],\n",
+    "    # y_ticks=[],\n",
+    ")\n",
+    "\n",
+    "save_plot(\n",
+    "    fig=fig,\n",
+    "    name=dataset + \"_training_size_vs_cost_bug_supervisor_time\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_multi_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_multi_pipeline.ipynb
new file mode 100644
index 000000000..74ae9eb6f
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_multi_pipeline.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_ids = [263, 265, 267, 272]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_eval_dfs = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    pipeline_log = pipeline_logs[pipeline_id]\n",
+    "    pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "    df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "    df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "        # subtracting would interfere with yearbook patching\n",
+    "        pipeline_log,\n",
+    "        df_all[\"sample_time\"].max(),\n",
+    "        pipeline_ref,\n",
+    "    )\n",
+    "    all_eval_dfs.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(all_eval_dfs)\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name transformer\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def name_transformer(name):\n",
+    "    # reduce the name to its trigger interval, e.g. yearbook_timetrigger_10y -> every 10 years\n",
+    "    return (\n",
+    "        re.sub(r\".*timetrigger_(\\d+)([wy])\", r\"every \\1 [\\2]\", name)\n",
+    "        .replace(\"[w]\", \"weeks\")\n",
+    "        .replace(\"[y]\", \"years\")\n",
+    "        .replace(\"1 years\", \"1 year\")\n",
+    "        .replace(\"1 weeks\", \"1 week\")\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "df_adjusted[\"pipeline_ref\"] = df_adjusted[\"pipeline_ref\"].apply(name_transformer)\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_adjusted[\"dataset_id\"].unique()\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == \"yearbook-test\"][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = -1  # \"00-pipeline-composite-model\"\n",
+    "# number_digits = len(str(df_adjusted[\"model_idx\"].max()))\n",
+    "# df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"].astype(str).str.zfill(number_digits)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])\n",
+    "\n",
+    "# df_composite = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "# df_composite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only composite models\n",
+    "reduced = df_adjusted[df_adjusted[\"model_idx\"] == -1].copy()\n",
+    "reduced"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"pipeline_ref\",\n",
+    "    style=\"pipeline_ref\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.75,\n",
+    "    legend_label=\"TimeTrigger Pipeline\",\n",
+    "    small_legend_fonts=True,\n",
+    "    # x_date_locator=mdates.YearLocator(20),\n",
+    "    # x_date_formatter=mdates.DateFormatter(\"%Y\"),  # %b\\n\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    xlim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    ylim=(20, 70),\n",
+    "    y_ticks=[30, 40, 50, 60],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    markers=False,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_arxiv_composite_models_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_single_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_single_pipeline.ipynb
new file mode 100644
index 000000000..8c806aa14
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/arxiv_accuracy_over_time_single_pipeline.ipynb
@@ -0,0 +1,282 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 267\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_adjusted[\"dataset_id\"].unique()\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == \"yearbook-test\"][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = -1  # \"00-pipeline-composite-model\"\n",
+    "# number_digits = len(str(df_adjusted[\"model_idx\"].max()))\n",
+    "# df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"].astype(str).str.zfill(number_digits)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])\n",
+    "\n",
+    "# df_composite = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "# df_composite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_model = 1\n",
+    "max_model = df_adjusted[\"model_idx\"].max()\n",
+    "thresholds_early_models = (6, 11)\n",
+    "threshold_late_models = 49\n",
+    "\n",
+    "reduced = df_adjusted.copy()\n",
+    "reduced[\"early_model\"] = (reduced[\"model_idx\"] <= thresholds_early_models[1]) & (\n",
+    "    reduced[\"model_idx\"] >= thresholds_early_models[0]\n",
+    ")\n",
+    "reduced[\"late_model\"] = reduced[\"model_idx\"] >= threshold_late_models\n",
+    "reduced[\"composite_model\"] = reduced[\"model_idx\"] == -1\n",
+    "reduced = reduced[reduced[\"early_model\"] | reduced[\"late_model\"] | reduced[\"composite_model\"]]\n",
+    "# assert len(reduced[reduced[\"early_model\"]][\"model_idx\"].unique()) == 10\n",
+    "# assert len(reduced[reduced[\"late_model\"]][\"model_idx\"].unique()) == 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model_idx < 0 --> composite model, early range --> \"early models\", late range --> \"late models\", anything else --> \"hide\"\n",
+    "def mapper(x):\n",
+    "    if x < 0:\n",
+    "        return \"Cur. Active\\nComposite Model\"\n",
+    "    if (x >= thresholds_early_models[0]) & (x <= thresholds_early_models[1]):\n",
+    "        return f\"Early Models ({thresholds_early_models[0]} - {thresholds_early_models[1]})\"\n",
+    "    if x >= threshold_late_models:\n",
+    "        return f\"Late Models ({threshold_late_models} - {max_model})\"\n",
+    "    return \"hide\"\n",
+    "\n",
+    "\n",
+    "reduced[\"model_type\"] = reduced[\"model_idx\"].apply(mapper)\n",
+    "reduced[\"model_type\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reduced"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"model_type\",\n",
+    "    style=\"model_type\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.65,\n",
+    "    legend_label=\"Model Type\",\n",
+    "    small_legend_fonts=True,\n",
+    "    # x_date_locator=mdates.YearLocator(20),\n",
+    "    # x_date_formatter=mdates.DateFormatter(\"%Y\"),  # %b\\n\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    xlim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    ylim=(-20, 75),\n",
+    "    y_ticks=[0, 20, 40, 60],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    markers=False,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_arxiv_26w_begin_vs_start_vs_composite\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost.ipynb b/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost.ipynb
new file mode 100644
index 000000000..1a25c6d8c
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [267, 269, 265] + [268, 271, 270]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [265, 269, 267],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        265: \"TimeTrigger 10 years\",\n",
+    "        269: \"TimeTrigger 2 years\",\n",
+    "        267: \"TimeTrigger 26 weeks\",\n",
+    "    },\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 1000, 200)],\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_time-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [268, 271, 270],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        268: \"AmountTrigger 500k samples\",\n",
+    "        271: \"AmountTrigger 100k samples\",\n",
+    "        270: \"AmountTrigger 25k samples\",\n",
+    "    },\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 1000, 200)],\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_amount-trigger-cost-matrix\")\n",
+    "# not interesting: note that for 250 samples we see multiple triggers at the same timestamp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plot 100k amount and 2y time trigger together"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [269, 271],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        269: \"TimeTrigger 2 years\",\n",
+    "        271: \"AmountTrigger 100k samples\",\n",
+    "    },\n",
+    "    height_factor=1.2,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(d)) for d in [\"2000-01-01\", \"2009-01-01\", \"2018-01-01\"]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [\"Jan 2000\", \"Jan 2009\", \"Jan 2018\"]]),\n",
+    "    x_lim=(pd.Timestamp(\"1995-01-01\"), pd.Timestamp(\"2024-09-01\")),\n",
+    "    y_ticks_cumulative=[x for x in range(0, 1000, 200)],\n",
+    "    y_lim_cumulative=(0, 1000),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"arxiv_timeamount-trigger-cost-matrix\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost_perf_tradeoff.ipynb
new file mode 100644
index 000000000..107c73b77
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/arxiv_cost_perf_tradeoff.ipynb
@@ -0,0 +1,454 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pipeline_name_mapper(name: str) -> str:\n",
+    "    name = name.replace(\"yearbook_\", \"\")\n",
+    "    name = name.replace(\"timetrigger_\", \"\")  # \"every \"\n",
+    "    name = name.replace(\"amounttrigger_\", \"\")  # \"every \"\n",
+    "    name = name.replace(\"mmdalibi_\", \"\")\n",
+    "    if name.endswith(\"y\"):\n",
+    "        name = name[:-1] + (\" years\" if not name.endswith(\"1y\") else \" year\")\n",
+    "    elif name.endswith(\"w\"):\n",
+    "        name = name[:-1] + (\" weeks\" if not name.endswith(\"1w\") else \" week\")\n",
+    "    elif not name.endswith(\"d\"):  # dataamount\n",
+    "        name = name.replace(\"dataamount_\", \"\") + \" samples\"\n",
+    "    else:  # drift\n",
+    "        name = name.replace(\"_\", \"/\")\n",
+    "    return name\n",
+    "\n",
+    "\n",
+    "pipelines = {\n",
+    "    p_id: (pipeline_name_mapper(pname), p_path)\n",
+    "    for p_id, (pname, p_path) in pipelines.items()\n",
+    "    if pipeline_name_mapper(pname)\n",
+    "    in [\n",
+    "        \"1 year\",\n",
+    "        \"5 years\",\n",
+    "        \"26 weeks\",\n",
+    "        \"25000 samples\",\n",
+    "        \"50000 samples\",\n",
+    "        \"100000 samples\",\n",
+    "    ]\n",
+    "}\n",
+    "pipeline_ids = list(pipelines.keys())\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"2005-01-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find the number of triggers per pipeline that occur after maximum_min\n",
+    "\n",
+    "# before the cutoff there was one trigger (equivalent to start of our reduced dataset): +1\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\")\n",
+    "assert mean_accuracies.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x or \"week\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged = merged.sort_values(by=[\"count\"])\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extract number of samples from pipeline name\n",
+    "amount_policies = \"/\".join(\n",
+    "    [\n",
+    "        f\"{int(x) // 1000}k\"\n",
+    "        for x in [str(y).replace(\" samples\", \"\") for y in merged[merged[\"type\"] == \"amount\"][\"pipeline_ref\"]]\n",
+    "    ]\n",
+    ")\n",
+    "time_policies = \"/\".join(\n",
+    "    [\n",
+    "        str(x)\n",
+    "        for x in [\n",
+    "            str(y).replace(\" weeks\", \"w\").replace(\" years\", \"y\").replace(\" year\", \"y\")\n",
+    "            for y in merged[merged[\"type\"] == \"time\"][\"pipeline_ref\"]\n",
+    "        ]\n",
+    "    ]\n",
+    ")\n",
+    "print(amount_policies)\n",
+    "print(time_policies)\n",
+    "\n",
+    "renamed = merged.copy()\n",
+    "renamed[\"Trigger Type\"] = renamed[\"type\"].apply(\n",
+    "    lambda x: f\"Amount    \\n[{amount_policies}]\" if x == \"amount\" else f\"Time [{time_policies} yrs.]\"\n",
+    ")\n",
+    "renamed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.65,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_arxiv_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.65,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_arxiv_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.8,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_arxiv_triggers_cost\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_multi_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_multi_pipeline.ipynb
new file mode 100644
index 000000000..f74306afc
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_multi_pipeline.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_ids = [273, 275, 278, 280, 282]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_eval_dfs = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    pipeline_log = pipeline_logs[pipeline_id]\n",
+    "    pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "    df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "    df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "        # subtracting would interfere with yearbook patching\n",
+    "        pipeline_log,\n",
+    "        df_all[\"sample_time\"].max(),\n",
+    "        pipeline_ref,\n",
+    "    )\n",
+    "    all_eval_dfs.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(all_eval_dfs)\n",
+    "\n",
+    "# Keep only the evaluations selected in the configuration cell. The explicit copy\n",
+    "# makes the result an independent frame, so assigning the scaled column below\n",
+    "# does not write into a slice of the concatenated frame (SettingWithCopyWarning).\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "].copy()\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name transformer\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def name_transformer(name):\n",
+    "    \"\"\"Shorten a time-trigger pipeline reference to a readable interval label.\n",
+    "\n",
+    "    The part of the name up to and including ``timetrigger_<N><w|y>`` is replaced\n",
+    "    by ``every <N> week(s)`` / ``every <N> year(s)``. Names that do not contain\n",
+    "    that pattern are returned unchanged.\n",
+    "    \"\"\"\n",
+    "    unit_names = {\"w\": \"week\", \"y\": \"year\"}\n",
+    "\n",
+    "    def _interval_label(match):\n",
+    "        count, unit = match.group(1), unit_names[match.group(2)]\n",
+    "        # Singular only for exactly one unit; a plain substring replace of\n",
+    "        # \"1 years\" would also mangle e.g. \"11 years\" into \"11 year\".\n",
+    "        return f\"every {count} {unit}\" + (\"\" if int(count) == 1 else \"s\")\n",
+    "\n",
+    "    return re.sub(r\".*timetrigger_(\\d+)([wy])\", _interval_label, name)\n",
+    "\n",
+    "\n",
+    "df_adjusted[\"pipeline_ref\"] = df_adjusted[\"pipeline_ref\"].apply(name_transformer)\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: pipeline references that have evaluations for the configured dataset.\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == dataset_id][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "# Rows flagged by `composite_model_variant` are duplicated under the sentinel\n",
+    "# model_idx -1, which the next cell uses to select the composite model. `assign`\n",
+    "# returns a new frame, so no column is written into a slice of `df_adjusted`.\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].assign(model_idx=-1)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only composite models\n",
+    "reduced = df_adjusted[df_adjusted[\"model_idx\"] == -1].copy()\n",
+    "reduced"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tick dates on the x axis, each paired with the label shown at that tick.\n",
+    "x_tick_labels = {\n",
+    "    \"2014-05-01\": \"May 2014\",\n",
+    "    \"2018-06-01\": \"Jun 2018\",\n",
+    "    \"2021-01-01\": \"Jan 2021\",\n",
+    "}\n",
+    "x_tick_positions = [mdates.date2num(pd.Timestamp(day)) for day in x_tick_labels]\n",
+    "\n",
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"pipeline_ref\",\n",
+    "    style=\"pipeline_ref\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.75,\n",
+    "    legend_label=\"TimeTrigger Pipeline\",\n",
+    "    small_legend_fonts=True,\n",
+    "    x_date_locator=FixedLocator(x_tick_positions),\n",
+    "    x_date_formatter=FixedFormatter(list(x_tick_labels.values())),\n",
+    "    xlim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    ylim=(-30, 90),\n",
+    "    y_ticks=[0, 20, 40, 60, 80],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    markers=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_hp_composite_models_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_single_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_single_pipeline.ipynb
new file mode 100644
index 000000000..b74bd2092
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/hp_accuracy_over_time_single_pipeline.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 275\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "# Keep only the evaluations selected in the configuration cell. The explicit copy\n",
+    "# makes the result an independent frame, so assigning the scaled column below\n",
+    "# does not write into a slice of the evaluation frame (SettingWithCopyWarning).\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "].copy()\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: pipeline references that have evaluations for the configured dataset.\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == dataset_id][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "# Rows flagged by `composite_model_variant` are duplicated under the sentinel\n",
+    "# model_idx -1, which later cells treat as the composite model. `assign`\n",
+    "# returns a new frame, so no column is written into a slice of `df_adjusted`.\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].assign(model_idx=-1)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_model = 1\n",
+    "max_model = df_adjusted[\"model_idx\"].max()\n",
+    "threshold_early_models = 4\n",
+    "thresholds_middle_models = (6, 15)\n",
+    "threshold_late_models = 16\n",
+    "\n",
+    "reduced = df_adjusted.copy()\n",
+    "reduced[\"early_model\"] = (reduced[\"model_idx\"] <= threshold_early_models) & (reduced[\"model_idx\"] > 0)\n",
+    "reduced[\"middle_model\"] = (reduced[\"model_idx\"] <= thresholds_middle_models[1]) & (\n",
+    "    reduced[\"model_idx\"] >= thresholds_middle_models[0]\n",
+    ")\n",
+    "reduced[\"late_model\"] = reduced[\"model_idx\"] >= threshold_late_models\n",
+    "reduced[\"composite_model\"] = reduced[\"model_idx\"] == -1\n",
+    "reduced = reduced[reduced[\"early_model\"] | reduced[\"middle_model\"] | reduced[\"late_model\"] | reduced[\"composite_model\"]]\n",
+    "# assert len(reduced[reduced[\"early_model\"]][\"model_idx\"].unique()) == 10\n",
+    "# assert len(reduced[reduced[\"late_model\"]][\"model_idx\"].unique()) == 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Legend label per model index: composite (negative), early, middle or late group;\n",
+    "# indices outside all groups are labelled \"hide\".\n",
+    "def mapper(x):\n",
+    "    \"\"\"Return the model-group label for model index ``x``.\"\"\"\n",
+    "    if x < 0:\n",
+    "        label = \"Cur. Active Composite Model\"\n",
+    "    elif x <= threshold_early_models:\n",
+    "        label = f\"Early Models (1 - {threshold_early_models})\"\n",
+    "    elif thresholds_middle_models[0] <= x <= thresholds_middle_models[1]:\n",
+    "        label = f\"Middle Models ({thresholds_middle_models[0]} - {thresholds_middle_models[1]})\"\n",
+    "    elif x >= threshold_late_models:\n",
+    "        label = f\"Late Models ({threshold_late_models} - {max_model})\"\n",
+    "    else:\n",
+    "        label = \"hide\"\n",
+    "    return label\n",
+    "\n",
+    "\n",
+    "reduced[\"model_type\"] = reduced[\"model_idx\"].apply(mapper)\n",
+    "reduced[\"model_type\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tick dates on the x axis, each paired with the label shown at that tick.\n",
+    "x_tick_labels = {\n",
+    "    \"2014-05-01\": \"May 2014\",\n",
+    "    \"2018-06-01\": \"Jun 2018\",\n",
+    "    \"2021-01-01\": \"Jan 2021\",\n",
+    "}\n",
+    "x_tick_positions = [mdates.date2num(pd.Timestamp(day)) for day in x_tick_labels]\n",
+    "\n",
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"model_type\",\n",
+    "    style=\"model_type\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.65,\n",
+    "    legend_label=\"Model Type\",\n",
+    "    small_legend_fonts=True,\n",
+    "    x_date_locator=FixedLocator(x_tick_positions),\n",
+    "    x_date_formatter=FixedFormatter(list(x_tick_labels.values())),\n",
+    "    xlim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    ylim=(-50, 90),\n",
+    "    y_ticks=[10, 40, 70],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    markers=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_hp_13w_begin_vs_start_vs_composite\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/hp_cost.ipynb b/analytics/plotting/rh_thesis/triggers_simple/hp_cost.ipynb
new file mode 100644
index 000000000..89f40d441
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/hp_cost.ipynb
@@ -0,0 +1,246 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "# Collect the pipelines and their logs from every directory into the two dicts above.\n",
+    "for pipelines_dir in pipelines_dirs:\n",
+    "    # Check the directory before reading from it so a wrong path fails with a clear message.\n",
+    "    assert pipelines_dir.exists(), f\"pipeline directory not found: {pipelines_dir}\"\n",
+    "    dir_pipelines = list_pipelines(pipelines_dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [275, 278, 280] + [276, 745, 279]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"]\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pipelines passed to the plot, each with the title used for it.\n",
+    "time_trigger_titles = {\n",
+    "    280: \"TimeTrigger 4 years\",\n",
+    "    278: \"TimeTrigger 1 year\",\n",
+    "    275: \"TimeTrigger 13 weeks\",\n",
+    "}\n",
+    "# Tick dates on the x axis, each paired with the two-line label shown at that tick.\n",
+    "x_tick_labels = {\n",
+    "    \"2014-05-01\": \"May\\n2014\",\n",
+    "    \"2018-06-01\": \"Jun\\n2018\",\n",
+    "    \"2021-01-01\": \"Jan\\n2021\",\n",
+    "}\n",
+    "\n",
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    list(time_trigger_titles),\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map=time_trigger_titles,\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(day)) for day in x_tick_labels]),\n",
+    "    x_date_formatter=FixedFormatter(list(x_tick_labels.values())),\n",
+    "    x_lim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    y_ticks_cumulative=list(range(0, 110, 25)),\n",
+    "    y_lim_cumulative=(0, 100),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"huffpost_time-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pipelines passed to the plot, each with the title used for it.\n",
+    "amount_trigger_titles = {\n",
+    "    279: \"AmountTrigger 80k samples\",\n",
+    "    745: \"AmountTrigger 15k samples\",\n",
+    "    276: \"AmountTrigger 5k samples\",\n",
+    "}\n",
+    "# Tick dates on the x axis, each paired with the two-line label shown at that tick.\n",
+    "x_tick_labels = {\n",
+    "    \"2014-05-01\": \"May\\n2014\",\n",
+    "    \"2018-06-01\": \"Jun\\n2018\",\n",
+    "    \"2021-01-01\": \"Jan\\n2021\",\n",
+    "}\n",
+    "\n",
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    list(amount_trigger_titles),\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map=amount_trigger_titles,\n",
+    "    height_factor=1.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (min)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(day)) for day in x_tick_labels]),\n",
+    "    x_date_formatter=FixedFormatter(list(x_tick_labels.values())),\n",
+    "    x_lim=(pd.Timestamp(\"2012-01-01\"), pd.Timestamp(\"2022-09-01\")),\n",
+    "    y_ticks_cumulative=list(range(0, 110, 25)),\n",
+    "    y_lim_cumulative=(0, 100),\n",
+    "    y_minutes=True,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"huffpost_amount-trigger-cost-matrix\")\n",
+    "# not interesting: note that for 250 samples we see multiple triggers at the same timestamp"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/hp_cost_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/triggers_simple/hp_cost_perf_tradeoff.ipynb
new file mode 100644
index 000000000..3a2b9325f
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/hp_cost_perf_tradeoff.ipynb
@@ -0,0 +1,453 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "# Collect the pipelines and their logs from every directory into the two dicts above.\n",
+    "for pipelines_dir in pipelines_dirs:\n",
+    "    # Check the directory before reading from it so a wrong path fails with a clear message.\n",
+    "    assert pipelines_dir.exists(), f\"pipeline directory not found: {pipelines_dir}\"\n",
+    "    dir_pipelines = list_pipelines(pipelines_dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pipeline_name_mapper(name: str) -> str:\n",
+    "    \"\"\"Turn a raw pipeline name into the short label used in the plots.\n",
+    "\n",
+    "    Known dataset/trigger prefixes are stripped first. A trailing ``y`` or ``w`` is\n",
+    "    then spelled out as years/weeks, names ending in ``d`` get their underscores\n",
+    "    replaced by slashes, and every other name is labelled as a sample count.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def _spell_out(stem: str, unit: str) -> str:\n",
+    "        # Use the whole trailing number to pick singular/plural; looking only at the\n",
+    "        # last digit would turn e.g. \"11y\" into \"11 year\".\n",
+    "        digits = stem[len(stem.rstrip(\"0123456789\")) :]\n",
+    "        plural = \"\" if digits and int(digits) == 1 else \"s\"\n",
+    "        return f\"{stem} {unit}{plural}\"\n",
+    "\n",
+    "    for prefix in (\"yearbook_\", \"timetrigger_\", \"amounttrigger_\", \"mmdalibi_\"):\n",
+    "        name = name.replace(prefix, \"\")\n",
+    "\n",
+    "    if name.endswith(\"y\"):\n",
+    "        return _spell_out(name[:-1], \"year\")\n",
+    "    if name.endswith(\"w\"):\n",
+    "        return _spell_out(name[:-1], \"week\")\n",
+    "    if name.endswith(\"d\"):  # drift\n",
+    "        return name.replace(\"_\", \"/\")\n",
+    "    return name.replace(\"dataamount_\", \"\") + \" samples\"  # dataamount\n",
+    "\n",
+    "\n",
+    "# Short names (as produced by `pipeline_name_mapper`) of the pipelines to keep.\n",
+    "selected_pipeline_names = {\n",
+    "    \"1 year\",\n",
+    "    \"13 weeks\",\n",
+    "    \"2 years\",\n",
+    "    \"26 weeks\",\n",
+    "    # \"4 years\",\n",
+    "    \"10000 samples\",\n",
+    "    \"15000 samples\",\n",
+    "    \"20000 samples\",\n",
+    "    # \"30000 samples\",\n",
+    "    \"40000 samples\",\n",
+    "    \"5000 samples\",\n",
+    "    # \"dataamount_80000 samples\",\n",
+    "}\n",
+    "\n",
+    "# Map each raw name once, then keep only the selected pipelines under their short name.\n",
+    "short_names = {p_id: pipeline_name_mapper(pname) for p_id, (pname, _) in pipelines.items()}\n",
+    "pipelines = {\n",
+    "    p_id: (short_names[p_id], p_path)\n",
+    "    for p_id, (_, p_path) in pipelines.items()\n",
+    "    if short_names[p_id] in selected_pipeline_names\n",
+    "}\n",
+    "pipeline_ids = list(pipelines)\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items()]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the evaluations selected in the configuration cell. The explicit copy\n",
+    "# makes the result an independent frame, so assigning the scaled column below\n",
+    "# does not write into a slice of the concatenated frame (SettingWithCopyWarning).\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "].copy()\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"2014-06-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find number of triggers per pipeline that are after maximum_min\n",
+    "\n",
+    "# before the cutoff there was one trigger (equivalent to start of our reduced dataset): +1\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\")\n",
+    "assert mean_accuracies.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x or \"week\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged = merged.sort_values(by=[\"count\"])\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extract number of samples from pipeline name\n",
+    "amount_policies = \"/\".join(\n",
+    "    [\n",
+    "        f\"{int(x) // 1000}k\"\n",
+    "        for x in [str(y).replace(\" samples\", \"\") for y in merged[merged[\"type\"] == \"amount\"][\"pipeline_ref\"]]\n",
+    "    ]\n",
+    ")\n",
+    "time_policies = \"/\".join(\n",
+    "    [\n",
+    "        str(x)\n",
+    "        for x in [\n",
+    "            str(y).replace(\" weeks\", \"w\").replace(\" years\", \"y\").replace(\" year\", \"y\")\n",
+    "            for y in merged[merged[\"type\"] == \"time\"][\"pipeline_ref\"]\n",
+    "        ]\n",
+    "    ]\n",
+    ")\n",
+    "print(amount_policies)\n",
+    "print(time_policies)\n",
+    "\n",
+    "renamed = merged.copy()\n",
+    "renamed[\"Trigger Type\"] = renamed[\"type\"].apply(\n",
+    "    lambda x: f\"Amount    \\n[{amount_policies}]\" if x == \"amount\" else f\"Time [{time_policies} yrs.]\"\n",
+    ")\n",
+    "renamed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.7,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_huffpost_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.7,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_huffpost_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.8,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_huffpost_triggers_cost\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_multi_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_multi_pipeline.ipynb
new file mode 100644
index 000000000..dd338dab1
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_multi_pipeline.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_ids = [20, 24, 26, 33]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_eval_dfs = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    pipeline_log = pipeline_logs[pipeline_id]\n",
+    "    pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "    df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "    df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "        # subtracting would interfere with yearbook patching\n",
+    "        pipeline_log,\n",
+    "        df_all[\"sample_time\"].max(),\n",
+    "        pipeline_ref,\n",
+    "    )\n",
+    "    all_eval_dfs.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(all_eval_dfs)\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name transformer\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def name_transformer(name):\n",
+    "    # rewrite e.g. yearbook_timetrigger_10y as: trigger every 10 years\n",
+    "    return re.sub(r\".*yearbook_timetriggers?_(\\d+)y\", r\"trigger every \\1 years\", name).replace(\"1 years\", \"1 year\")\n",
+    "\n",
+    "\n",
+    "df_adjusted[\"pipeline_ref\"] = df_adjusted[\"pipeline_ref\"].apply(name_transformer)\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_adjusted[\"dataset_id\"].unique()\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == \"yearbook-test\"][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = -1  # \"00-pipeline-composite-model\"\n",
+    "# number_digits = len(str(df_adjusted[\"model_idx\"].max()))\n",
+    "# df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"].astype(str).str.zfill(number_digits)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])\n",
+    "\n",
+    "# df_composite = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "# df_composite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only composite models\n",
+    "reduced = df_adjusted[df_adjusted[\"model_idx\"] == -1].copy()\n",
+    "reduced"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"pipeline_ref\",\n",
+    "    style=\"pipeline_ref\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.75,\n",
+    "    legend_label=\"TimeTrigger Pipeline\",\n",
+    "    small_legend_fonts=True,\n",
+    "    # x_date_locator=mdates.YearLocator(20),\n",
+    "    # x_date_formatter=mdates.DateFormatter(\"%Y\"),  # %b\\n\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(f\"{y}-01-01\")) for y in [1940, 1970, 2000]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [1940, 1970, 2000]]),\n",
+    "    y_ticks=[50, 70, 90],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    ylim=(45, 103),\n",
+    "    markers=False,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_yb_composite_models_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_single_pipeline.ipynb b/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_single_pipeline.ipynb
new file mode 100644
index 000000000..3742df904
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/yb_accuracy_over_time_single_pipeline.ipynb
@@ -0,0 +1,282 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.dates as mdates\n",
+    "import pandas as pd\n",
+    "from matplotlib.ticker import FixedFormatter, FixedLocator\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "from analytics.plotting.common.metric_over_time import plot_metric_over_time\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dir = Path(\n",
+    "    \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"\n",
+    ")\n",
+    "assert pipelines_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 33\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_adjusted[\"dataset_id\"].unique()\n",
+    "df_adjusted[df_adjusted[\"dataset_id\"] == \"yearbook-test\"][\"pipeline_ref\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = -1  # \"00-pipeline-composite-model\"\n",
+    "# number_digits = len(str(df_adjusted[\"model_idx\"].max()))\n",
+    "# df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"].astype(str).str.zfill(number_digits)\n",
+    "df_adjusted = pd.concat([df_adjusted, pipeline_composite_model])\n",
+    "\n",
+    "# df_composite = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "# df_composite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_model = 1\n",
+    "max_model = df_adjusted[\"model_idx\"].max()\n",
+    "threshold_early_models = 39\n",
+    "threshold_late_models = 60\n",
+    "\n",
+    "reduced = df_adjusted.copy()\n",
+    "reduced[\"early_model\"] = (reduced[\"model_idx\"] <= threshold_early_models) & (reduced[\"model_idx\"] > 5)\n",
+    "reduced[\"late_model\"] = reduced[\"model_idx\"] >= threshold_late_models\n",
+    "reduced[\"composite_model\"] = reduced[\"model_idx\"] == -1\n",
+    "reduced = reduced[reduced[\"early_model\"] | reduced[\"late_model\"] | reduced[\"composite_model\"]]\n",
+    "# assert len(reduced[reduced[\"early_model\"]][\"model_idx\"].unique()) == 10\n",
+    "# assert len(reduced[reduced[\"late_model\"]][\"model_idx\"].unique()) == 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# if early_model --> \"early model\", if late_model --> \"late model\", else \"composite model\"\n",
+    "def mapper(x):\n",
+    "    if x < 0:\n",
+    "        return \"Cur. Active\\nComposite Model\"\n",
+    "    if x <= 5:\n",
+    "        return \"not converged model\"\n",
+    "    if x <= threshold_early_models:\n",
+    "        return f\"Early Models (6 - {threshold_early_models})\"\n",
+    "    if x >= threshold_late_models:\n",
+    "        return f\"Late Models ({threshold_late_models} - {max_model})\"\n",
+    "    return \"ERROR\"\n",
+    "\n",
+    "\n",
+    "reduced[\"model_type\"] = reduced[\"model_idx\"].apply(mapper)\n",
+    "reduced[\"model_type\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_metric_over_time(\n",
+    "    reduced,\n",
+    "    x=\"interval_center\",\n",
+    "    y=\"value\",\n",
+    "    hue=\"model_type\",\n",
+    "    style=\"model_type\",\n",
+    "    width_factor=0.85,\n",
+    "    height_factor=0.65,\n",
+    "    legend_label=\"Model Type\",\n",
+    "    small_legend_fonts=True,\n",
+    "    # x_date_locator=mdates.YearLocator(20),\n",
+    "    # x_date_formatter=mdates.DateFormatter(\"%Y\"),  # %b\\n\n",
+    "    x_date_locator=FixedLocator([mdates.date2num(pd.Timestamp(f\"{y}-01-01\")) for y in [1940, 1970, 2000]]),\n",
+    "    x_date_formatter=FixedFormatter([str(year) for year in [1940, 1970, 2000]]),\n",
+    "    y_ticks=[40, 60, 80],\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    y_label=\"Accuracy (%)\",\n",
+    "    markers=False,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"simple_yb_1y_begin_vs_start_vs_composite\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/yb_cost.ipynb b/analytics/plotting/rh_thesis/triggers_simple/yb_cost.ipynb
new file mode 100644
index 000000000..34f9c4417
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/yb_cost.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [23, 26, 33] + [21, 32, 36]\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")\n",
+    "\n",
+    "# Yearbook has a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
+    "if patch_yearbook:\n",
+    "    patch_yearbook_time(df_adjusted, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [23, 26, 33],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        23: \"TimeTrigger 25 years\",\n",
+    "        26: \"TimeTrigger 5 years\",\n",
+    "        33: \"TimeTrigger 1 years\",\n",
+    "    },\n",
+    "    height_factor=1.6,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (sec)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
+    "    y_ticks_cumulative=[x for x in range(0, 60 + 1, 20)],\n",
+    "    y_lim_cumulative=(0, 70),\n",
+    "    y_minutes=False,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"yearbook_time-trigger-cost-matrix\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [36, 32, 21],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        36: \"AmountTrigger 5000 samples\",\n",
+    "        32: \"AmountTrigger 1000 samples\",\n",
+    "        21: \"AmountTrigger 250 samples\",\n",
+    "    },\n",
+    "    height_factor=1.6,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (sec)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
+    "    y_ticks_cumulative=[x for x in range(0, 60 + 1, 20)],\n",
+    "    y_lim_cumulative=(0, 70),\n",
+    "    y_minutes=False,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"yearbook_amount-trigger-cost-matrix\")\n",
+    "# not interesting: note that for 250 samples we see multiple triggers at the same timestamp"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/triggers_simple/yb_cost_perf_tradeoff.ipynb b/analytics/plotting/rh_thesis/triggers_simple/yb_cost_perf_tradeoff.ipynb
new file mode 100644
index 000000000..6613ffcb2
--- /dev/null
+++ b/analytics/plotting/rh_thesis/triggers_simple/yb_cost_perf_tradeoff.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import (\n",
+    "    df_aggregate_eval_metric,\n",
+    "    dfs_models_and_evals,\n",
+    "    patch_yearbook_time,\n",
+    "    pipeline_leaf_times_df,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"),\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/11_baselines_amount\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# name of the mask column used below to reduce the evaluations to the composite model\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pipeline_name_mapper(name: str) -> str:\n",
+    "    name = name.replace(\"yearbook_\", \"\")\n",
+    "    name = name.replace(\"timetrigger_\", \"\")  # \"every \"\n",
+    "    name = name.replace(\"amounttrigger_\", \"\")  # \"every \"\n",
+    "    name = name.replace(\"mmdalibi_\", \"\")\n",
+    "    if name.endswith(\"y\"):\n",
+    "        name = name[:-1] + (\" years\" if not name.endswith(\"1y\") else \" year\")\n",
+    "    elif not name.endswith(\"d\"):  # dataamount\n",
+    "        name = name + \" samples\"\n",
+    "    else:  # drift\n",
+    "        name = name.replace(\"_\", \"/\")\n",
+    "    return name\n",
+    "\n",
+    "\n",
+    "pipelines = {\n",
+    "    p_id: (pipeline_name_mapper(pname), p_path)\n",
+    "    for p_id, (pname, p_path) in pipelines.items()\n",
+    "    if pipeline_name_mapper(pname)\n",
+    "    in [\n",
+    "        # '40 years',\n",
+    "        # '25 years',\n",
+    "        \"15 years\",\n",
+    "        # '10 years',\n",
+    "        \"5 years\",\n",
+    "        \"4 years\",\n",
+    "        \"3 years\",\n",
+    "        \"2 years\",\n",
+    "        \"1 year\",\n",
+    "        \"dataamount_250 samples\",\n",
+    "        \"dataamount_500 samples\",\n",
+    "        \"dataamount_1000 samples\",\n",
+    "        \"dataamount_2500 samples\",\n",
+    "        \"dataamount_5000 samples\",\n",
+    "        # 'dataamount_10000 samples',\n",
+    "        # 'dataamount_15000 samples',\n",
+    "        # 'dataamount_30000 samples'\n",
+    "    ]\n",
+    "    and p_id < 100  # don't include duplicates\n",
+    "}\n",
+    "pipeline_ids = list(pipelines.keys())\n",
+    "\n",
+    "[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_df_eval_single: list[pd.DataFrame] = []\n",
+    "df_leaf_list: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_single[\"pipeline_id\"] = pipeline_id\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "    _, _, df_eval_single = dfs_models_and_evals(\n",
+    "        pipeline_logs[pipeline_id], df_leaf_single[\"sample_time\"].max(), pipelines[pipeline_id][0]\n",
+    "    )\n",
+    "    df_eval_single[\"pipeline_id\"] = pipeline_id\n",
+    "    list_df_eval_single.append(df_eval_single)\n",
+    "\n",
+    "df_adjusted = pd.concat(list_df_eval_single)\n",
+    "df_adjusted\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)\n",
+    "df_leaf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_leaf[\"id\"].unique())\n",
+    "assert set(df_leaf[\"id\"].unique()) == {\n",
+    "    \"TRAIN\",\n",
+    "    \"INIT_CLUSTER_CONNECTION\",\n",
+    "    \"EVALUATE_TRIGGER_POLICY\",\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "    \"TRAINING_COMPLETED\",\n",
+    "    \"STORE_TRAINED_MODEL\",\n",
+    "    \"EVALUATE\",\n",
+    "    \"DONE\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    patch_yearbook_time(df_leaf, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reduce to composite models\n",
+    "df_adjusted = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "df_adjusted[composite_model_variant].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reduce evaluation interval to interval where all policies have evaluations\n",
+    "min_active_eval_center_per_pipeline = (\n",
+    "    df_adjusted[df_adjusted[composite_model_variant]].groupby(\"pipeline_ref\")[\"interval_center\"].min()\n",
+    ")\n",
+    "maximum_min = min_active_eval_center_per_pipeline.max()\n",
+    "print(maximum_min, min_active_eval_center_per_pipeline)\n",
+    "\n",
+    "assert maximum_min < pd.Timestamp(\"1950-01-01\")\n",
+    "\n",
+    "df_adjusted = df_adjusted[df_adjusted[\"interval_center\"] >= maximum_min]\n",
+    "df_adjusted[\"interval_center\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate metrics to a scalar value per pipeline\n",
+    "mean_accuracies = df_aggregate_eval_metric(\n",
+    "    df_adjusted,\n",
+    "    group_by=[\"pipeline_id\", \"pipeline_ref\", \"metric\"],\n",
+    "    in_col=\"value\",\n",
+    "    out_col=\"metric_value\",\n",
+    "    aggregate_func=\"mean\",\n",
+    ")\n",
+    "mean_accuracies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf[\"id\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_triggers = df_leaf[df_leaf[\"id\"] == PipelineStage.TRAIN.name]\n",
+    "df_triggers = df_triggers[df_triggers[\"sample_time\"] > maximum_min]\n",
+    "df_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find the number of triggers per pipeline that occur after maximum_min\n",
+    "\n",
+    "# there was one trigger before the cutoff (equivalent to the start of our reduced dataset), hence +1\n",
+    "num_triggers = df_triggers.groupby(\"pipeline_id\").aggregate(count=(\"id\", \"count\"), sum_duration=(\"duration\", \"sum\"))\n",
+    "num_triggers[\"count\"] += 1\n",
+    "num_triggers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = num_triggers.merge(mean_accuracies, on=\"pipeline_id\")\n",
+    "assert mean_accuracies.shape[0] == merged.shape[0]\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_type(x: str):\n",
+    "    if \"year\" in x:\n",
+    "        return \"time\"\n",
+    "    elif \"samples\" in x:\n",
+    "        return \"amount\"\n",
+    "    elif \"d\" in x:\n",
+    "        return \"drift\"\n",
+    "    else:\n",
+    "        return \"unknown\"\n",
+    "\n",
+    "\n",
+    "merged[\"type\"] = merged[\"pipeline_ref\"].apply(lambda x: create_type(x))\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extract number of samples from pipeline name\n",
+    "amount_policies = \"/\".join(\n",
+    "    [\n",
+    "        str(x)\n",
+    "        for x in list(\n",
+    "            sorted(\n",
+    "                merged[merged[\"type\"] == \"amount\"][\"pipeline_ref\"].str.extract(r\"(\\d+) .*\").astype(int)[0], reverse=True\n",
+    "            )\n",
+    "        )\n",
+    "    ]\n",
+    ")\n",
+    "time_policies = \"/\".join(\n",
+    "    [\n",
+    "        str(x)\n",
+    "        for x in list(\n",
+    "            sorted(\n",
+    "                merged[merged[\"type\"] == \"time\"][\"pipeline_ref\"].str.extract(r\"(\\d+) .*\").astype(int)[0], reverse=True\n",
+    "            )\n",
+    "        )\n",
+    "    ]\n",
+    ")\n",
+    "print(amount_policies)\n",
+    "print(time_policies)\n",
+    "\n",
+    "renamed = merged.copy()\n",
+    "renamed[\"Trigger Type\"] = renamed[\"type\"].apply(\n",
+    "    lambda x: f\"Amount    \\n[{amount_policies} s.]\" if x == \"amount\" else f\"Time    [{time_policies} yrs.]\"\n",
+    ")\n",
+    "renamed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.65,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_yearbook_triggers_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_minutes = renamed.copy()\n",
+    "in_minutes[\"sum_duration\"] = in_minutes[\"sum_duration\"] / 60\n",
+    "\n",
+    "fig = plot_tradeoff_scatter(\n",
+    "    in_minutes,\n",
+    "    x=\"sum_duration\",\n",
+    "    y=\"metric_value\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Total Cost (Minutes)\",\n",
+    "    y_label=\"Mean Accuracy %\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.65,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_yearbook_cost_performance\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_tradeoff_scatter(\n",
+    "    renamed,\n",
+    "    x=\"count\",\n",
+    "    y=\"sum_duration\",\n",
+    "    hue=\"Trigger Type\",\n",
+    "    style=\"Trigger Type\",\n",
+    "    x_label=\"Number of Triggers\",\n",
+    "    y_label=\"Total Cost (seconds)\",\n",
+    "    height_factor=0.6,\n",
+    "    width_factor=0.8,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"tradeoff_simple_yearbook_triggers_cost\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/yb_analytics.ipynb b/analytics/plotting/rh_thesis/yb_analytics.ipynb
new file mode 100644
index 000000000..c53e33e62
--- /dev/null
+++ b/analytics/plotting/rh_thesis/yb_analytics.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.plotting.common.dataset_histogram import (\n",
+    "    build_countplot,\n",
+    "    build_histogram_multicategory_barnorm,\n",
+    "    build_pieplot,\n",
+    ")\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use interactive plotly\n",
+    "interactive = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "yb_samples: list[(int, int)] = []\n",
+    "\"\"\"year, label\"\"\"\n",
+    "\n",
+    "for year in range(1930, 2014 + 1):\n",
+    "    file1 = Path(f\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/yearbook/all/{year}.bin\")\n",
+    "    file1_bytes = file1.read_bytes()\n",
+    "\n",
+    "    record_size = 12288 + 4  # 32 * 32 * 3 * 4 (width*height*channels*float32) + 4 (label)\n",
+    "    label_size = 4\n",
+    "\n",
+    "    num_items = len(file1_bytes) // record_size\n",
+    "\n",
+    "    images_bin_year = []\n",
+    "    for i in range(num_items):\n",
+    "        label = file1_bytes[i * record_size : i * record_size + label_size]\n",
+    "        yb_samples.append((year, int.from_bytes(label, byteorder=\"big\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "yb_df = pd.DataFrame(yb_samples, columns=[\"year\", \"label\"])\n",
+    "yb_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# polished\n",
+    "fig1 = build_countplot(\n",
+    "    yb_df,\n",
+    "    x=\"year\",\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_ticks_bins=3,\n",
+    "    height_factor=0.45,\n",
+    "    width_factor=0.48,\n",
+    "    x_label=\"Sample Time\",\n",
+    "    y_label=\"Num. Samples\",\n",
+    ")\n",
+    "\n",
+    "save_plot(fig1, \"yearbook_samples_over_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_map = {0: \"Male\", 1: \"Female\"}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sorted_categories = yb_df[\"label\"].value_counts().reset_index().sort_values(\"count\", ascending=False)\n",
+    "sorted_categories[\"ratio\"] = sorted_categories[\"count\"] / sorted_categories[\"count\"].sum()\n",
+    "sorted_categories[\"label\"] = sorted_categories[\"label\"].map(label_map)\n",
+    "sorted_categories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "yb_df[\"label\"] = yb_df[\"label\"].map(label_map)\n",
+    "yb_df[\"date\"] = pd.to_datetime(yb_df[\"year\"], format=\"%Y\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ratio = build_pieplot(\n",
+    "    x=list(sorted_categories[\"count\"]),\n",
+    "    labels=tuple(sorted_categories[\"label\"]),\n",
+    "    width_factor=0.4,\n",
+    "    height_factor=0.35,\n",
+    ")\n",
+    "save_plot(ratio, \"yearbook_samples_ratio\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# we want the legend to have a different sorting\n",
+    "\n",
+    "from analytics.plotting.common.color import main_color\n",
+    "\n",
+    "fig_ratio = build_histogram_multicategory_barnorm(\n",
+    "    yb_df,\n",
+    "    x=\"year\",\n",
+    "    label=\"label\",\n",
+    "    sorted_coloring_categories=sorted_categories[\"label\"],\n",
+    "    height_factor=0.45,\n",
+    "    width_factor=0.48,\n",
+    "    legend=False,\n",
+    "    legend_labels=[\"Male\", \"Female\"],\n",
+    "    # x_ticks=[pd.to_datetime(f\"{y}-01-01\") for y in list(range(1930, 2014, 10))],\n",
+    "    x_label=\"Sample Time\",\n",
+    "    y_label=\"Label Distribution   \",\n",
+    "    y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],\n",
+    "    legend_title=\"Article Category\",\n",
+    "    nbins=84,\n",
+    "    manual_color_map={\n",
+    "        \"Male\": main_color(0, light=True),\n",
+    "        \"Female\": main_color(1),\n",
+    "    },\n",
+    "    grid_opacity=0.5,\n",
+    "    col_alpha=1.0,\n",
+    ")\n",
+    "\n",
+    "# Add manual text labels into the plot\n",
+    "fig_ratio.text(0.28, 0.77, \"Male: 53.3% (19808)\", color=\"white\")\n",
+    "fig_ratio.text(0.28, 0.33, \"Female: 46.7% (17382)\", color=\"white\")\n",
+    "\n",
+    "save_plot(fig_ratio, \"yearbook_label_distribution\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/yb_triggering/arxiv_heatmap.ipynb b/analytics/plotting/rh_thesis/yb_triggering/arxiv_heatmap.ipynb
new file mode 100644
index 000000000..db936afc1
--- /dev/null
+++ b/analytics/plotting/rh_thesis/yb_triggering/arxiv_heatmap.ipynb
@@ -0,0 +1,293 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS\n",
+    "\n",
+    "drift_pipeline = False\n",
+    "if drift_pipeline:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"\n",
+    "    )\n",
+    "else:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/10_baselines_time\"\n",
+    "    )\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/plots/triggering\")\n",
+    "assert pipelines_dir.exists()\n",
+    "assert output_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(pipeline_logs[267 if not drift_pipeline else 267])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 267 if not drift_pipeline else 267\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"arxiv_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the different models stitched together\n",
+    "# depending on which model was active\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"]\n",
+    "# .astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"]  # .dt.year  # TODO\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"interval_center\", \"real_train_end\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[\n",
+    "        (2 * (x - 1995), str(x))\n",
+    "        for x in [2000, 2009, 2020]  # twice a year\n",
+    "    ],\n",
+    "    y_custom_ticks=[\n",
+    "        (2 * (x - 1995), str(x))\n",
+    "        for x in [2000, 2009, 2020]  # twice a year\n",
+    "    ],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"ArticleNet Performance\\nEvaluation Heatmap\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=0.5,\n",
+    "    height_factor=0.6,\n",
+    "    square=True,\n",
+    "    grid_alpha=0.55,\n",
+    ")\n",
+    "save_plot(fig, \"arxiv_trigger_heatmap_every_6_months\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(df_logs_models.iterrows())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/yb_triggering/hp_heatmap.ipynb b/analytics/plotting/rh_thesis/yb_triggering/hp_heatmap.ipynb
new file mode 100644
index 000000000..6dda87255
--- /dev/null
+++ b/analytics/plotting/rh_thesis/yb_triggering/hp_heatmap.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS\n",
+    "\n",
+    "drift_pipeline = False\n",
+    "if drift_pipeline:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"\n",
+    "    )\n",
+    "else:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/10_baselines_time\"\n",
+    "    )\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/plots/triggering\")\n",
+    "assert pipelines_dir.exists()\n",
+    "assert output_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(pipeline_logs[275 if not drift_pipeline else 275])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 275 if not drift_pipeline else 275\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"huffpost_kaggle_test\"\n",
+    "eval_handler = \"periodic-current\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the individual models stitched together\n",
+    "# according to which model was active at each point in time\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"]\n",
+    "# .astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"]  # .dt.year  # TODO\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged.groupby([\"interval_center\", \"real_train_end\"]).size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_custom_ticks=[(4 * (x - 2012) + 0.5, str(x)) for x in [2014, 2018, 2021]],\n",
+    "    y_custom_ticks=[(4 * (x - 2012), str(x)) for x in [2014, 2018, 2021]],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"ArticleNet Performance\\nEvaluation Heatmap\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=0.5,\n",
+    "    height_factor=0.6,\n",
+    "    square=True,\n",
+    "    grid_alpha=0.55,\n",
+    ")\n",
+    "save_plot(fig, \"hp_trigger_heatmap_quarterly\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(df_logs_models.iterrows())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/plotting/rh_thesis/yb_triggering/yb_heatmap.ipynb b/analytics/plotting/rh_thesis/yb_triggering/yb_heatmap.ipynb
new file mode 100644
index 000000000..d810fa86e
--- /dev/null
+++ b/analytics/plotting/rh_thesis/yb_triggering/yb_heatmap.ipynb
@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS\n",
+    "\n",
+    "drift_pipeline = False\n",
+    "if drift_pipeline:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"\n",
+    "    )\n",
+    "else:\n",
+    "    pipelines_dir = Path(\n",
+    "        \"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/10_baselines_time\"\n",
+    "    )\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/plots/triggering\")\n",
+    "assert pipelines_dir.exists()\n",
+    "assert output_dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines.keys())\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for (p_id, (_, p_path)) in pipelines.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(pipeline_logs[33 if not drift_pipeline else 33])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode:\n",
+    "pipeline_id = 33 if not drift_pipeline else 33\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_log = pipeline_logs[pipeline_id]\n",
+    "pipeline_ref = f\"{pipeline_id}\".zfill(len(str(max_pipeline_id))) + f\" - {pipelines[pipeline_id][0]}\"\n",
+    "\n",
+    "df_all = logs_dataframe(pipeline_log, pipeline_ref)\n",
+    "\n",
+    "df_logs_models, _, df_eval_single = dfs_models_and_evals(\n",
+    "    # subtracting would interfere with yearbook patching\n",
+    "    pipeline_log,\n",
+    "    df_all[\"sample_time\"].max(),\n",
+    "    pipeline_ref,\n",
+    ")\n",
+    "\n",
+    "df_adjusted = df_eval_single\n",
+    "\n",
+    "\n",
+    "df_adjusted = df_adjusted[\n",
+    "    (df_adjusted[\"dataset_id\"] == dataset_id)\n",
+    "    & (df_adjusted[\"eval_handler\"] == eval_handler)\n",
+    "    & (df_adjusted[\"metric\"] == metric)\n",
+    "]\n",
+    "\n",
+    "# in percent (0-100)\n",
+    "df_adjusted[\"value\"] = df_adjusted[\"value\"] * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if patch_yearbook:\n",
+    "    for column in [\"interval_start\", \"interval_center\", \"interval_end\"]:\n",
+    "        patch_yearbook_time(df_adjusted, column)\n",
+    "    for column in [\"train_start\", \"train_end\", \"real_train_end\", \"usage_start\", \"usage_end\"]:\n",
+    "        patch_yearbook_time(df_logs_models, column)\n",
+    "\n",
+    "    # correction for -1 second in timestamp format before patching\n",
+    "    df_logs_models[\"usage_end\"] = (\n",
+    "        df_logs_models[\"usage_end\"].dt.to_period(\"M\") + 1\n",
+    "    ).dt.to_timestamp()  # december (because of -1 second in timestamp format) -> start of year\n",
+    "\n",
+    "df_logs_models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_adjusted.sort_values(by=[\"interval_center\"])\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add composite model\n",
+    "\n",
+    "assert df_adjusted[\"pipeline_ref\"].nunique() <= 1\n",
+    "# add the pipeline time series, i.e. the performance of the individual models stitched together\n",
+    "# according to which model was active at each point in time\n",
+    "pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]]\n",
+    "pipeline_composite_model[\"model_idx\"] = 0\n",
+    "pipeline_composite_model[\"id_model\"] = 0\n",
+    "\n",
+    "label_map = {k: f\"{k}\" for k, v in df_adjusted[[\"model_idx\", \"id_model\"]].values}\n",
+    "label_map[0] = \"Pipeline composite model\"\n",
+    "\n",
+    "if include_composite_model:\n",
+    "    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])\n",
+    "else:\n",
+    "    df_adjusted[\"model_idx\"] = df_adjusted[\"model_idx\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted[\"interval_center\"] = df_adjusted[\"interval_center\"].astype(str).str.split(\"-\").str[0]\n",
+    "df_adjusted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_end_years_per_model = df_logs_models[[\"model_idx\", \"real_train_end\"]]\n",
+    "df_train_end_years_per_model[\"real_train_end\"] = df_train_end_years_per_model[\"real_train_end\"].dt.year\n",
+    "df_train_end_years_per_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_merged = df_adjusted.merge(df_train_end_years_per_model, on=\"model_idx\", how=\"left\")\n",
+    "df_merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build heatmap matrix dataframe:\n",
+    "heatmap_data = df_merged.pivot(index=[\"real_train_end\"], columns=\"interval_center\", values=\"value\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "heatmap_data.index.min(), heatmap_data.index.max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.heatmap import build_heatmap\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "\n",
+    "fig = build_heatmap(\n",
+    "    heatmap_data,\n",
+    "    reverse_col=True,\n",
+    "    x_ticks=[1950, 1975, 2000],\n",
+    "    y_ticks=[1950, 1975, 2000],\n",
+    "    y_label=\"Trained up to\",\n",
+    "    x_label=\"Evaluation Year\",\n",
+    "    title_label=\"YearbookNet Performance\\nEvaluation Heatmap\",\n",
+    "    color_label=\"Accuracy %\",\n",
+    "    width_factor=0.5,\n",
+    "    height_factor=0.6,\n",
+    "    square=True,\n",
+    "    grid_alpha=0.4,\n",
+    ")\n",
+    "save_plot(fig, \"yb_trigger_heatmap_yearly\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(df_logs_models.iterrows())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analytics/tools/patch_logfile.ipynb b/analytics/tools/patch_logfile.ipynb
index 1e5d25e60..72c6746c9 100644
--- a/analytics/tools/patch_logfile.ipynb
+++ b/analytics/tools/patch_logfile.ipynb
@@ -49,9 +49,7 @@
    "source": [
     "# VARIABLES\n",
     "\n",
-    "pipeline_logfile = Path(\n",
-    "    \"/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/cglm-landmark/data_selection/logs_agg_patch_currently_trained/pipeline_24/pipeline.log\"\n",
-    ")"
+    "pipeline_logfile = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/test/pipeline_8/pipeline.log\")"
    ]
   },
   {
@@ -122,6 +120,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from modyn.utils.utils import SECONDS_PER_UNIT\n",
+    "\n",
+    "offset = SECONDS_PER_UNIT[\"w\"] * 25 * 2\n",
     "for eval_log in logs.supervisor_logs.stage_runs:\n",
     "    if eval_log.id == PipelineStage.EVALUATE_MULTI.name:\n",
     "        # For a fixed interval the evaluation request of a certain model is the most recent, if the model training\n",
@@ -136,14 +137,44 @@
     "            model_row = df_models[df_models[\"id_model\"] == eval_request.id_model]\n",
     "            assert len(model_row) == 1\n",
     "\n",
-    "            training_center = (\n",
-    "                model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp()\n",
-    "                + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()\n",
-    "            ) / 2\n",
-    "            eval_request.currently_active_model = eval_request.currently_trained_model\n",
-    "            eval_request.currently_trained_model = (\n",
-    "                eval_request.interval_start <= training_center <= eval_request.interval_end\n",
-    "            )"
+    "            evaluation_center = (eval_request.interval_start + eval_request.interval_end) / 2\n",
+    "\n",
+    "            eval_request.currently_active_model = (\n",
+    "                evaluation_center - offset\n",
+    "                <= model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()\n",
+    "                <= evaluation_center\n",
+    "            )\n",
+    "\n",
+    "            # training_center = (\n",
+    "            #     model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp()\n",
+    "            #     + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()\n",
+    "            # ) / 2\n",
+    "            # eval_request.currently_active_model = eval_request.currently_trained_model\n",
+    "            # eval_request.currently_trained_model = (\n",
+    "            #     eval_request.interval_start <= training_center <= eval_request.interval_end\n",
+    "            # )\n",
+    "\n",
+    "new_df_models, new_eval_requests, new_evals_metrics = dfs_models_and_evals(logs, max_timestamp)\n",
+    "\n",
+    "\n",
+    "# # # Another pass for the currently trained model\n",
+    "# for eval_log in logs.supervisor_logs.stage_runs:\n",
+    "#     if eval_log.id == PipelineStage.EVALUATE_MULTI.name:\n",
+    "#         assert len(model_row) == 1\n",
+    "\n",
+    "#         found = new_eval_requests[\n",
+    "#             (\n",
+    "#                 new_eval_requests[\"id_model\"] == eval_request.id_model + 1\n",
+    "#             ) & (\n",
+    "#                 new_eval_requests[\"interval_start\"] == pd.to_datetime(eval_request.interval_start, unit=\"s\")\n",
+    "#             ) &\n",
+    "#             (\n",
+    "#                 new_eval_requests[\"interval_end\"] == pd.to_datetime(eval_request.interval_end, unit=\"s\")\n",
+    "#             ) & (\n",
+    "#                 new_eval_requests[\"currently_active_model\"] == True\n",
+    "#             )\n",
+    "#         ]\n",
+    "#         eval_request.currently_trained_model = len(found) > 0"
    ]
   },
   {
@@ -153,7 +184,9 @@
    "outputs": [],
    "source": [
     "# Write results back\n",
-    "pipeline_logfile.write_text(logs.model_dump_json(by_alias=True))"
+    "Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.data/test/pipeline_9/pipeline.log\").write_text(\n",
+    "    logs.model_dump_json(by_alias=True)\n",
+    ")"
    ]
   },
   {
diff --git a/benchmark/arxiv_kaggle/data_generation.py b/benchmark/arxiv_kaggle/data_generation.py
index 4402fd438..47528dee2 100644
--- a/benchmark/arxiv_kaggle/data_generation.py
+++ b/benchmark/arxiv_kaggle/data_generation.py
@@ -38,17 +38,21 @@ def data_json(self) -> Path:
     def clean_folder(self) -> None:
         self.data_json.unlink()
 
-    def load_into_dataframe(self) -> pd.DataFrame:
+    def load_into_dataframe(self, keep_true_category: bool = False) -> pd.DataFrame:
         records = []
         for line in self.data_json.read_text().splitlines():
             record = json.loads(line)
             records.append({field: record[field] for field in ["title", "categories", "versions", "update_date"]})
 
         df = pd.DataFrame(records)
-        return ArxivKaggleDataGenerator.sanitize_dataframe(df)
+        return ArxivKaggleDataGenerator.sanitize_dataframe(df, keep_true_category=keep_true_category)
 
     def store_data(
-        self, cleaned_df: pd.DataFrame, resolution: TimeResolution, test_split: bool, dummy_period: bool = False
+        self,
+        cleaned_df: pd.DataFrame,
+        resolution: TimeResolution,
+        test_split: bool,
+        dummy_period: bool = False,
     ) -> None:
         partitions = bin_dataframe_wrt_time_resolution(cleaned_df, resolution, "first_version_timestamp")
 
@@ -64,7 +68,9 @@ def store_data(
                     df_to_csv_with_timestamp(partition[self.fields_to_keep], name, self.data_dir / "train")
                 else:
                     df_train, df_test = train_test_split(
-                        partition[self.fields_to_keep], test_size=self.test_holdout, random_state=42
+                        partition[self.fields_to_keep],
+                        test_size=self.test_holdout,
+                        random_state=42,
                     )
                     df_to_csv_with_timestamp(df_train, name, self.data_dir / "train")
                     df_to_csv_with_timestamp(df_test, name, self.data_dir / "test")
@@ -79,7 +85,7 @@ def store_data(
             )
 
     @staticmethod
-    def sanitize_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
+    def sanitize_dataframe(raw_df: pd.DataFrame, keep_true_category: bool = False) -> pd.DataFrame:
         def extract_first_version_timestamp(row: Any) -> Any:
             versions = row["versions"]
             if len(versions) == 0:
@@ -94,8 +100,9 @@ def extract_first_version_timestamp(row: Any) -> Any:
         # we only take the first category (like in the wilds)
         transformed["category"] = transformed["categories"].str.split(" ").str[0]
 
-        # to int-categorical
-        transformed["category"] = pd.Categorical(transformed["category"]).codes
+        if not keep_true_category:
+            # to int-categorical
+            transformed["category"] = pd.Categorical(transformed["category"]).codes
 
         # we only take the first version timestamp
         transformed["first_version_timestamp"] = transformed.apply(extract_first_version_timestamp, axis=1)
diff --git a/benchmark/huffpost_kaggle/data_generation.py b/benchmark/huffpost_kaggle/data_generation.py
index 0442f12ab..79757fffd 100644
--- a/benchmark/huffpost_kaggle/data_generation.py
+++ b/benchmark/huffpost_kaggle/data_generation.py
@@ -38,17 +38,21 @@ def data_json(self) -> Path:
     def clean_folder(self) -> None:
         self.data_json.unlink()
 
-    def load_into_dataframe(self) -> pd.DataFrame:
+    def load_into_dataframe(self, keep_true_category: bool = False) -> pd.DataFrame:
         records = []
         for line in self.data_json.read_text().splitlines():
             record = json.loads(line)
             records.append({field: record[field] for field in ["headline", "category", "date"]})
 
         df = pd.DataFrame(records)
-        return HuffpostKaggleDataGenerator.sanitize_dataframe(df)
+        return HuffpostKaggleDataGenerator.sanitize_dataframe(df, keep_true_category=keep_true_category)
 
     def store_data(
-        self, cleaned_df: pd.DataFrame, resolution: TimeResolution, test_split: bool, dummy_period: bool = False
+        self,
+        cleaned_df: pd.DataFrame,
+        resolution: TimeResolution,
+        test_split: bool,
+        dummy_period: bool = False,
     ) -> None:
         partitions = bin_dataframe_wrt_time_resolution(cleaned_df, resolution, "date")
 
@@ -64,7 +68,9 @@ def store_data(
                     df_to_csv_with_timestamp(partition[self.fields_to_keep], name, self.data_dir / "train")
                 else:
                     df_train, df_test = train_test_split(
-                        partition[self.fields_to_keep], test_size=self.test_holdout, random_state=42
+                        partition[self.fields_to_keep],
+                        test_size=self.test_holdout,
+                        random_state=42,
                     )
                     df_to_csv_with_timestamp(df_train, name, self.data_dir / "train")
                     df_to_csv_with_timestamp(df_test, name, self.data_dir / "test")
@@ -79,14 +85,15 @@ def store_data(
             )
 
     @staticmethod
-    def sanitize_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
+    def sanitize_dataframe(raw_df: pd.DataFrame, keep_true_category: bool = False) -> pd.DataFrame:
         transformed = raw_df
 
         # escape new lines
         transformed["headline"] = transformed["headline"].str.replace("\n", " ").replace(r"\s+", " ", regex=True)
 
-        # to int-categorical
-        transformed["category"] = pd.Categorical(transformed["category"]).codes
+        if not keep_true_category:
+            # to int-categorical
+            transformed["category"] = pd.Categorical(transformed["category"]).codes
 
         # parse the date
         transformed["date"] = pd.to_datetime(transformed["date"])
diff --git a/benchmark/utils/time_resolution_binning.py b/benchmark/utils/time_resolution_binning.py
index 21e7aa586..8d7851c1f 100644
--- a/benchmark/utils/time_resolution_binning.py
+++ b/benchmark/utils/time_resolution_binning.py
@@ -25,6 +25,7 @@ def bin_dataframe_wrt_time_resolution(
     pandas_time_unit = {
         "year": "Y",
         "month": "M",
+        "week": "W",
         "day": "D",
         "hour": "h",
         "minute": "min",
@@ -44,7 +45,7 @@ def bin_dataframe_wrt_time_resolution(
 
 def df_to_csv_with_timestamp(df: pd.DataFrame, period: pd.Period, data_dir: Path) -> None:
     """Stores the dataframe in a file with the timestamp."""
-    label_file = data_dir / f"{period}.csv"
+    label_file = data_dir / f"{str(period).replace('/', '_')}.csv"
     df.to_csv(label_file, index=False, sep="\t", lineterminator="\n", header=False)
     timestamp = int(period.to_timestamp().to_pydatetime().timestamp())
     os.utime(label_file, (timestamp, timestamp))
diff --git a/modyn/config/schema/pipeline/trigger/drift/alibi_detect.py b/modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
index b0cecb15f..e18f2824d 100644
--- a/modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
+++ b/modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
@@ -88,7 +88,30 @@ class AlibiDetectCVMDriftMetric(_AlibiDetectBaseDriftMetric, _AlibiDetectCorrect
     id: Literal["AlibiDetectCVMDriftMetric"] = Field("AlibiDetectCVMDriftMetric")
 
 
+class AlibiDetectLSDDDriftMetric(_AlibiDetectBaseDriftMetric, _AlibiDetectCorrectionMixin, AlibiDetectDeviceMixin):
+    id: Literal["AlibiDetectLSDDDriftMetric"] = Field("AlibiDetectLSDDDriftMetric")
+
+
+class AlibiDetectFETDriftMetric(
+    _AlibiDetectBaseDriftMetric,
+    _AlibiDetectCorrectionMixin,
+    _AlibiDetectAlternativeMixin,
+):
+    id: Literal["AlibiDetectFETDriftMetric"] = Field("AlibiDetectFETDriftMetric")
+    n_features: int | None = Field(None)
+
+
+class AlibiDetectChiSquareDriftMetric(_AlibiDetectBaseDriftMetric, _AlibiDetectCorrectionMixin):
+    id: Literal["AlibiDetectChiSquareDriftMetric"] = Field("AlibiDetectChiSquareDriftMetric")
+    n_features: int | None = Field(None)
+
+
 AlibiDetectDriftMetric = Annotated[
-    AlibiDetectMmdDriftMetric | AlibiDetectKSDriftMetric | AlibiDetectCVMDriftMetric,
+    AlibiDetectMmdDriftMetric
+    | AlibiDetectKSDriftMetric
+    | AlibiDetectCVMDriftMetric
+    | AlibiDetectLSDDDriftMetric
+    | AlibiDetectFETDriftMetric
+    | AlibiDetectChiSquareDriftMetric,
     Field(discriminator="id"),
 ]
diff --git a/modyn/const/types.py b/modyn/const/types.py
index cc5ee8ecc..89c32901b 100644
--- a/modyn/const/types.py
+++ b/modyn/const/types.py
@@ -8,6 +8,7 @@
 class TimeResolution(str, Enum):
     YEAR = "year"
     MONTH = "month"
+    WEEK = "week"
     DAY = "day"
     HOUR = "hour"
     MINUTE = "minute"
diff --git a/modyn/evaluator/internal/metrics/accuracy.py b/modyn/evaluator/internal/metrics/accuracy.py
index 540b7e27a..5c881d8df 100644
--- a/modyn/evaluator/internal/metrics/accuracy.py
+++ b/modyn/evaluator/internal/metrics/accuracy.py
@@ -24,9 +24,6 @@ def _batch_evaluated_callback(self, y_true: torch.Tensor, y_pred: torch.Tensor,
         self.total_correct += labeled_correctly
         self.samples_seen += batch_size
 
-        self.total_correct += labeled_correctly
-        self.samples_seen += batch_size
-
     def get_evaluation_result(self) -> float:
         if self.samples_seen == 0:
             self.warning("Did not see any samples.")
diff --git a/modyn/supervisor/internal/pipeline_executor/models.py b/modyn/supervisor/internal/pipeline_executor/models.py
index 10330f2ec..3cc19db0f 100644
--- a/modyn/supervisor/internal/pipeline_executor/models.py
+++ b/modyn/supervisor/internal/pipeline_executor/models.py
@@ -21,7 +21,9 @@
 from modyn.supervisor.internal.eval.handler import EvalRequest
 from modyn.supervisor.internal.grpc.enums import PipelineStage
 from modyn.supervisor.internal.triggers.utils.models import TriggerPolicyEvaluationLog
-from modyn.supervisor.internal.utils.evaluation_status_reporter import EvaluationStatusReporter
+from modyn.supervisor.internal.utils.evaluation_status_reporter import (
+    EvaluationStatusReporter,
+)
 from modyn.supervisor.internal.utils.git_utils import get_head_sha
 
 logger = logging.getLogger(__name__)
@@ -250,7 +252,12 @@ def df_columns(self) -> list[str]:
     @override
     @property
     def df_row(self) -> tuple:
-        return (self.trigger_i, self.trigger_index, self.trigger_i, self.num_samples_in_trigger)
+        return (
+            self.trigger_i,
+            self.trigger_index,
+            self.trigger_i,
+            self.num_samples_in_trigger,
+        )
 
 
 class TriggerExecutionInfo(_TriggerLogMixin):
@@ -260,12 +267,24 @@ class TriggerExecutionInfo(_TriggerLogMixin):
     def df_columns(self) -> list[str]:
         """Provide the column names of the DataFrame representation of the
         data."""
-        return ["trigger_i", "trigger_index", "trigger_id", "first_timestamp", "last_timestamp"]
+        return [
+            "trigger_i",
+            "trigger_index",
+            "trigger_id",
+            "first_timestamp",
+            "last_timestamp",
+        ]
 
     @override
     @property
     def df_row(self) -> tuple:
-        return (self.trigger_i, self.trigger_index, self.trigger_id, self.first_timestamp, self.last_timestamp)
+        return (
+            self.trigger_i,
+            self.trigger_index,
+            self.trigger_id,
+            self.first_timestamp,
+            self.last_timestamp,
+        )
 
 
 class _TrainInfoMixin(StageInfo):
@@ -279,12 +298,24 @@ class TrainingInfo(_TrainInfoMixin):
     def df_columns(self) -> list[str]:
         """Provide the column names of the DataFrame representation of the
         data."""
-        return ["trigger_id", "training_id", "num_batches", "num_samples"]
+        return [
+            "trigger_id",
+            "training_id",
+            "num_batches",
+            "num_samples",
+            "train_time_at_trainer",
+        ]
 
     @override
     @property
     def df_row(self) -> tuple:
-        return (self.trigger_id, self.training_id, self.trainer_log["num_batches"], self.trainer_log["num_samples"])
+        return (
+            self.trigger_id,
+            self.training_id,
+            self.trainer_log["num_batches"],
+            self.trainer_log["num_samples"],
+            self.trainer_log["total_train"],  # -> "train_time_at_trainer"; analytics divides by 1000 (ms to s)
+        )
 
 
 class StoreModelInfo(_TrainInfoMixin):
@@ -510,9 +541,16 @@ class StageLog(BaseModel):
     def df_columns(self, extended: bool = False) -> list[str]:
         """Provide the column names of the DataFrame representation of the
         data."""
-        return ["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"] + (
-            self.info.df_columns() if extended and self.info else []
-        )
+        return [
+            "id",
+            "start",
+            "end",
+            "duration",
+            "batch_idx",
+            "sample_idx",
+            "sample_time",
+            "trigger_idx",
+        ] + (self.info.df_columns() if extended and self.info else [])
 
     def df_row(self, extended: bool = False) -> tuple:
         return (
@@ -605,7 +643,16 @@ def df(self) -> pd.DataFrame:
                 )
                 for stage in self.stage_runs
             ],
-            columns=["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"],
+            columns=[
+                "id",
+                "start",
+                "end",
+                "duration",
+                "batch_idx",
+                "sample_idx",
+                "sample_time",
+                "trigger_idx",
+            ],
         )
 
 
@@ -644,7 +691,11 @@ class PipelineLogs(BaseModel):
     # metadata
     partial_idx: int = Field(0)
 
-    def materialize(self, log_dir_path: Path, mode: Literal["initial", "increment", "final"] = "increment") -> None:
+    def materialize(
+        self,
+        log_dir_path: Path,
+        mode: Literal["initial", "increment", "final"] = "increment",
+    ) -> None:
         """Materialize the logs to log files.
 
         If run with pytest, log_file_path and mode will be ignored.
@@ -682,7 +733,11 @@ def materialize(self, log_dir_path: Path, mode: Literal["initial", "increment",
             return
 
         if mode == "increment":
-            with open(pipeline_logdir / f"supervisor_part_{self.partial_idx}.log", "w", encoding="utf-8") as logfile:
+            with open(
+                pipeline_logdir / f"supervisor_part_{self.partial_idx}.log",
+                "w",
+                encoding="utf-8",
+            ) as logfile:
                 logfile.write(self.supervisor_logs.model_dump_json(by_alias=True, indent=2))
 
             self.supervisor_logs.clear()
diff --git a/modyn/supervisor/internal/triggers/drift/detector/alibi.py b/modyn/supervisor/internal/triggers/drift/detector/alibi.py
index 239b554f0..8bcfaedcb 100644
--- a/modyn/supervisor/internal/triggers/drift/detector/alibi.py
+++ b/modyn/supervisor/internal/triggers/drift/detector/alibi.py
@@ -20,9 +20,12 @@
     MetricResult,
 )
 from modyn.config.schema.pipeline.trigger.drift.alibi_detect import (
+    AlibiDetectChiSquareDriftMetric,
     AlibiDetectClassifierDriftMetric,
     AlibiDetectCVMDriftMetric,
+    AlibiDetectFETDriftMetric,
     AlibiDetectKSDriftMetric,
+    AlibiDetectLSDDDriftMetric,
 )
 from modyn.supervisor.internal.triggers.drift.classifier_models import (
     alibi_classifier_models,
@@ -154,4 +157,36 @@ def _alibi_detect_metric_factory(config: AlibiDetectDriftMetric, embeddings_ref:
             **kwargs,
         )
 
+    if isinstance(config, AlibiDetectLSDDDriftMetric):
+        return LSDDDrift(
+            x_ref=embeddings_ref,
+            backend="pytorch",
+            n_permutations=config.num_permutations or 1,  # falsy value (None/0) falls back to 1 permutation
+            p_val=config.p_val,
+            correction=config.correction,
+            x_ref_preprocessed=config.x_ref_preprocessed,
+            device=config.device,
+            **kwargs,
+        )
+
+    if isinstance(config, AlibiDetectFETDriftMetric):
+        return FETDrift(
+            x_ref=embeddings_ref,
+            p_val=config.p_val,
+            correction=config.correction,
+            x_ref_preprocessed=config.x_ref_preprocessed,
+            n_features=config.n_features,
+            **kwargs,
+        )
+
+    if isinstance(config, AlibiDetectChiSquareDriftMetric):
+        return ChiSquareDrift(
+            x_ref=embeddings_ref,
+            p_val=config.p_val,
+            correction=config.correction,
+            x_ref_preprocessed=config.x_ref_preprocessed,
+            n_features=config.n_features,
+            **kwargs,
+        )
+
     raise NotImplementedError(f"Metric {config.id} is not supported in AlibiDetectDriftMetric.")
diff --git a/modyn/tests/evaluator/internal/metrics/test_accuracy.py b/modyn/tests/evaluator/internal/metrics/test_accuracy.py
index 379611c77..a197cb953 100644
--- a/modyn/tests/evaluator/internal/metrics/test_accuracy.py
+++ b/modyn/tests/evaluator/internal/metrics/test_accuracy.py
@@ -67,6 +67,7 @@ def test_accuracy() -> None:
     accuracy.evaluate_batch(y_true, y_pred, 6)
 
     assert accuracy.get_evaluation_result() == pytest.approx(1.0 / 3)
+    assert accuracy.samples_seen - accuracy.total_correct == 0 + 6 + 4
 
 
 def test_accuracy_invalid() -> None: