From 42a095692d6db879b862e6a78d0cce50c8c47c0c Mon Sep 17 00:00:00 2001
From: Alexander Held <45009355+alexander-held@users.noreply.github.com>
Date: Tue, 5 Sep 2023 23:48:00 +0200
Subject: [PATCH] fix: handle long file names in ServiceX dataset grouping
 (AGC v1) (#196)

* handle long file names in ServiceX dataset grouping

* do not override config object with cabinetry config
---
 .../ttbar_analysis_pipeline.ipynb             | 35 ++++++++++++++-----
 .../ttbar_analysis_pipeline.py                | 14 ++++----
 .../cms-open-data-ttbar/utils/__init__.py     | 20 ++++++++---
 3 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
index 8d9c9350..fa84d8ba 100644
--- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
+++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "f4bdc262",
    "metadata": {},
@@ -24,6 +25,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "49c44094",
    "metadata": {},
@@ -35,6 +37,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "991a4343",
    "metadata": {},
@@ -77,6 +80,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "808b4789",
    "metadata": {},
@@ -130,6 +134,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "a22d0859",
    "metadata": {},
@@ -349,6 +354,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "3243414e",
    "metadata": {},
@@ -395,6 +401,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "b0b27a46",
    "metadata": {},
@@ -434,6 +441,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "2114307f",
    "metadata": {},
@@ -476,6 +484,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "62bbc8c8",
    "metadata": {},
@@ -596,6 +605,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "b66c8142",
    "metadata": {},
@@ -661,6 +671,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "5feb786b",
    "metadata": {},
@@ -735,6 +746,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "9f861625",
    "metadata": {},
@@ -758,6 +770,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "6ea49c8e-2d20-47d5-8fd6-2f51e4ef1e0e",
    "metadata": {},
@@ -781,17 +794,18 @@
    },
    "outputs": [],
    "source": [
-    "config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
+    "cabinetry_config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
     "\n",
     "# rebinning: lower edge 110 GeV, merge bins 2->1\n",
-    "rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))\n",
-    "cabinetry.templates.build(config, router=rebinning_router)\n",
-    "cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)\n",
-    "ws = cabinetry.workspace.build(config)\n",
+    "rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))\n",
+    "cabinetry.templates.build(cabinetry_config, router=rebinning_router)\n",
+    "cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)\n",
smoothing)\n", + "ws = cabinetry.workspace.build(cabinetry_config)\n", "cabinetry.workspace.save(ws, \"workspace.json\")" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6feae4d5", "metadata": {}, @@ -839,6 +853,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aab2493c", "metadata": {}, @@ -876,6 +891,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fe677e60", "metadata": {}, @@ -906,6 +922,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "35e5a9aa", "metadata": {}, @@ -936,7 +953,7 @@ ], "source": [ "model_prediction = cabinetry.model_utils.prediction(model)\n", - "figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)\n", + "figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)\n", "figs[0][\"figure\"]" ] }, @@ -965,6 +982,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9908c2a2", "metadata": {}, @@ -994,7 +1012,7 @@ ], "source": [ "model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)\n", - "figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)\n", + "figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)\n", "figs[0][\"figure\"]" ] }, @@ -1023,6 +1041,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "269f8c3a", "metadata": {}, @@ -1059,7 +1078,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py index 0e68ea2e..ccd04f60 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py @@ -517,13 +517,13 @@ def get_query(source: ObjectStream) -> ObjectStream: # We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built. # %% -config = cabinetry.configuration.load("cabinetry_config.yml") +cabinetry_config = cabinetry.configuration.load("cabinetry_config.yml") # rebinning: lower edge 110 GeV, merge bins 2->1 -rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2))) -cabinetry.templates.build(config, router=rebinning_router) -cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing) -ws = cabinetry.workspace.build(config) +rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2))) +cabinetry.templates.build(cabinetry_config, router=rebinning_router) +cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. 
+ws = cabinetry.workspace.build(cabinetry_config)
 cabinetry.workspace.save(ws, "workspace.json")
 
 # %% [markdown]
@@ -556,7 +556,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
 
 # %%
 model_prediction = cabinetry.model_utils.prediction(model)
-figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
+figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)
 figs[0]["figure"]
 
 # %%
@@ -567,7 +567,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
 
 # %%
 model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
-figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
+figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)
 figs[0]["figure"]
 
 # %%
diff --git a/analyses/cms-open-data-ttbar/utils/__init__.py b/analyses/cms-open-data-ttbar/utils/__init__.py
index 653ff595..b88eaa32 100644
--- a/analyses/cms-open-data-ttbar/utils/__init__.py
+++ b/analyses/cms-open-data-ttbar/utils/__init__.py
@@ -146,12 +146,22 @@ def __init__(self, fileset, backend_name="uproot", ignore_cache=False):
 
     def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):
         all_files = np.array(self.ds.get_data_rootfiles_uri(query, as_signed_url=as_signed_url, title=title))
-        parent_file_urls = np.array([f.file for f in all_files])
-
-        # order is not retained after transform, so we can match files to their parent files using the filename
-        # (replacing / with : to mitigate servicex filename convention )
-        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
-                               for i in range(len(self.filelist))])
+        try:
+            # default matching for when ServiceX doesn't abbreviate names
+            parent_file_urls = np.array([f.file for f in all_files])
+
+            # order is not retained after transform, so we can match files to their parent files using the filename
+            # (replacing / with : to match the ServiceX filename convention)
+            parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
+                                   for i in range(len(self.filelist))])
+        except IndexError:
+            # fallback solution that relies on splitting at the port (the file name only changes before the port)
+            # probably not very stable or general! this may fail - please report back if you observe it happening
+            # TODO: find something more stable
+            parent_file_urls = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in np.array([f.file for f in all_files])])
+            parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].split("1094//")[-1])[0][0]
+                                   for i in range(len(self.filelist))])
 
         files_per_process = {}
         for i, process in enumerate(self.fileset):
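
Note on the grouping logic (illustration only, not part of the diff): the two-stage
matching added to utils/__init__.py is easiest to see in isolation. The sketch below is
self-contained and uses made-up URLs and a hypothetical helper name; only the "/" -> ":"
substitution, the IndexError fallback, and the split on "1094//" (the xrootd port; the
abbreviation only alters the part of the name before the port) mirror the patched code.

import numpy as np

# parent files as listed in the fileset (hypothetical URLs)
filelist = [
    ["root://xrootd.example.org:1094//store/ttbar/nominal_0001.root"],
    ["root://xrootd.example.org:1094//store/ttbar/nominal_0002.root"],
]

# ServiceX output names: "/" replaced by ":", order not preserved, and for very
# long names the part before the port may be abbreviated by the backend
outputs = np.array([
    "root:::xrootd.example.org:1094::store:ttbar:nominal_0002.root",
    "abbreviated:1094::store:ttbar:nominal_0001.root",
])

def match_outputs_to_parents(outputs, filelist):
    """Return, for each parent file, the index of its transformed output."""
    try:
        # default: exact match after applying the "/" -> ":" renaming convention
        return np.array(
            [np.where(outputs == parent[0].replace("/", ":"))[0][0] for parent in filelist]
        )
    except IndexError:
        # fallback: compare only the path after the port, which survives abbreviation
        tails = np.array([o.replace(":", "/").split("1094//")[-1] for o in outputs])
        return np.array(
            [np.where(tails == parent[0].split("1094//")[-1])[0][0] for parent in filelist]
        )

print(match_outputs_to_parents(outputs, filelist))  # -> [1 0]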