fix: handle long file names in ServiceX dataset grouping (AGC v1) (#196)
* handle long file names in ServiceX dataset grouping
* do not override the config object with the cabinetry config
alexander-held authored Sep 5, 2023
1 parent 9377d38 commit 42a0956
Showing 3 changed files with 49 additions and 20 deletions.
35 changes: 27 additions & 8 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "f4bdc262",
"metadata": {},
@@ -24,6 +25,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "49c44094",
"metadata": {},
@@ -35,6 +37,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "991a4343",
"metadata": {},
@@ -77,6 +80,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "808b4789",
"metadata": {},
@@ -130,6 +134,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "a22d0859",
"metadata": {},
@@ -349,6 +354,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3243414e",
"metadata": {},
@@ -395,6 +401,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b0b27a46",
"metadata": {},
@@ -434,6 +441,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2114307f",
"metadata": {},
@@ -476,6 +484,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "62bbc8c8",
"metadata": {},
@@ -596,6 +605,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b66c8142",
"metadata": {},
@@ -661,6 +671,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "5feb786b",
"metadata": {},
@@ -735,6 +746,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9f861625",
"metadata": {},
@@ -758,6 +770,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6ea49c8e-2d20-47d5-8fd6-2f51e4ef1e0e",
"metadata": {},
@@ -781,17 +794,18 @@
},
"outputs": [],
"source": [
"config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
"cabinetry_config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
"\n",
"# rebinning: lower edge 110 GeV, merge bins 2->1\n",
"rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))\n",
"cabinetry.templates.build(config, router=rebinning_router)\n",
"cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)\n",
"ws = cabinetry.workspace.build(config)\n",
"rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))\n",
"cabinetry.templates.build(cabinetry_config, router=rebinning_router)\n",
"cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)\n",
"ws = cabinetry.workspace.build(cabinetry_config)\n",
"cabinetry.workspace.save(ws, \"workspace.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6feae4d5",
"metadata": {},
@@ -839,6 +853,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "aab2493c",
"metadata": {},
@@ -876,6 +891,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "fe677e60",
"metadata": {},
@@ -906,6 +922,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "35e5a9aa",
"metadata": {},
@@ -936,7 +953,7 @@
],
"source": [
"model_prediction = cabinetry.model_utils.prediction(model)\n",
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)\n",
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)\n",
"figs[0][\"figure\"]"
]
},
@@ -965,6 +982,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9908c2a2",
"metadata": {},
@@ -994,7 +1012,7 @@
],
"source": [
"model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)\n",
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)\n",
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)\n",
"figs[0][\"figure\"]"
]
},
@@ -1023,6 +1041,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "269f8c3a",
"metadata": {},
@@ -1059,7 +1078,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.16"
}
},
"nbformat": 4,
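The `config` → `cabinetry_config` rename above implements the second item in the commit message: the notebook already holds its own analysis-wide settings in a variable named `config`, so loading the cabinetry fit configuration into that same name clobbered it. A minimal sketch of the shadowing problem, with hypothetical dictionary contents standing in for both configuration objects:

# hypothetical stand-ins: the real objects come from the AGC utilities and
# from cabinetry.configuration.load("cabinetry_config.yml")
config = {"global": {"NUM_BINS": 25}}  # analysis-wide configuration

# before this commit, the fit setup reused the same variable name ...
config = {"General": {"Measurement": "CMS_ttbar"}}  # cabinetry fit configuration
# ... so any later access to the analysis configuration raises a KeyError:
# config["global"]["NUM_BINS"]

# after this commit, the two objects coexist under distinct names
config = {"global": {"NUM_BINS": 25}}
cabinetry_config = {"General": {"Measurement": "CMS_ttbar"}}
assert config["global"]["NUM_BINS"] == 25  # analysis settings stay intact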
14 changes: 7 additions & 7 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
@@ -517,13 +517,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

# %%
- config = cabinetry.configuration.load("cabinetry_config.yml")
+ cabinetry_config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
- rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))
- cabinetry.templates.build(config, router=rebinning_router)
- cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)
- ws = cabinetry.workspace.build(config)
+ rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))
+ cabinetry.templates.build(cabinetry_config, router=rebinning_router)
+ cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)
+ ws = cabinetry.workspace.build(cabinetry_config)
cabinetry.workspace.save(ws, "workspace.json")

# %% [markdown]
@@ -556,7 +556,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

# %%
model_prediction = cabinetry.model_utils.prediction(model)
- figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
+ figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)
figs[0]["figure"]

# %%
@@ -567,7 +567,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

# %%
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
- figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
+ figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)
figs[0]["figure"]

# %%
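The rebinning argument above uses the UHI slicing convention from `hist`, where an imaginary literal such as `110j` selects by axis coordinate (here 110 GeV) rather than by bin index. A short standalone sketch of what `slice(110j, None, hist.rebin(2))` does, using a made-up histogram rather than the analysis observable:

import hist
import numpy as np

# made-up 25-bin histogram from 50 to 550 (bin width 20)
h = hist.Hist.new.Reg(25, 50, 550, name="m").Double()
h.fill(m=np.random.default_rng(0).uniform(50, 550, size=1_000))

# keep bins from the coordinate 110 upward and merge pairs of bins (2->1)
rebinned = h[110j::hist.rebin(2)]
print(rebinned.axes[0].edges[:3])  # [110. 150. 190.]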
20 changes: 15 additions & 5 deletions analyses/cms-open-data-ttbar/utils/__init__.py
@@ -146,12 +146,22 @@ def __init__(self, fileset, backend_name="uproot", ignore_cache=False):
def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):

all_files = np.array(self.ds.get_data_rootfiles_uri(query, as_signed_url=as_signed_url, title=title))
-    parent_file_urls = np.array([f.file for f in all_files])
-
-    # order is not retained after transform, so we can match files to their parent files using the filename
-    # (replacing / with : to mitigate servicex filename convention )
-    parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
-                           for i in range(len(self.filelist))])
+    try:
+        # default matching for when ServiceX doesn't abbreviate names
+        parent_file_urls = np.array([f.file for f in all_files])
+
+        # order is not retained after transform, so we can match files to their parent files using the filename
+        # (replacing / with : to mitigate the ServiceX filename convention)
+        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
+                               for i in range(len(self.filelist))])
+    except:
+        # fallback solution that relies on splitting via the port (the name only changes before that)
+        # probably not very stable and general! this may fail - please report back if you observe that happening
+        # TODO: find something more stable
+        parent_file_urls = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in np.array([f.file for f in all_files])])
+        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].split("1094//")[-1])[0][0]
+                               for i in range(len(self.filelist))])

files_per_process = {}
for i, process in enumerate(self.fileset):
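The try/except above first attempts exact-name matching and only falls back to port-based splitting when the lookup fails: `np.where` returns an empty index array for an unmatched (abbreviated) name, and the subsequent `[0][0]` raises an `IndexError` that the bare `except` catches. A standalone sketch of both matching strategies, with made-up URLs in place of real ServiceX inputs and outputs:

import numpy as np

# made-up input URLs standing in for the entries of self.filelist
input_files = [
    "root://eospublic.cern.ch:1094//store/user/AGC/TT_TuneCUETP8M1/file_0000.root",
    "root://eospublic.cern.ch:1094//store/user/AGC/TT_TuneCUETP8M1/file_0001.root",
]

# ServiceX names its outputs after the inputs with "/" replaced by ":" and does
# not retain the input order, so outputs must be matched back to inputs by name
transformed = np.array([f.replace("/", ":") for f in reversed(input_files)])

# default strategy: compare the full transformed name
parent_key = np.array([np.where(transformed == f.replace("/", ":"))[0][0] for f in input_files])
print(parent_key)  # [1 0]

# fallback strategy: if ServiceX abbreviated a long output name, the full-name
# comparison finds nothing, so compare only the path after the XRootD port
recovered = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in transformed])
parent_key = np.array([np.where(recovered == f.split("1094//")[-1])[0][0] for f in input_files])
print(parent_key)  # [1 0]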
