From 42a095692d6db879b862e6a78d0cce50c8c47c0c Mon Sep 17 00:00:00 2001
From: Alexander Held <45009355+alexander-held@users.noreply.github.com>
Date: Tue, 5 Sep 2023 23:48:00 +0200
Subject: [PATCH] fix: handle long file names in ServiceX dataset grouping
 (AGC v1) (#196)

* handle long file names in ServiceX dataset grouping

* do not override config object with cabinetry config
---
 .../ttbar_analysis_pipeline.ipynb             | 35 ++++++++++++++-----
 .../ttbar_analysis_pipeline.py                | 14 ++++----
 .../cms-open-data-ttbar/utils/__init__.py     | 20 ++++++++---
 3 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
index 8d9c9350..fa84d8ba 100644
--- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
+++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "f4bdc262",
    "metadata": {},
@@ -24,6 +25,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "49c44094",
    "metadata": {},
@@ -35,6 +37,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "991a4343",
    "metadata": {},
@@ -77,6 +80,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "808b4789",
    "metadata": {},
@@ -130,6 +134,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "a22d0859",
    "metadata": {},
@@ -349,6 +354,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "3243414e",
    "metadata": {},
@@ -395,6 +401,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "b0b27a46",
    "metadata": {},
@@ -434,6 +441,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "2114307f",
    "metadata": {},
@@ -476,6 +484,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "62bbc8c8",
    "metadata": {},
@@ -596,6 +605,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "b66c8142",
    "metadata": {},
@@ -661,6 +671,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "5feb786b",
    "metadata": {},
@@ -735,6 +746,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "9f861625",
    "metadata": {},
@@ -758,6 +770,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "6ea49c8e-2d20-47d5-8fd6-2f51e4ef1e0e",
    "metadata": {},
@@ -781,17 +794,18 @@
    },
    "outputs": [],
    "source": [
-    "config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
+    "cabinetry_config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
     "\n",
     "# rebinning: lower edge 110 GeV, merge bins 2->1\n",
-    "rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))\n",
-    "cabinetry.templates.build(config, router=rebinning_router)\n",
-    "cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)\n",
-    "ws = cabinetry.workspace.build(config)\n",
+    "rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))\n",
+    "cabinetry.templates.build(cabinetry_config, router=rebinning_router)\n",
+    "cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)\n",
smoothing)\n", + "ws = cabinetry.workspace.build(cabinetry_config)\n", "cabinetry.workspace.save(ws, \"workspace.json\")" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6feae4d5", "metadata": {}, @@ -839,6 +853,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aab2493c", "metadata": {}, @@ -876,6 +891,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fe677e60", "metadata": {}, @@ -906,6 +922,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "35e5a9aa", "metadata": {}, @@ -936,7 +953,7 @@ ], "source": [ "model_prediction = cabinetry.model_utils.prediction(model)\n", - "figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)\n", + "figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)\n", "figs[0][\"figure\"]" ] }, @@ -965,6 +982,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9908c2a2", "metadata": {}, @@ -994,7 +1012,7 @@ ], "source": [ "model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)\n", - "figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)\n", + "figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)\n", "figs[0][\"figure\"]" ] }, @@ -1023,6 +1041,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "269f8c3a", "metadata": {}, @@ -1059,7 +1078,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py index 0e68ea2e..ccd04f60 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py @@ -517,13 +517,13 @@ def get_query(source: ObjectStream) -> ObjectStream: # We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built. # %% -config = cabinetry.configuration.load("cabinetry_config.yml") +cabinetry_config = cabinetry.configuration.load("cabinetry_config.yml") # rebinning: lower edge 110 GeV, merge bins 2->1 -rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2))) -cabinetry.templates.build(config, router=rebinning_router) -cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing) -ws = cabinetry.workspace.build(config) +rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2))) +cabinetry.templates.build(cabinetry_config, router=rebinning_router) +cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. 
+ws = cabinetry.workspace.build(cabinetry_config)
 cabinetry.workspace.save(ws, "workspace.json")
 
 # %% [markdown]
@@ -556,7 +556,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
 
 # %%
 model_prediction = cabinetry.model_utils.prediction(model)
-figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
+figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)
 figs[0]["figure"]
 
 # %%
@@ -567,7 +567,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
 
 # %%
 model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
-figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
+figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)
 figs[0]["figure"]
 
 # %%
diff --git a/analyses/cms-open-data-ttbar/utils/__init__.py b/analyses/cms-open-data-ttbar/utils/__init__.py
index 653ff595..b88eaa32 100644
--- a/analyses/cms-open-data-ttbar/utils/__init__.py
+++ b/analyses/cms-open-data-ttbar/utils/__init__.py
@@ -146,12 +146,22 @@ def __init__(self, fileset, backend_name="uproot", ignore_cache=False):
 
     def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):
         all_files = np.array(self.ds.get_data_rootfiles_uri(query, as_signed_url=as_signed_url, title=title))
-        parent_file_urls = np.array([f.file for f in all_files])
-
-        # order is not retained after transform, so we can match files to their parent files using the filename
-        # (replacing / with : to mitigate servicex filename convention )
-        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
-                               for i in range(len(self.filelist))])
+        try:
+            # default matching for when ServiceX doesn't abbreviate names
+            parent_file_urls = np.array([f.file for f in all_files])
+
+            # order is not retained after transform, so we can match files to their parent files using the filename
+            # (replacing / with : to match the ServiceX filename convention)
+            parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
+                                   for i in range(len(self.filelist))])
+        except IndexError:
+            # fallback solution that relies on splitting at the port (the file name only changes before the port)
+            # probably not very stable or general! this may fail - please report back if you observe it happening
+            # TODO: find something more stable
+            parent_file_urls = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in np.array([f.file for f in all_files])])
+            parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].split("1094//")[-1])[0][0]
+                                   for i in range(len(self.filelist))])
 
         files_per_process = {}
         for i, process in enumerate(self.fileset):
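
Note on the grouping logic (illustration only, not part of the diff): the two-stage
matching added to utils/__init__.py is easiest to see in isolation. The sketch below is
self-contained and uses made-up URLs and a hypothetical helper name; only the "/" -> ":"
substitution, the IndexError fallback, and the split on "1094//" (the xrootd port; the
abbreviation only alters the part of the name before the port) mirror the patched code.

import numpy as np

# parent files as listed in the fileset (hypothetical URLs)
filelist = [
    ["root://xrootd.example.org:1094//store/ttbar/nominal_0001.root"],
    ["root://xrootd.example.org:1094//store/ttbar/nominal_0002.root"],
]

# ServiceX output names: "/" replaced by ":", order not preserved, and for very
# long names the part before the port may be abbreviated by the backend
outputs = np.array([
    "root:::xrootd.example.org:1094::store:ttbar:nominal_0002.root",
    "abbreviated:1094::store:ttbar:nominal_0001.root",
])

def match_outputs_to_parents(outputs, filelist):
    """Return, for each parent file, the index of its transformed output."""
    try:
        # default: exact match after applying the "/" -> ":" renaming convention
        return np.array(
            [np.where(outputs == parent[0].replace("/", ":"))[0][0] for parent in filelist]
        )
    except IndexError:
        # fallback: compare only the path after the port, which survives abbreviation
        tails = np.array([o.replace(":", "/").split("1094//")[-1] for o in outputs])
        return np.array(
            [np.where(tails == parent[0].split("1094//")[-1])[0][0] for parent in filelist]
        )

print(match_outputs_to_parents(outputs, filelist))  # -> [1 0]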