fix: handle long file names in ServiceX dataset grouping (AGC v1) (#196)
* handle long file names in ServiceX dataset grouping
* do not override the config object with the cabinetry config
alexander-held authored Sep 5, 2023
1 parent 9377d38 commit 42a0956
Showing 3 changed files with 49 additions and 20 deletions.
35 changes: 27 additions & 8 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "f4bdc262",
"metadata": {},
@@ -24,6 +25,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "49c44094",
"metadata": {},
@@ -35,6 +37,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "991a4343",
"metadata": {},
@@ -77,6 +80,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "808b4789",
"metadata": {},
@@ -130,6 +134,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "a22d0859",
"metadata": {},
@@ -349,6 +354,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3243414e",
"metadata": {},
@@ -395,6 +401,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b0b27a46",
"metadata": {},
@@ -434,6 +441,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2114307f",
"metadata": {},
@@ -476,6 +484,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "62bbc8c8",
"metadata": {},
@@ -596,6 +605,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b66c8142",
"metadata": {},
@@ -661,6 +671,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "5feb786b",
"metadata": {},
@@ -735,6 +746,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9f861625",
"metadata": {},
@@ -758,6 +770,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6ea49c8e-2d20-47d5-8fd6-2f51e4ef1e0e",
"metadata": {},
@@ -781,17 +794,18 @@
},
"outputs": [],
"source": [
"config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
"cabinetry_config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
"\n",
"# rebinning: lower edge 110 GeV, merge bins 2->1\n",
"rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))\n",
"cabinetry.templates.build(config, router=rebinning_router)\n",
"cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)\n",
"ws = cabinetry.workspace.build(config)\n",
"rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))\n",
"cabinetry.templates.build(cabinetry_config, router=rebinning_router)\n",
"cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)\n",
"ws = cabinetry.workspace.build(cabinetry_config)\n",
"cabinetry.workspace.save(ws, \"workspace.json\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6feae4d5",
"metadata": {},
@@ -839,6 +853,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "aab2493c",
"metadata": {},
@@ -876,6 +891,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "fe677e60",
"metadata": {},
@@ -906,6 +922,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "35e5a9aa",
"metadata": {},
@@ -936,7 +953,7 @@
],
"source": [
"model_prediction = cabinetry.model_utils.prediction(model)\n",
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)\n",
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)\n",
"figs[0][\"figure\"]"
]
},
@@ -965,6 +982,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9908c2a2",
"metadata": {},
@@ -994,7 +1012,7 @@
],
"source": [
"model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)\n",
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)\n",
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)\n",
"figs[0][\"figure\"]"
]
},
@@ -1023,6 +1041,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "269f8c3a",
"metadata": {},
@@ -1059,7 +1078,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.16"
}
},
"nbformat": 4,
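The `config` → `cabinetry_config` rename above implements the second item in the commit message: the notebook already holds its own analysis-wide settings in a variable named `config`, so loading the cabinetry fit configuration into that same name clobbered it. A minimal sketch of the shadowing problem, with hypothetical dictionary contents standing in for both configuration objects:

# hypothetical stand-ins: the real objects come from the AGC utilities and
# from cabinetry.configuration.load("cabinetry_config.yml")
config = {"global": {"NUM_BINS": 25}}  # analysis-wide configuration

# before this commit, the fit setup reused the same variable name ...
config = {"General": {"Measurement": "CMS_ttbar"}}  # cabinetry fit configuration
# ... so any later access to the analysis configuration raises a KeyError:
# config["global"]["NUM_BINS"]

# after this commit, the two objects coexist under distinct names
config = {"global": {"NUM_BINS": 25}}
cabinetry_config = {"General": {"Measurement": "CMS_ttbar"}}
assert config["global"]["NUM_BINS"] == 25  # analysis settings stay intact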
14 changes: 7 additions & 7 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
@@ -517,13 +517,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

# %%
- config = cabinetry.configuration.load("cabinetry_config.yml")
+ cabinetry_config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
- rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))
- cabinetry.templates.build(config, router=rebinning_router)
- cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)
- ws = cabinetry.workspace.build(config)
+ rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))
+ cabinetry.templates.build(cabinetry_config, router=rebinning_router)
+ cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)
+ ws = cabinetry.workspace.build(cabinetry_config)
cabinetry.workspace.save(ws, "workspace.json")

# %% [markdown]
@@ -556,7 +556,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

# %%
model_prediction = cabinetry.model_utils.prediction(model)
- figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
+ figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)
figs[0]["figure"]

# %%
@@ -567,7 +567,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

# %%
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
- figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
+ figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)
figs[0]["figure"]

# %%
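The rebinning argument above uses the UHI slicing convention from `hist`, where an imaginary literal such as `110j` selects by axis coordinate (here 110 GeV) rather than by bin index. A short standalone sketch of what `slice(110j, None, hist.rebin(2))` does, using a made-up histogram rather than the analysis observable:

import hist
import numpy as np

# made-up 25-bin histogram from 50 to 550 (bin width 20)
h = hist.Hist.new.Reg(25, 50, 550, name="m").Double()
h.fill(m=np.random.default_rng(0).uniform(50, 550, size=1_000))

# keep bins from the coordinate 110 upward and merge pairs of bins (2->1)
rebinned = h[110j::hist.rebin(2)]
print(rebinned.axes[0].edges[:3])  # [110. 150. 190.]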
20 changes: 15 additions & 5 deletions analyses/cms-open-data-ttbar/utils/__init__.py
@@ -146,12 +146,22 @@ def __init__(self, fileset, backend_name="uproot", ignore_cache=False):
def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):

all_files = np.array(self.ds.get_data_rootfiles_uri(query, as_signed_url=as_signed_url, title=title))
-    parent_file_urls = np.array([f.file for f in all_files])
-
-    # order is not retained after transform, so we can match files to their parent files using the filename
-    # (replacing / with : to mitigate servicex filename convention )
-    parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
-                           for i in range(len(self.filelist))])
+    try:
+        # default matching for when ServiceX doesn't abbreviate names
+        parent_file_urls = np.array([f.file for f in all_files])
+
+        # order is not retained after transform, so we can match files to their parent files using the filename
+        # (replacing / with : to mitigate the ServiceX filename convention)
+        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
+                               for i in range(len(self.filelist))])
+    except:
+        # fallback solution that relies on splitting via the port (the name only changes before that)
+        # probably not very stable and general! this may fail - please report back if you observe that happening
+        # TODO: find something more stable
+        parent_file_urls = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in np.array([f.file for f in all_files])])
+        parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].split("1094//")[-1])[0][0]
+                               for i in range(len(self.filelist))])

files_per_process = {}
for i, process in enumerate(self.fileset):
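The try/except above first attempts exact-name matching and only falls back to port-based splitting when the lookup fails: `np.where` returns an empty index array for an unmatched (abbreviated) name, and the subsequent `[0][0]` raises an `IndexError` that the bare `except` catches. A standalone sketch of both matching strategies, with made-up URLs in place of real ServiceX inputs and outputs:

import numpy as np

# made-up input URLs standing in for the entries of self.filelist
input_files = [
    "root://eospublic.cern.ch:1094//store/user/AGC/TT_TuneCUETP8M1/file_0000.root",
    "root://eospublic.cern.ch:1094//store/user/AGC/TT_TuneCUETP8M1/file_0001.root",
]

# ServiceX names its outputs after the inputs with "/" replaced by ":" and does
# not retain the input order, so outputs must be matched back to inputs by name
transformed = np.array([f.replace("/", ":") for f in reversed(input_files)])

# default strategy: compare the full transformed name
parent_key = np.array([np.where(transformed == f.replace("/", ":"))[0][0] for f in input_files])
print(parent_key)  # [1 0]

# fallback strategy: if ServiceX abbreviated a long output name, the full-name
# comparison finds nothing, so compare only the path after the XRootD port
recovered = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in transformed])
parent_key = np.array([np.where(recovered == f.split("1094//")[-1])[0][0] for f in input_files])
print(parent_key)  # [1 0]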
