Skip to content

Commit 42a0956

Browse files
fix: handle long file names in ServiceX dataset grouping (AGC v1) (#196)
* handle long file names in ServiceX dataset grouping * do not override config object by cabinetry config
1 parent 9377d38 commit 42a0956

File tree

3 files changed

+49
-20
lines changed

3 files changed

+49
-20
lines changed

analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
"cells": [
33
{
4+
"attachments": {},
45
"cell_type": "markdown",
56
"id": "f4bdc262",
67
"metadata": {},
@@ -24,6 +25,7 @@
2425
]
2526
},
2627
{
28+
"attachments": {},
2729
"cell_type": "markdown",
2830
"id": "49c44094",
2931
"metadata": {},
@@ -35,6 +37,7 @@
3537
]
3638
},
3739
{
40+
"attachments": {},
3841
"cell_type": "markdown",
3942
"id": "991a4343",
4043
"metadata": {},
@@ -77,6 +80,7 @@
7780
]
7881
},
7982
{
83+
"attachments": {},
8084
"cell_type": "markdown",
8185
"id": "808b4789",
8286
"metadata": {},
@@ -130,6 +134,7 @@
130134
]
131135
},
132136
{
137+
"attachments": {},
133138
"cell_type": "markdown",
134139
"id": "a22d0859",
135140
"metadata": {},
@@ -349,6 +354,7 @@
349354
]
350355
},
351356
{
357+
"attachments": {},
352358
"cell_type": "markdown",
353359
"id": "3243414e",
354360
"metadata": {},
@@ -395,6 +401,7 @@
395401
]
396402
},
397403
{
404+
"attachments": {},
398405
"cell_type": "markdown",
399406
"id": "b0b27a46",
400407
"metadata": {},
@@ -434,6 +441,7 @@
434441
]
435442
},
436443
{
444+
"attachments": {},
437445
"cell_type": "markdown",
438446
"id": "2114307f",
439447
"metadata": {},
@@ -476,6 +484,7 @@
476484
]
477485
},
478486
{
487+
"attachments": {},
479488
"cell_type": "markdown",
480489
"id": "62bbc8c8",
481490
"metadata": {},
@@ -596,6 +605,7 @@
596605
]
597606
},
598607
{
608+
"attachments": {},
599609
"cell_type": "markdown",
600610
"id": "b66c8142",
601611
"metadata": {},
@@ -661,6 +671,7 @@
661671
]
662672
},
663673
{
674+
"attachments": {},
664675
"cell_type": "markdown",
665676
"id": "5feb786b",
666677
"metadata": {},
@@ -735,6 +746,7 @@
735746
]
736747
},
737748
{
749+
"attachments": {},
738750
"cell_type": "markdown",
739751
"id": "9f861625",
740752
"metadata": {},
@@ -758,6 +770,7 @@
758770
]
759771
},
760772
{
773+
"attachments": {},
761774
"cell_type": "markdown",
762775
"id": "6ea49c8e-2d20-47d5-8fd6-2f51e4ef1e0e",
763776
"metadata": {},
@@ -781,17 +794,18 @@
781794
},
782795
"outputs": [],
783796
"source": [
784-
"config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
797+
"cabinetry_config = cabinetry.configuration.load(\"cabinetry_config.yml\")\n",
785798
"\n",
786799
"# rebinning: lower edge 110 GeV, merge bins 2->1\n",
787-
"rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))\n",
788-
"cabinetry.templates.build(config, router=rebinning_router)\n",
789-
"cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)\n",
790-
"ws = cabinetry.workspace.build(config)\n",
800+
"rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))\n",
801+
"cabinetry.templates.build(cabinetry_config, router=rebinning_router)\n",
802+
"cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)\n",
803+
"ws = cabinetry.workspace.build(cabinetry_config)\n",
791804
"cabinetry.workspace.save(ws, \"workspace.json\")"
792805
]
793806
},
794807
{
808+
"attachments": {},
795809
"cell_type": "markdown",
796810
"id": "6feae4d5",
797811
"metadata": {},
@@ -839,6 +853,7 @@
839853
]
840854
},
841855
{
856+
"attachments": {},
842857
"cell_type": "markdown",
843858
"id": "aab2493c",
844859
"metadata": {},
@@ -876,6 +891,7 @@
876891
]
877892
},
878893
{
894+
"attachments": {},
879895
"cell_type": "markdown",
880896
"id": "fe677e60",
881897
"metadata": {},
@@ -906,6 +922,7 @@
906922
]
907923
},
908924
{
925+
"attachments": {},
909926
"cell_type": "markdown",
910927
"id": "35e5a9aa",
911928
"metadata": {},
@@ -936,7 +953,7 @@
936953
],
937954
"source": [
938955
"model_prediction = cabinetry.model_utils.prediction(model)\n",
939-
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)\n",
956+
"figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)\n",
940957
"figs[0][\"figure\"]"
941958
]
942959
},
@@ -965,6 +982,7 @@
965982
]
966983
},
967984
{
985+
"attachments": {},
968986
"cell_type": "markdown",
969987
"id": "9908c2a2",
970988
"metadata": {},
@@ -994,7 +1012,7 @@
9941012
],
9951013
"source": [
9961014
"model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)\n",
997-
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)\n",
1015+
"figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)\n",
9981016
"figs[0][\"figure\"]"
9991017
]
10001018
},
@@ -1023,6 +1041,7 @@
10231041
]
10241042
},
10251043
{
1044+
"attachments": {},
10261045
"cell_type": "markdown",
10271046
"id": "269f8c3a",
10281047
"metadata": {},
@@ -1059,7 +1078,7 @@
10591078
"name": "python",
10601079
"nbconvert_exporter": "python",
10611080
"pygments_lexer": "ipython3",
1062-
"version": "3.8.16"
1081+
"version": "3.9.16"
10631082
}
10641083
},
10651084
"nbformat": 4,

analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -517,13 +517,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
517517
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.
518518

519519
# %%
520-
config = cabinetry.configuration.load("cabinetry_config.yml")
520+
cabinetry_config = cabinetry.configuration.load("cabinetry_config.yml")
521521

522522
# rebinning: lower edge 110 GeV, merge bins 2->1
523-
rebinning_router = utils.get_cabinetry_rebinning_router(config, rebinning=slice(110j, None, hist.rebin(2)))
524-
cabinetry.templates.build(config, router=rebinning_router)
525-
cabinetry.templates.postprocess(config) # optional post-processing (e.g. smoothing)
526-
ws = cabinetry.workspace.build(config)
523+
rebinning_router = utils.get_cabinetry_rebinning_router(cabinetry_config, rebinning=slice(110j, None, hist.rebin(2)))
524+
cabinetry.templates.build(cabinetry_config, router=rebinning_router)
525+
cabinetry.templates.postprocess(cabinetry_config) # optional post-processing (e.g. smoothing)
526+
ws = cabinetry.workspace.build(cabinetry_config)
527527
cabinetry.workspace.save(ws, "workspace.json")
528528

529529
# %% [markdown]
@@ -556,7 +556,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
556556

557557
# %%
558558
model_prediction = cabinetry.model_utils.prediction(model)
559-
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
559+
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=cabinetry_config)
560560
figs[0]["figure"]
561561

562562
# %%
@@ -567,7 +567,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
567567

568568
# %%
569569
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
570-
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
570+
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=cabinetry_config)
571571
figs[0]["figure"]
572572

573573
# %%

analyses/cms-open-data-ttbar/utils/__init__.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,22 @@ def __init__(self, fileset, backend_name="uproot", ignore_cache=False):
146146
def get_data_rootfiles_uri(self, query, as_signed_url=True, title="Untitled"):
147147

148148
all_files = np.array(self.ds.get_data_rootfiles_uri(query, as_signed_url=as_signed_url, title=title))
149-
parent_file_urls = np.array([f.file for f in all_files])
150149

151-
# order is not retained after transform, so we can match files to their parent files using the filename
152-
# (replacing / with : to mitigate servicex filename convention )
153-
parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
154-
for i in range(len(self.filelist))])
150+
try:
151+
# default matching for when ServiceX doesn't abbreviate names
152+
parent_file_urls = np.array([f.file for f in all_files])
153+
154+
# order is not retained after transform, so we can match files to their parent files using the filename
155+
# (replacing / with : to match the ServiceX filename convention)
156+
parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].replace("/",":"))[0][0]
157+
for i in range(len(self.filelist))])
158+
except:
159+
# fallback solution that relies on splitting via the port (the name only changes before that)
160+
# probably not very stable or general! This may fail — please report back if you observe that happening
161+
# TODO: find something more stable
162+
parent_file_urls = np.asarray([f.replace(":", "/").split("1094//")[-1] for f in np.array([f.file for f in all_files])])
163+
parent_key = np.array([np.where(parent_file_urls==self.filelist[i][0].split("1094//")[-1])[0][0]
164+
for i in range(len(self.filelist))])
155165

156166
files_per_process = {}
157167
for i, process in enumerate(self.fileset):

0 commit comments

Comments
 (0)