fix: sync changes from #185 to script (#186)
* run jupytext to sync notebook changes to script version
alexander-held authored Aug 4, 2023
1 parent 24a069b commit 4f5a661
Showing 1 changed file with 28 additions and 23 deletions.
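The commit message above says the change was produced by running jupytext to sync the notebook into its paired script. As a rough sketch (assumed, not part of this commit), a one-way regeneration with the jupytext Python API would look like this; in practice the `jupytext --sync` command-line entry point does the same pairing-aware update in one call:

import jupytext

# read the paired notebook and rewrite the percent-format script shown in this diff
nb = jupytext.read("ttbar_analysis_pipeline.ipynb")  # assumed notebook name, sitting next to the script
jupytext.write(nb, "ttbar_analysis_pipeline.py", fmt="py:percent")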
51 changes: 28 additions & 23 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
@@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.1
# jupytext_version: 1.14.7
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
@@ -40,7 +40,7 @@
# %% [markdown]
# ### Imports: setting up our environment

# %% tags=[]
# %%
import asyncio
import logging
import os
@@ -89,7 +89,7 @@
#
# The input files are all in the 1–3 GB range.

# %% tags=[]
# %%
### GLOBAL CONFIGURATION
# input files per process, set to e.g. 10 (smaller number = faster)
N_FILES_MAX_PER_SAMPLE = 5
@@ -114,7 +114,7 @@
# - calculating systematic uncertainties at the event and object level,
# - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`.

# %% tags=[]
# %%
# functions creating systematic variations
def flat_variation(ones):
# 2.5% weight variations
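The body of `flat_variation` is collapsed in this diff. As a hedged sketch of what a flat 2.5% weight variation typically looks like (assumed, not the verbatim repository code):

import numpy as np

def flat_variation_sketch(ones):
    # scale an array of unit weights up and down by 2.5%;
    # returns shape (2, n_events): [up, down] variations
    return (1.0 + np.array([0.025, -0.025]))[:, None] * ones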
@@ -316,8 +316,13 @@ def postprocess(self, accumulator):
#
# Here, we gather all the required information about the files we want to process: paths to the files and associated metadata.

# %% tags=[]
fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"]) # local files on /data for ssl-dev
# %%
fileset = utils.construct_fileset(
N_FILES_MAX_PER_SAMPLE,
use_xcache=False,
af_name=config["benchmarking"]["AF_NAME"],
input_from_eos=config["benchmarking"]["INPUT_FROM_EOS"]
) # local files on /data for ssl-dev as af_name

print(f"processes in fileset: {list(fileset.keys())}")
print(f"\nexample of information in fileset:\n{{\n 'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
@@ -329,7 +334,7 @@ def postprocess(self, accumulator):
#
# Define the func_adl query to be used for the purpose of extracting columns and filtering.

# %% tags=[]
# %%
def get_query(source: ObjectStream) -> ObjectStream:
"""Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns
"""
@@ -355,7 +360,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# Using the queries created with `func_adl`, we use `ServiceX` to read the CMS Open Data files and build cached files containing only the specific event information dictated by the query.

# %% tags=[]
# %%
if USE_SERVICEX:
# dummy dataset on which to generate the query
dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot")
@@ -385,7 +390,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# When `USE_SERVICEX` is false, the input files need to be processed during this step as well.

# %% tags=[]
# %%
NanoAODSchema.warn_missing_crossrefs = False # silences warnings about branches we will not use here
if USE_DASK:
executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
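The run itself is collapsed in this diff. A hedged sketch of how such an executor is typically handed to a coffea 0.7-style `Runner` (the processor class name `TtbarAnalysis` is a placeholder, not taken from this file):

run = processor.Runner(
    executor=executor,
    schema=NanoAODSchema,
    savemetrics=True,       # also return execution metrics
    chunksize=100_000,      # events per chunk, tune per facility
)
all_histograms, metrics = run(fileset, treename="Events", processor_instance=TtbarAnalysis())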
@@ -410,7 +415,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

print(f"\nexecution took {exec_time:.2f} seconds")

# %% tags=[]
# %%
# track metrics
dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094" # TODO: xcache support
metrics.update({
@@ -448,15 +453,15 @@ def get_query(source: ObjectStream) -> ObjectStream:
# Let's have a look at the data we obtained.
# We built histograms in two phase space regions, for multiple physics processes and systematic variations.

# %% tags=[]
# %%
utils.set_style()

all_histograms[110j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, 1 b-tag")
plt.xlabel("HT [GeV]");

# %% tags=[]
# %%
all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1,edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, >= 2 b-tags")
@@ -471,7 +476,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# We are making use of [UHI](https://uhi.readthedocs.io/) here to re-bin.

# %% tags=[]
# %%
# b-tagging variations
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2)
@@ -482,7 +487,7 @@
plt.xlabel("HT [GeV]")
plt.title("b-tagging variations");

# %% tags=[]
# %%
# jet energy scale variations
all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2)
@@ -497,7 +502,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We'll save everything to disk for subsequent usage.
# This also builds pseudo-data by combining events from the various simulation setups we have processed.

# %% tags=[]
# %%
utils.save_histograms(all_histograms, fileset, "histograms.root")

# %% [markdown]
@@ -510,7 +515,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# A statistical model has been defined in `cabinetry_config.yml`, ready to be used with our output.
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

# %% tags=[]
# %%
config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
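The rest of this cell is collapsed in the diff. A hedged sketch of the usual `cabinetry` sequence that turns the saved histograms into the `ws` workspace fitted below (standard cabinetry API calls, rebinning omitted; not the verbatim repository code):

cabinetry.templates.collect(config)        # read the histograms referenced by the config
cabinetry.templates.postprocess(config)    # optional template fix-ups
ws = cabinetry.workspace.build(config)     # assemble the pyhf workspace
cabinetry.workspace.save(ws, "workspace.json")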
@@ -523,13 +528,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.

# %% tags=[]
# %%
# !pyhf inspect workspace.json | head -n 20

# %% [markdown]
# Let's try out what we built: the next cell performs a maximum likelihood fit of our statistical model to the pseudodata.

# %% tags=[]
# %%
model, data = cabinetry.model_utils.model_and_data(ws)
fit_results = cabinetry.fit.fit(model, data)
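As an alternative to the `cabinetry` fit above, the same inference can be run directly with `pyhf`; a hedged sketch using the saved workspace file:

import json
import pyhf

spec = json.load(open("workspace.json"))
workspace = pyhf.Workspace(spec)
model = workspace.model()
data = workspace.data(model)
bestfit = pyhf.infer.mle.fit(data, model)   # maximum likelihood estimate of all parameters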

@@ -540,31 +545,31 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?

# %% tags=[]
# %%
poi_index = model.config.poi_index
print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}")

# %% [markdown]
# Let's also visualize the model before and after the fit, in both the regions we are using.
# The binning here corresponds to the binning used for the fit.

# %% tags=[]
# %%
model_prediction = cabinetry.model_utils.prediction(model)
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
figs[0]["figure"]

# %% tags=[]
# %%
figs[1]["figure"]

# %% [markdown]
# We can see very good post-fit agreement.

# %% tags=[]
# %%
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
figs[0]["figure"]

# %% tags=[]
# %%
figs[1]["figure"]

# %% [markdown]