Skip to content

Commit

Permalink
feat: add validation script and histogram references (#170)
Browse files Browse the repository at this point in the history
* add histogram validation script
* add reference counts for various file settings
  • Loading branch information
alexander-held authored Jun 29, 2023
1 parent 1d45a2e commit 9a0a3db
Show file tree
Hide file tree
Showing 10 changed files with 65,932 additions and 0 deletions.
7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_100_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_10_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_1_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_200_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_20_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_2_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_50_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_5_file_per_process.json

Large diffs are not rendered by default.

7,318 changes: 7,318 additions & 0 deletions analyses/cms-open-data-ttbar/reference/histos_all_file_per_process.json

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions analyses/cms-open-data-ttbar/validate_histograms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Compare the content of histograms produced by ttbar_analysis_pipeline with a reference file.
# Reference files for several values of N_FILES_MAX_PER_SAMPLE are available in directory `reference/`.

from __future__ import annotations
import argparse
from collections import defaultdict
import json
import numpy as np
import sys
import uproot

def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the histogram validation script.

    ``--histos`` names the ROOT file to read (default ``histograms.root``).
    Exactly one of ``--reference`` and ``--dump-json`` must be supplied; the
    two options are mutually exclusive.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--histos",
        default="histograms.root",
        help="ROOT file containing the output histograms. Defaults to './histograms.root'.",
    )
    # the two modes of operation cannot be combined, and one of them is mandatory
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--reference",
        help="JSON reference against which histogram contents should be compared",
    )
    mode.add_argument(
        "--dump-json",
        action="store_true",
        help="Print JSON representation of histogram contents to screen",
    )
    return cli.parse_args()

def as_dict(f: uproot.ReadOnlyDirectory) -> dict[str, dict]:
    """Convert an uproot file containing only TH1Ds to a JSON-compatible dict.

    The result has the structure
    ``{"histo1": {"edges": [...], "contents": [...]}, "histo2": {...}, ...}``.
    Only the highest namecycle for every histogram is considered, and cycles
    are stripped from the histogram names. Contents are read with
    ``flow=True``.
    """
    # drop everything after the rightmost ";" (if any), assuming it is a namecycle
    unique_names = {key.rsplit(";", 1)[0] for key in f}
    result = defaultdict(dict)
    for hist_name in unique_names:
        hist = f[hist_name]
        assert isinstance(hist, uproot.behaviors.TH1.Histogram)
        result[hist_name] = {
            "edges": hist.axis().edges().tolist(),
            "contents": hist.counts(flow=True).tolist(),
        }
    return result

def validate(histos: dict, reference: dict) -> dict[str, list[str]]:
    """Compare histograms against a reference and collect the mismatches.

    Args:
        histos: histograms to check, mapping each name to a dict with the
            keys ``"edges"`` and ``"contents"``.
        reference: expected histograms, with the same structure.

    Returns:
        Mapping from histogram name to the list of error messages found for
        it. The mapping is empty when every histogram in ``reference`` is
        present in ``histos`` and matches. Histograms that appear only in
        ``histos`` are not checked.
    """
    errors = defaultdict(list)
    for name, ref_h in reference.items():
        if name not in histos:
            errors[name].append("Histogram not found.")
            continue

        h = histos[name]
        if not _arrays_match(h['edges'], ref_h['edges']):
            errors[name].append(f"Edges do not match:\n\tgot {h['edges']}\n\texpected {ref_h['edges']}")
        # skip checking the contents of these histograms as they are not stable
        contents_depend_on_rng = "pt_res_up" in name
        if not contents_depend_on_rng and not _arrays_match(h['contents'], ref_h['contents']):
            errors[name].append(f"Contents do not match:\n\tgot {h['contents']}\n\texpected {ref_h['contents']}")

    return errors


def _arrays_match(got, expected) -> bool:
    """Return True if both sequences have the same shape and are element-wise close."""
    got_arr = np.asarray(got)
    expected_arr = np.asarray(expected)
    # np.allclose broadcasts its arguments: without this explicit shape check a
    # length-1 array would be compared against every reference bin (and could
    # wrongly pass), while other length mismatches would raise ValueError
    # instead of being reported as a validation error.
    if got_arr.shape != expected_arr.shape:
        return False
    return bool(np.allclose(got_arr, expected_arr))

if __name__ == "__main__":
    args = parse_args()

    # convert the histograms to plain Python data while the ROOT file is open
    with uproot.open(args.histos) as root_file:
        histos = as_dict(root_file)

    # dump mode: print the histograms as JSON and stop, no comparison is made
    if args.dump_json:
        print(json.dumps(histos, indent=2, sort_keys=True))
        sys.exit(0)

    with open(args.reference) as ref_file:
        ref_histos = json.load(ref_file)

    print(f"Validating '{args.histos}' against reference '{args.reference}'...")
    errs = validate(histos=histos, reference=ref_histos)
    if errs:
        # one entry per failing histogram, its messages indented underneath;
        # a non-zero exit status signals the failed validation
        for hist_name, messages in errs.items():
            joined = '\n\t'.join(messages)
            print(f"{hist_name}\n\t{joined}")
        sys.exit(1)

    print("All good!")

0 comments on commit 9a0a3db

Please sign in to comment.