Plot #53

Merged
merged 9 commits into from
Dec 4, 2024
2 changes: 1 addition & 1 deletion notebooks/bdt/hhh_qcd_bdt.ipynb
@@ -679,7 +679,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
"version": "3.11.9"
}
},
"nbformat": 4,
696 changes: 696 additions & 0 deletions notebooks/benchmark/west/target_weight.ipynb

Large diffs are not rendered by default.

228 changes: 228 additions & 0 deletions notebooks/benchmark/west/trained_on_hh_B+R.ipynb

Large diffs are not rendered by default.

354 changes: 354 additions & 0 deletions notebooks/benchmark/west/trained_on_hh_resolved.ipynb

Large diffs are not rendered by default.

219 changes: 219 additions & 0 deletions notebooks/benchmark/west/trained_on_hhh_B+R.ipynb

Large diffs are not rendered by default.

533 changes: 533 additions & 0 deletions notebooks/benchmark/west/trained_on_hhh_B+R_infer_on_QCD.ipynb

Large diffs are not rendered by default.

364 changes: 364 additions & 0 deletions notebooks/benchmark/west/trained_on_hhh_resolved.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion notebooks/dev/mix_basline_bdt.ipynb
@@ -660,7 +660,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
"version": "3.11.9"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion notebooks/dev/mix_basline_sota.ipynb
@@ -502,7 +502,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
"version": "3.11.9"
}
},
"nbformat": 4,
179 changes: 179 additions & 0 deletions notebooks/example_plot_higgs_metrics.ipynb

Large diffs are not rendered by default.

656 changes: 656 additions & 0 deletions notebooks/example_plot_mass_sculpting.ipynb

Large diffs are not rendered by default.

117 changes: 51 additions & 66 deletions src/analysis/boosted.py
@@ -88,83 +88,66 @@ def gen_target_bH_LUT(bb_ps_passed, bb_ts_selected, targetH_pts):
# each entry corresponds to [recoH correct or not, reco H pt]
# or
# [targetH retrieved or not, target H pt]
def parse_boosted_w_target(testfile, predfile):
# Collect H pt, mask, target and predicted jet and fjets for 3 Hs in each event
# h pt
bh1_pt = np.array(testfile["TARGETS"]["bh1"]["pt"])
bh2_pt = np.array(testfile["TARGETS"]["bh2"]["pt"])
bh3_pt = np.array(testfile["TARGETS"]["bh3"]["pt"])

# mask
bh1_mask = np.array(testfile["TARGETS"]["bh1"]["mask"])
bh2_mask = np.array(testfile["TARGETS"]["bh2"]["mask"])
bh3_mask = np.array(testfile["TARGETS"]["bh3"]["mask"])

# target assignment
bb_bh1_t = np.array(testfile["TARGETS"]["bh1"]["bb"])
bb_bh2_t = np.array(testfile["TARGETS"]["bh2"]["bb"])
bb_bh3_t = np.array(testfile["TARGETS"]["bh3"]["bb"])

try:
# pred assignment
bb_bh1_p = np.array(predfile["TARGETS"]["bh1"]["bb"])
bb_bh2_p = np.array(predfile["TARGETS"]["bh2"]["bb"])
bb_bh3_p = np.array(predfile["TARGETS"]["bh3"]["bb"])

# boosted Higgs detection probability
dp_bh1 = np.array(predfile["TARGETS"]["bh1"]["detection_probability"])
dp_bh2 = np.array(predfile["TARGETS"]["bh2"]["detection_probability"])
dp_bh3 = np.array(predfile["TARGETS"]["bh3"]["detection_probability"])

# fatjet assignment probability
ap_bh1 = np.array(predfile["TARGETS"]["bh1"]["assignment_probability"])
ap_bh2 = np.array(predfile["TARGETS"]["bh2"]["assignment_probability"])
ap_bh3 = np.array(predfile["TARGETS"]["bh3"]["assignment_probability"])
except:
# pred assignment
bb_bh1_p = np.array(predfile["TARGETS"]["bh1"]["bb"]) + 10
bb_bh2_p = np.array(predfile["TARGETS"]["bh2"]["bb"]) + 10
bb_bh3_p = np.array(predfile["TARGETS"]["bh3"]["bb"]) + 10

# boosted Higgs detection probability
dp_bh1 = np.array(predfile["TARGETS"]["bh1"]["mask"]).astype("float")
dp_bh2 = np.array(predfile["TARGETS"]["bh2"]["mask"]).astype("float")
dp_bh3 = np.array(predfile["TARGETS"]["bh3"]["mask"]).astype("float")

# fatjet assignment probability
ap_bh1 = np.array(predfile["TARGETS"]["bh1"]["mask"]).astype("float")
ap_bh2 = np.array(predfile["TARGETS"]["bh2"]["mask"]).astype("float")
ap_bh3 = np.array(predfile["TARGETS"]["bh3"]["mask"]).astype("float")

# collect fatjet pt
fj_pt = np.array(testfile["INPUTS"]["BoostedJets"]["fj_pt"])

dps = np.concatenate((dp_bh1.reshape(-1, 1), dp_bh2.reshape(-1, 1), dp_bh3.reshape(-1, 1)), axis=1)
aps = np.concatenate((ap_bh1.reshape(-1, 1), ap_bh2.reshape(-1, 1), ap_bh3.reshape(-1, 1)), axis=1)

# convert some arrays to ak array
bb_ps = np.concatenate((bb_bh1_p.reshape(-1, 1), bb_bh2_p.reshape(-1, 1), bb_bh3_p.reshape(-1, 1)), axis=1)
def parse_boosted_w_target(testfile, predfile, num_higgs=3):
# Initialize lists to store variables dynamically
dp_bh = []
ap_bh = []
bb_bh_p = []
bb_bh_t = []
bh_masks_list = []
bh_pts_list = []

for i in range(1, num_higgs + 1):
# Collect target pt, mask, and assignment for each Higgs
bh_pt = np.array(testfile['TARGETS'][f'bh{i}']['pt'])
bh_mask = np.array(testfile['TARGETS'][f'bh{i}']['mask'])
bb_bh_t.append(np.array(testfile['TARGETS'][f'bh{i}']['bb']))
bh_masks_list.append(bh_mask.reshape(-1, 1))
bh_pts_list.append(bh_pt.reshape(-1, 1))

try:
# Collect predicted assignment, detection probability, and fatjet assignment probability
bb_bh_p.append(np.array(predfile['TARGETS'][f'bh{i}']['bb']))
dp_bh.append(np.array(predfile['TARGETS'][f'bh{i}']['detection_probability']))
ap_bh.append(np.array(predfile['TARGETS'][f'bh{i}']['assignment_probability']))
except:
# In case of missing prediction, apply fallback logic
bb_bh_p.append(np.array(predfile['TARGETS'][f'bh{i}']['bb']) + 10)
dp_bh.append(np.array(predfile['TARGETS'][f'bh{i}']['mask']).astype('float'))
ap_bh.append(np.array(predfile['TARGETS'][f'bh{i}']['mask']).astype('float'))

# Collect fatjet pt
fj_pt = np.array(testfile['INPUTS']['BoostedJets']['fj_pt'])

# Concatenate detection and assignment probabilities into arrays
dps = np.concatenate([dp.reshape(-1, 1) for dp in dp_bh], axis=1)
aps = np.concatenate([ap.reshape(-1, 1) for ap in ap_bh], axis=1)

# Convert bb predictions and targets to awkward arrays
bb_ps = np.concatenate([bb.reshape(-1, 1) for bb in bb_bh_p], axis=1)
bb_ps = ak.Array(bb_ps)
bb_ts = np.concatenate((bb_bh1_t.reshape(-1, 1), bb_bh2_t.reshape(-1, 1), bb_bh3_t.reshape(-1, 1)), axis=1)
bb_ts = np.concatenate([bb.reshape(-1, 1) for bb in bb_bh_t], axis=1)
bb_ts = ak.Array(bb_ts)
fj_pt = ak.Array(fj_pt)
bh_masks = np.concatenate((bh1_mask.reshape(-1, 1), bh2_mask.reshape(-1, 1), bh3_mask.reshape(-1, 1)), axis=1)

# Combine masks and pt values dynamically for all Higgs
bh_masks = np.concatenate(bh_masks_list, axis=1)
bh_masks = ak.Array(bh_masks)
bh_pts = np.concatenate((bh1_pt.reshape(-1, 1), bh2_pt.reshape(-1, 1), bh3_pt.reshape(-1, 1)), axis=1)
bh_pts = np.concatenate(bh_pts_list, axis=1)
bh_pts = ak.Array(bh_pts)

# select predictions and targets
# Select predictions and targets
bb_ps_selected = sel_pred_bH_by_dp_ap(dps, aps, bb_ps)
bb_ts_selected, targetH_selected_pts = sel_target_bH_by_mask(bb_ts, bh_pts, bh_masks)

# generate correct/retrieved LUT for pred/target respectively
# Generate correct/retrieved LUT for pred/target respectively
LUT_pred = gen_pred_bH_LUT(bb_ps_selected, bb_ts_selected, fj_pt)
LUT_target = gen_target_bH_LUT(bb_ps_selected, bb_ts_selected, targetH_selected_pts)

# reconstruct bH to remove overlapped ak4 jets
fj_eta = np.array(testfile["INPUTS"]["BoostedJets"]["fj_eta"])
fj_phi = np.array(testfile["INPUTS"]["BoostedJets"]["fj_phi"])
fj_mass = np.array(testfile["INPUTS"]["BoostedJets"]["fj_mass"])
# Reconstruct bH to remove overlapped ak4 jets
fj_eta = np.array(testfile['INPUTS']['BoostedJets']['fj_eta'])
fj_phi = np.array(testfile['INPUTS']['BoostedJets']['fj_phi'])
fj_mass = np.array(testfile['INPUTS']['BoostedJets']['fj_mass'])

fjs = ak.zip(
{
@@ -173,8 +156,10 @@ def parse_boosted_w_target(testfile, predfile):
"phi": fj_phi,
"mass": fj_mass,
},
with_name="Momentum4D",
with_name="Momentum4D"
)
fj_reco = fjs[bb_ps_selected - 10]

# Return the predicted and target LUTs and the reconstructed jets
return LUT_pred, LUT_target, fj_reco
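
For reference, a minimal usage sketch of the generalized parser; the file paths, the HH (num_higgs=2) case, and the variable names are illustrative assumptions, not part of this PR:

import h5py as h5

from src.analysis.boosted import parse_boosted_w_target

# Hypothetical file locations; point these at the actual test/prediction HDF5 files.
target_path = "data/hh_test.h5"
pred_path = "data/hh_pred.h5"

# Assumes both files already expose a "TARGETS" group with bh1..bhN entries.
with h5.File(target_path, "r") as target_h5, h5.File(pred_path, "r") as pred_h5:
    # num_higgs defaults to 3 (HHH); pass 2 to reuse the same loop for HH samples.
    LUT_pred, LUT_target, fj_reco = parse_boosted_w_target(target_h5, pred_h5, num_higgs=2)

# Each LUT holds, per event, entries of [correct/retrieved flag, Higgs pT].
print(len(LUT_pred), len(LUT_target))

The try/except fallback keeps the same interface when a prediction file lacks detection and assignment probabilities, so the caller does not need to branch on file contents.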

114 changes: 64 additions & 50 deletions src/analysis/plot.py
@@ -1,13 +1,16 @@
import h5py as h5
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np

from src.analysis.boosted import parse_boosted_w_target
from src.analysis.resolved import parse_resolved_w_target
from src.analysis.utils import calc_eff, calc_pur

hep.style.use("CMS")

def calc_pur_eff(target_path, pred_path, bins):

def calc_pur_eff(target_path, pred_path, bins, num_higgs=3):
# open files
pred_h5 = h5.File(pred_path, "a")
target_h5 = h5.File(target_path)
@@ -18,9 +21,9 @@ def calc_pur_eff(target_path, pred_path, bins):
pred_h5["TARGETS"] = pred_h5["SpecialKey.Targets"]

# generate look up tables
LUT_boosted_pred, LUT_boosted_target, fjs_reco = parse_boosted_w_target(target_h5, pred_h5)
LUT_resolved_pred, LUT_resolved_target, _ = parse_resolved_w_target(target_h5, pred_h5, fjs_reco=None)
LUT_resolved_wOR_pred, LUT_resolved_wOR_target, _ = parse_resolved_w_target(target_h5, pred_h5, fjs_reco=fjs_reco)
LUT_boosted_pred, LUT_boosted_target, fjs_reco = parse_boosted_w_target(target_h5, pred_h5, num_higgs)
LUT_resolved_pred, LUT_resolved_target, _ = parse_resolved_w_target(target_h5, pred_h5, fjs_reco=None, num_higgs=num_higgs)
LUT_resolved_wOR_pred, LUT_resolved_wOR_target, _ = parse_resolved_w_target(target_h5, pred_h5, fjs_reco=fjs_reco, num_higgs=num_higgs)

LUT_resolved_pred_no_OR = []
for event in LUT_resolved_wOR_pred:
@@ -40,18 +43,26 @@ def calc_pur_eff(target_path, pred_path, bins):

# calculate efficiencies and purities for b+r, b, and r
results = {}
results["pur_m"], results["purerr_m"] = calc_eff(LUT_boosted_pred, LUT_resolved_wOR_pred, bins)
results["eff_m"], results["efferr_m"] = calc_pur(LUT_boosted_target, LUT_resolved_wOR_target, bins)

results["pur_b"], results["purerr_b"] = calc_eff(LUT_boosted_pred, None, bins)
results["eff_b"], results["efferr_b"] = calc_pur(LUT_boosted_target, None, bins)

results["pur_r"], results["purerr_r"] = calc_eff(None, LUT_resolved_pred, bins)
results["eff_r"], results["efferr_r"] = calc_pur(None, LUT_resolved_target, bins)

results["pur_r_or"], results["purerr_r_or"] = calc_eff(None, LUT_resolved_pred_no_OR, bins)
results["eff_r_or"], results["efferr_r_or"] = calc_pur(None, LUT_resolved_target_no_OR, bins)

results["pur_m"], results["purerr_m"], avg_pur_m, n_correct_pred_m = calc_pur(LUT_boosted_pred, LUT_resolved_wOR_pred, bins)
results["eff_m"], results["efferr_m"], avg_eff_m, n_reco_target_m = calc_eff(LUT_boosted_target, LUT_resolved_wOR_target, bins)

results["pur_b"], results["purerr_b"], avg_pur_b, n_correct_pred_b = calc_pur(LUT_boosted_pred, None, bins)
results["eff_b"], results["efferr_b"], avg_eff_b, n_reco_target_b = calc_eff(LUT_boosted_target, None, bins)

results["pur_r"], results["purerr_r"], avg_pur_r, n_correct_pred_r = calc_pur(None, LUT_resolved_pred, bins)
results["eff_r"], results["efferr_r"], avg_eff_r, n_reco_target_r = calc_eff(None, LUT_resolved_target, bins)

results["pur_r_or"], results["purerr_r_or"], _, _ = calc_pur(None, LUT_resolved_pred_no_OR, bins)
results["eff_r_or"], results["efferr_r_or"], _, _ = calc_eff(None, LUT_resolved_target_no_OR, bins)

print("Average purity:")
print("merged", avg_pur_m, "boosted", avg_pur_b, "resolved", avg_pur_r)
print("Average efficiency:")
print("merged", avg_eff_m, "boosted", avg_eff_b, "resolved", avg_eff_r)
print("Number of correct Higgs canddiate predictions")
print("merged", n_correct_pred_m, "boosted", n_correct_pred_b, "resolved", n_correct_pred_r)
print("Number of reconstructed Higgs targets")
print("merged", n_reco_target_m, "boosted", n_reco_target_b, "resolved", n_reco_target_r)
print("Number of Boosted Prediction:", np.array([pred for event in LUT_boosted_pred for pred in event]).shape[0])
print(
"Number of Resolved Prediction before OR:",
@@ -67,7 +78,7 @@ def calc_pur_eff(target_path, pred_path, bins):

# I started to use "efficiency" for describing how many gen Higgs were reconstructed
# and "purity" for describing how many reco Higgs are actually gen Higgs
def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None, bins=None):
def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None, bins=None, num_higgs=3):
if bins == None:
bins = np.arange(0, 1050, 50)

@@ -78,57 +89,59 @@ def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None,
# m: merged (b+r w OR)
# b: boosted
# r: resolved
fig_m, ax_m = plt.subplots(1, 2, figsize=(12, 5))
fig_b, ax_b = plt.subplots(1, 2, figsize=(12, 5))
fig_r, ax_r = plt.subplots(1, 2, figsize=(12, 5))
fig_r_or, ax_r_or = plt.subplots(1, 2, figsize=(12, 5))
fig_m, ax_m = plt.subplots(1, 2, figsize=(24, 10))
fig_b, ax_b = plt.subplots(1, 2, figsize=(24, 10))
fig_r, ax_r = plt.subplots(1, 2, figsize=(24, 10))
fig_r_or, ax_r_or = plt.subplots(1, 2, figsize=(24, 10))

# preset figure labels, titles, limits, etc.
ax_m[0].set(
xlabel=r"Merged Reco H pT (GeV)",
xlabel=r"Reco. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Purity",
title=f"Reconstruction Purity vs. Merged Reco H pT",
# title=f"Reconstruction Purity vs. Merged Reco H pT",
)
ax_m[1].set(
xlabel=r"Merged Gen H pT (GeV)",
xlabel=r"Gen. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Efficiency",
title=f"Reconstruction Efficiency vs. Merged Gen H pT",
# title=f"Reconstruction Efficiency vs. Merged Gen H pT",
)
ax_b[0].set(
xlabel=r"Reco Boosted H pT (GeV)",
xlabel=r"Reco. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Purity",
title=f"Reconstruction Purity vs. Reco Boosted H pT",
# title=f"Reconstruction Purity vs. Reco Boosted H pT",
)
ax_b[1].set(
xlabel=r"Gen Boosted H pT (GeV)",
xlabel=r"Gen. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Efficiency",
title=f"Reconstruction Efficiency vs. Gen Boosted H pT",
# title=f"Reconstruction Efficiency vs. Gen Boosted H pT",
)
ax_r[0].set(
xlabel=r"Reco Resolved H pT (GeV)",
xlabel=r"Reco. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Purity",
title=f"Reconstruction Purity vs. Reco Resolved H pT",
# title=f"Reconstruction Purity vs. Reco Resolved H pT",
)
ax_r[1].set(
xlabel=r"Gen Resolved H pT (GeV)",
xlabel=r"Gen. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Efficiency",
title=f"Reconstruction Efficiency vs. Gen Resolved H pT",
# title=f"Reconstruction Efficiency vs. Gen Resolved H pT",
)
ax_r_or[0].set(
xlabel=r"Reco Resolved H pT (GeV)",
xlabel=r"Reco. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Purity",
title=f"Resolved Purity After OR vs. Reco Resolved H pT",
# title=f"Resolved Purity After OR vs. Reco Resolved H pT",
)
ax_r_or[1].set(
xlabel=r"Gen Resolved H pT (GeV)",
xlabel=r"Gen. H $p_\mathrm{T}$ GeV",
ylabel=r"Reconstruction Efficiency",
title=f"Resolved Efficiency After OR vs. Gen Resolved H pT",
# title=f"Resolved Efficiency After OR vs. Gen Resolved H pT",
)

# plot purities and efficiencies
for tag, pred_path in plot_dict.items():
print("Processing", tag)
results = calc_pur_eff(target_path, pred_path, bins)

results = calc_pur_eff(target_path, pred_path, bins, num_higgs)

ax_m[0].errorbar(
x=bin_centers, y=results["pur_m"], xerr=xerr, yerr=results["purerr_m"], fmt="o", capsize=5, label=tag
)
@@ -155,29 +168,30 @@ def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None,
)

# adjust limits and legends
ax_m[0].legend()
ax_m[1].legend()
event_type = "H" * num_higgs
ax_m[0].legend(title=f'{event_type} Boosted+Resolved')
ax_m[1].legend(title=f'{event_type} Boosted+Resolved')
ax_m[0].set_ylim([-0.1, 1.1])
ax_m[1].set_ylim([-0.1, 1.1])
ax_b[0].legend()
ax_b[1].legend()
ax_b[0].legend(title=f'{event_type} Boosted')
ax_b[1].legend(title=f'{event_type} Boosted')
ax_b[0].set_ylim([-0.1, 1.1])
ax_b[1].set_ylim([-0.1, 1.1])
ax_r[0].legend()
ax_r[1].legend()
ax_r[0].legend(title=f'{event_type} Resolved')
ax_r[1].legend(title=f'{event_type} Resolved')
ax_r[0].set_ylim([-0.1, 1.1])
ax_r[1].set_ylim([-0.1, 1.1])
ax_r_or[0].legend()
ax_r_or[1].legend()
ax_r_or[0].legend(title=f'{event_type} Resolved+OR')
ax_r_or[1].legend(title=f'{event_type} Resolved+OR')
ax_r_or[0].set_ylim([-0.1, 1.1])
ax_r_or[1].set_ylim([-0.1, 1.1])

plt.show()

if save_path is not None:
fig_m.savefig(f"{save_path}/{proj_name}_merged.png")
fig_b.savefig(f"{save_path}/{proj_name}_boosted.png")
fig_r.savefig(f"{save_path}/{proj_name}_resolved.png")
fig_r.savefig(f"{save_path}/{proj_name}_resolved_wOR.png")
fig_m.savefig(f"{save_path}/{proj_name}_merged.pdf", format='pdf')
fig_b.savefig(f"{save_path}/{proj_name}_boosted.pdf", format='pdf')
fig_r.savefig(f"{save_path}/{proj_name}_resolved.pdf", format='pdf')
fig_r_or.savefig(f"{save_path}/{proj_name}_resolved_wOR.pdf", format='pdf')

return
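
As a usage note (not part of the diff), a hedged sketch of how the updated plotting entry point might be driven; all tags, paths, and the project name are hypothetical:

import numpy as np

from src.analysis.plot import plot_pur_eff_w_dict

# Hypothetical legend tags mapped to prediction HDF5 files.
plot_dict = {
    "baseline": "predictions/baseline_pred.h5",
    "retrained": "predictions/retrained_pred.h5",
}

plot_pur_eff_w_dict(
    plot_dict,
    target_path="data/hhh_test.h5",   # assumed test-file path
    save_path="plots",                # figures are written as <proj_name>_*.pdf
    proj_name="hhh_benchmark",
    bins=np.arange(0, 1050, 50),      # same default binning the function falls back to
    num_higgs=3,                      # 3 for HHH events, 2 for HH
)

The dict keys become the legend entries on each purity/efficiency panel, and num_higgs is forwarded to both the boosted and resolved parsers so the same plotting code serves HH and HHH samples.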