# Source code for grafei.model.dataset_utils
##########################################################################
# basf2 (Belle II Analysis Software Framework) #
# Author: The Belle II Collaboration #
# #
# See git log for contributors and copyright holders. #
# This file is licensed under LGPL-3.0, see LICENSE.md. #
##########################################################################
import numpy as np
import uproot
def populate_avail_samples(X, Y, B_reco=0):
    """
    Sifts through the file metadata to populate a list of available dataset samples.

    Args:
        X (list): List of ROOT array dicts for X (input) data.
        Y (list): List of ROOT array dicts for Y (ground truth) data.
        B_reco (int): Reconstruction mode flag (set automatically):

            .. math::
                \\text{Upsilon} (4S) = 0,\\ B^0 = 1,\\ B^+ = 2.

    Returns:
        list: List of available samples for training, as
        ``(file index, event index, B index)`` tuples.
    """
    avail_samples = []
    # Candidate B indices are fixed by the reconstruction mode, so compute once:
    # Upsilon(4S) mode uses the single combined tree (index 1), B-reconstruction
    # mode treats each of the two Bs as a separate candidate sample.
    b_indices = [1] if not B_reco else [1, 2]
    # Must iterate over Y because X contains a b_index of -1 for unmatched particles
    for i, f in enumerate(Y):
        # Per-file attributes are loop-invariant for all events in the file
        x_attrs = X[i]
        # Have to get event list differently
        events = x_attrs["event"]
        # Iterate over events in current file
        for evt_idx, _ in enumerate(events):
            # Per-event properties are invariant across candidate B indices
            evt_b_index = x_attrs["b_index"][evt_idx]
            evt_primary = x_attrs["primary"][evt_idx]
            for b_index in b_indices:
                # Check that LCA is not trivial (needs at least two leaves)
                if f[b_index]["n_LCA"][evt_idx] < 2:
                    continue
                # Particles coming from one or both Bs, depending on the mode
                matched = (evt_b_index != -1) if not B_reco else (evt_b_index == int(b_index))
                # Keeping only those where there are reconstructed particles
                if matched.sum() == 0:
                    continue
                # Skip events/Bs with one or fewer primaries reconstructed
                if np.sum(np.logical_and(matched, evt_primary)) < 2:
                    continue
                # If we made it to here a sample is valid and we add it to those available
                avail_samples.append((i, evt_idx, b_index))
    return avail_samples
def preload_root_data(root_files, features, discarded):
    """
    Load all training data from ROOT files into numpy-backed dictionaries.

    Note: branches are read with ``library="np"``, which materializes each
    branch as a numpy array at load time (not lazily in the uproot sense).

    Args:
        root_files (str): Path to ROOT files.
        features (list): List of feature names.
        discarded (list): List of features present in the ROOT files and not used as input,
            but used to calculate other quantities (e.g. edge features).

    Returns:
        list, list: Lists of dictionaries containing training information for input and ground-truth.
    """
    x = []
    y = []
    for path in root_files:
        with uproot.open(path)["Tree"] as tree:
            x_dict = {
                # Event numbers for this file
                "event": tree["event"].array(library="np"),
                "features": {feat: tree[feat].array(library="np") for feat in features},
                "discarded": {feat: tree[feat].array(library="np") for feat in discarded},
                # Needed to initialise the numpy features array in __getitem__
                "leaves": tree["leaves"].array(library="np"),
                "primary": tree["primary"].array(library="np"),
                "b_index": tree["b_index"].array(library="np"),
                "mc_pdg": tree["mcPDG"].array(library="np"),
            }
            # One ground-truth dict per B candidate (1 and 2);
            # n_LCA is needed to reshape the flattened LCA when loading
            y_dict = {
                b: {
                    "n_LCA": tree[f"n_LCA_leaves_{b}"].array(library="np"),
                    "LCA": tree[f"LCAS_{b}"].array(library="np"),
                    "LCA_leaves": tree[f"LCA_leaves_{b}"].array(library="np"),
                }
                for b in (1, 2)
            }
            x.append(x_dict)
            y.append(y_dict)
    return x, y