# Source code for grafei.model.dataset_utils
##########################################################################
# basf2 (Belle II Analysis Software Framework) #
# Author: The Belle II Collaboration #
# #
# See git log for contributors and copyright holders. #
# This file is licensed under LGPL-3.0, see LICENSE.md. #
##########################################################################
import numpy as np
import uproot
def populate_avail_samples(X, Y, B_reco=0):
    """
    Sifts through the file metadata to populate a list of available dataset samples.

    Args:
        X (list): List of ROOT array dicts for X (input) data.
        Y (list): List of ROOT array dicts for Y (ground truth) data.
        B_reco (int): Reconstruction mode flag (set automatically):

            .. math::
                \\text{Upsilon} (4S) = 0,\\ B^0 = 1,\\ B^+ = 2.

    Returns:
        list: List of available samples for training, as
        ``(file index, event index, B index)`` tuples.
    """
    avail_samples = []
    # Candidate B indices are fixed by the reconstruction mode, so compute once:
    # Upsilon(4S) mode uses the single combined tree (index 1), B-reconstruction
    # mode treats each of the two Bs as a separate candidate sample.
    b_indices = [1] if not B_reco else [1, 2]
    # Must iterate over Y because X contains a b_index of -1 for unmatched particles
    for i, f in enumerate(Y):
        # Per-file attributes are loop-invariant for all events in the file
        x_attrs = X[i]
        # Have to get event list differently
        events = x_attrs["event"]
        # Iterate over events in current file
        for evt_idx, _ in enumerate(events):
            # Per-event properties are invariant across candidate B indices
            evt_b_index = x_attrs["b_index"][evt_idx]
            evt_primary = x_attrs["primary"][evt_idx]
            for b_index in b_indices:
                # Check that LCA is not trivial (needs at least two leaves)
                if f[b_index]["n_LCA"][evt_idx] < 2:
                    continue
                # Particles coming from one or both Bs, depending on the mode
                matched = (evt_b_index != -1) if not B_reco else (evt_b_index == int(b_index))
                # Keeping only those where there are reconstructed particles
                if matched.sum() == 0:
                    continue
                # Skip events/Bs with one or fewer primaries reconstructed
                if np.sum(np.logical_and(matched, evt_primary)) < 2:
                    continue
                # If we made it to here a sample is valid and we add it to those available
                avail_samples.append((i, evt_idx, b_index))
    return avail_samples
def preload_root_data(root_files, features, discarded):
    """
    Load all training data from ROOT files into numpy-backed dictionaries.

    Note: branches are read with ``library="np"``, which materializes each
    branch as a numpy array at load time (not lazily in the uproot sense).

    Args:
        root_files (str): Path to ROOT files.
        features (list): List of feature names.
        discarded (list): List of features present in the ROOT files and not used as input,
            but used to calculate other quantities (e.g. edge features).

    Returns:
        list, list: Lists of dictionaries containing training information for input and ground-truth.
    """
    x = []
    y = []
    for path in root_files:
        with uproot.open(path)["Tree"] as tree:
            x_dict = {
                # Event numbers for this file
                "event": tree["event"].array(library="np"),
                "features": {feat: tree[feat].array(library="np") for feat in features},
                "discarded": {feat: tree[feat].array(library="np") for feat in discarded},
                # Needed to initialise the numpy features array in __getitem__
                "leaves": tree["leaves"].array(library="np"),
                "primary": tree["primary"].array(library="np"),
                "b_index": tree["b_index"].array(library="np"),
                "mc_pdg": tree["mcPDG"].array(library="np"),
            }
            # One ground-truth dict per B candidate (1 and 2);
            # n_LCA is needed to reshape the flattened LCA when loading
            y_dict = {
                b: {
                    "n_LCA": tree[f"n_LCA_leaves_{b}"].array(library="np"),
                    "LCA": tree[f"LCAS_{b}"].array(library="np"),
                    "LCA_leaves": tree[f"LCA_leaves_{b}"].array(library="np"),
                }
                for b in (1, 2)
            }
            x.append(x_dict)
            y.append(y_dict)
    return x, y