Belle II Software development
dataset_utils.py
import numpy as np
import uproot


def populate_avail_samples(X, Y, B_reco=0):
    """
    Sifts through the file metadata to populate a list of available dataset samples.

    Args:
        X (list): List of ROOT lazyarray dicts for X (input) data.
        Y (list): List of ROOT lazyarray dicts for Y (ground truth) data.
        B_reco (int): Reconstruction mode flag (set automatically):

            .. math::
                \\Upsilon (4S) = 0,\\ B^0 = 1,\\ B^+ = 2.

    Returns:
        list: List of available samples for training.
    """
    # Must iterate over Y because X contains a b_index of -1 for unmatched particles
    avail_samples = []

    # Iterate over files in Y
    for i, f in enumerate(Y):
        # Have to get the event list differently
        events = X[i]["event"]

        # Iterate over events in the current file
        for evt_idx, _ in enumerate(events):
            b_indices = [1] if not B_reco else [1, 2]

            for b_index in b_indices:
                # Check that the LCA is not trivial
                lca_rows = f[b_index]["n_LCA"][evt_idx]

                if lca_rows < 2:
                    continue

                # Fetch relevant event properties
                x_attrs = X[i]
                evt_b_index = x_attrs["b_index"][evt_idx]
                evt_primary = x_attrs["primary"][evt_idx]

                # Particles coming from one or both Bs
                matched = (evt_b_index != -1) if not B_reco else (evt_b_index == int(b_index))

                # Keep only samples with at least one reconstructed particle
                if matched.sum() == 0:
                    continue

                # Skip events/Bs with one or fewer primaries reconstructed
                if np.sum(np.logical_and(matched, evt_primary)) < 2:
                    continue

                # If we made it here the sample is valid and we add it to the available ones
                avail_samples.append((i, evt_idx, b_index))

    return avail_samples
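

# Editorial sketch: a hypothetical helper (not part of the original module) showing
# how a (file index, event index, B index) triplet returned above can be used to
# look up the ground truth. The flattened LCAS matrix is assumed to be square and
# is reshaped with the stored number of LCA leaves; ``y`` is the list of dicts
# returned by preload_root_data below.
def _example_fetch_ground_truth(y, sample):
    file_idx, evt_idx, b_index = sample
    n_leaves = y[file_idx][b_index]["n_LCA"][evt_idx]
    lca = y[file_idx][b_index]["LCA"][evt_idx].reshape(n_leaves, n_leaves)
    leaves = y[file_idx][b_index]["LCA_leaves"][evt_idx]
    return lca, leaves

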
def preload_root_data(root_files, features, discarded):
    """
    Load all data from ROOT files as lazyarrays (not actually read from disk until accessed).

    Args:
        root_files (list): List of paths to the ROOT files.
        features (list): List of feature names.
        discarded (list): List of features present in the ROOT files and not used as input,
            but used to calculate other quantities (e.g. edge features).

    Returns:
        list, list: Lists of dictionaries containing training information for input and ground truth.
    """
    x = []
    y = []

    for f in root_files:
        with uproot.open(f)["Tree"] as tree:
            # Get event numbers
            event = tree["event"].array(library="np")
            # Create dicts for x and y lazy arrays
            x_dict = {}
            x_dict["event"] = event
            x_dict["features"] = {
                feat: tree[feat].array(library="np") for feat in features
            }
            x_dict["discarded"] = {
                feat: tree[feat].array(library="np") for feat in discarded
            }

            # Need this to initialise the numpy features array in __getitem__
            x_dict["leaves"] = tree["leaves"].array(library="np")
            x_dict["primary"] = tree["primary"].array(library="np")
            x_dict["b_index"] = tree["b_index"].array(library="np")
            x_dict["mc_pdg"] = tree["mcPDG"].array(library="np")

            y_dict = {1: {}, 2: {}}
            for i in [1, 2]:
                # Get this LCA
                # Need this to reshape the flattened LCA when loading
                y_dict[i]["n_LCA"] = tree[f"n_LCA_leaves_{i}"].array(library="np")
                y_dict[i]["LCA"] = tree[f"LCAS_{i}"].array(library="np")
                y_dict[i]["LCA_leaves"] = tree[f"LCA_leaves_{i}"].array(library="np")

            x.append(x_dict)
            y.append(y_dict)

    return x, y
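

if __name__ == "__main__":
    # Editorial usage sketch: chain the two helpers on a hypothetical ntuple.
    # The file path and branch names below are placeholders, not part of the
    # original module; substitute whatever the training configuration provides.
    features = ["px", "py", "pz"]  # hypothetical input feature branches
    discarded = []                 # no extra branches in this sketch
    x, y = preload_root_data(["path/to/ntuple.root"], features, discarded)

    # Collect the valid (file index, event index, B index) samples for training
    samples = populate_avail_samples(x, y, B_reco=0)
    print(f"Found {len(samples)} available samples")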