# Belle II Software  light-2403-persian
# dataset_utils.py
10 import numpy as np
11 import uproot
12 
13 
def populate_avail_samples(X, Y, B_reco=0):
    """
    Sifts through the file metadata to populate a list of available dataset samples.

    Args:
        X (list): List of ROOT lazyarray dicts for X (input) data.
        Y (list): List of ROOT lazyarray dicts for Y (ground truth) data.
        B_reco (int): Reconstruction mode flag (set automatically):

            .. math::
                \\text{Upsilon} (4S) = 0,\\ B^0 = 1,\\ B^+ = 2.

    Returns:
        list: List of available samples for training.
    """
    # Iterate over Y: X contains b_index == -1 entries for unmatched particles,
    # so the ground truth is the reliable side to walk.
    samples = []

    # B candidates to inspect per event (invariant across events/files)
    candidate_bs = [1, 2] if B_reco else [1]

    for file_idx, truth in enumerate(Y):
        inputs = X[file_idx]
        # Event list has to come from the input side
        n_events = len(inputs["event"])

        for evt in range(n_events):
            for b_idx in candidate_bs:
                # Skip trivial LCA matrices (fewer than two leaves)
                if truth[b_idx]["n_LCA"][evt] < 2:
                    continue

                # Per-event particle properties
                evt_b_index = inputs["b_index"][evt]
                evt_primary = inputs["primary"][evt]

                # Particles coming from the selected B (B_reco modes)
                # or from either B (Upsilon(4S) mode)
                if B_reco:
                    matched = evt_b_index == int(b_idx)
                else:
                    matched = evt_b_index != -1

                # Require at least one reconstructed particle...
                if matched.sum() == 0:
                    continue

                # ...and more than one reconstructed primary
                if np.sum(np.logical_and(matched, evt_primary)) < 2:
                    continue

                # Sample is valid; record (file, event, B candidate)
                samples.append((file_idx, evt, b_idx))

    return samples
68 
69 
def preload_root_data(root_files, features, discarded):
    """
    Load all data from root files as lazyarrays (not actually read from disk until accessed).

    Args:
        root_files (str): Path to ROOT files.
        features (list): List of feature names.
        discarded (list): List of features present in the ROOT files and not used as input,
            but used to calculate other quantities (e.g. edge features).

    Returns:
        list, list: Lists of dictionaries containing training information for input and ground-truth.
    """
    x = []
    y = []

    for path in root_files:
        with uproot.open(path)["Tree"] as tree:

            def _branch(name):
                # Read a single branch as a numpy-backed array
                return tree[name].array(library="np")

            # Input-side (x) information for this file
            x_dict = {
                # Event numbers
                "event": _branch("event"),
                "features": {feat: _branch(feat) for feat in features},
                "discarded": {feat: _branch(feat) for feat in discarded},
                # Needed to initialise the numpy features array in __getitem__
                "leaves": _branch("leaves"),
                "primary": _branch("primary"),
                "b_index": _branch("b_index"),
                "mc_pdg": _branch("mcPDG"),
            }

            # Ground-truth (y) information, one entry per B candidate
            y_dict = {
                b: {
                    # Needed to reshape the flattened LCA matrix when loading
                    "n_LCA": _branch(f"n_LCA_leaves_{b}"),
                    "LCA": _branch(f"LCAS_{b}"),
                    "LCA_leaves": _branch(f"LCA_leaves_{b}"),
                }
                for b in (1, 2)
            }

            x.append(x_dict)
            y.append(y_dict)

    return x, y