Belle II Software development
utils.py
1
8"""
9Pure utility functions shared between cosmics and dimuon validation.
10"""
11
12import numpy as np
13
14from alignment_validation.variables import TrackVariable
15
16
17log_level = 0
18
19
20def get_variable_names(variables: list) -> list:
21 """Flatten a list of Variable objects into a list of ROOT branch-name strings.
22
23 Parameters
24 ----------
25 variables : list of GlobalVariable or TrackVariable
26 Variable metadata objects whose branch names should be collected.
27
28 Returns
29 -------
30 list of str
31 All ROOT branch names in the order they appear in ``variables``.
32 TrackVariable contributes two names (name1, name2).
33 """
34 names = []
35 for variable in variables:
36 names.extend(variable.getName())
37 return names
38
39
40def auto_range(data_list: list, percent: float, modify: float = 0, symmetric: bool = False) -> tuple:
41 """Return an axis range covering the central ``percent`` of all datasets.
42
43 The range is determined by the union of per-dataset percentile intervals,
44 optionally symmetrised around the grand median and padded by ``modify``.
45
46 Parameters
47 ----------
48 data_list : list of array-like
49 One or more data arrays to consider simultaneously. The returned range
50 is wide enough to cover all of them.
51 percent : float
52 Central percentage of data to include (e.g. 96 keeps the central 96%,
53 clipping 2% from each tail).
54 modify : float, optional
55 Fractional padding applied to the final range width on each side.
56 E.g. 0.1 expands the range by 10% on each end. Default is 0.
57 symmetric : bool, optional
58 If True, the range is symmetrised around the mean of per-dataset
59 medians before padding. Default is False.
60
61 Returns
62 -------
63 tuple of (float, float)
64 ``(minimum, maximum)`` of the computed range.
65 """
66 low_percent = (100 - percent) / 2
67 high_percent = 100 - low_percent
68 low, high, median = (), (), ()
69 for data in data_list:
70 low += (np.percentile(data, low_percent),)
71 high += (np.percentile(data, high_percent),)
72 median += (np.median(data),)
73 minimum = min(low)
74 maximum = max(high)
75 center = np.mean(median)
76 if symmetric:
77 if abs(center - minimum) > abs(center - maximum):
78 maximum = center + center - minimum
79 if abs(center - minimum) < abs(center - maximum):
80 minimum = center + center - maximum
81 minimum = minimum - (maximum - minimum) * modify
82 maximum = maximum + (maximum - minimum) * modify
83 return (minimum, maximum)
84
85
86def normal_distribution(x: float, a: float, mu: float, sigma: float) -> float:
87 """Evaluate a normalised Gaussian at ``x``.
88
89 Parameters
90 ----------
91 x : float or array-like
92 Point(s) at which to evaluate the Gaussian.
93 a : float
94 Amplitude (total integral of the Gaussian).
95 mu : float
96 Mean of the Gaussian.
97 sigma : float
98 Standard deviation of the Gaussian.
99
100 Returns
101 -------
102 float or ndarray
103 Value of ``a / (sigma * sqrt(2*pi)) * exp(-0.5 * ((x-mu)/sigma)^2)``.
104 """
105 return (a / (sigma * (2 * np.pi) ** 0.5)) * np.exp(-((x - mu) / sigma) ** 2 / 2)
106
107
108def to_bins(x, y, bins, x_limits):
109 """Bin (x, y) data and compute per-bin median, sigma68, and their uncertainties.
110
111 Bins with fewer than 10 entries are filled with ``nan``.
112
113 Parameters
114 ----------
115 x : array-like
116 Independent variable used to define bins.
117 y : array-like
118 Dependent variable whose distribution is summarised in each bin.
119 bins : int or sequence of float
120 Number of equal-width bins or an explicit sequence of bin edges,
121 passed directly to ``numpy.histogram_bin_edges``.
122 x_limits : tuple of (float, float)
123 ``(min, max)`` range over which to bin.
124
125 Returns
126 -------
127 x_vals : list of float
128 Bin-centre x values (``nan`` for empty bins).
129 y_medians : list of float
130 Per-bin median of ``y`` (``nan`` for empty bins).
131 x_halfwidth : list of float
132 Half-width of each bin in x (``nan`` for empty bins).
133 y_approx_uncert : list of float
134 Approximate uncertainty on the median: ``std(y) / sqrt(N)``
135 (``nan`` for empty bins).
136 halfwidth_of_sigma68 : list of float
137 Half the 16th–84th percentile range of ``y``, i.e. sigma68
138 (``nan`` for empty bins).
139 sigma68_approx_uncert : list of float
140 Approximate uncertainty on sigma68: ``sigma68 / sqrt(N)``
141 (``nan`` for empty bins).
142 """
143 x_edges = np.histogram_bin_edges(x, bins, x_limits)
144 x_bin_halfwidth = [(x_edges[i] - x_edges[i - 1]) / 2 for i in range(1, len(x_edges))]
145 bin_numbers = np.digitize(x, x_edges) - 1
146 y_medians, y_approx_uncert, halfwidth_of_sigma68, sigma68_approx_uncert, x_halfwidth, x_vals = [], [], [], [], [], []
147 for i in range(len(x_edges) - 1):
148 if y[bin_numbers == i].size >= 10:
149 if log_level > 1:
150 print(f"Events in bin {x_edges[i]} to {x_edges[i+1]}: {y[bin_numbers == i].size}")
151 y_medians.append(float(np.median(y[bin_numbers == i])))
152 y_approx_uncert.append(float(np.std(y[bin_numbers == i]) / len(y[bin_numbers == i]) ** 0.5))
153 halfwidth_of_sigma68.append((np.percentile(y[bin_numbers == i], 84) -
154 np.percentile(y[bin_numbers == i], 16)) / 2)
155 sigma68_approx_uncert.append(halfwidth_of_sigma68[-1] / len(y[bin_numbers == i]) ** 0.5)
156 x_halfwidth.append(x_bin_halfwidth[i])
157 x_vals.append(x_edges[i] + x_bin_halfwidth[i])
158 else:
159 y_medians.append(np.nan)
160 y_approx_uncert.append(np.nan)
161 halfwidth_of_sigma68.append(np.nan)
162 sigma68_approx_uncert.append(np.nan)
163 x_halfwidth.append(np.nan)
164 x_vals.append(np.nan)
165 if log_level:
166 print(f"Warning: bin {x_edges[i]}-{x_edges[i+1]} is empty.")
167 return x_vals, y_medians, x_halfwidth, y_approx_uncert, halfwidth_of_sigma68, sigma68_approx_uncert
168
169
170def swap_tracks(data: dict, vars: list) -> dict:
171 """Swap the name1/name2 arrays for each TrackVariable in ``vars`` within the data dict.
172
173 Modifies ``data`` in-place and also returns it.
174
175 Parameters
176 ----------
177 data : dict
178 Data dictionary mapping ROOT branch names to arrays, as returned by
179 ``load_data``.
180 vars : list of GlobalVariable or TrackVariable
181 Variables to process. Only TrackVariable instances are swapped;
182 GlobalVariable instances are ignored.
183
184 Returns
185 -------
186 dict
187 The same ``data`` dict with track-1 and track-2 arrays swapped for
188 each TrackVariable in ``vars``.
189 """
190 for var in vars:
191 if isinstance(var, TrackVariable):
192 data[var.name1], data[var.name2] = data[var.name2], data[var.name1]
193 return data
194
195
196def get_filter(data: dict, cut: dict) -> np.ndarray:
197 """Return a boolean mask selecting events that pass all (min, max) cuts.
198
199 Parameters
200 ----------
201 data : dict
202 Data dictionary mapping branch names to arrays.
203 cut : dict
204 Selection criteria as ``{branch_name: (min, max)}``. An event passes
205 if ``min < data[branch_name] < max`` for every entry in ``cut``.
206 An empty dict returns a mask of all True.
207
208 Returns
209 -------
210 numpy.ndarray of bool
211 Boolean mask with the same length as the data arrays.
212 """
213 mask = np.ones_like(next(iter(data.values())), dtype=bool)
214 for var in cut:
215 mask = np.logical_and(mask, np.logical_and(cut[var][0] < data[var], data[var] < cut[var][1]))
216 return mask
217
218
219def pseudomomentum(p, tan, power):
220 """Compute the muon pseudomomentum p*beta*sin(theta)^(power/2).
221
222 This quantity is used as the x-axis for resolution-vs-momentum plots.
223 The muon mass (105.7 MeV/c²) is used for the beta factor.
224
225 Parameters
226 ----------
227 p : array-like
228 Transverse momentum in GeV/c.
229 tan : array-like
230 tan(lambda), where lambda is the dip angle (= pz/pt).
231 power : int or float
232 Exponent of sin(theta) in the pseudomomentum definition.
233 Typical values are 3 (for z0/tanLambda) and 5 (for d0/phi).
234
235 Returns
236 -------
237 ndarray
238 Pseudomomentum values in GeV/c.
239 """
240 return p * (1 + 1 / tan ** 2) ** 0.5 * 1 / (1 + 0.105 ** 2 / (p ** 2 *
241 (1 + tan ** 2))) ** 0.5 * 1 / ((1 + tan ** 2) ** 0.5) ** power