Belle II Software development
train.py
1
8
9import pandas as pd
10import numpy as np
11
12from . import fit_functions
13from scipy.optimize import curve_fit
14import matplotlib.pyplot as plt
15
16from sklearn import tree
17
18
20 """Train a neural network for dE/dx-based particle identification"""
21
def __init__(self):
    """Set up an untrained estimator.

    Attributes set here:
        dedx_estimator_function: callable applied to the dE/dx column by
            ``test``; ``None`` until a trainer has produced one.
        dedx_column: name of the data column holding dE/dx.
    """
    # Must exist before test() runs: test() checks it against None to report
    # "Train the estimator first!" instead of failing with AttributeError.
    self.dedx_estimator_function = None

    self.dedx_column = "dedx"
28
def train(self, data):
    """Placeholder training hook that concrete trainers are expected to override.

    Args:
        data: training sample; not used here.

    Raises:
        NotImplementedError: always, because this class only serves as a base.
    """
    message = "Use this class as a base class only"
    raise NotImplementedError(message)
33
def test(self, data):
    """Evaluate the trained estimator on the dE/dx column of ``data``.

    Args:
        data: object indexable by ``self.dedx_column``.

    Returns:
        Whatever ``self.dedx_estimator_function`` returns for that column.

    Raises:
        ValueError: if no estimator function has been set yet.
    """
    estimator = self.dedx_estimator_function
    if estimator is None:
        raise ValueError("Train the estimator first!")

    dedx_values = data[self.dedx_column]
    return estimator(dedx_values)
40
41
43 """Train a neural network for dE/dx-based particle identification"""
44
45
# Number of equally spaced dE/dx bin edges handed to np.linspace
# (so the number of dE/dx intervals is one less).
number_of_bins_in_dedx = 20

# Number of equally spaced momentum bin edges handed to np.linspace.
number_of_bins_in_p = 29

# How many of the most populated momentum bins are kept for fitting.
number_of_head_values_used_to_fit = 20
51
def create_dedx_bins(self, data):
    """Group ``data`` into equally spaced dE/dx intervals.

    Args:
        data: DataFrame containing the column named by ``self.dedx_column``.

    Returns:
        Tuple ``(grouped, dedx_bins)``: the DataFrame grouped by dE/dx
        interval and the array of bin edges.
    """
    dedx_values = data[self.dedx_column]
    # Read the bin count through self so a subclass or instance can override it.
    dedx_bins = np.linspace(dedx_values.min(), dedx_values.max(), self.number_of_bins_in_dedx)
    # The edges start exactly at the minimum; without include_lowest the first
    # interval is left-open and the minimum entry would silently be dropped.
    dedx_cuts = pd.cut(dedx_values, dedx_bins, include_lowest=True)
    return data.groupby(dedx_cuts), dedx_bins
60
def create_p_bins(self, data):
    """Group ``data`` into equally spaced momentum intervals.

    Args:
        data: DataFrame with a momentum column ``p``.

    Returns:
        Tuple ``(grouped, p_bins)``: the DataFrame grouped by momentum
        interval and the array of bin edges.
    """
    momenta = data.p
    # Read the bin count through self so a subclass or instance can override it.
    p_bins = np.linspace(momenta.min(), momenta.max(), self.number_of_bins_in_p)
    # include_lowest keeps the entry sitting exactly on the first edge, which
    # a left-open first interval would otherwise discard.
    p_cuts = pd.cut(momenta, p_bins, include_lowest=True)
    # Empty intervals must stay in the result: create_fit_data pairs the
    # per-interval counts positionally with the bin centres.
    return data.groupby(p_cuts, observed=False), p_bins
66
def use_only_the_highest_values(self, data, number_of_values=None):
    """Keep only the rows with the largest ``number_of_p_values``.

    Args:
        data: DataFrame with a ``number_of_p_values`` column.
        number_of_values: how many rows to keep; ``None`` returns ``data``
            unchanged.

    Returns:
        ``data`` itself when ``number_of_values`` is ``None``; otherwise the
        selected rows, restored to their original index order.
    """
    if number_of_values is None:
        return data

    # DataFrame.sort() no longer exists in pandas: sort_values() replaces the
    # by-column call and sort_index() the argument-less one.
    ranked = data.sort_values("number_of_p_values", ascending=False, kind="mergesort")
    return ranked.head(number_of_values).sort_index()
73
def create_fit_data(self, dedx_bin):
    """Build the momentum histogram of one dE/dx bin as a table.

    Args:
        dedx_bin: the rows falling into one dE/dx interval.

    Returns:
        DataFrame with columns ``number_of_p_values`` (entries per momentum
        interval) and ``p_bin_centers`` (interval midpoints), reduced to the
        most populated intervals.
    """
    p_binned_data, p_bins = self.create_p_bins(dedx_bin)

    lower_edges, upper_edges = p_bins[:-1], p_bins[1:]
    histogram_columns = [
        pd.Series(p_binned_data.count().p.values, name="number_of_p_values"),
        pd.Series(0.5 * (lower_edges + upper_edges), name="p_bin_centers"),
    ]
    # Each Series becomes a row first; transposing turns them into named columns.
    all_fit_data = pd.DataFrame(histogram_columns).T

    return self.use_only_the_highest_values(
        all_fit_data, GroupedDEDXEstimationTrainer.number_of_head_values_used_to_fit)
85
def fit_p_to_dedx_bin(self, dedx_bin):
    """Histogram the momenta of one dE/dx bin and hand them to ``self.train_function``.

    Args:
        dedx_bin: the rows falling into one dE/dx interval.

    Returns:
        Whatever ``self.train_function`` returns for the bin's fit data.
    """
    return self.train_function(self.create_fit_data(dedx_bin))
90
91
93 """Train a neural network for dE/dx-based particle identification"""
94
def __init__(self, result_function, use_sigma_for_result_fitting):
    """Store the fit configuration and start with an empty result cache.

    Args:
        result_function: model fitted to the per-bin results by
            ``fit_result_parameters``; called as ``f(dedx, *parameters)``.
        use_sigma_for_result_fitting: whether the per-bin sigma values are
            used when fitting ``result_function``.
    """
    self.result_function = result_function

    # Filled by train(): maps each dE/dx bin centre to that bin's fit result.
    # It has to exist up front because train() updates it in place.
    self.result_parameters_for_each_dedx_bin = {}

    self.use_sigma_for_result_fitting = use_sigma_for_result_fitting

    GroupedDEDXEstimationTrainer.__init__(self)
106
def create_result_dataframe(self):
    """Collect the per-bin fit results into a DataFrame sorted by dE/dx.

    Each cached entry is read as ``[sigma, parameters]`` with the mean taken
    from ``parameters[1]``; entries that are ``None`` are skipped.

    Returns:
        DataFrame with columns ``dedx_bin_center``, ``mu`` and ``sigma``, plus
        ``mu_plus_sigma`` / ``mu_minus_sigma`` when
        ``self.use_sigma_for_result_fitting`` is set.

    Raises:
        ValueError: if no bin has a usable fit result.
    """
    rows = [
        {"dedx_bin_center": dedx_bin_center,
         "mu": fit_parameters[1][1],
         "sigma": fit_parameters[0]}
        for dedx_bin_center, fit_parameters in self.result_parameters_for_each_dedx_bin.items()
        if fit_parameters is not None
    ]

    if not rows:
        raise ValueError("Could not find any fitted parameters!")

    result_df = pd.DataFrame(rows)

    # Trainers that do not use sigma store None for it, so the band columns
    # can only be computed when sigma is actually in use.
    if self.use_sigma_for_result_fitting:
        result_df["mu_plus_sigma"] = result_df.mu + result_df.sigma
        result_df["mu_minus_sigma"] = result_df.mu - result_df.sigma

    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    return result_df.sort_values("dedx_bin_center")
125
def fit_result_parameters(self):
    """Fit ``self.result_function`` to the per-bin means as a function of dE/dx.

    The fit starts from a fixed four-parameter initial guess. When
    ``self.use_sigma_for_result_fitting`` is set, the per-bin sigma values are
    passed to the fit as absolute uncertainties.

    Returns:
        Tuple ``(popt, estimator)``: the fitted parameters and a callable
        evaluating ``self.result_function(dedx, *popt)``.
    """
    result_df = self.create_result_dataframe()

    p0 = (7e+08, -4e+04, 0.1, 0)

    fit_options = {"p0": p0}
    if self.use_sigma_for_result_fitting:
        fit_options.update(sigma=result_df.sigma, absolute_sigma=True)

    popt, pcov = curve_fit(self.result_function, result_df.dedx_bin_center, result_df.mu, **fit_options)

    return popt, lambda dedx: self.result_function(dedx, *popt)
139
def train(self, data):
    """Fit every dE/dx bin, then fit the overall dE/dx estimator.

    Per-bin results are cached in ``self.result_parameters_for_each_dedx_bin``
    under the mean dE/dx of the bin. The overall fit fills
    ``self.dedx_estimator_parameters`` and ``self.dedx_estimator_function``.

    Args:
        data: training DataFrame accepted by ``self.create_dedx_bins``.
    """
    dedx_binned_data, _ = self.create_dedx_bins(data)

    # Walk the groups directly: this avoids depending on how groupby.apply
    # wraps dict return values or treats unpopulated intervals.
    for _, dedx_bin in dedx_binned_data:
        if dedx_bin.empty:
            continue
        fit_result = self.fit_p_to_dedx_bin(dedx_bin)
        self.result_parameters_for_each_dedx_bin[dedx_bin.mean()[self.dedx_column]] = fit_result

    # Store the estimator under the attribute that test() actually reads.
    self.dedx_estimator_parameters, self.dedx_estimator_function = self.fit_result_parameters()
153
def plot_fit_result(self, data):
    """Draw the fitted estimator curve together with the per-bin results.

    Args:
        data: DataFrame whose dE/dx column defines the plotted range.
    """
    dedx_values = data[self.dedx_column]
    plot_dedx_data = np.linspace(dedx_values.min(), dedx_values.max(), 100)
    result_df = self.create_result_dataframe()

    # Read the estimator from the attribute that train() fills and test() uses.
    plt.plot(plot_dedx_data, self.dedx_estimator_function(plot_dedx_data),
             color="black", label="Fitted estimator")
    plt.errorbar(result_df.dedx_bin_center, result_df.mu, marker="o", ls="",
                 label="Data Points", yerr=result_df.sigma)

    plt.ylim(0, 0.14)
    plt.xlabel("dEdX in ADC count/cm")
    plt.ylabel("p in GeV/c")
    plt.legend(frameon=True)
168
def plot_grouped_result(self, data):
    """Plot the momentum histogram of every dE/dx bin as black points.

    Args:
        data: DataFrame accepted by ``self.create_dedx_bins``.
    """
    dedx_binned_data, dedx_bins = self.create_dedx_bins(data)

    # groupby.apply may hand the same group to the callback more than once;
    # remember what was drawn so each bin is plotted a single time.
    seen_centers = []

    def plot_fitted_results(dedx_bin):
        center = dedx_bin.mean().values[0]
        if center in seen_centers:
            return True

        seen_centers.append(center)
        histogram = self.create_fit_data(dedx_bin)
        plt.plot(histogram.p_bin_centers, histogram.number_of_p_values,
                 ls="", marker=".", color="black")
        return True

    plt.xlabel("p in GeV/c")
    plt.ylabel("Entries")

    dedx_binned_data.apply(plot_fitted_results)
191
192
194 """Train a neural network for dE/dx-based particle identification"""
195
def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting):
    """Configure a trainer that fits a function to each bin's momentum histogram.

    Args:
        fit_function: model fitted per dE/dx bin, called as
            ``f(p, *parameters)``.
        dimension_of_fit_function: number of parameters of ``fit_function``;
            initial guesses exist for 3 and 6.
        result_function: model for the overall fit (see the parent constructor).
        use_sigma_for_result_fitting: forwarded to the parent constructor.
    """
    self.dimension_of_fit_function = dimension_of_fit_function

    self.fit_function = fit_function

    FittedGroupedDEDXEstimatorTrainer.__init__(self, result_function, use_sigma_for_result_fitting)

    def train_function(fit_data):
        """Fit ``fit_function`` to one bin; return ``[sigma of parameter 1, popt]``."""
        # Seed the second parameter with the centre of the most populated bin.
        max_value = self.use_only_the_highest_values(fit_data, 1).p_bin_centers.values[0]

        if self.dimension_of_fit_function == 3:
            p0 = (1e3, max_value, 4e-2)
        elif self.dimension_of_fit_function == 6:
            p0 = (1e3, max_value, 4e-2, 1, 1, 1)
        else:
            # Without this branch p0 would be unbound and the failure would
            # surface as an unrelated UnboundLocalError.
            raise ValueError(
                "No initial guess available for a fit function with "
                "{} parameters (supported: 3 or 6)".format(self.dimension_of_fit_function))

        popt, pcov = curve_fit(self.fit_function, fit_data.p_bin_centers, fit_data.number_of_p_values, p0=p0)

        return [np.sqrt(np.diag(pcov)[1]), popt]

    self.train_function = train_function
221
def plot_grouped_result(self, data):
    """Plot every bin's momentum histogram and overlay its fitted curve.

    Args:
        data: DataFrame with a ``p`` column, accepted by
            ``self.create_dedx_bins``.
    """
    FittedGroupedDEDXEstimatorTrainer.plot_grouped_result(self, data)

    dedx_binned_data, dedx_bins = self.create_dedx_bins(data)
    p_plot_data = np.linspace(data.p.min(), data.p.max(), 1000)

    # groupby.apply may hand the same group to the callback more than once;
    # remember what was drawn so each curve is plotted a single time.
    seen_centers = []

    def plot_fitted_results(dedx_bin):
        center = dedx_bin.mean().values[0]
        if center in seen_centers:
            return True

        # Cached results are keyed by the bin's mean dE/dx, as stored by train().
        fitted_results = self.result_parameters_for_each_dedx_bin[dedx_bin.mean()[self.dedx_column]]
        seen_centers.append(center)
        _sigma, fit_options = fitted_results

        plt.plot(p_plot_data, self.fit_function(p_plot_data, *fit_options))
        return True

    dedx_binned_data.apply(plot_fitted_results)
247
248
250 """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
251
def __init__(self):
    """Use ``fit_functions.norm`` (3 parameters) per bin and ``fit_functions.inverse_squared`` overall."""
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.norm,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=True,
    )
260
261
263 """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
264
def __init__(self):
    """Use ``fit_functions.landau`` (3 parameters) per bin and ``fit_functions.inverse_squared`` overall."""
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.landau,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=True,
    )
273
274
276 """Train a neural network for dE/dx-based particle identification using only the highest values"""
277
def __init__(self):
    """Use the most populated momentum bin per dE/dx bin; fit ``fit_functions.inverse_squared`` without sigma."""
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_functions.inverse_squared,
        use_sigma_for_result_fitting=False,
    )

    def train_function(fit_data):
        """Return ``[None, [None, centre of the most populated bin, None]]``."""
        most_populated = self.use_only_the_highest_values(fit_data, 1)
        peak_position = most_populated.p_bin_centers.values[0]
        # No sigma is available, and only the middle parameter slot is filled.
        return [None, [None, peak_position, None]]

    self.train_function = train_function
290
291
293 """Train a neural network for dE/dx-based particle identification using only the median values"""
294
def __init__(self):
    """Use the median momentum per dE/dx bin; fit ``fit_functions.inverse_squared`` with sigma."""
    FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_squared, use_sigma_for_result_fitting=True)

    def train_function(fit_data):
        """Return ``[spread, [None, median, None]]`` of the binned momenta."""
        # Expand the histogram into one entry per counted track. np.repeat
        # does this in one pass instead of concatenating per-row Python lists
        # through DataFrame.apply(...).sum().
        counts = fit_data.number_of_p_values.astype(int)
        weighted_p_values = np.repeat(fit_data.p_bin_centers.values, counts.values)

        median_value = np.median(weighted_p_values)
        # NOTE(review): this is the 75th minus the 50th percentile, not a full
        # interquartile range (75th - 25th); kept as is, confirm the intent.
        iqr = np.percentile(weighted_p_values, 75) - np.percentile(weighted_p_values, 50)

        return [iqr, [None, median_value, None]]

    self.train_function = train_function
309
310
312 """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
313
def __init__(self):
    """Use ``fit_functions.norm`` (3 parameters) per bin and ``fit_functions.inverse_sqrt`` overall."""
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.norm,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=True,
    )
322
323
325 """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
326
def __init__(self):
    """Use ``fit_functions.landau`` (3 parameters) per bin and ``fit_functions.inverse_sqrt`` overall."""
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.landau,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=True,
    )
335
336
338 """Train a neural network for dE/dx-based particle identification using only the highest values"""
339
def __init__(self):
    """Use the most populated momentum bin per dE/dx bin; fit ``fit_functions.inverse_sqrt`` without sigma."""
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=False,
    )

    def train_function(fit_data):
        """Return ``[None, [None, centre of the most populated bin, None]]``."""
        most_populated = self.use_only_the_highest_values(fit_data, 1)
        peak_position = most_populated.p_bin_centers.values[0]
        # No sigma is available, and only the middle parameter slot is filled.
        return [None, [None, peak_position, None]]

    self.train_function = train_function
352
353
355 """Train a neural network for dE/dx-based particle identification using only the median values"""
356
def __init__(self):
    """Use the median momentum per dE/dx bin; fit ``fit_functions.inverse_sqrt`` with sigma."""
    FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_sqrt, use_sigma_for_result_fitting=True)

    def train_function(fit_data):
        """Return ``[spread, [None, median, None]]`` of the binned momenta."""
        # Expand the histogram into one entry per counted track. np.repeat
        # does this in one pass instead of concatenating per-row Python lists
        # through DataFrame.apply(...).sum().
        counts = fit_data.number_of_p_values.astype(int)
        weighted_p_values = np.repeat(fit_data.p_bin_centers.values, counts.values)

        median_value = np.median(weighted_p_values)
        # NOTE(review): this is the 75th minus the 50th percentile, not a full
        # interquartile range (75th - 25th); kept as is, confirm the intent.
        iqr = np.percentile(weighted_p_values, 75) - np.percentile(weighted_p_values, 50)

        return [iqr, [None, median_value, None]]

    self.train_function = train_function
371
372
374 """Train a neural network for dE/dx-based particle identification using multivariate data analysis"""
375
def __init__(self):
    """Create the decision-tree regressor and run the base-class setup."""
    regressor = tree.DecisionTreeRegressor()
    self.tree = regressor
    DEDXEstimationTrainer.__init__(self)
382
def train(self, data):
    """Fit ``self.tree`` to predict column ``p`` from all other columns.

    Args:
        data: DataFrame containing a ``p`` column; it is not modified.
    """
    # Every column except the momentum is an input feature.
    features = data.drop(columns="p")
    targets = data["p"]

    self.tree.fit(features.values, targets.values)
392
def test(self, data):
    """Predict with ``self.tree`` from all columns of ``data`` except ``p``.

    Args:
        data: DataFrame containing a ``p`` column; it is not modified.

    Returns:
        The result of ``self.tree.predict`` on the remaining columns.
    """
    # The momentum column is the quantity being predicted, so it is removed
    # from the inputs.
    features = data.drop(columns="p")
    return self.tree.predict(features.values)
def test(self, data)
Definition: train.py:34
dedx_estimator_function
by default, the dE/dx-particle-identification trainer has not run yet
Definition: train.py:25
dedx_column
the default data column is 'dedx'
Definition: train.py:27
result_parameters_for_each_dedx_bin
cached copy of the dictionary of fitting parameters for each dE/dx bin
Definition: train.py:101
result_function
cached copy of the result function
Definition: train.py:99
use_sigma_for_result_fitting
cached copy of the flag to add mean+/-sigma values to the output Dataframe
Definition: train.py:103
dedx_estimator_function
cached copies of the fit parameters and estimator function
Definition: train.py:152
def __init__(self, result_function, use_sigma_for_result_fitting)
Definition: train.py:95
fit_function
cached copy of the fitting function
Definition: train.py:202
def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting)
Definition: train.py:196
dimension_of_fit_function
cached value of the degrees of freedom in the fit
Definition: train.py:200
train_function
this class's training function
Definition: train.py:220
def create_p_bins(self, data)
Definition: train.py:61
def use_only_the_highest_values(self, data, number_of_values=None)
Definition: train.py:67
def fit_p_to_dedx_bin(self, dedx_bin)
Definition: train.py:86
def create_fit_data(self, dedx_bin)
Definition: train.py:74
def create_dedx_bins(self, data)
Definition: train.py:52
def test(self, data)
Definition: train.py:393
tree
cached copy of the MVA tool
Definition: train.py:380
train_function
this class's training function
Definition: train.py:351
train_function
this class's training function
Definition: train.py:289
train_function
this class's training function
Definition: train.py:370
train_function
this class's training function
Definition: train.py:308
Definition: train.py:1