Belle II Software  release-08-01-10
train.py
1 
8 
9 import pandas as pd
10 import numpy as np
11 
12 from . import fit_functions
13 from scipy.optimize import curve_fit
14 import matplotlib.pyplot as plt
15 
16 from sklearn import tree
17 
18 
20  """Train a neural network for dE/dx-based particle identification"""
21 
22  def __init__(self):
23  """Constructor"""
24 
25  self.dedx_estimator_functiondedx_estimator_function = None
26 
27  self.dedx_columndedx_column = "dedx"
28 
29  def train(self, data):
30  """Train on the input data"""
31  # We have everything
32  raise NotImplementedError("Use this class as a base class only")
33 
34  def test(self, data):
35  """Get the trained neural-network output value for test data"""
36  if self.dedx_estimator_functiondedx_estimator_function is None:
37  raise ValueError("Train the estimator first!")
38 
39  return self.dedx_estimator_functiondedx_estimator_function(data[self.dedx_columndedx_column])
40 
41 
43  """Train a neural network for dE/dx-based particle identification"""
44 
45 
    # Point count handed to np.linspace when building the dE/dx bin edges
    # (N edges give N - 1 intervals).
    number_of_bins_in_dedx = 20

    # Point count handed to np.linspace when building the momentum bin edges
    # (N edges give N - 1 intervals).
    number_of_bins_in_p = 29

    # How many of the most populated momentum bins create_fit_data() keeps.
    number_of_head_values_used_to_fit = 20
51 
52  def create_dedx_bins(self, data):
53  """Construct the dE/dx bins and then populate them with the data"""
54  dedx_bins = np.linspace(
55  data[
56  self.dedx_columndedx_column].min(), data[
57  self.dedx_columndedx_column].max(), GroupedDEDXEstimationTrainer.number_of_bins_in_dedx)
58  dedx_cuts = pd.cut(data[self.dedx_columndedx_column], dedx_bins)
59  return data.groupby(dedx_cuts), dedx_bins
60 
61  def create_p_bins(self, data):
62  """Construct the momentum bins and then populate them with the data"""
63  p_bins = np.linspace(data.p.min(), data.p.max(), GroupedDEDXEstimationTrainer.number_of_bins_in_p)
64  p_cuts = pd.cut(data.p, p_bins)
65  return data.groupby(p_cuts), p_bins
66 
67  def use_only_the_highest_values(self, data, number_of_values=None):
68  """Sort the data then select only the highest N values"""
69  if number_of_values is None:
70  return data
71  else:
72  return data.sort("number_of_p_values", ascending=False).head(number_of_values).sort()
73 
74  def create_fit_data(self, dedx_bin):
75  """Fit track-momentum values"""
76  p_binned_data, p_bins = self.create_p_binscreate_p_bins(dedx_bin)
77 
78  number_of_p_values = pd.Series(p_binned_data.count().p.values, name="number_of_p_values")
79  p_bin_centers = pd.Series(0.5 * (p_bins[:-1] + p_bins[1:]), name="p_bin_centers")
80 
81  all_fit_data = pd.DataFrame([number_of_p_values, p_bin_centers]).T
82  fit_data = self.use_only_the_highest_valuesuse_only_the_highest_values(all_fit_data, GroupedDEDXEstimationTrainer.number_of_head_values_used_to_fit)
83 
84  return fit_data
85 
86  def fit_p_to_dedx_bin(self, dedx_bin):
87  """Fit the track-momentum values in the selected dE/dx bin, then train on the fitted values"""
88  fit_data = self.create_fit_datacreate_fit_data(dedx_bin)
89  return self.train_function(fit_data)
90 
91 
93  """Train a neural network for dE/dx-based particle identification"""
94 
95  def __init__(self, result_function, use_sigma_for_result_fitting):
96  """Constructor"""
97 
98 
99  self.result_functionresult_function = result_function
100 
101  self.result_parameters_for_each_dedx_binresult_parameters_for_each_dedx_bin = {}
102 
103  self.use_sigma_for_result_fittinguse_sigma_for_result_fitting = use_sigma_for_result_fitting
104 
105  GroupedDEDXEstimationTrainer.__init__(self)
106 
108  """Fit for the mean dE/dx and standard deviation, return the fit Dataframe"""
109  result_df = pd.DataFrame([{"dedx_bin_center": dedx_bin_center,
110  "mu": fit_parameters[1][1],
111  "sigma": fit_parameters[0]} for dedx_bin_center,
112  fit_parameters in self.result_parameters_for_each_dedx_binresult_parameters_for_each_dedx_bin.items()
113  if fit_parameters is not None])
114 
115  if len(result_df) == 0:
116  raise ValueError("Could not find any fitted parameters!")
117 
118  if self.use_sigma_for_result_fittinguse_sigma_for_result_fitting:
119  result_df["mu_plus_sigma"] = result_df.mu + result_df.sigma
120  result_df["mu_minus_sigma"] = result_df.mu - result_df.sigma
121 
122  result_df.sort("dedx_bin_center", inplace=True)
123 
124  return result_df
125 
127  """Define the parameters for the fit, assign initial guesses"""
128  result_df = self.create_result_dataframecreate_result_dataframe()
129 
130  p0 = (7e+08, -4e+04, 0.1, 0)
131 
132  if self.use_sigma_for_result_fittinguse_sigma_for_result_fitting:
133  popt, pcov = curve_fit(self.result_functionresult_function, result_df.dedx_bin_center, result_df.mu, p0=p0,
134  sigma=result_df.sigma, absolute_sigma=True)
135  else:
136  popt, pcov = curve_fit(self.result_functionresult_function, result_df.dedx_bin_center, result_df.mu, p0=p0)
137 
138  return popt, lambda dedx: self.result_functionresult_function(dedx, *popt)
139 
140  def train(self, data):
141  """Train the neural network using curated data"""
142  dedx_binned_data, dedx_bins = self.create_dedx_binscreate_dedx_bins(data)
143 
144  def fit_and_save_results(dedx_bin):
145  fit_result = self.fit_p_to_dedx_binfit_p_to_dedx_bin(dedx_bin)
146  return {dedx_bin.mean()[self.dedx_columndedx_column]: fit_result}
147 
148  for result in dedx_binned_data.apply(fit_and_save_results):
149  self.result_parameters_for_each_dedx_binresult_parameters_for_each_dedx_bin.update(result)
150 
151 
152  self.dedx_estimator_parameters, self.dedx_estimator_functiondedx_estimator_functiondedx_estimator_function = self.fit_result_parametersfit_result_parameters()
153 
154  def plot_fit_result(self, data):
155  """Plot the fitted results"""
156  plot_dedx_data = np.linspace(data[self.dedx_columndedx_column].min(), data[self.dedx_columndedx_column].max(), 100)
157  result_df = self.create_result_dataframecreate_result_dataframe()
158 
159  plt.plot(plot_dedx_data, self.dedx_estimator_functiondedx_estimator_functiondedx_estimator_function(plot_dedx_data), color="black", label="Fitted estimator")
160  if self.use_sigma_for_result_fittinguse_sigma_for_result_fitting:
161  # color = "black"
162  plt.errorbar(result_df.dedx_bin_center, result_df.mu, marker="o", ls="", label="Data Points", yerr=result_df.sigma)
163 
164  plt.ylim(0, 0.14)
165  plt.xlabel("dEdX in ADC count/cm")
166  plt.ylabel("p in GeV/c")
167  plt.legend(frameon=True)
168 
169  def plot_grouped_result(self, data):
170  """Plot the fitted grouped results"""
171  dedx_binned_data, dedx_bins = self.create_dedx_binscreate_dedx_bins(data)
172 
173  # List to prevent bug in pd.DataFrame.apply
174  already_plotted_list = []
175 
176  def plot_fitted_results(dedx_bin):
177  dedx_bin_center = dedx_bin.mean().values[0]
178 
179  if dedx_bin_center not in already_plotted_list:
180  already_plotted_list.append(dedx_bin_center)
181 
182  fit_data = self.create_fit_datacreate_fit_data(dedx_bin)
183  plt.plot(fit_data.p_bin_centers, fit_data.number_of_p_values, ls="", marker=".", color="black")
184 
185  return True
186 
187  plt.xlabel("p in GeV/c")
188  plt.ylabel("Entries")
189 
190  dedx_binned_data.apply(plot_fitted_results)
191 
192 
194  """Train a neural network for dE/dx-based particle identification"""
195 
196  def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting):
197  """Constructor"""
198 
199 
200  self.dimension_of_fit_functiondimension_of_fit_function = dimension_of_fit_function
201 
202  self.fit_functionfit_function = fit_function
203 
204  FittedGroupedDEDXEstimatorTrainer.__init__(self, result_function, use_sigma_for_result_fitting)
205 
206  def train_function(fit_data):
207  """Train on the fit to curated-data highest values whose truth value is known"""
208  max_value = self.use_only_the_highest_valuesuse_only_the_highest_values(fit_data, 1).p_bin_centers.values[0]
209 
210  if self.dimension_of_fit_functiondimension_of_fit_function == 3:
211  p0 = (1e3, max_value, 4e-2)
212  elif self.dimension_of_fit_functiondimension_of_fit_function == 6:
213  p0 = (1e3, max_value, 4e-2, 1, 1, 1)
214 
215  popt, pcov = curve_fit(self.fit_functionfit_function, fit_data.p_bin_centers, fit_data.number_of_p_values, p0=p0)
216 
217  return [np.sqrt(np.diag(pcov)[1]), popt]
218 
219 
220  self.train_functiontrain_function = train_function
221 
222  def plot_grouped_result(self, data):
223  """Plot the fitted grouped results"""
224  FittedGroupedDEDXEstimatorTrainer.plot_grouped_result(self, data)
225 
226  dedx_binned_data, dedx_bins = self.create_dedx_binscreate_dedx_bins(data)
227 
228  p_plot_data = np.linspace(data.p.min(), data.p.max(), 1000)
229 
230  # List to prevent bug in pd.DataFrame.apply
231  already_plotted_list = []
232 
233  def plot_fitted_results(dedx_bin):
234  dedx_bin_center = dedx_bin.mean().values[0]
235 
236  if dedx_bin_center not in already_plotted_list:
237  fitted_results = self.result_parameters_for_each_dedx_binresult_parameters_for_each_dedx_bin[dedx_bin.mean()[self.dedx_columndedx_column]]
238  already_plotted_list.append(dedx_bin_center)
239  unneeded, fit_options = fitted_results
240 
241  dedx_plot_data = self.fit_functionfit_function(p_plot_data, *fitted_results[1])
242  plt.plot(p_plot_data, dedx_plot_data)
243 
244  return True
245 
246  dedx_binned_data.apply(plot_fitted_results)
247 
248 
250  """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
251 
252  def __init__(self):
253  """Constructor"""
254  FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
255  self,
256  fit_functions.norm,
257  3,
258  fit_functions.inverse_squared,
259  use_sigma_for_result_fitting=True)
260 
261 
263  """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
264 
265  def __init__(self):
266  """Constructor"""
267  FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
268  self,
269  fit_functions.landau,
270  3,
271  fit_functions.inverse_squared,
272  use_sigma_for_result_fitting=True)
273 
274 
276  """Train a neural network for dE/dx-based particle identification using only the highest values"""
277 
278  def __init__(self):
279  """Constructor"""
280  FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_squared, use_sigma_for_result_fitting=False)
281 
282  def train_function(fit_data):
283  """Train on the curated-data highest values whose truth value is known"""
284  max_value = self.use_only_the_highest_valuesuse_only_the_highest_values(fit_data, 1).p_bin_centers.values[0]
285 
286  return [None, [None, max_value, None]]
287 
288 
289  self.train_functiontrain_function = train_function
290 
291 
293  """Train a neural network for dE/dx-based particle identification using only the median values"""
294 
295  def __init__(self):
296  """Constructor"""
297  FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_squared, use_sigma_for_result_fitting=True)
298 
299  def train_function(fit_data):
300  """Train on the curated-data median values whose truth value is known"""
301  weighted_p_values = fit_data.apply(lambda data: [data.p_bin_centers] * int(data.number_of_p_values), axis=1).sum()
302  median_value = np.median(weighted_p_values)
303  iqr = np.percentile(weighted_p_values, 75) - np.percentile(weighted_p_values, 50)
304 
305  return [iqr, [None, median_value, None]]
306 
307 
308  self.train_functiontrain_function = train_function
309 
310 
312  """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
313 
314  def __init__(self):
315  """Constructor"""
316  FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
317  self,
318  fit_functions.norm,
319  3,
320  fit_functions.inverse_sqrt,
321  use_sigma_for_result_fitting=True)
322 
323 
325  """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
326 
327  def __init__(self):
328  """Constructor"""
329  FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
330  self,
331  fit_functions.landau,
332  3,
333  fit_functions.inverse_sqrt,
334  use_sigma_for_result_fitting=True)
335 
336 
338  """Train a neural network for dE/dx-based particle identification using only the highest values"""
339 
340  def __init__(self):
341  """Constructor"""
342  FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_sqrt, use_sigma_for_result_fitting=False)
343 
344  def train_function(fit_data):
345  """Train on the curated-data highest values whose truth value is known"""
346  max_value = self.use_only_the_highest_valuesuse_only_the_highest_values(fit_data, 1).p_bin_centers.values[0]
347 
348  return [None, [None, max_value, None]]
349 
350 
351  self.train_functiontrain_function = train_function
352 
353 
355  """Train a neural network for dE/dx-based particle identification using only the median values"""
356 
357  def __init__(self):
358  """Constructor"""
359  FittedGroupedDEDXEstimatorTrainer.__init__(self, fit_functions.inverse_sqrt, use_sigma_for_result_fitting=True)
360 
361  def train_function(fit_data):
362  """Train on the curated-data median values whose truth value is known"""
363  weighted_p_values = fit_data.apply(lambda data: [data.p_bin_centers] * int(data.number_of_p_values), axis=1).sum()
364  median_value = np.median(weighted_p_values)
365  iqr = np.percentile(weighted_p_values, 75) - np.percentile(weighted_p_values, 50)
366 
367  return [iqr, [None, median_value, None]]
368 
369 
370  self.train_functiontrain_function = train_function
371 
372 
374  """Train a neural network for dE/dx-based particle identification using multivariate data analysis"""
375 
376  def __init__(self):
377  """Constructor"""
378 
379 
380  self.treetree = tree.DecisionTreeRegressor()
381  DEDXEstimationTrainer.__init__(self)
382 
383  def train(self, data):
384  """Train the neural network using curated data"""
385 
386  train_data = data.copy()
387  del train_data["p"]
388 
389  p_values = data["p"]
390 
391  self.treetree.fit(train_data.values, p_values.values)
392 
393  def test(self, data):
394  """Get the trained neural-network output value for test data"""
395 
396  test_data = data.copy()
397  del test_data["p"]
398 
399  return self.treetree.predict(test_data.values)
def test(self, data)
Definition: train.py:34
def train(self, data)
Definition: train.py:29
dedx_estimator_function
by default, the dE/dx-particle-identification trainer has not run yet
Definition: train.py:25
dedx_column
the default data column is 'dedx'
Definition: train.py:27
result_parameters_for_each_dedx_bin
cached copy of the dictionary of fitting parameters for each dE/dx bin
Definition: train.py:101
result_function
cached copy of the result function
Definition: train.py:99
use_sigma_for_result_fitting
cached copy of the flag to add mean+/-sigma values to the output Dataframe
Definition: train.py:103
dedx_estimator_function
cached copies of the fit parameters and estimator function
Definition: train.py:152
def __init__(self, result_function, use_sigma_for_result_fitting)
Definition: train.py:95
fit_function
cached copy of the fitting function
Definition: train.py:202
def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting)
Definition: train.py:196
dimension_of_fit_function
cached value of the degrees of freedom in the fit
Definition: train.py:200
train_function
this class's training function
Definition: train.py:220
def create_p_bins(self, data)
Definition: train.py:61
def use_only_the_highest_values(self, data, number_of_values=None)
Definition: train.py:67
def fit_p_to_dedx_bin(self, dedx_bin)
Definition: train.py:86
def create_fit_data(self, dedx_bin)
Definition: train.py:74
def create_dedx_bins(self, data)
Definition: train.py:52
def test(self, data)
Definition: train.py:393
tree
cached copy of the MVA tool
Definition: train.py:380
def train(self, data)
Definition: train.py:383
train_function
this class's training function
Definition: train.py:351
train_function
this class's training function
Definition: train.py:289
train_function
this class's training function
Definition: train.py:370
train_function
this class's training function
Definition: train.py:308