Belle II Software  release-05-02-19
train.py
1 import pandas as pd
2 import numpy as np
3 
4 from . import fit_functions
5 from scipy.optimize import curve_fit
6 import matplotlib.pyplot as plt
7 
8 from sklearn import tree
9 
10 
12  """Train a neural network for dE/dx-based particle identification"""
13 
14  def __init__(self):
15  """Constructor"""
16 
18 
19  self.dedx_column = "dedx"
20 
def train(self, data):
    """Abstract training hook; concrete trainers have to override it.

    The base class has nothing to fit, so every call fails.

    Raises:
        NotImplementedError: always.
    """
    message = "Use this class as a base class only"
    raise NotImplementedError(message)
25 
26  def test(self, data):
27  """Get the trained neural-network output value for test data"""
28  if self.dedx_estimator_function is None:
29  raise ValueError("Train the estimator first!")
30 
31  return self.dedx_estimator_function(data[self.dedx_column])
32 
33 
35  """Train a neural network for dE/dx-based particle identification"""
36 
37 
38  number_of_bins_in_dedx = 20
39 
40  number_of_bins_in_p = 29
41 
42  number_of_head_values_used_to_fit = 20
43 
def create_dedx_bins(self, data):
    """Group ``data`` into equal-width dE/dx bins.

    The bin edges are evenly spaced between the smallest and the largest
    value of the ``self.dedx_column`` column. ``number_of_bins_in_dedx`` is
    the number of edges handed to ``np.linspace``, so one bin fewer is
    created.

    Returns:
        tuple: ``(grouped, dedx_bins)`` - ``data`` grouped by dE/dx bin and
        the array of bin edges.
    """
    dedx_values = data[self.dedx_column]

    # Read the setting through the instance so that a subclass (or a single
    # instance) can override it; by default this is still the class attribute.
    dedx_bins = np.linspace(dedx_values.min(), dedx_values.max(), self.number_of_bins_in_dedx)

    # pd.cut builds right-closed intervals. Because the first edge *is* the
    # minimum of the column, the row(s) holding that minimum would fall
    # outside every bin and be dropped by the groupby unless the lowest edge
    # is included explicitly.
    dedx_cuts = pd.cut(dedx_values, dedx_bins, include_lowest=True)
    return data.groupby(dedx_cuts), dedx_bins
52 
def create_p_bins(self, data):
    """Group ``data`` into equal-width momentum bins.

    The bin edges are evenly spaced between the smallest and the largest
    value of the ``p`` column. ``number_of_bins_in_p`` is the number of edges
    handed to ``np.linspace``, so one bin fewer is created.

    Returns:
        tuple: ``(grouped, p_bins)`` - ``data`` grouped by momentum bin and
        the array of bin edges.
    """
    p_values = data.p

    # Read the setting through the instance so that a subclass (or a single
    # instance) can override it; by default this is still the class attribute.
    p_bins = np.linspace(p_values.min(), p_values.max(), self.number_of_bins_in_p)

    # pd.cut builds right-closed intervals; without include_lowest the row(s)
    # equal to the first edge (the column minimum) would belong to no bin and
    # silently vanish from the grouped data.
    p_cuts = pd.cut(p_values, p_bins, include_lowest=True)
    return data.groupby(p_cuts), p_bins
58 
def use_only_the_highest_values(self, data, number_of_values=None):
    """Keep only the rows with the largest ``number_of_p_values``.

    Args:
        data: DataFrame with a ``number_of_p_values`` column.
        number_of_values: how many rows to keep; ``None`` keeps everything.

    Returns:
        ``data`` itself when ``number_of_values`` is ``None``; otherwise a new
        DataFrame holding the selected rows, ordered by their original index.
    """
    if number_of_values is None:
        return data

    # DataFrame.sort() no longer exists in pandas (removed in 0.20):
    # sort_values() is the by-column replacement, sort_index() the
    # replacement for the argument-less call that restored index order.
    most_populated = data.sort_values("number_of_p_values", ascending=False).head(number_of_values)
    return most_populated.sort_index()
65 
def create_fit_data(self, dedx_bin):
    """Build the momentum histogram of one dE/dx bin.

    The rows of ``dedx_bin`` are grouped into momentum bins via
    ``create_p_bins``. The result has one row per momentum bin with the
    columns ``number_of_p_values`` (count of ``p`` entries in the bin) and
    ``p_bin_centers`` (midpoint of the bin edges), reduced to the
    ``number_of_head_values_used_to_fit`` most populated bins.
    """
    grouped_by_p, p_edges = self.create_p_bins(dedx_bin)

    counts = pd.Series(grouped_by_p.count().p.values, name="number_of_p_values")

    lower_edges, upper_edges = p_edges[:-1], p_edges[1:]
    centers = pd.Series(0.5 * (lower_edges + upper_edges), name="p_bin_centers")

    # Each Series becomes a row first; transposing turns them into columns.
    histogram = pd.DataFrame([counts, centers]).T

    return self.use_only_the_highest_values(
        histogram, GroupedDEDXEstimationTrainer.number_of_head_values_used_to_fit)
77 
def fit_p_to_dedx_bin(self, dedx_bin):
    """Run ``self.train_function`` on the fit data of one dE/dx bin.

    The fit data is whatever ``create_fit_data`` produces for ``dedx_bin``;
    the return value of ``self.train_function`` is passed through unchanged.
    """
    return self.train_function(self.create_fit_data(dedx_bin))
82 
83 
85  """Train a neural network for dE/dx-based particle identification"""
86 
87  def __init__(self, result_function, use_sigma_for_result_fitting):
88  """Constructor"""
89 
90 
91  self.result_function = result_function
92 
94 
95  self.use_sigma_for_result_fitting = use_sigma_for_result_fitting
96 
97  GroupedDEDXEstimationTrainer.__init__(self)
98 
100  """Fit for the mean dE/dx and standard deviation, return the fit Dataframe"""
101  result_df = pd.DataFrame([{"dedx_bin_center": dedx_bin_center,
102  "mu": fit_parameters[1][1],
103  "sigma": fit_parameters[0]} for dedx_bin_center,
104  fit_parameters in self.result_parameters_for_each_dedx_bin.items()
105  if fit_parameters is not None])
106 
107  if len(result_df) == 0:
108  raise ValueError("Could not find any fitted parameters!")
109 
111  result_df["mu_plus_sigma"] = result_df.mu + result_df.sigma
112  result_df["mu_minus_sigma"] = result_df.mu - result_df.sigma
113 
114  result_df.sort("dedx_bin_center", inplace=True)
115 
116  return result_df
117 
119  """Define the parameters for the fit, assign initial guesses"""
120  result_df = self.create_result_dataframe()
121 
122  p0 = (7e+08, -4e+04, 0.1, 0)
123 
125  popt, pcov = curve_fit(self.result_function, result_df.dedx_bin_center, result_df.mu, p0=p0,
126  sigma=result_df.sigma, absolute_sigma=True)
127  else:
128  popt, pcov = curve_fit(self.result_function, result_df.dedx_bin_center, result_df.mu, p0=p0)
129 
130  return popt, lambda dedx: self.result_function(dedx, *popt)
131 
def train(self, data):
    """Fit every dE/dx bin of ``data`` and derive the estimator from the results.

    For each dE/dx bin the outcome of ``fit_p_to_dedx_bin`` is stored in
    ``self.result_parameters_for_each_dedx_bin`` under the bin's mean dE/dx.
    Afterwards the two values returned by ``fit_result_parameters`` are kept
    as ``self.dedx_estimator_parameters`` and ``self.dedx_estimator_function``.
    """
    grouped_by_dedx, _ = self.create_dedx_bins(data)

    def summarise_bin(bin_rows):
        # Single-entry mapping: mean dE/dx of the bin -> fit outcome of the bin.
        parameters = self.fit_p_to_dedx_bin(bin_rows)
        bin_center = bin_rows.mean()[self.dedx_column]
        return {bin_center: parameters}

    per_bin_results = grouped_by_dedx.apply(summarise_bin)
    for entry in per_bin_results:
        self.result_parameters_for_each_dedx_bin.update(entry)

    fitted = self.fit_result_parameters()
    self.dedx_estimator_parameters, self.dedx_estimator_function = fitted
145 
146  def plot_fit_result(self, data):
147  """Plot the fitted results"""
148  plot_dedx_data = np.linspace(data[self.dedx_column].min(), data[self.dedx_column].max(), 100)
149  result_df = self.create_result_dataframe()
150 
151  plt.plot(plot_dedx_data, self.dedx_estimator_function(plot_dedx_data), color="black", label="Fitted estimator")
153  color = "black"
154  plt.errorbar(result_df.dedx_bin_center, result_df.mu, marker="o", ls="", label="Data Points", yerr=result_df.sigma)
155 
156  plt.ylim(0, 0.14)
157  plt.xlabel("dEdX in ADC count/cm")
158  plt.ylabel("p in GeV/c")
159  plt.legend(frameon=True)
160 
def plot_grouped_result(self, data):
    """Draw the momentum histogram of every dE/dx bin as black dots.

    Each dE/dx bin contributes the points produced by ``create_fit_data``
    (entries versus momentum bin center) to the current matplotlib axes.
    """
    grouped_by_dedx, _ = self.create_dedx_bins(data)

    # Work-around kept from the original author for a pandas apply quirk:
    # presumably the callback can be handed the same group more than once,
    # so the bin centers already drawn are remembered and skipped.
    seen_centers = []

    def draw_bin(bin_rows):
        center = bin_rows.mean().values[0]
        if center in seen_centers:
            return True
        seen_centers.append(center)

        histogram = self.create_fit_data(bin_rows)
        plt.plot(histogram.p_bin_centers, histogram.number_of_p_values, ls="", marker=".", color="black")
        return True

    plt.xlabel("p in GeV/c")
    plt.ylabel("Entries")

    grouped_by_dedx.apply(draw_bin)
183 
184 
186  """Train a neural network for dE/dx-based particle identification"""
187 
def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting):
    """Constructor.

    Args:
        fit_function: callable ``f(p, *parameters)`` fitted with ``curve_fit``
            to the momentum histogram of each dE/dx bin.
        dimension_of_fit_function: number of free parameters of
            ``fit_function``; must be at least 3.
        result_function: forwarded to the parent constructor.
        use_sigma_for_result_fitting: forwarded to the parent constructor.

    Raises:
        ValueError: if ``dimension_of_fit_function`` is smaller than 3.
    """
    # Only the values 3 and 6 used to be handled; anything else left the
    # start vector undefined and crashed with UnboundLocalError at fit time.
    # Reject impossible values right away instead.
    if dimension_of_fit_function < 3:
        raise ValueError("dimension_of_fit_function must be at least 3, got %r" % (dimension_of_fit_function,))

    self.dimension_of_fit_function = dimension_of_fit_function

    self.fit_function = fit_function

    FittedGroupedDEDXEstimatorTrainer.__init__(self, result_function, use_sigma_for_result_fitting)

    def train_function(fit_data):
        """Fit ``self.fit_function`` to one momentum histogram.

        Returns a two-element list: the standard error of the second fit
        parameter (square root of its covariance-matrix diagonal entry) and
        the array of fitted parameters.
        """
        max_value = self.use_only_the_highest_values(fit_data, 1).p_bin_centers.values[0]

        # The second parameter starts at the center of the most populated
        # momentum bin. Parameters beyond the third start at 1, which
        # reproduces the former hard-coded start vectors for 3 and 6.
        extra_parameters = int(self.dimension_of_fit_function) - 3
        p0 = (1e3, max_value, 4e-2) + (1,) * extra_parameters

        popt, pcov = curve_fit(self.fit_function, fit_data.p_bin_centers, fit_data.number_of_p_values, p0=p0)

        return [np.sqrt(np.diag(pcov)[1]), popt]

    self.train_function = train_function
213 
def plot_grouped_result(self, data):
    """Draw the per-bin histograms and overlay the fitted curve of every dE/dx bin.

    The histograms come from the parent implementation. For each dE/dx bin
    the stored fit parameters are then looked up by the bin's mean dE/dx and
    ``self.fit_function`` is drawn on 1000 evenly spaced momentum values
    between the smallest and largest ``p`` in ``data``.
    """
    FittedGroupedDEDXEstimatorTrainer.plot_grouped_result(self, data)

    grouped_by_dedx, _ = self.create_dedx_bins(data)
    momentum_axis = np.linspace(data.p.min(), data.p.max(), 1000)

    # Work-around kept from the original author for a pandas apply quirk:
    # presumably the callback can be handed the same group more than once,
    # so the bin centers already drawn are remembered and skipped.
    seen_centers = []

    def draw_fit(bin_rows):
        center = bin_rows.mean().values[0]
        if center in seen_centers:
            return True

        stored = self.result_parameters_for_each_dedx_bin[bin_rows.mean()[self.dedx_column]]
        seen_centers.append(center)
        # First entry is not needed for drawing; second holds the fit parameters.
        _, fit_options = stored

        fitted_curve = self.fit_function(momentum_axis, *fit_options)
        plt.plot(momentum_axis, fitted_curve)
        return True

    grouped_by_dedx.apply(draw_fit)
239 
240 
242  """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
243 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.norm`` with 3 parameters for the per-bin fit and
    ``fit_functions.inverse_squared`` as result function, with sigma
    weighting of the result fit switched on.
    """
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.norm,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=True)
252 
253 
255  """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
256 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.landau`` with 3 parameters for the per-bin fit and
    ``fit_functions.inverse_squared`` as result function, with sigma
    weighting of the result fit switched on.
    """
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.landau,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=True)
265 
266 
268  """Train a neural network for dE/dx-based particle identification using only the highest values"""
269 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.inverse_squared`` as result function without sigma
    weighting and installs a training function that simply picks the most
    populated momentum bin.
    """
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=False)

    def train_function(fit_data):
        """Return ``[None, [None, center, None]]`` for the most populated momentum bin."""
        top_row = self.use_only_the_highest_values(fit_data, 1)
        most_populated_center = top_row.p_bin_centers.values[0]

        # No uncertainty is available; only the middle slot carries a value.
        return [None, [None, most_populated_center, None]]

    self.train_function = train_function
282 
283 
285  """Train a neural network for dE/dx-based particle identification using only the median values"""
286 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.inverse_squared`` as result function with sigma
    weighting and installs a training function based on the median of the
    momentum histogram.
    """
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        result_function=fit_functions.inverse_squared,
        use_sigma_for_result_fitting=True)

    def train_function(fit_data):
        """Return ``[spread, [None, median, None]]`` of the momentum histogram.

        ``spread`` is the 75th minus the 50th percentile of the expanded
        values (the original code calls this quantity ``iqr``).
        """
        def repeat_center(row):
            # One copy of the bin center per entry counted in that bin.
            return [row.p_bin_centers] * int(row.number_of_p_values)

        # Summing the per-row lists concatenates them into one flat list.
        weighted_p_values = fit_data.apply(repeat_center, axis=1).sum()

        median_value = np.median(weighted_p_values)
        upper_quartile = np.percentile(weighted_p_values, 75)
        middle = np.percentile(weighted_p_values, 50)
        spread = upper_quartile - middle

        return [spread, [None, median_value, None]]

    self.train_function = train_function
301 
302 
304  """Train a neural network for dE/dx-based particle identification using a Gaussian estimator"""
305 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.norm`` with 3 parameters for the per-bin fit and
    ``fit_functions.inverse_sqrt`` as result function, with sigma weighting
    of the result fit switched on.
    """
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.norm,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=True)
314 
315 
317  """Train a neural network for dE/dx-based particle identification using a Landau estimator"""
318 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.landau`` with 3 parameters for the per-bin fit and
    ``fit_functions.inverse_sqrt`` as result function, with sigma weighting
    of the result fit switched on.
    """
    FunctionFittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        fit_function=fit_functions.landau,
        dimension_of_fit_function=3,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=True)
327 
328 
330  """Train a neural network for dE/dx-based particle identification using only the highest values"""
331 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.inverse_sqrt`` as result function without sigma
    weighting and installs a training function that simply picks the most
    populated momentum bin.
    """
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=False)

    def train_function(fit_data):
        """Return ``[None, [None, center, None]]`` for the most populated momentum bin."""
        top_row = self.use_only_the_highest_values(fit_data, 1)
        most_populated_center = top_row.p_bin_centers.values[0]

        # No uncertainty is available; only the middle slot carries a value.
        return [None, [None, most_populated_center, None]]

    self.train_function = train_function
344 
345 
347  """Train a neural network for dE/dx-based particle identification using only the median values"""
348 
def __init__(self):
    """Constructor.

    Uses ``fit_functions.inverse_sqrt`` as result function with sigma
    weighting and installs a training function based on the median of the
    momentum histogram.
    """
    FittedGroupedDEDXEstimatorTrainer.__init__(
        self,
        result_function=fit_functions.inverse_sqrt,
        use_sigma_for_result_fitting=True)

    def train_function(fit_data):
        """Return ``[spread, [None, median, None]]`` of the momentum histogram.

        ``spread`` is the 75th minus the 50th percentile of the expanded
        values (the original code calls this quantity ``iqr``).
        """
        def repeat_center(row):
            # One copy of the bin center per entry counted in that bin.
            return [row.p_bin_centers] * int(row.number_of_p_values)

        # Summing the per-row lists concatenates them into one flat list.
        weighted_p_values = fit_data.apply(repeat_center, axis=1).sum()

        median_value = np.median(weighted_p_values)
        upper_quartile = np.percentile(weighted_p_values, 75)
        middle = np.percentile(weighted_p_values, 50)
        spread = upper_quartile - middle

        return [spread, [None, median_value, None]]

    self.train_function = train_function
363 
364 
366  """Train a neural network for dE/dx-based particle identification using multivariate data analysis"""
367 
def __init__(self):
    """Constructor.

    Creates the ``DecisionTreeRegressor`` stored as ``self.tree`` and then
    runs the base-class constructor.
    """
    regressor = tree.DecisionTreeRegressor()
    self.tree = regressor

    DEDXEstimationTrainer.__init__(self)
374 
def train(self, data):
    """Fit ``self.tree`` to predict the ``p`` column from all other columns.

    ``data`` itself is left untouched; the feature matrix is taken from a
    copy without the ``p`` column.
    """
    # drop() returns a new frame, so the caller's data keeps its "p" column.
    features = data.drop("p", axis=1)
    targets = data["p"]

    self.tree.fit(features.values, targets.values)
384 
385  def test(self, data):
386  """Get the trained neural-network output value for test data"""
387 
388  test_data = data.copy()
389  del test_data["p"]
390 
391  return self.tree.predict(test_data.values)
train.FittedGroupedDEDXEstimatorTrainer
Definition: train.py:84
train.FunctionFittedGroupedDEDXEstimatorTrainer.train_function
train_function
this class's training function
Definition: train.py:212
train.DEDXEstimationTrainer.dedx_estimator_function
dedx_estimator_function
by default, the dE/dx-particle-identification trainer has not run yet
Definition: train.py:17
train.DEDXEstimationTrainer
Definition: train.py:11
train.LandauEstimatorTrainer.__init__
def __init__(self)
Definition: train.py:257
train.FittedGroupedDEDXEstimatorTrainer.plot_grouped_result
def plot_grouped_result(self, data)
Definition: train.py:161
train.FittedGroupedDEDXEstimatorTrainer.result_function
result_function
cached copy of the result function
Definition: train.py:91
train.MaximumEstimatorTrainer.train_function
train_function
this class's training function
Definition: train.py:281
train.MaximumEstimatorTrainerSQRT.__init__
def __init__(self)
Definition: train.py:332
train.GroupedDEDXEstimationTrainer.use_only_the_highest_values
def use_only_the_highest_values(self, data, number_of_values=None)
Definition: train.py:59
train.FittedGroupedDEDXEstimatorTrainer.use_sigma_for_result_fitting
use_sigma_for_result_fitting
cached copy of the flag to add mean+/-sigma values to the output Dataframe
Definition: train.py:95
train.FunctionFittedGroupedDEDXEstimatorTrainer
Definition: train.py:185
train.GroupedDEDXEstimationTrainer
Definition: train.py:34
train.FittedGroupedDEDXEstimatorTrainer.result_parameters_for_each_dedx_bin
result_parameters_for_each_dedx_bin
cached copy of the dictionary of fitting parameters for each dE/dx bin
Definition: train.py:93
train.MedianEstimatorTrainer
Definition: train.py:284
train.GroupedDEDXEstimationTrainer.create_p_bins
def create_p_bins(self, data)
Definition: train.py:53
train.MaximumEstimatorTrainerSQRT.train_function
train_function
this class's training function
Definition: train.py:343
train.MVADEDXEstimationTrainer
Definition: train.py:365
train.DEDXEstimationTrainer.__init__
def __init__(self)
Definition: train.py:14
train.MVADEDXEstimationTrainer.__init__
def __init__(self)
Definition: train.py:368
train.MedianEstimatorTrainer.__init__
def __init__(self)
Definition: train.py:287
train.MVADEDXEstimationTrainer.train
def train(self, data)
Definition: train.py:375
train.FunctionFittedGroupedDEDXEstimatorTrainer.__init__
def __init__(self, fit_function, dimension_of_fit_function, result_function, use_sigma_for_result_fitting)
Definition: train.py:188
train.LandauEstimatorTrainerSQRT
Definition: train.py:316
train.FittedGroupedDEDXEstimatorTrainer.train
def train(self, data)
Definition: train.py:132
train.FittedGroupedDEDXEstimatorTrainer.create_result_dataframe
def create_result_dataframe(self)
Definition: train.py:99
train.MaximumEstimatorTrainer.__init__
def __init__(self)
Definition: train.py:270
train.MedianEstimatorTrainerSQRT.train_function
train_function
this class's training function
Definition: train.py:362
train.FittedGroupedDEDXEstimatorTrainer.plot_fit_result
def plot_fit_result(self, data)
Definition: train.py:146
train.MVADEDXEstimationTrainer.tree
tree
cached copy of the MVA tool
Definition: train.py:372
train.LandauEstimatorTrainerSQRT.__init__
def __init__(self)
Definition: train.py:319
train.FunctionFittedGroupedDEDXEstimatorTrainer.plot_grouped_result
def plot_grouped_result(self, data)
Definition: train.py:214
train.DEDXEstimationTrainer.test
def test(self, data)
Definition: train.py:26
train.GaussianEstimatorTrainer
Definition: train.py:241
train.GaussianEstimatorTrainerSQRT.__init__
def __init__(self)
Definition: train.py:306
train.GroupedDEDXEstimationTrainer.create_dedx_bins
def create_dedx_bins(self, data)
Definition: train.py:44
train.GaussianEstimatorTrainerSQRT
Definition: train.py:303
train.DEDXEstimationTrainer.train
def train(self, data)
Definition: train.py:21
train.FunctionFittedGroupedDEDXEstimatorTrainer.fit_function
fit_function
cached copy of the fitting function
Definition: train.py:194
train.MedianEstimatorTrainer.train_function
train_function
this class's training function
Definition: train.py:300
train.FittedGroupedDEDXEstimatorTrainer.fit_result_parameters
def fit_result_parameters(self)
Definition: train.py:118
train.DEDXEstimationTrainer.dedx_column
dedx_column
the default data column is 'dedx'
Definition: train.py:19
train.GaussianEstimatorTrainer.__init__
def __init__(self)
Definition: train.py:244
train.MedianEstimatorTrainerSQRT
Definition: train.py:346
train.MVADEDXEstimationTrainer.test
def test(self, data)
Definition: train.py:385
train.MedianEstimatorTrainerSQRT.__init__
def __init__(self)
Definition: train.py:349
train.GroupedDEDXEstimationTrainer.create_fit_data
def create_fit_data(self, dedx_bin)
Definition: train.py:66
train.FittedGroupedDEDXEstimatorTrainer.__init__
def __init__(self, result_function, use_sigma_for_result_fitting)
Definition: train.py:87
train.MaximumEstimatorTrainer
Definition: train.py:267
train.MaximumEstimatorTrainerSQRT
Definition: train.py:329
train.GroupedDEDXEstimationTrainer.fit_p_to_dedx_bin
def fit_p_to_dedx_bin(self, dedx_bin)
Definition: train.py:78
train.FunctionFittedGroupedDEDXEstimatorTrainer.dimension_of_fit_function
dimension_of_fit_function
cached value of the degrees of freedom in the fit
Definition: train.py:192
train.LandauEstimatorTrainer
Definition: train.py:254