Belle II Software  release-05-02-19
SVDTimeNet_Learn.py

# coding: utf-8

# ## Train and save the SVDTime Neural Network
#
# The SVDTimeNN is a MultilayerPerceptron estimator of the time shift of an
# SVD strip signal, computed from the six APV samples and the normalized
# waveform width tau. The truth data in this case are bin numbers in a series
# of time shift bins. The result of such a training is a distribution
# function for a time shift value. From this distribution, it is easy to
# calculate the mean value and standard deviation, but also to do a range of
# approximate probabilistic calculations.
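#
# For a predicted probability vector p over the bin midpoints t, the estimate
# and its uncertainty follow directly (a sketch; the actual computation is
# done in fitFromProb below):
#
#   t_fit = np.average(t, weights=p)
#   t_sigma = np.sqrt(np.average((t - t_fit)**2, weights=p))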

# ##### Required Python packages
#
# The following Python packages are used:
# - math (basic Python math functions)
# - numpy (vectors and matrices for numerics)
# - pandas (Python analogue of Excel tables)
# - scipy (scientific computing package)
# - scikit-learn (machine learning)
#
# Only sklearn2pmml is missing in the current basf2 distribution.
# Install it with
#
# pip3 install --user git+https://github.com/jpmml/sklearn2pmml.git
#
# ##### Other prerequisites:
#
# A sample of training data, plus binning and bounds information, in pickle (*.pkl) files.

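# For the default --nsamples of 1000000, the pkl_name template used below
# resolves to the following input files:
#
#   SVDTime_TrainingSample_1000000.pkl
#   SVDTime_TrainingBounds_1000000.pkl
#   SVDTime_TrainingBins_1000000.pkl
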
import math
import datetime
import pickle
import numpy as np
import pandas as pd
from scipy import stats as stats
from scipy.optimize import minimize_scalar
from sklearn.neural_network import MLPClassifier
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from svd.SVDSimBase import *
from lxml import etree as ET
import argparse

# ### Retrieve training sample

parser = argparse.ArgumentParser(description="Train the SVD hit time estimator")

parser.add_argument(
    '--nsamples',
    dest='n_samples',
    action='store',
    default=1000000,
    type=int,
    help='Size of the training sample (number of waveforms)')

args = parser.parse_args()

pkl_name = 'SVDTime_Training{0}_{1}.pkl'

print('Reading data...')

stockdata = pd.read_pickle(pkl_name.format('Sample', args.n_samples))
bounds = pd.read_pickle(pkl_name.format('Bounds', args.n_samples))
bins = pd.read_pickle(pkl_name.format('Bins', args.n_samples))

timearray = bins['midpoint']
timebins = np.unique(bins[['lower', 'upper']])

print('Done.')

# ### Split the data into training and test samples

test_fraction = 0.2
X = stockdata[['s' + str(i) for i in range(1, 7)] + ['normed_tau']]
Y = stockdata['t0_bin']
X_train = X[stockdata.test > test_fraction]
X_test = X[stockdata.test < test_fraction]
Y_train = Y[stockdata.test > test_fraction]
Y_test = Y[stockdata.test < test_fraction]

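# Note: the 'test' column is assumed to hold a per-row uniform random number
# in [0, 1), assigned when the sample was generated, so the two cuts select
# disjoint ~80% training and ~20% test subsets.
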
classifier = MLPClassifier(
    hidden_layer_sizes=(len(timearray) - 1, len(timearray) + 1),
    activation='relu',
    solver='adam',
    tol=1.0e-6,
    alpha=0.005,
    verbose=True
)

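# The hidden layer sizes bracket the number of output classes (the
# len(timearray) time-shift bins); this appears to be a heuristic choice.
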
print('Fitting the neural network...')

nntime_fitter = PMMLPipeline([('classifier', classifier)])
nntime_fitter.fit(X_train, Y_train)

test_score = nntime_fitter.score(X_test, Y_test)
train_score = nntime_fitter.score(X_train, Y_train)
print('Test: {}'.format(test_score))
print('Train: {}'.format(train_score))
print('Fitting done.')

print('Writing output...')

pmml_name = 'SVDTimeNet.pmml'
xml_name = pmml_name.replace('pmml', 'xml')
sklearn2pmml(nntime_fitter, pmml_name, with_repr=True)

parser = ET.XMLParser(remove_blank_text=True)
net = ET.parse(pmml_name, parser)
root = net.getroot()
# namespace hassle
namespace = root.nsmap[None]
nsprefix = '{' + namespace + '}'
procinfo = root.find(nsprefix + 'MiningBuildTask')

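# For example, with PMML 4.3 the default namespace is
# 'http://www.dmg.org/PMML-4_3' (the exact URI depends on the sklearn2pmml
# version), so nsprefix + 'MiningBuildTask' expands to
# '{http://www.dmg.org/PMML-4_3}MiningBuildTask'.
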
# Save some metadata
name = ET.SubElement(procinfo, nsprefix + 'Title')
name.text = 'Neural network for time shift estimation'

# Information on use of the classifier
target = ET.SubElement(procinfo, nsprefix + 'IntendedUse')
basf2proc = ET.SubElement(target, nsprefix + 'basf2process')
basf2simulation = ET.SubElement(basf2proc, nsprefix + 'Simulation')
basf2simulation.text = 'yes'
basf2reconstruction = ET.SubElement(basf2proc, nsprefix + 'Reconstruction')
basf2reconstruction.text = 'yes'
sensorType = ET.SubElement(target, nsprefix + 'SensorType')
sensorType.text = 'all'
sensorSide = ET.SubElement(target, nsprefix + 'SensorSide')
sensorSide.text = 'all'

# Information on training
training = ET.SubElement(procinfo, nsprefix + 'Training')
source = ET.SubElement(training, nsprefix + 'SampleSource')
source.text = 'Toy simulation'
genfunc = ET.SubElement(training, nsprefix + 'Waveform')
genfunc.text = 'beta-prime'
num_samples = ET.SubElement(training, nsprefix + 'SampleSize')
train_samples = ET.SubElement(num_samples, nsprefix + 'Training', {'n': str(int((1 - test_fraction) * args.n_samples))})
test_samples = ET.SubElement(num_samples, nsprefix + 'Test', {'n': str(int(test_fraction * args.n_samples))})
bounds.apply(
    lambda row: ET.SubElement(training, nsprefix + 'Parameter', **{u: str(v) for u, v in row.items()}), axis=1
)

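# The resulting <Training> element looks roughly like this (attribute names
# and values are illustrative, taken from the bounds table):
#
#   <Training>
#     <SampleSource>Toy simulation</SampleSource>
#     <Waveform>beta-prime</Waveform>
#     <SampleSize>
#       <Training n="800000"/>
#       <Test n="200000"/>
#     </SampleSize>
#     <Parameter value="amplitude" low="..." high="..."/>
#     ...
#   </Training>
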
netparams = ET.SubElement(procinfo, nsprefix + 'NetworkParameters')
inputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
inputLayer.attrib['number'] = str(0)
inputLayer.attrib['kind'] = 'input'
inputLayer.attrib['size'] = str(7)  # 7 as in 6 APV samples + tau
n_hidden = len(classifier.hidden_layer_sizes)
for (iLayer, sz) in zip(range(1, 1 + n_hidden), classifier.hidden_layer_sizes):
    layer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
    layer.attrib['number'] = str(iLayer)
    layer.attrib['kind'] = 'hidden'
    layer.attrib['size'] = str(sz)
outputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
outputLayer.attrib['number'] = str(n_hidden + 1)
outputLayer.attrib['kind'] = 'output'
outputLayer.attrib['size'] = str(len(timearray))

for field in root.find(nsprefix + 'DataDictionary'):
    if field.attrib['name'] == 't0_bin':
        for child in field:
            i = int(child.attrib['value'])
            child.attrib['lower'] = '{0:.3f}'.format(bins.loc[i, 'lower'])
            child.attrib['upper'] = '{0:.3f}'.format(bins.loc[i, 'upper'])
            child.attrib['midpoint'] = '{0:.3f}'.format(bins.loc[i, 'midpoint'])

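# Each <Value> of the t0_bin field now carries its bin edges as attributes,
# e.g. (numbers illustrative):
#
#   <Value value="3" lower="-20.000" upper="-17.500" midpoint="-18.750"/>
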
net.write(xml_name, xml_declaration=True, pretty_print=True, encoding='utf-8')

print('Output saved.')

print('Saving fits...')

# #### Set up tau en/decoder
amp_index = bounds[bounds.value == 'amplitude'].index[0]
amp_range = (bounds.loc[amp_index, 'low'], bounds.loc[amp_index, 'high'])
tau_index = bounds[bounds.value == 'tau'].index[0]
tau_range = (bounds.loc[tau_index, 'low'], bounds.loc[tau_index, 'high'])
coder = tau_encoder(amp_range, tau_range)

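# The 'normed_tau' input stores tau mapped onto the amplitude range; coder
# undoes that mapping via decode(). Sketch of the intended round trip
# (assuming tau_encoder from SVDSimBase also provides the inverse encode()):
#
#   tau == coder.decode(coder.encode(tau))
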
# #### True values

Trues_test = stockdata[stockdata.test < test_fraction][['t0', 'amplitude', 'tau', 't0_bin', 'abin']]

# #### Predicted probabilities

probs = nntime_fitter.predict_proba(X_test)

# ### Calculate time shifts and amplitudes from probabilities


def fitFromProb(fw, signals, p, tau, timearray):
    # Mean and standard deviation of the time shift from the bin probabilities
    t_fit = np.average(timearray, weights=p)
    t_sigma = np.sqrt(np.average((timearray - t_fit)**2, weights=p))
    # Waveform values at the six APV sampling times, shifted by the fitted time
    weights = fw(-t_fit + np.linspace(-dt, 4 * dt, 6, endpoint=True), tau=tau)
    weights[signals.values == 0.0] = 0.0
    # Least-squares amplitude: a_fit minimizes |signals - a * weights|^2,
    # so a_fit = (signals . weights) / (weights . weights)
    norm = 1.0 / np.inner(weights, weights)
    a_fit = np.inner(signals, weights) * norm
    a_sigma = np.sqrt(norm)
    residuals = signals - a_fit * weights
    # Degrees of freedom: number of nonzero samples minus the two fitted parameters
    ndf = np.sum(np.ones_like(signals[signals > 0])) - 2
    chi2_ndf = np.inner(residuals, residuals) / ndf
    return pd.Series({
        't_fit': t_fit,
        't_sigma': t_sigma,
        'a_fit': a_fit,
        'a_sigma': a_sigma,
        'chi2_ndf': chi2_ndf
    })

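# Example: fit a single test waveform (a sketch; betaprime_wave and dt come
# from svd.SVDSimBase):
#
#   row = X_test.iloc[0]
#   single_fit = fitFromProb(betaprime_wave,
#                            row[['s' + str(i) for i in range(1, 7)]],
#                            probs[0],
#                            coder.decode(row['normed_tau']),
#                            timearray)
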
probdf = pd.DataFrame(probs)
probdf.index = X_test.index
probdf.to_pickle('SVDTime_TrainingProbs_{0}.pkl'.format(args.n_samples))

fits = X_test.apply(
    lambda row: fitFromProb(
        betaprime_wave,
        row[['s' + str(i) for i in range(1, 7)]],
        probdf.loc[row.name],
        coder.decode(row['normed_tau']),
        timearray),
    axis=1
)
fits['t_true'] = Trues_test['t0']
fits['tau'] = Trues_test['tau']
fits['a_true'] = Trues_test['amplitude']
fits['t_bin'] = Trues_test['t0_bin']
fits['a_bin'] = Trues_test['abin']

fits.to_pickle('SVDTime_TrainingFits_{0}.pkl'.format(args.n_samples))

print('Writing classifier...')

with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

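# The pickled classifier can be restored later with (sketch):
#
#   with open('classifier.pkl', 'rb') as f:
#       classifier = pickle.load(f)
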
with open('classifier.txt', 'w') as cdump:
    cdump.write("Classifier coefficients:\n")
    for iLayer in range(len(classifier.coefs_)):
        cdump.write('Layer: {0}\n'.format(iLayer))
        nrows = classifier.coefs_[iLayer].shape[0]
        ncols = classifier.coefs_[iLayer].shape[1]
        cdump.write('Weights:\n')
        for col in range(ncols):
            s = " ".join([str(classifier.coefs_[iLayer][row, col]) for row in range(nrows)])
            s += "\n"
            cdump.write(s)
        # intercepts_[iLayer] has ncols entries, one per unit of the next layer
        cdump.write('Intercepts:\n')
        s = " ".join([str(classifier.intercepts_[iLayer][col]) for col in range(ncols)])
        s += "\n"
        cdump.write(s)

print("Learning phase completed.")