SVDTimeNet_Learn.py
# ## Train and save the SVDTime Neural Network
#
# The SVDTimeNN is a MultilayerPerceptron estimator of SVD hit time.
# The truth data in this case are bin numbers in a series of time shift
# bins. The result of such a training is a distribution function for a
# time shift value. From this, it is easy to calculate the mean value and
# standard deviation, but also to do a range of approximate probabilistic
# calculations.
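#
# As a minimal sketch of the idea (illustrative, assuming per-bin
# probabilities p and bin centres t as numpy arrays):
#
#     t_fit = np.average(t, weights=p)
#     t_sigma = np.sqrt(np.average((t - t_fit)**2, weights=p))
#
# The fitFromProb function further below implements this calculation.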

# ##### Required Python packages
#
# The following Python packages are used:
# - math (basic Python math functions)
# - numpy (vectors and matrices for numerics)
# - pandas (Python analogue of Excel tables)
# - scipy (scientific computing package)
# - scikit-learn (machine learning)
#
# Only sklearn2pmml is missing in the current basf2 distribution.
# Install it with
#
# pip3 install --user git+https://github.com/jpmml/sklearn2pmml.git
#
# ##### Other pre-requisites:
#
# A sample of training data, plus binning and bounds information in pickle (*.pkl) files.
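#
# With the default --nsamples of 1000000, the expected file names are
# SVDTime_TrainingSample_1000000.pkl, SVDTime_TrainingBounds_1000000.pkl,
# and SVDTime_TrainingBins_1000000.pkl (cf. pkl_name below).
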
import pickle
import argparse
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from svd.SVDSimBase import dt, betaprime_wave, tau_encoder
from lxml import etree as ET

# ### Retrieve training sample

parser = argparse.ArgumentParser(description="Train the SVD hit time estimator")

parser.add_argument(
    '--nsamples',
    dest='n_samples',
    action='store',
    default=1000000,
    type=int,
    help='Size of the training sample to read')

args = parser.parse_args()

pkl_name = 'SVDTime_Training{0}_{1}.pkl'

print('Reading data...')

stockdata = pd.read_pickle(pkl_name.format('Sample', args.n_samples))
bounds = pd.read_pickle(pkl_name.format('Bounds', args.n_samples))
bins = pd.read_pickle(pkl_name.format('Bins', args.n_samples))

timearray = bins['midpoint']
timebins = np.unique(bins[['lower', 'upper']])

print('Done.')
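
# Quick sanity check of the inputs (the pickles are expected to hold pandas
# DataFrames with the columns used below):
print(f'Sample: {len(stockdata)} rows; time bins: {len(timearray)}, '
      f'covering [{timebins.min():.3f}, {timebins.max():.3f}]')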

# ### Split the data into training and test samples

test_fraction = 0.2
X = stockdata[['s' + str(i) for i in range(1, 7)] + ['normed_tau']]
Y = stockdata['t0_bin']
X_train = X[stockdata.test > test_fraction]
X_test = X[stockdata.test < test_fraction]
Y_train = Y[stockdata.test > test_fraction]
Y_test = Y[stockdata.test < test_fraction]

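# Optional check of the split (the 'test' column appears to hold a uniform
# random number per row, so the split is random with roughly the given fraction):
print(f'Training set: {len(X_train)} rows, test set: {len(X_test)} rows')
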
classifier = MLPClassifier(
    hidden_layer_sizes=(len(timearray) - 1, len(timearray) + 1),
    activation='relu',
    solver='adam',
    tol=1.0e-6,
    alpha=0.005,
    verbose=True
)
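
# The two hidden layers are sized to bracket the number of output classes
# (time bins). alpha is the L2 regularization strength of sklearn's
# MLPClassifier; tol sets the convergence tolerance of the adam solver.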

print('Fitting the neural network...')

nntime_fitter = PMMLPipeline([('classifier', classifier)])
nntime_fitter.fit(X_train, Y_train)

test_score = nntime_fitter.score(X_test, Y_test)
train_score = nntime_fitter.score(X_train, Y_train)
print(f'Test: {test_score}')
print(f'Train: {train_score}')
print('Fitting done.')

print('Writing output...')

pmml_name = 'SVDTimeNet.pmml'
xml_name = pmml_name.replace('pmml', 'xml')
sklearn2pmml(nntime_fitter, pmml_name, with_repr=True)
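
# sklearn2pmml serializes the fitted pipeline to PMML. The file is re-parsed
# with lxml below so that Belle II-specific metadata can be attached, and the
# annotated tree is then saved as SVDTimeNet.xml.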

parser = ET.XMLParser(remove_blank_text=True)
net = ET.parse(pmml_name, parser)
root = net.getroot()
# Namespace hassle: PMML elements live in a default namespace, so every
# tag lookup needs the '{namespace}' prefix.
namespace = root.nsmap[None]
nsprefix = '{' + namespace + '}'
procinfo = root.find(nsprefix + 'MiningBuildTask')

# Save some metadata
name = ET.SubElement(procinfo, nsprefix + 'Title')
name.text = 'Neural network for time shift estimation'

# Information on use of the classifier
target = ET.SubElement(procinfo, nsprefix + 'IntendedUse')
basf2proc = ET.SubElement(target, nsprefix + 'basf2process')
basf2simulation = ET.SubElement(basf2proc, nsprefix + 'Simulation')
basf2simulation.text = 'yes'
basf2reconstruction = ET.SubElement(basf2proc, nsprefix + 'Reconstruction')
basf2reconstruction.text = 'yes'
sensorType = ET.SubElement(target, nsprefix + 'SensorType')
sensorType.text = 'all'
sensorSide = ET.SubElement(target, nsprefix + 'SensorSide')
sensorSide.text = 'all'

# Information on training
training = ET.SubElement(procinfo, nsprefix + 'Training')
source = ET.SubElement(training, nsprefix + 'SampleSource')
source.text = 'Toy simulation'
genfunc = ET.SubElement(training, nsprefix + 'Waveform')
genfunc.text = 'beta-prime'
num_samples = ET.SubElement(training, nsprefix + 'SampleSize')
train_samples = ET.SubElement(num_samples, nsprefix + 'Training', {'n': str(int((1 - test_fraction) * args.n_samples))})
test_samples = ET.SubElement(num_samples, nsprefix + 'Test', {'n': str(int(test_fraction * args.n_samples))})
bounds.apply(
    lambda row: ET.SubElement(training, nsprefix + 'Parameter', **{u: str(v) for u, v in row.items()}), axis=1
)

netparams = ET.SubElement(procinfo, nsprefix + 'NetworkParameters')
inputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
inputLayer.attrib['number'] = str(0)
inputLayer.attrib['kind'] = 'input'
inputLayer.attrib['size'] = str(7)  # 7 as in 6 APV samples + tau
n_hidden = len(classifier.hidden_layer_sizes)
for (iLayer, sz) in zip(range(1, 1 + n_hidden), classifier.hidden_layer_sizes):
    layer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
    layer.attrib['number'] = str(iLayer)
    layer.attrib['kind'] = 'hidden'
    layer.attrib['size'] = str(sz)
outputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
outputLayer.attrib['number'] = str(n_hidden + 1)
outputLayer.attrib['kind'] = 'output'
outputLayer.attrib['size'] = str(len(timearray))
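
# For reference, the annotated extension block ends up looking roughly like
# this (illustrative, values depend on the training run):
#
#   <MiningBuildTask>
#     <Title>Neural network for time shift estimation</Title>
#     <IntendedUse>...</IntendedUse>
#     <Training>
#       <SampleSource>Toy simulation</SampleSource>
#       <Waveform>beta-prime</Waveform>
#       <SampleSize>...</SampleSize>
#       <Parameter .../>
#     </Training>
#     <NetworkParameters>
#       <NetworkLayer number="0" kind="input" size="7"/>
#       <NetworkLayer number="1" kind="hidden" size="..."/>
#       <NetworkLayer number="2" kind="hidden" size="..."/>
#       <NetworkLayer number="3" kind="output" size="..."/>
#     </NetworkParameters>
#   </MiningBuildTask>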

for field in root.find(nsprefix + 'DataDictionary'):
    if field.attrib['name'] == 't0_bin':
        for child in field:
            i = int(child.attrib['value'])
            child.attrib['lower'] = f'{bins.loc[i, "lower"]:.3f}'
            child.attrib['upper'] = f'{bins.loc[i, "upper"]:.3f}'
            child.attrib['midpoint'] = f'{bins.loc[i, "midpoint"]:.3f}'
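
# Each category of the 't0_bin' target now carries its bin edges and centre,
# so consumers of the XML can map class probabilities back to time values.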

net.write(xml_name, xml_declaration=True, pretty_print=True, encoding='utf-8')

print('Output saved.')

print('Saving fits...')

# #### Set up tau en/decoder
amp_index = bounds[bounds.value == 'amplitude'].index[0]
amp_range = (bounds.loc[amp_index, 'low'], bounds.loc[amp_index, 'high'])
tau_index = bounds[bounds.value == 'tau'].index[0]
tau_range = (bounds.loc[tau_index, 'low'], bounds.loc[tau_index, 'high'])
coder = tau_encoder(amp_range, tau_range)
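
# The coder maps the normalized tau ('normed_tau') used as network input
# back to physical units via coder.decode(); this is used in the waveform
# fit below.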

# #### True values

Trues_test = stockdata[stockdata.test < test_fraction][['t0', 'amplitude', 'tau', 't0_bin', 'abin']]

# #### Predicted probabilities

probs = nntime_fitter.predict_proba(X_test)
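
# probs is an (n_test x n_bins) array of per-bin probabilities; each row is
# converted into a time shift estimate and an amplitude fit below.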

# ### Calculate time shifts and amplitudes from probabilities


def fitFromProb(fw, signals, p, tau, timearray):
    """Estimate time shift and amplitude from the network's bin probabilities."""
    t_fit = np.average(timearray, weights=p)
    t_sigma = np.sqrt(np.average((timearray - t_fit)**2, weights=p))
    weights = fw(-t_fit + np.linspace(-dt, 4 * dt, 6, endpoint=True), tau=tau)
    weights[signals.values == 0.0] = 0.0
    norm = 1.0 / np.inner(weights, weights)
    a_fit = np.inner(signals, weights) * norm
    a_sigma = np.sqrt(norm)
    residuals = signals - a_fit * weights
    # Degrees of freedom: non-zero samples minus the 2 fitted parameters
    # (can't be less than 1)
    ndf = np.sum(np.ones_like(signals[signals > 0])) - 2
    chi2_ndf = np.inner(residuals, residuals) / ndf
    return pd.Series({
        't_fit': t_fit,
        't_sigma': t_sigma,
        'a_fit': a_fit,
        'a_sigma': a_sigma,
        'chi2_ndf': chi2_ndf
    })
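
# Note on the amplitude fit above: with the waveform shape w fixed by t_fit
# and tau, minimizing sum_i (s_i - a * w_i)**2 over the single parameter a
# gives
#
#     a_fit = (s . w) / (w . w),    a_sigma = sqrt(1 / (w . w)),
#
# where a_sigma corresponds to unit-variance noise on the normalized samples.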

probdf = pd.DataFrame(probs)
probdf.index = X_test.index
probdf.to_pickle(f'SVDTime_TrainingProbs_{args.n_samples}.pkl')

fits = X_test.apply(
    lambda row: fitFromProb(
        betaprime_wave,
        row[['s' + str(i) for i in range(1, 7)]],
        probdf.loc[row.name],
        coder.decode(row['normed_tau']),
        timearray),
    axis=1
)
fits['t_true'] = Trues_test['t0']
fits['tau'] = Trues_test['tau']
fits['a_true'] = Trues_test['amplitude']
fits['t_bin'] = Trues_test['t0_bin']
fits['a_bin'] = Trues_test['abin']

fits.to_pickle(f'SVDTime_TrainingFits_{args.n_samples}.pkl')

print('Writing classifier...')

with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

with open('classifier.txt', 'w') as cdump:
    cdump.write("Classifier coefficients:\n")
    for iLayer in range(len(classifier.coefs_)):
        cdump.write(f'Layer: {iLayer}\n')
        nrows = classifier.coefs_[iLayer].shape[0]
        ncols = classifier.coefs_[iLayer].shape[1]
        cdump.write('Weights:\n')
        for col in range(ncols):
            s = " ".join([str(classifier.coefs_[iLayer][row, col]) for row in range(nrows)])
            s += "\n"
            cdump.write(s)
        # intercepts_[iLayer] has ncols entries, one per output unit of this layer
        cdump.write('Intercepts:\n')
        s = " ".join([str(classifier.intercepts_[iLayer][col]) for col in range(ncols)])
        s += "\n"
        cdump.write(s)

print("Learning phase completed.")