from scipy import stats
from scipy.optimize import minimize_scalar
from sklearn.neural_network import MLPClassifier
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from lxml import etree as ET
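# Names used below but not defined in this excerpt: np (numpy), pd (pandas),
# argparse, pickle, the helpers tau_encoder and betaprime_wave, and the
# constants dt and test_fraction. They are presumably imported or set in the
# elided top part of the script (the waveform helpers likely come from the
# SVD toy-simulation utilities).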
parser = argparse.ArgumentParser(description="Train the SVD hit time estimator")
# (The add_argument call below is partly reconstructed: only its help string
# survived in this excerpt. An option storing args.n_samples is assumed,
# since args.n_samples is used throughout the script.)
parser.add_argument(
    '--nsamples',
    dest='n_samples',
    action='store',
    type=int,
    help='Global tag to use at central DB in PNNL'
)

args = parser.parse_args()
pkl_name = 'SVDTime_Training{0}_{1}.pkl'

print('Reading data...')

stockdata = pd.read_pickle(pkl_name.format('Sample', args.n_samples))
bounds = pd.read_pickle(pkl_name.format('Bounds', args.n_samples))
bins = pd.read_pickle(pkl_name.format('Bins', args.n_samples))

timearray = bins['midpoint']
timebins = np.unique(bins[['lower', 'upper']])
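# Features are the six strip samples s1..s6 plus the encoded waveform decay
# time; the label is the discretized t0 bin. The 'test' column (presumably a
# uniform random draw) is compared with test_fraction, defined in an elided
# part of the script, to split the sample into train and test sets.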
X = stockdata[['s' + str(i) for i in range(1, 7)] + ['normed_tau']]
Y = stockdata['t0_bin']
X_train = X[stockdata.test > test_fraction]
X_test = X[stockdata.test < test_fraction]
Y_train = Y[stockdata.test > test_fraction]
Y_test = Y[stockdata.test < test_fraction]
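# One output class per t0 bin; the two hidden layers are sized just below
# and just above the number of bins.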
classifier = MLPClassifier(
    hidden_layer_sizes=(len(timearray) - 1, len(timearray) + 1),
    # (further constructor arguments elided in the original)
)
print('Fitting the neural network...')

nntime_fitter = PMMLPipeline([('classifier', classifier)])
nntime_fitter.fit(X_train, Y_train)

test_score = nntime_fitter.score(X_test, Y_test)
train_score = nntime_fitter.score(X_train, Y_train)
print('Test: {}'.format(test_score))
print('Train: {}'.format(train_score))
print('Fitting done.')
print('Writing output...')

pmml_name = 'SVDTimeNet.pmml'
xml_name = pmml_name.replace('pmml', 'xml')
sklearn2pmml(nntime_fitter, pmml_name, with_repr=True)

parser = ET.XMLParser(remove_blank_text=True)
net = ET.parse(pmml_name, parser)
root = net.getroot()  # (elided in the original excerpt; root is used below)
namespace = root.nsmap[None]
nsprefix = '{' + namespace + '}'
procinfo = root.find(nsprefix + 'MiningBuildTask')
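# Annotate the PMML file with training metadata; the new elements are created
# with the document's default namespace so they match the rest of the file.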
name = ET.SubElement(procinfo, nsprefix + 'Title')
name.text = 'Neural network for time shift estimation'

target = ET.SubElement(procinfo, nsprefix + 'IntendedUse')
basf2proc = ET.SubElement(target, nsprefix + 'basf2process')
basf2simulation = ET.SubElement(basf2proc, nsprefix + 'Simulation')
basf2simulation.text = 'yes'
basf2reconstruction = ET.SubElement(basf2proc, nsprefix + 'Reconstruction')
basf2reconstruction.text = 'yes'
sensorType = ET.SubElement(target, nsprefix + 'SensorType')
sensorType.text = 'all'
sensorSide = ET.SubElement(target, nsprefix + 'SensorSide')
sensorSide.text = 'all'
training = ET.SubElement(procinfo, nsprefix + 'Training')
source = ET.SubElement(training, nsprefix + 'SampleSource')
source.text = 'Toy simulation'
genfunc = ET.SubElement(training, nsprefix + 'Waveform')
genfunc.text = 'beta-prime'
num_samples = ET.SubElement(training, nsprefix + 'SampleSize')
train_samples = ET.SubElement(num_samples, nsprefix + 'Training',
                              {'n': str(int((1 - test_fraction) * args.n_samples))})
test_samples = ET.SubElement(num_samples, nsprefix + 'Test',
                             {'n': str(int(test_fraction * args.n_samples))})
# One <Parameter> element per row of the bounds table. Only the lambda
# survived in this excerpt; the enclosing bounds.apply(...) call is
# reconstructed here, as its axis=1 argument implies a row-wise apply.
bounds.apply(
    lambda row: ET.SubElement(training, nsprefix + 'Parameter',
                              **{u: str(v) for u, v in row.items()}),
    axis=1)
netparams = ET.SubElement(procinfo, nsprefix + 'NetworkParameters')
inputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
inputLayer.attrib['number'] = str(0)
inputLayer.attrib['kind'] = 'input'
inputLayer.attrib['size'] = str(7)
n_hidden = len(classifier.hidden_layer_sizes)
for (iLayer, sz) in zip(range(1, 1 + n_hidden), classifier.hidden_layer_sizes):
    layer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
    layer.attrib['number'] = str(iLayer)
    layer.attrib['kind'] = 'hidden'
    layer.attrib['size'] = str(sz)
outputLayer = ET.SubElement(netparams, nsprefix + 'NetworkLayer')
outputLayer.attrib['number'] = str(n_hidden + 1)
outputLayer.attrib['kind'] = 'output'
outputLayer.attrib['size'] = str(len(timearray))
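# Attach the physical time-bin edges to the categorical t0_bin values in the
# DataDictionary, so consumers of the PMML can map class labels back to times.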
for field in root.find(nsprefix + 'DataDictionary'):
    if field.attrib['name'] == 't0_bin':
        # (the inner loop header was elided in the original; iterating over
        # the field's value children is implied by the use of child below)
        for child in field:
            i = int(child.attrib['value'])
            child.attrib['lower'] = '{0:.3f}'.format(bins.loc[i, 'lower'])
            child.attrib['upper'] = '{0:.3f}'.format(bins.loc[i, 'upper'])
            child.attrib['midpoint'] = '{0:.3f}'.format(bins.loc[i, 'midpoint'])
net.write(xml_name, xml_declaration=True, pretty_print=True, encoding='utf-8')

print('Output saved.')

print('Saving fits...')
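# Recover the normalization ranges of amplitude and tau from the bounds table;
# the coder maps the network's 'normed_tau' feature back to physical tau.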
amp_index = bounds[bounds.value == 'amplitude'].index[0]
amp_range = (bounds.loc[amp_index, 'low'], bounds.loc[amp_index, 'high'])
tau_index = bounds[bounds.value == 'tau'].index[0]
tau_range = (bounds.loc[tau_index, 'low'], bounds.loc[tau_index, 'high'])
coder = tau_encoder(amp_range, tau_range)
Trues_test = stockdata[stockdata.test < test_fraction][['t0', 'amplitude', 'tau', 't0_bin', 'abin']]

probs = nntime_fitter.predict_proba(X_test)
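# Turn classifier probabilities into time/amplitude estimates: the time is the
# probability-weighted mean of the bin centers (with its weighted spread), and
# the amplitude comes from a linear least-squares fit of the waveform template
# fw, evaluated at the six sampling times, to the measured samples.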
def fitFromProb(fw, signals, p, tau, timearray):
    t_fit = np.average(timearray, weights=p)
    t_sigma = np.sqrt(np.average((timearray - t_fit)**2, weights=p))
    weights = fw(-t_fit + np.linspace(-dt, 4 * dt, 6, endpoint=True), tau=tau)
    weights[signals.values == 0.0] = 0.0
    norm = 1.0 / np.inner(weights, weights)
    a_fit = np.inner(signals, weights) * norm
    a_sigma = np.sqrt(norm)
    residuals = signals - a_fit * weights
    ndf = np.sum(np.ones_like(signals[signals > 0])) - 2
    chi2_ndf = np.inner(residuals, residuals) / ndf
    # (the return statement was elided in the original; a Series holding the
    # computed quantities is assumed, since the per-row results are collected
    # into the fits DataFrame below, and the column names are assumptions)
    return pd.Series({'t_fit': t_fit, 't_sigma': t_sigma, 'a_fit': a_fit,
                      'a_sigma': a_sigma, 'chi2_ndf': chi2_ndf})
probdf = pd.DataFrame(probs)
probdf.index = X_test.index
probdf.to_pickle('SVDTime_TrainingProbs_{0}.pkl'.format(args.n_samples))
# The enclosing apply call and two of the arguments were elided in the
# original excerpt; X_test.apply, the betaprime_wave template, and the
# per-row probabilities taken from probdf are assumptions reconstructed
# from context and from the signature of fitFromProb.
fits = X_test.apply(
    lambda row: fitFromProb(
        betaprime_wave,
        row[['s' + str(i) for i in range(1, 7)]],
        probdf.loc[row.name],
        coder.decode(row['normed_tau']),
        timearray),
    axis=1)
fits['t_true'] = Trues_test['t0']
fits['tau'] = Trues_test['tau']
fits['a_true'] = Trues_test['amplitude']
fits['t_bin'] = Trues_test['t0_bin']
fits['a_bin'] = Trues_test['abin']

fits.to_pickle('SVDTime_TrainingFits_{0}.pkl'.format(args.n_samples))
print('Writing classifier...')

with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)
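# Also dump weights and intercepts as plain text for quick inspection: for
# each layer, one line per output unit listing its incoming weights, then a
# line with the intercepts.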
with open('classifier.txt', 'w') as cdump:
    cdump.write("Classifier coefficients:\n")
    for iLayer in range(len(classifier.coefs_)):
        cdump.write('Layer: {0}\n'.format(iLayer))
        nrows = classifier.coefs_[iLayer].shape[0]
        ncols = classifier.coefs_[iLayer].shape[1]
        cdump.write('Weights:\n')
        for col in range(ncols):
            s = " ".join([str(classifier.coefs_[iLayer][row, col]) for row in range(nrows)])
            cdump.write(s + '\n')  # (write elided in the original; s is otherwise unused)
        cdump.write('Intercepts:\n')
        s = " ".join([str(classifier.intercepts_[iLayer][col]) for col in range(ncols)])
        cdump.write(s + '\n')  # (write elided in the original)
print("Learning phase completed.")