157 {
158
159 Weightfile weightfile;
160 std::string custom_weightfile = weightfile.generateFileName();
161 std::string custom_steeringfile = weightfile.generateFileName();
162
163 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
164 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
165 uint64_t numberOfEvents = training_data.getNumberOfEvents();
166
167 if (m_specific_options.m_training_fraction <= 0.0 or m_specific_options.m_training_fraction > 1.0) {
168 B2ERROR("Please provide a positive training fraction");
169 throw std::runtime_error("Please provide a training fraction between (0.0,1.0]");
170 }
171
172 auto numberOfTrainingEvents = static_cast<uint64_t>(numberOfEvents * 100 * m_specific_options.m_training_fraction);
173 numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
174 auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;
175
176 uint64_t batch_size = m_specific_options.m_mini_batch_size;
177 if (batch_size == 0) {
178 batch_size = numberOfTrainingEvents;
179 }
180
181 if (batch_size > numberOfTrainingEvents) {
182 B2WARNING("Mini batch size (" << batch_size << ") is larger than the number of training events (" << numberOfTrainingEvents << ")"\
183 " The batch size has been set equal to the number of training events.");
184 batch_size = numberOfTrainingEvents;
185 };
186
187 auto X = std::unique_ptr<float[]>(new float[batch_size * numberOfFeatures]);
188 auto S = std::unique_ptr<float[]>(new float[batch_size * numberOfSpectators]);
189 auto y = std::unique_ptr<float[]>(new float[batch_size]);
190 auto w = std::unique_ptr<float[]>(new float[batch_size]);
191 npy_intp dimensions_X[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfFeatures)};
192 npy_intp dimensions_S[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfSpectators)};
193 npy_intp dimensions_y[2] = {static_cast<npy_intp>(batch_size), 1};
194 npy_intp dimensions_w[2] = {static_cast<npy_intp>(batch_size), 1};
195
196 auto X_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfFeatures]);
197 auto S_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfSpectators]);
198 auto y_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
199 auto w_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
200 npy_intp dimensions_X_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfFeatures)};
201 npy_intp dimensions_S_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfSpectators)};
202 npy_intp dimensions_y_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
203 npy_intp dimensions_w_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
204
205 std::string steering_file_source_code;
206 if (m_specific_options.m_steering_file != "") {
207 std::string filename = FileSystem::findFile(m_specific_options.m_steering_file);
208 std::ifstream steering_file(filename);
209 if (not steering_file) {
210 throw std::runtime_error(std::string("Couldn't open file ") + filename);
211 }
212 steering_file.seekg(0, std::ios::end);
213 steering_file_source_code.resize(steering_file.tellg());
214 steering_file.seekg(0, std::ios::beg);
215 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
216 }
217
218 std::vector<float> means(numberOfFeatures, 0.0);
219 std::vector<float> stds(numberOfFeatures, 0.0);
220
221 if (m_specific_options.m_normalize) {
222
223
224 auto weights = training_data.getWeights();
225 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
226 double wSum = 0.0;
227 double mean = 0.0;
228 double running_std = 0.0;
229 auto feature = training_data.getFeature(iFeature);
230 for (uint64_t i = 0; i < weights.size(); ++i) {
231 wSum += weights[i];
232 double meanOld = mean;
233 mean += (weights[i] / wSum) * (feature[i] - meanOld);
234 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
235 }
236 means[iFeature] = mean;
237 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
238 }
239 }
240
241 try {
242
243 auto json = boost::python::import("json");
244 auto builtins = boost::python::import("builtins");
245 auto inspect = boost::python::import("inspect");
246
247
248 auto framework = boost::python::import((std::string("basf2_mva_python_interface.") + m_specific_options.m_framework).c_str());
249
250 builtins.attr("exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr("__dict__")));
251
252
253 auto parameters = json.attr("loads")(m_specific_options.m_config.c_str());
254 auto model = framework.attr("get_model")(numberOfFeatures, numberOfSpectators,
255 numberOfEvents, m_specific_options.m_training_fraction, parameters);
256
257
258 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
259 training_data.loadEvent(iEvent);
260 if (m_specific_options.m_normalize) {
261 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
262 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
263 } else {
264 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
265 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
266 }
267 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
268 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
269 y_v[iEvent] = training_data.m_target;
270 w_v[iEvent] = training_data.m_weight;
271 }
272
273 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
274 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
275 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
276 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
277
278 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
279
280 auto state = framework.attr("begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
281
282 bool continue_loop = true;
283
284 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
285 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
286
287 for (uint64_t iIteration = 0; (iIteration < m_specific_options.m_nIterations or m_specific_options.m_nIterations == 0)
288 and continue_loop; ++iIteration) {
289
290
291 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector), TRandomWrapper());
292
293 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
294
295
296
297 PyThreadState* m_thread_state = PyEval_SaveThread();
298 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
299 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
300 if (m_specific_options.m_normalize) {
301 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
302 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
303 } else {
304 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
305 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
306 }
307 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
308 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
309 y[iEvent] = training_data.m_target;
310 w[iEvent] = training_data.m_weight;
311 }
312
313
314 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
315 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
316 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
317 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
318
319
320 PyEval_RestoreThread(m_thread_state);
321 auto r = framework.attr("partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
322 ndarray_w, iIteration, iBatch);
323 boost::python::extract<bool> proxy(r);
324 if (proxy.check())
325 continue_loop = static_cast<bool>(proxy);
326 }
327 }
328
329 auto result = framework.attr("end_fit")(state);
330
331 auto pickle = boost::python::import("pickle");
332 auto file = builtins.attr("open")(custom_weightfile.c_str(), "wb");
333 pickle.attr("dump")(result, file);
334
335 auto steeringfile = builtins.attr("open")(custom_steeringfile.c_str(), "wb");
336 pickle.attr("dump")(steering_file_source_code.c_str(), steeringfile);
337
338 auto importances = framework.attr("feature_importance")(state);
339 if (len(importances) == 0) {
340 B2INFO("Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
341 } else if (numberOfFeatures != static_cast<uint64_t>(len(importances))) {
342 B2WARNING("Python method didn't return the correct number of importance value. I ignore the importances");
343 } else {
344 std::map<std::string, float> feature_importances;
345 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
346 boost::python::extract<float> proxy(importances[iFeature]);
347 if (proxy.check()) {
348 feature_importances[m_general_options.m_variables[iFeature]] = static_cast<float>(proxy);
349 } else {
350 B2WARNING("Failed to convert importance output of the method to a float, using 0 instead");
351 feature_importances[m_general_options.m_variables[iFeature]] = 0.0;
352 }
353 }
354 weightfile.addFeatureImportance(feature_importances);
355 }
356
357 } catch (...) {
358 PyErr_Print();
359 PyErr_Clear();
360 B2ERROR("Failed calling train in PythonTeacher");
361 throw std::runtime_error(std::string("Failed calling train in PythonTeacher"));
362 }
363
364 weightfile.addOptions(m_general_options);
365 weightfile.addOptions(m_specific_options);
366 weightfile.addFile("Python_Weightfile", custom_weightfile);
367 weightfile.addFile("Python_Steeringfile", custom_steeringfile);
368 weightfile.addSignalFraction(training_data.getSignalFraction());
369 if (m_specific_options.m_normalize) {
370 weightfile.addVector("Python_Means", means);
371 weightfile.addVector("Python_Stds", stds);
372 }
373
374 return weightfile;
375
376 }