Belle II Software development
PythonTeacher Class Reference

Teacher for the Python MVA method. More...

#include <Python.h>

Inheritance diagram for PythonTeacher:
Teacher

Public Member Functions

 PythonTeacher (const GeneralOptions &general_options, const PythonOptions &specific_options)
 Constructs a new teacher using the GeneralOptions and specific options of this training.
 
virtual Weightfile train (Dataset &training_data) const override
 Train a mva method using the given dataset returning a Weightfile.
 

Protected Attributes

GeneralOptions m_general_options
 GeneralOptions containing all shared options.
 

Private Attributes

PythonOptions m_specific_options
 Method specific options.
 

Detailed Description

Teacher for the Python MVA method.

Definition at line 90 of file Python.h.

Constructor & Destructor Documentation

◆ PythonTeacher()

PythonTeacher ( const GeneralOptions & general_options,
const PythonOptions & specific_options )

Constructs a new teacher using the GeneralOptions and specific options of this training.

Parameters
general_options defining all shared options
specific_options defining all method specific options

Definition at line 166 of file Python.cc.

167 : Teacher(general_options),
168 m_specific_options(specific_options)
169 {
170 PythonInitializerSingleton::GetInstance();
171 }

Member Function Documentation

◆ train()

Weightfile train ( Dataset & training_data) const
overridevirtual

Train a mva method using the given dataset returning a Weightfile.

Parameters
training_data used to train the method

Implements Teacher.

Definition at line 173 of file Python.cc.

174 {
175
176 Weightfile weightfile;
177 std::string custom_weightfile = weightfile.generateFileName();
178 std::string custom_steeringfile = weightfile.generateFileName();
179
180 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
181 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
182 uint64_t numberOfEvents = training_data.getNumberOfEvents();
183
184 if (m_specific_options.m_training_fraction <= 0.0 or m_specific_options.m_training_fraction > 1.0) {
185 B2ERROR("Please provide a positive training fraction");
186 throw std::runtime_error("Please provide a training fraction between (0.0,1.0]");
187 }
188
189 auto numberOfTrainingEvents = static_cast<uint64_t>(numberOfEvents * 100 * m_specific_options.m_training_fraction);
190 numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
191 auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;
192
193 uint64_t batch_size = m_specific_options.m_mini_batch_size;
194 if (batch_size == 0) {
195 batch_size = numberOfTrainingEvents;
196 }
197
198 if (batch_size > numberOfTrainingEvents) {
199 B2WARNING("Mini batch size (" << batch_size << ") is larger than the number of training events (" << numberOfTrainingEvents << ")"\
200 " The batch size has been set equal to the number of training events.");
201 batch_size = numberOfTrainingEvents;
202 };
203
204 auto X = std::unique_ptr<float[]>(new float[batch_size * numberOfFeatures]);
205 auto S = std::unique_ptr<float[]>(new float[batch_size * numberOfSpectators]);
206 auto y = std::unique_ptr<float[]>(new float[batch_size]);
207 auto w = std::unique_ptr<float[]>(new float[batch_size]);
208 npy_intp dimensions_X[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfFeatures)};
209 npy_intp dimensions_S[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfSpectators)};
210 npy_intp dimensions_y[2] = {static_cast<npy_intp>(batch_size), 1};
211 npy_intp dimensions_w[2] = {static_cast<npy_intp>(batch_size), 1};
212
213 auto X_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfFeatures]);
214 auto S_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfSpectators]);
215 auto y_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
216 auto w_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
217 npy_intp dimensions_X_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfFeatures)};
218 npy_intp dimensions_S_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfSpectators)};
219 npy_intp dimensions_y_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
220 npy_intp dimensions_w_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
221
222 std::string steering_file_source_code;
223 if (m_specific_options.m_steering_file != "") {
224 steering_file_source_code = loadPythonFileAsString(m_specific_options.m_steering_file);
225 }
226
227 std::vector<float> means(numberOfFeatures, 0.0);
228 std::vector<float> stds(numberOfFeatures, 0.0);
229
230 if (m_specific_options.m_normalize) {
231 // Stable calculation of mean and variance with weights
232 // see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
233 auto weights = training_data.getWeights();
234 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
235 double wSum = 0.0;
236 double mean = 0.0;
237 double running_std = 0.0;
238 auto feature = training_data.getFeature(iFeature);
239 for (uint64_t i = 0; i < weights.size(); ++i) {
240 wSum += weights[i];
241 double meanOld = mean;
242 mean += (weights[i] / wSum) * (feature[i] - meanOld);
243 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
244 }
245 means[iFeature] = mean;
246 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
247 }
248 }
249
250 try {
251 // Load python modules
252 auto json = boost::python::import("json");
253 auto builtins = boost::python::import("builtins");
254 auto inspect = boost::python::import("inspect");
255
256 // Create a new empty module with a unique name.
257 // This way we dont end up with multiple mvas trying to overwrite the same apply method with the last one being used by all.
258 boost::python::object type = boost::python::import("types");
259
260 // Generate a unique module
261 boost::uuids::random_generator uuid_gen;
262 std::string unique_mva_module_name = "unique_module_name" + boost::uuids::to_string(uuid_gen());
263 boost::python::object unique_mva_module = type.attr("ModuleType")(unique_mva_module_name.c_str());
264
265 // Find the framework file. Then execute it in the scope of the new module
266 auto framework = boost::python::import((std::string("basf2_mva_python_interface.") + m_specific_options.m_framework).c_str());
267 auto framework_file = framework.attr("__file__");
268 auto framework_file_source_code = loadPythonFileAsString(boost::python::extract<std::string>(boost::python::object(
269 framework_file)));
270 builtins.attr("exec")(framework_file_source_code.c_str(), boost::python::object(unique_mva_module.attr("__dict__")));
271 // Overwrite framework with user-defined code from the steering file
272 builtins.attr("exec")(steering_file_source_code.c_str(), boost::python::object(unique_mva_module.attr("__dict__")));
273
274 // Call get_model with the parameters provided by the user
275 auto parameters = json.attr("loads")(m_specific_options.m_config.c_str());
276 auto model = unique_mva_module.attr("get_model")(numberOfFeatures, numberOfSpectators,
277 numberOfEvents, m_specific_options.m_training_fraction, parameters);
278
279 // Call begin_fit with validation sample
280 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
281 training_data.loadEvent(iEvent);
282 if (m_specific_options.m_normalize) {
283 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
284 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
285 } else {
286 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
287 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
288 }
289 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
290 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
291 y_v[iEvent] = training_data.m_target;
292 w_v[iEvent] = training_data.m_weight;
293 }
294
295 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
296 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
297 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
298 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
299
300 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
301
302 auto state = unique_mva_module.attr("begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
303
304 bool continue_loop = true;
305
306 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
307 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
308
309 for (uint64_t iIteration = 0; (iIteration < m_specific_options.m_nIterations or m_specific_options.m_nIterations == 0)
310 and continue_loop; ++iIteration) {
311
312 // shuffle the indices on each iteration to get randomised batches
313 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector), TRandomWrapper());
314
315 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
316
317 // Release Global Interpreter Lock in python to allow multithreading while reading root files
318 // also see: https://docs.python.org/3.5/c-api/init.html
319 PyThreadState* m_thread_state = PyEval_SaveThread();
320 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
321 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
322 if (m_specific_options.m_normalize) {
323 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
324 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
325 } else {
326 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
327 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
328 }
329 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
330 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
331 y[iEvent] = training_data.m_target;
332 w[iEvent] = training_data.m_weight;
333 }
334
335 // Maybe slow, create ndarrays outside of loop?
336 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
337 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
338 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
339 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
340
341 // Reactivate Global Interpreter Lock to safely execute python code
342 PyEval_RestoreThread(m_thread_state);
343 auto r = unique_mva_module.attr("partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
344 ndarray_w, iIteration, iBatch);
345 boost::python::extract<bool> proxy(r);
346 if (proxy.check())
347 continue_loop = static_cast<bool>(proxy);
348 }
349 }
350
351 auto result = unique_mva_module.attr("end_fit")(state);
352
353 auto pickle = boost::python::import("pickle");
354 auto file = builtins.attr("open")(custom_weightfile.c_str(), "wb");
355 pickle.attr("dump")(result, file);
356
357 auto steeringfile = builtins.attr("open")(custom_steeringfile.c_str(), "wb");
358 pickle.attr("dump")(steering_file_source_code.c_str(), steeringfile);
359
360 auto importances = unique_mva_module.attr("feature_importance")(state);
361 if (len(importances) == 0) {
362 B2INFO("Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
363 } else if (numberOfFeatures != static_cast<uint64_t>(len(importances))) {
364 B2WARNING("Python method didn't return the correct number of importance value. I ignore the importances");
365 } else {
366 std::map<std::string, float> feature_importances;
367 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
368 boost::python::extract<float> proxy(importances[iFeature]);
369 if (proxy.check()) {
370 feature_importances[m_general_options.m_variables[iFeature]] = static_cast<float>(proxy);
371 } else {
372 B2WARNING("Failed to convert importance output of the method to a float, using 0 instead");
373 feature_importances[m_general_options.m_variables[iFeature]] = 0.0;
374 }
375 }
376 weightfile.addFeatureImportance(feature_importances);
377 }
378
379 } catch (...) {
380 PyErr_Print();
381 PyErr_Clear();
382 B2ERROR("Failed calling train in PythonTeacher");
383 throw std::runtime_error(std::string("Failed calling train in PythonTeacher"));
384 }
385
386 weightfile.addOptions(m_general_options);
387 weightfile.addOptions(m_specific_options);
388 weightfile.addFile("Python_Weightfile", custom_weightfile);
389 weightfile.addFile("Python_Steeringfile", custom_steeringfile);
390 weightfile.addSignalFraction(training_data.getSignalFraction());
391 if (m_specific_options.m_normalize) {
392 weightfile.addVector("Python_Means", means);
393 weightfile.addVector("Python_Stds", stds);
394 }
395
396 return weightfile;
397
398 }

Member Data Documentation

◆ m_general_options

GeneralOptions m_general_options
protectedinherited

GeneralOptions containing all shared options.

Definition at line 49 of file Teacher.h.

◆ m_specific_options

PythonOptions m_specific_options
private

Method specific options.

Definition at line 107 of file Python.h.


The documentation for this class was generated from the following files: