9#include <mva/methods/Python.h>
11#include <numpy/npy_common.h>
12#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
13#include <numpy/arrayobject.h>
15#include <framework/logging/Logger.h>
16#include <framework/utilities/FileSystem.h>
17#include <framework/utilities/TRandomWrapper.h>
31 int version = pt.get<
int>(
"Python_version");
32 if (version < 1 or version > 2) {
33 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
34 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
36 m_framework = pt.get<std::string>(
"Python_framework");
40 m_config = pt.get<std::string>(
"Python_config");
52 pt.put(
"Python_version", 2);
64 po::options_description description(
"Python options");
65 description.add_options()
67 "Framework which should be used. Currently supported are sklearn, xgboost, tensorflow, keras, torch, and theano")
68 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes the model")
69 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
70 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
71 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
73 "Training fraction used to split up dataset in training and validation sample.")
74 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
101 if (not Py_IsInitialized()) {
104 wchar_t** bla =
nullptr;
105 PySys_SetArgvEx(0, bla, 0);
109 if (PyArray_API ==
nullptr) {
120 if (Py_IsInitialized()) {
153 m_specific_options(specific_options)
166 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
167 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
168 uint64_t numberOfEvents = training_data.getNumberOfEvents();
171 B2ERROR(
"Please provide a positive training fraction");
172 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
176 numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
177 auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;
180 if (batch_size == 0) {
181 batch_size = numberOfTrainingEvents;
184 if (batch_size > numberOfTrainingEvents) {
185 B2WARNING(
"Mini batch size (" << batch_size <<
") is larger than the number of training events (" << numberOfTrainingEvents <<
")"\
186 " The batch size has been set equal to the number of training events.");
187 batch_size = numberOfTrainingEvents;
190 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
191 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
192 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
193 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
194 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
195 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
196 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
197 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
199 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
200 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
201 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
202 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
203 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
204 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
205 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
206 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
208 std::string steering_file_source_code;
211 std::ifstream steering_file(filename);
212 if (not steering_file) {
213 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
215 steering_file.seekg(0, std::ios::end);
216 steering_file_source_code.resize(steering_file.tellg());
217 steering_file.seekg(0, std::ios::beg);
218 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
221 std::vector<float> means(numberOfFeatures, 0.0);
222 std::vector<float> stds(numberOfFeatures, 0.0);
227 auto weights = training_data.getWeights();
228 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
231 double running_std = 0.0;
232 auto feature = training_data.getFeature(iFeature);
233 for (uint64_t i = 0; i < weights.size(); ++i) {
235 double meanOld = mean;
236 mean += (weights[i] / wSum) * (feature[i] - meanOld);
237 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
239 means[iFeature] = mean;
240 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
246 auto json = boost::python::import(
"json");
247 auto builtins = boost::python::import(
"builtins");
248 auto inspect = boost::python::import(
"inspect");
253 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
257 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
261 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
262 training_data.loadEvent(iEvent);
264 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
265 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
267 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
268 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
270 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
271 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
272 y_v[iEvent] = training_data.m_target;
273 w_v[iEvent] = training_data.m_weight;
276 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
277 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
278 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
279 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
281 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
283 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
285 bool continue_loop =
true;
287 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
288 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
291 and continue_loop; ++iIteration) {
294 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector),
TRandomWrapper());
296 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
300 PyThreadState* m_thread_state = PyEval_SaveThread();
301 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
302 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
304 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
305 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
307 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
308 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
310 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
311 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
312 y[iEvent] = training_data.m_target;
313 w[iEvent] = training_data.m_weight;
317 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
318 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
319 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
320 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
323 PyEval_RestoreThread(m_thread_state);
324 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
325 ndarray_w, iIteration, iBatch);
326 boost::python::extract<bool> proxy(r);
328 continue_loop =
static_cast<bool>(proxy);
332 auto result = framework.attr(
"end_fit")(state);
334 auto pickle = boost::python::import(
"pickle");
335 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
336 pickle.attr(
"dump")(result, file);
338 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
339 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
341 auto importances = framework.attr(
"feature_importance")(state);
342 if (len(importances) == 0) {
343 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
344 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
345 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
347 std::map<std::string, float> feature_importances;
348 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
349 boost::python::extract<float> proxy(importances[iFeature]);
353 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
363 B2ERROR(
"Failed calling train in PythonTeacher");
364 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
369 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
370 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
373 weightfile.
addVector(
"Python_Means", means);
374 weightfile.
addVector(
"Python_Stds", stds);
391 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
401 auto pickle = boost::python::import(
"pickle");
402 auto builtins = boost::python::import(
"builtins");
407 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
408 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
409 auto source_code = pickle.attr(
"load")(steeringfile);
410 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
413 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
414 auto unpickled_fit_object = pickle.attr(
"load")(file);
419 B2ERROR(
"Failed calling load in PythonExpert");
420 throw std::runtime_error(
"Failed calling load in PythonExpert");
428 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
429 uint64_t numberOfEvents = test_data.getNumberOfEvents();
431 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
432 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
434 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
435 test_data.loadEvent(iEvent);
437 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
438 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
440 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
441 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
445 std::vector<float> probabilities(test_data.getNumberOfEvents(), std::numeric_limits<float>::quiet_NaN());
448 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
450 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
453 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
459 B2ERROR(
"Failed calling applying PythonExpert");
460 throw std::runtime_error(
"Failed calling applying PythonExpert");
463 return probabilities;
469 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
470 uint64_t numberOfEvents = test_data.getNumberOfEvents();
472 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
473 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
475 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
476 test_data.loadEvent(iEvent);
478 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
479 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
481 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
482 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
487 std::vector<std::vector<float>> probabilities(test_data.getNumberOfEvents(), std::vector<float>(nClasses,
488 std::numeric_limits<float>::quiet_NaN()));
491 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
493 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
496 for (uint64_t iClass = 0; iClass < nClasses; ++iClass) {
497 probabilities[iEvent][iClass] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR2(
reinterpret_cast<PyArrayObject*
>
505 B2ERROR(
"Failed calling applying PythonExpert");
506 throw std::runtime_error(
"Failed calling applying PythonExpert");
509 return probabilities;
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if...
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
unsigned int m_nClasses
Number of classes in a classification problem.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
current state object of method
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
virtual std::vector< std::vector< float > > applyMulticlass(Dataset &test_data) const override
Apply this expert onto a dataset for multiclass problem.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which indicates if this class initialized Python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
steering file provided by the user to override the functions in the framework
std::string m_framework
framework to use e.g.
std::string m_config
Config string in json, which is passed to the get model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers. Each MVA library has its own implementation of this class,...
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Abstract base class for different kinds of events.
Wrap TRandom to be usable as a uniform random number generator with STL algorithms like std::shuffle...