9#include <mva/methods/Python.h>
11#include <numpy/npy_common.h>
12#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
13#include <numpy/arrayobject.h>
15#include <framework/logging/Logger.h>
16#include <framework/utilities/FileSystem.h>
17#include <framework/utilities/TRandomWrapper.h>
// PythonOptions::load fragment -- deserializes the options from the
// weightfile's property tree.
// NOTE(review): this listing is an extraction artifact; stale source line
// numbers are fused into the text and intermediate lines (e.g. the closing
// brace of the version check) are missing.
// Read the serialized format version of the weightfile.
31 int version = pt.get<
int>(
"Python_version");
// Only versions 1 and 2 are understood; anything else aborts loading.
32 if (version < 1 or version > 2) {
33 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
34 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
// Restore the ML framework name and the JSON-encoded config string.
36 m_framework = pt.get<std::string>(
"Python_framework");
40 m_config = pt.get<std::string>(
"Python_config");
// PythonOptions::save fragment -- always writes the current format
// version (2); load() above accepts 1 and 2 for backward compatibility.
52 pt.put(
"Python_version", 2);
// PythonOptions::getDescription fragment -- builds the boost::program_options
// description exposed on the command line for this MVA method.
// NOTE(review): the option-name line for the "framework" entry and the
// declaration line for "training_fraction" are missing from this listing;
// only their help texts survive.
64 po::options_description description(
"Python options");
65 description.add_options()
67 "Framework which should be used. Currently supported are sklearn, xgboost, tensorflow, keras, torch, and theano")
// User-supplied python file overriding the framework hooks (get_model etc.).
68 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes the model")
// 0 means "pass the whole training sample in a single partial_fit call".
69 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
70 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
// Enables shifting each feature to mean 0 / std 1 before training.
71 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
73 "Training fraction used to split up dataset in training and validation sample.")
// Arbitrary JSON blob forwarded verbatim to the python begin_fit hook.
74 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
// PythonInitializerSingleton fragments (from different member functions of
// the singleton; the enclosing signatures are missing from this listing).
// Constructor side: start the embedded interpreter only if nobody else
// (e.g. a host python process) already did.
101 if (not Py_IsInitialized()) {
// numpy's C array API table must be imported once per process before any
// PyArray_* call; a still-null PyArray_API means that has not happened yet.
106 if (PyArray_API ==
nullptr) {
// Destructor side: only finalize the interpreter if it is (still) up --
// presumably only when this class was the one that initialized it; the
// surrounding lines that check m_initialized_python are missing here.
117 if (Py_IsInitialized()) {
// PythonTeacher constructor member-initializer fragment: stores a copy of
// the method-specific options; the rest of the initializer list and the
// constructor body are missing from this listing.
150 m_specific_options(specific_options)
// PythonTeacher::train fragment -- trains the python model on the given
// dataset and serializes everything into a Weightfile.
// NOTE(review): extraction artifact; the function signature, several
// conditions (training-fraction check, m_normalize branches, try/catch
// lines) and most closing braces are missing from this listing.
// Dimensions of the dataset handed to the teacher.
163 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
164 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
165 uint64_t numberOfEvents = training_data.getNumberOfEvents();
// An invalid training fraction is fatal.  The guarding condition is missing
// here; presumably it checks m_training_fraction is in (0.0, 1.0] -- confirm
// against the full source.
168 B2ERROR(
"Please provide a positive training fraction");
169 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
// Ceiling division by 100: the (x % 100 != 0) term adds 1 on any remainder.
173 numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
// Whatever is not used for training is kept as the validation sample.
174 auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;
// batch_size == 0 is the documented "whole sample in one call" setting.
177 if (batch_size == 0) {
178 batch_size = numberOfTrainingEvents;
// An oversized mini batch is clamped (with a warning) to the sample size.
181 if (batch_size > numberOfTrainingEvents) {
182 B2WARNING(
"Mini batch size (" << batch_size <<
") is larger than the number of training events (" << numberOfTrainingEvents <<
")"\
183 " The batch size has been set equal to the number of training events.");
184 batch_size = numberOfTrainingEvents;
// Flat C buffers for one mini batch: features X, spectators S, target y,
// event weight w.
187 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
188 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
189 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
190 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
// Shapes of the numpy views created later over those buffers (2-D, with
// y and w as (batch_size, 1) columns).
191 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
192 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
193 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
194 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
// Same layout for the (fixed) validation sample.
196 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
197 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
198 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
199 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
200 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
201 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
202 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
203 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
// Slurp the user's steering file into a string (sized via seek-to-end).
205 std::string steering_file_source_code;
208 std::ifstream steering_file(filename);
209 if (not steering_file) {
210 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
212 steering_file.seekg(0, std::ios::end);
213 steering_file_source_code.resize(steering_file.tellg());
214 steering_file.seekg(0, std::ios::beg);
215 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
// Per-feature normalization constants, filled below when normalization is
// enabled (the m_normalize condition line is missing from this listing).
218 std::vector<float> means(numberOfFeatures, 0.0);
219 std::vector<float> stds(numberOfFeatures, 0.0);
// Weighted incremental mean/variance update (West/Welford-style): mean and
// running_std are updated per event, scaled by weight over the running
// weight sum.  NOTE(review): the declarations of wSum, mean and the wSum
// accumulation line are missing from this listing.
224 auto weights = training_data.getWeights();
225 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
228 double running_std = 0.0;
229 auto feature = training_data.getFeature(iFeature);
230 for (uint64_t i = 0; i < weights.size(); ++i) {
232 double meanOld = mean;
233 mean += (weights[i] / wSum) * (feature[i] - meanOld);
234 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
236 means[iFeature] = mean;
// Bessel-like correction using the total weight instead of the event count.
237 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
// Import the python helpers used below.
243 auto json = boost::python::import(
"json");
244 auto builtins = boost::python::import(
"builtins");
245 auto inspect = boost::python::import(
"inspect");
// Execute the steering file inside the framework module's namespace so it
// can override the framework's hook functions (get_model, partial_fit, ...).
250 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
// Ask the (possibly overridden) framework for a fresh model.
254 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
// Fill the validation buffers; the first numberOfValidationEvents events of
// the dataset form the validation sample.
258 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
259 training_data.loadEvent(iEvent);
// Normalized branch (the enclosing if (m_normalize) line is missing here).
261 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
262 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
// Raw (non-normalized) branch.
264 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
265 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
267 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
268 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
269 y_v[iEvent] = training_data.m_target;
270 w_v[iEvent] = training_data.m_weight;
// PyArray_SimpleNewFromData wraps the existing C buffers WITHOUT copying,
// so the unique_ptr buffers above must outlive these ndarray handles.
273 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
274 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
275 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
276 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
// Number of full mini batches per iteration; std::floor is redundant on the
// already-truncating integer division.
278 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
// Hand the model plus the validation sample to the python begin_fit hook;
// the returned state object is threaded through all partial_fit calls.
280 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
// partial_fit can request early termination by returning False.
282 bool continue_loop =
true;
// Index indirection so the training events can be reshuffled per iteration.
284 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
285 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
288 and continue_loop; ++iIteration) {
// Keep the first iteration in dataset order, shuffle afterwards.
291 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector),
TRandomWrapper());
293 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
// Release the GIL while filling the batch from the dataset (pure C++ work),
// so e.g. python worker threads can run in the meantime.
297 PyThreadState* m_thread_state = PyEval_SaveThread();
298 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
// Training events live after the validation sample, hence the offset.
299 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
301 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
302 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
304 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
305 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
307 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
308 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
309 y[iEvent] = training_data.m_target;
310 w[iEvent] = training_data.m_weight;
// Non-copying numpy views over the freshly filled batch buffers.
314 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
315 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
316 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
317 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
// Re-acquire the GIL before calling back into python.
320 PyEval_RestoreThread(m_thread_state);
321 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
322 ndarray_w, iIteration, iBatch);
323 boost::python::extract<bool> proxy(r);
// A False return from partial_fit stops both loops.
325 continue_loop =
static_cast<bool>(proxy);
// Training done: let the framework produce its final result object ...
329 auto result = framework.attr(
"end_fit")(state);
// ... and pickle it, together with the steering source, into temp files.
331 auto pickle = boost::python::import(
"pickle");
332 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
333 pickle.attr(
"dump")(result, file);
335 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
336 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
// Optional per-feature importances from the framework; an empty sequence is
// allowed, a wrong-length one is ignored with a warning.
338 auto importances = framework.attr(
"feature_importance")(state);
339 if (len(importances) == 0) {
340 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
341 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
342 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
344 std::map<std::string, float> feature_importances;
345 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
346 boost::python::extract<float> proxy(importances[iFeature]);
// Unconvertible importance values degrade to 0 rather than aborting.
350 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
// Catch-all error path for any python failure during training (the
// enclosing try/catch lines are missing from this listing).
360 B2ERROR(
"Failed calling train in PythonTeacher");
361 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
// Attach the pickled model, the steering source and the normalization
// constants to the weightfile that is returned to the caller.
366 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
367 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
370 weightfile.
addVector(
"Python_Means", means);
371 weightfile.
addVector(
"Python_Stds", stds);
// PythonExpert::load fragment -- restores the pickled steering source and
// fit result written by PythonTeacher::train.
// NOTE(review): extraction artifact; the function signature, the lines that
// set up m_framework/m_state and the try/catch frame are missing here.
// Extract the pickled model file from the weightfile into a temp file.
388 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
398 auto pickle = boost::python::import(
"pickle");
399 auto builtins = boost::python::import(
"builtins");
// Extract and unpickle the steering source code ...
404 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
405 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
406 auto source_code = pickle.attr(
"load")(steeringfile);
// ... and exec it inside the framework module's namespace, reproducing the
// same hook overrides that were active during training.
407 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
// Unpickle the training result object (handed to the framework's load hook
// on a line missing from this listing).
410 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
411 auto unpickled_fit_object = pickle.attr(
"load")(file);
// Any python failure during restore is fatal.
416 B2ERROR(
"Failed calling load in PythonExpert");
417 throw std::runtime_error(
"Failed calling load in PythonExpert");
// PythonExpert::apply fragment -- runs the loaded model over a dataset and
// returns one float score per event.
// NOTE(review): extraction artifact; the signature, the m_normalize branch
// lines, the python "apply" call producing `result`, and the try/catch frame
// are missing from this listing.
425 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
426 uint64_t numberOfEvents = test_data.getNumberOfEvents();
// One flat buffer holding the whole (events x features) matrix.
428 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
429 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
431 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
432 test_data.loadEvent(iEvent);
// Normalized branch: apply the means/stds stored in the weightfile.
434 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
435 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
// Raw branch.
437 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
438 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
// Pre-fill with NaN so events the model fails to score are identifiable.
442 std::vector<float> probabilities(test_data.getNumberOfEvents(), std::numeric_limits<float>::quiet_NaN());
// Non-copying numpy view over X; X must outlive the python call.
445 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
// Copy the per-event scores back out of the returned numpy array.
447 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
450 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
// Error path for any python failure during application.
456 B2ERROR(
"Failed calling applying PythonExpert");
457 throw std::runtime_error(
"Failed calling applying PythonExpert");
460 return probabilities;
// PythonExpert::applyMulticlass fragment -- same flow as apply(), but the
// model returns one score per class, read back via PyArray_GETPTR2.
// NOTE(review): extraction artifact; the signature, the source of nClasses
// (presumably m_general_options.m_nClasses -- confirm), the m_normalize
// branch lines, the python call producing `result`, and the try/catch frame
// are missing from this listing.
466 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
467 uint64_t numberOfEvents = test_data.getNumberOfEvents();
469 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
470 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
472 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
473 test_data.loadEvent(iEvent);
// Normalized branch: apply the stored means/stds.
475 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
476 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
// Raw branch.
478 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
479 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
// (events x nClasses) output, NaN-initialized like in apply().
484 std::vector<std::vector<float>> probabilities(test_data.getNumberOfEvents(), std::vector<float>(nClasses,
485 std::numeric_limits<float>::quiet_NaN()));
// Non-copying numpy view over X; X must outlive the python call.
488 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
// Read back one score per (event, class) from the 2-D result array.
490 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
493 for (uint64_t iClass = 0; iClass < nClasses; ++iClass) {
494 probabilities[iEvent][iClass] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR2(
reinterpret_cast<PyArrayObject*
>
// Error path for any python failure during application.
502 B2ERROR(
"Failed calling applying PythonExpert");
503 throw std::runtime_error(
"Failed calling applying PythonExpert");
506 return probabilities;
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if...
Abstract base class of all Datasets given to the MVA interface The current event can always be access...
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
unsigned int m_nClasses
Number of classes in a classification problem.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
current state object of method
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
virtual std::vector< std::vector< float > > applyMulticlass(Dataset &test_data) const override
Apply this expert onto a dataset for multiclass problem.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which keeps indicate if this class initialized python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
steering file provided by the user to override the functions in the framework
std::string m_framework
framework to use e.g.
std::string m_config
Config string in json, which is passed to the get model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers Each MVA library has its own implementation of this class,...
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Abstract base class for different kinds of events.
Wrap TRandom to be useable as a uniform random number generator with STL algorithms like std::shuffle...