9 #include <mva/methods/Python.h>
11 #include <boost/filesystem/convenience.hpp>
12 #include <numpy/npy_common.h>
13 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
14 #include <numpy/arrayobject.h>
16 #include <framework/logging/Logger.h>
17 #include <framework/utilities/FileSystem.h>
18 #include <framework/utilities/TRandomWrapper.h>
32 int version = pt.get<
int>(
"Python_version");
33 if (version < 1 or version > 2) {
34 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
35 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
37 m_framework = pt.get<std::string>(
"Python_framework");
41 m_config = pt.get<std::string>(
"Python_config");
53 pt.put(
"Python_version", 2);
65 po::options_description description(
"Python options");
66 description.add_options()
68 "Framework which should be used. Currently supported are sklearn, tensorflow and theano")
69 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes")
70 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
71 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
72 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
74 "Training fraction used to split up dataset in training and validation sample.")
75 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
102 if (not Py_IsInitialized()) {
105 wchar_t** bla =
nullptr;
106 PySys_SetArgvEx(0, bla, 0);
110 if (PyArray_API ==
nullptr) {
121 if (Py_IsInitialized()) {
154 m_specific_options(specific_options)
167 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
168 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
169 uint64_t numberOfEvents = training_data.getNumberOfEvents();
172 B2ERROR(
"Please provide a positive training fraction");
173 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
177 numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
178 auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;
181 if (batch_size == 0) {
182 batch_size = numberOfTrainingEvents;
185 if (batch_size > numberOfTrainingEvents) {
186 B2WARNING(
"Mini batch size (" << batch_size <<
") is larger than the number of training events (" << numberOfTrainingEvents <<
")"\
187 " The batch size has been set equal to the number of training events.");
188 batch_size = numberOfTrainingEvents;
191 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
192 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
193 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
194 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
195 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
196 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
197 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
198 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
200 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
201 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
202 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
203 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
204 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
205 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
206 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
207 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
209 std::string steering_file_source_code;
212 std::ifstream steering_file(filename);
213 if (not steering_file) {
214 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
216 steering_file.seekg(0, std::ios::end);
217 steering_file_source_code.resize(steering_file.tellg());
218 steering_file.seekg(0, std::ios::beg);
219 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
222 std::vector<float> means(numberOfFeatures, 0.0);
223 std::vector<float> stds(numberOfFeatures, 0.0);
228 auto weights = training_data.getWeights();
229 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
232 double running_std = 0.0;
233 auto feature = training_data.getFeature(iFeature);
234 for (uint64_t i = 0; i < weights.size(); ++i) {
236 double meanOld = mean;
237 mean += (weights[i] / wSum) * (feature[i] - meanOld);
238 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
240 means[iFeature] = mean;
241 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
247 auto json = boost::python::import(
"json");
248 auto builtins = boost::python::import(
"builtins");
249 auto inspect = boost::python::import(
"inspect");
254 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
258 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
262 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
263 training_data.loadEvent(iEvent);
265 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
266 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
268 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
269 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
271 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
272 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
273 y_v[iEvent] = training_data.m_target;
274 w_v[iEvent] = training_data.m_weight;
277 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
278 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
279 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
280 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
282 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
284 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
286 bool continue_loop =
true;
288 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
289 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
292 and continue_loop; ++iIteration) {
295 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector),
TRandomWrapper());
297 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
301 PyThreadState* m_thread_state = PyEval_SaveThread();
302 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
303 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
305 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
306 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
308 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
309 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
311 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
312 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
313 y[iEvent] = training_data.m_target;
314 w[iEvent] = training_data.m_weight;
318 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
319 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
320 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
321 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
324 PyEval_RestoreThread(m_thread_state);
325 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
326 ndarray_w, iIteration, iBatch);
327 boost::python::extract<bool> proxy(r);
329 continue_loop =
static_cast<bool>(proxy);
333 auto result = framework.attr(
"end_fit")(state);
335 auto pickle = boost::python::import(
"pickle");
336 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
337 pickle.attr(
"dump")(result, file);
339 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
340 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
342 auto importances = framework.attr(
"feature_importance")(state);
343 if (len(importances) == 0) {
344 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
345 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
346 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
348 std::map<std::string, float> feature_importances;
349 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
350 boost::python::extract<float> proxy(importances[iFeature]);
354 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
364 B2ERROR(
"Failed calling train in PythonTeacher");
365 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
370 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
371 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
374 weightfile.
addVector(
"Python_Means", means);
375 weightfile.
addVector(
"Python_Stds", stds);
392 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
402 auto pickle = boost::python::import(
"pickle");
403 auto builtins = boost::python::import(
"builtins");
408 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
409 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
410 auto source_code = pickle.attr(
"load")(steeringfile);
411 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
414 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
415 auto unpickled_fit_object = pickle.attr(
"load")(file);
420 B2ERROR(
"Failed calling load in PythonExpert");
421 throw std::runtime_error(
"Failed calling load in PythonExpert");
429 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
430 uint64_t numberOfEvents = test_data.getNumberOfEvents();
432 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
433 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
435 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
436 test_data.loadEvent(iEvent);
438 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
439 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
441 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
442 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
446 std::vector<float> probabilities(test_data.getNumberOfEvents(), std::numeric_limits<float>::quiet_NaN());
449 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
451 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
454 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
460 B2ERROR(
"Failed calling applying PythonExpert");
461 throw std::runtime_error(
"Failed calling applying PythonExpert");
464 return probabilities;
470 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
471 uint64_t numberOfEvents = test_data.getNumberOfEvents();
473 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
474 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
476 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
477 test_data.loadEvent(iEvent);
479 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
480 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
482 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
483 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
488 std::vector<std::vector<float>> probabilities(test_data.getNumberOfEvents(), std::vector<float>(nClasses,
489 std::numeric_limits<float>::quiet_NaN()));
492 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
494 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
497 for (uint64_t iClass = 0; iClass < nClasses; ++iClass) {
498 probabilities[iEvent][iClass] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR2(
reinterpret_cast<PyArrayObject*
>
506 B2ERROR(
"Failed calling applying PythonExpert");
507 throw std::runtime_error(
"Failed calling applying PythonExpert");
510 return probabilities;
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if...
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
unsigned int m_nClasses
Number of classes in a classification problem.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
current state object of method
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
virtual std::vector< std::vector< float > > applyMulticlass(Dataset &test_data) const override
Apply this expert onto a dataset for multiclass problem.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which indicates whether this class initialized Python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
steering file provided by the user to override the functions in the framework
std::string m_framework
Framework to use, e.g. sklearn, tensorflow or theano.
std::string m_config
Config string in JSON, which is passed to the get_model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from an XML tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in an XML tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers. Each MVA library has its own implementation of this class,...
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Abstract base class for different kinds of events.
Wrap TRandom to be useable as a uniform random number generator with STL algorithms like std::shuffle...