9 #include <mva/methods/Python.h>
11 #include <boost/filesystem/convenience.hpp>
12 #include <numpy/npy_common.h>
13 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
14 #include <numpy/arrayobject.h>
16 #include <framework/logging/Logger.h>
17 #include <framework/utilities/FileSystem.h>
30 int version = pt.get<
int>(
"Python_version");
31 if (version < 1 or version > 2) {
32 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
33 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
35 m_framework = pt.get<std::string>(
"Python_framework");
39 m_config = pt.get<std::string>(
"Python_config");
51 pt.put(
"Python_version", 2);
63 po::options_description description(
"Python options");
64 description.add_options()
66 "Framework which should be used. Currently supported are sklearn, tensorflow and theano")
67 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes")
68 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
69 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
70 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
72 "Training fraction used to split up dataset in training and validation sample.")
73 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
100 if (not Py_IsInitialized()) {
103 wchar_t** bla =
nullptr;
104 PySys_SetArgvEx(0, bla, 0);
108 if (PyArray_API ==
nullptr) {
119 if (Py_IsInitialized()) {
152 m_specific_options(specific_options)
165 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
166 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
167 uint64_t numberOfEvents = training_data.getNumberOfEvents();
173 if (batch_size == 0) {
174 batch_size = numberOfTrainingEvents;
177 if (batch_size > numberOfTrainingEvents) {
178 B2WARNING(
"Mini batch size (" << batch_size <<
") is larger than the number of training events (" << numberOfTrainingEvents <<
")"\
179 " The batch size has been set equal to the number of training events.");
180 batch_size = numberOfTrainingEvents;
184 B2ERROR(
"Please provide a positive training fraction");
185 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
188 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
189 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
190 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
191 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
192 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
193 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
194 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
195 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
197 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
198 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
199 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
200 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
201 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
202 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
203 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
204 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
206 std::string steering_file_source_code;
209 std::ifstream steering_file(filename);
210 if (not steering_file) {
211 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
213 steering_file.seekg(0, std::ios::end);
214 steering_file_source_code.resize(steering_file.tellg());
215 steering_file.seekg(0, std::ios::beg);
216 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
219 std::vector<float> means(numberOfFeatures, 0.0);
220 std::vector<float> stds(numberOfFeatures, 0.0);
225 auto weights = training_data.getWeights();
226 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
229 double running_std = 0.0;
230 auto feature = training_data.getFeature(iFeature);
231 for (uint64_t i = 0; i < weights.size(); ++i) {
233 double meanOld = mean;
234 mean += (weights[i] / wSum) * (feature[i] - meanOld);
235 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
237 means[iFeature] = mean;
238 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
244 auto json = boost::python::import(
"json");
245 auto builtins = boost::python::import(
"builtins");
246 auto inspect = boost::python::import(
"inspect");
251 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
255 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
259 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
260 training_data.loadEvent(iEvent);
262 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
263 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
265 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
266 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
268 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
269 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
270 y_v[iEvent] = training_data.m_target;
271 w_v[iEvent] = training_data.m_weight;
274 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
275 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
276 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
277 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
279 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
281 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
283 bool continue_loop =
true;
285 std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
286 std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
289 and continue_loop; ++iIteration) {
292 if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector),
TRandomWrapper());
294 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
298 PyThreadState* m_thread_state = PyEval_SaveThread();
299 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
300 training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
302 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
303 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
305 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
306 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
308 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
309 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
310 y[iEvent] = training_data.m_target;
311 w[iEvent] = training_data.m_weight;
315 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
316 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
317 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
318 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
321 PyEval_RestoreThread(m_thread_state);
322 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
323 ndarray_w, iIteration, iBatch);
324 boost::python::extract<bool> proxy(r);
326 continue_loop =
static_cast<bool>(proxy);
330 auto result = framework.attr(
"end_fit")(state);
332 auto pickle = boost::python::import(
"pickle");
333 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
334 pickle.attr(
"dump")(result, file);
336 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
337 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
339 auto importances = framework.attr(
"feature_importance")(state);
340 if (len(importances) == 0) {
341 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
342 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
343 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
345 std::map<std::string, float> feature_importances;
346 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
347 boost::python::extract<float> proxy(importances[iFeature]);
351 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
361 B2ERROR(
"Failed calling train in PythonTeacher");
362 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
367 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
368 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
371 weightfile.
addVector(
"Python_Means", means);
372 weightfile.
addVector(
"Python_Stds", stds);
389 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
399 auto pickle = boost::python::import(
"pickle");
400 auto builtins = boost::python::import(
"builtins");
405 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
406 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
407 auto source_code = pickle.attr(
"load")(steeringfile);
408 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
411 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
412 auto unpickled_fit_object = pickle.attr(
"load")(file);
417 B2ERROR(
"Failed calling load in PythonExpert");
418 throw std::runtime_error(
"Failed calling load in PythonExpert");
426 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
427 uint64_t numberOfEvents = test_data.getNumberOfEvents();
429 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
430 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
432 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
433 test_data.loadEvent(iEvent);
435 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
436 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
438 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
439 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
443 std::vector<float> probabilities(test_data.getNumberOfEvents(), std::numeric_limits<float>::quiet_NaN());
446 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
448 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
451 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
457 B2ERROR(
"Failed calling applying PythonExpert");
458 throw std::runtime_error(
"Failed calling applying PythonExpert");
461 return probabilities;
467 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
468 uint64_t numberOfEvents = test_data.getNumberOfEvents();
470 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
471 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
473 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
474 test_data.loadEvent(iEvent);
476 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
477 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
479 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
480 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
485 std::vector<std::vector<float>> probabilities(test_data.getNumberOfEvents(), std::vector<float>(nClasses,
486 std::numeric_limits<float>::quiet_NaN()));
489 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
491 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
494 for (uint64_t iClass = 0; iClass < nClasses; ++iClass) {
495 probabilities[iEvent][iClass] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR2(
reinterpret_cast<PyArrayObject*
>
503 B2ERROR(
"Failed calling applying PythonExpert");
504 throw std::runtime_error(
"Failed calling applying PythonExpert");
507 return probabilities;
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if found.
Abstract base class of all Datasets given to the MVA interface. The current event can always be accessed via the public members of this class.
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
unsigned int m_nClasses
Number of classes in a classification problem.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
current state object of method
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
virtual std::vector< std::vector< float > > applyMulticlass(Dataset &test_data) const override
Apply this expert onto a dataset for multiclass problem.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which keeps indicate if this class initialized python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
steering file provided by the user to override the functions in the framework
std::string m_framework
framework to use, e.g. sklearn, tensorflow or theano
std::string m_config
Config string in json, which is passed to the get model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers. Each MVA library has its own implementation of this class, which handles the training of the respective MVA method (completion of truncated sentence — verify against original header).
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Abstract base class for different kinds of events.
Wrap TRandom to be usable as a uniform random number generator with std algorithms like std::shuffle.