9 #include <mva/methods/Python.h>
11 #include <boost/filesystem/convenience.hpp>
12 #include <numpy/npy_common.h>
13 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
14 #include <numpy/arrayobject.h>
16 #include <framework/logging/Logger.h>
17 #include <framework/utilities/FileSystem.h>
29 int version = pt.get<
int>(
"Python_version");
30 if (version < 1 or version > 2) {
31 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
32 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
34 m_framework = pt.get<std::string>(
"Python_framework");
38 m_config = pt.get<std::string>(
"Python_config");
50 pt.put(
"Python_version", 2);
62 po::options_description description(
"Python options");
63 description.add_options()
65 "Framework which should be used. Currently supported are sklearn, tensorflow and theano")
66 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes")
67 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
68 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
69 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
71 "Training fraction used to split up dataset in training and validation sample.")
72 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
99 if (not Py_IsInitialized()) {
102 wchar_t** bla =
nullptr;
103 PySys_SetArgvEx(0, bla, 0);
107 if (PyArray_API ==
nullptr) {
118 if (Py_IsInitialized()) {
151 m_specific_options(specific_options)
164 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
165 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
166 uint64_t numberOfEvents = training_data.getNumberOfEvents();
172 if (batch_size == 0) {
173 batch_size = numberOfTrainingEvents;
177 B2ERROR(
"Please provide a positive training fraction");
178 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
181 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
182 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
183 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
184 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
185 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
186 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
187 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
188 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
190 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
191 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
192 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
193 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
194 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
195 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
196 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
197 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
199 std::string steering_file_source_code;
202 std::ifstream steering_file(filename);
203 if (not steering_file) {
204 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
206 steering_file.seekg(0, std::ios::end);
207 steering_file_source_code.resize(steering_file.tellg());
208 steering_file.seekg(0, std::ios::beg);
209 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
212 std::vector<float> means(numberOfFeatures, 0.0);
213 std::vector<float> stds(numberOfFeatures, 0.0);
218 auto weights = training_data.getWeights();
219 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
223 double running_std = 0.0;
224 auto feature = training_data.getFeature(iFeature);
225 for (uint64_t i = 0; i < weights.size(); ++i) {
227 wSum2 += weights[i] * weights[i];
228 double meanOld = mean;
229 mean += (weights[i] / wSum) * (feature[i] - meanOld);
230 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
232 means[iFeature] = mean;
233 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
239 auto json = boost::python::import(
"json");
240 auto builtins = boost::python::import(
"builtins");
241 auto inspect = boost::python::import(
"inspect");
246 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
250 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
254 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
255 training_data.loadEvent(iEvent);
257 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
258 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
260 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
261 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
263 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
264 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
265 y_v[iEvent] = training_data.m_target;
266 w_v[iEvent] = training_data.m_weight;
269 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
270 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
271 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
272 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
274 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v);
276 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
277 bool continue_loop =
true;
279 and continue_loop; ++iIteration) {
280 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
284 PyThreadState* m_thread_state = PyEval_SaveThread();
285 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
286 training_data.loadEvent(iEvent + iBatch * batch_size + numberOfValidationEvents);
288 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
289 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
291 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
292 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
294 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
295 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
296 y[iEvent] = training_data.m_target;
297 w[iEvent] = training_data.m_weight;
301 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
302 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
303 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
304 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
307 PyEval_RestoreThread(m_thread_state);
308 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
309 ndarray_w, iIteration * nBatches + iBatch);
310 boost::python::extract<bool> proxy(r);
312 continue_loop =
static_cast<bool>(proxy);
316 auto result = framework.attr(
"end_fit")(state);
318 auto pickle = boost::python::import(
"pickle");
319 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
320 pickle.attr(
"dump")(result, file);
322 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
323 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
325 auto importances = framework.attr(
"feature_importance")(state);
326 if (len(importances) == 0) {
327 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
328 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
329 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
331 std::map<std::string, float> feature_importances;
332 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
333 boost::python::extract<float> proxy(importances[iFeature]);
337 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
347 B2ERROR(
"Failed calling train in PythonTeacher");
348 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
353 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
354 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
357 weightfile.
addVector(
"Python_Means", means);
358 weightfile.
addVector(
"Python_Stds", stds);
375 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
385 auto pickle = boost::python::import(
"pickle");
386 auto builtins = boost::python::import(
"builtins");
391 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
392 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
393 auto source_code = pickle.attr(
"load")(steeringfile);
394 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
397 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
398 auto unpickled_fit_object = pickle.attr(
"load")(file);
403 B2ERROR(
"Failed calling load in PythonExpert");
404 throw std::runtime_error(
"Failed calling load in PythonExpert");
412 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
413 uint64_t numberOfEvents = test_data.getNumberOfEvents();
415 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
416 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
418 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
419 test_data.loadEvent(iEvent);
421 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
422 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
424 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
425 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
429 std::vector<float> probabilities(test_data.getNumberOfEvents());
432 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
434 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
437 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
443 B2ERROR(
"Failed calling applying PythonExpert");
444 throw std::runtime_error(
"Failed calling applying PythonExpert");
447 return probabilities;
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if...
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
Current state object of the method.
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which indicates whether this class initialized Python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
Steering file provided by the user to override the functions in the framework.
std::string m_framework
framework to use e.g.
std::string m_config
Config string in json, which is passed to the get model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers. Each MVA library has its own implementation of this class,...
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Abstract base class for different kinds of events.