9 #include <mva/methods/Python.h> 
   11 #include <boost/filesystem/convenience.hpp> 
   12 #include <numpy/npy_common.h> 
   13 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 
   14 #include <numpy/arrayobject.h> 
   16 #include <framework/logging/Logger.h> 
   17 #include <framework/utilities/FileSystem.h> 
   18 #include <framework/utilities/TRandomWrapper.h> 
   32       int version = pt.get<
int>(
"Python_version");
 
   33       if (version < 1 or version > 2) {
 
   34         B2ERROR(
"Unknown weightfile version " << std::to_string(version));
 
   35         throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
 
   37       m_framework = pt.get<std::string>(
"Python_framework");
 
   41       m_config = pt.get<std::string>(
"Python_config");
 
   53       pt.put(
"Python_version", 2);
 
   65       po::options_description description(
"Python options");
 
   66       description.add_options()
 
   68        "Framework which should be used. Currently supported are sklearn, tensorflow and theano")
 
   69       (
"steering_file", po::value<std::string>(&
m_steering_file), 
"Steering file which describes")
 
   70       (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size), 
"Size of the mini batch given to partial_fit function")
 
   71       (
"nIterations", po::value<unsigned int>(&
m_nIterations), 
"Number of iterations")
 
   72       (
"normalize", po::value<bool>(&
m_normalize), 
"Normalize input data (shift mean to 0 and std to 1)")
 
   74        "Training fraction used to split up dataset in training and validation sample.")
 
   75       (
"config", po::value<std::string>(&
m_config), 
"Json encoded python object passed to begin_fit function");
 
  102         if (not Py_IsInitialized()) {
 
  105           wchar_t** bla = 
nullptr;
 
  106           PySys_SetArgvEx(0, bla, 0);
 
  110         if (PyArray_API == 
nullptr) {
 
  121           if (Py_IsInitialized()) {
 
  154       m_specific_options(specific_options)
 
  167       uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
 
  168       uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
 
  169       uint64_t numberOfEvents = training_data.getNumberOfEvents();
 
  175       if (batch_size == 0) {
 
  176         batch_size = numberOfTrainingEvents;
 
  179       if (batch_size > numberOfTrainingEvents) {
 
  180         B2WARNING(
"Mini batch size (" << batch_size << 
") is larger than the number of training events (" << numberOfTrainingEvents << 
")"\
 
  181                   " The batch size has been set equal to the number of training events.");
 
  182         batch_size = numberOfTrainingEvents;
 
  186         B2ERROR(
"Please provide a positive training fraction");
 
  187         throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
 
  190       auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
 
  191       auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
 
  192       auto y = std::unique_ptr<float[]>(
new float[batch_size]);
 
  193       auto w = std::unique_ptr<float[]>(
new float[batch_size]);
 
  194       npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size), 
static_cast<npy_intp
>(numberOfFeatures)};
 
  195       npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size), 
static_cast<npy_intp
>(numberOfSpectators)};
 
  196       npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
 
  197       npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
 
  199       auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
 
  200       auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
 
  201       auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
 
  202       auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
 
  203       npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 
static_cast<npy_intp
>(numberOfFeatures)};
 
  204       npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 
static_cast<npy_intp
>(numberOfSpectators)};
 
  205       npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
 
  206       npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
 
  208       std::string steering_file_source_code;
 
  211         std::ifstream steering_file(filename);
 
  212         if (not steering_file) {
 
  213           throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
 
  215         steering_file.seekg(0, std::ios::end);
 
  216         steering_file_source_code.resize(steering_file.tellg());
 
  217         steering_file.seekg(0, std::ios::beg);
 
  218         steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
 
  221       std::vector<float> means(numberOfFeatures, 0.0);
 
  222       std::vector<float> stds(numberOfFeatures, 0.0);
 
  227         auto weights = training_data.getWeights();
 
  228         for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
 
  231           double running_std = 0.0;
 
  232           auto feature = training_data.getFeature(iFeature);
 
  233           for (uint64_t i = 0; i < weights.size(); ++i) {
 
  235             double meanOld = mean;
 
  236             mean += (weights[i] / wSum) * (feature[i] - meanOld);
 
  237             running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
 
  239           means[iFeature] = mean;
 
  240           stds[iFeature] = 
std::sqrt(running_std / (wSum - 1));
 
  246         auto json = boost::python::import(
"json");
 
  247         auto builtins = boost::python::import(
"builtins");
 
  248         auto inspect = boost::python::import(
"inspect");
 
  253         builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
 
  257         auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
 
  261         for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
 
  262           training_data.loadEvent(iEvent);
 
  264             for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  265               X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
 
  267             for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  268               X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
 
  270           for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
 
  271             S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
 
  272           y_v[iEvent] = training_data.m_target;
 
  273           w_v[iEvent] = training_data.m_weight;
 
  276         auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
 
  277         auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
 
  278         auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
 
  279         auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
 
  281         uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
 
  283         auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);
 
  285         bool continue_loop = 
true;
 
  287         std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
 
  288         std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);
 
  291              and continue_loop; ++iIteration) {
 
  294           if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector), 
TRandomWrapper());
 
  296           for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
 
  300             PyThreadState* m_thread_state =  PyEval_SaveThread();
 
  301             for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
 
  302               training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
 
  304                 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  305                   X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
 
  307                 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  308                   X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
 
  310               for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
 
  311                 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
 
  312               y[iEvent] = training_data.m_target;
 
  313               w[iEvent] = training_data.m_weight;
 
  317             auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
 
  318             auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
 
  319             auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
 
  320             auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
 
  323             PyEval_RestoreThread(m_thread_state);
 
  324             auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
 
  325                                                    ndarray_w, iIteration, iBatch);
 
  326             boost::python::extract<bool> proxy(r);
 
  328               continue_loop = 
static_cast<bool>(proxy);
 
  332         auto result = framework.attr(
"end_fit")(state);
 
  334         auto pickle = boost::python::import(
"pickle");
 
  335         auto file = builtins.attr(
"open")(custom_weightfile.c_str(), 
"wb");
 
  336         pickle.attr(
"dump")(result, file);
 
  338         auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(), 
"wb");
 
  339         pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
 
  341         auto importances = framework.attr(
"feature_importance")(state);
 
  342         if (len(importances) == 0) {
 
  343           B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
 
  344         } 
else if (numberOfFeatures != 
static_cast<uint64_t
>(len(importances))) {
 
  345           B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
 
  347           std::map<std::string, float> feature_importances;
 
  348           for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
 
  349             boost::python::extract<float> proxy(importances[iFeature]);
 
  353               B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
 
  363         B2ERROR(
"Failed calling train in PythonTeacher");
 
  364         throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
 
  369       weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
 
  370       weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
 
  373         weightfile.
addVector(
"Python_Means", means);
 
  374         weightfile.
addVector(
"Python_Stds", stds);
 
  391       weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
 
  401         auto pickle = boost::python::import(
"pickle");
 
  402         auto builtins = boost::python::import(
"builtins");
 
  407           weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
 
  408           auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(), 
"rb");
 
  409           auto source_code = pickle.attr(
"load")(steeringfile);
 
  410           builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
 
  413         auto file = builtins.attr(
"open")(custom_weightfile.c_str(), 
"rb");
 
  414         auto unpickled_fit_object = pickle.attr(
"load")(file);
 
  419         B2ERROR(
"Failed calling load in PythonExpert");
 
  420         throw std::runtime_error(
"Failed calling load in PythonExpert");
 
  428       uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
 
  429       uint64_t numberOfEvents = test_data.getNumberOfEvents();
 
  431       auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
 
  432       npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents), 
static_cast<npy_intp
>(numberOfFeatures)};
 
  434       for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
 
  435         test_data.loadEvent(iEvent);
 
  437           for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  438             X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] - 
m_means[iFeature]) / 
m_stds[iFeature];
 
  440           for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  441             X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
 
  445       std::vector<float> probabilities(test_data.getNumberOfEvents(), std::numeric_limits<float>::quiet_NaN());
 
  448         auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
 
  450         for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
 
  453           probabilities[iEvent] = 
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
 
  459         B2ERROR(
"Failed calling applying PythonExpert");
 
  460         throw std::runtime_error(
"Failed calling applying PythonExpert");
 
  463       return probabilities;
 
  469       uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
 
  470       uint64_t numberOfEvents = test_data.getNumberOfEvents();
 
  472       auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
 
  473       npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents), 
static_cast<npy_intp
>(numberOfFeatures)};
 
  475       for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
 
  476         test_data.loadEvent(iEvent);
 
  478           for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  479             X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] - 
m_means[iFeature]) / 
m_stds[iFeature];
 
  481           for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
 
  482             X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
 
  487       std::vector<std::vector<float>> probabilities(test_data.getNumberOfEvents(), std::vector<float>(nClasses,
 
  488                                                     std::numeric_limits<float>::quiet_NaN()));
 
  491         auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
 
  493         for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
 
  496           for (uint64_t iClass = 0; iClass < nClasses; ++iClass) {
 
  497             probabilities[iEvent][iClass] = 
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR2(
reinterpret_cast<PyArrayObject*
> 
  505         B2ERROR(
"Failed calling applying PythonExpert");
 
  506         throw std::runtime_error(
"Failed calling applying PythonExpert");
 
  509       return probabilities;
 
static std::string findFile(const std::string &path, bool silent=false)
Search for given file or directory in local or central release directory, and return absolute path if...
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
GeneralOptions m_general_options
General options loaded from the weightfile.
General options which are shared by all MVA trainings.
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
unsigned int m_nClasses
Number of classes in a classification problem.
PythonExpert()
Constructs a new Python Expert.
boost::python::object m_state
current state object of method
std::vector< float > m_stds
Stds of all features for normalization.
boost::python::object m_framework
Framework module.
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
PythonOptions m_specific_options
Method specific options.
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
std::vector< float > m_means
Means of all features for normalization.
virtual std::vector< std::vector< float > > applyMulticlass(Dataset &test_data) const override
Apply this expert onto a dataset for multiclass problem.
Singleton class which handles the initialization and finalization of Python and numpy.
void * init_numpy()
Helper function which initializes array system of numpy.
~PythonInitializerSingleton()
Destructor of PythonInitializerSingleton.
bool m_initialized_python
Member which indicates if this class initialized Python.
static PythonInitializerSingleton & GetInstance()
Return static instance of PythonInitializerSingleton.
PythonInitializerSingleton()
Constructor of PythonInitializerSingleton.
PythonInitializerSingleton(const PythonInitializerSingleton &)=delete
Forbid copy constructor of PythonInitializerSingleton.
Options for the Python MVA method.
unsigned int m_nIterations
Number of iterations through the whole data.
std::string m_steering_file
steering file provided by the user to override the functions in the framework
std::string m_framework
framework to use e.g.
std::string m_config
Config string in json, which is passed to the get model function.
virtual po::options_description getDescription() override
Returns a program options description for all available options.
bool m_normalize
Normalize the inputs (shift mean to zero and std to 1)
double m_training_fraction
Fraction of data passed as training data, rest is passed as test data.
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
unsigned int m_mini_batch_size
Mini batch size, 0 passes the whole data in one call.
PythonTeacher(const GeneralOptions &general_options, const PythonOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
PythonOptions m_specific_options
Method specific options.
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Abstract base class of all Teachers. Each MVA library has its own implementation of this class,...
GeneralOptions m_general_options
GeneralOptions containing all shared options.
The Weightfile class serializes all information about a training into an xml tree.
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
bool containsElement(const std::string &identifier) const
Returns true if given element is stored in the property tree.
void addOptions(const Options &options)
Add an Option object to the xml tree.
std::vector< T > getVector(const std::string &identifier) const
Returns a stored vector from the xml tree.
void getOptions(Options &options) const
Fills an Option object from the xml tree.
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
void addVector(const std::string &identifier, const std::vector< T > &vector)
Add a vector to the xml tree.
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
double sqrt(double a)
sqrt for double
Abstract base class for different kinds of events.
Wrap TRandom to be usable as a uniform random number generator with STL algorithms like std::shuffle...