Weightfile PythonTeacher::train(Dataset& training_data) const
{

  Weightfile weightfile;
  std::string custom_weightfile = weightfile.generateFileName();
  std::string custom_steeringfile = weightfile.generateFileName();

  uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
  uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
  uint64_t numberOfEvents = training_data.getNumberOfEvents();

  if (m_specific_options.m_training_fraction <= 0.0 or m_specific_options.m_training_fraction > 1.0) {
    B2ERROR("Please provide a training fraction in the range (0.0, 1.0]");
    throw std::runtime_error("Please provide a training fraction between (0.0,1.0]");
  }

  // Convert the training fraction into a number of training events, rounding up to avoid floating point issues
  auto numberOfTrainingEvents = static_cast<uint64_t>(numberOfEvents * 100 * m_specific_options.m_training_fraction);
  numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
  auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;

  // A mini batch size of 0 means: pass the whole training sample in a single call
  uint64_t batch_size = m_specific_options.m_mini_batch_size;
  if (batch_size == 0) {
    batch_size = numberOfTrainingEvents;
  }

  if (batch_size > numberOfTrainingEvents) {
    B2WARNING("Mini batch size (" << batch_size << ") is larger than the number of training events (" << numberOfTrainingEvents << ")."
              " The batch size has been set equal to the number of training events.");
    batch_size = numberOfTrainingEvents;
  }

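  // Flat float buffers and numpy shape descriptors for one mini batch of
  // features (X), spectators (S), targets (y) and weights (w)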
  auto X = std::unique_ptr<float[]>(new float[batch_size * numberOfFeatures]);
  auto S = std::unique_ptr<float[]>(new float[batch_size * numberOfSpectators]);
  auto y = std::unique_ptr<float[]>(new float[batch_size]);
  auto w = std::unique_ptr<float[]>(new float[batch_size]);
  npy_intp dimensions_X[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfFeatures)};
  npy_intp dimensions_S[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfSpectators)};
  npy_intp dimensions_y[2] = {static_cast<npy_intp>(batch_size), 1};
  npy_intp dimensions_w[2] = {static_cast<npy_intp>(batch_size), 1};

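  // Corresponding buffers for the validation sample, which is passed to begin_fit once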
  auto X_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfFeatures]);
  auto S_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfSpectators]);
  auto y_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
  auto w_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
  npy_intp dimensions_X_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfFeatures)};
  npy_intp dimensions_S_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfSpectators)};
  npy_intp dimensions_y_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
  npy_intp dimensions_w_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};

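  // Read the optional user steering file into memory; its source code is later
  // executed inside the framework module to override the default hook functions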
  std::string steering_file_source_code;
  if (m_specific_options.m_steering_file != "") {
    std::string filename = FileSystem::findFile(m_specific_options.m_steering_file);
    std::ifstream steering_file(filename);
    if (not steering_file) {
      throw std::runtime_error(std::string("Couldn't open file ") + filename);
    }
    steering_file.seekg(0, std::ios::end);
    steering_file_source_code.resize(steering_file.tellg());
    steering_file.seekg(0, std::ios::beg);
    steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
  }

  std::vector<float> means(numberOfFeatures, 0.0);
  std::vector<float> stds(numberOfFeatures, 0.0);
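
  // If normalization is requested, a weighted mean and standard deviation are computed
  // per feature with a numerically stable incremental update:
  //   wSum        += w_i
  //   mean        += (w_i / wSum) * (x_i - mean_old)
  //   running_std += w_i * (x_i - mean_old) * (x_i - mean_new)
  //   std          = sqrt(running_std / (wSum - 1))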
  if (m_specific_options.m_normalize) {
    auto weights = training_data.getWeights();
    for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
      double wSum = 0.0;
      double mean = 0.0;
      double running_std = 0.0;
      auto feature = training_data.getFeature(iFeature);
      for (uint64_t i = 0; i < weights.size(); ++i) {
        wSum += weights[i];
        double meanOld = mean;
        mean += (weights[i] / wSum) * (feature[i] - meanOld);
        running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
      }
      means[iFeature] = mean;
      stds[iFeature] = std::sqrt(running_std / (wSum - 1));
    }
  }

  try {

    auto json = boost::python::import("json");
    auto builtins = boost::python::import("builtins");
    auto inspect = boost::python::import("inspect");

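    // The python module selected via m_framework is expected to provide the hook
    // functions called below: get_model, begin_fit, partial_fit, end_fit and
    // feature_importance.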
    // Load the framework module; the user steering file, exec'd into its __dict__,
    // may override any of these functions
    auto framework = boost::python::import((std::string("basf2_mva_python_interface.") + m_specific_options.m_framework).c_str());
    builtins.attr("exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr("__dict__")));

    // Build the model with the user-provided json configuration
    auto parameters = json.attr("loads")(m_specific_options.m_config.c_str());
    auto model = framework.attr("get_model")(numberOfFeatures, numberOfSpectators,
                                             numberOfEvents, m_specific_options.m_training_fraction, parameters);

    // Fill the validation sample (the first numberOfValidationEvents events)
    for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
      training_data.loadEvent(iEvent);
      if (m_specific_options.m_normalize) {
        for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
          X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
      } else {
        for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
          X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
      }
      for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
        S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
      y_v[iEvent] = training_data.m_target;
      w_v[iEvent] = training_data.m_weight;
    }

    auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
    auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
    auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
    auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));

    uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);

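    // begin_fit receives the validation arrays and the number of batches per iteration,
    // and returns an opaque state object that is handed to the later hook calls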
    auto state = framework.attr("begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);

    bool continue_loop = true;

    std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
    std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);

    for (uint64_t iIteration = 0; iIteration < m_specific_options.m_nIterations
         and continue_loop; ++iIteration) {

      // Reshuffle the training events in every iteration after the first one
      if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector), TRandomWrapper());

      for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {

        // Release the python GIL while the batch buffers are filled, so python code
        // in other threads can run in parallel
        PyThreadState* m_thread_state = PyEval_SaveThread();
        for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
          training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
          if (m_specific_options.m_normalize) {
            for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
              X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
          } else {
            for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
              X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
          }
          for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
            S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
          y[iEvent] = training_data.m_target;
          w[iEvent] = training_data.m_weight;
        }

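        // Wrap the batch buffers as numpy arrays viewing the existing memory (no copy)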
        auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
        auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
        auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
        auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));

        // Reacquire the GIL before calling back into python
        PyEval_RestoreThread(m_thread_state);
        auto r = framework.attr("partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
                                               ndarray_w, iIteration, iBatch);
        boost::python::extract<bool> proxy(r);
        if (proxy.check())
          continue_loop = static_cast<bool>(proxy);
      }
    }

    auto result = framework.attr("end_fit")(state);

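    // Serialize the result returned by end_fit and the steering file source code with
    // pickle; both files are attached to the weightfile below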
    auto pickle = boost::python::import("pickle");
    auto file = builtins.attr("open")(custom_weightfile.c_str(), "wb");
    pickle.attr("dump")(result, file);

    auto steeringfile = builtins.attr("open")(custom_steeringfile.c_str(), "wb");
    pickle.attr("dump")(steering_file_source_code.c_str(), steeringfile);

    auto importances = framework.attr("feature_importance")(state);
    if (len(importances) == 0) {
      B2INFO("Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
    } else if (numberOfFeatures != static_cast<uint64_t>(len(importances))) {
      B2WARNING("Python method didn't return the correct number of importance values. The importances are ignored.");
    } else {
      std::map<std::string, float> feature_importances;
      for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
        boost::python::extract<float> proxy(importances[iFeature]);
        if (proxy.check()) {
          feature_importances[m_general_options.m_variables.at(iFeature)] = static_cast<float>(proxy);
        } else {
          B2WARNING("Failed to convert importance output of the method to a float, using 0 instead");
          feature_importances[m_general_options.m_variables.at(iFeature)] = 0.0;
        }
      }
      weightfile.addFeatureImportance(feature_importances);
    }

  } catch (...) {
    PyErr_Print();
    PyErr_Clear();
    B2ERROR("Failed calling train in PythonTeacher");
    throw std::runtime_error(std::string("Failed calling train in PythonTeacher"));
  }

  weightfile.addOptions(m_general_options);
  weightfile.addOptions(m_specific_options);
  weightfile.addFile("Python_Weightfile", custom_weightfile);
  weightfile.addFile("Python_Steeringfile", custom_steeringfile);
  weightfile.addSignalFraction(training_data.getSignalFraction());
  if (m_specific_options.m_normalize) {
    weightfile.addVector("Python_Means", means);
    weightfile.addVector("Python_Stds", stds);
  }

  return weightfile;

}
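
// Note: TRandomWrapper, used with std::shuffle above, is not part of this listing.
// std::shuffle only requires it to satisfy the UniformRandomBitGenerator concept
// (result_type, min(), max(), operator()). A minimal sketch, assuming it forwards to
// ROOT's global gRandom so that shuffling follows the framework-wide random seed
// (needs <limits> and TRandom.h), could look like:
struct TRandomWrapper {
  typedef unsigned int result_type;
  static constexpr result_type min() { return 0; }
  static constexpr result_type max() { return std::numeric_limits<result_type>::max(); }
  result_type operator()() { return gRandom->Integer(max()); }
};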