#include <mva/methods/Python.h>

#include <boost/filesystem/convenience.hpp>

// NPY_NO_DEPRECATED_API must be defined before ANY numpy header is included,
// otherwise npy_common.h is parsed without the deprecation guard in effect.
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>

#include <framework/logging/Logger.h>
#include <framework/utilities/FileSystem.h>
32 int version = pt.get<
int>(
"Python_version");
33 if (version < 1 or version > 2) {
34 B2ERROR(
"Unknown weightfile version " << std::to_string(version));
35 throw std::runtime_error(
"Unknown weightfile version " + std::to_string(version));
37 m_framework = pt.get<std::string>(
"Python_framework");
41 m_config = pt.get<std::string>(
"Python_config");
53 pt.put(
"Python_version", 2);
65 po::options_description description(
"Python options");
66 description.add_options()
68 "Framework which should be used. Currently supported are sklearn, tensorflow and theano")
69 (
"steering_file", po::value<std::string>(&
m_steering_file),
"Steering file which describes")
70 (
"mini_batch_size", po::value<unsigned int>(&
m_mini_batch_size),
"Size of the mini batch given to partial_fit function")
71 (
"nIterations", po::value<unsigned int>(&
m_nIterations),
"Number of iterations")
72 (
"normalize", po::value<bool>(&
m_normalize),
"Normalize input data (shift mean to 0 and std to 1)")
74 "Training fraction used to split up dataset in training and validation sample.")
75 (
"config", po::value<std::string>(&
m_config),
"Json encoded python object passed to begin_fit function");
102 if (not Py_IsInitialized()) {
105 wchar_t** bla =
nullptr;
106 PySys_SetArgvEx(0, bla, 0);
110 if (PyArray_API ==
nullptr) {
121 if (Py_IsInitialized()) {
154 m_specific_options(specific_options)
167 uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
168 uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
169 uint64_t numberOfEvents = training_data.getNumberOfEvents();
175 if (batch_size == 0) {
176 batch_size = numberOfTrainingEvents;
180 B2ERROR(
"Please provide a positive training fraction");
181 throw std::runtime_error(
"Please provide a training fraction between (0.0,1.0]");
184 auto X = std::unique_ptr<float[]>(
new float[batch_size * numberOfFeatures]);
185 auto S = std::unique_ptr<float[]>(
new float[batch_size * numberOfSpectators]);
186 auto y = std::unique_ptr<float[]>(
new float[batch_size]);
187 auto w = std::unique_ptr<float[]>(
new float[batch_size]);
188 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfFeatures)};
189 npy_intp dimensions_S[2] = {
static_cast<npy_intp
>(batch_size),
static_cast<npy_intp
>(numberOfSpectators)};
190 npy_intp dimensions_y[2] = {
static_cast<npy_intp
>(batch_size), 1};
191 npy_intp dimensions_w[2] = {
static_cast<npy_intp
>(batch_size), 1};
193 auto X_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfFeatures]);
194 auto S_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents * numberOfSpectators]);
195 auto y_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
196 auto w_v = std::unique_ptr<float[]>(
new float[numberOfValidationEvents]);
197 npy_intp dimensions_X_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfFeatures)};
198 npy_intp dimensions_S_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents),
static_cast<npy_intp
>(numberOfSpectators)};
199 npy_intp dimensions_y_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
200 npy_intp dimensions_w_v[2] = {
static_cast<npy_intp
>(numberOfValidationEvents), 1};
202 std::string steering_file_source_code;
205 std::ifstream steering_file(filename);
206 if (not steering_file) {
207 throw std::runtime_error(std::string(
"Couldn't open file ") + filename);
209 steering_file.seekg(0, std::ios::end);
210 steering_file_source_code.resize(steering_file.tellg());
211 steering_file.seekg(0, std::ios::beg);
212 steering_file.read(&steering_file_source_code[0], steering_file_source_code.size());
215 std::vector<float> means(numberOfFeatures, 0.0);
216 std::vector<float> stds(numberOfFeatures, 0.0);
221 auto weights = training_data.getWeights();
222 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
226 double running_std = 0.0;
227 auto feature = training_data.getFeature(iFeature);
228 for (uint64_t i = 0; i < weights.size(); ++i) {
230 wSum2 += weights[i] * weights[i];
231 double meanOld = mean;
232 mean += (weights[i] / wSum) * (feature[i] - meanOld);
233 running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
235 means[iFeature] = mean;
236 stds[iFeature] = std::sqrt(running_std / (wSum - 1));
242 auto json = boost::python::import(
"json");
243 auto builtins = boost::python::import(
"builtins");
244 auto inspect = boost::python::import(
"inspect");
249 builtins.attr(
"exec")(steering_file_source_code.c_str(), boost::python::object(framework.attr(
"__dict__")));
253 auto model = framework.attr(
"get_model")(numberOfFeatures, numberOfSpectators,
257 for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
258 training_data.loadEvent(iEvent);
260 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
261 X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
263 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
264 X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
266 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
267 S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
268 y_v[iEvent] = training_data.m_target;
269 w_v[iEvent] = training_data.m_weight;
272 auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
273 auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
274 auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
275 auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
277 auto state = framework.attr(
"begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v);
279 uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
280 bool continue_loop =
true;
282 and continue_loop; ++iIteration) {
283 for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
287 PyThreadState* m_thread_state = PyEval_SaveThread();
288 for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
289 training_data.loadEvent(iEvent + iBatch * batch_size + numberOfValidationEvents);
291 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
292 X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
294 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
295 X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
297 for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
298 S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
299 y[iEvent] = training_data.m_target;
300 w[iEvent] = training_data.m_weight;
304 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
305 auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
306 auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
307 auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));
310 PyEval_RestoreThread(m_thread_state);
311 auto r = framework.attr(
"partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
312 ndarray_w, iIteration * nBatches + iBatch);
313 boost::python::extract<bool> proxy(r);
315 continue_loop =
static_cast<bool>(proxy);
319 auto result = framework.attr(
"end_fit")(state);
321 auto pickle = boost::python::import(
"pickle");
322 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"wb");
323 pickle.attr(
"dump")(result, file);
325 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"wb");
326 pickle.attr(
"dump")(steering_file_source_code.c_str(), steeringfile);
328 auto importances = framework.attr(
"feature_importance")(state);
329 if (len(importances) == 0) {
330 B2INFO(
"Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
331 }
else if (numberOfFeatures !=
static_cast<uint64_t
>(len(importances))) {
332 B2WARNING(
"Python method didn't return the correct number of importance value. I ignore the importances");
334 std::map<std::string, float> feature_importances;
335 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
336 boost::python::extract<float> proxy(importances[iFeature]);
340 B2WARNING(
"Failed to convert importance output of the method to a float, using 0 instead");
350 B2ERROR(
"Failed calling train in PythonTeacher");
351 throw std::runtime_error(std::string(
"Failed calling train in PythonTeacher"));
356 weightfile.
addFile(
"Python_Weightfile", custom_weightfile);
357 weightfile.
addFile(
"Python_Steeringfile", custom_steeringfile);
360 weightfile.
addVector(
"Python_Means", means);
361 weightfile.
addVector(
"Python_Stds", stds);
378 weightfile.
getFile(
"Python_Weightfile", custom_weightfile);
388 auto pickle = boost::python::import(
"pickle");
389 auto builtins = boost::python::import(
"builtins");
394 weightfile.
getFile(
"Python_Steeringfile", custom_steeringfile);
395 auto steeringfile = builtins.attr(
"open")(custom_steeringfile.c_str(),
"rb");
396 auto source_code = pickle.attr(
"load")(steeringfile);
397 builtins.attr(
"exec")(boost::python::object(source_code), boost::python::object(
m_framework.attr(
"__dict__")));
400 auto file = builtins.attr(
"open")(custom_weightfile.c_str(),
"rb");
401 auto unpickled_fit_object = pickle.attr(
"load")(file);
406 B2ERROR(
"Failed calling load in PythonExpert");
407 throw std::runtime_error(
"Failed calling load in PythonExpert");
415 uint64_t numberOfFeatures = test_data.getNumberOfFeatures();
416 uint64_t numberOfEvents = test_data.getNumberOfEvents();
418 auto X = std::unique_ptr<float[]>(
new float[numberOfEvents * numberOfFeatures]);
419 npy_intp dimensions_X[2] = {
static_cast<npy_intp
>(numberOfEvents),
static_cast<npy_intp
>(numberOfFeatures)};
421 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
422 test_data.loadEvent(iEvent);
424 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
425 X[iEvent * numberOfFeatures + iFeature] = (test_data.m_input[iFeature] -
m_means[iFeature]) /
m_stds[iFeature];
427 for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
428 X[iEvent * numberOfFeatures + iFeature] = test_data.m_input[iFeature];
432 std::vector<float> probabilities(test_data.getNumberOfEvents());
435 auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
437 for (uint64_t iEvent = 0; iEvent < numberOfEvents; ++iEvent) {
440 probabilities[iEvent] =
static_cast<float>(*
static_cast<float*
>(PyArray_GETPTR1(
reinterpret_cast<PyArrayObject*
>(result.ptr()),
446 B2ERROR(
"Failed calling applying PythonExpert");
447 throw std::runtime_error(
"Failed calling applying PythonExpert");
450 return probabilities;