  uint64_t numberOfFeatures = training_data.getNumberOfFeatures();
  uint64_t numberOfSpectators = training_data.getNumberOfSpectators();
  uint64_t numberOfEvents = training_data.getNumberOfEvents();

  if (m_specific_options.m_training_fraction <= 0.0 or m_specific_options.m_training_fraction > 1.0) {
    B2ERROR("Please provide a training fraction between (0.0,1.0]");
    throw std::runtime_error("Please provide a training fraction between (0.0,1.0]");
  }
  auto numberOfTrainingEvents =
    static_cast<uint64_t>(numberOfEvents * 100 * m_specific_options.m_training_fraction);
  numberOfTrainingEvents = numberOfTrainingEvents / 100 + (numberOfTrainingEvents % 100 != 0);
  auto numberOfValidationEvents = numberOfEvents - numberOfTrainingEvents;

  // The mini-batch size comes from the specific options (member name m_mini_batch_size
  // assumed); a value of 0 means "use all training events in a single batch".
  uint64_t batch_size = m_specific_options.m_mini_batch_size;
  if (batch_size == 0) {
    batch_size = numberOfTrainingEvents;
  }

  if (batch_size > numberOfTrainingEvents) {
    B2WARNING("Mini batch size (" << batch_size << ") is larger than the number of training events ("
              << numberOfTrainingEvents << "). The batch size has been set equal to the number of training events.");
    batch_size = numberOfTrainingEvents;
  }
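
  // Per-batch buffers for features (X), spectators (S), targets (y) and weights (w). They are
  // allocated once and refilled for every mini batch; the dimensions_* arrays describe the 2D
  // shapes of the numpy views created from them further below.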
  auto X = std::unique_ptr<float[]>(new float[batch_size * numberOfFeatures]);
  auto S = std::unique_ptr<float[]>(new float[batch_size * numberOfSpectators]);
  auto y = std::unique_ptr<float[]>(new float[batch_size]);
  auto w = std::unique_ptr<float[]>(new float[batch_size]);
  npy_intp dimensions_X[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfFeatures)};
  npy_intp dimensions_S[2] = {static_cast<npy_intp>(batch_size), static_cast<npy_intp>(numberOfSpectators)};
  npy_intp dimensions_y[2] = {static_cast<npy_intp>(batch_size), 1};
  npy_intp dimensions_w[2] = {static_cast<npy_intp>(batch_size), 1};
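
  // The validation sample is kept in memory as a whole; it is converted to numpy arrays once
  // and handed to begin_fit below.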
  auto X_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfFeatures]);
  auto S_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents * numberOfSpectators]);
  auto y_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
  auto w_v = std::unique_ptr<float[]>(new float[numberOfValidationEvents]);
  npy_intp dimensions_X_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfFeatures)};
  npy_intp dimensions_S_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), static_cast<npy_intp>(numberOfSpectators)};
  npy_intp dimensions_y_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};
  npy_intp dimensions_w_v[2] = {static_cast<npy_intp>(numberOfValidationEvents), 1};

  // An empty steering file name is taken to mean that no user steering file was given.
  std::string steering_file_source_code;
  if (m_specific_options.m_steering_file != "") {
    steering_file_source_code = loadPythonFileAsString(m_specific_options.m_steering_file);
  }

  std::vector<float> means(numberOfFeatures, 0.0);
  std::vector<float> stds(numberOfFeatures, 0.0);
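
  // If normalization is requested, compute a weighted mean and standard deviation per feature
  // with a numerically stable running update (weighted Welford/West algorithm): for every event
  // the mean is shifted by (w_i / wSum) * (x_i - mean_old) and the variance accumulator by
  // w_i * (x_i - mean_old) * (x_i - mean_new).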
  if (m_specific_options.m_normalize) {  // member name m_normalize assumed for the normalization flag
    auto weights = training_data.getWeights();
    for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
      double wSum = 0.0;
      double mean = 0.0;
      double running_std = 0.0;
      auto feature = training_data.getFeature(iFeature);
      for (uint64_t i = 0; i < weights.size(); ++i) {
        wSum += weights[i];
        double meanOld = mean;
        mean += (weights[i] / wSum) * (feature[i] - meanOld);
        running_std += weights[i] * (feature[i] - meanOld) * (feature[i] - mean);
      }
      means[iFeature] = mean;
      stds[iFeature] = std::sqrt(running_std / (wSum - 1));
    }
  }
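
  // All interaction with the embedded python interpreter happens inside a try block; any
  // failure is reported and rethrown as a std::runtime_error in the catch handler below.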
  try {
    // Load the required python modules.
    auto json = boost::python::import("json");
    auto builtins = boost::python::import("builtins");
    auto inspect = boost::python::import("inspect");
    boost::python::object type = boost::python::import("types");

    // Create a new, empty python module with a unique name, so that repeated trainings in the
    // same interpreter do not interfere with each other.
    boost::uuids::random_generator uuid_gen;
    std::string unique_mva_module_name = "unique_module_name" + boost::uuids::to_string(uuid_gen());
    boost::python::object unique_mva_module = type.attr("ModuleType")(unique_mva_module_name.c_str());

    // Import the chosen framework wrapper (basf2_mva_python_interface.<framework>) and execute
    // its source, followed by the user steering file, inside the unique module.
    auto framework = boost::python::import((std::string("basf2_mva_python_interface.") +
                                            m_specific_options.m_framework).c_str());
    auto framework_file = framework.attr("__file__");
    auto framework_file_source_code = loadPythonFileAsString(
                                        boost::python::extract<std::string>(boost::python::object(framework_file)));
    builtins.attr("exec")(framework_file_source_code.c_str(), boost::python::object(unique_mva_module.attr("__dict__")));
    builtins.attr("exec")(steering_file_source_code.c_str(), boost::python::object(unique_mva_module.attr("__dict__")));
    // The decoded JSON configuration and the trailing get_model arguments are assumptions,
    // based on a python-side signature of
    // get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters).
    auto parameters = json.attr("loads")(m_specific_options.m_config.c_str());
    auto model = unique_mva_module.attr("get_model")(numberOfFeatures, numberOfSpectators,
                                                     numberOfEvents, m_specific_options.m_training_fraction,
                                                     parameters);

    // Fill the validation sample (the first numberOfValidationEvents events of the dataset).
    for (uint64_t iEvent = 0; iEvent < numberOfValidationEvents; ++iEvent) {
      training_data.loadEvent(iEvent);
      if (m_specific_options.m_normalize) {
        for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
          X_v[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
      } else {
        for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
          X_v[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
      }
      for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
        S_v[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
      y_v[iEvent] = training_data.m_target;
      w_v[iEvent] = training_data.m_weight;
    }
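
    // PyArray_SimpleNewFromData creates numpy views on the existing C buffers without copying,
    // so the unique_ptr buffers above must outlive the python objects that use them.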
    auto ndarray_X_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X_v, NPY_FLOAT32, X_v.get()));
    auto ndarray_S_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S_v, NPY_FLOAT32, S_v.get()));
    auto ndarray_y_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y_v, NPY_FLOAT32, y_v.get()));
    auto ndarray_w_v = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w_v, NPY_FLOAT32, w_v.get()));
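
    // begin_fit receives the validation arrays and the number of batches per iteration and
    // returns an opaque state object that is passed on to partial_fit and end_fit.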
    uint64_t nBatches = std::floor(numberOfTrainingEvents / batch_size);
    auto state = unique_mva_module.attr("begin_fit")(model, ndarray_X_v, ndarray_S_v, ndarray_y_v, ndarray_w_v, nBatches);

    bool continue_loop = true;

    // The training events are visited through an index vector, so that they can be reshuffled
    // between iterations without touching the dataset itself.
    std::vector<uint64_t> iteration_index_vector(numberOfTrainingEvents);
    std::iota(std::begin(iteration_index_vector), std::end(iteration_index_vector), 0);

    // The iteration count is taken from the specific options (member name m_nIterations assumed).
    for (uint64_t iIteration = 0; iIteration < m_specific_options.m_nIterations
         and continue_loop; ++iIteration) {

      // Reshuffle the training events on every iteration except the first one.
      if (iIteration > 0) std::shuffle(std::begin(iteration_index_vector), std::end(iteration_index_vector),
                                       TRandomWrapper());

      for (uint64_t iBatch = 0; iBatch < nBatches and continue_loop; ++iBatch) {
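
        // The GIL is released while the batch buffers are filled from the dataset (pure C++
        // work) and re-acquired below before any python objects are created again.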
        PyThreadState* m_thread_state = PyEval_SaveThread();
        for (uint64_t iEvent = 0; iEvent < batch_size; ++iEvent) {
          training_data.loadEvent(iteration_index_vector.at(iEvent + iBatch * batch_size) + numberOfValidationEvents);
          if (m_specific_options.m_normalize) {
            for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
              X[iEvent * numberOfFeatures + iFeature] = (training_data.m_input[iFeature] - means[iFeature]) / stds[iFeature];
          } else {
            for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature)
              X[iEvent * numberOfFeatures + iFeature] = training_data.m_input[iFeature];
          }
          for (uint64_t iSpectator = 0; iSpectator < numberOfSpectators; ++iSpectator)
            S[iEvent * numberOfSpectators + iSpectator] = training_data.m_spectators[iSpectator];
          y[iEvent] = training_data.m_target;
          w[iEvent] = training_data.m_weight;
        }

        // Re-acquire the GIL before creating python objects again: the numpy views below and
        // the partial_fit call both need the interpreter.
        PyEval_RestoreThread(m_thread_state);
        auto ndarray_X = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_X, NPY_FLOAT32, X.get()));
        auto ndarray_S = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_S, NPY_FLOAT32, S.get()));
        auto ndarray_y = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_y, NPY_FLOAT32, y.get()));
        auto ndarray_w = boost::python::handle<>(PyArray_SimpleNewFromData(2, dimensions_w, NPY_FLOAT32, w.get()));

        // partial_fit may return a bool; returning false stops the training loop early.
        auto r = unique_mva_module.attr("partial_fit")(state, ndarray_X, ndarray_S, ndarray_y,
                                                       ndarray_w, iIteration, iBatch);
        boost::python::extract<bool> proxy(r);
        if (proxy.check())
          continue_loop = static_cast<bool>(proxy);
      }
    }
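
    // Serialize the trained model: the object returned by end_fit and the steering file source
    // are pickled into separate files, which are attached to the weightfile further below.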

    auto result = unique_mva_module.attr("end_fit")(state);

    auto pickle = boost::python::import("pickle");
    auto file = builtins.attr("open")(custom_weightfile.c_str(), "wb");
    pickle.attr("dump")(result, file);

    auto steeringfile = builtins.attr("open")(custom_steeringfile.c_str(), "wb");
    pickle.attr("dump")(steering_file_source_code.c_str(), steeringfile);
    auto importances = unique_mva_module.attr("feature_importance")(state);
    if (len(importances) == 0) {
      B2INFO("Python method returned empty feature importance. There won't be any information about the feature importance in the weightfile.");
    } else if (numberOfFeatures != static_cast<uint64_t>(len(importances))) {
      B2WARNING("Python method didn't return the correct number of importance values. The importances are ignored.");
    } else {
      std::map<std::string, float> feature_importances;
      for (uint64_t iFeature = 0; iFeature < numberOfFeatures; ++iFeature) {
        boost::python::extract<float> proxy(importances[iFeature]);
        if (proxy.check()) {
          feature_importances[m_general_options.m_variables[iFeature]] = static_cast<float>(proxy);
        } else {
          B2WARNING("Failed to convert importance output of the method to a float, using 0 instead");
          feature_importances[m_general_options.m_variables[iFeature]] = 0.0;
        }
      }
      // Weightfile::addFeatureImportance is assumed to be the hook that stores the importances.
      weightfile.addFeatureImportance(feature_importances);
    }

  } catch (...) {
    // Print and clear the pending python exception before reporting the failure on the C++ side.
    PyErr_Print();
    PyErr_Clear();
    B2ERROR("Failed calling train in PythonTeacher");
    throw std::runtime_error(std::string("Failed calling train in PythonTeacher"));
  }
  weightfile.addFile("Python_Weightfile", custom_weightfile);
  weightfile.addFile("Python_Steeringfile", custom_steeringfile);
  weightfile.addVector("Python_Means", means);
  weightfile.addVector("Python_Stds", stds);