Belle II Software development
FastBDT.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#include <mva/methods/FastBDT.h>
10
11#include <framework/logging/Logger.h>
12#include <sstream>
13#include <vector>
14
15namespace Belle2 {
20 namespace MVA {
/**
 * Checks whether the given signal flags contain both classes.
 * @param Signals class flag of every event
 * @return true if at least one entry differs from the first one,
 *         false if all entries are equal or if the vector is empty
 */
bool isValidSignal(const std::vector<bool>& Signals)
{
  // An empty sample contains no class at all; front() on it would be undefined behaviour.
  if (Signals.empty())
    return false;

  const bool first = Signals.front();
  for (const bool value : Signals) {
    // The first entry of the other class is enough to prove that both classes are present.
    if (value != first)
      return true;
  }
  return false;
}
30
31 void FastBDTOptions::load(const boost::property_tree::ptree& pt)
32 {
33 int version = pt.get<int>("FastBDT_version");
34 if (version != 1 and version != 2) {
35 B2ERROR("Unknown weightfile version " << std::to_string(version));
36 throw std::runtime_error("Unknown weightfile version " + std::to_string(version));
37 }
38 m_nTrees = pt.get<int>("FastBDT_nTrees");
39 m_nCuts = pt.get<int>("FastBDT_nCuts");
40 m_nLevels = pt.get<int>("FastBDT_nLevels");
41 m_shrinkage = pt.get<double>("FastBDT_shrinkage");
42 m_randRatio = pt.get<double>("FastBDT_randRatio");
43
44 if (version > 1) {
45
46 m_flatnessLoss = pt.get<double>("FastBDT_flatnessLoss");
47 m_sPlot = pt.get<bool>("FastBDT_sPlot");
48
49 unsigned int numberOfIndividualNCuts = pt.get<unsigned int>("FastBDT_number_individual_nCuts", 0);
50 m_individual_nCuts.resize(numberOfIndividualNCuts);
51 for (unsigned int i = 0; i < numberOfIndividualNCuts; ++i) {
52 m_individual_nCuts[i] = pt.get<unsigned int>(std::string("FastBDT_individual_nCuts") + std::to_string(i));
53 }
54
55 m_purityTransformation = pt.get<bool>("FastBDT_purityTransformation");
56 unsigned int numberOfIndividualPurityTransformation = pt.get<unsigned int>("FastBDT_number_individualPurityTransformation", 0);
57 m_individualPurityTransformation.resize(numberOfIndividualPurityTransformation);
58 for (unsigned int i = 0; i < numberOfIndividualPurityTransformation; ++i) {
59 m_individualPurityTransformation[i] = pt.get<bool>(std::string("FastBDT_individualPurityTransformation") + std::to_string(i));
60 }
61
62 } else {
63 m_flatnessLoss = -1.0;
64 m_sPlot = false;
65 }
66 }
67
68 void FastBDTOptions::save(boost::property_tree::ptree& pt) const
69 {
70 pt.put("FastBDT_version", 2);
71 pt.put("FastBDT_nTrees", m_nTrees);
72 pt.put("FastBDT_nCuts", m_nCuts);
73 pt.put("FastBDT_nLevels", m_nLevels);
74 pt.put("FastBDT_shrinkage", m_shrinkage);
75 pt.put("FastBDT_randRatio", m_randRatio);
76 pt.put("FastBDT_flatnessLoss", m_flatnessLoss);
77 pt.put("FastBDT_sPlot", m_sPlot);
78 pt.put("FastBDT_number_individual_nCuts", m_individual_nCuts.size());
79 for (unsigned int i = 0; i < m_individual_nCuts.size(); ++i) {
80 pt.put(std::string("FastBDT_individual_nCuts") + std::to_string(i), m_individual_nCuts[i]);
81 }
82 pt.put("FastBDT_purityTransformation", m_purityTransformation);
83 pt.put("FastBDT_number_individualPurityTransformation", m_individualPurityTransformation.size());
84 for (unsigned int i = 0; i < m_individualPurityTransformation.size(); ++i) {
85 pt.put(std::string("FastBDT_individualPurityTransformation") + std::to_string(i), m_individualPurityTransformation[i]);
86 }
87 }
88
89 po::options_description FastBDTOptions::getDescription()
90 {
91 po::options_description description("FastBDT options");
92 description.add_options()
93 ("nTrees", po::value<unsigned int>(&m_nTrees), "Number of trees in the forest. Reasonable values are between 10 and 1000")
94 ("nLevels", po::value<unsigned int>(&m_nLevels)->notifier(check_bounds<unsigned int>(0, 20, "nLevels")),
95 "Depth d of trees. The last layer of the tree will contain 2^d bins. Maximum is 20. Reasonable values are 2 and 6.")
96 ("shrinkage", po::value<double>(&m_shrinkage)->notifier(check_bounds<double>(0.0, 1.0, "shrinkage")),
97 "Shrinkage of the boosting algorithm. Reasonable values are between 0.01 and 1.0.")
98 ("nCutLevels", po::value<unsigned int>(&m_nCuts)->notifier(check_bounds<unsigned int>(0, 20, "nCutLevels")),
99 "Number of cut levels N per feature. 2^N Bins will be used per feature. Reasonable values are between 6 and 12.")
100 ("individualNCutLevels", po::value<std::vector<unsigned int>>(&m_individual_nCuts)->multitoken()->notifier(
101 check_bounds_vector<unsigned int>(0, 20, "individualNCutLevels")),
102 "Number of cut levels N per feature. 2^N Bins will be used per feature. Reasonable values are between 6 and 12. One value per feature (including spectators) should be provided, if parameter is not set the global value specified by nCutLevels is used for all features.")
103 ("sPlot", po::value<bool>(&m_sPlot),
104 "Since in sPlot each event enters twice, this option modifies the sampling algorithm so that the matching signal and background events are selected together.")
105 ("flatnessLoss", po::value<double>(&m_flatnessLoss),
106 "Activate Flatness Loss, all spectator variables are assumed to be variables in which the signal and background efficiency should be flat. negative values deactivates flatness loss.")
107 ("purityTransformation", po::value<bool>(&m_purityTransformation),
108 "Activates purity transformation on all features: Add the purity transformed of all features in addition to the training. This will double the number of features and slow down the inference considerably")
109 ("individualPurityTransformation", po::value<std::vector<bool>>(&m_individualPurityTransformation)->multitoken(),
110 "Activates purity transformation for each feature: Vector of boolean values which decide if the purity transformed of the feature should be added in addition to this training.")
111 ("randRatio", po::value<double>(&m_randRatio)->notifier(check_bounds<double>(0.0, 1.0001, "randRatio")),
112 "Fraction of the data sampled each training iteration. Reasonable values are between 0.1 and 1.0.");
113 return description;
114 }
115
116
118 const FastBDTOptions& specific_options) : Teacher(general_options),
119 m_specific_options(specific_options) { }
120
122 {
123 if (training_data.getNumberOfEvents() > 5e+6) {
124 B2WARNING("Number of events for training exceeds 5 million. FastBDT performance starts getting worse when the number reaches O(10^7).");
125 }
126
127 unsigned int numberOfFeatures = training_data.getNumberOfFeatures();
128 unsigned int numberOfSpectators = training_data.getNumberOfSpectators();
129
131 and m_specific_options.m_individual_nCuts.size() != numberOfFeatures + numberOfSpectators) {
132 B2ERROR("You provided individual nCut values for each feature and spectator, but the total number of provided cuts is not same as as the total number of features and spectators.");
133 }
134
135 std::vector<bool> individualPurityTransformation = m_specific_options.m_individualPurityTransformation;
137 if (individualPurityTransformation.size() == 0) {
138 for (unsigned int i = 0; i < numberOfFeatures; ++i) {
139 individualPurityTransformation.push_back(true);
140 }
141 }
142 }
143
144 std::vector<unsigned int> individual_nCuts = m_specific_options.m_individual_nCuts;
145 if (individual_nCuts.size() == 0) {
146 for (unsigned int i = 0; i < numberOfFeatures + numberOfSpectators; ++i) {
147 individual_nCuts.push_back(m_specific_options.m_nCuts);
148 }
149 }
150
151 FastBDT::Classifier classifier(m_specific_options.m_nTrees, m_specific_options.m_nLevels, individual_nCuts,
153 m_specific_options.m_sPlot, m_specific_options.m_flatnessLoss, individualPurityTransformation,
154 numberOfSpectators, true);
155
156 std::vector<std::vector<float>> X(numberOfFeatures + numberOfSpectators);
157 const auto& y = training_data.getSignals();
158 if (not isValidSignal(y)) {
159 B2FATAL("The training data is not valid. It only contains one class instead of two.");
160 }
161 const auto& w = training_data.getWeights();
162 for (unsigned int i = 0; i < numberOfFeatures; ++i) {
163 X[i] = training_data.getFeature(i);
164 }
165 for (unsigned int i = 0; i < numberOfSpectators; ++i) {
166 X[i + numberOfFeatures] = training_data.getSpectator(i);
167 }
168 classifier.fit(X, y, w);
169
170 Weightfile weightfile;
171 std::string custom_weightfile = weightfile.generateFileName();
172 std::fstream file(custom_weightfile, std::ios_base::out | std::ios_base::trunc);
173
174 file << classifier << std::endl;
175 file.close();
176
177 weightfile.addOptions(m_general_options);
178 weightfile.addOptions(m_specific_options);
179 weightfile.addFile("FastBDT_Weightfile", custom_weightfile);
180 weightfile.addSignalFraction(training_data.getSignalFraction());
181
182 std::map<std::string, float> importance;
183 for (auto& pair : classifier.GetVariableRanking()) {
184 importance[m_general_options.m_variables[pair.first]] = pair.second;
185 }
186 weightfile.addFeatureImportance(importance);
187
188 return weightfile;
189
190 }
191
193 {
194
195 std::string custom_weightfile = weightfile.generateFileName();
196 weightfile.getFile("FastBDT_Weightfile", custom_weightfile);
197 std::fstream file(custom_weightfile, std::ios_base::in);
198
199 int version = weightfile.getElement<int>("FastBDT_version", 0);
200 B2DEBUG(100, "FastBDT Weightfile Version " << version);
201 if (version < 2) {
202 std::stringstream s;
203 {
204 std::string t;
205 std::fstream file2(custom_weightfile, std::ios_base::in);
206 getline(file2, t);
207 s << t;
208 }
209 int dummy;
210 // Try to read to integers, if this is successful we have a old weightfile with a Feature Binning before the Tree.
211 if (!(s >> dummy >> dummy)) {
212 B2DEBUG(100, "FastBDT: I read a new weightfile of FastBDT using the new FastBDT version 3. Everything fine!");
213 // New format since version 3
214 m_expert_forest = FastBDT::readForestFromStream<float>(file);
215 } else {
216 B2INFO("FastBDT: I read an old weightfile of FastBDT using the new FastBDT version 3."
217 "I will convert your FastBDT on-the-fly to the new version."
218 "Retrain the classifier to get rid of this message");
219 // Old format before version 3
220 // We read in first the feature binnings and than rewrite the tree
221 std::vector<FastBDT::FeatureBinning<float>> feature_binnings;
222 file >> feature_binnings;
223 double F0;
224 file >> F0;
225 double shrinkage;
226 file >> shrinkage;
227 // This parameter was not available in the old version
228 bool transform2probability = true;
229 FastBDT::Forest<unsigned int> temp_forest(shrinkage, F0, transform2probability);
230 unsigned int size;
231 file >> size;
232 for (unsigned int i = 0; i < size; ++i) {
233 temp_forest.AddTree(FastBDT::readTreeFromStream<unsigned int>(file));
234 }
235
236 FastBDT::Forest<float> cleaned_forest(temp_forest.GetShrinkage(), temp_forest.GetF0(), temp_forest.GetTransform2Probability());
237 for (auto& tree : temp_forest.GetForest()) {
238 cleaned_forest.AddTree(FastBDT::removeFeatureBinningTransformationFromTree(tree, feature_binnings));
239 }
240 m_expert_forest = cleaned_forest;
241 }
242 } else {
244 m_classifier = FastBDT::Classifier(file);
245 }
246 file.close();
247
248 weightfile.getOptions(m_specific_options);
249 }
250
251 std::vector<float> FastBDTExpert::apply(Dataset& test_data) const
252 {
253
254 std::vector<float> probabilities(test_data.getNumberOfEvents());
255 for (unsigned int iEvent = 0; iEvent < test_data.getNumberOfEvents(); ++iEvent) {
256 test_data.loadEvent(iEvent);
258 probabilities[iEvent] = m_classifier.predict(test_data.m_input);
259 else
260 probabilities[iEvent] = m_expert_forest.Analyse(test_data.m_input);
261 }
262
263 return probabilities;
264
265 }
266
267 }
269}
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
Definition: Dataset.h:33
FastBDT::Forest< float > m_expert_forest
Forest Expert -> used in case of no purity transformation.
Definition: FastBDT.h:123
FastBDT::Classifier m_classifier
Simplified FastBDT interface: classifier combines preprocessing and forest.
Definition: FastBDT.h:122
virtual std::vector< float > apply(Dataset &test_data) const override
Apply this expert onto a dataset.
Definition: FastBDT.cc:251
bool m_use_simplified_interface
Use the simplified FastBDT interface of version 4.
Definition: FastBDT.h:121
virtual void load(Weightfile &weightfile) override
Load the expert from a Weightfile.
Definition: FastBDT.cc:192
FastBDTOptions m_specific_options
Method specific options.
Definition: FastBDT.h:120
Options for the FastBDT MVA method.
Definition: FastBDT.h:37
std::vector< unsigned int > m_individual_nCuts
Number of cut Levels = log_2(Number of Cuts) for each provided feature.
Definition: FastBDT.h:68
bool m_sPlot
Activates sPlot sampling.
Definition: FastBDT.h:70
virtual po::options_description getDescription() override
Returns a program options description for all available options.
Definition: FastBDT.cc:89
double m_randRatio
Fraction of data to use in the stochastic training.
Definition: FastBDT.h:66
double m_flatnessLoss
Flatness Loss constant.
Definition: FastBDT.h:69
double m_shrinkage
Shrinkage during the boosting step.
Definition: FastBDT.h:65
virtual void load(const boost::property_tree::ptree &pt) override
Load mechanism to load Options from a xml tree.
Definition: FastBDT.cc:31
bool m_purityTransformation
Activates purity transformation globally for all features.
Definition: FastBDT.h:71
unsigned int m_nLevels
Depth of tree.
Definition: FastBDT.h:64
virtual void save(boost::property_tree::ptree &pt) const override
Save mechanism to store Options in a xml tree.
Definition: FastBDT.cc:68
std::vector< bool > m_individualPurityTransformation
Vector which decides for each feature individually whether the purity transformation should be used.
Definition: FastBDT.h:73
unsigned int m_nCuts
Number of cut Levels = log_2(Number of Cuts)
Definition: FastBDT.h:63
unsigned int m_nTrees
Number of trees.
Definition: FastBDT.h:62
FastBDTTeacher(const GeneralOptions &general_options, const FastBDTOptions &specific_options)
Constructs a new teacher using the GeneralOptions and specific options of this training.
Definition: FastBDT.cc:117
FastBDTOptions m_specific_options
Method specific options.
Definition: FastBDT.h:97
virtual Weightfile train(Dataset &training_data) const override
Train a mva method using the given dataset returning a Weightfile.
Definition: FastBDT.cc:121
General options which are shared by all MVA trainings.
Definition: Options.h:62
std::vector< std::string > m_variables
Vector of all variables (branch names) used in the training.
Definition: Options.h:86
Abstract base class of all Teachers. Each MVA library has its own implementation of this class,...
Definition: Teacher.h:29
GeneralOptions m_general_options
GeneralOptions containing all shared options.
Definition: Teacher.h:49
The Weightfile class serializes all information about a training into an xml tree.
Definition: Weightfile.h:38
T getElement(const std::string &identifier) const
Returns a stored element from the xml tree.
Definition: Weightfile.h:151
void addFile(const std::string &identifier, const std::string &custom_weightfile)
Add a file (mostly a weightfile from a MVA library) to our Weightfile.
Definition: Weightfile.cc:115
void addOptions(const Options &options)
Add an Option object to the xml tree.
Definition: Weightfile.cc:62
void getOptions(Options &options) const
Fills an Option object from the xml tree.
Definition: Weightfile.cc:67
void addSignalFraction(float signal_fraction)
Saves the signal fraction in the xml tree.
Definition: Weightfile.cc:95
void addFeatureImportance(const std::map< std::string, float > &importance)
Add variable importance.
Definition: Weightfile.cc:72
std::string generateFileName(const std::string &suffix="")
Returns a temporary filename with the given suffix.
Definition: Weightfile.cc:105
void getFile(const std::string &identifier, const std::string &custom_weightfile)
Creates a file from our weightfile (mostly this will be a weightfile of an MVA library)
Definition: Weightfile.cc:138
Abstract base class for different kinds of events.