Belle II Software development
test_DataDriven.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#include <mva/utility/DataDriven.h>
10#include <mva/interface/Interface.h>
11#include <framework/utilities/FileSystem.h>
12#include <framework/utilities/TestHelpers.h>
13
14#include <gtest/gtest.h>
15#include <numeric>
16
17using namespace Belle2;
18
19namespace {
20
21 class TestDataset : public MVA::Dataset {
22 public:
23 explicit TestDataset(MVA::GeneralOptions& general_options) : MVA::Dataset(general_options)
24 {
25 m_input = {0.0, 0.0};
26 m_target = 0.0;
27 m_isSignal = false;
28 m_weight = 1.0;
29 // Suppress cppcheck remark
30 // performance: Variable 'm_a' is assigned in constructor body. Consider performing initialization in initialization list.
31 // Initializing this vector in the initialization list is not readable, and performance is negligible here
32 // cppcheck-suppress *
33 m_a = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 4.0, 4.0, 4.0};
34 }
35
36 [[nodiscard]] unsigned int getNumberOfFeatures() const override { return 1; }
37 [[nodiscard]] unsigned int getNumberOfSpectators() const override { return 0; }
38 [[nodiscard]] unsigned int getNumberOfEvents() const override { return 20; }
39 void loadEvent(unsigned int iEvent) override { m_input[0] = m_a[iEvent]; m_target = iEvent % 2; m_isSignal = m_target == 1; };
40 float getSignalFraction() override { return 0.5; };
41 std::vector<float> getFeature(unsigned int) override { return m_a; }
42
43 std::vector<float> m_a;
44
45 };
46
47 TEST(SPlotTest, SPlotDataset)
48 {
49
50 MVA::GeneralOptions general_options;
51 general_options.m_variables = {"A"};
52 TestDataset dataset(general_options);
53
54 std::vector<float> weights(40);
55 std::iota(weights.begin(), weights.end(), 0.0);
56 MVA::SPlotDataset splot_dataset(general_options, dataset, weights, 0.5);
57
58 EXPECT_EQ(splot_dataset.getNumberOfFeatures(), 1);
59 EXPECT_EQ(splot_dataset.getNumberOfEvents(), 40);
60 EXPECT_EQ(splot_dataset.getSignalFraction(), 0.5);
61
62 auto feature = dataset.getFeature(0);
63 for (unsigned int i = 0; i < 40; ++i) {
64 splot_dataset.loadEvent(i);
65 EXPECT_FLOAT_EQ(splot_dataset.m_input[0], feature[i / 2]);
66 EXPECT_FLOAT_EQ(splot_dataset.m_weight, 1.0 * i);
67 EXPECT_EQ(splot_dataset.m_isSignal, (i % 2) == 0);
68 }
69
70 }
71
72 TEST(ReweightingTest, ReweightingDataset)
73 {
74
75 MVA::GeneralOptions general_options;
76 general_options.m_variables = {"A"};
77 TestDataset dataset(general_options);
78
79 std::vector<float> weights(20);
80 std::iota(weights.begin(), weights.end(), 0.0);
81 MVA::ReweightingDataset reweighting_dataset(general_options, dataset, weights);
82
83 EXPECT_EQ(reweighting_dataset.getNumberOfFeatures(), 1);
84 EXPECT_EQ(reweighting_dataset.getNumberOfEvents(), 20);
85
86 auto feature = dataset.getFeature(0);
87 for (unsigned int i = 0; i < 20; ++i) {
88 reweighting_dataset.loadEvent(i);
89 EXPECT_FLOAT_EQ(reweighting_dataset.m_input[0], feature[i]);
90 EXPECT_FLOAT_EQ(reweighting_dataset.m_weight, 1.0 * i);
91 EXPECT_EQ(reweighting_dataset.m_isSignal, (i % 2) == 1);
92 }
93
94 }
95
96 TEST(SPlotTest, GetSPlotWeights)
97 {
98
99 MVA::GeneralOptions general_options;
100 general_options.m_variables = {"A", "D"};
101 TestDataset dataset(general_options);
102
103 MVA::Binning binning = MVA::Binning::CreateEquidistant(dataset.getFeature(0), dataset.getWeights(), dataset.getSignals(), 4);
104
105 EXPECT_EQ(binning.m_boundaries.size(), 5);
106 EXPECT_FLOAT_EQ(binning.m_boundaries[0], 1.0);
107 EXPECT_FLOAT_EQ(binning.m_boundaries[1], 1.75);
108 EXPECT_FLOAT_EQ(binning.m_boundaries[2], 2.5);
109 EXPECT_FLOAT_EQ(binning.m_boundaries[3], 3.25);
110 EXPECT_FLOAT_EQ(binning.m_boundaries[4], 4.0);
111
112 EXPECT_EQ(binning.m_signal_pdf.size(), 4);
113 EXPECT_FLOAT_EQ(binning.m_signal_pdf[0], 0.2 * 4.0);
114 EXPECT_FLOAT_EQ(binning.m_signal_pdf[1], 0.3 * 4.0);
115 EXPECT_FLOAT_EQ(binning.m_signal_pdf[2], 0.3 * 4.0);
116 EXPECT_FLOAT_EQ(binning.m_signal_pdf[3], 0.2 * 4.0);
117
118 EXPECT_EQ(binning.m_bckgrd_pdf.size(), 4);
119 EXPECT_FLOAT_EQ(binning.m_bckgrd_pdf[0], 0.3 * 4.0);
120 EXPECT_FLOAT_EQ(binning.m_bckgrd_pdf[1], 0.2 * 4.0);
121 EXPECT_FLOAT_EQ(binning.m_bckgrd_pdf[2], 0.2 * 4.0);
122 EXPECT_FLOAT_EQ(binning.m_bckgrd_pdf[3], 0.3 * 4.0);
123
124 EXPECT_FLOAT_EQ(binning.m_signal_yield, 10);
125 EXPECT_FLOAT_EQ(binning.m_bckgrd_yield, 10);
126
127 auto splot_weights = MVA::getSPlotWeights(dataset, binning);
128
129 double sum = 0;
130 for (auto& s : splot_weights)
131 sum += s;
132 EXPECT_FLOAT_EQ(sum, 20.0);
133
134 EXPECT_EQ(splot_weights.size(), 40);
135 for (unsigned int i = 0; i < 10; i += 2) {
136 EXPECT_FLOAT_EQ(splot_weights[i], -2.0);
137 EXPECT_FLOAT_EQ(splot_weights[i + 1], 3.0);
138 }
139 for (unsigned int i = 10; i < 20; i += 2) {
140 EXPECT_FLOAT_EQ(splot_weights[i], 3.0);
141 EXPECT_FLOAT_EQ(splot_weights[i + 1], -2.0);
142 }
143 for (unsigned int i = 20; i < 28; i += 2) {
144 EXPECT_FLOAT_EQ(splot_weights[i], 3.0);
145 EXPECT_FLOAT_EQ(splot_weights[i + 1], -2.0);
146 }
147 EXPECT_FLOAT_EQ(splot_weights[28], -2.0);
148 EXPECT_FLOAT_EQ(splot_weights[29], 3.0);
149 EXPECT_FLOAT_EQ(splot_weights[30], 3.0);
150 EXPECT_FLOAT_EQ(splot_weights[31], -2.0);
151 for (unsigned int i = 32; i < 40; i += 2) {
152 EXPECT_FLOAT_EQ(splot_weights[i], -2.0);
153 EXPECT_FLOAT_EQ(splot_weights[i + 1], 3.0);
154 }
155
156 }
157
158 TEST(SPlotTest, GetBoostWeights)
159 {
160
161 MVA::GeneralOptions general_options;
162 general_options.m_variables = {"A"};
163 TestDataset dataset(general_options);
164
165 MVA::Binning binning = MVA::Binning::CreateEquidistant(dataset.getFeature(0), dataset.getWeights(), dataset.getSignals(), 4);
166
167 auto boost_weights = MVA::getBoostWeights(dataset, binning);
168
169 EXPECT_EQ(boost_weights.size(), 40);
170 for (unsigned int i = 0; i < 10; i += 2) {
171 EXPECT_FLOAT_EQ(boost_weights[i], 0.2 / 0.3 / 4.0);
172 EXPECT_FLOAT_EQ(boost_weights[i + 1], 0.8 / 0.3 / 4.0);
173 }
174 for (unsigned int i = 10; i < 20; i += 2) {
175 EXPECT_FLOAT_EQ(boost_weights[i], 0.5 / 0.2 / 4.0);
176 EXPECT_FLOAT_EQ(boost_weights[i + 1], 0.5 / 0.2 / 4.0);
177 }
178 for (unsigned int i = 20; i < 28; i += 2) {
179 EXPECT_FLOAT_EQ(boost_weights[i], 0.8 / 0.2 / 4.0);
180 // cppcheck-suppress duplicateExpression
181 EXPECT_FLOAT_EQ(boost_weights[i + 1], 0.2 / 0.2 / 4.0);
182 }
183 EXPECT_FLOAT_EQ(boost_weights[28], 1.0 / 0.3 / 4.0);
184 EXPECT_FLOAT_EQ(boost_weights[29], 0.0 / 0.3 / 4.0);
185 EXPECT_FLOAT_EQ(boost_weights[30], 0.8 / 0.2 / 4.0);
186 // cppcheck-suppress duplicateExpression
187 EXPECT_FLOAT_EQ(boost_weights[31], 0.2 / 0.2 / 4.0);
188 for (unsigned int i = 32; i < 40; i += 2) {
189 EXPECT_FLOAT_EQ(boost_weights[i], 1.0 / 0.3 / 4.0);
190 EXPECT_FLOAT_EQ(boost_weights[i + 1], 0.0 / 0.3 / 4.0);
191 }
192
193 }
194
195 TEST(SPlotTest, GetAPlotWeights)
196 {
197
198 MVA::GeneralOptions general_options;
199 general_options.m_variables = {"A"};
200 TestDataset dataset(general_options);
201
202 MVA::Binning binning = MVA::Binning::CreateEquidistant(dataset.getFeature(0), dataset.getWeights(), dataset.getSignals(), 4);
203
204 std::vector<float> boost_prediction = {0.0, 0.005, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45,
205 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.9, 0.995, 1.0
206 };
207 auto aplot_weights = MVA::getAPlotWeights(dataset, binning, boost_prediction);
208
209 // Regularisation
210 boost_prediction[0] = 0.005;
211 boost_prediction[19] = 0.995;
212
213 auto splot_weights = MVA::getSPlotWeights(dataset, binning);
214
215 EXPECT_EQ(aplot_weights.size(), 40);
216 for (unsigned int i = 0; i < 10; i += 2) {
217 double aplot = 0.1 / boost_prediction[i / 2] + 0.4 / (1 - boost_prediction[i / 2]);
218 EXPECT_FLOAT_EQ(aplot_weights[i], aplot * splot_weights[i]);
219 EXPECT_FLOAT_EQ(aplot_weights[i + 1], aplot * splot_weights[i + 1]);
220 }
221 for (unsigned int i = 10; i < 20; i += 2) {
222 double aplot = 0.25 / boost_prediction[i / 2] + 0.25 / (1 - boost_prediction[i / 2]);
223 EXPECT_FLOAT_EQ(aplot_weights[i], aplot * splot_weights[i]);
224 EXPECT_FLOAT_EQ(aplot_weights[i + 1], aplot * splot_weights[i + 1]);
225 }
226 for (unsigned int i = 20; i < 28; i += 2) {
227 double aplot = 0.4 / boost_prediction[i / 2] + 0.1 / (1 - boost_prediction[i / 2]);
228 EXPECT_FLOAT_EQ(aplot_weights[i], aplot * splot_weights[i]);
229 EXPECT_FLOAT_EQ(aplot_weights[i + 1], aplot * splot_weights[i + 1]);
230 }
231 {
232 double aplot = 0.5 / boost_prediction[14];
233 EXPECT_FLOAT_EQ(aplot_weights[28], aplot * splot_weights[28]);
234 EXPECT_FLOAT_EQ(aplot_weights[29], aplot * splot_weights[29]);
235 aplot = 0.4 / boost_prediction[15] + 0.1 / (1 - boost_prediction[15]);
236 EXPECT_FLOAT_EQ(aplot_weights[30], aplot * splot_weights[30]);
237 EXPECT_FLOAT_EQ(aplot_weights[31], aplot * splot_weights[31]);
238 }
239 for (unsigned int i = 32; i < 40; i += 2) {
240 double aplot = 0.5 / boost_prediction[i / 2];
241 EXPECT_FLOAT_EQ(aplot_weights[i], aplot * splot_weights[i]);
242 EXPECT_FLOAT_EQ(aplot_weights[i + 1], aplot * splot_weights[i + 1]);
243 }
244
245 }
246
247}
Binning of a data distribution Provides PDF and CDF values of the distribution per bin.
Definition: Binning.h:27
std::vector< float > m_bckgrd_pdf
Background pdf of data distribution per bin.
Definition: Binning.h:58
std::vector< float > m_signal_pdf
Signal pdf of data distribution per bin.
Definition: Binning.h:56
std::vector< float > m_boundaries
Boundaries of data distribution, including minimum and maximum value as first and last boundary.
Definition: Binning.h:61
double m_bckgrd_yield
Background yield in data distribution.
Definition: Binning.h:54
double m_signal_yield
Signal yield in data distribution.
Definition: Binning.h:53
Abstract base class of all Datasets given to the MVA interface The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
virtual unsigned int getNumberOfSpectators() const =0
Returns the number of spectators in this dataset.
virtual unsigned int getNumberOfFeatures() const =0
Returns the number of features in this dataset.
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
virtual float getSignalFraction()
Returns the signal fraction of the whole sample.
Definition: Dataset.cc:35
General options which are shared by all MVA trainings.
Definition: Options.h:62
Dataset for Reweighting Wraps a dataset and provides each data-point with a new weight.
Definition: DataDriven.h:29
Dataset for sPlot Wraps a dataset and provides each data-point twice, once as signal and once as back...
Definition: DataDriven.h:161
Abstract base class for different kinds of events.