Belle II Software development
DataDriven.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#include <mva/utility/DataDriven.h>
10#include <mva/utility/Utility.h>
11#include <mva/methods/PDF.h>
12#include <mva/methods/Trivial.h>
13#include <mva/methods/Combination.h>
14
15#include <framework/logging/Logger.h>
16#include <iostream>
17
18namespace Belle2 {
23 namespace MVA {
24
26 const std::vector<float>& weights) : Dataset(general_options), m_dataset(dataset), m_weights(weights) { }
27
28 void ReweightingDataset::loadEvent(unsigned int event)
29 {
30 m_dataset.loadEvent(event);
36 }
37
38 SidebandDataset::SidebandDataset(const GeneralOptions& general_options, Dataset& dataset, Dataset& mc_dataset,
39 const std::string& sideband_variable) : Dataset(general_options), m_dataset(dataset)
40 {
41
42 m_spectator_index = dataset.getSpectatorIndex(sideband_variable);
43 int mc_spectator_index = mc_dataset.getSpectatorIndex(sideband_variable);
44
45 double total_signal_mc = 0.0;
46 double total_mc = 0.0;
47 double sum_signal_sr = 0.0;
48 double sum_sr = 0.0;
49 double sum_signal_br = 0.0;
50 double sum_br = 0.0;
51 double sum_signal_nr = 0.0;
52 double sum_nr = 0.0;
53
54 for (unsigned int iEvent = 0; iEvent < mc_dataset.getNumberOfEvents(); ++iEvent) {
55 mc_dataset.loadEvent(iEvent);
56 if (mc_dataset.m_isSignal)
57 total_signal_mc += mc_dataset.m_weight;
58 total_mc += mc_dataset.m_weight;
59 if (mc_dataset.m_spectators[mc_spectator_index] == 1.0) {
60 if (mc_dataset.m_isSignal)
61 sum_signal_sr += mc_dataset.m_weight;
62 sum_sr += mc_dataset.m_weight;
63 } else if (mc_dataset.m_spectators[mc_spectator_index] == 2.0) {
64 if (mc_dataset.m_isSignal)
65 sum_signal_br += mc_dataset.m_weight;
66 sum_br += mc_dataset.m_weight;
67 } else if (mc_dataset.m_spectators[mc_spectator_index] == 3.0) {
68 if (mc_dataset.m_isSignal)
69 sum_signal_nr += mc_dataset.m_weight;
70 sum_nr += mc_dataset.m_weight;
71 }
72 }
73
74 double total_data = 0.0;
75 double sum_data_sr = 0.0;
76 double sum_data_br = 0.0;
77 double sum_data_nr = 0.0;
78
79 for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
80 dataset.loadEvent(iEvent);
81 total_data += dataset.m_weight;
82 if (dataset.m_spectators[m_spectator_index] == 1.0) {
83 sum_data_sr += dataset.m_weight;
84 } else if (dataset.m_spectators[m_spectator_index] == 2.0) {
85 sum_data_br += dataset.m_weight;
86 } else if (dataset.m_spectators[m_spectator_index] == 3.0) {
87 sum_data_nr += dataset.m_weight;
88 }
89 }
90
91 if (sum_signal_br / sum_br > 0.01) {
92 B2WARNING("The background region you defined in the sideband subtraction contains more than 1% signal");
93 }
94 if (sum_signal_nr / sum_nr > 0.01) {
95 B2WARNING("The negative signal region you defined in the sideband subtraction contains more than 1% signal");
96 }
97
98 if (sum_data_sr - sum_signal_sr < 0) {
99 B2ERROR("There is less data in the signal region than the expected amount of signal events in the signal region estimated from MC.");
100 }
101
102 if (total_data - total_signal_mc < 0) {
103 B2ERROR("There is less data than the expected amount of signal events estimated from MC.");
104 }
105
106 // We assume the number of signal events is correctly described in mc
107 // Everything else (like the background) we take from the data sample
108
109 // So Signal events in the signal region receive weight 1
110 m_signal_weight = 1.0;
111
112 // The background is scaled so that it corresponds to the total background in the whole sample
113 m_background_weight = (total_data - total_signal_mc) / sum_data_br;
114
115 // The negative signal is scaled so it corresponds to the expected background in the signal region:
116 // Background Events in Signal Region in Data = Total Events in Signal Region In Data - Signal Events in Signal Region in MC
117 m_negative_signal_weight = - (sum_data_sr - sum_signal_sr) / sum_data_nr;
118
119 B2INFO("Data " << total_data << " " << sum_data_sr << " " << sum_data_br << " " << sum_data_nr);
120 B2INFO("MC " << total_mc << " " << sum_sr << " " << sum_br << " " << sum_nr);
121 B2INFO("MC (signal)" << total_signal_mc << " " << sum_signal_sr << " " << sum_signal_br << " " << sum_signal_nr);
122 B2INFO("Sideband Subtraction: Signal Weight " << m_signal_weight << " Background Weight " << m_background_weight <<
123 " Negative Signal Weight " << m_negative_signal_weight);
124
125 }
126
127 void SidebandDataset::loadEvent(unsigned int event)
128 {
129 m_dataset.loadEvent(event);
133 if (m_spectators[m_spectator_index] == 1.0) {
134 m_isSignal = true;
135 m_target = 1.0;
137 } else if (m_spectators[m_spectator_index] == 2.0) {
138 m_isSignal = false;
139 m_target = 0.0;
141 } else if (m_spectators[m_spectator_index] == 3.0) {
142 m_isSignal = true;
143 m_target = 1.0;
145 } else {
146 m_isSignal = false;
147 m_target = 0.0;
148 m_weight = 0.0;
149 }
150 }
151
152
153 SPlotDataset::SPlotDataset(const GeneralOptions& general_options, Dataset& dataset, const std::vector<float>& weights,
154 float signalFraction) : Dataset(general_options), m_dataset(dataset), m_weights(weights), m_signalFraction(signalFraction) { }
155
156
158 {
159 return m_signalFraction;
160 }
161
162 void SPlotDataset::loadEvent(unsigned int event)
163 {
164 m_dataset.loadEvent(event / 2);
165 for (unsigned int iFeature = 0; iFeature < getNumberOfFeatures(); ++iFeature) {
166 m_input[iFeature] = m_dataset.m_input[iFeature];
167 }
168
169 if (event % 2 == 1) {
170 m_isSignal = false;
171 m_target = 0.0;
173 } else {
174 m_isSignal = true;
175 m_target = 1.0;
177 }
178 }
179
180
181 std::vector<float> getSPlotWeights(Dataset& dataset, const Binning& binning)
182 {
183 // TODO
184 // Instead of calculating the covariance matrix directly one could do an actual fit here,
185 // this would reduce the reliance on MC even further.
186 // Also it should be no problem generalize this code to more than two components.
187 // I didn't implement this, yet. Since it is not foreseeable if and in which context this will be used,
188 // so feel free to implement the missing features here.
189
190 auto discriminants = dataset.getFeature(0);
191
192 // Calculate Covariance Matrix of pdfs by calculating first the inverse-covariance matrix
193 // and then inverting the inverse-convariance matrix
194
195 double inverse_covariance[3] = {0};
196 for (auto& v : discriminants) {
197 const unsigned int iBin = binning.getBin(v);
198 double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
199 norm = norm * norm;
200 inverse_covariance[0] += binning.m_signal_pdf[iBin] * binning.m_signal_pdf[iBin] / norm;
201 inverse_covariance[1] += binning.m_signal_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
202 inverse_covariance[2] += binning.m_bckgrd_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
203 }
204 double covariance[3] = {0};
205 double determinante = (inverse_covariance[0] * inverse_covariance[2] - inverse_covariance[1] * inverse_covariance[1]);
206 covariance[0] = inverse_covariance[2] / determinante;
207 covariance[1] = -inverse_covariance[1] / determinante;
208 covariance[2] = inverse_covariance[0] / determinante;
209
210 // Finally calculate sPlot weights
211 std::vector<float> splot_weights;
212 splot_weights.reserve(2 * discriminants.size());
213 for (auto& v : discriminants) {
214 const unsigned int iBin = binning.getBin(v);
215 double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
216 splot_weights.push_back((covariance[0] * binning.m_signal_pdf[iBin] + covariance[1] * binning.m_bckgrd_pdf[iBin]) / norm);
217 splot_weights.push_back((covariance[1] * binning.m_signal_pdf[iBin] + covariance[2] * binning.m_bckgrd_pdf[iBin]) / norm);
218 }
219
220 B2INFO("Covariance Matrix of SPlot");
221 B2INFO(covariance[0] << " " << covariance[1] << " " << covariance[2]);
222
223 return splot_weights;
224
225 }
226
227 std::vector<float> getBoostWeights(Dataset& dataset, const Binning& binning)
228 {
229
230 std::vector<float> boost_weights;
231 boost_weights.reserve(2 * dataset.getNumberOfEvents());
232 for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
233 dataset.loadEvent(iEvent);
234 const unsigned int bin = binning.getBin(dataset.m_input[0]);
235 boost_weights.push_back(binning.m_signal_cdf[bin] / binning.m_bckgrd_pdf[bin]);
236 boost_weights.push_back((1 - binning.m_signal_cdf[bin]) / binning.m_bckgrd_pdf[bin]);
237 }
238 return boost_weights;
239
240 }
241
242 std::vector<float> getAPlotWeights(Dataset& dataset, const Binning& binning, const std::vector<float>& boost_predictions)
243 {
244
245 std::vector<float> splot_weights = getSPlotWeights(dataset, binning);
246 std::vector<float> aplot_weights;
247 aplot_weights.reserve(2 * dataset.getNumberOfEvents());
248
249 for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
250 dataset.loadEvent(iEvent);
251 const unsigned int iBin = binning.getBin(dataset.m_input[0]);
252
253 double reg_boost_prediction = boost_predictions[iEvent];
254
255 if (reg_boost_prediction > 0.995)
256 reg_boost_prediction = 0.995;
257
258 if (reg_boost_prediction < 0.005)
259 reg_boost_prediction = 0.005;
260
261 float aplot_weight = (binning.m_signal_cdf[iBin] / reg_boost_prediction + (1 - binning.m_signal_cdf[iBin]) /
262 (1 - reg_boost_prediction)) / 2.0;
263
264 aplot_weights.push_back(splot_weights[2 * iEvent] * aplot_weight);
265 aplot_weights.push_back(splot_weights[2 * iEvent + 1] * aplot_weight);
266
267 }
268
269 return aplot_weights;
270 }
271
272
273 }
275}
Binning of a data distribution Provides PDF and CDF values of the distribution per bin.
Definition: Binning.h:27
std::vector< float > m_bckgrd_pdf
Background pdf of data distribution per bin.
Definition: Binning.h:58
std::vector< float > m_signal_pdf
Signal pdf of data distribution per bin.
Definition: Binning.h:56
double m_bckgrd_yield
Background yield in data distribution.
Definition: Binning.h:54
double m_signal_yield
Signal yield in data distribution.
Definition: Binning.h:53
unsigned int getBin(float datapoint) const
Gets the bin corresponding to the given datapoint.
Definition: Binning.cc:34
Abstract base class of all Datasets given to the MVA interface The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:124
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:123
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:127
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:125
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:126
General options which are shared by all MVA trainings.
Definition: Options.h:62
ReweightingDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights)
Constructs a new ReweightingDataset.
Definition: DataDriven.cc:25
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:97
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:96
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:28
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:201
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:200
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:162
SPlotDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights, float signalFraction)
Constructs a new SPlotDataset.
Definition: DataDriven.cc:153
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: DataDriven.h:176
virtual float getSignalFraction() override
Returns the signal fraction of the whole sample.
Definition: DataDriven.cc:157
float m_signalFraction
Signal fraction.
Definition: DataDriven.h:202
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:150
double m_negative_signal_weight
the weight for negative signal events
Definition: DataDriven.h:154
double m_signal_weight
the weight for signal events
Definition: DataDriven.h:152
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:127
double m_background_weight
the weight for background events
Definition: DataDriven.h:153
SidebandDataset(const GeneralOptions &general_options, Dataset &dataset, Dataset &mc_dataset, const std::string &sideband_variable)
Constructs a new SidebandDataset.
Definition: DataDriven.cc:38
int m_spectator_index
spectator containing the sideband variable
Definition: DataDriven.h:151
Abstract base class for different kinds of events.