Belle II Software  release-08-01-10
DataDriven.cc
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 
9 #include <mva/utility/DataDriven.h>
10 #include <mva/utility/Utility.h>
11 #include <mva/methods/PDF.h>
12 #include <mva/methods/Trivial.h>
13 #include <mva/methods/Combination.h>
14 
15 #include <framework/logging/Logger.h>
16 #include <iostream>
17 
18 namespace Belle2 {
23  namespace MVA {
24 
26  const std::vector<float>& weights) : Dataset(general_options), m_dataset(dataset), m_weights(weights) { }
27 
28  void ReweightingDataset::loadEvent(unsigned int event)
29  {
30  m_dataset.loadEvent(event);
36  }
37 
38  SidebandDataset::SidebandDataset(const GeneralOptions& general_options, Dataset& dataset, Dataset& mc_dataset,
39  const std::string& sideband_variable) : Dataset(general_options), m_dataset(dataset)
40  {
41 
42  m_spectator_index = dataset.getSpectatorIndex(sideband_variable);
43  int mc_spectator_index = mc_dataset.getSpectatorIndex(sideband_variable);
44 
45  double total_signal_mc = 0.0;
46  double total_mc = 0.0;
47  double sum_signal_sr = 0.0;
48  double sum_sr = 0.0;
49  double sum_signal_br = 0.0;
50  double sum_br = 0.0;
51  double sum_signal_nr = 0.0;
52  double sum_nr = 0.0;
53 
54  for (unsigned int iEvent = 0; iEvent < mc_dataset.getNumberOfEvents(); ++iEvent) {
55  mc_dataset.loadEvent(iEvent);
56  if (mc_dataset.m_isSignal)
57  total_signal_mc += mc_dataset.m_weight;
58  total_mc += mc_dataset.m_weight;
59  if (mc_dataset.m_spectators[mc_spectator_index] == 1.0) {
60  if (mc_dataset.m_isSignal)
61  sum_signal_sr += mc_dataset.m_weight;
62  sum_sr += mc_dataset.m_weight;
63  } else if (mc_dataset.m_spectators[mc_spectator_index] == 2.0) {
64  if (mc_dataset.m_isSignal)
65  sum_signal_br += mc_dataset.m_weight;
66  sum_br += mc_dataset.m_weight;
67  } else if (mc_dataset.m_spectators[mc_spectator_index] == 3.0) {
68  if (mc_dataset.m_isSignal)
69  sum_signal_nr += mc_dataset.m_weight;
70  sum_nr += mc_dataset.m_weight;
71  }
72  }
73 
74  double total_data = 0.0;
75  double sum_data_sr = 0.0;
76  double sum_data_br = 0.0;
77  double sum_data_nr = 0.0;
78 
79  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
80  dataset.loadEvent(iEvent);
81  total_data += dataset.m_weight;
82  if (dataset.m_spectators[m_spectator_index] == 1.0) {
83  sum_data_sr += dataset.m_weight;
84  } else if (dataset.m_spectators[m_spectator_index] == 2.0) {
85  sum_data_br += dataset.m_weight;
86  } else if (dataset.m_spectators[m_spectator_index] == 3.0) {
87  sum_data_nr += dataset.m_weight;
88  }
89  }
90 
91  if (sum_signal_br / sum_br > 0.01) {
92  B2WARNING("The background region you defined in the sideband subtraction contains more than 1% signal");
93  }
94  if (sum_signal_nr / sum_nr > 0.01) {
95  B2WARNING("The negative signal region you defined in the sideband subtraction contains more than 1% signal");
96  }
97 
98  if (sum_data_sr - sum_signal_sr < 0) {
99  B2ERROR("There is less data in the signal region than the expected amount of signal events in the signal region estimated from MC.");
100  }
101 
102  if (total_data - total_signal_mc < 0) {
103  B2ERROR("There is less data than the expected amount of signal events estimated from MC.");
104  }
105 
106  // We assume the number of signal events is correctly described in mc
107  // Everything else (like the background) we take from the data sample
108 
109  // So Signal events in the signal region receive weight 1
110  m_signal_weight = 1.0;
111 
112  // The background is scaled so that it corresponds to the total background in the whole sample
113  m_background_weight = (total_data - total_signal_mc) / sum_data_br;
114 
115  // The negative signal is scaled so it corresponds to the expected background in the signal region:
116  // Background Events in Signal Region in Data = Total Events in Signal Region In Data - Signal Events in Signal Region in MC
117  m_negative_signal_weight = - (sum_data_sr - sum_signal_sr) / sum_data_nr;
118 
119  B2INFO("Data " << total_data << " " << sum_data_sr << " " << sum_data_br << " " << sum_data_nr);
120  B2INFO("MC " << total_mc << " " << sum_sr << " " << sum_br << " " << sum_nr);
121  B2INFO("MC (signal)" << total_signal_mc << " " << sum_signal_sr << " " << sum_signal_br << " " << sum_signal_nr);
122  B2INFO("Sideband Subtraction: Signal Weight " << m_signal_weight << " Background Weight " << m_background_weight <<
123  " Negative Signal Weight " << m_negative_signal_weight);
124 
125  }
126 
127  void SidebandDataset::loadEvent(unsigned int event)
128  {
129  m_dataset.loadEvent(event);
133  if (m_spectators[m_spectator_index] == 1.0) {
134  m_isSignal = true;
135  m_target = 1.0;
137  } else if (m_spectators[m_spectator_index] == 2.0) {
138  m_isSignal = false;
139  m_target = 0.0;
141  } else if (m_spectators[m_spectator_index] == 3.0) {
142  m_isSignal = true;
143  m_target = 1.0;
145  } else {
146  m_isSignal = false;
147  m_target = 0.0;
148  m_weight = 0.0;
149  }
150  }
151 
152 
153  SPlotDataset::SPlotDataset(const GeneralOptions& general_options, Dataset& dataset, const std::vector<float>& weights,
154  float signalFraction) : Dataset(general_options), m_dataset(dataset), m_weights(weights), m_signalFraction(signalFraction) { }
155 
156 
158  {
159  return m_signalFraction;
160  }
161 
162  void SPlotDataset::loadEvent(unsigned int event)
163  {
164  m_dataset.loadEvent(event / 2);
165  for (unsigned int iFeature = 0; iFeature < getNumberOfFeatures(); ++iFeature) {
166  m_input[iFeature] = m_dataset.m_input[iFeature];
167  }
168 
169  if (event % 2 == 1) {
170  m_isSignal = false;
171  m_target = 0.0;
173  } else {
174  m_isSignal = true;
175  m_target = 1.0;
177  }
178  }
179 
180 
181  std::vector<float> getSPlotWeights(Dataset& dataset, const Binning& binning)
182  {
183  // TODO
184  // Instead of calculating the covariance matrix directly one could do an actual fit here,
185  // this would reduce the reliance on MC even further.
186  // Also it should be no problem generalize this code to more than two components.
187  // I didn't implement this, yet. Since it is not foreseeable if and in which context this will be used,
188  // so feel free to implement the missing features here.
189 
190  auto discriminants = dataset.getFeature(0);
191 
192  // Calculate Covariance Matrix of pdfs by calculating first the inverse-covariance matrix
193  // and then inverting the inverse-convariance matrix
194 
195  double inverse_covariance[3] = {0};
196  for (auto& v : discriminants) {
197  const unsigned int iBin = binning.getBin(v);
198  double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
199  norm = norm * norm;
200  inverse_covariance[0] += binning.m_signal_pdf[iBin] * binning.m_signal_pdf[iBin] / norm;
201  inverse_covariance[1] += binning.m_signal_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
202  inverse_covariance[2] += binning.m_bckgrd_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
203  }
204  double covariance[3] = {0};
205  double determinante = (inverse_covariance[0] * inverse_covariance[2] - inverse_covariance[1] * inverse_covariance[1]);
206  covariance[0] = inverse_covariance[2] / determinante;
207  covariance[1] = -inverse_covariance[1] / determinante;
208  covariance[2] = inverse_covariance[0] / determinante;
209 
210  // Finally calculate sPlot weights
211  std::vector<float> splot_weights;
212  splot_weights.reserve(2 * discriminants.size());
213  for (auto& v : discriminants) {
214  const unsigned int iBin = binning.getBin(v);
215  double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
216  splot_weights.push_back((covariance[0] * binning.m_signal_pdf[iBin] + covariance[1] * binning.m_bckgrd_pdf[iBin]) / norm);
217  splot_weights.push_back((covariance[1] * binning.m_signal_pdf[iBin] + covariance[2] * binning.m_bckgrd_pdf[iBin]) / norm);
218  }
219 
220  B2INFO("Covariance Matrix of SPlot");
221  B2INFO(covariance[0] << " " << covariance[1] << " " << covariance[2]);
222 
223  return splot_weights;
224 
225  }
226 
227  std::vector<float> getBoostWeights(Dataset& dataset, const Binning& binning)
228  {
229 
230  std::vector<float> boost_weights;
231  boost_weights.reserve(2 * dataset.getNumberOfEvents());
232  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
233  dataset.loadEvent(iEvent);
234  const unsigned int bin = binning.getBin(dataset.m_input[0]);
235  boost_weights.push_back(binning.m_signal_cdf[bin] / binning.m_bckgrd_pdf[bin]);
236  boost_weights.push_back((1 - binning.m_signal_cdf[bin]) / binning.m_bckgrd_pdf[bin]);
237  }
238  return boost_weights;
239 
240  }
241 
242  std::vector<float> getAPlotWeights(Dataset& dataset, const Binning& binning, const std::vector<float>& boost_predictions)
243  {
244 
245  std::vector<float> splot_weights = getSPlotWeights(dataset, binning);
246  std::vector<float> aplot_weights;
247  aplot_weights.reserve(2 * dataset.getNumberOfEvents());
248 
249  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
250  dataset.loadEvent(iEvent);
251  const unsigned int iBin = binning.getBin(dataset.m_input[0]);
252 
253  double reg_boost_prediction = boost_predictions[iEvent];
254 
255  if (reg_boost_prediction > 0.995)
256  reg_boost_prediction = 0.995;
257 
258  if (reg_boost_prediction < 0.005)
259  reg_boost_prediction = 0.005;
260 
261  float aplot_weight = (binning.m_signal_cdf[iBin] / reg_boost_prediction + (1 - binning.m_signal_cdf[iBin]) /
262  (1 - reg_boost_prediction)) / 2.0;
263 
264  aplot_weights.push_back(splot_weights[2 * iEvent] * aplot_weight);
265  aplot_weights.push_back(splot_weights[2 * iEvent + 1] * aplot_weight);
266 
267  }
268 
269  return aplot_weights;
270  }
271 
272 
273  }
275 }
Binning of a data distribution Provides PDF and CDF values of the distribution per bin.
Definition: Binning.h:27
std::vector< float > m_bckgrd_pdf
Background pdf of data distribution per bin.
Definition: Binning.h:58
std::vector< float > m_signal_pdf
Signal pdf of data distribution per bin.
Definition: Binning.h:56
double m_bckgrd_yield
Background yield in data distribution.
Definition: Binning.h:54
double m_signal_yield
Signal yield in data distribution.
Definition: Binning.h:53
unsigned int getBin(float datapoint) const
Gets the bin corresponding to the given datapoint.
Definition: Binning.cc:34
Abstract base class of all Datasets given to the MVA interface The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:124
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:123
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:127
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:125
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:126
General options which are shared by all MVA trainings.
Definition: Options.h:62
ReweightingDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights)
Constructs a new ReweightingDataset.
Definition: DataDriven.cc:25
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:97
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:96
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:28
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:201
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:200
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:162
SPlotDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights, float signalFraction)
Constructs a new SPlotDataset.
Definition: DataDriven.cc:153
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: DataDriven.h:176
virtual float getSignalFraction() override
Returns the signal fraction of the whole sample.
Definition: DataDriven.cc:157
float m_signalFraction
Signal fraction.
Definition: DataDriven.h:202
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:150
double m_negative_signal_weight
the weight for negative signal events
Definition: DataDriven.h:154
double m_signal_weight
the weight for signal events
Definition: DataDriven.h:152
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:127
double m_background_weight
the weight for background events
Definition: DataDriven.h:153
SidebandDataset(const GeneralOptions &general_options, Dataset &dataset, Dataset &mc_dataset, const std::string &sideband_variable)
Constructs a new SidebandDataset.
Definition: DataDriven.cc:38
int m_spectator_index
spectator containing the sideband variable
Definition: DataDriven.h:151
Abstract base class for different kinds of events.