Belle II Software  release-05-01-25
DataDriven.cc
1 /**************************************************************************
2  * BASF2 (Belle Analysis Framework 2) *
3  * Copyright(C) 2016 - Belle II Collaboration *
4  * *
5  * Author: The Belle II Collaboration *
6  * Contributors: Thomas Keck *
7  * *
8  * This software is provided "as is" without any warranty. *
9  **************************************************************************/
10 
11 #include <mva/utility/DataDriven.h>
12 #include <mva/utility/Utility.h>
13 #include <mva/methods/PDF.h>
14 #include <mva/methods/Trivial.h>
15 #include <mva/methods/Combination.h>
16 
17 #include <framework/logging/Logger.h>
18 #include <iostream>
19 
20 namespace Belle2 {
25  namespace MVA {
26 
27  ReweightingDataset::ReweightingDataset(const GeneralOptions& general_options, Dataset& dataset,
28  const std::vector<float>& weights) : Dataset(general_options), m_dataset(dataset), m_weights(weights) { }
29 
30  void ReweightingDataset::loadEvent(unsigned int event)
31  {
32  m_dataset.loadEvent(event);
38  }
39 
40  SidebandDataset::SidebandDataset(const GeneralOptions& general_options, Dataset& dataset, Dataset& mc_dataset,
41  const std::string& sideband_variable) : Dataset(general_options), m_dataset(dataset)
42  {
43 
44  m_spectator_index = dataset.getSpectatorIndex(sideband_variable);
45  int mc_spectator_index = mc_dataset.getSpectatorIndex(sideband_variable);
46 
47  double total_signal_mc = 0.0;
48  double total_mc = 0.0;
49  double sum_signal_sr = 0.0;
50  double sum_sr = 0.0;
51  double sum_signal_br = 0.0;
52  double sum_br = 0.0;
53  double sum_signal_nr = 0.0;
54  double sum_nr = 0.0;
55 
56  for (unsigned int iEvent = 0; iEvent < mc_dataset.getNumberOfEvents(); ++iEvent) {
57  mc_dataset.loadEvent(iEvent);
58  if (mc_dataset.m_isSignal)
59  total_signal_mc += mc_dataset.m_weight;
60  total_mc += mc_dataset.m_weight;
61  if (mc_dataset.m_spectators[mc_spectator_index] == 1.0) {
62  if (mc_dataset.m_isSignal)
63  sum_signal_sr += mc_dataset.m_weight;
64  sum_sr += mc_dataset.m_weight;
65  } else if (mc_dataset.m_spectators[mc_spectator_index] == 2.0) {
66  if (mc_dataset.m_isSignal)
67  sum_signal_br += mc_dataset.m_weight;
68  sum_br += mc_dataset.m_weight;
69  } else if (mc_dataset.m_spectators[mc_spectator_index] == 3.0) {
70  if (mc_dataset.m_isSignal)
71  sum_signal_nr += mc_dataset.m_weight;
72  sum_nr += mc_dataset.m_weight;
73  }
74  }
75 
76  double total_data = 0.0;
77  double sum_data_sr = 0.0;
78  double sum_data_br = 0.0;
79  double sum_data_nr = 0.0;
80 
81  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
82  dataset.loadEvent(iEvent);
83  total_data += dataset.m_weight;
84  if (dataset.m_spectators[m_spectator_index] == 1.0) {
85  sum_data_sr += dataset.m_weight;
86  } else if (dataset.m_spectators[m_spectator_index] == 2.0) {
87  sum_data_br += dataset.m_weight;
88  } else if (dataset.m_spectators[m_spectator_index] == 3.0) {
89  sum_data_nr += dataset.m_weight;
90  }
91  }
92 
93  if (sum_signal_br / sum_br > 0.01) {
94  B2WARNING("The background region you defined in the sideband substraction contains more than 1% signal");
95  }
96  if (sum_signal_nr / sum_nr > 0.01) {
97  B2WARNING("The negative signal region you defined in the sideband substraction contains more than 1% signal");
98  }
99 
100  if (sum_data_sr - sum_signal_sr < 0) {
101  B2ERROR("There is less data in the signal region than the expected amount of signal events in the signal region estimated from MC.");
102  }
103 
104  if (total_data - total_signal_mc < 0) {
105  B2ERROR("There is less data than the expected amount of signal events estimated from MC.");
106  }
107 
108  // We assume the number of signal events is correctly described in mc
109  // Everything else (like the background) we take from the data sample
110 
111  // So Signal events in the signal region receive weight 1
112  m_signal_weight = 1.0;
113 
114  // The background is scaled so that it corresponds to the total background in the whole sample
115  m_background_weight = (total_data - total_signal_mc) / sum_data_br;
116 
117  // The negative signal is scaled so it corresponds to the expected background in the signal region:
118  // Background Events in Signal Region in Data = Total Events in Signal Region In Data - Signal Events in Signal Region in MC
119  m_negative_signal_weight = - (sum_data_sr - sum_signal_sr) / sum_data_nr;
120 
121  B2INFO("Data " << total_data << " " << sum_data_sr << " " << sum_data_br << " " << sum_data_nr);
122  B2INFO("MC " << total_mc << " " << sum_sr << " " << sum_br << " " << sum_nr);
123  B2INFO("MC (signal)" << total_signal_mc << " " << sum_signal_sr << " " << sum_signal_br << " " << sum_signal_nr);
124  B2INFO("Sideband Substraction: Signal Weight " << m_signal_weight << " Background Weight " << m_background_weight <<
125  " Negative Signal Weight " << m_negative_signal_weight);
126 
127  }
128 
129  void SidebandDataset::loadEvent(unsigned int event)
130  {
131  m_dataset.loadEvent(event);
135  if (m_spectators[m_spectator_index] == 1.0) {
136  m_isSignal = true;
137  m_target = 1.0;
139  } else if (m_spectators[m_spectator_index] == 2.0) {
140  m_isSignal = false;
141  m_target = 0.0;
143  } else if (m_spectators[m_spectator_index] == 3.0) {
144  m_isSignal = true;
145  m_target = 1.0;
147  } else {
148  m_isSignal = false;
149  m_target = 0.0;
150  m_weight = 0.0;
151  }
152  }
153 
154 
155  SPlotDataset::SPlotDataset(const GeneralOptions& general_options, Dataset& dataset, const std::vector<float>& weights,
156  float signalFraction) : Dataset(general_options), m_dataset(dataset), m_weights(weights), m_signalFraction(signalFraction) { }
157 
158 
160  {
161  return m_signalFraction;
162  }
163 
164  void SPlotDataset::loadEvent(unsigned int event)
165  {
166  m_dataset.loadEvent(event / 2);
167  for (unsigned int iFeature = 0; iFeature < getNumberOfFeatures(); ++iFeature) {
168  m_input[iFeature] = m_dataset.m_input[iFeature];
169  }
170 
171  if (event % 2 == 1) {
172  m_isSignal = false;
173  m_target = 0.0;
175  } else {
176  m_isSignal = true;
177  m_target = 1.0;
179  }
180  }
181 
182 
183  std::vector<float> getSPlotWeights(Dataset& dataset, Binning& binning)
184  {
185  // TODO
186  // Instead of calculating the covariance matrix directly one could do an actual fit here,
187  // this would reduce the reliance on MC even further.
188  // Also it should be no problem generalize this code to more than two components.
189  // I didn't implement this, yet. Since it is not forseeable if and in which context this will be used,
190  // so feel free to implement the missing features here.
191 
192  auto discriminants = dataset.getFeature(0);
193 
194  // Calculate Covariance Matrix of pdfs by calculating first the inverse-covariance matrix
195  // and then inverting the inverse-convariance matrix
196 
197  double inverse_covariance[3] = {0};
198  for (auto& v : discriminants) {
199  const unsigned int iBin = binning.getBin(v);
200  double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
201  norm = norm * norm;
202  inverse_covariance[0] += binning.m_signal_pdf[iBin] * binning.m_signal_pdf[iBin] / norm;
203  inverse_covariance[1] += binning.m_signal_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
204  inverse_covariance[2] += binning.m_bckgrd_pdf[iBin] * binning.m_bckgrd_pdf[iBin] / norm;
205  }
206  double covariance[3] = {0};
207  double determinante = (inverse_covariance[0] * inverse_covariance[2] - inverse_covariance[1] * inverse_covariance[1]);
208  covariance[0] = inverse_covariance[2] / determinante;
209  covariance[1] = -inverse_covariance[1] / determinante;
210  covariance[2] = inverse_covariance[0] / determinante;
211 
212  // Finally calculate sPlot weights
213  std::vector<float> splot_weights;
214  splot_weights.reserve(2 * discriminants.size());
215  for (auto& v : discriminants) {
216  const unsigned int iBin = binning.getBin(v);
217  double norm = (binning.m_signal_yield * binning.m_signal_pdf[iBin] + binning.m_bckgrd_yield * binning.m_bckgrd_pdf[iBin]);
218  splot_weights.push_back((covariance[0] * binning.m_signal_pdf[iBin] + covariance[1] * binning.m_bckgrd_pdf[iBin]) / norm);
219  splot_weights.push_back((covariance[1] * binning.m_signal_pdf[iBin] + covariance[2] * binning.m_bckgrd_pdf[iBin]) / norm);
220  }
221 
222  B2INFO("Covariance Matrix of SPlot");
223  B2INFO(covariance[0] << " " << covariance[1] << " " << covariance[2]);
224 
225  return splot_weights;
226 
227  }
228 
229  std::vector<float> getBoostWeights(Dataset& dataset, Binning& binning)
230  {
231 
232  std::vector<float> boost_weights;
233  boost_weights.reserve(2 * dataset.getNumberOfEvents());
234  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
235  dataset.loadEvent(iEvent);
236  const unsigned int bin = binning.getBin(dataset.m_input[0]);
237  boost_weights.push_back(binning.m_signal_cdf[bin] / binning.m_bckgrd_pdf[bin]);
238  boost_weights.push_back((1 - binning.m_signal_cdf[bin]) / binning.m_bckgrd_pdf[bin]);
239  }
240  return boost_weights;
241 
242  }
243 
244  std::vector<float> getAPlotWeights(Dataset& dataset, Binning& binning, const std::vector<float>& boost_predictions)
245  {
246 
247  std::vector<float> splot_weights = getSPlotWeights(dataset, binning);
248  std::vector<float> aplot_weights;
249  aplot_weights.reserve(2 * dataset.getNumberOfEvents());
250 
251  for (unsigned int iEvent = 0; iEvent < dataset.getNumberOfEvents(); ++iEvent) {
252  dataset.loadEvent(iEvent);
253  const unsigned int iBin = binning.getBin(dataset.m_input[0]);
254 
255  double reg_boost_prediction = boost_predictions[iEvent];
256 
257  if (reg_boost_prediction > 0.995)
258  reg_boost_prediction = 0.995;
259 
260  if (reg_boost_prediction < 0.005)
261  reg_boost_prediction = 0.005;
262 
263  float aplot_weight = (binning.m_signal_cdf[iBin] / reg_boost_prediction + (1 - binning.m_signal_cdf[iBin]) /
264  (1 - reg_boost_prediction)) / 2.0;
265 
266  aplot_weights.push_back(splot_weights[2 * iEvent] * aplot_weight);
267  aplot_weights.push_back(splot_weights[2 * iEvent + 1] * aplot_weight);
268 
269  }
270 
271  return aplot_weights;
272  }
273 
274 
275  }
277 }
Belle2::MVA::SPlotDataset::SPlotDataset
SPlotDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights, float signalFraction)
Constructs a new SPlotDataset.
Definition: DataDriven.cc:163
Belle2::MVA::SidebandDataset::loadEvent
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:137
Belle2::MVA::Dataset
Abstract base class of all Datasets given to the MVA interface The current event can always be access...
Definition: Dataset.h:34
Belle2::MVA::Dataset::loadEvent
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
Belle2::MVA::ReweightingDataset::m_weights
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:99
Belle2::MVA::SidebandDataset::m_spectator_index
int m_spectator_index
spectator containing the sideband variable
Definition: DataDriven.h:153
Belle2::MVA::SidebandDataset::m_dataset
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:152
Belle2::MVA::SPlotDataset::getNumberOfFeatures
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: DataDriven.h:178
Belle2::MVA::ReweightingDataset::loadEvent
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:38
Belle2::MVA::SPlotDataset::loadEvent
virtual void loadEvent(unsigned int event) override
Load the event number iEvent.
Definition: DataDriven.cc:172
Belle2::MVA::SPlotDataset::m_dataset
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:202
Belle2::MVA::SidebandDataset::SidebandDataset
SidebandDataset(const GeneralOptions &general_options, Dataset &dataset, Dataset &mc_dataset, const std::string &sideband_variable)
Constructs a new SidebandDataset.
Definition: DataDriven.cc:48
Belle2::MVA::SPlotDataset::getSignalFraction
virtual float getSignalFraction() override
Returns the signal fraction of the whole sample.
Definition: DataDriven.cc:167
Belle2
Abstract base class for different kinds of events.
Definition: MillepedeAlgorithm.h:19
Belle2::MVA::SidebandDataset::m_background_weight
double m_background_weight
the weight for background events
Definition: DataDriven.h:155
Belle2::MVA::Dataset::m_input
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:124
Belle2::MVA::ReweightingDataset::ReweightingDataset
ReweightingDataset(const GeneralOptions &general_options, Dataset &dataset, const std::vector< float > &weights)
Constructs a new ReweightingDataset.
Definition: DataDriven.cc:35
Belle2::MVA::Dataset::m_spectators
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:125
Belle2::MVA::GeneralOptions
General options which are shared by all MVA trainings.
Definition: Options.h:64
Belle2::MVA::SidebandDataset::m_negative_signal_weight
double m_negative_signal_weight
the weight for negative signal events
Definition: DataDriven.h:156
Belle2::MVA::SPlotDataset::m_weights
std::vector< float > m_weights
sPlot weights
Definition: DataDriven.h:203
Belle2::MVA::Dataset::m_weight
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:126
Belle2::MVA::Dataset::getSpectatorIndex
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:74
Belle2::MVA::Dataset::m_isSignal
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:128
Belle2::MVA::Dataset::getNumberOfEvents
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
Belle2::MVA::Dataset::m_target
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:127
Belle2::MVA::SPlotDataset::m_signalFraction
float m_signalFraction
Signal fraction.
Definition: DataDriven.h:204
Belle2::MVA::SidebandDataset::m_signal_weight
double m_signal_weight
the weight for signal events
Definition: DataDriven.h:154
Belle2::MVA::ReweightingDataset::m_dataset
Dataset & m_dataset
Wrapped dataset.
Definition: DataDriven.h:98