Belle II Software  release-08-01-10
Binning.cc
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 
9 #include <mva/utility/Binning.h>
10 
11 #include <algorithm>
12 #include <numeric>
13 
14 namespace Belle2 {
19  namespace MVA {
20 
21  Binning::Binning(unsigned int nBins)
22  {
23 
24  m_signal_pdf.resize(nBins, 0.0);
25  m_signal_cdf.resize(nBins, 0.0);
26  m_bckgrd_pdf.resize(nBins, 0.0);
27  m_bckgrd_cdf.resize(nBins, 0.0);
28  m_boundaries.resize(nBins + 1, 0.0);
29 
30  m_signal_yield = 0;
31  m_bckgrd_yield = 0;
32  }
33 
34  unsigned int Binning::getBin(float datapoint) const
35  {
36 
37  auto it = std::upper_bound(m_boundaries.begin(), m_boundaries.end(), datapoint);
38  unsigned int bin = std::distance(m_boundaries.begin(), it);
39  if (bin == 0)
40  bin = 1;
41  if (bin == m_boundaries.size())
42  bin = m_boundaries.size() - 1;
43  return bin - 1;
44 
45  }
46 
// Body of normalizePDFs(): rescales the per-bin signal and background contents
// in place, using the two yields and the bin widths.
// NOTE(review): the function header and the body of the first loop are not
// visible in this listing; remarks about them are assumptions to confirm.
48  {
49 
50  unsigned int nBins = m_signal_pdf.size();
51 
// Both yields are reset before being recomputed.
52  m_signal_yield = 0;
53  m_bckgrd_yield = 0;
54 
55  // Total number of events
// NOTE(review): loop body not visible here; presumably it adds each bin's
// content to m_signal_yield / m_bckgrd_yield -- confirm against the original file.
56  for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
59  }
60 
61  // Each bin is normed to its width
// Every bin content is divided by yield * (bin width / total range).
// The width is measured from last_valid_bound instead of m_boundaries[iBin]:
// a zero-width bin that follows a wider one is therefore measured from the
// earlier boundary (apparently to avoid dividing by a zero width).
// No guard against a zero yield or a zero total range is present.
62  double last_valid_bound = m_boundaries[0];
63  for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
64  m_signal_pdf[iBin] /= m_signal_yield * (m_boundaries[iBin + 1] - last_valid_bound) / (m_boundaries[nBins] - m_boundaries[0]);
65  m_bckgrd_pdf[iBin] /= m_bckgrd_yield * (m_boundaries[iBin + 1] - last_valid_bound) / (m_boundaries[nBins] - m_boundaries[0]);
// Advance the reference boundary only if the next bin has a non-zero width.
66  if (iBin + 1 < nBins and m_boundaries[iBin + 2] > m_boundaries[iBin + 1]) {
67  last_valid_bound = m_boundaries[iBin + 1];
68  }
69  }
70 
71  }
72 
// Body of calculateCDFsFromPDFs(): turns the entries of m_signal_cdf and
// m_bckgrd_cdf into running sums weighted by the relative bin width.
// NOTE(review): the function header and the statements between the nBins
// definition and the first loop are not visible in this listing; presumably
// they seed the CDF vectors from the PDF vectors -- confirm against the original file.
74  {
75 
76  unsigned int nBins = m_signal_pdf.size();
77 
80 
// Scale every entry by its bin width relative to the full range
// [m_boundaries[0], m_boundaries[nBins]].
// NOTE(review): the width here is taken from m_boundaries[iBin], whereas
// normalizePDFs() measures it from the last non-degenerate boundary; the two
// differ for zero-width bins -- confirm this is intended.
81  for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
82  m_signal_cdf[iBin] *= (m_boundaries[iBin + 1] - m_boundaries[iBin]) / (m_boundaries[nBins] - m_boundaries[0]);
83  m_bckgrd_cdf[iBin] *= (m_boundaries[iBin + 1] - m_boundaries[iBin]) / (m_boundaries[nBins] - m_boundaries[0]);
84  }
85 
// Accumulate so that entry iBin holds the sum of entries 0..iBin.
86  for (unsigned int iBin = 1; iBin < nBins; ++iBin) {
87  m_signal_cdf[iBin] += m_signal_cdf[iBin - 1];
88  m_bckgrd_cdf[iBin] += m_bckgrd_cdf[iBin - 1];
89  }
90 
91  }
92 
93  Binning Binning::CreateEqualFrequency(const std::vector<float>& data, const std::vector<float>& weights,
94  const std::vector<bool>& isSignal, unsigned int nBins)
95  {
96 
97  Binning binning(nBins);
98 
99  unsigned int nEvents = data.size();
100 
101  std::vector<unsigned int> indices(nEvents);
102  std::iota(indices.begin(), indices.end(), 0);
103  std::sort(indices.begin(), indices.end(), [&](unsigned int i, unsigned int j) {return data[i] < data[j]; });
104 
105  double sum_weights = 0;
106  for (auto& w : weights)
107  sum_weights += w;
108  double weight_per_bin = sum_weights / nBins;
109 
110  unsigned int bin = 1;
111  double current_weight = 0;
112  binning.m_boundaries[0] = data[indices[0]];
113  binning.m_boundaries[nBins] = data[indices[nEvents - 1]];
114 
115  for (unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
116  unsigned int index = indices[iEvent];
117  current_weight += weights[index];
118  if (current_weight >= weight_per_bin and bin < nBins and binning.m_boundaries[bin - 1] < data[index]) {
119  auto number_of_bins = static_cast<unsigned int>(current_weight / weight_per_bin);
120  current_weight -= weight_per_bin * number_of_bins;
121  for (unsigned int i = 0; i < number_of_bins; ++i) {
122  binning.m_boundaries[bin] = data[index];
123  bin++;
124  }
125  }
126  if (isSignal[index]) {
127  binning.m_signal_pdf[bin - 1] += weights[index];
128  } else {
129  binning.m_bckgrd_pdf[bin - 1] += weights[index];
130  }
131  }
132 
133  binning.normalizePDFs();
134  binning.calculateCDFsFromPDFs();
135 
136  return binning;
137  }
138 
139  Binning Binning::CreateEquidistant(const std::vector<float>& data, const std::vector<float>& weights,
140  const std::vector<bool>& isSignal, unsigned int nBins)
141  {
142 
143  Binning binning(nBins);
144 
145  auto minmax = std::minmax_element(data.begin(), data.end());
146  float min = *(minmax.first);
147  float max = *(minmax.second);
148  float step = (max - min) / nBins;
149 
150  for (unsigned int iBin = 0; iBin <= nBins; ++iBin) {
151  binning.m_boundaries[iBin] = min + step * iBin;
152  }
153 
154  for (unsigned int iEvent = 0; iEvent < data.size(); ++iEvent) {
155  unsigned int bin = binning.getBin(data[iEvent]);
156 
157  if (isSignal[iEvent])
158  binning.m_signal_pdf[bin] += weights[iEvent];
159  else
160  binning.m_bckgrd_pdf[bin] += weights[iEvent];
161 
162  }
163 
164  binning.normalizePDFs();
165  binning.calculateCDFsFromPDFs();
166 
167  return binning;
168 
169  }
170 
171  }
173 }
174 
Binning of a data distribution. Provides PDF and CDF values of the distribution per bin.
Definition: Binning.h:27
std::vector< float > m_bckgrd_pdf
Background pdf of data distribution per bin.
Definition: Binning.h:58
std::vector< float > m_signal_pdf
Signal pdf of data distribution per bin.
Definition: Binning.h:56
std::vector< float > m_boundaries
Boundaries of data distribution, including minimum and maximum value as first and last boundary.
Definition: Binning.h:61
Binning(unsigned int nBins=0)
Creates an empty binning with nBins.
Definition: Binning.cc:21
std::vector< float > m_bckgrd_cdf
Background cdf of data distribution per bin.
Definition: Binning.h:59
static Binning CreateEquidistant(const std::vector< float > &data, const std::vector< float > &weights, const std::vector< bool > &isSignal, unsigned int nBins)
Create an equidistant binning.
Definition: Binning.cc:139
double m_bckgrd_yield
Background yield in data distribution.
Definition: Binning.h:54
double m_signal_yield
Signal yield in data distribution.
Definition: Binning.h:53
std::vector< float > m_signal_cdf
Signal cdf of data distribution per bin.
Definition: Binning.h:57
void calculateCDFsFromPDFs()
Calculates the CDF values from the pdf values, which are assumed to be normalized.
Definition: Binning.cc:73
void normalizePDFs()
Normalizes the PDF values, so their sum is 1.
Definition: Binning.cc:47
static Binning CreateEqualFrequency(const std::vector< float > &data, const std::vector< float > &weights, const std::vector< bool > &isSignal, unsigned int nBins)
Create an equal frequency (aka equal-statistics) binning.
Definition: Binning.cc:93
unsigned int getBin(float datapoint) const
Gets the bin corresponding to the given datapoint.
Definition: Binning.cc:34
Abstract base class for different kinds of events.