Belle II Software development
Binning.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#include <mva/utility/Binning.h>
10
11#include <algorithm>
12#include <numeric>
13
14namespace Belle2 {
19 namespace MVA {
20
21 Binning::Binning(unsigned int nBins)
22 {
23
24 m_signal_pdf.resize(nBins, 0.0);
25 m_signal_cdf.resize(nBins, 0.0);
26 m_bckgrd_pdf.resize(nBins, 0.0);
27 m_bckgrd_cdf.resize(nBins, 0.0);
28 m_boundaries.resize(nBins + 1, 0.0);
29
32 }
33
34 unsigned int Binning::getBin(float datapoint) const
35 {
36
37 auto it = std::upper_bound(m_boundaries.begin(), m_boundaries.end(), datapoint);
38 unsigned int bin = std::distance(m_boundaries.begin(), it);
39 if (bin == 0)
40 bin = 1;
41 if (bin == m_boundaries.size())
42 bin = m_boundaries.size() - 1;
43 return bin - 1;
44
45 }
46
47 void Binning::normalizePDFs()
48 {
49
50 unsigned int nBins = m_signal_pdf.size();
51
54
55 // Total number of events
56 for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
59 }
60
61 // Each bin is normed to its width
62 double last_valid_bound = m_boundaries[0];
63 for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
64 m_signal_pdf[iBin] /= m_signal_yield * (m_boundaries[iBin + 1] - last_valid_bound) / (m_boundaries[nBins] - m_boundaries[0]);
65 m_bckgrd_pdf[iBin] /= m_bckgrd_yield * (m_boundaries[iBin + 1] - last_valid_bound) / (m_boundaries[nBins] - m_boundaries[0]);
66 if (iBin + 1 < nBins and m_boundaries[iBin + 2] > m_boundaries[iBin + 1]) {
67 last_valid_bound = m_boundaries[iBin + 1];
68 }
69 }
70
71 }
72
73 void Binning::calculateCDFsFromPDFs()
74 {
75
76 unsigned int nBins = m_signal_pdf.size();
77
80
81 for (unsigned int iBin = 0; iBin < nBins; ++iBin) {
82 m_signal_cdf[iBin] *= (m_boundaries[iBin + 1] - m_boundaries[iBin]) / (m_boundaries[nBins] - m_boundaries[0]);
83 m_bckgrd_cdf[iBin] *= (m_boundaries[iBin + 1] - m_boundaries[iBin]) / (m_boundaries[nBins] - m_boundaries[0]);
84 }
85
86 for (unsigned int iBin = 1; iBin < nBins; ++iBin) {
87 m_signal_cdf[iBin] += m_signal_cdf[iBin - 1];
88 m_bckgrd_cdf[iBin] += m_bckgrd_cdf[iBin - 1];
89 }
90
91 }
92
93 Binning Binning::CreateEqualFrequency(const std::vector<float>& data, const std::vector<float>& weights,
94 const std::vector<bool>& isSignal, unsigned int nBins)
95 {
96
97 Binning binning(nBins);
98
99 unsigned int nEvents = data.size();
100
101 std::vector<unsigned int> indices(nEvents);
102 std::iota(indices.begin(), indices.end(), 0);
103 std::sort(indices.begin(), indices.end(), [&](unsigned int i, unsigned int j) {return data[i] < data[j]; });
104
105 double sum_weights = 0;
106 for (auto& w : weights)
107 sum_weights += w;
108 double weight_per_bin = sum_weights / nBins;
109
110 unsigned int bin = 1;
111 double current_weight = 0;
112 binning.m_boundaries[0] = data[indices[0]];
113 binning.m_boundaries[nBins] = data[indices[nEvents - 1]];
114
115 for (unsigned int iEvent = 0; iEvent < nEvents; ++iEvent) {
116 unsigned int index = indices[iEvent];
117 current_weight += weights[index];
118 if (current_weight >= weight_per_bin and bin < nBins and binning.m_boundaries[bin - 1] < data[index]) {
119 auto number_of_bins = static_cast<unsigned int>(current_weight / weight_per_bin);
120 current_weight -= weight_per_bin * number_of_bins;
121 for (unsigned int i = 0; i < number_of_bins; ++i) {
122 binning.m_boundaries[bin] = data[index];
123 bin++;
124 }
125 }
126 if (isSignal[index]) {
127 binning.m_signal_pdf[bin - 1] += weights[index];
128 } else {
129 binning.m_bckgrd_pdf[bin - 1] += weights[index];
130 }
131 }
132
133 binning.normalizePDFs();
134 binning.calculateCDFsFromPDFs();
135
136 return binning;
137 }
138
139 Binning Binning::CreateEquidistant(const std::vector<float>& data, const std::vector<float>& weights,
140 const std::vector<bool>& isSignal, unsigned int nBins)
141 {
142
143 Binning binning(nBins);
144
145 auto minmax = std::minmax_element(data.begin(), data.end());
146 float min = *(minmax.first);
147 float max = *(minmax.second);
148 float step = (max - min) / nBins;
149
150 for (unsigned int iBin = 0; iBin <= nBins; ++iBin) {
151 binning.m_boundaries[iBin] = min + step * iBin;
152 }
153
154 for (unsigned int iEvent = 0; iEvent < data.size(); ++iEvent) {
155 unsigned int bin = binning.getBin(data[iEvent]);
156
157 if (isSignal[iEvent])
158 binning.m_signal_pdf[bin] += weights[iEvent];
159 else
160 binning.m_bckgrd_pdf[bin] += weights[iEvent];
161
162 }
163
164 binning.normalizePDFs();
165 binning.calculateCDFsFromPDFs();
166
167 return binning;
168
169 }
170
171 }
173}
174
Binning of a data distribution Provides PDF and CDF values of the distribution per bin.
Definition: Binning.h:27
std::vector< float > m_bckgrd_pdf
Background pdf of data distribution per bin.
Definition: Binning.h:58
std::vector< float > m_signal_pdf
Signal pdf of data distribution per bin.
Definition: Binning.h:56
std::vector< float > m_boundaries
Boundaries of data distribution, including minimum and maximum value as first and last boundary.
Definition: Binning.h:61
std::vector< float > m_bckgrd_cdf
Background cdf of data distribution per bin.
Definition: Binning.h:59
double m_bckgrd_yield
Background yield in data distribution.
Definition: Binning.h:54
double m_signal_yield
Signal yield in data distribution.
Definition: Binning.h:53
std::vector< float > m_signal_cdf
Signal cdf of data distribution per bin.
Definition: Binning.h:57
void calculateCDFsFromPDFs()
Calculates the CDF values from the pdf values, which are assumed to be normalized.
Definition: Binning.cc:73
void normalizePDFs()
Normalizes the PDF values, so their sum is 1.
Definition: Binning.cc:47
unsigned int getBin(float datapoint) const
Gets the bin corresponding to the given datapoint.
Definition: Binning.cc:34
Abstract base class for different kinds of events.