Belle II Software  release-05-01-25
histogram.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # Thomas Keck 2015
5 
6 import copy
7 import numpy
8 
9 
10 def binom_error(n_sig, n_tot):
11  """
12  for an efficiency = nSig/nTrueSig or purity = nSig / (nSig + nBckgrd), this function calculates the
13  standard deviation according to http://arxiv.org/abs/physics/0701199 .
14  """
15  variance = numpy.where(n_tot > 0, (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3)) -
16  (n_sig + 1) ** 2 / ((n_tot + 2) ** 2), 0)
17  return numpy.sqrt(variance)
18 
19 
20 def poisson_error(n_tot):
21  """
22  use poisson error, except for 0 we use an 68% CL upper limit
23  """
24  return numpy.where(n_tot > 0, numpy.sqrt(n_tot), numpy.log(1.0 / (1 - 0.6827)))
25 
26 
27 def weighted_mean_and_std(x, w):
28  """
29  Return the weighted average and standard deviation.
30  @param x values
31  @param w weights
32  """
33  mean = numpy.average(x, weights=w)
34  var = numpy.average((x-mean)**2, weights=w)
35  return (mean, numpy.sqrt(var))
36 
37 
38 class Histograms(object):
39  """
40  Extracts information from a pandas.DataFrame and stores it
41  in a binned format.
42  Therefore the size independent from the size of the pandas.DataFrame.
43  Used by the plotting routines below.
44  """
45 
46 
47  hist = None
48 
49  bins = None
50 
51  bin_centers = None
52 
53  bin_widths = None
54 
55  hists = None
56 
57  def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None):
58  """
59  Creates a common binning of the given column of the given pandas.Dataframe,
60  and stores for each given mask the histogram of the column
61  @param data pandas.DataFrame like object containing column and weight_column
62  @param column string identifiying the column in the pandas.DataFrame which is binned.
63  @param masks dictionary of names and boolean arrays, which select the data
64  used for the creation of histograms with these names
65  @param weight_column identifiying the column in the pandas.DataFrame which is used as weight
66  @param bins use given bins instead of default 100
67  @param equal_frequency perform an equal_frequency binning
68  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
69  """
70  isfinite = numpy.isfinite(data[column])
71  if range_in_std is not None:
72  mean, std = weighted_mean_and_std(data[column][isfinite],
73  None if weight_column is None else data[weight_column][isfinite])
74  # Everything outside mean +- range_in_std * std is considered infinite
75  isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
76 
77  if equal_frequency:
78  if data[column][isfinite].size > 0:
79  bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
80  else:
81  print('Empty Array')
82  bins = [1]
83  # If all values are unique, we make at least one bin
84  if len(bins) == 1:
85  bins = numpy.array([bins[0]-1, bins[0]+1])
86 
87  self.hist, self.bins = numpy.histogram(data[column][isfinite], bins=bins,
88  weights=None if weight_column is None else data[weight_column])
89  self.bin_centers = (self.bins + numpy.roll(self.bins, 1))[1:] / 2.0
90  # Subtract a small number from the bin width, otherwise the errorband plot is unstable.
91  self.bin_widths = (self.bins - numpy.roll(self.bins, 1))[1:] - 0.00001
92  self.hists = dict()
93  for name, mask in masks.items():
94  self.hists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.bins,
95  weights=None if weight_column is None else data[weight_column][mask & isfinite])[0]
96 
97  def get_hist(self, name=None):
98  """
99  Return histogram with the given name. If none returns histogram of the full data.
100  @param name name of the histogram
101  @return numpy.array with hist data, numpy.array with corresponding poisson errors
102  """
103  if name is None:
104  return self.hist, poisson_error(self.hist)
105  return self.get_summed_hist([name])
106 
107  def get_summed_hist(self, names):
108  """
109  Return the sum of histograms with the given names.
110  @param names names of the histograms
111  @return numpy.array with hist data, numpy.array with corresponding poisson errors
112  """
113  default = numpy.zeros(len(self.bin_centers))
114  hist = numpy.sum(self.hists.get(v, default) for v in names)
115  hist_error = poisson_error(hist)
116  return hist, hist_error
117 
118  def get_efficiency(self, signal_names):
119  """
120  Return the cumulative efficiency in each bin of the sum of the histograms with the given names.
121  @param signal_names of the histograms
122  @return numpy.array with hist data, numpy.array with corresponding binomial errors
123  """
124  signal, _ = self.get_summed_hist(signal_names)
125  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
126 
127  efficiency = 0
128  efficiency_error = 0
129  if signal.sum() > 0:
130  efficiency = cumsignal / signal.sum()
131  efficiency_error = binom_error(cumsignal, signal.sum())
132  return efficiency, efficiency_error
133 
134  def get_true_positives(self, signal_names):
135  """
136  Return the cumulative true positives in each bin of the sum of the histograms with the given names.
137  @param names names of the histograms
138  @return numpy.array with hist data, numpy.array with corresponding binomial errors
139  """
140  signal, _ = self.get_summed_hist(signal_names)
141  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
142  signal_error = poisson_error(cumsignal)
143  return cumsignal, signal_error
144 
145  def get_false_positives(self, bckgrd_names):
146  """
147  Return the cumulative false positives in each bin of the sum of the histograms with the given names.
148  @param names names of the histograms
149  @return numpy.array with hist data, numpy.array with corresponding binomial errors
150  """
151  background, _ = self.get_summed_hist(bckgrd_names)
152  cumbackground = (background.sum() - background.cumsum()).astype('float')
153  background_error = poisson_error(cumbackground)
154  return cumbackground, background_error
155 
156  def get_purity(self, signal_names, bckgrd_names):
157  """
158  Return the cumulative purity in each bin of the sum of the histograms with the given names.
159  @param names names of the histograms
160  @return numpy.array with hist data, numpy.array with corresponding binomial errors
161  """
162  signal, _ = self.get_summed_hist(signal_names)
163  bckgrd, _ = self.get_summed_hist(bckgrd_names)
164  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
165  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
166 
167  purity = cumsignal / (cumsignal + cumbckgrd)
168  purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
169  return purity, purity_error
170 
171  def get_signal_to_noise(self, signal_names, bckgrd_names):
172  """
173  Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names.
174  @param names names of the histograms
175  @return numpy.array with hist data, numpy.array with corresponding binomial errors
176  """
177  signal, _ = self.get_summed_hist(signal_names)
178  bckgrd, _ = self.get_summed_hist(bckgrd_names)
179  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
180  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
181 
182  signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
183  signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
184  return signal2noise, signal2noise_error
185 
186  def get_purity_per_bin(self, signal_names, bckgrd_names):
187  """
188  Return the purity in each bin of the sum of the histograms with the given names.
189  @param names names of the histograms
190  @return numpy.array with hist data, numpy.array with corresponding binomial errors
191  """
192  signal, _ = self.get_summed_hist(signal_names)
193  bckgrd, _ = self.get_summed_hist(bckgrd_names)
194  signal = signal.astype('float')
195  bckgrd = bckgrd.astype('float')
196 
197  purity = signal / (signal + bckgrd)
198  purity_error = binom_error(signal, signal + bckgrd)
199  return purity, purity_error
histogram.Histograms.bin_centers
bin_centers
Bin centers.
Definition: histogram.py:51
histogram.Histograms.bin_widths
bin_widths
Bin widths.
Definition: histogram.py:53
histogram.Histograms.__init__
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
Definition: histogram.py:57
histogram.Histograms.get_purity
def get_purity(self, signal_names, bckgrd_names)
Definition: histogram.py:156
histogram.Histograms.get_purity_per_bin
def get_purity_per_bin(self, signal_names, bckgrd_names)
Definition: histogram.py:186
histogram.Histograms.get_efficiency
def get_efficiency(self, signal_names)
Definition: histogram.py:118
histogram.Histograms
Definition: histogram.py:38
histogram.Histograms.hist
hist
Histogram of the full data.
Definition: histogram.py:47
histogram.Histograms.get_hist
def get_hist(self, name=None)
Definition: histogram.py:97
histogram.Histograms.get_false_positives
def get_false_positives(self, bckgrd_names)
Definition: histogram.py:145
histogram.Histograms.get_signal_to_noise
def get_signal_to_noise(self, signal_names, bckgrd_names)
Definition: histogram.py:171
histogram.Histograms.bins
bins
Binning.
Definition: histogram.py:49
histogram.Histograms.get_true_positives
def get_true_positives(self, signal_names)
Definition: histogram.py:134
histogram.Histograms.hists
hists
Dictionary of histograms for the given masks.
Definition: histogram.py:55
histogram.Histograms.get_summed_hist
def get_summed_hist(self, names)
Definition: histogram.py:107