Belle II Software  release-06-02-00
histogram.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 import numpy
13 
14 
15 def binom_error(n_sig, n_tot):
16  """
17  for an efficiency = nSig/nTrueSig or purity = nSig / (nSig + nBckgrd), this function calculates the
18  standard deviation according to http://arxiv.org/abs/physics/0701199 .
19  """
20  variance = numpy.where(n_tot > 0, (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3)) -
21  (n_sig + 1) ** 2 / ((n_tot + 2) ** 2), 0)
22  return numpy.sqrt(variance)
23 
24 
25 def poisson_error(n_tot):
26  """
27  use poisson error, except for 0 we use an 68% CL upper limit
28  """
29  return numpy.where(n_tot > 0, numpy.sqrt(n_tot), numpy.log(1.0 / (1 - 0.6827)))
30 
31 
32 def weighted_mean_and_std(x, w):
33  """
34  Return the weighted average and standard deviation.
35  @param x values
36  @param w weights
37  """
38  mean = numpy.average(x, weights=w)
39  var = numpy.average((x-mean)**2, weights=w)
40  return (mean, numpy.sqrt(var))
41 
42 
43 class Histograms(object):
44  """
45  Extracts information from a pandas.DataFrame and stores it
46  in a binned format.
47  Therefore the size independent from the size of the pandas.DataFrame.
48  Used by the plotting routines below.
49  """
50 
51 
52  hist = None
53 
54  bins = None
55 
56  bin_centers = None
57 
58  bin_widths = None
59 
60  hists = None
61 
62  def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None):
63  """
64  Creates a common binning of the given column of the given pandas.Dataframe,
65  and stores for each given mask the histogram of the column
66  @param data pandas.DataFrame like object containing column and weight_column
67  @param column string identifiying the column in the pandas.DataFrame which is binned.
68  @param masks dictionary of names and boolean arrays, which select the data
69  used for the creation of histograms with these names
70  @param weight_column identifiying the column in the pandas.DataFrame which is used as weight
71  @param bins use given bins instead of default 100
72  @param equal_frequency perform an equal_frequency binning
73  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
74  """
75  isfinite = numpy.isfinite(data[column])
76  if range_in_std is not None:
77  mean, std = weighted_mean_and_std(data[column][isfinite],
78  None if weight_column is None else data[weight_column][isfinite])
79  # Everything outside mean +- range_in_std * std is considered infinite
80  isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
81 
82  if equal_frequency:
83  if data[column][isfinite].size > 0:
84  bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
85  else:
86  print('Empty Array')
87  bins = [1]
88  # If all values are unique, we make at least one bin
89  if len(bins) == 1:
90  bins = numpy.array([bins[0]-1, bins[0]+1])
91 
92  self.histhist, self.binsbins = numpy.histogram(data[column][isfinite], bins=bins,
93  weights=None if weight_column is None else data[weight_column])
94  self.bin_centersbin_centers = (self.binsbins + numpy.roll(self.binsbins, 1))[1:] / 2.0
95  # Subtract a small number from the bin width, otherwise the errorband plot is unstable.
96  self.bin_widthsbin_widths = (self.binsbins - numpy.roll(self.binsbins, 1))[1:] - 0.00001
97  self.histshists = dict()
98  for name, mask in masks.items():
99  self.histshists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.binsbins,
100  weights=None if weight_column is None else data[weight_column][mask & isfinite])[0]
101 
102  def get_hist(self, name=None):
103  """
104  Return histogram with the given name. If none returns histogram of the full data.
105  @param name name of the histogram
106  @return numpy.array with hist data, numpy.array with corresponding poisson errors
107  """
108  if name is None:
109  return self.histhist, poisson_error(self.histhist)
110  return self.get_summed_histget_summed_hist([name])
111 
112  def get_summed_hist(self, names):
113  """
114  Return the sum of histograms with the given names.
115  @param names names of the histograms
116  @return numpy.array with hist data, numpy.array with corresponding poisson errors
117  """
118  default = numpy.zeros(len(self.bin_centersbin_centers))
119  hist = numpy.sum(self.histshists.get(v, default) for v in names)
120  hist_error = poisson_error(hist)
121  return hist, hist_error
122 
123  def get_efficiency(self, signal_names):
124  """
125  Return the cumulative efficiency in each bin of the sum of the histograms with the given names.
126  @param signal_names of the histograms
127  @return numpy.array with hist data, numpy.array with corresponding binomial errors
128  """
129  signal, _ = self.get_summed_histget_summed_hist(signal_names)
130  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
131 
132  efficiency = 0
133  efficiency_error = 0
134  if signal.sum() > 0:
135  efficiency = cumsignal / signal.sum()
136  efficiency_error = binom_error(cumsignal, signal.sum())
137  return efficiency, efficiency_error
138 
139  def get_true_positives(self, signal_names):
140  """
141  Return the cumulative true positives in each bin of the sum of the histograms with the given names.
142  @param names names of the histograms
143  @return numpy.array with hist data, numpy.array with corresponding binomial errors
144  """
145  signal, _ = self.get_summed_histget_summed_hist(signal_names)
146  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
147  signal_error = poisson_error(cumsignal)
148  return cumsignal, signal_error
149 
150  def get_false_positives(self, bckgrd_names):
151  """
152  Return the cumulative false positives in each bin of the sum of the histograms with the given names.
153  @param names names of the histograms
154  @return numpy.array with hist data, numpy.array with corresponding binomial errors
155  """
156  background, _ = self.get_summed_histget_summed_hist(bckgrd_names)
157  cumbackground = (background.sum() - background.cumsum()).astype('float')
158  background_error = poisson_error(cumbackground)
159  return cumbackground, background_error
160 
161  def get_purity(self, signal_names, bckgrd_names):
162  """
163  Return the cumulative purity in each bin of the sum of the histograms with the given names.
164  @param names names of the histograms
165  @return numpy.array with hist data, numpy.array with corresponding binomial errors
166  """
167  signal, _ = self.get_summed_histget_summed_hist(signal_names)
168  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
169  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
170  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
171 
172  purity = cumsignal / (cumsignal + cumbckgrd)
173  purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
174  return purity, purity_error
175 
176  def get_signal_to_noise(self, signal_names, bckgrd_names):
177  """
178  Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names.
179  @param names names of the histograms
180  @return numpy.array with hist data, numpy.array with corresponding binomial errors
181  """
182  signal, _ = self.get_summed_histget_summed_hist(signal_names)
183  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
184  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
185  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
186 
187  signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
188  signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
189  return signal2noise, signal2noise_error
190 
191  def get_purity_per_bin(self, signal_names, bckgrd_names):
192  """
193  Return the purity in each bin of the sum of the histograms with the given names.
194  @param names names of the histograms
195  @return numpy.array with hist data, numpy.array with corresponding binomial errors
196  """
197  signal, _ = self.get_summed_histget_summed_hist(signal_names)
198  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
199  signal = signal.astype('float')
200  bckgrd = bckgrd.astype('float')
201 
202  purity = signal / (signal + bckgrd)
203  purity_error = binom_error(signal, signal + bckgrd)
204  return purity, purity_error
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
Definition: histogram.py:62
def get_purity(self, signal_names, bckgrd_names)
Definition: histogram.py:161
bin_centers
Bin centers.
Definition: histogram.py:56
def get_hist(self, name=None)
Definition: histogram.py:102
bin_widths
Bin widths.
Definition: histogram.py:58
def get_efficiency(self, signal_names)
Definition: histogram.py:123
hist
Histogram of the full data.
Definition: histogram.py:52
def get_false_positives(self, bckgrd_names)
Definition: histogram.py:150
def get_true_positives(self, signal_names)
Definition: histogram.py:139
def get_summed_hist(self, names)
Definition: histogram.py:112
def get_signal_to_noise(self, signal_names, bckgrd_names)
Definition: histogram.py:176
def get_purity_per_bin(self, signal_names, bckgrd_names)
Definition: histogram.py:191
hists
Dictionary of histograms for the given masks.
Definition: histogram.py:60