Belle II Software  release-08-01-10
histogram.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import numpy
12 
13 
14 def binom_error(n_sig, n_tot):
15  """
16  for an efficiency = nSig/nTrueSig or purity = nSig / (nSig + nBckgrd), this function calculates the
17  standard deviation according to http://arxiv.org/abs/physics/0701199 .
18  """
19  variance = numpy.where(n_tot > 0, (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3)) -
20  (n_sig + 1) ** 2 / ((n_tot + 2) ** 2), 0)
21  return numpy.sqrt(variance)
22 
23 
24 def poisson_error(n_tot):
25  """
26  use poisson error, except for 0 we use an 68% CL upper limit
27  """
28  return numpy.where(n_tot > 0, numpy.sqrt(n_tot), numpy.log(1.0 / (1 - 0.6827)))
29 
30 
31 def weighted_mean_and_std(x, w):
32  """
33  Return the weighted average and standard deviation.
34  @param x values
35  @param w weights
36  """
37  mean = numpy.average(x, weights=w)
38  var = numpy.average((x-mean)**2, weights=w)
39  return (mean, numpy.sqrt(var))
40 
41 
42 class Histograms:
43  """
44  Extracts information from a pandas.DataFrame and stores it
45  in a binned format.
46  Therefore the size independent from the size of the pandas.DataFrame.
47  Used by the plotting routines below.
48  """
49 
50 
51  hist = None
52 
53  bins = None
54 
55  bin_centers = None
56 
57  bin_widths = None
58 
59  hists = None
60 
61  def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None):
62  """
63  Creates a common binning of the given column of the given pandas.Dataframe,
64  and stores for each given mask the histogram of the column
65  @param data pandas.DataFrame like object containing column and weight_column
66  @param column string identifiying the column in the pandas.DataFrame which is binned.
67  @param masks dictionary of names and boolean arrays, which select the data
68  used for the creation of histograms with these names
69  @param weight_column identifiying the column in the pandas.DataFrame which is used as weight
70  @param bins use given bins instead of default 100
71  @param equal_frequency perform an equal_frequency binning
72  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
73  """
74  isfinite = numpy.isfinite(data[column])
75  if range_in_std is not None:
76  mean, std = weighted_mean_and_std(data[column][isfinite],
77  None if weight_column is None else data[weight_column][isfinite])
78  # Everything outside mean +- range_in_std * std is considered infinite
79  isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
80 
81  if equal_frequency:
82  if data[column][isfinite].size > 0:
83  bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
84  else:
85  print('Empty Array')
86  bins = [1]
87  # If all values are unique, we make at least one bin
88  if len(bins) == 1:
89  bins = numpy.array([bins[0]-1, bins[0]+1])
90 
91  self.histhist, self.binsbins = numpy.histogram(data[column][isfinite], bins=bins,
92  weights=None if weight_column is None else data[weight_column])
93  self.bin_centersbin_centers = (self.binsbins + numpy.roll(self.binsbins, 1))[1:] / 2.0
94  # Subtract a small number from the bin width, otherwise the errorband plot is unstable.
95  self.bin_widthsbin_widths = (self.binsbins - numpy.roll(self.binsbins, 1))[1:] - 0.00001
96  self.histshists = dict()
97  for name, mask in masks.items():
98  self.histshists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.binsbins,
99  weights=None if weight_column is None else data[weight_column][mask & isfinite])[0]
100 
101  def get_hist(self, name=None):
102  """
103  Return histogram with the given name. If none returns histogram of the full data.
104  @param name name of the histogram
105  @return numpy.array with hist data, numpy.array with corresponding poisson errors
106  """
107  if name is None:
108  return self.histhist, poisson_error(self.histhist)
109  return self.get_summed_histget_summed_hist([name])
110 
111  def get_summed_hist(self, names):
112  """
113  Return the sum of histograms with the given names.
114  @param names names of the histograms
115  @return numpy.array with hist data, numpy.array with corresponding poisson errors
116  """
117  default = numpy.zeros(len(self.bin_centersbin_centers))
118  hist = numpy.sum(self.histshists.get(v, default) for v in names)
119  hist_error = poisson_error(hist)
120  return hist, hist_error
121 
122  def get_efficiency(self, signal_names):
123  """
124  Return the cumulative efficiency in each bin of the sum of the histograms with the given names.
125  @param signal_names of the histograms
126  @return numpy.array with hist data, numpy.array with corresponding binomial errors
127  """
128  signal, _ = self.get_summed_histget_summed_hist(signal_names)
129  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
130 
131  efficiency = 0
132  efficiency_error = 0
133  if signal.sum() > 0:
134  efficiency = cumsignal / signal.sum()
135  efficiency_error = binom_error(cumsignal, signal.sum())
136  return efficiency, efficiency_error
137 
138  def get_true_positives(self, signal_names):
139  """
140  Return the cumulative true positives in each bin of the sum of the histograms with the given names.
141  @param names names of the histograms
142  @return numpy.array with hist data, numpy.array with corresponding binomial errors
143  """
144  signal, _ = self.get_summed_histget_summed_hist(signal_names)
145  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
146  signal_error = poisson_error(cumsignal)
147  return cumsignal, signal_error
148 
149  def get_false_positives(self, bckgrd_names):
150  """
151  Return the cumulative false positives in each bin of the sum of the histograms with the given names.
152  @param names names of the histograms
153  @return numpy.array with hist data, numpy.array with corresponding binomial errors
154  """
155  background, _ = self.get_summed_histget_summed_hist(bckgrd_names)
156  cumbackground = (background.sum() - background.cumsum()).astype('float')
157  background_error = poisson_error(cumbackground)
158  return cumbackground, background_error
159 
160  def get_purity(self, signal_names, bckgrd_names):
161  """
162  Return the cumulative purity in each bin of the sum of the histograms with the given names.
163  @param names names of the histograms
164  @return numpy.array with hist data, numpy.array with corresponding binomial errors
165  """
166  signal, _ = self.get_summed_histget_summed_hist(signal_names)
167  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
168  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
169  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
170 
171  purity = cumsignal / (cumsignal + cumbckgrd)
172  purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
173  return purity, purity_error
174 
175  def get_signal_to_noise(self, signal_names, bckgrd_names):
176  """
177  Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names.
178  @param names names of the histograms
179  @return numpy.array with hist data, numpy.array with corresponding binomial errors
180  """
181  signal, _ = self.get_summed_histget_summed_hist(signal_names)
182  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
183  cumsignal = (signal.sum() - signal.cumsum()).astype('float')
184  cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype('float')
185 
186  signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
187  signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
188  return signal2noise, signal2noise_error
189 
190  def get_purity_per_bin(self, signal_names, bckgrd_names):
191  """
192  Return the purity in each bin of the sum of the histograms with the given names.
193  @param names names of the histograms
194  @return numpy.array with hist data, numpy.array with corresponding binomial errors
195  """
196  signal, _ = self.get_summed_histget_summed_hist(signal_names)
197  bckgrd, _ = self.get_summed_histget_summed_hist(bckgrd_names)
198  signal = signal.astype('float')
199  bckgrd = bckgrd.astype('float')
200 
201  purity = signal / (signal + bckgrd)
202  purity_error = binom_error(signal, signal + bckgrd)
203  return purity, purity_error
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
Definition: histogram.py:61
def get_purity(self, signal_names, bckgrd_names)
Definition: histogram.py:160
bin_centers
Bin centers.
Definition: histogram.py:55
def get_hist(self, name=None)
Definition: histogram.py:101
bin_widths
Bin widths.
Definition: histogram.py:57
def get_efficiency(self, signal_names)
Definition: histogram.py:122
hist
Histogram of the full data.
Definition: histogram.py:51
def get_false_positives(self, bckgrd_names)
Definition: histogram.py:149
def get_true_positives(self, signal_names)
Definition: histogram.py:138
def get_summed_hist(self, names)
Definition: histogram.py:111
def get_signal_to_noise(self, signal_names, bckgrd_names)
Definition: histogram.py:175
def get_purity_per_bin(self, signal_names, bckgrd_names)
Definition: histogram.py:190
hists
Dictionary of histograms for the given masks.
Definition: histogram.py:59