release-06-02-00/doxygen/histogram_8py_source.html

 #!/usr/bin/env python3

 # -*- coding: utf-8 -*-


 import numpy


 def binom_error(n_sig, n_tot):
     """
     Standard deviation of a ratio such as an efficiency = nSig / nTrueSig or a
     purity = nSig / (nSig + nBckgrd), following http://arxiv.org/abs/physics/0701199 .
     Entries with n_tot <= 0 get an error of 0.
     @param n_sig number of selected (signal) entries, i.e. the numerator of the ratio
     @param n_tot total number of entries, i.e. the denominator of the ratio
     @return square root of the variance, evaluated element-wise
     """
     # Both terms are evaluated for every entry; numpy.where only selects afterwards.
     first_term = (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3))
     second_term = (n_sig + 1) ** 2 / ((n_tot + 2) ** 2)
     variance = numpy.where(n_tot > 0, first_term - second_term, 0)
     return numpy.sqrt(variance)


 def poisson_error(n_tot):
     """
     Poisson error sqrt(n) for every positive count; for counts that are not
     positive a 68% CL upper limit is used instead.
     @param n_tot number of entries (scalar or numpy array)
     @return the error, evaluated element-wise
     """
     # Value substituted wherever n_tot is not positive.
     zero_count_limit = numpy.log(1.0 / (1 - 0.6827))
     return numpy.where(n_tot > 0, numpy.sqrt(n_tot), zero_count_limit)


 def weighted_mean_and_std(x, w):
     """
     Compute the weighted mean of the values together with the weighted
     standard deviation around that mean.
     @param x values
     @param w weights (passed on to numpy.average as its weights argument)
     @return tuple (weighted mean, weighted standard deviation)
     """
     center = numpy.average(x, weights=w)
     squared_deviation = (x - center) ** 2
     spread = numpy.sqrt(numpy.average(squared_deviation, weights=w))
     return (center, spread)


 class Histograms(object):
     """
     Extracts information from a pandas.DataFrame and stores it
     in a binned format.
     Therefore the size is independent from the size of the pandas.DataFrame.
     Used by the plotting routines below.
     """

     #: Histogram of the full data
     hist = None
     #: Binning
     bins = None
     #: Bin centers
     bin_centers = None
     #: Bin widths
     bin_widths = None
     #: Dictionary of histograms for the given masks
     hists = None

     def __init__(self, data, column, masks=None, weight_column=None, bins=100, equal_frequency=True, range_in_std=None):
         """
         Creates a common binning of the given column of the given pandas.DataFrame,
         and stores for each given mask the histogram of the column
         @param data pandas.DataFrame like object containing column and weight_column
         @param column string identifying the column in the pandas.DataFrame which is binned.
         @param masks dictionary of names and boolean arrays, which select the data
                      used for the creation of histograms with these names (default: no masks)
         @param weight_column identifying the column in the pandas.DataFrame which is used as weight
         @param bins use given bins instead of default 100
         @param equal_frequency perform an equal_frequency binning
         @param range_in_std show only the data in a window around +- range_in_std * standard_deviation around the mean
         """
         # Avoid a shared mutable default argument.
         masks = {} if masks is None else masks
         values = data[column]
         weights = None if weight_column is None else data[weight_column]

         isfinite = numpy.isfinite(values)
         if range_in_std is not None:
             mean, std = weighted_mean_and_std(values[isfinite],
                                               None if weights is None else weights[isfinite])
             # Everything outside mean +- range_in_std * std is considered infinite
             isfinite = isfinite & (values > (mean - range_in_std * std)) & (values < (mean + range_in_std * std))

         if equal_frequency:
             if values[isfinite].size > 0:
                 # The bin edges are the percentiles spread evenly over 0..100 %, so the
                 # whole data range is covered for any requested number of bins.
                 edges_in_percent = numpy.linspace(0, 100, bins + 1)
                 bins = numpy.unique(numpy.percentile(values[isfinite], q=edges_in_percent))
             else:
                 print('Empty Array')
                 bins = [1]
             # If all values are identical (or there is no data), we make at least one bin
             if len(bins) == 1:
                 bins = numpy.array([bins[0] - 1, bins[0] + 1])

         # The weights have to be restricted to the same entries as the values.
         self.hist, self.bins = numpy.histogram(values[isfinite], bins=bins,
                                                weights=None if weights is None else weights[isfinite])
         self.bin_centers = (self.bins[1:] + self.bins[:-1]) / 2.0
         # Subtract a small number from the bin width, otherwise the errorband plot is unstable.
         self.bin_widths = numpy.diff(self.bins) - 0.00001
         self.hists = dict()
         for name, mask in masks.items():
             selected = mask & isfinite
             self.hists[name] = numpy.histogram(values[selected], bins=self.bins,
                                                weights=None if weights is None else weights[selected])[0]

     @staticmethod
     def _cumulative(hist):
         """
         Return for each bin the summed content of all bins above it (total minus
         the running sum up to and including the bin) as a float array.
         @param hist numpy.array with the bin contents
         """
         return (hist.sum() - hist.cumsum()).astype('float')

     def get_hist(self, name=None):
         """
         Return histogram with the given name. If none returns histogram of the full data.
         @param name name of the histogram
         @return numpy.array with hist data, numpy.array with corresponding poisson errors
         """
         if name is None:
             return self.hist, poisson_error(self.hist)
         return self.get_summed_hist([name])

     def get_summed_hist(self, names):
         """
         Return the sum of histograms with the given names.
         Unknown names contribute an empty histogram; without any names an
         all-zero histogram is returned.
         @param names names of the histograms
         @return numpy.array with hist data, numpy.array with corresponding poisson errors
         """
         default = numpy.zeros(len(self.bin_centers))
         selected = [self.hists.get(v, default) for v in names]
         # Sum an explicit list along axis 0: numpy.sum on a generator is deprecated and
         # would yield the scalar 0 (instead of an array) for an empty selection.
         hist = numpy.sum(selected, axis=0) if selected else default
         hist_error = poisson_error(hist)
         return hist, hist_error

     def get_efficiency(self, signal_names):
         """
         Return the cumulative efficiency in each bin of the sum of the histograms with the given names.
         If there is no signal at all, efficiency and error are zero in every bin.
         @param signal_names names of the histograms
         @return numpy.array with hist data, numpy.array with corresponding binomial errors
         """
         signal, _ = self.get_summed_hist(signal_names)
         cumsignal = self._cumulative(signal)
         total = signal.sum()

         if total > 0:
             return cumsignal / total, binom_error(cumsignal, total)
         # Keep the array shape so that the result can still be used together with the bin centers.
         return numpy.zeros_like(cumsignal), numpy.zeros_like(cumsignal)

     def get_true_positives(self, signal_names):
         """
         Return the cumulative true positives in each bin of the sum of the histograms with the given names.
         @param signal_names names of the histograms
         @return numpy.array with hist data, numpy.array with corresponding poisson errors
         """
         signal, _ = self.get_summed_hist(signal_names)
         cumsignal = self._cumulative(signal)
         signal_error = poisson_error(cumsignal)
         return cumsignal, signal_error

     def get_false_positives(self, bckgrd_names):
         """
         Return the cumulative false positives in each bin of the sum of the histograms with the given names.
         @param bckgrd_names names of the histograms
         @return numpy.array with hist data, numpy.array with corresponding poisson errors
         """
         background, _ = self.get_summed_hist(bckgrd_names)
         cumbackground = self._cumulative(background)
         background_error = poisson_error(cumbackground)
         return cumbackground, background_error

     def get_purity(self, signal_names, bckgrd_names):
         """
         Return the cumulative purity in each bin of the sum of the histograms with the given names.
         Bins without any cumulative signal and background yield NaN (0 / 0).
         @param signal_names names of the signal histograms
         @param bckgrd_names names of the background histograms
         @return numpy.array with hist data, numpy.array with corresponding binomial errors
         """
         signal, _ = self.get_summed_hist(signal_names)
         bckgrd, _ = self.get_summed_hist(bckgrd_names)
         cumsignal = self._cumulative(signal)
         cumbckgrd = self._cumulative(bckgrd)

         purity = cumsignal / (cumsignal + cumbckgrd)
         purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
         return purity, purity_error

     def get_signal_to_noise(self, signal_names, bckgrd_names):
         """
         Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names.
         Bins without any cumulative signal and background yield NaN (0 / 0).
         @param signal_names names of the signal histograms
         @param bckgrd_names names of the background histograms
         @return numpy.array with signal / sqrt(signal + background), numpy.array with the corresponding errors
         """
         signal, _ = self.get_summed_hist(signal_names)
         bckgrd, _ = self.get_summed_hist(bckgrd_names)
         cumsignal = self._cumulative(signal)
         cumbckgrd = self._cumulative(bckgrd)

         signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
         signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
         return signal2noise, signal2noise_error

     def get_purity_per_bin(self, signal_names, bckgrd_names):
         """
         Return the purity in each bin of the sum of the histograms with the given names.
         Bins without any signal and background yield NaN (0 / 0).
         @param signal_names names of the signal histograms
         @param bckgrd_names names of the background histograms
         @return numpy.array with hist data, numpy.array with corresponding binomial errors
         """
         signal, _ = self.get_summed_hist(signal_names)
         bckgrd, _ = self.get_summed_hist(bckgrd_names)
         signal = signal.astype('float')
         bckgrd = bckgrd.astype('float')

         purity = signal / (signal + bckgrd)
         purity_error = binom_error(signal, signal + bckgrd)
         return purity, purity_error

histogram.Histograms
Definition: histogram.py:43

histogram.Histograms.__init__
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
Definition: histogram.py:62

histogram.Histograms.get_purity
def get_purity(self, signal_names, bckgrd_names)
Definition: histogram.py:161

histogram.Histograms.bin_centers
bin_centers
Bin centers.
Definition: histogram.py:56

histogram.Histograms.get_hist
def get_hist(self, name=None)
Definition: histogram.py:102

histogram.Histograms.bin_widths
bin_widths
Bin widths.
Definition: histogram.py:58

histogram.Histograms.get_efficiency
def get_efficiency(self, signal_names)
Definition: histogram.py:123

histogram.Histograms.hist
hist
Histogram of the full data.
Definition: histogram.py:52

histogram.Histograms.get_false_positives
def get_false_positives(self, bckgrd_names)
Definition: histogram.py:150

histogram.Histograms.get_true_positives
def get_true_positives(self, signal_names)
Definition: histogram.py:139

histogram.Histograms.get_summed_hist
def get_summed_hist(self, names)
Definition: histogram.py:112

histogram.Histograms.bins
bins
Binning.
Definition: histogram.py:54

histogram.Histograms.get_signal_to_noise
def get_signal_to_noise(self, signal_names, bckgrd_names)
Definition: histogram.py:176

histogram.Histograms.get_purity_per_bin
def get_purity_per_bin(self, signal_names, bckgrd_names)
Definition: histogram.py:191

histogram.Histograms.hists
hists
Dictionary of histograms for the given masks.
Definition: histogram.py:60