def binom_error(n_sig, n_tot):
    """
    For an efficiency = nSig / nTrueSig or purity = nSig / (nSig + nBckgrd), this function calculates the
    standard deviation according to http://arxiv.org/abs/physics/0701199 .
    @param n_sig number of selected entries (scalar or array-like)
    @param n_tot total number of entries (scalar or array-like, broadcastable against n_sig)
    @return numpy.array with the standard deviation, 0 wherever n_tot is not positive
    """
    # Work in floating point so integer inputs can never trigger integer division.
    n_sig = numpy.asarray(n_sig, dtype=float)
    n_tot = numpy.asarray(n_tot, dtype=float)
    # Algebraically identical to
    #   (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3)) - (n_sig + 1) ** 2 / (n_tot + 2) ** 2
    # but written as one fraction: subtracting the two nearly equal terms loses
    # all significant digits for large counts and can even turn negative (NaN after sqrt).
    variance = numpy.where(n_tot > 0,
                           (n_sig + 1) * (n_tot - n_sig + 1) / ((n_tot + 2) ** 2 * (n_tot + 3)),
                           0)
    return numpy.sqrt(variance)
 
   25 def poisson_error(n_tot):
 
   27     use poisson error, except for 0 we use an 68% CL upper limit 
   29     return numpy.where(n_tot > 0, numpy.sqrt(n_tot), numpy.log(1.0 / (1 - 0.6827)))
 
   32 def weighted_mean_and_std(x, w):
 
   34     Return the weighted average and standard deviation. 
   38     mean = numpy.average(x, weights=w)
 
   39     var = numpy.average((x-mean)**2, weights=w)
 
   40     return (mean, numpy.sqrt(var))
 
   45     Extracts information from a pandas.DataFrame and stores it 
   47     Therefore the size is independent of the size of the pandas.DataFrame. 
   48     Used by the plotting routines below. 
   62     def __init__(self, data, column, masks=dict(), weight_column=
None, bins=100, equal_frequency=
True, range_in_std=
None):
 
   64         Creates a common binning of the given column of the given pandas.Dataframe, 
   65         and stores for each given mask the histogram of the column 
   66         @param data pandas.DataFrame  like object containing column and weight_column 
   67         @param column string identifying the column in the pandas.DataFrame which is binned. 
   68         @param masks dictionary of names and boolean arrays, which select the data 
   69                      used for the creation of histograms with these names 
   70         @param weight_column identifying the column in the pandas.DataFrame which is used as weight 
   71         @param bins use given bins instead of default 100 
   72         @param equal_frequency perform an equal_frequency binning 
   73         @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean 
   75         isfinite = numpy.isfinite(data[column])
 
   76         if range_in_std 
is not None:
 
   77             mean, std = weighted_mean_and_std(data[column][isfinite],
 
   78                                               None if weight_column 
is None else data[weight_column][isfinite])
 
   80             isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
 
   83             if data[column][isfinite].size > 0:
 
   84                 bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
 
   90                 bins = numpy.array([bins[0]-1, bins[0]+1])
 
   92         self.
histhist, self.
binsbins = numpy.histogram(data[column][isfinite], bins=bins,
 
   93                                                weights=
None if weight_column 
is None else data[weight_column])
 
   96         self.
bin_widthsbin_widths = (self.
binsbins - numpy.roll(self.
binsbins, 1))[1:] - 0.00001
 
   97         self.
histshists = dict()
 
   98         for name, mask 
in masks.items():
 
   99             self.
histshists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.
binsbins,
 
  100                                                weights=
None if weight_column 
is None else data[weight_column][mask & isfinite])[0]
 
  104         Return the histogram with the given name. If name is None, return the histogram of the full data. 
  105         @param name name of the histogram 
  106         @return numpy.array with hist data, numpy.array with corresponding poisson errors 
  109             return self.
histhist, poisson_error(self.
histhist)
 
  114         Return the sum of histograms with the given names. 
  115         @param names names of the histograms 
  116         @return numpy.array with hist data, numpy.array with corresponding poisson errors 
  118         default = numpy.zeros(len(self.
bin_centersbin_centers))
 
  119         hist = numpy.sum(self.
histshists.get(v, default) 
for v 
in names)
 
  120         hist_error = poisson_error(hist)
 
  121         return hist, hist_error
 
  125         Return the cumulative efficiency in each bin of the sum of the histograms with the given names. 
  126         @param signal_names names of the histograms 
  127         @return numpy.array with hist data, numpy.array with corresponding binomial errors 
  130         cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
 
  135             efficiency = cumsignal / signal.sum()
 
  136             efficiency_error = binom_error(cumsignal, signal.sum())
 
  137         return efficiency, efficiency_error
 
  141         Return the cumulative true positives in each bin of the sum of the histograms with the given names. 
  142         @param signal_names names of the histograms 
  143         @return numpy.array with hist data, numpy.array with corresponding poisson errors 
  146         cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
 
  147         signal_error = poisson_error(cumsignal)
 
  148         return cumsignal, signal_error
 
  152         Return the cumulative false positives in each bin of the sum of the histograms with the given names. 
  153         @param bckgrd_names names of the histograms 
  154         @return numpy.array with hist data, numpy.array with corresponding poisson errors 
  157         cumbackground = (background.sum() - background.cumsum()).astype(
'float')
 
  158         background_error = poisson_error(cumbackground)
 
  159         return cumbackground, background_error
 
  163         Return the cumulative purity in each bin of the sum of the histograms with the given names. 
  164         @param signal_names,bckgrd_names names of the signal and background histograms 
  165         @return numpy.array with hist data, numpy.array with corresponding binomial errors 
  169         cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
 
  170         cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
 
  172         purity = cumsignal / (cumsignal + cumbckgrd)
 
  173         purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
 
  174         return purity, purity_error
 
  178         Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names. 
  179         @param signal_names,bckgrd_names names of the signal and background histograms 
  180         @return numpy.array with hist data, numpy.array with corresponding errors 
  184         cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
 
  185         cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
 
  187         signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
 
  188         signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
 
  189         return signal2noise, signal2noise_error
 
  193         Return the purity in each bin of the sum of the histograms with the given names. 
  194         @param signal_names,bckgrd_names names of the signal and background histograms 
  195         @return numpy.array with hist data, numpy.array with corresponding binomial errors 
  199         signal = signal.astype(
'float')
 
  200         bckgrd = bckgrd.astype(
'float')
 
  202         purity = signal / (signal + bckgrd)
 
  203         purity_error = binom_error(signal, signal + bckgrd)
 
  204         return purity, purity_error
 
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
def get_purity(self, signal_names, bckgrd_names)
def get_hist(self, name=None)
def get_efficiency(self, signal_names)
hist
Histogram of the full data.
def get_false_positives(self, bckgrd_names)
def get_true_positives(self, signal_names)
def get_summed_hist(self, names)
def get_signal_to_noise(self, signal_names, bckgrd_names)
def get_purity_per_bin(self, signal_names, bckgrd_names)
hists
Dictionary of histograms for the given masks.