def binom_error(n_sig, n_tot):
    """
    Standard deviation of a binomial ratio n_sig / n_tot.

    For an efficiency = nSig / nTrueSig or a purity = nSig / (nSig + nBckgrd)
    this function calculates the standard deviation according to
    http://arxiv.org/abs/physics/0701199 .

    @param n_sig number of selected entries, scalar or array-like
    @param n_tot total number of entries, scalar or array-like
    @return standard deviation of the ratio, 0 wherever n_tot is not positive
    """
    # Work in floating point: with fixed-width integer arrays (e.g. int32
    # counts) the products below would silently overflow for large counts.
    n_sig = numpy.asarray(n_sig, dtype=float)
    n_tot = numpy.asarray(n_tot, dtype=float)
    variance = numpy.where(
        n_tot > 0,
        (n_sig + 1) * (n_sig + 2) / ((n_tot + 2) * (n_tot + 3))
        - (n_sig + 1) ** 2 / ((n_tot + 2) ** 2),
        0,
    )
    return numpy.sqrt(variance)
def poisson_error(n_tot, confidence_level=0.6827):
    """
    Poisson error sqrt(n_tot) of the given number of entries.

    For entries which are not positive the square root is not used; instead
    the upper limit for zero observed entries at the given confidence level
    (68% by default) is returned.

    @param n_tot number of entries, scalar or array-like
    @param confidence_level confidence level of the upper limit used for non-positive entries
    @return numpy.array with the errors
    """
    n_tot = numpy.asarray(n_tot)
    upper_limit = numpy.log(1.0 / (1 - confidence_level))
    # numpy.where evaluates both branches, so the root is taken of the clamped
    # values: negative entries would otherwise raise invalid-value warnings
    # although their root is never selected.
    return numpy.where(n_tot > 0, numpy.sqrt(numpy.maximum(n_tot, 0)), upper_limit)
def weighted_mean_and_std(x, w=None):
    """
    Return the weighted average and standard deviation.

    @param x values, array-like
    @param w weights with the same length as x; None gives every value the same weight
    @return tuple of the weighted mean and the weighted standard deviation
    """
    # Convert once so plain sequences and array-like inputs are treated alike.
    values = numpy.asarray(x, dtype=float)
    mean = numpy.average(values, weights=w)
    variance = numpy.average((values - mean) ** 2, weights=w)
    return (mean, numpy.sqrt(variance))
44 Extracts information from a pandas.DataFrame
and stores it
46 Therefore the size of the stored information is independent
of the size of the pandas.DataFrame.
47 Used by the plotting routines below.
61 def __init__(self, data, column, masks=dict(), weight_column=
None, bins=100, equal_frequency=
True, range_in_std=
None):
63 Creates a common binning of the given column of the given pandas.Dataframe,
64 and stores
for each given mask the histogram of the column
65 @param data pandas.DataFrame like object containing column
and weight_column
66 @param column string identifying the column
in the pandas.DataFrame which
is binned.
67 @param masks dictionary of names
and boolean arrays, which select the data
68 used
for the creation of histograms
with these names
69 @param weight_column string identifying the column
in the pandas.DataFrame which
is used
as weight
70 @param bins use given bins instead of default 100
71 @param equal_frequency perform an equal_frequency binning
72 @param range_in_std show only the data
in a window of +- range_in_std * standard_deviation around the mean
74 isfinite = numpy.isfinite(data[column])
75 if range_in_std
is not None:
76 mean, std = weighted_mean_and_std(data[column][isfinite],
77 None if weight_column
is None else data[weight_column][isfinite])
79 isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
82 if data[column][isfinite].size > 0:
83 bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
89 bins = numpy.array([bins[0]-1, bins[0]+1])
92 self.
hist, self.
binsbins = numpy.histogram(data[column][isfinite], bins=bins,
93 weights=
None if weight_column
is None else data[weight_column])
100 for name, mask
in masks.items():
101 self.
histshists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.
binsbins,
102 weights=
None if weight_column
is None else data[weight_column][mask & isfinite])[0]
106 Return the histogram with the given name. If name is None, return the histogram of the full data.
107 @param name name of the histogram
108 @return numpy.array
with hist data, numpy.array
with corresponding poisson errors
111 return self.
hist, poisson_error(self.
hist)
116 Return the sum of histograms with the given names.
117 @param names names of the histograms
118 @return numpy.array
with hist data, numpy.array
with corresponding poisson errors
121 hist = numpy.sum(self.histshists.get(v, default) for v
in names)
122 hist_error = poisson_error(hist)
123 return hist, hist_error
127 Return the cumulative efficiency in each bin of the sum of the histograms
with the given names.
128 @param signal_names names of the histograms
129 @return numpy.array
with hist data, numpy.array
with corresponding binomial errors
132 cumsignal = (signal.sum() - signal.cumsum()).astype('float')
137 efficiency = cumsignal / signal.sum()
138 efficiency_error = binom_error(cumsignal, signal.sum())
139 return efficiency, efficiency_error
143 Return the cumulative true positives in each bin of the sum of the histograms
with the given names.
144 @param signal_names names of the histograms
145 @return numpy.array
with hist data, numpy.array
with corresponding poisson errors
148 cumsignal = (signal.sum() - signal.cumsum()).astype('float')
149 signal_error = poisson_error(cumsignal)
150 return cumsignal, signal_error
154 Return the cumulative false positives in each bin of the sum of the histograms
with the given names.
155 @param bckgrd_names names of the histograms
156 @return numpy.array
with hist data, numpy.array
with corresponding poisson errors
159 cumbackground = (background.sum() - background.cumsum()).astype('float')
160 background_error = poisson_error(cumbackground)
161 return cumbackground, background_error
165 Return the cumulative purity in each bin of the sum of the histograms
with the given names.
166 @param names names of the histograms
167 @return numpy.array
with hist data, numpy.array
with corresponding binomial errors
171 cumsignal = (signal.sum() - signal.cumsum()).astype('float')
172 cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
174 purity = cumsignal / (cumsignal + cumbckgrd)
175 purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
176 return purity, purity_error
180 Return the cumulative signal to noise ratio in each bin of the sum of the histograms
with the given names.
181 @param names names of the histograms
182 @return numpy.array
with hist data, numpy.array
with corresponding errors of the signal to noise ratio
186 cumsignal = (signal.sum() - signal.cumsum()).astype('float')
187 cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
189 signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
190 signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
191 return signal2noise, signal2noise_error
195 Return the purity in each bin of the sum of the histograms
with the given names.
196 @param names names of the histograms
197 @return numpy.array
with hist data, numpy.array
with corresponding binomial errors
201 signal = signal.astype('float')
202 bckgrd = bckgrd.astype(
'float')
204 purity = signal / (signal + bckgrd)
205 purity_error = binom_error(signal, signal + bckgrd)
206 return purity, purity_error
def __init__(self, data, column, masks=dict(), weight_column=None, bins=100, equal_frequency=True, range_in_std=None)
def get_purity(self, signal_names, bckgrd_names)
None hists
Dictionary of histograms for the given masks.
None hist
Histogram of the full data.
None bin_centers
Bin centers.
def get_hist(self, name=None)
bin_widths
Subtract a small number from the bin width, otherwise the errorband plot is unstable.
None bin_widths
Bin widths.
def get_efficiency(self, signal_names)
def get_false_positives(self, bckgrd_names)
def get_true_positives(self, signal_names)
def get_summed_hist(self, names)
def get_signal_to_noise(self, signal_names, bckgrd_names)
def get_purity_per_bin(self, signal_names, bckgrd_names)
hists
initialize empty dictionary for histograms