def binom_error(n_sig, n_tot):
    """
    For an efficiency = nSig / nTrueSig or purity = nSig / (nSig + nBckgrd), calculate the
    standard deviation according to http://arxiv.org/abs/physics/0701199 .

    The variance (k+1)(k+2) / ((n+2)(n+3)) - (k+1)^2 / (n+2)^2 is evaluated in the
    algebraically identical closed form (k+1)(n-k+1) / ((n+2)^2 (n+3)).
    The difference form subtracts two numbers close to 1 and loses all precision
    (yielding 0 or a negative variance, i.e. NaN) for large counts.
    @param n_sig number of selected entries k, scalar or numpy.array
    @param n_tot total number of entries n, scalar or numpy.array broadcastable against n_sig
    @return standard deviation as numpy result; 0 wherever n_tot is not positive
    """
    # Work in floating point so integer histogram counts cannot overflow in the products.
    k = numpy.asarray(n_sig, dtype=float)
    n = numpy.asarray(n_tot, dtype=float)
    # Dividing step by step keeps every intermediate value of order one.
    closed_form = (k + 1) / (n + 2) * (n - k + 1) / (n + 2) / (n + 3)
    variance = numpy.where(n > 0, closed_form, 0)
    return numpy.sqrt(variance)
def poisson_error(n_tot):
    """
    Poisson uncertainty sqrt(n) of the given counts.
    Entries that are not positive get a 68% CL upper limit instead.
    @param n_tot counts, scalar or numpy.array
    @return numpy result holding the error for every entry of n_tot
    """
    upper_limit_for_empty = numpy.log(1.0 / (1 - 0.6827))
    sqrt_of_counts = numpy.sqrt(n_tot)
    return numpy.where(n_tot > 0, sqrt_of_counts, upper_limit_for_empty)
def weighted_mean_and_std(x, w):
    """
    Compute the weighted average of x and the weighted standard deviation around it.
    @param x values
    @param w weights, passed on to numpy.average (None gives the unweighted result)
    @return tuple (mean, standard deviation)
    """
    centre = numpy.average(x, weights=w)
    squared_deviation = (x - centre) ** 2
    spread = numpy.sqrt(numpy.average(squared_deviation, weights=w))
    return centre, spread
40 Extracts information from a pandas.DataFrame and stores it
42 Therefore its size is independent of the size of the pandas.DataFrame.
43 Used by the plotting routines below.
# Constructor: histograms data[column] over the selected rows and, on the same
# bin edges, once more for every entry of `masks`.
# NOTE(review): the embedded listing numbers jump (75->78, 79->85, 88->93), so
# statements are missing from this view -- presumably the `equal_frequency`
# branch and the initialisation of self.hists; confirm against the full file.
# NOTE(review): `masks=dict()` is a mutable default argument; the visible code
# only reads it, but `masks=None` would be the safer idiom.
57 def __init__(self, data, column, masks=dict(), weight_column=
None, bins=100, equal_frequency=
True, range_in_std=
None):
59 Creates a common binning of the given column of the given pandas.DataFrame,
60 and stores for each given mask the histogram of the column
61 @param data pandas.DataFrame like object containing column and weight_column
62 @param column string identifying the column in the pandas.DataFrame which is binned.
63 @param masks dictionary of names and boolean arrays, which select the data
64 used for the creation of histograms with these names
65 @param weight_column identifying the column in the pandas.DataFrame which is used as weight
66 @param bins use given bins instead of default 100
67 @param equal_frequency perform an equal_frequency binning
68 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
# Select the rows whose value in `column` is finite.
70 isfinite = numpy.isfinite(data[column])
# Optionally narrow the selection to the open interval mean +- range_in_std * std,
# where mean and std are weighted when weight_column is given.
71 if range_in_std
is not None:
72 mean, std = weighted_mean_and_std(data[column][isfinite],
73 None if weight_column
is None else data[weight_column][isfinite])
75 isfinite = isfinite & (data[column] > (mean - range_in_std * std)) & (data[column] < (mean + range_in_std * std))
# Bin edges become the distinct 0th..bins-th percentiles of the selected values.
# NOTE(review): q=range(bins + 1) spans 0..100 only for bins == 100 -- confirm
# the intended behaviour for other bin counts.
78 if data[column][isfinite].size > 0:
79 bins = numpy.unique(numpy.percentile(data[column][isfinite], q=range(bins + 1)))
# Replaces the edges by one bin of width 2 centred on bins[0].
# NOTE(review): the condition guarding this statement is not visible here.
85 bins = numpy.array([bins[0]-1, bins[0]+1])
# Histogram of all selected rows; numpy.histogram returns (counts, edges).
# NOTE(review): the weights are not filtered with `isfinite` here, unlike in the
# per-mask histograms below -- looks like a shape mismatch whenever a row is
# excluded and weight_column is set; confirm.
87 self.
hist, self.
bins = numpy.histogram(data[column][isfinite], bins=bins,
88 weights=
None if weight_column
is None else data[weight_column])
# One histogram per named mask on the shared edges; [0] keeps only the counts.
93 for name, mask
in masks.items():
94 self.
hists[name] = numpy.histogram(data[column][mask & isfinite], bins=self.
bins,
95 weights=
None if weight_column
is None else data[weight_column][mask & isfinite])[0]
99 Return the histogram with the given name. If no name is given, return the histogram of the full data.
100 @param name name of the histogram
101 @return numpy.array with hist data, numpy.array with corresponding poisson errors
104 return self.
hist, poisson_error(self.
hist)
109 Return the sum of histograms with the given names.
110 @param names names of the histograms
111 @return numpy.array with hist data, numpy.array with corresponding poisson errors
114 hist = numpy.sum(self.
hists.get(v, default)
for v
in names)
115 hist_error = poisson_error(hist)
116 return hist, hist_error
120 Return the cumulative efficiency in each bin of the sum of the histograms with the given names.
121 @param signal_names names of the histograms
122 @return numpy.array with hist data, numpy.array with corresponding binomial errors
125 cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
130 efficiency = cumsignal / signal.sum()
131 efficiency_error = binom_error(cumsignal, signal.sum())
132 return efficiency, efficiency_error
136 Return the cumulative true positives in each bin of the sum of the histograms with the given names.
137 @param names names of the histograms
138 @return numpy.array with hist data, numpy.array with corresponding poisson errors
141 cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
142 signal_error = poisson_error(cumsignal)
143 return cumsignal, signal_error
147 Return the cumulative false positives in each bin of the sum of the histograms with the given names.
148 @param names names of the histograms
149 @return numpy.array with hist data, numpy.array with corresponding poisson errors
152 cumbackground = (background.sum() - background.cumsum()).astype(
'float')
153 background_error = poisson_error(cumbackground)
154 return cumbackground, background_error
158 Return the cumulative purity in each bin of the sum of the histograms with the given names.
159 @param names names of the histograms
160 @return numpy.array with hist data, numpy.array with corresponding binomial errors
164 cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
165 cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
167 purity = cumsignal / (cumsignal + cumbckgrd)
168 purity_error = binom_error(cumsignal, cumsignal + cumbckgrd)
169 return purity, purity_error
173 Return the cumulative signal to noise ratio in each bin of the sum of the histograms with the given names.
174 @param names names of the histograms
175 @return numpy.array with hist data, numpy.array with corresponding errors
179 cumsignal = (signal.sum() - signal.cumsum()).astype(
'float')
180 cumbckgrd = (bckgrd.sum() - bckgrd.cumsum()).astype(
'float')
182 signal2noise = cumsignal / (cumsignal + cumbckgrd)**0.5
183 signal2noise_error = numpy.sqrt(cumsignal / (cumsignal + cumbckgrd) + (cumsignal / (2 * (cumsignal + cumbckgrd)))**2)
184 return signal2noise, signal2noise_error
188 Return the purity in each bin of the sum of the histograms with the given names.
189 @param names names of the histograms
190 @return numpy.array with hist data, numpy.array with corresponding binomial errors
194 signal = signal.astype(
'float')
195 bckgrd = bckgrd.astype(
'float')
197 purity = signal / (signal + bckgrd)
198 purity_error = binom_error(signal, signal + bckgrd)
199 return purity, purity_error