from __future__ import division, print_function

import numpy as np
def get_bins(arr, bin_count=1024):
    """ Returns binning limits for equal statistic binning for an array
    :param arr: numpy array to get binning for
    :param bin_count: int number of bins
    :return: list with bin_count + 1 bin limits
    :raises ValueError: if the array has too few entries for the requested bins
    """
    # drop NaN entries; they are handled separately by the transform functions
    _arr = arr[np.logical_not(np.isnan(arr))]
    # equal-statistics binning needs the values in ascending order
    _arr = np.sort(_arr)
    arr_len = len(_arr)

    if bin_count is not None:
        if arr_len <= bin_count:
            raise ValueError('%d entries are not enough for equal statistics binning.' % len(_arr))

    # minimum number of entries per bin
    bin_idx_step = arr_len // bin_count

    # the first `remainder` bins receive one extra entry each
    remainder = arr_len % bin_count

    bin_limits = [_arr[0]]
    curr_idx = -1
    for bin_number in range(bin_count):
        curr_idx += bin_idx_step
        if bin_number < remainder:
            curr_idx += 1
        bin_limits.append(_arr[curr_idx])

    return bin_limits
def get_modified_bin_limits(arr, bin_count=1024):
    """ Feature binning: this case considers that multiple entries can have the same value
    bins are increased respectively and set to the mean value of the new bin
    :param arr: numpy array to get binning for
    :param bin_count: int number of bins
    :return: tuple (list with bin limits, numpy array with bin values mapped to [-1, 1])
    """
    bins = get_bins(arr, bin_count)
    # duplicated limits mark values that fill more than one equal-statistics bin
    bin_limits, counts = np.unique(bins, return_counts=True)

    new_bin_limits = []
    bin_weights = []

    for i, count in enumerate(counts):
        new_bin_limits.append(bin_limits[i])
        bin_weights.append(1)
        if count > 1:
            # give the frequent value an own bin just above the shared limit,
            # weighted by the number of merged equal-statistics bins
            new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
            bin_weights.append(count - 1)

    # increase the last bin limit slightly so the maximum value is included
    new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], len(new_bin_limits) * new_bin_limits[-1])

    total_bins = sum(bin_weights)

    current_bin = 0
    step_len = 1 / total_bins
    bin_values = np.zeros(len(bin_weights))

    # each bin value is the weighted center of its (possibly merged) bin in [0, 1]
    for i_idx, bin_weight in enumerate(bin_weights):
        bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
        current_bin += bin_weight

    # transform bin values from [0, 1] to [-1, 1]
    bin_values = 2 * bin_values - 1

    return new_bin_limits, bin_values
def transform_value(value, new_bin_limits, bin_values):
    """ transforms a value according to given bins and bin values (mapping)
    :param value: float value to transform
    :param new_bin_limits: sequence of bin limits
    :param bin_values: numpy array with the value assigned to each bin
    :return: mapped value; NaN is passed through unchanged
    """
    # np.digitize sorts NaN into the overflow bin, which would index past
    # bin_values -- pass NaN through instead
    if np.isnan(value):
        return value
    return bin_values[np.digitize(value, new_bin_limits)]
def transform_array(arr, new_bin_limits, bin_values):
    """ transforms an array according to given bins and bin values
    :param arr: numpy array to transform (may contain NaN values)
    :param new_bin_limits: sequence of bin limits
    :param bin_values: numpy array with the value assigned to each bin
    :return: new numpy array with mapped values; NaN entries are mapped to 0
    """
    bin_idx = np.digitize(arr, new_bin_limits)
    nan_idx = np.where(np.isnan(arr))

    # digitize sorts NaN into the overflow bin; clamp to a valid index first
    bin_idx[nan_idx] = 0

    arr = bin_values[bin_idx]
    # map NaN entries to 0, the center of the [-1, 1] output range
    arr[nan_idx] = 0
    return arr
def get_transform_to_probability_map(df, bins=100):
    """ returns a transformation map to probability for a signal/background = 1 ratio
    :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat'
    :param bins: integer with number of bins
    :return: numpy array for bin mapping
    :raises RuntimeError: if at least one bin is empty
    """
    a_bins = np.linspace(0, 1, bins + 1)

    # push the last edge slightly above 1 so y_hat == 1 falls into the last bin
    a_bins[-1] = 1.0000001

    # fraction of signal (y == 1) per classifier-output bin
    grouped = df['y'].groupby(np.digitize(df['y_hat'], a_bins))

    # every bin has to be populated, otherwise the map would have holes
    if not len(grouped) == bins:
        raise RuntimeError('Not enough values per bin. Choose less bins.')

    b_map = (grouped.sum() / grouped.count()).values
    # bug fix: the map was computed but never returned
    return b_map
def transform_to_probability(value, b_map):
    """ transforms a given value to probability according to a bin map
    :param value: classifier output, expected in [0, 1]
    :param b_map: array with one probability per bin
    :return: float transformed value
    :raises ValueError: if value lies outside [0, 1]
    """
    out_of_range = value < 0 or value > 1
    if out_of_range:
        raise ValueError(value)

    # scale the value onto the index range of the map and look up its bin
    map_index = int(value * (len(b_map) - 1))
    return b_map[map_index]
def transform_array_to_probability(arr, b_map):
    """ transforms a given arr to probability according to a bin map
    :param arr: numpy array to transform, values expected in [0, 1]
    :param b_map: numpy array with one probability per bin
    :return: numpy array: transformed array
    :raises ValueError: on non-finite or out-of-range input
    """
    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # bug fix: was `and`, which only raised when BOTH bounds were violated at
    # once -- a partially out-of-range array silently produced negative indices
    if not np.min(arr) >= 0 or not np.max(arr) <= 1:
        raise ValueError('Unexpected input values')

    map_entries = len(b_map)
    return b_map[(arr * (map_entries - 1)).astype(int)]
def get_signal_background_pdf(df, bins=100):
    """ get the signal and background pdfs of a dataframe to a given network output
    :param df: pandas.DataFrame with truth 'y' and network output 'y_hat'
    :param bins: number of equally spaced bins on [0, 1]
    :return: tuple of signal pdf and back ground
    """
    print("WARNING: this function (%s) is not tested yet" % get_signal_background_pdf.__name__)

    a_bins = np.linspace(0, 1, bins + 1)
    a_bins[-1] = 1 + np.nextafter(1, 1.1)

    # split the frame by truth label
    signal_frame = df[df['y'] == 1]
    background_frame = df[df['y'] == 0]

    # assign each prediction to its output bin, per class
    signal_bin_idx = np.digitize(signal_frame['y_hat'], a_bins)
    background_bin_idx = np.digitize(background_frame['y_hat'], a_bins)

    binned_sig = signal_frame['y'].groupby(signal_bin_idx)
    binned_back = background_frame['y'].groupby(background_bin_idx)

    # normalized per-class bin occupancy
    sig_pdf = (binned_sig.count() / signal_frame['y'].count()).values
    back_pdf = (binned_back.count() / background_frame['y'].count()).values

    return sig_pdf, back_pdf
def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
    """ combines pdf values into a probability for an arbitrary signal fraction
    :param p_signal: signal_pdf value or array
    :param p_background: signal_pdf value or array
    :param signal_fraction:
    :return: (single value, np array) signal fraction dependent to probability transformation
    """
    # Bayes: P(sig | x) = f * p_sig / (f * p_sig + (1 - f) * p_bkg)
    weighted_signal = p_signal * signal_fraction
    weighted_background = p_background * (1 - signal_fraction)
    return weighted_signal / (weighted_signal + weighted_background)
def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
    """ returns a probability for a given signal fraction != .5
    :param value: classifier output
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events
    :return: float, probability for a given signal fraction
    """
    assert(signal_fraction > 0)

    # look up the pdf value of each hypothesis at the classifier output
    signal_pdf = sig_back_tuple[0]
    background_pdf = sig_back_tuple[1]
    p_signal = transform_to_probability(value, signal_pdf)
    p_background = transform_to_probability(value, background_pdf)

    return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
    """ transformation to probability. if smother output ("not peaky") is required, please implement spline
    :param arr: array to transform
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events
    :return: array of probabilities for the given signal fraction
    """
    assert(signal_fraction > 0)

    # evaluate both class pdfs at every classifier output value
    signal_pdf = sig_back_tuple[0]
    background_pdf = sig_back_tuple[1]
    p_signal = transform_array_to_probability(arr, signal_pdf)
    p_back = transform_array_to_probability(arr, background_pdf)

    return trafo_to_prob_sf_func(p_signal, p_back, signal_fraction)
def get_signal_fraction(arr, weights=None):
    """ computes the signal fraction of a label array
    :param arr: numpy array with values in [0, 1]
    :param weights: not implemented yet
    :return: signal fraction of a given array
    :raises NotImplementedError: if weights are given
    :raises ValueError: on non-finite or out-of-range input
    """
    if weights is not None:
        # bug fix: the exception class used to be *returned* instead of raised
        raise NotImplementedError

    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # bug fix: was `and`, which only raised when BOTH bounds were violated
    if not np.min(arr) >= 0 or not np.max(arr) <= 1:
        raise ValueError('Unexpected input values.')

    return np.sum(arr) / len(arr)
def get_ndarray_binning_parameters(ndarr, bin_count=1024):
    """ collects flatten-transformation parameters for every column of a 2d array
    :param ndarr: numpy.ndarray with variables to transform (may contain NaN values)
    :param bin_count: number of bins
    :return: list of tuples with scheme [new_bin_limits, bin_values]
    """
    # one (bin limits, bin values) tuple per column
    return [get_modified_bin_limits(column, bin_count) for column in ndarr.T]
def transform_ndarray(ndarr, binning_parameters):
    """ applies the flatten transformation column-wise
    :param ndarr: numpy.ndarray with variables
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # transformed values are floats in [-1, 1]; an integer array cannot hold them.
    # bug fix: `np.int` was removed in NumPy 1.24 -- the builtin `int` is its
    # former meaning, so the check keeps identical semantics
    assert(ndarr.dtype not in [int, np.int16, np.int32, np.int64])
    for i, param_tuple in enumerate(binning_parameters):
        ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)
def transform_variable_vector(arr, binning_parameters):
    """ transform only according to a recorded flatten distribution. this is necessary for single vector experts
    :param arr: numpy.array
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # bug fix: `np.int` was removed in NumPy 1.24 -- the builtin `int` is its
    # former meaning, so the check keeps identical semantics
    assert(arr.dtype not in [int, np.int16, np.int32, np.int64])
    for i, param_tuple in enumerate(binning_parameters):
        arr[i] = transform_value(arr[i], *param_tuple)
def sanitize_labels(arr):
    """
    checks for a binary classification problem
    transforms the two class labels to {0,1}
    @param arr numpy array,
    @:return None, inplace, will not change dtype
    """
    assert len(np.unique(arr)) == 2, 'Not a binary classification!'

    # bug fix: remember both label positions before writing. Assigning the
    # minimum to 0 first merges the classes when the labels are e.g. {-1, 0}
    # (the written 0s become indistinguishable from the existing 0 label)
    min_mask = arr == arr.min()
    max_mask = arr == arr.max()
    arr[min_mask] = 0
    arr[max_mask] = 1