def get_bins(arr, bin_count=1024):
    """ Returns binning limits for equal statistic binning for an array
    :param arr: numpy array to get binning for (may contain NaN values)
    :param bin_count: int number of bins
    :return: list with bin limits (bin_count + 1 entries, from min to max)
    :raises ValueError: if the array has fewer (non-NaN) entries than bins
    """
    # drop NaN entries; they carry no binning information
    _arr = arr[np.logical_not(np.isnan(arr))]
    arr_len = len(_arr)

    if bin_count is not None:
        if arr_len <= bin_count:
            raise ValueError(f'{len(_arr)} entries are not enough for equal statistics binning.')

    # equal-statistics limits are quantiles, so the data has to be sorted
    _arr = np.sort(_arr)

    bin_idx_step = arr_len // bin_count

    # the first 'remainder' bins hold one extra entry each
    remainder = arr_len % bin_count

    curr_idx = -1
    bin_limits = [_arr[0]]

    for bin_number in range(bin_count):
        curr_idx += bin_idx_step
        if bin_number < remainder:
            curr_idx += 1
        bin_limits.append(_arr[curr_idx])

    return bin_limits
def get_modified_bin_limits(arr, bin_count=1024):
    """ Feature binning: this case considers that multiple entries can have the same value,
    bins are widened respectively and set to the mean value of the new bin
    :param arr: numpy array to get binning for (may contain NaN values)
    :param bin_count: int number of bins
    :return: tuple (list with bin limits, numpy array with bin values in [-1, 1])
    """
    bins = get_bins(arr, bin_count)
    # duplicated limits indicate values that occur more often than one bin's statistics
    bin_limits, counts = np.unique(bins, return_counts=True)

    new_bin_limits = []
    bin_weights = []

    for i, count in enumerate(counts):
        new_bin_limits.append(bin_limits[i])
        bin_weights.append(1)
        if count > 1:
            # open an extra, infinitesimally thin bin right above the repeated value
            new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
            bin_weights.append(count - 1)

    # push the last limit slightly up so the maximum value falls inside the last bin;
    # np.inf as direction is also correct for zero or negative limits
    new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], np.inf)

    total_bins = sum(bin_weights)
    step_len = 1 / total_bins

    bin_values = np.zeros(len(bin_weights))
    current_bin = 0

    # map each bin to its weight-centered position in [0, 1]
    for i_idx, bin_weight in enumerate(bin_weights):
        bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
        current_bin += bin_weight

    # transform bin values from [0, 1] to [-1, 1]
    bin_values = 2 * bin_values - 1

    return new_bin_limits, bin_values
def transform_value(value, new_bin_limits, bin_values):
    """ transforms a value according to given bins and bin values (mapping)
    :param value: scalar value to transform
    :param new_bin_limits: bin limits, e.g. from get_modified_bin_limits
    :param bin_values: mapped value per bin, e.g. from get_modified_bin_limits
    :return: the bin value the input falls into
    """
    # locate the bin the value belongs to, then look up its mapped value
    bin_index = np.digitize(value, new_bin_limits)
    return bin_values[bin_index]
def transform_array(arr, new_bin_limits, bin_values):
    """ transforms an array according to given bins and bin values
    :param arr: numpy array to transform (may contain NaN values)
    :param new_bin_limits: bin limits, e.g. from get_modified_bin_limits
    :param bin_values: mapped value per bin, e.g. from get_modified_bin_limits
    :return: numpy array with transformed values; NaN entries are mapped to 0
    """
    bin_idx = np.digitize(arr, new_bin_limits)
    nan_idx = np.where(np.isnan(arr))
    # np.digitize sorts NaN into the overflow bin, which may lie past the end of
    # bin_values -- redirect NaN indices before the lookup, they are zeroed below anyway
    bin_idx[nan_idx] = 0
    arr = bin_values[bin_idx]
    arr[nan_idx] = 0
    return arr
def get_transform_to_probability_map(df, bins=100):
    """ returns a transformation map to probability for a signal/background = 1 ratio
    :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat'
    :param bins: integer with number of bins
    :return: numpy array for bin mapping (signal fraction per network-output bin)
    :raises RuntimeError: if any bin ends up empty
    """
    a_bins = np.linspace(0, 1, bins + 1)
    # move the last limit slightly above 1 so y_hat == 1 falls into the last bin
    a_bins[-1] = 1.0000001

    grouped = df['y'].groupby(np.digitize(df['y_hat'], a_bins))

    # every bin has to be populated, otherwise the map would have holes
    if not len(grouped) == bins:
        raise RuntimeError('Not enough values per bin. Choose less bins.')

    # fraction of signal (y == 1) per network-output bin
    b_map = (grouped.sum() / grouped.count()).values

    return b_map
def transform_to_probability(value, b_map):
    """ transforms a given value to probability according to a bin map
    :param value: scalar in [0, 1]
    :param b_map: numpy array with one probability per bin
    :return: float transformed value
    :raises ValueError: if value is outside [0, 1]
    """
    if not 0 <= value <= 1:
        raise ValueError(value)

    # scale into the map's index range and look up the matching bin
    last_index = len(b_map) - 1
    return b_map[int(value * last_index)]
def transform_array_to_probability(arr, b_map):
    """ transforms a given arr to probability according to a bin map
    :param arr: numpy array to transform, finite values in [0, 1]
    :param b_map: numpy array with one probability per bin
    :return: numpy array: transformed array
    :raises ValueError: on non-finite entries or values outside [0, 1]
    """
    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # a single out-of-range bound is already invalid ('or', not 'and'):
    # negative values would otherwise silently index from the end of b_map
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values')

    map_entries = len(b_map)
    return b_map[(arr * (map_entries - 1)).astype(int)]
def get_signal_background_pdf(df, bins=100):
    """ get the signal and background pdfs of a dataframe to a given network output
    :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat'
    :param bins: integer with number of bins
    :return: tuple of signal pdf and background pdf (numpy arrays, one entry per populated bin)
    """
    print(f"WARNING: this function ({get_signal_background_pdf.__name__}) is not tested yet")

    a_bins = np.linspace(0, 1, bins + 1)
    # push the last limit just above 1 so y_hat == 1 falls into the last bin
    # (the former '1 + np.nextafter(1, 1.1)' evaluated to ~2, not 'just above 1')
    a_bins[-1] = np.nextafter(1, 2)

    df_sig = df[df['y'] == 1]
    df_back = df[df['y'] == 0]

    binned_sig = df_sig['y'].groupby(np.digitize(df_sig['y_hat'], a_bins))
    binned_back = df_back['y'].groupby(np.digitize(df_back['y_hat'], a_bins))

    # normalize bin counts by the total number of events of each class
    sig_pdf = (binned_sig.count() / df_sig['y'].count()).values
    back_pdf = (binned_back.count() / df_back['y'].count()).values

    return sig_pdf, back_pdf
def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
    """ probability transformation for an arbitrary signal fraction
    :param p_signal: signal_pdf value or array
    :param p_background: background_pdf value or array
    :param signal_fraction: fraction of signal events
    :return: (single value, np array) signal fraction dependent to probability transformation
    """
    # Bayes-like weighting of the two pdfs by the signal fraction
    weighted_signal = p_signal * signal_fraction
    weighted_background = p_background * (1 - signal_fraction)
    return weighted_signal / (weighted_signal + weighted_background)
def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
    """ returns a probability for a given signal fraction != .5
    :param value: classifier output
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events
    :return: float, probability for a given signal fraction
    """
    assert(signal_fraction > 0)

    # look up the pdf value of each hypothesis for this classifier output
    sig_map, back_map = sig_back_tuple
    p_signal = transform_to_probability(value, sig_map)
    p_background = transform_to_probability(value, back_map)

    # combine the two pdf values under the given signal fraction
    return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
    """ transformation to probability. if smoother output ("not peaky") is required, please implement spline
    interpolation
    :param arr: array to transform
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events
    :return: array of probabilities
    """
    assert(signal_fraction > 0)

    # evaluate both pdfs over the whole array
    sig_map, back_map = sig_back_tuple
    p_signal = transform_array_to_probability(arr, sig_map)
    p_back = transform_array_to_probability(arr, back_map)

    # combine elementwise under the given signal fraction
    return trafo_to_prob_sf_func(p_signal, p_back, signal_fraction)
def get_signal_fraction(arr, weights=None):
    """ returns the signal fraction of a given array
    :param arr: numpy array of labels/probabilities, finite values in [0, 1]
    :param weights: event weights -- not implemented yet
    :return: signal fraction of a given array
    :raises NotImplementedError: if weights are given
    :raises ValueError: on non-finite entries or values outside [0, 1]
    """
    # weighted signal fractions are not supported yet; the exception has to be
    # raised (the original code *returned* the exception class instead)
    if weights is not None:
        raise NotImplementedError('Weighted signal fractions are not supported.')

    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # a single out-of-range bound is already invalid ('or', not 'and')
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values.')

    return np.sum(arr) / len(arr)
def get_ndarray_binning_parameters(ndarr, bin_count=1024):
    """ collect flatten-transform binning parameters for every column of a 2d array
    :param ndarr: numpy.ndarray with variables to transform (may contain NaN values)
    :param bin_count: number of bins
    :return: list of tuples with scheme [new_bin_limits, bin_values]
    """
    # one parameter tuple per feature column
    return [get_modified_bin_limits(column, bin_count) for column in ndarr.T]
def transform_ndarray(ndarr, binning_parameters):
    """ flatten the distribution of every column of a 2d array according to recorded binning
    :param ndarr: numpy.ndarray with variables (must be of float type, modified in place)
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # np.int was removed in NumPy 1.24; reject any integer dtype
    # (the transform writes float bin values back into the array)
    assert not np.issubdtype(ndarr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)
def transform_variable_vector(arr, binning_parameters):
    """ transform only according to a recorded flatten distribution. this is necessary for single vector experts
    :param arr: numpy.array (must be of float type, modified in place)
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # np.int was removed in NumPy 1.24; reject any integer dtype
    # (the transform writes float bin values back into the array)
    assert not np.issubdtype(arr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        arr[i] = transform_value(arr[i], *param_tuple)
def sanitize_labels(arr):
    """
    checks for a binary classification problem and
    transforms the two class labels to {0, 1}
    @param arr numpy array with exactly two distinct label values
    @:return None, inplace, will not change dtype
    """
    assert len(np.unique(arr)) == 2, 'Not a binary classification!'

    # compute both masks before writing: sequential relabeling can collide,
    # e.g. for labels {-2, -1} setting min -> 0 first makes 0 the new maximum,
    # and for labels {1, 2} setting max -> 1 first merges it with the old 1
    lower_mask = arr == arr.min()
    upper_mask = arr == arr.max()

    arr[lower_mask] = 0
    arr[upper_mask] = 1