Belle II Software development
binning.py
1#!/usr/bin/env python3
2
3
10
11import numpy as np
12
13# Purity transformations
14
15
16def get_bins(arr, bin_count=1024):
17 """ Returns binning limits for equal statistic binning for an array
18 :param arr: numpy array to get binning for
19 :param bin_count: int number of bins
20 :return: list with bin limits
21 """
22
23 # bin_count will produce +2 bins
24 bin_count -= 2
25
26 # remove nan -> will be set to zero later (left of lowest bin)
27 _arr = arr[np.logical_not(np.isnan(arr))]
28 arr_len = len(_arr)
29 if bin_count is not None:
30 if arr_len <= bin_count:
31 raise ValueError(f'{len(_arr)} entries are not enough for equal statistics binning.')
32
33 _arr = np.sort(_arr)
34
35 bin_idx_step = arr_len // bin_count
36
37 # if array length not multiple of step size:
38 remainder = arr_len % bin_count
39
40 bin_limits = [_arr[0]]
41
42 curr_idx = -1
43 for bin_number in range(bin_count):
44
45 curr_idx += bin_idx_step
46 if bin_number < remainder:
47 curr_idx += 1
48
49 bin_limits.append(_arr[curr_idx])
50
51 return bin_limits
52
53
54def get_modified_bin_limits(arr, bin_count=1024):
55 """ Feature binning: this case considers that multiple values can have the same value
56 bins are increased respectively and set to the mean value of the new bin
57 :param arr: numpy array to get binning for
58 :param bin_count: int number of bins
59 :return: list with bin limits
60 """
61 bins = get_bins(arr, bin_count)
62 bin_limits, counts = np.unique(bins, return_counts=True)
63
64 new_bin_limits = []
65 bin_weights = []
66
67 for i, count in enumerate(counts):
68 new_bin_limits.append(bin_limits[i])
69 bin_weights.append(1)
70 if count > 1:
71 new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
72 bin_weights.append(count - 1)
73
74 bin_weights.append(1)
75
76 # increase bin limits slightly (make sure that all occurring values are actually binned correctly)
77 new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], len(new_bin_limits) * new_bin_limits[-1])
78
79 total_bins = sum(bin_weights)
80
81 current_bin = 0
82
83 step_len = 1 / total_bins
84 bin_values = np.zeros(len(bin_weights))
85
86 for i_idx, bin_weight in enumerate(bin_weights):
87 bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
88 current_bin += bin_weight
89
90 # transform bin values from [0, 1] -> [-1, 1]
91 bin_values = 2 * bin_values - 1
92
93 return new_bin_limits, bin_values
94
95
96def transform_value(value, new_bin_limits, bin_values):
97 """ transforms a value according to given bins and bin values (mapping)
98 :param value:
99 :param new_bin_limits:
100 :param bin_values:
101 :return:
102 """
103 if np.isnan(value):
104 return 0
105 return bin_values[np.digitize(value, new_bin_limits)]
106
107
108def transform_array(arr, new_bin_limits, bin_values):
109 """ transforms an array according to given bins and bin values
110 :param arr:
111 :param new_bin_limits:
112 :param bin_values:
113 :return:
114 """
115
116 bin_idx = np.digitize(arr, new_bin_limits)
117 nan_idx = np.where(np.isnan(arr))
118
119 arr = bin_values[bin_idx]
120
121 arr[nan_idx] = 0
122 return arr
123
124
125def get_transform_to_probability_map(df, bins=100):
126 """ returns a transformation map to probability for a signal/background = 1 ratio
127 :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat'
128 :param bins: integer with number of bins
129 :return: numpy array for bin mapping
130 """
131
132 a_bins = np.linspace(0, 1, bins + 1)
133
134 # in case maximum value is equal to 1
135 a_bins[-1] = 1.0000001
136
137 # mapping tagger output to signal/ (signal + background) in the relevant bin
138
139 grouped = df['y'].groupby(np.digitize(df['y_hat'], a_bins))
140
141 # check if length equals set of values
142 if not len(grouped) == bins:
143 raise RuntimeError('Not enough values per bin. Choose less bins.')
144
145 b_map = (grouped.sum() / grouped.count()).values
146
147 return b_map
148
149
150def transform_to_probability(value, b_map):
151 """ transforms a given value to probability according to a bin map
152 :param value:
153 :param b_map:
154 :return: float transformed value
155 """
156
157 if value < 0 or value > 1:
158 raise ValueError(value)
159
160 # shift -1 for array index purpose
161 return b_map[int(value * (len(b_map) - 1))]
162
163
164def transform_array_to_probability(arr, b_map):
165 """ transforms a given arr to probability according to a bin map
166 :param arr: numpy array to transform
167 :param b_map:
168 :return: numpy array: transformed array
169 """
170
171 if not np.all(np.isfinite(arr)):
172 raise ValueError('Array not finite.')
173 if not np.min(arr) >= 0 and not np.max(arr) <= 1:
174 raise ValueError('Unexpected input values')
175
176 map_entries = len(b_map)
177 return b_map[(arr * (map_entries - 1)).astype(int)]
178
179
180def get_signal_background_pdf(df, bins=100):
181 """ get the signal and background pdfs of a dataframe to a given network output
182 :param df:
183 :param bins:
184 :return: tuple of signal pdf and back ground
185 """
186 print(f"WARNING: this function ({get_signal_background_pdf.__name__}) is not tested yet")
187
188 a_bins = np.linspace(0, 1, bins + 1)
189 a_bins[-1] = 1 + np.nextafter(1, 1.1)
190
191 df_sig = df[df['y'] == 1]
192 df_back = df[df['y'] == 0]
193
194 binned_sig = df_sig['y'].groupby(np.digitize(df_sig['y_hat'], a_bins))
195 binned_back = df_back['y'].groupby(np.digitize(df_back['y_hat'], a_bins))
196
197 sig_pdf = (binned_sig.count() / df_sig['y'].count()).values
198 back_pdf = (binned_back.count() / df_back['y'].count()).values
199
200 return sig_pdf, back_pdf
201
202
203def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
204 """
205 :param p_signal: signal_pdf value or array
206 :param p_background: signal_pdf value or array
207 :param signal_fraction:
208 :return: (single value, np array) signal fraction dependent to probability transformation
209 """
210
211 return (p_signal * signal_fraction) / (p_signal * signal_fraction + p_background * (1 - signal_fraction))
212
213
214def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
215 """ returns a probability for a given signal fraction != .5
216 :param value: classifier output
217 :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
218 :param signal_fraction: signal fraction of classifier events
219
220 :return: float, probability for a given signal fraction
221 """
222 assert (signal_fraction > 0)
223
224 p_signal = transform_to_probability(value, sig_back_tuple[0])
225 p_background = transform_to_probability(value, sig_back_tuple[1])
226 # print('Warning function %s is not tested yet' % transform_to_probability_sf.__name__)
227 # function transform to probability actually just evaluates the pdf at a given point
228
229 return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
230
231
232def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
233 """ transformation to probability. if smoother output ("not peaky") is required, please implement spline
234 interpolation
235 :param arr: array to transform
236 :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
237 :param signal_fraction: signal fraction of classifier events
238 :return:
239 """
240 assert (signal_fraction > 0)
241
242 p_signal = transform_array_to_probability(arr, sig_back_tuple[0])
243 p_back = transform_array_to_probability(arr, sig_back_tuple[1])
244 # print('Warning function %s is not tested yet' % transform_array_to_probability_sf.__name__)
245 return trafo_to_prob_sf_func(p_signal, p_back, signal_fraction)
246
247
248def get_signal_fraction(arr, weights=None):
249 """
250 :param arr:
251 :param weights:
252 :return: signal fraction of a given array
253 """
254 # isinstance(arr, np.array)
255
256 if weights is not None:
257 return NotImplementedError
258
259 if not np.all(np.isfinite(arr)):
260 raise ValueError('Array not finite.')
261 if not np.min(arr) >= 0 and not np.max(arr) <= 1:
262 raise ValueError('Unexpected input values.')
263
264 return np.sum(arr) / len(arr)
265
266
267# new MVA interface adaptions
268def get_ndarray_binning_parameters(ndarr, bin_count=1024):
269 """
270 :param ndarr: numpy.ndarray with variables to transform (may contain NaN values)
271 :param bin_count: number of bins
272 :return: list of tuples with scheme [new_bin_limits, bin_values]
273 """
274
275 binning_parameters = []
276 # transform each column in an numpy ndarr
277 for column in ndarr.T:
278 binning_parameters.append(get_modified_bin_limits(column, bin_count))
279
280 return binning_parameters
281
282
283def transform_ndarray(ndarr, binning_parameters):
284 """ flatten ndarray
285 :param ndarr: numpy.ndarray with variables
286 :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
287 :return: None, inplace operation
288 """
289
290 assert (ndarr.dtype not in [np.int16, np.int32, np.int64])
291 for i, param_tuple in enumerate(binning_parameters):
292 ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)
293
294 return None
295
296
297def transform_variable_vector(arr, binning_parameters):
298 """ transform only according to a recorded flatten distribution. this is necessary for single vector experts
299 :param arr: numpy.array
300 :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
301 :return: None, inplace operation
302 """
303
304 assert (arr.dtype not in [np.int16, np.int32, np.int64])
305 for i, param_tuple in enumerate(binning_parameters):
306 arr[i] = transform_value(arr[i], *param_tuple)
307
308 return None
309
310
311def sanitize_labels(arr):
312 """
313 checks for a binary classification problem
314 transforms the two class labels to {0,1}
315
316 @param arr numpy array,
317 @:return None, inplace, will not change dtype
318 """
319 # not binary
320 assert len(np.unique(arr)) == 2, 'Not a binary classification!'
321
322 # reject corner cases when classes would have special values
323 if arr.min() > 0:
324 arr[arr == arr.min()] = 0
325
326 if arr.max() != 1:
327 arr[arr == arr.max()] = 1
328
329 # transform labels
330 if arr.min() != 0:
331 arr[arr == arr.min()] = 0