#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Belle II Software  release-05-01-25
# binning.py
13 
14 from __future__ import division, print_function
15 import numpy as np
16 
17 # Purity transformations
18 
19 
def get_bins(arr, bin_count=1024):
    """Return bin limits for equal-statistics binning of an array.

    NaN entries are ignored here; downstream transformations map them to 0
    (left of the lowest bin).

    :param arr: numpy array to get binning for
    :param bin_count: int, total number of bins; two of them are implicit
        outer bins, so bin_count - 2 equal-statistics bins are built here
    :return: list with bin limits (bin_count - 1 values)
    :raises ValueError: if there are not enough finite entries for the
        requested number of bins
    """
    # the caller's bin_count will produce 2 extra (outer) bins
    inner_bins = bin_count - 2

    # remove NaN -> will be set to zero later (left of lowest bin)
    _arr = arr[np.logical_not(np.isnan(arr))]
    arr_len = len(_arr)

    # NOTE: the original guarded this with `if bin_count is not None`, which
    # was dead code (bin_count had already been used arithmetically above)
    if arr_len <= inner_bins:
        raise ValueError('%d entries are not enough for equal statistics binning.' % arr_len)

    _arr = np.sort(_arr)

    # minimum number of entries per bin; the first `remainder` bins
    # receive one extra entry each if arr_len is not a multiple of the step
    bin_idx_step = arr_len // inner_bins
    remainder = arr_len % inner_bins

    bin_limits = [_arr[0]]

    curr_idx = -1
    for bin_number in range(inner_bins):
        curr_idx += bin_idx_step
        if bin_number < remainder:
            curr_idx += 1
        bin_limits.append(_arr[curr_idx])

    return bin_limits
56 
57 
def get_modified_bin_limits(arr, bin_count=1024):
    """Feature binning that accounts for repeated values.

    When multiple equal-statistics bin limits coincide (many identical
    entries), the duplicated limit is widened into a minimal-width bin
    carrying the merged weight, and each bin is mapped to its center value.

    :param arr: numpy array to get binning for
    :param bin_count: int number of bins
    :return: tuple (list of bin limits, numpy array of bin values in [-1, 1])
    """
    bins = get_bins(arr, bin_count)
    bin_limits, counts = np.unique(bins, return_counts=True)

    new_bin_limits = []
    bin_weights = []

    for i, count in enumerate(counts):
        new_bin_limits.append(bin_limits[i])
        bin_weights.append(1)
        if count > 1:
            # duplicated limit: open a minimal-width bin just above it that
            # carries the weight of the merged duplicates
            new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
            bin_weights.append(count - 1)

    # weight of the overflow bin
    bin_weights.append(1)

    # increase the top limit slightly (make sure that all occurring values are
    # actually binned correctly); stepping towards +inf also works for
    # non-positive limits, unlike the original `len(...) * limit` target,
    # which moved the limit DOWNWARD for negative values
    new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], np.inf)

    total_bins = sum(bin_weights)

    current_bin = 0

    step_len = 1 / total_bins
    bin_values = np.zeros(len(bin_weights))

    # each bin maps to its center: sum(range(w+1))/(w+1) == w/2
    for i_idx, bin_weight in enumerate(bin_weights):
        bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
        current_bin += bin_weight

    # transform bin values from [0, 1] -> [-1, 1]
    bin_values = 2 * bin_values - 1

    return new_bin_limits, bin_values
98 
99 
def transform_value(value, new_bin_limits, bin_values):
    """Map a single value onto its flattened bin value.

    :param value: scalar to transform (NaN allowed)
    :param new_bin_limits: bin limits as produced by the binning step
    :param bin_values: mapped value per bin
    :return: the bin value for the given scalar; 0 for NaN input
    """
    # NaN cannot be binned; by convention it maps to zero
    if np.isnan(value):
        return 0
    idx = np.digitize(value, new_bin_limits)
    return bin_values[idx]
110 
111 
def transform_array(arr, new_bin_limits, bin_values):
    """Map every entry of an array onto its flattened bin value.

    :param arr: numpy array to transform (NaN entries allowed)
    :param new_bin_limits: bin limits as produced by the binning step
    :param bin_values: mapped value per bin
    :return: new numpy array with transformed values (NaN entries become 0)
    """
    # remember NaN positions first; np.digitize sorts NaN into the overflow
    # bin, so those entries are overwritten with 0 afterwards
    nan_positions = np.where(np.isnan(arr))
    bin_indices = np.digitize(arr, new_bin_limits)

    transformed = bin_values[bin_indices]
    transformed[nan_positions] = 0
    return transformed
127 
128 
def get_transform_to_probability_map(df, bins=100):
    """Build a transformation map from network output to probability,
    assuming a signal/background ratio of 1.

    :param df: pandas.DataFrame with truth 'y' and network output 'y_hat'
    :param bins: int number of bins
    :return: numpy array mapping each bin to signal / (signal + background)
    :raises RuntimeError: if at least one bin received no entries
    """
    edges = np.linspace(0, 1, bins + 1)

    # widen the last edge slightly so that y_hat == 1 still lands in the
    # last bin
    edges[-1] = 1.0000001

    # per-bin signal purity: group truth labels by the y_hat bin index
    grouped = df['y'].groupby(np.digitize(df['y_hat'], edges))

    # every bin must be populated, otherwise the map would be misaligned
    if len(grouped) != bins:
        raise RuntimeError('Not enough values per bin. Choose less bins.')

    return (grouped.sum() / grouped.count()).values
152 
153 
def transform_to_probability(value, b_map):
    """Transform a classifier output in [0, 1] to a probability via a bin map.

    :param value: float in [0, 1]
    :param b_map: numpy array with the per-bin probability map
    :return: float, transformed value
    :raises ValueError: if value lies outside [0, 1]
    """
    if value > 1 or value < 0:
        raise ValueError(value)

    # scale by len - 1 so that value == 1 is still a valid index
    # NOTE(review): the map is built with bin width 1/len(b_map); indexing
    # with len - 1 shifts the lookup slightly — kept to preserve behavior.
    index = int(value * (len(b_map) - 1))
    return b_map[index]
166 
167 
def transform_array_to_probability(arr, b_map):
    """Transform an array of classifier outputs in [0, 1] to probabilities.

    :param arr: numpy array to transform
    :param b_map: numpy array with the per-bin probability map
    :return: numpy array: transformed array
    :raises ValueError: if arr has non-finite entries or values outside [0, 1]
    """
    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # reject values outside [0, 1]; the original check combined the two
    # bounds with `and`, so it only fired when BOTH were violated at once
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values')

    map_entries = len(b_map)
    # scale by map_entries - 1 so that arr == 1 is still a valid index
    return b_map[(arr * (map_entries - 1)).astype(int)]
182 
183 
def get_signal_background_pdf(df, bins=100):
    """Get the signal and background pdfs of a dataframe's network output.

    :param df: pandas.DataFrame with truth 'y' and network output 'y_hat'
    :param bins: int number of bins
    :return: tuple (signal pdf, background pdf) as numpy arrays
    """
    print("WARNING: this function (%s) is not tested yet" % get_signal_background_pdf.__name__)

    a_bins = np.linspace(0, 1, bins + 1)
    # push the top edge just above 1 so that y_hat == 1 falls into the last
    # bin; the original wrote `1 + np.nextafter(1, 1.1)` (~2.0), stretching
    # the last bin far beyond 1 — same digitize result for y_hat in [0, 1]
    a_bins[-1] = np.nextafter(1, 1.1)

    df_sig = df[df['y'] == 1]
    df_back = df[df['y'] == 0]

    binned_sig = df_sig['y'].groupby(np.digitize(df_sig['y_hat'], a_bins))
    binned_back = df_back['y'].groupby(np.digitize(df_back['y_hat'], a_bins))

    # NOTE(review): groupby silently drops empty bins, so the returned
    # arrays may be shorter than `bins` — confirm with callers.
    sig_pdf = (binned_sig.count() / df_sig['y'].count()).values
    back_pdf = (binned_back.count() / df_back['y'].count()).values

    return sig_pdf, back_pdf
205 
206 
def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
    """Signal-fraction dependent transformation to probability.

    :param p_signal: signal pdf value or array
    :param p_background: background pdf value or array
    :param signal_fraction: float signal fraction
    :return: (single value, np array) probability for the given signal fraction
    """
    weighted_signal = p_signal * signal_fraction
    weighted_background = p_background * (1 - signal_fraction)
    return weighted_signal / (weighted_signal + weighted_background)
216 
217 
def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
    """Return a probability for a given signal fraction != .5.

    :param value: classifier output
    :param sig_back_tuple: tuple (signal pdf, background pdf) of the trained
        classifier
    :param signal_fraction: signal fraction of classifier events
    :return: float, probability for the given signal fraction
    """
    assert(signal_fraction > 0)

    # transform_to_probability effectively just evaluates the respective pdf
    # at the given point
    sig_pdf, back_pdf = sig_back_tuple
    p_signal = transform_to_probability(value, sig_pdf)
    p_background = transform_to_probability(value, back_pdf)

    return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
234 
235 
def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
    """Transform an array to probability for a given signal fraction.

    If a smoother ("not peaky") output is required, spline interpolation
    would have to be implemented here.

    :param arr: numpy array to transform
    :param sig_back_tuple: tuple (signal pdf, background pdf) of the trained
        classifier
    :param signal_fraction: signal fraction of classifier events
    :return: numpy array with probabilities
    """
    assert(signal_fraction > 0)

    sig_pdf, back_pdf = sig_back_tuple
    p_signal = transform_array_to_probability(arr, sig_pdf)
    p_background = transform_array_to_probability(arr, back_pdf)
    return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
250 
251 
def get_signal_fraction(arr, weights=None):
    """Compute the signal fraction of a label array.

    :param arr: numpy array with values in [0, 1]
    :param weights: not implemented yet
    :return: float, signal fraction of the given array
    :raises NotImplementedError: if weights are given
    :raises ValueError: for non-finite entries or values outside [0, 1]
    """
    if weights is not None:
        # the original *returned* the exception class instead of raising it
        raise NotImplementedError

    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # reject values outside [0, 1]; the original check combined the two
    # bounds with `and`, so it only fired when BOTH were violated at once
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values.')

    return np.sum(arr) / len(arr)
269 
270 
271 # new MVA interface adaptions
def get_ndarray_binning_parameters(ndarr, bin_count=1024):
    """Collect flattening parameters for every column of an ndarray.

    :param ndarr: numpy.ndarray with variables to transform (may contain NaN
        values)
    :param bin_count: number of bins
    :return: list of tuples with scheme [new_bin_limits, bin_values], one
        entry per column
    """
    # each column of the ndarray is one variable with its own binning
    return [get_modified_bin_limits(column, bin_count) for column in ndarr.T]
285 
286 
def transform_ndarray(ndarr, binning_parameters):
    """Flatten each column of an ndarray in place.

    :param ndarr: numpy.ndarray with variables; must be a float type, since
        the flattened values are written back in place
    :param binning_parameters: list of tuples with scheme
        [new_bin_limits, bin_values], one entry per column
    :return: None, inplace operation
    """
    # integer arrays cannot hold the transformed float values; the original
    # check listed `np.int`, an alias removed in NumPy 1.24, which made the
    # assert itself crash with AttributeError on modern NumPy
    assert not np.issubdtype(ndarr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)

    return None
299 
300 
def transform_variable_vector(arr, binning_parameters):
    """Transform a single event vector according to recorded flat distributions.

    This is necessary for single vector experts.

    :param arr: numpy.array with one value per variable; must be a float
        type, since the flattened values are written back in place
    :param binning_parameters: list of tuples with scheme
        [new_bin_limits, bin_values], one entry per element
    :return: None, inplace operation
    """
    # integer arrays cannot hold the transformed float values; the original
    # check listed `np.int`, an alias removed in NumPy 1.24, which made the
    # assert itself crash with AttributeError on modern NumPy
    assert not np.issubdtype(arr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        arr[i] = transform_value(arr[i], *param_tuple)

    return None
313 
314 
def sanitize_labels(arr):
    """
    checks for a binary classification problem
    transforms the two class labels to {0,1}

    @param arr numpy array,
    @:return None, inplace, will not change dtype
    """
    # not binary
    assert len(np.unique(arr)) == 2, 'Not a binary classification!'

    # reject corner cases when classes would have special values
    # (e.g. labels {1, 2}: map the smaller class to 0 first, so that the
    # max-relabeling below cannot collide with an existing label 1)
    if arr.min() > 0:
        arr[arr == arr.min()] = 0

    # relabel the upper class to 1 (e.g. {0, 5} -> {0, 1})
    if arr.max() != 1:
        arr[arr == arr.max()] = 1

    # transform labels
    # (e.g. labels {-1, 1}: the remaining non-zero lower class becomes 0)
    if arr.min() != 0:
        arr[arr == arr.min()] = 0