Belle II Software  release-08-01-10
binning.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import numpy as np
12 
13 # Purity transformations
14 
15 
def get_bins(arr, bin_count=1024):
    """ Returns binning limits for equal statistic binning for an array.

    NaN entries are ignored here; downstream transforms map them to zero
    (left of the lowest bin).

    :param arr: numpy array to get binning for
    :param bin_count: int number of bins
    :return: list with bin limits
    :raises ValueError: if there are not enough entries for the requested binning
    """
    # the returned limits effectively produce bin_count bins once the implicit
    # under/overflow bins are counted, hence the -2
    bin_count -= 2

    # remove nan -> will be set to zero later (left of lowest bin)
    _arr = arr[np.logical_not(np.isnan(arr))]
    arr_len = len(_arr)

    # (a None bin_count would already have failed at the subtraction above,
    # so the former 'if bin_count is not None' guard was dead code)
    if arr_len <= bin_count:
        raise ValueError('%d entries are not enough for equal statistics binning.' % arr_len)

    _arr = np.sort(_arr)

    # entries per bin; the first `remainder` bins receive one extra entry
    bin_idx_step = arr_len // bin_count
    remainder = arr_len % bin_count

    bin_limits = [_arr[0]]

    curr_idx = -1
    for bin_number in range(bin_count):
        curr_idx += bin_idx_step
        if bin_number < remainder:
            curr_idx += 1
        bin_limits.append(_arr[curr_idx])

    return bin_limits
52 
53 
def get_modified_bin_limits(arr, bin_count=1024):
    """ Feature binning: this case considers that multiple values can have the same value
    bins are increased respectively and set to the mean value of the new bin
    :param arr: numpy array to get binning for
    :param bin_count: int number of bins
    :return: tuple (new bin limits, bin values in [-1, 1])
    """
    bins = get_bins(arr, bin_count)
    bin_limits, counts = np.unique(bins, return_counts=True)

    new_bin_limits = []
    bin_weights = []

    # a duplicated limit marks a bin that swallowed several equal-statistics
    # bins; widen it by one ULP and give it the corresponding weight
    for i, count in enumerate(counts):
        new_bin_limits.append(bin_limits[i])
        bin_weights.append(1)
        if count > 1:
            new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
            bin_weights.append(count - 1)

    # weight of the overflow bin (values above the last limit)
    bin_weights.append(1)

    # increase bin limits slightly (make sure that all occurring values are actually binned correctly);
    # stepping towards +inf is direction-safe for zero or negative limits as well
    # (the former form np.nextafter(x, len(...) * x) stepped downwards for x <= 0)
    new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], np.inf)

    total_bins = sum(bin_weights)

    current_bin = 0

    step_len = 1 / total_bins
    bin_values = np.zeros(len(bin_weights))

    # each bin maps to the mean position of the equal-statistics bins it covers
    for i_idx, bin_weight in enumerate(bin_weights):
        bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
        current_bin += bin_weight

    # transform bin values from [0, 1] -> [-1, 1]
    bin_values = 2 * bin_values - 1

    return new_bin_limits, bin_values
94 
95 
def transform_value(value, new_bin_limits, bin_values):
    """ Map a single value onto its bin value according to the given mapping.
    :param value: scalar to transform (NaN maps to 0)
    :param new_bin_limits: bin edges
    :param bin_values: mapped value for each bin
    :return: bin value the input falls into
    """
    if np.isnan(value):
        return 0
    bin_index = np.digitize(value, new_bin_limits)
    return bin_values[bin_index]
106 
107 
def transform_array(arr, new_bin_limits, bin_values):
    """ Map an array onto bin values according to the given mapping.
    :param arr: numpy array to transform (NaN entries map to 0)
    :param new_bin_limits: bin edges
    :param bin_values: mapped value for each bin
    :return: transformed numpy array
    """
    # remember NaN positions before digitizing (NaN lands in the overflow bin)
    nan_mask = np.isnan(arr)

    transformed = bin_values[np.digitize(arr, new_bin_limits)]

    # NaN entries are mapped left of the lowest bin, i.e. to zero
    transformed[nan_mask] = 0
    return transformed
123 
124 
def get_transform_to_probability_map(df, bins=100):
    """ returns a transformation map to probability for a signal/background = 1 ratio
    :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat'
    :param bins: integer with number of bins
    :return: numpy array for bin mapping
    """
    bin_edges = np.linspace(0, 1, bins + 1)

    # widen the last edge so that a network output of exactly 1 is still binned
    bin_edges[-1] = 1.0000001

    # signal purity s / (s + b) of the truth label within each network-output bin
    bin_indices = np.digitize(df['y_hat'], bin_edges)
    grouped = df['y'].groupby(bin_indices)

    # every bin must be populated, otherwise the map would have holes
    if len(grouped) != bins:
        raise RuntimeError('Not enough values per bin. Choose less bins.')

    return (grouped.sum() / grouped.count()).values
148 
149 
def transform_to_probability(value, b_map):
    """ transforms a given value to probability according to a bin map
    :param value: classifier output, expected in [0, 1]
    :param b_map: numpy array with the probability mapping
    :return: float transformed value
    """
    if value < 0 or value > 1:
        raise ValueError(value)

    # scale by (len - 1) so that value == 1 addresses the last array entry
    map_index = int(value * (len(b_map) - 1))
    return b_map[map_index]
162 
163 
def transform_array_to_probability(arr, b_map):
    """ transforms a given arr to probability according to a bin map
    :param arr: numpy array to transform, values expected in [0, 1]
    :param b_map: numpy array with the probability mapping
    :return: numpy array: transformed array
    :raises ValueError: on non-finite input or values outside [0, 1]
    """
    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # either bound violation must be rejected; the former check used 'and',
    # which only triggered when BOTH bounds were violated
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values')

    map_entries = len(b_map)
    return b_map[(arr * (map_entries - 1)).astype(int)]
178 
179 
def get_signal_background_pdf(df, bins=100):
    """ get the signal and background pdfs of a dataframe to a given network output
    :param df: pandas.DataFrame with truth 'y' (1 = signal, 0 = background) and network output 'y_hat'
    :param bins: number of bins in [0, 1]
    :return: tuple of signal pdf and background pdf
    """
    print("WARNING: this function (%s) is not tested yet" % get_signal_background_pdf.__name__)

    a_bins = np.linspace(0, 1, bins + 1)
    # widen the last edge by one ULP so y_hat == 1 falls into the last bin
    # (the former expression '1 + np.nextafter(1, 1.1)' evaluated to ~2.0)
    a_bins[-1] = np.nextafter(1, 1.1)

    df_sig = df[df['y'] == 1]
    df_back = df[df['y'] == 0]

    binned_sig = df_sig['y'].groupby(np.digitize(df_sig['y_hat'], a_bins))
    binned_back = df_back['y'].groupby(np.digitize(df_back['y_hat'], a_bins))

    # normalized bin counts per class
    sig_pdf = (binned_sig.count() / df_sig['y'].count()).values
    back_pdf = (binned_back.count() / df_back['y'].count()).values

    return sig_pdf, back_pdf
201 
202 
def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
    """ Bayes transformation to probability for an arbitrary signal fraction.
    :param p_signal: signal_pdf value or array
    :param p_background: signal_pdf value or array
    :param signal_fraction: fraction of signal events in the sample
    :return: (single value, np array) signal fraction dependent to probability transformation
    """
    weighted_signal = p_signal * signal_fraction
    weighted_background = p_background * (1 - signal_fraction)
    return weighted_signal / (weighted_signal + weighted_background)
212 
213 
def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
    """ returns a probability for a given signal fraction != .5
    :param value: classifier output
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events

    :return: float, probability for a given signal fraction
    """
    assert(signal_fraction > 0)

    # transform_to_probability effectively evaluates the respective pdf at
    # the classifier output value
    sig_density = transform_to_probability(value, sig_back_tuple[0])
    back_density = transform_to_probability(value, sig_back_tuple[1])

    return trafo_to_prob_sf_func(sig_density, back_density, signal_fraction)
230 
231 
def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
    """ transformation to probability. if smoother output ("not peaky") is required, please implement spline
    interpolation
    :param arr: array to transform
    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier
    :param signal_fraction: signal fraction of classifier events
    :return: numpy array with probabilities
    """
    assert(signal_fraction > 0)

    # evaluate both class pdfs for every array entry, then combine via Bayes
    sig_density = transform_array_to_probability(arr, sig_back_tuple[0])
    back_density = transform_array_to_probability(arr, sig_back_tuple[1])
    return trafo_to_prob_sf_func(sig_density, back_density, signal_fraction)
246 
247 
def get_signal_fraction(arr, weights=None):
    """ Return the signal fraction (mean label) of a label array.
    :param arr: numpy array with labels in [0, 1]
    :param weights: not implemented yet
    :return: signal fraction of a given array
    :raises NotImplementedError: if weights are given
    :raises ValueError: on non-finite input or values outside [0, 1]
    """
    if weights is not None:
        # formerly *returned* the exception class instead of raising it
        raise NotImplementedError

    if not np.all(np.isfinite(arr)):
        raise ValueError('Array not finite.')
    # either bound violation must be rejected (was 'and', requiring both)
    if np.min(arr) < 0 or np.max(arr) > 1:
        raise ValueError('Unexpected input values.')

    return np.sum(arr) / len(arr)
265 
266 
267 # new MVA interface adaptions
def get_ndarray_binning_parameters(ndarr, bin_count=1024):
    """
    :param ndarr: numpy.ndarray with variables to transform (may contain NaN values)
    :param bin_count: number of bins
    :return: list of tuples with scheme [new_bin_limits, bin_values]
    """
    # one binning-parameter tuple per column of the ndarray
    return [get_modified_bin_limits(column, bin_count) for column in ndarr.T]
281 
282 
def transform_ndarray(ndarr, binning_parameters):
    """ flatten ndarray
    :param ndarr: numpy.ndarray with variables
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # a float dtype is required for the in-place assignment of bin values;
    # np.int was removed in NumPy 1.24, so test the abstract integer type instead
    assert not np.issubdtype(ndarr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)

    return None
295 
296 
def transform_variable_vector(arr, binning_parameters):
    """ transform only according to a recorded flatten distribution. this is necessary for single vector experts
    :param arr: numpy.array
    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values]
    :return: None, inplace operation
    """
    # a float dtype is required for the in-place assignment of bin values;
    # np.int was removed in NumPy 1.24, so test the abstract integer type instead
    assert not np.issubdtype(arr.dtype, np.integer)
    for i, param_tuple in enumerate(binning_parameters):
        arr[i] = transform_value(arr[i], *param_tuple)

    return None
309 
310 
def sanitize_labels(arr):
    """
    checks for a binary classification problem
    transforms the two class labels to {0,1}

    @param arr numpy array,
    @:return None, inplace, will not change dtype
    """
    # must be exactly two distinct labels
    assert len(np.unique(arr)) == 2, 'Not a binary classification!'

    # reject corner cases when classes would have special values
    # (the first rewrite never touches the maximum, so both extrema can be
    # taken before any mutation)
    low = arr.min()
    high = arr.max()

    if low > 0:
        arr[arr == low] = 0

    if high != 1:
        arr[arr == high] = 1

    # transform labels: whatever is left below zero becomes the 0 class
    if arr.min() != 0:
        arr[arr == arr.min()] = 0