def get_bins(arr, bin_count=1024):
    """ Returns binning limits for equal statistic binning for an array
    :param arr: numpy array to get binning for (NaN entries are ignored)
    :param bin_count: int number of bins
    :return: list with bin limits (bin_count + 1 entries, first = min, last = max)
    :raises ValueError: if there are not enough non-NaN entries for the requested bins
    """
    # drop NaN values; they are handled separately downstream (mapped to 0)
    _arr = arr[np.logical_not(np.isnan(arr))]

    # equal-statistics binning requires the sorted sample
    # NOTE(review): source paste was garbled; the sort, arr_len, curr_idx
    # initialization and return below are reconstructed
    _arr = np.sort(_arr)
    arr_len = len(_arr)

    if bin_count is not None:
        if arr_len <= bin_count:
            raise ValueError(f'{len(_arr)} entries are not enough for equal statistics binning.')

    bin_idx_step = arr_len // bin_count

    # if the length is not a multiple of the step size, widen the first
    # `remainder` bins by one entry each
    remainder = arr_len % bin_count

    bin_limits = [_arr[0]]
    curr_idx = -1
    for bin_number in range(bin_count):
        curr_idx += bin_idx_step
        if bin_number < remainder:
            curr_idx += 1
        bin_limits.append(_arr[curr_idx])

    return bin_limits
 
   54def get_modified_bin_limits(arr, bin_count=1024):
 
   55    """ Feature binning: this case considers that multiple values can have the same value 
   56    bins are increased respectively and set to the mean value of the new bin 
   57    :param arr: numpy array to get binning for 
   58    :param bin_count: int number of bins 
   59    :return: list with bin limits 
   61    bins = get_bins(arr, bin_count)
 
   62    bin_limits, counts = np.unique(bins, return_counts=
True)
 
   67    for i, count 
in enumerate(counts):
 
   68        new_bin_limits.append(bin_limits[i])
 
   71            new_bin_limits.append(np.nextafter(bin_limits[i], bin_limits[i] + 1))
 
   72            bin_weights.append(count - 1)
 
   77    new_bin_limits[-1] = np.nextafter(new_bin_limits[-1], len(new_bin_limits) * new_bin_limits[-1])
 
   79    total_bins = sum(bin_weights)
 
   83    step_len = 1 / total_bins
 
   84    bin_values = np.zeros(len(bin_weights))
 
   86    for i_idx, bin_weight 
in enumerate(bin_weights):
 
   87        bin_values[i_idx] = (current_bin + np.sum(range(bin_weight + 1)) / (bin_weight + 1)) * step_len
 
   88        current_bin += bin_weight
 
   91    bin_values = 2 * bin_values - 1
 
   93    return new_bin_limits, bin_values
 
   96def transform_value(value, new_bin_limits, bin_values):
 
   97    """ transforms a value according to given bins and bin values (mapping) 
   99    :param new_bin_limits: 
  105    return bin_values[np.digitize(value, new_bin_limits)]
 
  108def transform_array(arr, new_bin_limits, bin_values):
 
  109    """ transforms an array according to given bins and bin values 
  111    :param new_bin_limits: 
  116    bin_idx = np.digitize(arr, new_bin_limits)
 
  117    nan_idx = np.where(np.isnan(arr))
 
  119    arr = bin_values[bin_idx]
 
  125def get_transform_to_probability_map(df, bins=100):
 
  126    """ returns a transformation map to probability for a signal/background = 1 ratio 
  127    :param df: pandas.DataFrame with truth: 'y', and network output: 'y_hat' 
  128    :param bins: integer with number of bins 
  129    :return: numpy array for bin mapping 
  132    a_bins = np.linspace(0, 1, bins + 1)
 
  135    a_bins[-1] = 1.0000001
 
  139    grouped = df[
'y'].groupby(np.digitize(df[
'y_hat'], a_bins))
 
  142    if not len(grouped) == bins:
 
  143        raise RuntimeError(
'Not enough values per bin. Choose less bins.')
 
  145    b_map = (grouped.sum() / grouped.count()).values
 
  150def transform_to_probability(value, b_map):
 
  151    """ transforms a given value to probability according to a bin map 
  154    :return: float transformed value 
  157    if value < 0 
or value > 1:
 
  158        raise ValueError(value)
 
  161    return b_map[int(value * (len(b_map) - 1))]
 
  164def transform_array_to_probability(arr, b_map):
 
  165    """ transforms a given arr to probability according to a bin map 
  166    :param arr: numpy array to transform 
  168    :return: numpy array: transformed array 
  171    if not np.all(np.isfinite(arr)):
 
  172        raise ValueError(
'Array not finite.')
 
  173    if not np.min(arr) >= 0 
and not np.max(arr) <= 1:
 
  174        raise ValueError(
'Unexpected input values')
 
  176    map_entries = len(b_map)
 
  177    return b_map[(arr * (map_entries - 1)).astype(int)]
 
  180def get_signal_background_pdf(df, bins=100):
 
  181    """ get the signal and background pdfs of a dataframe to a given network output 
  184    :return: tuple of signal pdf and back ground 
  186    print(f
"WARNING: this function ({get_signal_background_pdf.__name__}) is not tested yet")
 
  188    a_bins = np.linspace(0, 1, bins + 1)
 
  189    a_bins[-1] = 1 + np.nextafter(1, 1.1)
 
  191    df_sig = df[df[
'y'] == 1]
 
  192    df_back = df[df[
'y'] == 0]
 
  194    binned_sig = df_sig[
'y'].groupby(np.digitize(df_sig[
'y_hat'], a_bins))
 
  195    binned_back = df_back[
'y'].groupby(np.digitize(df_back[
'y_hat'], a_bins))
 
  197    sig_pdf = (binned_sig.count() / df_sig[
'y'].count()).values
 
  198    back_pdf = (binned_back.count() / df_back[
'y'].count()).values
 
  200    return sig_pdf, back_pdf
 
  203def trafo_to_prob_sf_func(p_signal, p_background, signal_fraction):
 
  205    :param p_signal: signal_pdf value or array 
  206    :param p_background: signal_pdf value or array 
  207    :param signal_fraction: 
  208    :return: (single value, np array) signal fraction dependent to probability transformation 
  211    return (p_signal * signal_fraction) / (p_signal * signal_fraction + p_background * (1 - signal_fraction))
 
  214def transform_to_probability_sf(value, sig_back_tuple, signal_fraction):
 
  215    """ returns a probability for a given signal fraction != .5 
  216    :param value: classifier output 
  217    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier 
  218    :param signal_fraction: signal fraction of classifier events 
  220    :return: float, probability for a given signal fraction 
  222    assert (signal_fraction > 0)
 
  224    p_signal = transform_to_probability(value, sig_back_tuple[0])
 
  225    p_background = transform_to_probability(value, sig_back_tuple[1])
 
  229    return trafo_to_prob_sf_func(p_signal, p_background, signal_fraction)
 
  232def transform_array_to_probability_sf(arr, sig_back_tuple, signal_fraction):
 
  233    """ transformation to probability. if smoother output ("not peaky") is required, please implement spline 
  235    :param arr: array to transform 
  236    :param sig_back_tuple: np.array, signal pdf, background pdf of the trained classifier 
  237    :param signal_fraction: signal fraction of classifier events 
  240    assert (signal_fraction > 0)
 
  242    p_signal = transform_array_to_probability(arr, sig_back_tuple[0])
 
  243    p_back = transform_array_to_probability(arr, sig_back_tuple[1])
 
  245    return trafo_to_prob_sf_func(p_signal, p_back, signal_fraction)
 
  248def get_signal_fraction(arr, weights=None):
 
  252    :return: signal fraction of a given array 
  256    if weights 
is not None:
 
  257        return NotImplementedError
 
  259    if not np.all(np.isfinite(arr)):
 
  260        raise ValueError(
'Array not finite.')
 
  261    if not np.min(arr) >= 0 
and not np.max(arr) <= 1:
 
  262        raise ValueError(
'Unexpected input values.')
 
  264    return np.sum(arr) / len(arr)
 
  268def get_ndarray_binning_parameters(ndarr, bin_count=1024):
 
  270    :param ndarr: numpy.ndarray with variables to transform (may contain NaN values) 
  271    :param bin_count: number of bins 
  272    :return: list of tuples with scheme [new_bin_limits, bin_values] 
  275    binning_parameters = []
 
  277    for column 
in ndarr.T:
 
  278        binning_parameters.append(get_modified_bin_limits(column, bin_count))
 
  280    return binning_parameters
 
  283def transform_ndarray(ndarr, binning_parameters):
 
  285    :param ndarr: numpy.ndarray with variables 
  286    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values] 
  287    :return: None, inplace operation 
  290    assert (ndarr.dtype 
not in [np.int16, np.int32, np.int64])
 
  291    for i, param_tuple 
in enumerate(binning_parameters):
 
  292        ndarr[:, i] = transform_array(ndarr[:, i], *param_tuple)
 
  297def transform_variable_vector(arr, binning_parameters):
 
  298    """ transform only according to a recorded flatten distribution. this is necessary for single vector experts 
  299    :param arr: numpy.array 
  300    :param binning_parameters: list of tuples with scheme [new_bin_limits, bin_values] 
  301    :return: None, inplace operation 
  304    assert (arr.dtype 
not in [np.int16, np.int32, np.int64])
 
  305    for i, param_tuple 
in enumerate(binning_parameters):
 
  306        arr[i] = transform_value(arr[i], *param_tuple)
 
  311def sanitize_labels(arr):
 
  313    checks for a binary classification problem 
  314    transforms the two class labels to {0,1} 
  316    @param arr          numpy array, 
  317    @:return            None, inplace, will not change dtype 
  320    assert len(np.unique(arr)) == 2, 
'Not a binary classification!' 
  324        arr[arr == arr.min()] = 0
 
  327        arr[arr == arr.max()] = 1
 
  331        arr[arr == arr.min()] = 0