Belle II Software  release-05-01-25
statistics.py
1 import math
2 import numpy as np
3 
4 
5 def iqr(array_like):
6  """Computes the interquantile range, hence the distance from the 25% to 75% quantile."""
7 
8  return np.percentile(array_like, 75) - np.percentile(array_like, 25)
9 
10 
11 def trimmed_std(array_like):
12  """A trimmed estimate for the standard deviation of a normal distribution
13  contaminated by outliers using the interquanitle range times the appropriate factor.
14  """
15 
16  normal_iqr_to_std_factor = math.sqrt(2.0) * math.erf(0.5)
17  return normal_iqr_to_std_factor * iqr(array_like)
18 
19 
20 def truncated_mean(array_like, r=0.23):
21  """Calculates the truncated mean, where 2r is the fraction of data to be taken into account.
22 
23  The truncated mean is a robust estimator for the central value of a symmetric distribution.
24  The (1-r) upper and the (1-r) lower values are left out and only the central remaining 2r fraction enters
25  the normal calculation of the mean. The default value of r is the text book value, which produces an approximatelly
26  82%-efficient estimate of the mean for normal, central value of the cauchy and the double-exponential
27  distribution.
28  """
29 
30  truncation = 1 - 2 * r
31 
32  if truncation >= 1 or truncation <= 0:
33  raise ValueError('Value of r must be between 0 and 0.5')
34 
35  array = np.array(array_like)
36 
37  lower_percentile = 100 * truncation / 2.0
38  upper_percentile = 100 * (1.0 - truncation / 2.0)
39 
40  (lower_bound, upper_bound) = np.percentile(array, (lower_percentile,
41  upper_percentile))
42 
43  weights = (array >= lower_bound) & (array <= upper_bound)
44 
45  truncated_array = array[weights]
46 
47  return np.mean(truncated_array)
48 
49 
50 def cubic_root(x):
51  """Returns the cubic root of a number"""
52  return pow(float(x), 1.0 / 3.0)
53 
54 
55 def rice_n_bin(n_data):
56  """Returns the number of bins for a number of data instances according to the rice rule."""
57  return np.ceil(2.0 * cubic_root(n_data))
58 
59 
60 def rice_exceptional_values(xs):
61  """Returns a array of exceptionally frequent values according to the rice rule."""
62  unique_xs, indices, unique_xs_count = np.unique(xs, return_inverse=True, return_counts=True)
63  # Note: The indices are such that unique_xs[indices] == xs
64 
65  exceptional_xs = []
66 
67  while True:
68  n_data = np.sum(unique_xs_count)
69  n_exceptional = 1.0 * n_data / rice_n_bin(n_data)
70  exceptional_indices = unique_xs_count > n_exceptional
71 
72  if np.any(exceptional_indices):
73  exceptional_xs.extend(unique_xs[exceptional_indices])
74  unique_xs_count[exceptional_indices] = 0
75  else:
76  break
77 
78  return np.array(sorted(exceptional_xs))
79 
80 
81 def is_binary_series(xs):
82  """Determines if the given series only consists of true and false values"""
83  is_one_or_zero = np.all((xs == 0) | (xs == 1) | ~np.isfinite(xs))
84  return is_one_or_zero
85 
86 
87 default_max_n_unique_for_discrete = 20
88 
89 
90 def is_discrete_series(xs, max_n_unique=None):
91  """Determine of a data series is only made of a bunch of discrete values
92 
93  Currently only test if the number of unique values is lower than
94  the default_max_n_unique_for_discrete.
95 
96  Parameters
97  ----------
98  xs : np.array (1d)
99  Data series
100  max_n_unique : int
101  Number of allowed distinct values for the series to be discrete.
102  """
103 
104  if max_n_unique is None:
105  max_n_unique = default_max_n_unique_for_discrete
106 
107  unique_xs = np.unique(xs)
108  if len(unique_xs) < max_n_unique:
109  return True
110  elif len(unique_xs) > 150:
111  return False
112  else:
113  exceptional_xs = rice_exceptional_values(xs)
114  rest_xs = np.setdiff1d(unique_xs, exceptional_xs, assume_unique=True)
115  if len(rest_xs) < max_n_unique:
116  return True
117 
118  return False
119 
120 
121 def is_single_value_series(xs):
122  """Determine if a series contains only a single value
123 
124  Parameters
125  ----------
126  xs : np.array (1d)
127  Data series
128  """
129  return np.all(xs == xs[0]) or np.all(np.isnan(xs))