Belle II Software development
statistics.py
1
8import math
9import numpy as np
10
11
12def iqr(array_like):
13 """Computes the interquantile range, hence the distance from the 25% to 75% quantile."""
14
15 return np.percentile(array_like, 75) - np.percentile(array_like, 25)
16
17
18def trimmed_std(array_like):
19 """A trimmed estimate for the standard deviation of a normal distribution
20 contaminated by outliers using the interquanitle range times the appropriate factor.
21 """
22
23 normal_iqr_to_std_factor = math.sqrt(2.0) * math.erf(0.5)
24 return normal_iqr_to_std_factor * iqr(array_like)
25
26
27def truncated_mean(array_like, r=0.23):
28 """Calculates the truncated mean, where 2r is the fraction of data to be taken into account.
29
30 The truncated mean is a robust estimator for the central value of a symmetric distribution.
31 The (1-r) upper and the (1-r) lower values are left out and only the central remaining 2r fraction enters
32 the normal calculation of the mean. The default value of r is the text book value, which produces an approximatelly
33 82%-efficient estimate of the mean for normal, central value of the cauchy and the double-exponential
34 distribution.
35 """
36
37 truncation = 1 - 2 * r
38
39 if truncation >= 1 or truncation <= 0:
40 raise ValueError('Value of r must be between 0 and 0.5')
41
42 array = np.array(array_like)
43
44 lower_percentile = 100 * truncation / 2.0
45 upper_percentile = 100 * (1.0 - truncation / 2.0)
46
47 (lower_bound, upper_bound) = np.percentile(array, (lower_percentile,
48 upper_percentile))
49
50 weights = (array >= lower_bound) & (array <= upper_bound)
51
52 truncated_array = array[weights]
53
54 return np.mean(truncated_array)
55
56
57def cubic_root(x):
58 """Returns the cubic root of a number"""
59 return pow(float(x), 1.0 / 3.0)
60
61
62def rice_n_bin(n_data):
63 """Returns the number of bins for a number of data instances according to the rice rule."""
64 rice_n_bin = np.ceil(2.0 * cubic_root(n_data))
65 return rice_n_bin if rice_n_bin > 0 else 1
66
67
68def rice_exceptional_values(xs):
69 """Returns a array of exceptionally frequent values according to the rice rule."""
70 unique_xs, indices, unique_xs_count = np.unique(xs, return_inverse=True, return_counts=True)
71 # Note: The indices are such that unique_xs[indices] == xs
72
73 exceptional_xs = []
74
75 while True:
76 n_data = np.sum(unique_xs_count)
77 n_exceptional = 1.0 * n_data / rice_n_bin(n_data)
78 exceptional_indices = unique_xs_count > n_exceptional
79
80 if np.any(exceptional_indices):
81 exceptional_xs.extend(unique_xs[exceptional_indices])
82 unique_xs_count[exceptional_indices] = 0
83 else:
84 break
85
86 return np.array(sorted(exceptional_xs))
87
88
89def is_binary_series(xs):
90 """Determines if the given series only consists of true and false values"""
91 is_one_or_zero = np.all((xs == 0) | (xs == 1) | ~np.isfinite(xs))
92 return is_one_or_zero
93
94
95
96default_max_n_unique_for_discrete = 20
97
98
99def is_discrete_series(xs, max_n_unique=None):
100 """Determine of a data series is only made of a bunch of discrete values
101
102 Currently only test if the number of unique values is lower than
103 the default_max_n_unique_for_discrete.
104
105 Parameters
106 ----------
107 xs : np.array (1d)
108 Data series
109 max_n_unique : int
110 Number of allowed distinct values for the series to be discrete.
111 """
112
113 if max_n_unique is None:
114 max_n_unique = default_max_n_unique_for_discrete
115
116 unique_xs = np.unique(xs)
117 if len(unique_xs) < max_n_unique:
118 return True
119 elif len(unique_xs) > 150:
120 return False
121 else:
122 exceptional_xs = rice_exceptional_values(xs)
123 rest_xs = np.setdiff1d(unique_xs, exceptional_xs, assume_unique=True)
124 if len(rest_xs) < max_n_unique:
125 return True
126
127 return False
128
129
130def is_single_value_series(xs):
131 """Determine if a series contains only a single value
132
133 Parameters
134 ----------
135 xs : np.array (1d)
136 Data series
137 """
138 return np.all(xs == xs[0]) or np.all(np.isnan(xs))