Belle II Software development
transform.py
1#!/usr/bin/env python3
2
3
10
11""" Transformation classes
12
13In this file all classes for the transformation methods are defined.
14The base class is Transform.
15
16
17"""
18
19
20from alignment.fancystuff.settings import ProTool
21
22import numpy as np
23import pandas as pd
24from scipy.interpolate import InterpolatedUnivariateSpline
25
26
28
29 """
30 Base Class for the transformations.
31 The function _fit() is overwritten by the sub classes.
32
33 Attributes
34 ----------
35 n_bins : int, optional
36 Binning in x, will be set automatically
37 max : float
38 Maximum of the fitted distribution
39 min : float
40 Minimum of the fitted distribution
41 is_processed : bool
42 Status flag
43 name : str
44 Name of the transformation
45
46 """
47
def __init__(self, name="Original", n_bins=None):
    """Set up the bookkeeping attributes and register with ProTool.

    :param name: Name of the transformation; ``"Transform." + name`` is the
        identifier handed to ``ProTool.__init__``
    :param n_bins: Binning for the transformation; if left at ``None`` it is
        derived from the length of the data when fitting
    """
    # Status flag, set to True once fit() has run
    self.is_processed = False

    # Range of the fitted distribution, filled in by _initialise()
    self.min = 0
    self.max = 0

    # Binning in x; None until given explicitly or set via set_n_bins()
    self.n_bins = n_bins

    # Name of the transformation
    self.name = name

    ProTool.__init__(self, "Transform." + name)
70
def _initialise(self, x):
    """
    Record the range of the data and, if no binning is set yet, choose one.
    Internal helper of fit(); not meant to be called by the user.

    :param x: Data to be fitted, array type
    """
    self.io.debug("Initiating " + self.name)

    binning_missing = self.n_bins is None
    if binning_missing:
        # No binning given: derive one from the number of entries
        self.set_n_bins(len(x))

    self.max, self.min = np.max(x), np.min(x)
83
def fit(self, x, y=None):
    """
    Fit the transformation to a distribution.

    Runs _initialise() on the data, then hands over to the _fit()
    implementation, and finally marks the transformation as processed.

    :param x: Distribution to fit, array type
    :param y: optional for some transformations, sets signal class
    """
    self._initialise(x)
    self._fit(x, y)

    # Only reached when both steps above finished without raising
    self.is_processed = True
94
def __call__(self, x):
    """ Makes the object callable; same as transform(x) with default options
    :param x: Input data
    :return: Transformed data
    """
    transformed = self.transform(x)
    return transformed
101
def _fit(self, x, y=None):
    """
    Fitting hook, to be overwritten by the sub classes.
    The base implementation does nothing.

    :param x: array x values
    :param y: class variable [1,0]
    """
    return None
109
def transform(self, x, set_limits=False):
    """
    Apply the transformation by delegating to _transform().

    :param x: Distribution to transform, array type
    :param set_limits: If true, the data is first limited to the fitted
        range via set_limits()
    :return: Transformed data
    """
    if set_limits:
        # Keep the returned value: set_limits() clips array input in place,
        # but for a scalar the limited value only exists as its return value.
        x = self.set_limits(x)
    return self._transform(x)
120
def _transform(self, x):
    """
    Transformation hook, to be overwritten by the sub classes.
    The base class is the identity: the input is handed back unchanged.

    :param x: Distribution to transform, array type
    :return: The very same object that was passed in
    """
    untouched = x
    return untouched
130
def set_n_bins(self, n):
    """
    Sets the binning to the value suggested by get_optimal_bin_size()
    and logs the resulting average number of entries per bin.

    :param n: Length of the input data
    """
    bins = get_optimal_bin_size(n)
    self.n_bins = bins

    entries_per_bin = n / float(bins)
    self.io.debug("Bins are set to " + str(bins) + "\t " + str(entries_per_bin) + "per bin")
138
def set_limits(self, x):
    """
    Limits the data to the fitted range [self.min, self.max].

    Scalars are returned clipped. Arrays (anything with at least one
    dimension that supports boolean-mask assignment) are clipped in place
    and returned. Lists and tuples are copied into a new float array first,
    because they support neither element-wise comparison nor mask assignment.

    :param x: Input data, scalar or array type
    :return: Limited data
    """
    lower, upper = self.min, self.max

    if np.ndim(x) == 0:
        # Scalar input: nothing can be modified in place, so the clipped
        # value is only available through the return value.
        if x < lower:
            x = lower
        if x > upper:
            x = upper
        return x

    if isinstance(x, (list, tuple)):
        x = np.array(x, dtype=float)

    # In-place clipping, so a caller holding a reference to the array sees
    # the limits applied even if it ignores the return value.
    x[x > upper] = upper
    x[x < lower] = lower
    return x
158
159
def get_optimal_bin_size(n):
    """
    Number of bins used for a sample of n events: three times the cube
    root of n, truncated to an integer.

    :param n: number of Events
    :return: optimal bin size
    """
    cube_root = n ** (1 / 3.0)
    return int(3 * cube_root)
168
169
def get_average_in_bins(n):
    """
    Average number of entries per bin when n entries are spread over the
    number of bins given by get_optimal_bin_size(n).

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    bins = float(get_optimal_bin_size(n))
    return n / bins
177
178
180
181 """
182 Calculates the cumulative distribution (CDF)
183 Can be used for the flat transformation.
184
185 Attributes
186 ----------
187 spline : InterpolatedUnivariateSpline
188 Spline, fitting the CDF
189
190 """
191
def __init__(self, *args):
    """Create an unfitted CDF transformation.

    :param args: Optional positional arguments, forwarded to
        ``Transform.__init__`` after the fixed name ``"CDF"``
    """
    Transform.__init__(self, "CDF", *args)

    # Spline fitting the CDF; stays None until _fit() has built it
    self.spline = None
201
def _fit(self, x, y=None):
    """
    Fit function calculates the cumulative distribution with numpy percentile.

    ``2 * n_bins`` equally spaced percentiles of ``x`` are used as support
    points. Support points that occur more than once are spread out towards
    the next larger support point, so that the x values handed to the
    spline are strictly increasing. The support points are kept in
    ``self.x`` and the spline mapping them to their percentile (0..100)
    in ``self.spline``.

    :param x: Input distribution
    :param y: Will not be used in this transformation
    """
    self.io.debug("Fitting CDF")
    percentiles = np.linspace(0, 100, 2 * self.n_bins)
    support = np.asarray(np.percentile(x, percentiles), dtype=float)

    # np.unique returns the distinct values in ascending order
    values, counts = np.unique(support, return_counts=True)
    for i in np.flatnonzero(counts > 1):
        start = values[i]
        # The largest value has no successor; open a small artificial gap
        stop = values[i + 1] if i + 1 < len(values) else start + 0.01
        # endpoint=False keeps the last filled value strictly below `stop`;
        # otherwise it would duplicate the next support point.
        support[support == start] = np.linspace(start, stop, counts[i], endpoint=False)

    # Support points on the original axis, read by Flat.get_flat_bins()
    self.x = support
    self.spline = InterpolatedUnivariateSpline(support, percentiles)
227
def _transform(self, x):
    """
    Limits the input to the fitted range and evaluates the CDF spline on it.

    :param x: Input data
    :return: Spline value(s) for the limited input
    """
    limited = self.set_limits(x)
    return self.spline(limited)
236
237
239
240 """
241 This transformation uses the CDF to transform input data to a
242 flat transformation.
243
244 Attributes
245 ----------
246 cdf : Transform.CDF
247 Transformation with the CDF
248
249 """
250
def __init__(self, *args):
    """Create an unfitted flat transformation.

    :param args: Optional positional arguments, forwarded both to
        ``Transform.__init__`` (after the fixed name ``"Flat"``) and to
        the wrapped ``CDF``
    """
    Transform.__init__(self, "Flat", *args)

    # CDF transformation that does the actual work
    self.cdf = CDF(*args)
260
def _fit(self, x, y=None):
    """
    Fits the wrapped CDF transformation to the input distribution.

    :param x: Input distribution
    :param y: Will not be used in this transformation
    """
    self.io.debug("Fitting Flat")
    wrapped = self.cdf
    wrapped.fit(x)
270
def _transform(self, x):
    """
    Transforms the input data with the wrapped CDF transformation.
    If the transformation has not been fitted yet, it is first fitted
    to the very data that is about to be transformed.

    :param x: Input data
    :return: Transformed data
    """
    already_fitted = self.is_processed
    if not already_fitted:
        self.fit(x)

    wrapped = self.cdf
    return wrapped.transform(x)
280
def get_flat_bins(self):
    """
    Returns the ``x`` attribute of the wrapped CDF transformation.

    NOTE(review): relies on the CDF object carrying an ``x`` attribute;
    confirm that it is set before this is called.

    :return: Binning for a flat distribution
    """
    wrapped = self.cdf
    return wrapped.x
287
def get_x(self, x_flat):
    """
    Approximate inverse of the flat transformation.

    Evaluates the CDF spline on a regular grid of ``n_bins * 50`` points
    between ``self.min`` and ``self.max`` and returns the first grid point
    whose CDF value exceeds ``x_flat``.

    :param x_flat: x value in the flat distribution
    :return: x value on the original axis (approx); ``self.max`` if no
        grid point has a CDF value above ``x_flat``
    """
    grid = np.linspace(self.min, self.max, self.n_bins * 50)

    # One vectorised spline evaluation instead of one call per grid point
    above = np.flatnonzero(np.asarray(self.cdf.spline(grid)) > x_flat)
    if above.size == 0:
        # x_flat is at or beyond the largest CDF value on the grid: clamp
        # to the upper edge instead of implicitly returning None.
        return self.max
    return grid[above[0]]
spline
Spline, fitting the CDF.
Definition transform.py:200
cdf
Transformation with the CDF.
Definition transform.py:259
transform(self, x, set_limits=False)
Definition transform.py:110
n_bins
Binning in x, will be set automatically.
Definition transform.py:55
int min
Minimum of the fitted distribution.
Definition transform.py:61
__init__(self, name="Original", n_bins=None)
Definition transform.py:48
int max
Maximum of the fitted distribution.
Definition transform.py:58