Belle II Software development
transform.py
1#!/usr/bin/env python3
2
3
10
11""" Transformation classes
12
13In this file all classes for the transformation methods are defined.
14The base class is Transform.
15
16
17"""
18
19
20from alignment.fancystuff.settings import ProTool
21
22import numpy as np
23import pandas as pd
24from scipy.interpolate import InterpolatedUnivariateSpline
25
26
28
29 """
30 Base Class for the transformations.
31 The function _fit() is overwritten by the sub classes.
32
33 Attributes
34 ----------
35 n_bins : int, optional
36 Binning in x, will be set automatically
37 max : float
38 Maximum of the fitted distribution
39 min : float
40 Minimum of the fitted distribution
41 is_processed : bool
42 Status flag
43 name : str
44 Name of the transformation
45
46 """
47
def __init__(self, name="Original", n_bins=None):
    """
    Set up the default state of a transformation.

    :param name: Name of the transformation, appended to "Transform." for the ProTool name
    :param n_bins: Binning for the transformations; if None it is chosen in _initialise()
    """
    # Status flag, set to True at the end of fit()
    self.is_processed = False

    # Limits of the fitted distribution, overwritten in _initialise()
    self.min = 0
    self.max = 0

    # Binning in x
    self.n_bins = n_bins

    # Name of the transformation
    # NOTE(review): the class documentation calls this attribute "name" - confirm the spelling
    self.namename = name

    ProTool.__init__(self, "Transform." + self.namename)
70
def _initialise(self, x):
    """
    Prepares a fit: picks the binning if none was given and stores the
    range of the data.
    Not called by the user.

    :param x: array type
    """
    self.io.debug("Initiating " + self.namename)

    # Only derive the binning from the data size when the user did not fix it.
    binning_missing = self.n_bins is None
    if binning_missing:
        self.set_n_bins(len(x))

    upper = np.max(x)
    lower = np.min(x)
    self.max, self.min = upper, lower
83
def fit(self, x, y=None):
    """
    Fits the transformation by calling the individual _fit() function.

    :param x: Distribution to fit, array type
    :param y: optional for some transformations, sets signal class
    """
    # The limits and the binning have to be set before the actual fit runs.
    self._initialise(x)
    self._fit(x, y)

    # From here on the transformation counts as fitted.
    self.is_processed = True
94
def __call__(self, x):
    """ Makes the object callable; the call is forwarded to transform
    :param x: Input data
    :return: Transformed data
    """
    transformed = self.transform(x)
    return transformed
101
def _fit(self, x, y=None):
    """
    Hook for the actual fit. It is overwritten in the children;
    this implementation does nothing.

    :param x: array x values
    :param y: class variable [1,0]
    """
    return None
109
def transform(self, x, set_limits=False):
    """
    Transforms a distribution, optionally limiting it to the fitted range first.

    The transformation itself is done by _transform(), which is overwritten
    in the children.

    :param x: Distribution to transform, array type or a single value
    :param set_limits: Limits the range of the data to the fitted range
    :return: Transformed data
    """
    if set_limits:
        # Keep the returned value: set_limits() changes arrays in place, but a
        # single value can only be limited through the return value.
        x = self.set_limits(x)
    return self._transform(x)
120
def _transform(self, x):
    """
    Hook for the actual transformation, overwritten in the children.
    The base class leaves the distribution untouched and hands it back.

    :param x: Distribution to transform, array type
    :return: Transformed data
    """
    untouched = x
    return untouched
130
def set_n_bins(self, n):
    """
    Sets the binning to the optimal size for the given amount of data
    and reports the resulting entries per bin on debug level.

    :param n: Length of the input data
    """
    self.n_bins = get_optimal_bin_size(n)
    entries_per_bin = n / float(self.n_bins)
    self.io.debug("Bins are set to " + str(self.n_bins) + "\t " + str(entries_per_bin) + "per bin")
138
def set_limits(self, x):
    """
    Limits the data to the fitted range [min, max].

    Input that supports len() and boolean-mask assignment is changed in
    place; anything else is handled as a single value.

    :param x: Input data
    :return: Limited data
    """
    upper = self.max
    lower = self.min
    try:
        len(x)  # raises TypeError for single values
        x[x > upper] = upper
        x[x < lower] = lower
    except TypeError:
        # Single value: compare directly against the limits.
        if x < lower:
            x = lower
        if x > upper:
            x = upper
    return x
155
156
def get_optimal_bin_size(n):
    """
    Calculates the optimal amount of bins for the number of events n:
    three times the cube root of n, cut off to an integer.

    :param n: number of Events
    :return: optimal bin size
    """
    cube_root = n ** (1 / 3.0)
    n_bins = 3 * cube_root
    return int(n_bins)
165
166
def get_average_in_bins(n):
    """
    Calculates the expected amount of entries in each bin when the
    optimal binning is used.

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    n_bins = get_optimal_bin_size(n)
    return n / float(n_bins)
174
175
177
178 """
179 Calculates the cumulative distribution (CDF)
180 Can be used for the flat transformation.
181
182 Attributes
183 ----------
184 spline : InterpolatedUnivariateSpline
185 Spline, fitting the CDF
186
187 """
188
def __init__(self, *args):
    """ Init function

    :param args: Passed on to Transform.__init__ after the fixed name "CDF"
    """
    Transform.__init__(self, "CDF", *args)

    # Spline fitting the CDF, created in _fit()
    self.spline = None
198
def _fit(self, x, y=None):
    """
    Fit function calculates the cumulative distribution with numpy percentile
    and interpolates it with a spline.

    The percentiles are taken at 2 * n_bins evenly spaced points between
    0 and 100. Percentile values that occur more than once are spread out,
    because the spline needs strictly increasing x values.

    :param x: Input distribution
    :param y: Will not be used in this transformation
    """
    self.io.debug("Fitting CDF")
    y_ = np.linspace(0, 100, 2 * self.n_bins)
    x_ = pd.Series(np.percentile(x, list(y_)))

    # Count same values, ordered by value
    vc = x_.value_counts().sort_index()
    values = vc.index

    # Replace same values
    for i, count in enumerate(vc):
        if count <= 1:
            continue
        if i + 1 < len(values):
            nex_val = values[i + 1]
        else:
            # No larger value available: spread over a small fixed interval.
            nex_val = values[i] + 0.01
        # endpoint=False keeps the fill strictly below the next value. With
        # the end point included the fill would collide with the next value:
        # x_ would still hold a duplicate, and a repeated next value would
        # then match one element more than its own fill provides.
        fill = np.linspace(values[i], nex_val, count, endpoint=False)
        x_[x_ == values[i]] = fill
    self.spline = InterpolatedUnivariateSpline(x_, y_)
224
def _transform(self, x):
    """
    Evaluates the fitted cdf spline on the input data, after the data
    has been limited with set_limits().

    :param x: Input data
    :return: Transformed data
    """
    limited = self.set_limits(x)
    return self.spline(limited)
233
234
236
237 """
238 This transformation uses the CDF to transform input data to a
239 flat transformation.
240
241 Attributes
242 ----------
243 cdf : Transform.CDF
244 Transformation with the CDF
245
246 """
247
def __init__(self, *args):
    """ Init function

    :param args: Passed on to Transform.__init__ after the fixed name "Flat",
                 and to the constructor of the internal CDF
    """
    Transform.__init__(self, "Flat", *args)

    # Transformation with the CDF
    self.cdf = CDF(*args)
257
def _fit(self, x, y=None):
    """
    Fit function hands the distribution over to the internal CDF
    transformation, which is fitted to it.

    :param x: Input distribution
    :param y: Will not be used in this transformation
    """
    self.io.debug("Fitting Flat")
    cdf = self.cdf
    cdf.fit(x)
267
def _transform(self, x):
    """
    Transforms the input data according to the cdf.
    If the transformation has not been fitted yet, it is fitted to the
    input data first.

    :param x: Input data
    :return: Transformed data
    """
    already_fitted = self.is_processed
    if not already_fitted:
        self.fit(x)
    flat = self.cdf.transform(x)
    return flat
277
def get_flat_bins(self):
    """
    Returns the binning of the CDF

    :return: Binning for a flat distribution
    """
    # NOTE(review): reads the attribute "x" of the internal CDF object; confirm
    # that the CDF transformation really provides it before relying on this.
    binning = self.cdf.x
    return binning
284
def get_x(self, x_flat):
    """
    Dirty version for getting the original x value out of a flat x value.

    The fitted range [min, max] is scanned on a grid of n_bins * 50 points
    and the first grid point whose cdf spline value is larger than x_flat
    is returned.

    :param x_flat: x value in the flat distribution
    :return: x value on the original axis (approx), or None if the spline
             never exceeds x_flat on the grid
    """
    x_cumul = np.linspace(self.min, self.max, self.n_bins * 50)
    # One vectorised spline evaluation instead of one call per grid point.
    above = np.asarray(self.cdf.spline(x_cumul)) > x_flat
    hits = np.flatnonzero(above)
    if hits.size == 0:
        return None
    return x_cumul[hits[0]]
def _fit(self, x, y=None)
Definition: transform.py:199
spline
Spline, fitting the CDF.
Definition: transform.py:197
cdf
Transformation with the CDF.
Definition: transform.py:256
n_bins
Binning in x, will be set automatically.
Definition: transform.py:55
def transform(self, x, set_limits=False)
Definition: transform.py:110
min
Minimum of the fitted distribution.
Definition: transform.py:61
max
Maximum of the fitted distribution.
Definition: transform.py:58
name
Name of the transformation.
Definition: transform.py:67
def __init__(self, name="Original", n_bins=None)
Definition: transform.py:48