4 """ Transformation classes
6 In this file all classes for the transformation methods are defined.
7 The base class is Transform.
14 from .settings
import ProTool
18 from scipy.interpolate
import InterpolatedUnivariateSpline
24 Base Class for the transformations.
25 The function _fit() is overwritten by the sub classes.
29 n_bins : int, optional
30 Binning in x, will be set automatically
32 Maximum of the fitted distribution
34 Minimum of the fitted distribution
38 Name of the transformation
42 def __init__(self, name="Original", n_bins=None):
46 :param n_bins: Binning for the transformations
63 ProTool.__init__(self,
"Transform." + self.
name)
67 Sets limits for the data.
68 Not called by the user.
72 self.
io.debug(
"Initiating " + self.
name)
78 def fit(self, x, y=None):
The fit function calls the individual _fit() functions.
82 :param x: Distribution to fit, array type
83 :param y: optional for some transformations, sets signal class
90 """ Call function calls transform
92 :return: Transformed data
98 This is defined in the children and overwritten.
99 :param x: array x values
100 :param y: class variable [1,0]
106 This is defined in the children and overwritten.
107 :param x: Distribution to transform, array type
108 :param set_limits: Limits the range of the data to the fitted range
109 :return: Transformed data
113 return self._transform(x)
117 This is defined in the children and overwritten.
118 In the base class it does nothing and returns the original distribution.
120 :param x: Distribution to transform, array type
121 :return: Transformed data
127 Calculates the optimal size for the binning.
128 :param n: Length of the input data
130 self.
n_bins = get_optimal_bin_size(n)
131 self.
io.debug(
"Bins are set to " + str(self.
n_bins) +
"\t " + str(n / float(self.
n_bins)) +
"per bin")
135 Limits the data to the fitted range.
137 :return: Limited data
141 x[x > self.
max] = self.
max
142 x[x < self.
min] = self.
min
def get_optimal_bin_size(n):
    """
    Calculate the optimal number of bins for ``n`` events.

    The bin count scales with the cube root of the sample size and is
    truncated to an integer: ``int(3 * n ** (1 / 3.0))``.

    :param n: number of events
    :return: optimal bin size
    """
    # The literal 3.0 keeps the exponent a float even where ``/`` on two
    # integers would perform integer division.
    return int(3 * n ** (1 / 3.0))
def get_average_in_bins(n):
    """
    Return the expected number of entries in each bin.

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    # float() forces true division regardless of the type of ``n``.
    return n / float(get_optimal_bin_size(n))
173 Calculates the cumulative distribution (CDF)
174 Can be used for the flat transformation.
178 spline : InterpolatedUnivariateSpline
179 Spline, fitting the CDF
188 Transform.__init__(self,
"CDF", *args)
195 Fit function calculates the cumulative distribution with numpy percentile.
197 :param x: Input distribution
198 :param y: Will not be used in this transformation
200 self.
io.debug(
"Fitting CDF")
201 y_ = np.linspace(0, 100, 2 * self.
n_bins)
202 x_ = pd.Series(np.percentile(x, list(y_)))
205 vc = x_.value_counts()
209 for i, xi
in enumerate(vc):
212 nex_val = vc.index[i + 1]
214 nex_val = vc.index[i] + 0.01
215 fill = np.linspace(vc.index[i], nex_val, xi)
216 x_[x_ == vc.index[i]] = fill
217 self.
spline = InterpolatedUnivariateSpline(x_, y_)
221 Transforms the input data according to the cdf.
223 :return: Transformed data
232 This transformation uses the CDF to transform input data to a
238 Transformation with the CDF
247 Transform.__init__(self,
"Flat", *args)
254 Fit function calculates the cumulative distribution with numpy percentile.
:param x: Input distribution
257 :param y: Will not be used in this transformation
259 self.
io.debug(
"Fitting Flat")
264 Transforms the input data according to the cdf.
266 :return: Transformed data
274 Returns the binning of the CDF
275 :return: Binning for a flat distribution
281 Dirty version for getting the original x value out of a flat x value.
282 :param x_flat: x value in the flat distribution
283 :return: x value on the original axis (approx)
285 x_cum = np.linspace(self.
min, self.
max, self.
n_bins * 50)
287 if self.
cdf.spline(xx) > x_flat: