11""" Transformation classes
13In this file all classes for the transformation methods are defined.
14The base class is Transform.
24from scipy.interpolate
import InterpolatedUnivariateSpline
30 Base Class for the transformations.
31 The function
_fit()
is overwritten by the sub classes.
35 n_bins : int, optional
36 Binning
in x, will be set automatically
38 Maximum of the fitted distribution
40 Minimum of the fitted distribution
44 Name of the transformation
48 def __init__(self, name="Original", n_bins=None):
52 :param n_bins: Binning for the transformations
69 ProTool.__init__(self,
"Transform." + self.
namename)
73 Sets limits for the data.
74 Not called by the user.
84 def fit(self, x, y=None):
86 The fit function calls the individual
_fit() functions.
88 :param x: Distribution to fit, array type
89 :param y: optional
for some transformations, sets signal
class
96 """ Call function calls transform
98 :return: Transformed data
104 This is defined
in the children
and overwritten.
105 :param x: array x values
106 :param y:
class variable [1,0]
110 def transform(self, x, set_limits=False):
112 This is defined
in the children
and overwritten.
113 :param x: Distribution to transform, array type
114 :param set_limits: Limits the range of the data to the fitted range
115 :
return: Transformed data
123 This is defined
in the children
and overwritten.
124 In the base
class it does nothing and returns the original distribution.
126 :param x: Distribution to transform, array type
127 :
return: Transformed data
133 Calculates the optimal size for the binning.
134 :param n: Length of the input data
136 self.n_bins = get_optimal_bin_size(n)
137 self.io.debug("Bins are set to " + str(self.
n_bins) +
"\t " + str(n / float(self.
n_bins)) +
"per bin")
141 Limits the data to the fitted range.
143 :return: Limited data
147 x[x > self.
max] = self.
max
148 x[x < self.
min] = self.
min
def get_optimal_bin_size(n):
    """
    Calculate the optimal number of bins for the number of events n.

    The bin count is ``3 * n ** (1 / 3)`` truncated to an integer.

    :param n: number of events
    :return: optimal bin size (number of bins) as an int
    """
    # int() truncates toward zero, so the result is rounded down and
    # n = 0 yields 0 bins; callers dividing by the result must handle that.
    return int(3 * n ** (1 / 3.0))
def get_average_in_bins(n):
    """
    Return the expected number of entries in each bin.

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size (float)
    :raises ZeroDivisionError: if the optimal bin size for ``n`` is 0,
        which is the case for ``n == 0``
    """
    # float() keeps the division true division regardless of the type of n.
    return n / float(get_optimal_bin_size(n))
179 Calculates the cumulative distribution (CDF)
180 Can be used for the flat transformation.
184 spline : InterpolatedUnivariateSpline
185 Spline, fitting the CDF
194 Transform.__init__(self, "CDF", *args)
201 Fit function calculates the cumulative distribution with numpy percentile.
203 :param x: Input distribution
204 :param y: Will
not be used
in this transformation
206 self.io.debug("Fitting CDF")
207 y_ = np.linspace(0, 100, 2 * self.
n_bins)
208 x_ = pd.Series(np.percentile(x, list(y_)))
211 vc = x_.value_counts()
215 for i, xi
in enumerate(vc):
218 nex_val = vc.index[i + 1]
220 nex_val = vc.index[i] + 0.01
221 fill = np.linspace(vc.index[i], nex_val, xi)
222 x_[x_ == vc.index[i]] = fill
223 self.
spline = InterpolatedUnivariateSpline(x_, y_)
227 Transforms the input data according to the cdf.
229 :return: Transformed data
238 This transformation uses the CDF to transform input data to a
244 Transformation with the CDF
253 Transform.__init__(self, "Flat", *args)
260 Fit function calculates the cumulative distribution with numpy percentile.
262 :param x: Input distribution
263 :param y: Will
not be used
in this transformation
265 self.io.debug("Fitting Flat")
270 Transforms the input data according to the cdf.
272 :return: Transformed data
276 return self.
cdf.transform(x)
280 Returns the binning of the CDF
281 :return: Binning
for a flat distribution
287 Dirty version for getting the original x value out of a flat x value.
288 :param x_flat: x value
in the flat distribution
289 :
return: x value on the original axis (approx)
291 x_cumul = np.linspace(self.min, self.max, self.n_bins * 50)
293 if self.
cdf.spline(xx) > x_flat: