11 """ Transformation classes
13 In this file all classes for the transformation methods are defined.
14 The base class is Transform.
24 from scipy.interpolate
import InterpolatedUnivariateSpline
30 Base Class for the transformations.
31 The function _fit() is overwritten by the sub classes.
35 n_bins : int, optional
36 Binning in x, will be set automatically
38 Maximum of the fitted distribution
40 Minimum of the fitted distribution
44 Name of the transformation
48 def __init__(self, name="Original", n_bins=None):
52 :param n_bins: Binning for the transformations
69 ProTool.__init__(self,
"Transform." + self.
namenamename)
73 Sets limits for the data.
74 Not called by the user.
78 self.
ioio.debug(
"Initiating " + self.
namenamename)
79 if self.
n_binsn_bins
is None:
81 self.
maxmax = np.max(x)
82 self.
minmin = np.min(x)
84 def fit(self, x, y=None):
86 The fit function calls the individual _fit() functions.
88 :param x: Distribution to fit, array type
89 :param y: optional for some transformations, sets signal class
96 """ Call function calls transform
98 :return: Transformed data
104 This is defined in the children and overwritten.
105 :param x: array x values
106 :param y: class variable [1,0]
112 This is defined in the children and overwritten.
113 :param x: Distribution to transform, array type
114 :param set_limits: Limits the range of the data to the fitted range
115 :return: Transformed data
119 return self._transform(x)
123 This is defined in the children and overwritten.
124 In the base class it does nothing and returns the original distribution.
126 :param x: Distribution to transform, array type
127 :return: Transformed data
133 Calculates the optimal size for the binning.
134 :param n: Length of the input data
136 self.
n_binsn_bins = get_optimal_bin_size(n)
137 self.
ioio.debug(
"Bins are set to " + str(self.
n_binsn_bins) +
"\t " + str(n / float(self.
n_binsn_bins)) +
"per bin")
141 Limits the data to the fitted range.
143 :return: Limited data
147 x[x > self.
maxmax] = self.
maxmax
148 x[x < self.
minmin] = self.
minmin
def get_optimal_bin_size(n):
    """
    Calculate the optimal number of bins for ``n`` events.

    The estimate is ``3 * n ** (1 / 3)``, truncated to an integer. The result
    is clamped to at least one bin so that callers dividing by it (for example
    to compute the average number of entries per bin) never divide by zero.

    :param n: number of events (expected to be non-negative)
    :return: optimal number of bins, always >= 1
    """
    # Truncation makes the raw estimate 0 for n == 0; zero bins is never
    # usable downstream, so fall back to a single bin.
    return max(1, int(3 * n ** (1 / 3.0)))
def get_average_in_bins(n):
    """
    Return the expected number of entries in each bin.

    :param n: Length of the data
    :return: ``n`` divided by the optimal number of bins for ``n``
    """
    bin_count = float(get_optimal_bin_size(n))
    return n / bin_count
179 Calculates the cumulative distribution (CDF)
180 Can be used for the flat transformation.
184 spline : InterpolatedUnivariateSpline
185 Spline, fitting the CDF
194 Transform.__init__(self,
"CDF", *args)
201 Fit function calculates the cumulative distribution with numpy percentile.
203 :param x: Input distribution
204 :param y: Will not be used in this transformation
206 self.
ioio.debug(
"Fitting CDF")
207 y_ = np.linspace(0, 100, 2 * self.
n_binsn_bins)
208 x_ = pd.Series(np.percentile(x, list(y_)))
211 vc = x_.value_counts()
215 for i, xi
in enumerate(vc):
218 nex_val = vc.index[i + 1]
220 nex_val = vc.index[i] + 0.01
221 fill = np.linspace(vc.index[i], nex_val, xi)
222 x_[x_ == vc.index[i]] = fill
223 self.
splinespline = InterpolatedUnivariateSpline(x_, y_)
227 Transforms the input data according to the cdf.
229 :return: Transformed data
232 return self.
splinespline(x)
238 This transformation uses the CDF to transform input data to a
244 Transformation with the CDF
253 Transform.__init__(self,
"Flat", *args)
260 Fit function calculates the cumulative distribution with numpy percentile.
262 :param x: Input distribution
263 :param y: Will not be used in this transformation
265 self.
ioio.debug(
"Fitting Flat")
270 Transforms the input data according to the cdf.
272 :return: Transformed data
280 Returns the binning of the CDF
281 :return: Binning for a flat distribution
287 Dirty version for getting the original x value out of a flat x value.
288 :param x_flat: x value in the flat distribution
289 :return: x value on the original axis (approx)
291 x_cumul = np.linspace(self.
minmin, self.
maxmax, self.
n_binsn_bins * 50)
293 if self.
cdfcdf.spline(xx) > x_flat: