12 """ Transformation classes
14 In this file all classes for the transformation methods are defined.
15 The base class is Transform.
25 from scipy.interpolate
import InterpolatedUnivariateSpline
31 Base Class for the transformations.
32 The function _fit() is overwritten by the sub classes.
36 n_bins : int, optional
37 Binning in x, will be set automatically
39 Maximum of the fitted distribution
41 Minimum of the fitted distribution
45 Name of the transformation
49 def __init__(self, name="Original", n_bins=None):
53 :param n_bins: Binning for the transformations
70 ProTool.__init__(self,
"Transform." + self.
namenamename)
74 Sets limits for the data.
75 Not called by the user.
79 self.
ioio.debug(
"Initiating " + self.
namenamename)
80 if self.
n_binsn_bins
is None:
82 self.
maxmax = np.max(x)
83 self.
minmin = np.min(x)
85 def fit(self, x, y=None):
87 The fit function calls the individual _fit() functions.
89 :param x: Distribution to fit, array type
90 :param y: optional for some transformations, sets signal class
97 """ Call function calls transform
99 :return: Transformed data
105 This is defined in the children and overwritten.
106 :param x: array x values
107 :param y: class variable [1,0]
113 This is defined in the children and overwritten.
114 :param x: Distribution to transform, array type
115 :param set_limits: Limits the range of the data to the fitted range
116 :return: Transformed data
120 return self._transform(x)
124 This is defined in the children and overwritten.
125 In the base class it does nothing and returns the original distribution.
127 :param x: Distribution to transform, array type
128 :return: Transformed data
134 Calculates the optimal size for the binning.
135 :param n: Length of the input data
137 self.
n_binsn_bins = get_optimal_bin_size(n)
138 self.
ioio.debug(
"Bins are set to " + str(self.
n_binsn_bins) +
"\t " + str(n / float(self.
n_binsn_bins)) +
"per bin")
142 Limits the data to the fitted range.
144 :return: Limited data
148 x[x > self.
maxmax] = self.
maxmax
149 x[x < self.
minmin] = self.
minmin
def get_optimal_bin_size(n):
    """
    Calculate the number of bins to use for a sample of ``n`` events.

    The bin count is ``3 * n ** (1/3)``, truncated to an integer.

    :param n: number of events
    :return: number of bins as an int (0 when ``n`` is 0)
    """
    cube_root = n ** (1 / 3.0)
    # int() truncates, so the bin count is always rounded down.
    return int(3 * cube_root)
def get_average_in_bins(n):
    """
    Return the expected number of entries in each bin.

    :param n: Length of the data
    :return: ``n`` divided by ``get_optimal_bin_size(n)``
    """
    n_bins = get_optimal_bin_size(n)
    # Dividing by a float forces true (non-integer) division.
    return n / float(n_bins)
180 Calculates the cumulative distribution (CDF)
181 Can be used for the flat transformation.
185 spline : InterpolatedUnivariateSpline
186 Spline, fitting the CDF
195 Transform.__init__(self,
"CDF", *args)
202 Fit function calculates the cumulative distribution with numpy percentile.
204 :param x: Input distribution
205 :param y: Will not be used in this transformation
207 self.
ioio.debug(
"Fitting CDF")
208 y_ = np.linspace(0, 100, 2 * self.
n_binsn_bins)
209 x_ = pd.Series(np.percentile(x, list(y_)))
212 vc = x_.value_counts()
216 for i, xi
in enumerate(vc):
219 nex_val = vc.index[i + 1]
221 nex_val = vc.index[i] + 0.01
222 fill = np.linspace(vc.index[i], nex_val, xi)
223 x_[x_ == vc.index[i]] = fill
224 self.
splinespline = InterpolatedUnivariateSpline(x_, y_)
228 Transforms the input data according to the cdf.
230 :return: Transformed data
233 return self.
splinespline(x)
239 This transformation uses the CDF to transform input data to a
245 Transformation with the CDF
254 Transform.__init__(self,
"Flat", *args)
261 Fit function calculates the cumulative distribution with numpy percentile.
263 :param x: Input distribution
264 :param y: Will not be used in this transformation
266 self.
ioio.debug(
"Fitting Flat")
271 Transforms the input data according to the cdf.
273 :return: Transformed data
281 Returns the binning of the CDF
282 :return: Binning for a flat distribution
288 Dirty version for getting the original x value out of a flat x value.
289 :param x_flat: x value in the flat distribution
290 :return: x value on the original axis (approx)
292 x_cumul = np.linspace(self.
minmin, self.
maxmax, self.
n_binsn_bins * 50)
294 if self.
cdfcdf.spline(xx) > x_flat: