Belle II Software  release-08-01-10
transform.py
1 #!/usr/bin/env python3
2 
3 
10 
11 """ Transformation classes
12 
13 In this file all classes for the transformation methods are defined.
14 The base class is Transform.
15 
16 
17 """
18 
19 
20 from alignment.fancystuff.settings import ProTool
21 
22 import numpy as np
23 import pandas as pd
24 from scipy.interpolate import InterpolatedUnivariateSpline
25 
26 
class Transform(ProTool):

    """
    Base class for the transformations.

    Sub classes overwrite ``_fit()`` and ``_transform()``. Users call
    ``fit()`` followed by ``transform()`` (or call the instance directly).

    Attributes
    ----------
    n_bins : int, optional
        Binning in x, will be set automatically
    max : float
        Maximum of the fitted distribution
    min : float
        Minimum of the fitted distribution
    is_processed : bool
        Status flag
    name : str
        Name of the transformation

    """

    def __init__(self, name="Original", n_bins=None):
        """ Init function

        :param name: Name
        :param n_bins: Binning for the transformations
        """
        # Binning in x, will be set automatically
        self.n_bins = n_bins

        # Maximum of the fitted distribution
        self.max = 0

        # Minimum of the fitted distribution
        self.min = 0

        # Status flag, set to True once fit() has run
        self.is_processed = False

        # Name of the transformation
        self.name = name

        ProTool.__init__(self, "Transform." + self.name)

    def _initialise(self, x):
        """
        Sets limits for the data.
        Not called by the user.

        If no binning was given, it is derived from the length of x.

        :param x: array type
        """
        self.io.debug("Initiating " + self.name)
        if self.n_bins is None:
            self.set_n_bins(len(x))
        self.max = np.max(x)
        self.min = np.min(x)

    def fit(self, x, y=None):
        """
        The fit function calls the individual _fit() functions.

        The limits (and, if needed, the binning) are determined first,
        afterwards the transformation is flagged as processed.

        :param x: Distribution to fit, array type
        :param y: optional for some transformations, sets signal class
        """
        self._initialise(x)
        self._fit(x, y)
        self.is_processed = True

    def __call__(self, x):
        """ Call function calls transform
        :param x: Input data
        :return: Transformed data
        """
        return self.transform(x)

    def _fit(self, x, y=None):
        """
        This is defined in the children and overwritten.
        In the base class it does nothing.

        :param x: array x values
        :param y: class variable [1,0]

        """

    def transform(self, x, set_limits=False):
        """
        Applies the transformation implemented in _transform().

        :param x: Distribution to transform, array type
        :param set_limits: Limits the range of the data to the fitted range
        :return: Transformed data
        """
        if set_limits:
            # Keep the returned value: a scalar cannot be limited in place,
            # so ignoring the result would silently skip the limiting.
            x = self.set_limits(x)
        return self._transform(x)

    def _transform(self, x):
        """
        This is defined in the children and overwritten.
        In the base class it does nothing and returns the original distribution.

        :param x: Distribution to transform, array type
        :return: Transformed data
        """
        return x

    def set_n_bins(self, n):
        """
        Calculates the optimal size for the binning.
        :param n: Length of the input data
        """
        self.n_bins = get_optimal_bin_size(n)
        self.io.debug("Bins are set to " + str(self.n_bins) + "\t " + str(n / float(self.n_bins)) + "per bin")

    def set_limits(self, x):
        """
        Limits the data to the fitted range [min, max].

        Inputs that have a length are limited through boolean-mask
        assignment, i.e. the passed object itself is modified.
        Inputs without a length (scalars) are limited by value.

        :param x: Input data
        :return: Limited data
        """
        try:
            len(x)  # raises TypeError for scalars, handled below
            x[x > self.max] = self.max
            x[x < self.min] = self.min
        except TypeError:
            if x < self.min:
                x = self.min
            if x > self.max:
                x = self.max
        return x
155 
156 
def get_optimal_bin_size(n):
    """
    Determine the number of bins to use for a sample of n events.

    The result is three times the cube root of n, truncated to an integer.

    :param n: number of Events
    :return: optimal bin size
    """
    cube_root = n ** (1 / 3.0)
    return int(3 * cube_root)
165 
166 
def get_average_in_bins(n):
    """
    Compute the expected number of entries per bin.

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    n_bins = get_optimal_bin_size(n)
    return n / float(n_bins)
174 
175 
class CDF(Transform):

    """
    Calculates the cumulative distribution (CDF)
    Can be used for the flat transformation.

    Attributes
    ----------
    spline : InterpolatedUnivariateSpline
        Spline, fitting the CDF
    x : numpy.ndarray
        Support points (percentile positions) the spline was fitted on

    """

    def __init__(self, *args):
        """ Init function

        :param args: Forwarded to Transform.__init__ after the name "CDF"
        """
        Transform.__init__(self, "CDF", *args)

        # Spline, fitting the CDF
        self.spline = None

        # Support points of the spline, filled by _fit()
        self.x = None

    def _fit(self, x, y=None):
        """
        Fit function calculates the cumulative distribution with numpy percentile.

        The percentiles of x are evaluated on 2 * n_bins equidistant steps
        between 0 and 100 and a spline is fitted through
        (percentile position, percentile).

        :param x: Input distribution
        :param y: Will not be used in this transformation
        """
        self.io.debug("Fitting CDF")
        y_ = np.linspace(0, 100, 2 * self.n_bins)
        x_ = pd.Series(np.percentile(x, list(y_)))

        # Count how often each percentile position occurs, ordered by value
        vc = x_.value_counts().sort_index()
        n_unique = len(vc)

        # Spread repeated positions so that the spline input has no
        # duplicated x values
        for i, (value, count) in enumerate(zip(vc.index, vc)):
            if count <= 1:
                continue
            if i + 1 < n_unique:
                # Exclude the end point: it is the next unique value, which
                # is already present in x_ and would be duplicated otherwise.
                fill = np.linspace(value, vc.index[i + 1], count, endpoint=False)
            else:
                # No larger value exists; spread over a small fixed step
                fill = np.linspace(value, value + 0.01, count)
            x_[x_ == value] = fill

        self.x = x_.values
        self.spline = InterpolatedUnivariateSpline(x_, y_)

    def _transform(self, x):
        """
        Transforms the input data according to the cdf.

        The input is limited to the fitted range before the spline is
        evaluated.

        :param x: Input data
        :return: Transformed data
        """
        x = self.set_limits(x)
        return self.spline(x)
233 
234 
class Flat(Transform):

    """
    This transformation uses the CDF to transform input data to a
    flat transformation.

    Attributes
    ----------
    cdf : Transform.CDF
        Transformation with the CDF

    """

    def __init__(self, *args):
        """ Init function

        :param args: Forwarded to Transform.__init__ after the name "Flat"
                     and to the wrapped CDF
        """
        Transform.__init__(self, "Flat", *args)

        # Transformation with the CDF
        self.cdf = CDF(*args)

    def _fit(self, x, y=None):
        """
        Fit function delegates to the fit of the wrapped CDF.

        :param x: Input distribution
        :param y: Will not be used in this transformation
        """
        self.io.debug("Fitting Flat")
        self.cdf.fit(x)

    def _transform(self, x):
        """
        Transforms the input data according to the cdf.

        If the transformation was not fitted yet, it is fitted on x first.

        :param x: Input data
        :return: Transformed data
        """
        if not self.is_processed:
            self.fit(x)
        return self.cdf.transform(x)

    def get_flat_bins(self):
        """
        Returns the binning of the CDF
        :return: Binning for a flat distribution
        """
        # NOTE(review): relies on the wrapped CDF exposing an attribute `x`;
        # confirm that CDF populates it during fit, otherwise this raises
        # AttributeError.
        return self.cdf.x

    def get_x(self, x_flat):
        """
        Dirty version for getting the original x value out of a flat x value.

        The CDF spline is evaluated on a grid of n_bins * 50 points between
        min and max; the first grid point whose CDF value exceeds x_flat is
        returned.

        :param x_flat: x value in the flat distribution
        :return: x value on the original axis (approx), None if no grid
                 point has a CDF value above x_flat
        """
        x_cumul = np.linspace(self.min, self.max, self.n_bins * 50)
        # One vectorised spline evaluation instead of one call per grid point
        above = np.flatnonzero(self.cdf.spline(x_cumul) > x_flat)
        if above.size == 0:
            return None
        return x_cumul[above[0]]
def _fit(self, x, y=None)
Definition: transform.py:199
spline
Spline, fitting the CDF.
Definition: transform.py:197
cdf
Transformation with the CDF.
Definition: transform.py:256
n_bins
Binning in x, will be set automatically.
Definition: transform.py:55
def transform(self, x, set_limits=False)
Definition: transform.py:110
min
Minimum of the fitted distribution.
Definition: transform.py:61
max
Maximum of the fitted distribution.
Definition: transform.py:58
name
Name of the transformation.
Definition: transform.py:67
def __init__(self, name="Original", n_bins=None)
Definition: transform.py:48