Belle II Software  release-06-00-14
transform.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 """ Transformation classes
13 
14 In this file all classes for the transformation methods are defined.
15 The base class is Transform.
16 
17 
18 """
19 
20 
21 from alignment.fancystuff.settings import ProTool
22 
23 import numpy as np
24 import pandas as pd
25 from scipy.interpolate import InterpolatedUnivariateSpline
26 
27 
class Transform(ProTool):

    """
    Base class for the transformations.

    Sub classes overwrite ``_fit()`` and ``_transform()``; users call
    ``fit()`` and ``transform()`` (or call the instance directly).

    Attributes
    ----------
    n_bins : int, optional
        Binning in x, will be set automatically
    max : float
        Maximum of the fitted distribution
    min : float
        Minimum of the fitted distribution
    is_processed : bool
        Status flag
    name : str
        Name of the transformation

    """

    def __init__(self, name="Original", n_bins=None):
        """ Init function

        :param name: Name
        :param n_bins: Binning for the transformations
        """
        # Binning in x, will be set automatically
        self.n_bins = n_bins

        # Maximum of the fitted distribution
        self.max = 0

        # Minimum of the fitted distribution
        self.min = 0

        # Status flag, set to True at the end of fit()
        self.is_processed = False

        # Name of the transformation
        self.name = name

        # NOTE(review): self.io (used for debug output below) is not set in
        # this class; presumably ProTool provides it -- confirm.
        ProTool.__init__(self, "Transform." + self.name)

    def _initialise(self, x):
        """
        Sets limits for the data.
        Not called by the user.

        :param x: array type
        """
        self.io.debug("Initiating " + self.name)
        # Only derive the binning when the user did not supply one.
        if self.n_bins is None:
            self.set_n_bins(len(x))
        self.max = np.max(x)
        self.min = np.min(x)

    def fit(self, x, y=None):
        """
        The fit function calls the individual _fit() functions.

        :param x: Distribution to fit, array type
        :param y: optional for some transformations, sets signal class
        """
        self._initialise(x)
        self._fit(x, y)
        self.is_processed = True

    def __call__(self, x):
        """ Call function calls transform
        :param x: Input data
        :return: Transformed data
        """
        return self.transform(x)

    def _fit(self, x, y=None):
        """
        This is defined in the children and overwritten.
        In the base class it does nothing.

        :param x: array x values
        :param y: class variable [1,0]

        """

    def transform(self, x, set_limits=False):
        """
        Applies the transformation, optionally limiting the input first.

        :param x: Distribution to transform, array type
        :param set_limits: Limits the range of the data to the fitted range
        :return: Transformed data
        """
        if set_limits:
            # Keep the returned value: arrays are modified in place, but a
            # scalar can only be limited through the return value.
            x = self.set_limits(x)
        return self._transform(x)

    def _transform(self, x):
        """
        This is defined in the children and overwritten.
        In the base class it does nothing and returns the original distribution.

        :param x: Distribution to transform, array type
        :return: Transformed data
        """
        return x

    def set_n_bins(self, n):
        """
        Calculates the optimal size for the binning.
        :param n: Length of the input data
        """
        self.n_bins = get_optimal_bin_size(n)
        self.io.debug("Bins are set to " + str(self.n_bins) + "\t " + str(n / float(self.n_bins)) + "per bin")

    def set_limits(self, x):
        """
        Limits the data to the fitted range [self.min, self.max].

        Inputs that support ``len()`` are modified in place through boolean
        mask assignment and returned; anything else is treated as a scalar.

        :param x: Input data
        :return: Limited data
        """
        try:
            len(x)  # scalars raise TypeError here
        except TypeError:
            # Scalar input: limit by plain comparison.
            if x < self.min:
                x = self.min
            if x > self.max:
                x = self.max
            return x
        # In-place on purpose: callers may rely on their array being limited.
        x[x > self.max] = self.max
        x[x < self.min] = self.min
        return x
156 
157 
def get_optimal_bin_size(n):
    """
    This function calculates the optimal amount of bins for the number of events n.

    The result is three times the cube root of n, truncated to an integer.

    :param n: number of Events
    :return: optimal bin size

    """
    return int(3 * n ** (1 / 3.0))
166 
167 
def get_average_in_bins(n):
    """
    Returns the expected amount of entries in each bin.

    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    # float() keeps this a true division whatever the type of n.
    return n / float(get_optimal_bin_size(n))
175 
176 
class CDF(Transform):

    """
    Calculates the cumulative distribution (CDF)
    Can be used for the flat transformation.

    Attributes
    ----------
    spline : InterpolatedUnivariateSpline
        Spline, fitting the CDF
    x : numpy.ndarray
        Percentile positions in x the spline was fitted on

    """

    def __init__(self, *args):
        """ Init function

        :param args: Forwarded to Transform.__init__ after the name "CDF"
        """
        Transform.__init__(self, "CDF", *args)

        # Spline, fitting the CDF; None until _fit() has run
        self.spline = None

        # Percentile positions used as spline nodes; None until _fit() has run
        self.x = None

    def _fit(self, x, y=None):
        """
        Fit function calculates the cumulative distribution with numpy percentile.

        :param x: Input distribution
        :param y: Will not be used in this transformation
        """
        self.io.debug("Fitting CDF")
        # Percentile levels from 0 to 100, two per bin.
        y_ = np.linspace(0, 100, 2 * self.n_bins)
        x_ = pd.Series(np.percentile(x, list(y_)))

        # Count same values
        vc = x_.value_counts()
        vc = vc.sort_index()

        # Replace same values: the spline needs strictly increasing x, so a
        # run of identical percentiles is spread towards the next value.
        for i, xi in enumerate(vc):
            if xi > 1:
                try:
                    nex_val = vc.index[i + 1]
                except IndexError:
                    # Last unique value: spread over a small fixed step.
                    nex_val = vc.index[i] + 0.01
                # endpoint=False keeps the last filled value below nex_val;
                # including it would recreate a duplicate with the next entry.
                fill = np.linspace(vc.index[i], nex_val, xi, endpoint=False)
                x_[x_ == vc.index[i]] = fill
        self.x = np.asarray(x_)
        self.spline = InterpolatedUnivariateSpline(x_, y_)

    def _transform(self, x):
        """
        Transforms the input data according to the cdf.

        The input is first limited to the fitted range via set_limits().

        :param x: Input data
        :return: Transformed data
        """
        x = self.set_limits(x)
        return self.spline(x)
234 
235 
class Flat(Transform):

    """
    This transformation uses the CDF to transform input data to a
    flat transformation.

    Attributes
    ----------
    cdf : Transform.CDF
        Transformation with the CDF

    """

    def __init__(self, *args):
        """ Init function

        :param args: Forwarded to Transform.__init__ (after the name "Flat")
            and to the internal CDF
        """
        Transform.__init__(self, "Flat", *args)

        # Transformation with the CDF
        self.cdf = CDF(*args)

    def _fit(self, x, y=None):
        """
        Fit function delegates to the fit of the internal CDF.

        :param x: Input distribution
        :param y: Will not be used in this transformation
        """
        self.io.debug("Fitting Flat")
        self.cdf.fit(x)

    def _transform(self, x):
        """
        Transforms the input data according to the cdf.
        If the transformation was not fitted yet, it is fitted on x first.

        :param x: Input data
        :return: Transformed data
        """
        if not self.is_processed:
            self.fit(x)
        return self.cdf.transform(x)

    def get_flat_bins(self):
        """
        Returns the binning of the CDF
        :return: Binning for a flat distribution
        """
        # NOTE(review): relies on the CDF object exposing an `x` attribute.
        return self.cdf.x

    def get_x(self, x_flat):
        """
        Dirty version for getting the original x value out of a flat x value.

        Scans n_bins * 50 equidistant points between min and max and returns
        the first one whose CDF value exceeds x_flat.

        :param x_flat: x value in the flat distribution
        :return: x value on the original axis (approx), or None if no scanned
            point has a CDF value above x_flat
        """
        x_cumul = np.linspace(self.min, self.max, self.n_bins * 50)
        # One vectorised spline evaluation instead of one call per point.
        above = np.flatnonzero(self.cdf.spline(x_cumul) > x_flat)
        if above.size == 0:
            return None
        return x_cumul[above[0]]
def _fit(self, x, y=None)
Definition: transform.py:200
spline
Spline, fitting the CDF.
Definition: transform.py:198
cdf
Transformation with the CDF.
Definition: transform.py:257
n_bins
Binning in x, will be set automatically.
Definition: transform.py:56
def transform(self, x, set_limits=False)
Definition: transform.py:111
min
Minimum of the fitted distribution.
Definition: transform.py:62
max
Maximum of the fitted distribution.
Definition: transform.py:59
name
Name of the transformation.
Definition: transform.py:68
def __init__(self, name="Original", n_bins=None)
Definition: transform.py:49