Belle II Software  release-05-02-19
transform.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 """ Transformation classes
5 
6 In this file all classes for the transformation methods are defined.
7 The base class is Transform.
8 
9 
10 """
11 __author__ = 'swehle'
12 
13 
14 from .settings import ProTool
15 
16 import numpy as np
17 import pandas as pd
18 from scipy.interpolate import InterpolatedUnivariateSpline
19 
20 
22 
23  """
24  Base Class for the transformations.
25  The function _fit() is overwritten by the sub classes.
26 
27  Attributes
28  ----------
29  n_bins : int, optional
30  Binning in x, will be set automatically
31  max : float
32  Maximum of the fitted distribution
33  min : float
34  Minimum of the fitted distribution
35  is_processed : bool
36  Status flag
37  name : str
38  Name of the transformation
39 
40  """
41 
42  def __init__(self, name="Original", n_bins=None):
43  """ Init function
44 
45  :param name: Name
46  :param n_bins: Binning for the transformations
47  """
48 
49  self.n_bins = n_bins
50 
51 
52  self.max = 0
53 
54 
55  self.min = 0
56 
57 
58  self.is_processed = False
59 
60 
61  self.name = name
62 
63  ProTool.__init__(self, "Transform." + self.name)
64 
65  def _initialise(self, x):
66  """
67  Sets limits for the data.
68  Not called by the user.
69 
70  :param x: array type
71  """
72  self.io.debug("Initiating " + self.name)
73  if self.n_bins is None:
74  self.set_n_bins(len(x))
75  self.max = np.max(x)
76  self.min = np.min(x)
77 
78  def fit(self, x, y=None):
79  """
80  The fit function is calls the individual _fit() functions.
81 
82  :param x: Distribution to fit, array type
83  :param y: optional for some transformations, sets signal class
84  """
85  self._initialise(x)
86  self._fit(x, y)
87  self.is_processed = True
88 
89  def __call__(self, x):
90  """ Call function calls transform
91  :param x: Input data
92  :return: Transformed data
93  """
94  return self.transform(x)
95 
96  def _fit(self, x, y=None):
97  """
98  This is defined in the children and overwritten.
99  :param x: array x values
100  :param y: class variable [1,0]
101 
102  """
103 
104  def transform(self, x, set_limits=False):
105  """
106  This is defined in the children and overwritten.
107  :param x: Distribution to transform, array type
108  :param set_limits: Limits the range of the data to the fitted range
109  :return: Transformed data
110  """
111  if set_limits:
112  self.set_limits(x)
113  return self._transform(x)
114 
115  def _transform(self, x):
116  """
117  This is defined in the children and overwritten.
118  In the base class it does nothing and returns the original distribution.
119 
120  :param x: Distribution to transform, array type
121  :return: Transformed data
122  """
123  return x
124 
125  def set_n_bins(self, n):
126  """
127  Calculates the optimal size for the binning.
128  :param n: Length of the input data
129  """
130  self.n_bins = get_optimal_bin_size(n)
131  self.io.debug("Bins are set to " + str(self.n_bins) + "\t " + str(n / float(self.n_bins)) + "per bin")
132 
133  def set_limits(self, x):
134  """
135  Limits the data to the fitted range.
136  :param x: Input data
137  :return: Limited data
138  """
139  try:
140  _ = len(x) # to catch exception
141  x[x > self.max] = self.max
142  x[x < self.min] = self.min
143  except TypeError:
144  if x < self.min:
145  x = self.min
146  if x > self.max:
147  x = self.max
148  return x
149 
150 
def get_optimal_bin_size(n):
    """
    Calculates the number of bins used for n events: three times the cube
    root of n, truncated to an integer.
    :param n: number of Events
    :return: optimal bin size

    """
    cube_root = n ** (1 / 3.0)
    return int(3 * cube_root)
159 
160 
def get_average_in_bins(n):
    """
    Expected number of entries per bin when n entries are spread over the
    optimal number of bins.
    :param n: Length of the data
    :return: Length of the data divided by the optimal bin size
    """
    n_bins = get_optimal_bin_size(n)
    return n / float(n_bins)
168 
169 
class CDF(Transform):

    """
    Calculates the cumulative distribution (CDF)
    Can be used for the flat transformation.

    Attributes
    ----------
    spline : InterpolatedUnivariateSpline
        Spline, fitting the CDF
    x : numpy.ndarray
        Percentile positions used as nodes of the spline, None before the fit

    """

    def __init__(self, *args):
        """ Init function

        :param args: Forwarded to Transform.__init__ after the fixed name "CDF"
        """
        Transform.__init__(self, "CDF", *args)

        # Spline, fitting the CDF
        self.spline = None

        # Nodes of the spline on the x axis, filled by _fit()
        self.x = None

    def _fit(self, x, y=None):
        """
        Fit function calculates the cumulative distribution with numpy percentile.

        The percentiles are evaluated at 2 * n_bins equidistant levels between
        0 and 100 and interpolated with a spline mapping x to the percentile
        level.

        :param x: Input distribution
        :param y: Will not be used in this transformation
        """
        self.io.debug("Fitting CDF")
        y_ = np.linspace(0, 100, 2 * self.n_bins)
        x_ = pd.Series(np.percentile(x, list(y_)))

        # Count same values
        vc = x_.value_counts()
        vc = vc.sort_index()

        # replace same values
        for i, xi in enumerate(vc):
            if xi > 1:
                try:
                    nex_val = vc.index[i + 1]
                except IndexError:
                    # The repeated value is the largest one, there is no
                    # neighbour to spread towards; use a small fixed offset.
                    nex_val = vc.index[i] + 0.01
                # Spread the repeated entries over [value, next value).
                # The end point is excluded: it is already present as the next
                # node, and including it would leave a duplicate in x_, which
                # the spline (strictly increasing x required) cannot handle.
                fill = np.linspace(vc.index[i], nex_val, xi, endpoint=False)
                x_[x_ == vc.index[i]] = fill

        # Keep the nodes so that the binning of the CDF can be read back.
        self.x = x_.values
        self.spline = InterpolatedUnivariateSpline(x_, y_)

    def _transform(self, x):
        """
        Transforms the input data according to the cdf.

        The data is limited to the fitted range first; set_limits() changes
        array input in place.

        :param x: Input data
        :return: Transformed data
        """
        x = self.set_limits(x)
        return self.spline(x)
227 
228 
230 
231  """
232  This transformation uses the CDF to map input data onto a
233  flat (uniform) distribution.
234 
235  Attributes
236  ----------
237  cdf : Transform.CDF
238  Transformation with the CDF
239 
240  """
241 
242  def __init__(self, *args):
243  """ Init function
244 
245  :param args: None
246  """
247  Transform.__init__(self, "Flat", *args)
248 
249 
250  self.cdf = CDF(*args)
251 
252  def _fit(self, x, y=None):
253  """
254  Fit function calculates the cumulative distribution with numpy percentile.
255 
256  :param x: Inout distribution
257  :param y: Will not be used in this transformation
258  """
259  self.io.debug("Fitting Flat")
260  self.cdf.fit(x)
261 
262  def _transform(self, x):
263  """
264  Transforms the input data according to the cdf.
265  :param x: Input data
266  :return: Transformed data
267  """
268  if not self.is_processed:
269  self.fit(x)
270  return self.cdf.transform(x)
271 
272  def get_flat_bins(self):
273  """
274  Returns the binning of the CDF
275  :return: Binning for a flat distribution
276  """
277  return self.cdf.x
278 
279  def get_x(self, x_flat):
280  """
281  Dirty version for getting the original x value out of a flat x value.
282  :param x_flat: x value in the flat distribution
283  :return: x value on the original axis (approx)
284  """
285  x_cum = np.linspace(self.min, self.max, self.n_bins * 50)
286  for xx in x_cum:
287  if self.cdf.spline(xx) > x_flat:
288  return xx
alignment.fancystuff.transform.Transform._initialise
def _initialise(self, x)
Definition: transform.py:65
alignment.fancystuff.transform.Transform
Definition: transform.py:21
alignment.fancystuff.transform.CDF._fit
def _fit(self, x, y=None)
Definition: transform.py:193
alignment.fancystuff.transform.Transform._fit
def _fit(self, x, y=None)
Definition: transform.py:96
alignment.fancystuff.transform.Transform.is_processed
is_processed
Status flag.
Definition: transform.py:58
alignment.fancystuff.settings.ProTool.io
def io(self)
Definition: settings.py:43
alignment.fancystuff.transform.ToFlat._transform
def _transform(self, x)
Definition: transform.py:262
alignment.fancystuff.transform.Transform.max
max
Maximum of the fitted distribution.
Definition: transform.py:52
alignment.fancystuff.transform.CDF
Definition: transform.py:170
alignment.fancystuff.transform.CDF.spline
spline
Spline, fitting the CDF.
Definition: transform.py:191
alignment.fancystuff.transform.Transform.set_n_bins
def set_n_bins(self, n)
Definition: transform.py:125
alignment.fancystuff.transform.CDF._transform
def _transform(self, x)
Definition: transform.py:219
alignment.fancystuff.transform.ToFlat._fit
def _fit(self, x, y=None)
Definition: transform.py:252
alignment.fancystuff.transform.CDF.__init__
def __init__(self, *args)
Definition: transform.py:183
alignment.fancystuff.transform.Transform.__init__
def __init__(self, name="Original", n_bins=None)
Definition: transform.py:42
alignment.fancystuff.transform.ToFlat.cdf
cdf
Transformation with the CDF.
Definition: transform.py:250
alignment.fancystuff.transform.Transform.__call__
def __call__(self, x)
Definition: transform.py:89
alignment.fancystuff.settings.ProTool.name
name
Name of the class.
Definition: settings.py:40
alignment.fancystuff.transform.ToFlat.get_flat_bins
def get_flat_bins(self)
Definition: transform.py:272
alignment.fancystuff.settings.ProTool
Definition: settings.py:20
alignment.fancystuff.transform.Transform.min
min
Minimum of the fitted distribution.
Definition: transform.py:55
alignment.fancystuff.transform.ToFlat.get_x
def get_x(self, x_flat)
Definition: transform.py:279
alignment.fancystuff.transform.ToFlat.__init__
def __init__(self, *args)
Definition: transform.py:242
alignment.fancystuff.transform.Transform.transform
def transform(self, x, set_limits=False)
Definition: transform.py:104
alignment.fancystuff.transform.Transform.fit
def fit(self, x, y=None)
Definition: transform.py:78
alignment.fancystuff.transform.Transform.set_limits
def set_limits(self, x)
Definition: transform.py:133
alignment.fancystuff.transform.Transform._transform
def _transform(self, x)
Definition: transform.py:115
alignment.fancystuff.transform.Transform.n_bins
n_bins
Binning in x, will be set automatically.
Definition: transform.py:49
alignment.fancystuff.transform.ToFlat
Definition: transform.py:229