Belle II Software  release-08-01-10
plotting.py
1 #!/usr/bin/env python3
2 
3 
4 
11 
12 import copy
13 import math
14 
15 import pandas
16 import numpy
17 import matplotlib.pyplot as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
24 import matplotlib.patheffects as PathEffects
25 
26 
27 from basf2_mva_evaluation import histogram
28 
29 import basf2 as b2
30 
31 import basf2_mva_util
32 import matplotlib
33 
# The standard TkAgg backend is NOT thread-safe: using it ends in
# "RuntimeError: main thread is not in main loop". Select the svg backend instead.
matplotlib.use("svg")
matplotlib.rcParams['font.size'] = 36

# All plots are produced with the Belle II style
plt.style.use("belle2")
41 
42 
class Plotter:
    """
    Base class for all Plotters.

    Holds the figure and axis to draw on, the accumulated plots/labels and axis limits,
    and the keyword arguments which steer how data points are drawn.
    Subclasses implement add() and finish().
    """

    # Class-level placeholders, kept so that doxygen documents the attributes.
    # Each of them is re-assigned per instance in __init__.

    # Plots added to the axis so far (filled by the add methods of the subclasses)
    plots = None
    # Legend labels belonging to the entries of plots
    labels = None
    # Lower limit of the x axis
    xmin = None
    # Upper limit of the x axis
    xmax = None
    # Lower limit of the y axis
    ymin = None
    # Upper limit of the y axis
    ymax = None
    # Relative margin applied to the y limits by scale_limits
    yscale = 0.0
    # Relative margin applied to the x limits by scale_limits
    xscale = 0.0
    # Figure which is used to draw
    figure = None
    # Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        # Keyword arguments for axis.plot (None disables the plot call)
        self.plot_kwargs = None
        # Keyword arguments for axis.errorbar (None disables the errorbars)
        self.errorbar_kwargs = None
        # Keyword arguments for the errorband (None disables the errorband)
        self.errorband_kwargs = None
        # Keyword arguments for the filled area below the points (None disables the fill)
        self.fill_kwargs = None

        # Install the defaults declared in the signatures of the setters
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        # Re-position the already existing axes; the last gridspec belongs to the new axis
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self, to allow chaining
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        # The figure is a bare matplotlib.figure.Figure, so an Agg canvas is attached for writing
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        @return self, to allow chaining
        """
        # A copy is stored, so neither the shared default nor the caller's dict is modified later
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        @return self, to allow chaining
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        @return self, to allow chaining
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the filled area below the datapoints
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        @return self, to allow chaining
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    @staticmethod
    def _broadcast_error(error, reference):
        """
        Turn a symmetric error into a float numpy array with the shape of the reference
        @param error None, a scalar or a sequence of errors
        @param reference the data points the error belongs to
        @return None if error is None, otherwise a numpy array shaped like reference
        """
        if error is None:
            return None
        return numpy.asarray(error, dtype=float) * numpy.ones(numpy.shape(reference))

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is used to draw
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points (None, scalar or sequence)
        @param yerr symmetric error on y data points (None, scalar or sequence)
        @return tuple (handles, p, e, f): handles is a tuple containing the legend patch and the drawn
                artists, p the result of plot, e of errorbar and f of fill_between (each None if not drawn)
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        # Normalise the inputs once, so that every branch below can rely on numpy arrays (or None)
        x = numpy.asarray(x)
        y = numpy.asarray(y)
        xerr = self._broadcast_error(xerr, x)
        yerr = self._broadcast_error(yerr, y)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            # Take the next color of the property cycle of the axis
            color = axis._get_lines.get_next_color()
            # plot_kwargs may be None (plot disabled); the color is still needed for the other artists
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        # Give the patch a get_color method like the one line artists provide
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the marker color
                errorbar_kwargs['ecolor'] = [0.5 * channel for channel in color]

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            present = [values for values in (x, y, xerr, yerr) if values is not None]
            mask = numpy.logical_and.reduce([numpy.isfinite(values) for values in present])

            e = axis.errorbar(x[mask], y[mask],
                              xerr=None if xerr is None else xerr[mask],
                              yerr=None if yerr is None else yerr[mask],
                              rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # With errors in both directions one rectangle per data point is drawn
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None and len(x) > 0:
            half_width = xerr if xerr is not None else numpy.zeros(len(x))
            # to fill the last bin of a histogram
            edges = numpy.append(x, x[-1] + 2 * half_width[-1])
            heights = numpy.append(y, y[-1])
            half_width = numpy.append(half_width, half_width[-1])

            axis.fill_between(edges - half_width, heights, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter.
        The base class draws nothing and returns the NotImplemented singleton.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff.
        The base class does nothing and returns the NotImplemented singleton.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        @return self, to allow chaining
        """
        # copysign makes the margin point away from the data for positive and negative limits alike
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
262 
263 
265  """
266  Plots the purity and the efficiency over the cut value (for cut choosing)
267  """
268 
272 
273  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
274  """
275  Add a new curve to the plot
276  @param data pandas.DataFrame containing all data
277  @param column which is used to calculate efficiency and purity for different cuts
278  @param signal_mask boolean numpy.array defining which events are signal events
279  @param bckgrd_mask boolean numpy.array defining which events are background events
280  @param weight_column column in data containing the weights for each event
281  """
282 
283  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
284 
285  if normed:
286  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
287  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
288  else:
289  efficiency, efficiency_error = hists.get_true_positives(['Signal'])
290  purity, purity_error = hists.get_false_positives(['Background'])
291 
292  cuts = hists.bin_centers
293 
294  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
295  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.yminymin]), \
296  numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymaxymaxymax])
297 
298  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
299 
300  if normed:
301  self.labelslabels.append("Efficiency")
302  else:
303  self.labelslabels.append("True positive")
304 
305  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, purity, xerr=0, yerr=purity_error))
306 
307  if normed:
308  self.labelslabels.append("Purity")
309  else:
310  self.labelslabels.append("False positive")
311 
312  return self
313 
314  def finish(self):
315  """
316  Sets limits, title, axis-labels and legend of the plot
317  """
318  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
319  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
320  self.axisaxis.set_title("Classification Plot")
321  self.axisaxis.get_xaxis().set_label_text('Cut Value')
322  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
323  return self
324 
325 
327  """
328  Plots the signal to noise ratio over the cut value (for cut choosing)
329  """
330 
334 
335  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
336  """
337  Add a new curve to the plot
338  @param data pandas.DataFrame containing all data
339  @param column which is used to calculate signal to noise ratio for different cuts
340  @param signal_mask boolean numpy.array defining which events are signal events
341  @param bckgrd_mask boolean numpy.array defining which events are background events
342  @param weight_column column in data containing the weights for each event
343  """
344 
345  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
346 
347  signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])
348 
349  cuts = hists.bin_centers
350 
351  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
352  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.yminymin]), \
353  numpy.nanmax([numpy.nanmax(signal2noise), self.ymaxymaxymax])
354 
355  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
356 
357  self.labelslabels.append(column)
358 
359  return self
360 
361  def finish(self):
362  """
363  Sets limits, title, axis-labels and legend of the plot
364  """
365  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
366  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
367  self.axisaxis.set_title("Signal to Noise Plot")
368  self.axisaxis.get_xaxis().set_label_text('Cut Value')
369  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
370  return self
371 
372 
374  """
375  Plots the purity over the efficiency also known as ROC curve
376  """
377 
381 
382  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
383  """
384  Add a new curve to the ROC plot
385  @param data pandas.DataFrame containing all data
386  @param column which is used to calculate efficiency and purity for different cuts
387  @param signal_mask boolean numpy.array defining which events are signal events
388  @param bckgrd_mask boolean numpy.array defining which events are background events
389  @param weight_column column in data containing the weights for each event
390  """
391  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
392  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
393  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
394 
395  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
396  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.yminymin]), numpy.nanmax([numpy.nanmax(purity), self.ymaxymaxymax])
397 
398  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
399  self.plotsplots.append(p)
400  if label is not None:
401  self.labelslabels.append(label)
402  else:
403  self.labelslabels.append(column)
404  return self
405 
406  def finish(self):
407  """
408  Sets limits, title, axis-labels and legend of the plot
409  """
410  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
411  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
412  self.axisaxis.set_title("ROC Purity Plot")
413  self.axisaxis.get_xaxis().set_label_text('Efficiency')
414  self.axisaxis.get_yaxis().set_label_text('Purity')
415  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
416  return self
417 
418 
420  """
421  Plots the rejection over the efficiency also known as ROC curve
422  """
423 
427 
428  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
429  """
430  Add a new curve to the ROC plot
431  @param data pandas.DataFrame containing all data
432  @param column which is used to calculate efficiency and purity for different cuts
433  @param signal_mask boolean numpy.array defining which events are signal events
434  @param bckgrd_mask boolean numpy.array defining which events are background events
435  @param weight_column column in data containing the weights for each event
436  """
437  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
438  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
439  rejection, rejection_error = hists.get_efficiency(['Background'])
440  rejection = 1 - rejection
441  if isinstance(efficiency, int) and not isinstance(rejection, int):
442  efficiency = numpy.array([efficiency] * len(rejection))
443  elif isinstance(rejection, int) and not isinstance(efficiency, int):
444  rejection = numpy.array([rejection] * len(efficiency))
445  elif isinstance(rejection, int) and isinstance(efficiency, int):
446  efficiency = numpy.array([efficiency])
447  rejection = numpy.array([rejection])
448 
449  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
450  self.yminymin, self.ymaxymaxymax = numpy.nanmin([rejection.min(), self.yminymin]), numpy.nanmax([rejection.max(), self.ymaxymaxymax])
451 
452  auc = numpy.abs(numpy.trapz(rejection, efficiency))
453 
454  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
455  self.plotsplots.append(p)
456  if label is not None:
457  self.labelslabels.append(label[:10] + f" ({auc:.2f})")
458  else:
459  self.labelslabels.append(column[:10] + f" ({auc:.2f})")
460  return self
461 
462  def finish(self):
463  """
464  Sets limits, title, axis-labels and legend of the plot
465  """
466  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
467  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
468  self.axisaxis.set_title("ROC Rejection Plot")
469  self.axisaxis.get_xaxis().set_label_text('Signal Efficiency')
470  self.axisaxis.get_yaxis().set_label_text('Background Rejection')
471  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
472  return self
473 
474 
476  """
477  Plots multiple other plots into a grid 3x?
478  """
479 
480  figure = None
481 
482  axis = None
483 
484  def __init__(self, cls, number_of_plots, figure=None):
485  """
486  Creates a new figure if None is given, sets the default plot parameters
487  @param figure default draw figure which is used
488  """
489  if figure is None:
490  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
491  self.figurefigurefigure.set_tight_layout(True)
492  else:
493  self.figurefigurefigure = figure
494 
495  if number_of_plots == 1:
496  gs = matplotlib.gridspec.GridSpec(1, 1)
497  elif number_of_plots == 2:
498  gs = matplotlib.gridspec.GridSpec(1, 2)
499  elif number_of_plots == 3:
500  gs = matplotlib.gridspec.GridSpec(1, 3)
501  else:
502  gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
503 
504 
505  self.sub_plotssub_plots = [cls(self.figurefigurefigure, self.figurefigurefigure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
506  self.axisaxisaxis = self.sub_plotssub_plots[0].axis
507  super().__init__(self.figurefigurefigure, self.axisaxisaxis)
508 
509  def add(self, i, *args, **kwargs):
510  """
511  Call add function of ith subplot
512  @param i position of the subplot
513  """
514  self.sub_plotssub_plots[i].add(*args, **kwargs)
515 
516  def finish(self):
517  """
518  Sets limits, title, axis-labels and legend of the plot
519  """
520  for plot in self.sub_plotssub_plots:
521  plot.finish()
522  return self
523 
524 
526  """
527  Plots the purity in each bin over the classifier output.
528  """
529 
533 
534  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
535  """
536  Add a new curve to the Diagonal plot
537  @param data pandas.DataFrame containing all data
538  @param column which is used to calculate purity for different cuts
539  @param signal_mask boolean numpy.array defining which events are signal events
540  @param bckgrd_mask boolean numpy.array defining which events are background events
541  @param weight_column column in data containing the weights for each event
542  """
543  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
544  purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])
545 
546  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
547  # self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
548  self.yminymin, self.ymaxymaxymax = 0, 1
549 
550  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
551  self.plotsplots.append(p)
552  self.labelslabels.append(column)
553  return self
554 
555  def finish(self):
556  """
557  Sets limits, title, axis-labels and legend of the plot
558  """
559  self.scale_limitsscale_limits()
560  self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
561  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
562  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
563  self.axisaxis.set_title("Diagonal Plot")
564  self.axisaxis.get_xaxis().set_label_text('Classifier Output')
565  self.axisaxis.get_yaxis().set_label_text('Purity Per Bin')
566  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
567  return self
568 
569 
571  """
572  Plots distribution of a quantity
573  """
574 
575  def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
576  keep_first_binning=False, range_in_std=None):
577  """
578  Creates a new figure and axis if None is given, sets the default plot parameters
579  @param figure default draw figure which is used
580  @param axis default draw axis which is used
581  @param normed true if histograms should be normed before drawing
582  @param keep_first_binning use the binning of the first distribution for further plots
583  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
584  """
585  super().__init__(figure, axis)
586 
587  self.normed_to_all_entriesnormed_to_all_entries = normed_to_all_entries
588 
589  self.normed_to_bin_widthnormed_to_bin_width = normed_to_bin_width
590 
591  self.range_in_stdrange_in_std = range_in_std
592  # if self.normed_to_all_entries or self.normed_to_bin_width:
593 
594  self.yminyminymin = float(0)
595 
596  self.ymaxymaxymax = float('-inf')
597 
598  self.xminxminxmin = float('inf')
599 
600  self.xmaxxmaxxmax = float('-inf')
601 
602  self.keep_first_binningkeep_first_binning = keep_first_binning
603 
604  self.first_binningfirst_binning = None
605 
606  self.x_axis_labelx_axis_label = ''
607 
608  def add(self, data, column, mask=None, weight_column=None, label=None):
609  """
610  Add a new distribution to the plots
611  @param data pandas.DataFrame containing all data
612  @param column which is used to calculate distribution histogram
613  @param mask boolean numpy.array defining which events are used for the histogram
614  @param weight_column column in data containing the weights for each event
615  """
616  if mask is None:
617  mask = numpy.ones(len(data)).astype('bool')
618 
619  bins = 100
620  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is not None:
621  bins = self.first_binningfirst_binning
622  hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
623  bins=bins, equal_frequency=False, range_in_std=self.range_in_stdrange_in_std)
624  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is None:
625  self.first_binningfirst_binning = hists.bins
626  hist, hist_error = hists.get_hist('Total')
627 
628  if self.normed_to_all_entriesnormed_to_all_entries:
629  normalization = float(numpy.sum(hist))
630  hist = hist / normalization
631  hist_error = hist_error / normalization
632 
633  if self.normed_to_bin_widthnormed_to_bin_width:
634  hist = hist / hists.bin_widths
635  hist_error = hist_error / hists.bin_widths
636 
637  self.xminxminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
638  self.yminyminymin = numpy.nanmin([hist.min(), self.yminyminymin])
639  self.ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.ymaxymaxymax])
640 
641  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
642  self.plotsplots.append(p)
643  self.x_axis_labelx_axis_label = column
644 
645  appendix = ''
646  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
647  appendix = ' No data to plot!'
648 
649  if label is None:
650  self.labelslabels.append(column + appendix)
651  else:
652  self.labelslabels.append(label + appendix)
653  return self
654 
655  def finish(self):
656  """
657  Sets limits, title, axis-labels and legend of the plot
658  """
659  self.axisaxis.set_title("Distribution Plot")
660  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
661 
662  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
663 
664  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
665  self.axisaxis.set_xlim((0., 1.))
666  self.axisaxis.set_ylim((0., 1.))
667  self.axisaxis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
668  return self
669 
670  self.scale_limitsscale_limits()
671 
672  self.axisaxis.set_xlim((self.xminxminxmin, self.xmaxxmaxxmax))
673  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
674 
675  if self.normed_to_all_entriesnormed_to_all_entries and self.normed_to_bin_widthnormed_to_bin_width:
676  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
677  elif self.normed_to_all_entriesnormed_to_all_entries:
678  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
679  elif self.normed_to_bin_widthnormed_to_bin_width:
680  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
681  else:
682  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin')
683 
684  return self
685 
686 
class Box(Plotter):
    """
    Create a boxplot
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)

        # Label of the x axis, set to the column name by add
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot, all events if None
        @param weight_column column in data containing the weights for each event (only triggers a warning)
        """
        if mask is None:
            mask = numpy.ones(len(data), dtype=bool)
        values = data[column][mask]
        if weight_column is not None:
            b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(values) == 0:
            b2.B2WARNING("Ignore empty boxplot.")
            return self

        box_style = dict(facecolor='blue', alpha=0.5)
        box = self.axis.boxplot(values, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True,
                                widths=1, boxprops=box_style)
        self.plots.append(box)
        self.labels.append(column)
        self.x_axis_label = column
        # Disabled annotation of summary statistics, kept as a bare string literal which is never executed
        r"""
        self.axis.text(0.1, 0.9, (r'$     \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
        self.axis.text(0.4, 0.9, (r'$  \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
                                                                                            x.quantile(0.75) - x.quantile(0.25)),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
        self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
        """

        return self

    def finish(self):
        """
        Hide the y axis and set the x label and the title of the plot
        """
        ax = self.axis
        matplotlib.artist.setp(ax.get_yaxis(), visible=False)
        ax.get_xaxis().set_label_text(self.x_axis_label)
        ax.set_title("Box Plot")
        return self
752 
753 
755  """
756  Plots the difference between two histograms
757  """
758 
770 
771  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
772  """
773  Creates a new figure and axis if None is given, sets the default plot parameters
774  @param figure default draw figure which is used
775  @param axis default draw axis which is used
776  @param normed normalize minuend and subtrahend before comparing them
777  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
778  """
779  super().__init__(figure, axis)
780  self.normednormed = normed
781  self.shift_to_zeroshift_to_zero = shift_to_zero
782  if self.normednormed:
783  self.yminyminymin = -0.01
784  self.ymaxymaxymax = 0.01
785  else:
786  self.yminyminymin = -1
787  self.ymaxymaxymax = 1
788 
789  def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
790  """
791  Add a new difference plot
792  @param data pandas.DataFrame containing all data
793  @param column which is used to calculate distribution histogram
794  @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
795  @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
796  @param weight_column column in data containing the weights for each event
797  @param label label for the legend if None, the column name is used
798  """
799  hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
800  weight_column=weight_column, equal_frequency=False)
801  minuend, minuend_error = hists.get_hist('Minuend')
802  subtrahend, subtrahend_error = hists.get_hist('Subtrahend')
803 
804  difference_error = histogram.poisson_error(minuend + subtrahend)
805  if self.normednormed:
806  difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
807  minuend = minuend / numpy.sum(minuend)
808  subtrahend = subtrahend / numpy.sum(subtrahend)
809  difference = minuend - subtrahend
810 
811  if self.shift_to_zeroshift_to_zero:
812  difference = difference - numpy.mean(difference)
813 
814  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
815  self.yminyminymin = min((difference - difference_error).min(), self.yminyminymin)
816  self.ymaxymaxymax = max((difference + difference_error).max(), self.ymaxymaxymax)
817 
818  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
819  self.plotsplots.append(p)
820  if label is None:
821  self.labelslabels.append(label)
822  else:
823  self.labelslabels.append(column)
824  self.x_axis_labelx_axis_label = column
825  return self
826 
827  def finish(self, line_color='black'):
828  """
829  Sets limits, title, axis-labels and legend of the plot
830  """
831  self.axisaxis.plot((self.xminxmin, self.xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
832  self.scale_limitsscale_limits()
833  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
834  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
835  self.axisaxis.set_title("Difference Plot")
836  self.axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
837  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
838  self.axisaxis.get_yaxis().set_label_text('Difference')
839  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
840  return self
841 
842 
844  """
845  Create TMVA-like overtraining control plot for a classification training
846  """
847 
848 
849  figure = None
850 
851  axis = None
852 
853  axis_d1 = None
854 
855  axis_d2 = None
856 
857  def __init__(self, figure=None):
858  """
859  Creates a new figure if None is given, sets the default plot parameters
860  @param figure default draw figure which is used
861  """
862  if figure is None:
863  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
864  self.figurefigurefigure.set_tight_layout(True)
865  else:
866  self.figurefigurefigure = figure
867 
868  gs = matplotlib.gridspec.GridSpec(5, 1)
869  self.axisaxisaxis = self.figurefigurefigure.add_subplot(gs[:3, :])
870  self.axis_d1axis_d1 = self.figurefigurefigure.add_subplot(gs[3, :], sharex=self.axisaxisaxis)
871  self.axis_d2axis_d2 = self.figurefigurefigure.add_subplot(gs[4, :], sharex=self.axisaxisaxis)
872 
873  super().__init__(self.figurefigurefigure, self.axisaxisaxis)
874 
875  def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
876  """
877  Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
878  otherwise there are too many curves in the plot to recognize anything in the plot.
879  @param data pandas.DataFrame containing all data
880  @param column which is used to calculate distribution histogram
881  @param train_mask boolean numpy.array defining which events are training events
882  @param test_mask boolean numpy.array defining which events are test events
883  @param signal_mask boolean numpy.array defining which events are signal events
884  @param bckgrd_mask boolean numpy.array defining which events are background events
885  @param weight_column column in data containing the weights for each event
886  """
887  distribution = Distribution(self.figurefigurefigure, self.axisaxisaxis, normed_to_all_entries=True)
888 
889  distribution.set_plot_options(self.plot_kwargsplot_kwargs)
890  distribution.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
891  distribution.set_errorband_options(self.errorband_kwargserrorband_kwargs)
892  distribution.add(data, column, test_mask & signal_mask, weight_column)
893  distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
894 
895  distribution.set_plot_options(
896  {'color': distribution.plots[0][0][0].get_color(), 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
897  distribution.set_fill_options({'color': distribution.plots[0][0][0].get_color(), 'alpha': 0.5, 'step': 'post'})
898  distribution.set_errorbar_options(None)
899  distribution.set_errorband_options(None)
900  distribution.add(data, column, train_mask & signal_mask, weight_column)
901  distribution.set_plot_options(
902  {'color': distribution.plots[1][0][0].get_color(), 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
903  distribution.set_fill_options({'color': distribution.plots[1][0][0].get_color(), 'alpha': 0.5, 'step': 'post'})
904  distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
905 
906  distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
907  distribution.finish()
908 
909  self.plot_kwargsplot_kwargs['color'] = distribution.plots[0][0][0].get_color()
910  difference_signal = Difference(self.figurefigurefigure, self.axis_d1axis_d1, shift_to_zero=True, normed=True)
911  difference_signal.set_plot_options(self.plot_kwargsplot_kwargs)
912  difference_signal.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
913  difference_signal.set_errorband_options(self.errorband_kwargserrorband_kwargs)
914  difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
915  self.axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
916  self.axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
917  difference_signal.plots = difference_signal.labels = []
918  difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
919 
920  self.plot_kwargsplot_kwargs['color'] = distribution.plots[1][0][0].get_color()
921  difference_bckgrd = Difference(self.figurefigurefigure, self.axis_d2axis_d2, shift_to_zero=True, normed=True)
922  difference_bckgrd.set_plot_options(self.plot_kwargsplot_kwargs)
923  difference_bckgrd.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
924  difference_bckgrd.set_errorband_options(self.errorband_kwargserrorband_kwargs)
925  difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
926  self.axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
927  self.axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
928  difference_bckgrd.plots = difference_bckgrd.labels = []
929  difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
930 
931  try:
932  import scipy.stats
933  # Kolmogorov smirnov test
934  if len(data[column][train_mask & signal_mask]) == 0 or len(data[column][test_mask & signal_mask]) == 0:
935  b2.B2WARNING("Cannot calculate kolmogorov smirnov test for signal due to missing data")
936  else:
937  ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
938  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
939  self.axis_d1axis_d1.text(0.1, 0.9, r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
940  verticalalignment='top', horizontalalignment='left', transform=self.axis_d1axis_d1.transAxes)
941  if len(data[column][train_mask & bckgrd_mask]) == 0 or len(data[column][test_mask & bckgrd_mask]) == 0:
942  b2.B2WARNING("Cannot calculate kolmogorov smirnov test for background due to missing data")
943  else:
944  ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
945  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
946  self.axis_d2axis_d2.text(0.1, 0.9, r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
947  bbox=props,
948  verticalalignment='top', horizontalalignment='left', transform=self.axis_d2axis_d2.transAxes)
949  except ImportError:
950  b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
951 
952  return self
953 
954  def finish(self):
955  """
956  Sets limits, title, axis-labels and legend of the plot
957  """
958  self.axisaxisaxis.set_title("Overtraining Plot")
959  self.axis_d1axis_d1.set_title("")
960  self.axis_d2axis_d2.set_title("")
961  matplotlib.artist.setp(self.axisaxisaxis.get_xticklabels(), visible=False)
962  matplotlib.artist.setp(self.axis_d1axis_d1.get_xticklabels(), visible=False)
963  self.axisaxisaxis.get_xaxis().set_label_text('')
964  self.axis_d1axis_d1.get_xaxis().set_label_text('')
965  self.axis_d2axis_d2.get_xaxis().set_label_text('Classifier Output')
966  return self
967 
968 
970  """
971  Plots distribution of a quantity including boxplots
972  """
973 
974 
975  box_axes = None
976 
977  def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
978  """
979  Creates a new figure and axis if None is given, sets the default plot parameters
980  @param figure default draw figure which is used
981  @param axis default draw axis which is used
982  @param normed true if the histograms should be normed before drawing
983  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
984  """
985  super().__init__(figure, axis)
986 
987  self.normednormed = normed
988 
989  self.range_in_stdrange_in_std = range_in_std
990  self.box_axesbox_axes = []
991 
992  self.distributiondistribution = Distribution(self.figurefigure, self.axisaxis, normed_to_all_entries=self.normednormed, range_in_std=self.range_in_stdrange_in_std)
993 
994  def add(self, data, column, mask=None, weight_column=None, label=None):
995  """
996  Add a new distribution plot, with additional information like a boxplot compared to
997  the ordinary Distribution plot.
998  @param data pandas.DataFrame containing all data
999  @param column which is used to calculate distribution histogram
1000  @param mask boolean numpy.array defining which events are used for the distribution histogram
1001  @param weight_column column in data containing the weights for each event
1002  """
1003  self.distributiondistribution.set_plot_options(self.plot_kwargsplot_kwargs)
1004  self.distributiondistribution.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
1005  self.distributiondistribution.set_errorband_options(self.errorband_kwargserrorband_kwargs)
1006  self.distributiondistribution.add(data, column, mask, weight_column, label=label)
1007 
1008  n = len(self.box_axesbox_axes) + 1
1009  gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1010  gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
1011  box_axis = self.add_subplotadd_subplot(gridspecs)
1012 
1013  if self.range_in_stdrange_in_std is not None:
1014  mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
1015  # Everything outside mean +- range_in_std * std is considered not inside the mask
1016  mask = mask & (data[column] > (mean - self.range_in_stdrange_in_std * std)) & (data[column] < (mean + self.range_in_stdrange_in_std * std))
1017  box = Box(self.figurefigure, box_axis)
1018  box.add(data, column, mask, weight_column)
1019  if len(box.plots) > 0:
1020  box.plots[0]['boxes'][0].set_facecolor(self.distributiondistribution.plots[-1][0][0].get_color())
1021  box.finish()
1022 
1023  self.box_axesbox_axes.append(box_axis)
1024  return self
1025 
1026  def finish(self):
1027  """
1028  Sets limits, title, axis-labels and legend of the plot
1029  """
1030  self.distributiondistribution.finish()
1031  matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
1032  self.axisaxis.get_xaxis().set_label_text('')
1033  for box_axis in self.box_axesbox_axes[:-1]:
1034  matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
1035  box_axis.set_title("")
1036  box_axis.get_xaxis().set_label_text('')
1037  self.box_axesbox_axes[-1].set_title("")
1038  self.axisaxis.set_title("Distribution Plot")
1039  self.axisaxis.legend([x[0] for x in self.distributiondistribution.plots], self.distributiondistribution.labels,
1040  loc='best', fancybox=True, framealpha=0.5)
1041  return self
1042 
1043 
1045  """
1046  Plots change of a distribution of a quantity depending on the cut on a classifier
1047  """
1048 
1049  figure = None
1050 
1051  axis = None
1052 
1053  axis_d1 = None
1054 
1055  axis_d2 = None
1056 
1057  def __init__(self, figure=None):
1058  """
1059  Creates a new figure if None is given, sets the default plot parameters
1060  @param figure default draw figure which is used
1061  """
1062  if figure is None:
1063  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1064  self.figurefigurefigure.set_tight_layout(True)
1065  else:
1066  self.figurefigurefigure = figure
1067 
1068  gs = matplotlib.gridspec.GridSpec(3, 2)
1069  self.axisaxisaxis = self.figurefigurefigure.add_subplot(gs[0, :])
1070  self.axis_d1axis_d1 = self.figurefigurefigure.add_subplot(gs[1, :], sharex=self.axisaxisaxis)
1071  self.axis_d2axis_d2 = self.figurefigurefigure.add_subplot(gs[2, :], sharex=self.axisaxisaxis)
1072 
1073  super().__init__(self.figurefigurefigure, self.axisaxisaxis)
1074 
1075  def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1076  """
1077  Add a new correlation plot.
1078  @param data pandas.DataFrame containing all data
1079  @param column which is used to calculate distribution histogram
1080  @param cut_column which is used to calculate cut on the other quantity defined by column
1081  @param quantiles list of quantiles between 0 and 100, defining the different cuts
1082  @param weight_column column in data containing the weights for each event
1083  """
1084  if len(data[cut_column]) == 0:
1085  b2.B2WARNING("Ignore empty Correlation.")
1086  return self
1087 
1088  axes = [self.axisaxisaxis, self.axis_d1axis_d1, self.axis_d2axis_d2]
1089 
1090  for i, (l, m) in enumerate([('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask)]):
1091 
1092  if weight_column is not None:
1093  weights = numpy.array(data[weight_column][m])
1094  else:
1095  weights = numpy.ones(len(data[column][m]))
1096 
1097  xrange = numpy.percentile(data[column][m], [5, 95])
1098 
1099  colormap = plt.get_cmap('coolwarm')
1100  tmp, x = numpy.histogram(data[column][m], bins=100,
1101  range=xrange, density=True, weights=weights)
1102  bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1103  axes[i].plot(bin_center, tmp, color='black', lw=1)
1104 
1105  for quantil in numpy.arange(5, 100, 5):
1106  cut = numpy.percentile(data[cut_column][m], quantil)
1107  sel = data[cut_column][m] >= cut
1108  y, x = numpy.histogram(data[column][m][sel], bins=100,
1109  range=xrange, density=True, weights=weights[sel])
1110  bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1111  axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1112  tmp = y
1113 
1114  axes[i].set_ylim(bottom=0)
1115 
1116  flatness_score = basf2_mva_util.calculate_flatness(data[column][m], data[cut_column][m], weights)
1117  axes[i].set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1118  return self
1119 
1120  def finish(self):
1121  """
1122  Sets limits, title, axis-labels and legend of the plot
1123  """
1124  return self
1125 
1126 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot: all events are embedded into two dimensions once and
        one scatter plot is drawn per mask.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input features of the embedding
        @param masks different classes to show in TSNE
        @return self, so that calls can be chained
        """
        try:
            import sklearn.manifold
        except ImportError:
            b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        # Keep the input frame untouched; it was previously overwritten by this array
        # and then indexed by column name inside the loop.
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE offers no transform() for a fitted model, so all events are
        # embedded in one go and the masks select rows of the result.
        embedding = model.fit_transform(features)
        for mask in masks:
            # NOTE(review): masks are applied positionally, this assumes they are
            # row-aligned with data -- confirm against callers
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
        return self

    def finish(self):
        """
        Nothing is left to do here, all drawing already happens in add()
        @return self, so that calls can be chained
        """
        return self
1158 
1159 
1161  """
1162  Plots importance matrix
1163  """
1164 
1165  def add(self, data, columns, variables):
1166  """
1167  Add a new correlation plot.
1168  @param data pandas.DataFrame containing all data
1169  @param columns which are used to calculate the correlations
1170  """
1171  self.figurefigure.set_tight_layout(True)
1172 
1173  def norm(x):
1174  width = (numpy.max(x) - numpy.min(x))
1175  if width <= 0:
1176  return numpy.zeros(x.shape)
1177  return (x - numpy.min(x)) / width * 100
1178 
1179  importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
1180  importance_heatmap = self.axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1181  rasterized=True)
1182 
1183  # put the major ticks at the middle of each cell
1184  self.axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=False)
1185  self.axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=False)
1186 
1187  self.axisaxis.set_xticklabels(columns, minor=False, rotation=90)
1188  self.axisaxis.set_yticklabels(variables, minor=False)
1189 
1190  self.axisaxis.xaxis.tick_top()
1191 
1192  for y in range(importance_matrix.shape[0]):
1193  for x in range(importance_matrix.shape[1]):
1194  txt = self.axisaxis.text(x + 0.5, y + 0.5, f'{importance_matrix[y, x]:.0f}',
1195  size=14,
1196  horizontalalignment='center',
1197  verticalalignment='center',
1198  color='w')
1199  txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])
1200 
1201  cb = self.figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
1202  cb.ax.set_yticklabels(['low', 'high'])
1203 
1204  # remove whitespace
1205  self.axisaxis.set_ylim(0, importance_matrix.shape[0])
1206 
1207  self.axisaxis.set_aspect('equal')
1208 
1209  return self
1210 
1211  def finish(self):
1212  """
1213  Sets limits, title, axis-labels and legend of the plot
1214  """
1215  return self
1216 
1217 
1219  """
1220  Plots correlation matrix
1221  """
1222 
1223  figure = None
1224 
1225  signal_axis = None
1226 
1227  bckgrd_axis = None
1228 
1229  def __init__(self, figure=None):
1230  """
1231  Creates a new figure if None is given, sets the default plot parameters
1232  @param figure default draw figure which is used
1233  """
1234  if figure is None:
1235  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1236  self.figurefigurefigure.set_tight_layout(True)
1237  else:
1238  self.figurefigurefigure = figure
1239 
1240  gs = matplotlib.gridspec.GridSpec(8, 2)
1241  self.signal_axissignal_axis = self.figurefigurefigure.add_subplot(gs[:6, 0])
1242  self.bckgrd_axisbckgrd_axis = self.figurefigurefigure.add_subplot(gs[:6, 1], sharey=self.signal_axissignal_axis)
1243 
1244  self.colorbar_axiscolorbar_axis = self.figurefigurefigure.add_subplot(gs[7, :])
1245 
1246  self.axisaxisaxis = self.signal_axissignal_axis
1247 
1248  super().__init__(self.figurefigurefigure, self.axisaxisaxis)
1249 
1250  def add(self, data, columns, signal_mask, bckgrd_mask):
1251  """
1252  Add a new correlation plot.
1253  @param data pandas.DataFrame containing all data
1254  @param columns which are used to calculate the correlations
1255  """
1256  signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
1257  bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100
1258 
1259  signal_heatmap = self.signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1260  self.bckgrd_axisbckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1261 
1262  self.signal_axissignal_axis.invert_yaxis()
1263  self.signal_axissignal_axis.xaxis.tick_top()
1264  self.bckgrd_axisbckgrd_axis.invert_yaxis()
1265  self.bckgrd_axisbckgrd_axis.xaxis.tick_top()
1266 
1267  # put the major ticks at the middle of each cell
1268  self.signal_axissignal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=False)
1269  self.signal_axissignal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=False)
1270 
1271  self.signal_axissignal_axis.set_xticklabels(columns, minor=False, rotation=90)
1272  self.signal_axissignal_axis.set_yticklabels(columns, minor=False)
1273 
1274  # put the major ticks at the middle of each cell
1275  self.bckgrd_axisbckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=False)
1276  self.bckgrd_axisbckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=False)
1277 
1278  self.bckgrd_axisbckgrd_axis.set_xticklabels(columns, minor=False, rotation=90)
1279  self.bckgrd_axisbckgrd_axis.set_yticklabels(columns, minor=False)
1280 
1281  for y in range(signal_corr.shape[0]):
1282  for x in range(signal_corr.shape[1]):
1283  txt = self.signal_axissignal_axis.text(x + 0.5, y + 0.5, f'{signal_corr[y, x]:.0f}',
1284  size=14,
1285  horizontalalignment='center',
1286  verticalalignment='center',
1287  color='w')
1288  txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])
1289 
1290  for y in range(bckgrd_corr.shape[0]):
1291  for x in range(bckgrd_corr.shape[1]):
1292  txt = self.bckgrd_axisbckgrd_axis.text(x + 0.5, y + 0.5, f'{bckgrd_corr[y, x]:.0f}',
1293  size=14,
1294  horizontalalignment='center',
1295  verticalalignment='center',
1296  color='w')
1297  txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])
1298 
1299  cb = self.figurefigurefigure.colorbar(signal_heatmap, cax=self.colorbar_axiscolorbar_axis, ticks=[-100, 0, 100], orientation='horizontal')
1300  cb.solids.set_rasterized(True)
1301  cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])
1302 
1303  self.signal_axissignal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
1304  self.bckgrd_axisbckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')
1305 
1306  # remove whitespace
1307  self.signal_axissignal_axis.set_xlim(0, signal_corr.shape[0])
1308  self.signal_axissignal_axis.set_ylim(0, signal_corr.shape[1])
1309  self.bckgrd_axisbckgrd_axis.set_xlim(0, bckgrd_corr.shape[0])
1310  self.bckgrd_axisbckgrd_axis.set_ylim(0, bckgrd_corr.shape[1])
1311  return self
1312 
1313  def finish(self):
1314  """
1315  Sets limits, title, axis-labels and legend of the plot
1316  """
1317  matplotlib.artist.setp(self.bckgrd_axisbckgrd_axis.get_yticklabels(), visible=False)
1318  return self
1319 
1320 
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last one holds the label (0 or 1)
        @return shuffled pandas.DataFrame
        """
        # Integer division: numpy needs integer sizes
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    data['type'] = ''
    # First half of the (shuffled) rows is training data, second half test data.
    # Positional assignment on the frame itself avoids chained assignment.
    type_position = data.columns.get_loc('type')
    data.iloc[:N // 2, type_position] = 'Train'
    data.iloc[N // 2:, type_position] = 'Test'

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    # NOTE(review): the line creating this plotter is missing from the listing;
    # RejectionOverEfficiency is assumed from the output file name -- confirm
    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    p = Correlation()
    # Both masks are passed explicitly: Correlation.add combines them with "|"
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # The masks are re-derived here because the frame gained columns; signal_mask and
    # bckgrd_mask are required arguments of CorrelationMatrix.add
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:703
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:705
def __init__(self, figure=None, axis=None)
Definition: plotting.py:694
def finish(self)
Definition: plotting.py:744
signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1225
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1250
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1244
figure
figure which is used to draw
Definition: plotting.py:1223
def __init__(self, figure=None)
Definition: plotting.py:1229
bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1227
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1246
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1075
axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1053
figure
figure which is used to draw
Definition: plotting.py:1049
def __init__(self, figure=None)
Definition: plotting.py:1057
axis_d2
Axis which shows shape of background.
Definition: plotting.py:1055
axis
Main axis which is used to draw.
Definition: plotting.py:1051
ymax
Maximum y value.
Definition: plotting.py:548
xmax
Maximum x value.
Definition: plotting.py:546
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:534
def finish(self)
Definition: plotting.py:555
x_axis_label
Label on x axis.
Definition: plotting.py:824
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:781
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:771
ymax
Maximum y value.
Definition: plotting.py:784
xmax
Maximum x value.
Definition: plotting.py:814
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:789
ymin
min y value
Definition: plotting.py:783
def finish(self, line_color='black')
Definition: plotting.py:827
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:780
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:576
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:608
x_axis_label
x axis label
Definition: plotting.py:606
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:602
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:587
first_binning
first binning
Definition: plotting.py:604
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:591
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:589
def add(self, data, columns, variables)
Definition: plotting.py:1165
def finish(self)
Definition: plotting.py:1211
def add(self, i, *args, **kwargs)
Definition: plotting.py:509
figure
figure which is used to draw
Definition: plotting.py:480
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:484
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:505
axis
Main axis.
Definition: plotting.py:482
def finish(self)
Definition: plotting.py:516
axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:853
figure
figure which is used to draw
Definition: plotting.py:849
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:875
def __init__(self, figure=None)
Definition: plotting.py:857
axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:855
axis
Main axis which is used to draw.
Definition: plotting.py:851
def finish(self, *args, **kwargs)
Definition: plotting.py:247
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:111
xmin
Minimum x value.
Definition: plotting.py:63
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:157
plots
Plots added to the axis so far.
Definition: plotting.py:59
float xscale
limit scale
Definition: plotting.py:71
figure
figure which is used to draw
Definition: plotting.py:73
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:109
def scale_limits(self)
Definition: plotting.py:253
def add(self, *args, **kwargs)
Definition: plotting.py:241
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:165
def save(self, filename)
Definition: plotting.py:130
def __init__(self, figure=None, axis=None)
Definition: plotting.py:77
ymax
Maximum y value.
Definition: plotting.py:69
xmax
Maximum x value.
Definition: plotting.py:65
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:107
labels
Labels of the plots added so far.
Definition: plotting.py:61
axis
Main axis which is used to draw.
Definition: plotting.py:75
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:173
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:149
float yscale
limit scale
Definition: plotting.py:70
ymin
Minimum y value.
Definition: plotting.py:67
def add_subplot(self, gridspecs)
Definition: plotting.py:118
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:141
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:105
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:273
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:382
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:428
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:335
def add(self, data, columns, *masks)
Definition: plotting.py:1132
def finish(self)
Definition: plotting.py:1153
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:994
distribution
The distribution plot.
Definition: plotting.py:992
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:989
box_axes
Axes for the boxplots.
Definition: plotting.py:975
normed
Normalize histograms before drawing them.
Definition: plotting.py:987
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:977
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1