Belle II Software  release-05-02-19
plotting.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # Thomas Keck 2015
5 
6 import copy
7 import math
8 
9 import numpy
10 import numpy as np
11 import matplotlib
12 
# Do not use the standard backend TkAgg, because it is NOT thread-safe.
# Otherwise you will get a "RuntimeError: main thread is not in main loop"!
15 matplotlib.use("svg")
16 matplotlib.rcParams.update({'font.size': 36})
17 
18 import matplotlib.pyplot as plt
19 import matplotlib.artist
20 import matplotlib.figure
21 import matplotlib.gridspec
22 import matplotlib.colors
23 import matplotlib.patches
24 import matplotlib.ticker
25 
26 from . import histogram
27 
28 from basf2 import *
29 
30 import basf2_mva_util
31 
32 
class Plotter(object):
    """
    Base class for all Plotters.

    Owns the figure and axis that are drawn on, collects the handles and legend
    labels of everything that was plotted, keeps track of the axis limits and
    stores the keyword arguments used to draw data points.
    Subclasses are expected to override add() and finish().
    """

    # stupid workaround for doxygen refusing to document things

    # Handles of the drawn plots; filled by the add() methods of subclasses
    plots = None
    # Legend labels; filled by the add() methods of subclasses
    labels = None
    # Lower limit of the x axis
    xmin = None
    # Upper limit of the x axis
    xmax = None
    # Lower limit of the y axis
    ymin = None
    # Upper limit of the y axis
    ymax = None
    # Relative margin applied to the y limits by scale_limits()
    yscale = 0.0
    # Relative margin applied to the x limits by scale_limits()
    xscale = 0.0
    # Figure which is drawn on
    figure = None
    # Main axis which is drawn on
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        # Default margins used by scale_limits(): 10% in y, none in x
        self.yscale = 0.1
        self.xscale = 0.0

        # Keyword arguments for axis.plot (None disables the plot call)
        self.plot_kwargs = None
        # Keyword arguments for axis.errorbar (None disables the errorbars)
        self.errorbar_kwargs = None
        # Keyword arguments for the error band (None disables the error band)
        self.errorband_kwargs = None
        # Keyword arguments for the fill between curve and zero (None disables the fill)
        self.fill_kwargs = None

        # Install the defaults declared in the setter signatures
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with self.axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        B2INFO("Save figure for class " + str(type(self)))
        # The figure is created without pyplot, so a canvas is attached explicitly for rendering
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults below are never mutated, every setter stores a copy.
    # None is a meaningful value (it disables the corresponding drawing step),
    # so it cannot be used as the "use default" sentinel.

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot call
        @return self
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the error band
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the area between the data points and zero
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is drawn on
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (handles, p, e, f): handles is a tuple of legend handles, p the result of plot,
                e the result of errorbar and f the result of fill_between (each None if not drawn)
        """
        p = e = f = None
        # Work on copies, the stored options must not be modified by a single draw call
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is None or 'color' not in plot_kwargs:
            # No explicit color requested: take the next one from the axis color cycle
            color = next(axis._get_lines.prop_cycler)
            color = color['color']
            # plot_kwargs is None if the plot call is disabled; there is nothing to store the color in then
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        # Give the patch a get_color method, so the first handle can always be asked for the used color
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the marker color
                errorbar_kwargs['ecolor'] = [0.5 * channel for channel in color]
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # With errors in both directions each point gets its own error rectangle
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        The base implementation does nothing and returns NotImplemented.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        The base implementation does nothing and returns NotImplemented.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        Each limit is moved away from the plotted range by the relative margin
        xscale/yscale; copysign makes this work for negative limits as well.
        @return self
        """
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
239 
240 
242  """
243  Plots the purity and the efficiency over the cut value (for cut choosing)
244  """
245 
249 
250  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
251  """
252  Add a new curve to the plot
253  @param data pandas.DataFrame containing all data
254  @param column which is used to calculate efficiency and purity for different cuts
255  @param signal_mask boolean numpy.array defining which events are signal events
256  @param bckgrd_mask boolean numpy.array defining which events are background events
257  @param weight_column column in data containing the weights for each event
258  """
259 
260  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
261 
262  if normed:
263  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
264  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
265  else:
266  efficiency, efficiency_error = hists.get_true_positives(['Signal'])
267  purity, purity_error = hists.get_false_positives(['Background'])
268 
269  cuts = hists.bin_centers
270 
271  self.xmin, self.xmax = numpy.nanmin([numpy.nanmin(cuts), self.xmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmax])
272  self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.ymin]), \
273  numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymax])
274 
275  self.plots.append(self._plot_datapoints(self.axis, cuts, efficiency, xerr=0, yerr=efficiency_error))
276 
277  if normed:
278  self.labels.append("Efficiency")
279  else:
280  self.labels.append("True positive")
281 
282  self.plots.append(self._plot_datapoints(self.axis, cuts, purity, xerr=0, yerr=purity_error))
283 
284  if normed:
285  self.labels.append("Purity")
286  else:
287  self.labels.append("False positive")
288 
289  return self
290 
291  def finish(self):
292  """
293  Sets limits, title, axis-labels and legend of the plot
294  """
295  self.axis.set_xlim((self.xmin, self.xmax))
296  self.axis.set_ylim((self.ymin, self.ymax))
297  self.axis.set_title("Classification Plot")
298  self.axis.get_xaxis().set_label_text('Cut Value')
299  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
300  return self
301 
302 
304  """
305  Plots the signal to noise ratio over the cut value (for cut choosing)
306  """
307 
311 
312  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
313  """
314  Add a new curve to the plot
315  @param data pandas.DataFrame containing all data
316  @param column which is used to calculate signal to noise ratio for different cuts
317  @param signal_mask boolean numpy.array defining which events are signal events
318  @param bckgrd_mask boolean numpy.array defining which events are background events
319  @param weight_column column in data containing the weights for each event
320  """
321 
322  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
323 
324  signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])
325 
326  cuts = hists.bin_centers
327 
328  self.xmin, self.xmax = numpy.nanmin([numpy.nanmin(cuts), self.xmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmax])
329  self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(signal2noise), self.ymin]), \
330  numpy.nanmax([numpy.nanmax(signal2noise), self.ymax])
331 
332  self.plots.append(self._plot_datapoints(self.axis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
333 
334  self.labels.append(column)
335 
336  return self
337 
338  def finish(self):
339  """
340  Sets limits, title, axis-labels and legend of the plot
341  """
342  self.axis.set_xlim((self.xmin, self.xmax))
343  self.axis.set_ylim((self.ymin, self.ymax))
344  self.axis.set_title("Signal to Noise Plot")
345  self.axis.get_xaxis().set_label_text('Cut Value')
346  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
347  return self
348 
349 
351  """
352  Plots the purity over the efficiency also known as ROC curve
353  """
354 
358 
359  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
360  """
361  Add a new curve to the ROC plot
362  @param data pandas.DataFrame containing all data
363  @param column which is used to calculate efficiency and purity for different cuts
364  @param signal_mask boolean numpy.array defining which events are signal events
365  @param bckgrd_mask boolean numpy.array defining which events are background events
366  @param weight_column column in data containing the weights for each event
367  """
368  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
369  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
370  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
371 
372  self.xmin, self.xmax = numpy.nanmin([efficiency.min(), self.xmin]), numpy.nanmax([efficiency.max(), self.xmax])
373  self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
374 
375  p = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
376  self.plots.append(p)
377  if label is not None:
378  self.labels.append(label)
379  else:
380  self.labels.append(column)
381  return self
382 
383  def finish(self):
384  """
385  Sets limits, title, axis-labels and legend of the plot
386  """
387  self.axis.set_xlim((self.xmin, self.xmax))
388  self.axis.set_ylim((self.ymin, self.ymax))
389  self.axis.set_title("ROC Purity Plot")
390  self.axis.get_xaxis().set_label_text('Efficiency')
391  self.axis.get_yaxis().set_label_text('Purity')
392  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
393  return self
394 
395 
397  """
398  Plots the rejection over the efficiency also known as ROC curve
399  """
400 
404 
405  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
406  """
407  Add a new curve to the ROC plot
408  @param data pandas.DataFrame containing all data
409  @param column which is used to calculate efficiency and purity for different cuts
410  @param signal_mask boolean numpy.array defining which events are signal events
411  @param bckgrd_mask boolean numpy.array defining which events are background events
412  @param weight_column column in data containing the weights for each event
413  """
414  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
415  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
416  rejection, rejection_error = hists.get_efficiency(['Background'])
417  rejection = 1 - rejection
418 
419  self.xmin, self.xmax = numpy.nanmin([efficiency.min(), self.xmin]), numpy.nanmax([efficiency.max(), self.xmax])
420  self.ymin, self.ymax = numpy.nanmin([rejection.min(), self.ymin]), numpy.nanmax([rejection.max(), self.ymax])
421 
422  auc = numpy.abs(numpy.trapz(rejection, efficiency))
423 
424  p = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
425  self.plots.append(p)
426  if label is not None:
427  self.labels.append(label[:10] + " ({:.2f})".format(auc))
428  else:
429  self.labels.append(column[:10] + " ({:.2f})".format(auc))
430  return self
431 
432  def finish(self):
433  """
434  Sets limits, title, axis-labels and legend of the plot
435  """
436  self.axis.set_xlim((self.xmin, self.xmax))
437  self.axis.set_ylim((self.ymin, self.ymax))
438  self.axis.set_title("ROC Rejection Plot")
439  self.axis.get_xaxis().set_label_text('Signal Efficiency')
440  self.axis.get_yaxis().set_label_text('Background Rejection')
441  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
442  return self
443 
444 
446  """
447  Plots multiple other plots into a grid 3x?
448  """
449 
    # Figure which is drawn on; assigned in __init__
    figure = None

    # Main axis; __init__ sets it to the axis of the first sub-plot
    axis = None
453 
454  def __init__(self, cls, number_of_plots, figure=None):
455  """
456  Creates a new figure if None is given, sets the default plot parameters
457  @param figure default draw figure which is used
458  """
459  if figure is None:
460  self.figure = matplotlib.figure.Figure(figsize=(32, 18))
461  self.figure.set_tight_layout(True)
462  else:
463  self.figure = figure
464 
465  if number_of_plots == 1:
466  gs = matplotlib.gridspec.GridSpec(1, 1)
467  elif number_of_plots == 2:
468  gs = matplotlib.gridspec.GridSpec(1, 2)
469  elif number_of_plots == 3:
470  gs = matplotlib.gridspec.GridSpec(1, 3)
471  else:
472  gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
473 
474 
475  self.sub_plots = [cls(self.figure, self.figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
476  self.axis = self.sub_plots[0].axis
477  super(Multiplot, self).__init__(self.figure, self.axis)
478 
479  def add(self, i, *args, **kwargs):
480  """
481  Call add function of ith subplot
482  @param i position of the subplot
483  """
484  self.sub_plots[i].add(*args, **kwargs)
485 
486  def finish(self):
487  """
488  Sets limits, title, axis-labels and legend of the plot
489  """
490  for plot in self.sub_plots:
491  plot.finish()
492  return self
493 
494 
496  """
497  Plots the purity in each bin over the classifier output.
498  """
499 
503 
504  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
505  """
506  Add a new curve to the Diagonal plot
507  @param data pandas.DataFrame containing all data
508  @param column which is used to calculate purity for different cuts
509  @param signal_mask boolean numpy.array defining which events are signal events
510  @param bckgrd_mask boolean numpy.array defining which events are background events
511  @param weight_column column in data containing the weights for each event
512  """
513  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
514  purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])
515 
516  self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
517  # self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
518  self.ymin, self.ymax = 0, 1
519 
520  p = self._plot_datapoints(self.axis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
521  self.plots.append(p)
522  self.labels.append(column)
523  return self
524 
525  def finish(self):
526  """
527  Sets limits, title, axis-labels and legend of the plot
528  """
529  self.scale_limits()
530  self.axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
531  self.axis.set_xlim((self.xmin, self.xmax))
532  self.axis.set_ylim((self.ymin, self.ymax))
533  self.axis.set_title("Diagonal Plot")
534  self.axis.get_xaxis().set_label_text('Classifier Output')
535  self.axis.get_yaxis().set_label_text('Purity Per Bin')
536  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
537  return self
538 
539 
541  """
542  Plots distribution of a quantity
543  """
544 
545  def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
546  keep_first_binning=False, range_in_std=None):
547  """
548  Creates a new figure and axis if None is given, sets the default plot parameters
549  @param figure default draw figure which is used
550  @param axis default draw axis which is used
551  @param normed true if histograms should be normed before drawing
552  @param keep_first_binning use the binning of the first distribution for further plots
553  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
554  """
555  super(Distribution, self).__init__(figure, axis)
556 
557  self.normed_to_all_entries = normed_to_all_entries
558 
559  self.normed_to_bin_width = normed_to_bin_width
560 
561  self.range_in_std = range_in_std
562  # if self.normed_to_all_entries or self.normed_to_bin_width:
563 
564  self.ymin = float(0)
565 
566  self.ymax = float('-inf')
567 
568  self.xmin = float('inf')
569 
570  self.xmax = float('-inf')
571 
572  self.keep_first_binning = keep_first_binning
573 
574  self.first_binning = None
575 
576  self.x_axis_label = ''
577 
578  def add(self, data, column, mask=None, weight_column=None, label=None):
579  """
580  Add a new distribution to the plots
581  @param data pandas.DataFrame containing all data
582  @param column which is used to calculate distribution histogram
583  @param mask boolean numpy.array defining which events are used for the histogram
584  @param weight_column column in data containing the weights for each event
585  """
586  if mask is None:
587  mask = numpy.ones(len(data)).astype('bool')
588 
589  bins = 100
590  if self.keep_first_binning and self.first_binning is not None:
591  bins = self.first_binning
592  hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
593  bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
594  if self.keep_first_binning and self.first_binning is None:
595  self.first_binning = hists.bins
596  hist, hist_error = hists.get_hist('Total')
597 
598  if self.normed_to_all_entries:
599  normalization = float(numpy.sum(hist))
600  hist = hist / normalization
601  hist_error = hist_error / normalization
602 
603  if self.normed_to_bin_width:
604  hist = hist / hists.bin_widths
605  hist_error = hist_error / hists.bin_widths
606 
607  self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
608  self.ymin = numpy.nanmin([hist.min(), self.ymin])
609  self.ymax = numpy.nanmax([(hist + hist_error).max(), self.ymax])
610 
611  p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
612  self.plots.append(p)
613  self.x_axis_label = column
614 
615  appendix = ''
616  if self.ymax <= self.ymin or self.xmax <= self.xmin:
617  appendix = ' No data to plot!'
618 
619  if label is None:
620  self.labels.append(column + appendix)
621  else:
622  self.labels.append(label + appendix)
623  return self
624 
625  def finish(self):
626  """
627  Sets limits, title, axis-labels and legend of the plot
628  """
629  self.axis.set_title("Distribution Plot")
630  self.axis.get_xaxis().set_label_text(self.x_axis_label)
631 
632  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
633 
634  if self.ymax <= self.ymin or self.xmax <= self.xmin:
635  self.axis.set_xlim((0., 1.))
636  self.axis.set_ylim((0., 1.))
637  self.axis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
638  return self
639 
640  self.scale_limits()
641 
642  self.axis.set_xlim((self.xmin, self.xmax))
643  self.axis.set_ylim((self.ymin, self.ymax))
644 
646  self.axis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
647  elif self.normed_to_all_entries:
648  self.axis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
649  elif self.normed_to_bin_width:
650  self.axis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
651  else:
652  self.axis.get_yaxis().set_label_text('# Entries per Bin')
653 
654  return self
655 
656 
class Box(Plotter):
    """
    Create a boxplot
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)

        # Label of the x axis; set to the column name by add()
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot, all events if None
        @param weight_column column in data containing the weights for each event (ignored, a warning is issued)
        @return self
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')
        x = data[column][mask]
        if weight_column is not None:
            # The weights are not looked up at all, matplotlib's boxplot cannot use them
            B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(x) == 0:
            B2WARNING("Ignore empty boxplot.")
            return self

        p = self.axis.boxplot(x, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
                              boxprops=dict(facecolor='blue', alpha=0.5),
                              # medianprobs=dict(color='blue'),
                              # meanprobs=dict(color='red'),
                              )
        self.plots.append(p)
        self.labels.append(column)
        self.x_axis_label = column

        # Disabled text annotations (mean/median, sigma/IQD, min/max), kept for reference:
        # self.axis.text(0.1, 0.9, (r'$     \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
        #                fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
        # self.axis.text(0.4, 0.9, (r'$  \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
        #                                                                                    x.quantile(0.75) - x.quantile(0.25)),
        #                fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
        # self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
        #                fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)

        return self

    def finish(self):
        """
        Hides the y axis and sets title and x-axis label of the plot
        @return self
        """
        matplotlib.artist.setp(self.axis.get_yaxis(), visible=False)
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_title("Box Plot")
        return self
722 
723 
725  """
726  Plots the difference between two histograms
727  """
728 
740 
741  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
742  """
743  Creates a new figure and axis if None is given, sets the default plot parameters
744  @param figure default draw figure which is used
745  @param axis default draw axis which is used
746  @param normed normalize minuend and subtrahend before comparing them
747  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
748  """
749  super(Difference, self).__init__(figure, axis)
750  self.normed = normed
751  self.shift_to_zero = shift_to_zero
752  if self.normed:
753  self.ymin = -0.01
754  self.ymax = 0.01
755  else:
756  self.ymin = -1
757  self.ymax = 1
758 
759  def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
760  """
761  Add a new difference plot
762  @param data pandas.DataFrame containing all data
763  @param column which is used to calculate distribution histogram
764  @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
765  @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
766  @param weight_column column in data containing the weights for each event
767  @param label label for the legend if None, the column name is used
768  """
769  hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
770  weight_column=weight_column, equal_frequency=False)
771  minuend, minuend_error = hists.get_hist('Minuend')
772  subtrahend, subtrahend_error = hists.get_hist('Subtrahend')
773 
774  difference_error = histogram.poisson_error(minuend + subtrahend)
775  if self.normed:
776  difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
777  minuend = minuend / numpy.sum(minuend)
778  subtrahend = subtrahend / numpy.sum(subtrahend)
779  difference = minuend - subtrahend
780 
781  if self.shift_to_zero:
782  difference = difference - numpy.mean(difference)
783 
784  self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
785  self.ymin = min((difference - difference_error).min(), self.ymin)
786  self.ymax = max((difference + difference_error).max(), self.ymax)
787 
788  p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
789  self.plots.append(p)
790  if label is None:
791  self.labels.append(label)
792  else:
793  self.labels.append(column)
794  self.x_axis_label = column
795  return self
796 
797  def finish(self, line_color='black'):
798  """
799  Sets limits, title, axis-labels and legend of the plot
800  """
801  self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
802  self.scale_limits()
803  self.axis.set_xlim((self.xmin, self.xmax))
804  self.axis.set_ylim((self.ymin, self.ymax))
805  self.axis.set_title("Difference Plot")
806  self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
807  self.axis.get_xaxis().set_label_text(self.x_axis_label)
808  self.axis.get_yaxis().set_label_text('Difference')
809  self.axis.legend([x[0] for x in self.plots], self.labels, loc='best', fancybox=True, framealpha=0.5)
810  return self
811 
812 
814  """
815  Create TMVA-like overtraining control plot for a classification training
816  """
817 
818 
    # Figure which is drawn on; assigned in __init__
    figure = None

    # Main axis; __init__ places it on the upper three of five grid rows
    axis = None

    # First difference axis (fourth grid row); add() draws the signal train/test difference on it
    axis_d1 = None

    # Second difference axis (fifth grid row); add() draws the background train/test difference on it
    axis_d2 = None
826 
827  def __init__(self, figure=None):
828  """
829  Creates a new figure if None is given, sets the default plot parameters
830  @param figure default draw figure which is used
831  """
832  if figure is None:
833  self.figure = matplotlib.figure.Figure(figsize=(32, 18))
834  self.figure.set_tight_layout(True)
835  else:
836  self.figure = figure
837 
838  gs = matplotlib.gridspec.GridSpec(5, 1)
839  self.axis = self.figure.add_subplot(gs[:3, :])
840  self.axis_d1 = self.figure.add_subplot(gs[3, :], sharex=self.axis)
841  self.axis_d2 = self.figure.add_subplot(gs[4, :], sharex=self.axis)
842 
843  super(Overtraining, self).__init__(self.figure, self.axis)
844 
845  def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
846  """
847  Add a new overtraining plot, I recommend to raw only one overtraining plot at the time,
848  otherwise there are too many curves in the plot to reconize anything in the plot.
849  @param data pandas.DataFrame containing all data
850  @param column which is used to calculate distribution histogram
851  @param train_mask boolean numpy.array defining which events are training events
852  @param test_mask boolean numpy.array defining which events are test events
853  @param signal_mask boolean numpy.array defining which events are signal events
854  @param bckgrd_mask boolean numpy.array defining which events are background events
855  @param weight_column column in data containing the weights for each event
856  """
857  distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)
858 
859  distribution.set_plot_options(self.plot_kwargs)
860  distribution.set_errorbar_options(self.errorbar_kwargs)
861  distribution.set_errorband_options(self.errorband_kwargs)
862  distribution.add(data, column, test_mask & signal_mask, weight_column)
863  distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
864 
865  distribution.set_plot_options({'color': distribution.plots[0][0][0].get_color(), 'linestyle': 'steps-mid-', 'lw': 4})
866  distribution.set_fill_options({'color': distribution.plots[0][0][0].get_color(), 'alpha': 0.5, 'step': 'mid'})
867  distribution.set_errorbar_options(None)
868  distribution.set_errorband_options(None)
869  distribution.add(data, column, train_mask & signal_mask, weight_column)
870  distribution.set_plot_options({'color': distribution.plots[1][0][0].get_color(), 'linestyle': 'steps-mid-', 'lw': 4})
871  distribution.set_fill_options({'color': distribution.plots[1][0][0].get_color(), 'alpha': 0.5, 'step': 'mid'})
872  distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
873 
874  distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
875  distribution.finish()
876 
877  self.plot_kwargs['color'] = distribution.plots[0][0][0].get_color()
878  difference_signal = Difference(self.figure, self.axis_d1, shift_to_zero=True, normed=True)
879  difference_signal.set_plot_options(self.plot_kwargs)
880  difference_signal.set_errorbar_options(self.errorbar_kwargs)
881  difference_signal.set_errorband_options(self.errorband_kwargs)
882  difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
883  self.axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
884  self.axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
885  difference_signal.plots = difference_signal.labels = []
886  difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
887 
888  self.plot_kwargs['color'] = distribution.plots[1][0][0].get_color()
889  difference_bckgrd = Difference(self.figure, self.axis_d2, shift_to_zero=True, normed=True)
890  difference_bckgrd.set_plot_options(self.plot_kwargs)
891  difference_bckgrd.set_errorbar_options(self.errorbar_kwargs)
892  difference_bckgrd.set_errorband_options(self.errorband_kwargs)
893  difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
894  self.axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
895  self.axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
896  difference_bckgrd.plots = difference_bckgrd.labels = []
897  difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
898 
899  try:
900  import scipy.stats
901  # Kolmogorov smirnov test
902  if len(data[column][train_mask & signal_mask]) == 0 or len(data[column][test_mask & signal_mask]) == 0:
903  B2WARNING("Cannot calculate kolmogorov smirnov test for signal due to missing data")
904  else:
905  ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
906  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
907  self.axis_d1.text(0.1, 0.9, r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
908  verticalalignment='top', horizontalalignment='left', transform=self.axis_d1.transAxes)
909  if len(data[column][train_mask & bckgrd_mask]) == 0 or len(data[column][test_mask & bckgrd_mask]) == 0:
910  B2WARNING("Cannot calculate kolmogorov smirnov test for background due to missing data")
911  else:
912  ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
913  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
914  self.axis_d2.text(0.1, 0.9, r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
915  bbox=props,
916  verticalalignment='top', horizontalalignment='left', transform=self.axis_d2.transAxes)
917  except ImportError:
918  B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
919 
920  return self
921 
922  def finish(self):
923  """
924  Sets limits, title, axis-labels and legend of the plot
925  """
926  self.axis.set_title("Overtraining Plot")
927  self.axis_d1.set_title("")
928  self.axis_d2.set_title("")
929  matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
930  matplotlib.artist.setp(self.axis_d1.get_xticklabels(), visible=False)
931  self.axis.get_xaxis().set_label_text('')
932  self.axis_d1.get_xaxis().set_label_text('')
933  self.axis_d2.get_xaxis().set_label_text('Classifier Output')
934  return self
935 
936 
938  """
939  Plots distribution of a quantity including boxplots
940  """
941 
942 
943  box_axes = None
944 
945  def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
946  """
947  Creates a new figure and axis if None is given, sets the default plot parameters
948  @param figure default draw figure which is used
949  @param axis default draw axis which is used
950  @param normed true if the histograms should be normed before drawing
951  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
952  """
953  super(VerboseDistribution, self).__init__(figure, axis)
954 
955  self.normed = normed
956 
957  self.range_in_std = range_in_std
958  self.box_axes = []
959 
960  self.distribution = Distribution(self.figure, self.axis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)
961 
962  def add(self, data, column, mask=None, weight_column=None, label=None):
963  """
964  Add a new distribution plot, with additional information like a boxplot compared to
965  the ordinary Distribution plot.
966  @param data pandas.DataFrame containing all data
967  @param column which is used to calculate distribution histogram
968  @param mask boolean numpy.array defining which events are used for the distribution histogram
969  @param weight_column column in data containing the weights for each event
970  """
974  self.distribution.add(data, column, mask, weight_column, label=label)
975 
976  n = len(self.box_axes) + 1
977  gs = matplotlib.gridspec.GridSpec(4 * n, 1)
978  gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
979  box_axis = self.add_subplot(gridspecs)
980 
981  if self.range_in_std is not None:
982  mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
983  # Everything outside mean +- range_in_std * std is considered not inside the mask
984  mask = mask & (data[column] > (mean - self.range_in_std * std)) & (data[column] < (mean + self.range_in_std * std))
985  box = Box(self.figure, box_axis)
986  box.add(data, column, mask, weight_column)
987  if len(box.plots) > 0:
988  box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
989  box.finish()
990 
991  self.box_axes.append(box_axis)
992  return self
993 
994  def finish(self):
995  """
996  Sets limits, title, axis-labels and legend of the plot
997  """
998  self.distribution.finish()
999  matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
1000  self.axis.get_xaxis().set_label_text('')
1001  for box_axis in self.box_axes[:-1]:
1002  matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
1003  box_axis.set_title("")
1004  box_axis.get_xaxis().set_label_text('')
1005  self.box_axes[-1].set_title("")
1006  self.axis.set_title("Distribution Plot")
1007  self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
1008  loc='best', fancybox=True, framealpha=0.5)
1009  return self
1010 
1011 
1013  """
1014  Plots change of a distribution of a quantity depending on the cut on a classifier
1015  """
1016 
1017  figure = None
1018 
1019  axis = None
1020 
1021  axis_d1 = None
1022 
1023  axis_d2 = None
1024 
1025  def __init__(self, figure=None):
1026  """
1027  Creates a new figure if None is given, sets the default plot parameters
1028  @param figure default draw figure which is used
1029  """
1030  if figure is None:
1031  self.figure = matplotlib.figure.Figure(figsize=(32, 18))
1032  self.figure.set_tight_layout(True)
1033  else:
1034  self.figure = figure
1035 
1036  gs = matplotlib.gridspec.GridSpec(3, 2)
1037  self.axis = self.figure.add_subplot(gs[0, :])
1038  self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
1039  self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)
1040 
1041  super(Correlation, self).__init__(self.figure, self.axis)
1042 
1043  def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1044  """
1045  Add a new correlation plot.
1046  @param data pandas.DataFrame containing all data
1047  @param column which is used to calculate distribution histogram
1048  @param cut_column which is used to calculate cut on the other quantity defined by column
1049  @param quantiles list of quantiles between 0 and 100, defining the different cuts
1050  @param weight_column column in data containing the weights for each event
1051  """
1052  if len(data[cut_column]) == 0:
1053  B2WARNING("Ignore empty Correlation.")
1054  return self
1055 
1056  axes = [self.axis, self.axis_d1, self.axis_d2]
1057 
1058  for i, (l, m) in enumerate([('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask)]):
1059 
1060  if weight_column is not None:
1061  weights = numpy.array(data[weight_column][m])
1062  else:
1063  weights = numpy.ones(len(data[column][m]))
1064 
1065  # The cast to float32 is a workaround for the following numpy issue:
1066  # https://github.com/numpy/numpy/issues/8123
1067  xrange = np.percentile(data[column][m], [5, 95]).astype(np.float32)
1068 
1069  colormap = plt.get_cmap('coolwarm')
1070  tmp, x = np.histogram(data[column][m], bins=100,
1071  range=xrange, normed=True, weights=weights)
1072  bin_center = ((x + np.roll(x, 1)) / 2)[1:]
1073  axes[i].plot(bin_center, tmp, color='black', lw=1)
1074 
1075  for quantil in np.arange(5, 100, 5):
1076  cut = np.percentile(data[cut_column][m], quantil)
1077  sel = data[cut_column][m] >= cut
1078  y, x = np.histogram(data[column][m][sel], bins=100,
1079  range=xrange, normed=True, weights=weights[sel])
1080  bin_center = ((x + np.roll(x, 1)) / 2)[1:]
1081  axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1082  tmp = y
1083 
1084  axes[i].set_ylim(bottom=0)
1085 
1086  flatness_score = basf2_mva_util.calculate_flatness(data[column][m], data[cut_column][m], weights)
1087  axes[i].set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1088  return self
1089 
1090  def finish(self):
1091  """
1092  Sets limits, title, axis-labels and legend of the plot
1093  """
1094  return self
1095 
1096 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.
        The two-dimensional embedding is calculated once from all events; afterwards the
        embedded points selected by each mask are drawn as a separate scatter plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input features of the embedding
        @param masks different classes to show in TSNE, boolean arrays with one entry per event
        @return self
        """
        try:
            import sklearn.manifold
        except ImportError:
            print("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE offers fit_transform only and cannot embed additional points
        # afterwards, therefore everything is embedded at once and selected per mask
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
        return self

    def finish(self):
        """
        Nothing is left to do here, all drawing already happens in add.
        @return self
        """
        return self
1128 
1129 
1131  """
1132  Plots importance matrix
1133  """
1134 
1135  def add(self, data, columns, variables):
1136  """
1137  Add a new correlation plot.
1138  @param data pandas.DataFrame containing all data
1139  @param columns which are used to calculate the correlations
1140  """
1141  self.figure.set_tight_layout(True)
1142 
1143  def norm(x):
1144  width = (numpy.max(x) - numpy.min(x))
1145  if width <= 0:
1146  return numpy.zeros(x.shape)
1147  return (x - numpy.min(x)) / width * 100
1148 
1149  importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
1150  importance_heatmap = self.axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1151  rasterized=True)
1152 
1153  # put the major ticks at the middle of each cell
1154  self.axis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=False)
1155  self.axis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=False)
1156 
1157  self.axis.set_xticklabels(columns, minor=False, rotation=90)
1158  self.axis.set_yticklabels(variables, minor=False)
1159 
1160  self.axis.xaxis.tick_top()
1161 
1162  for y in range(importance_matrix.shape[0]):
1163  for x in range(importance_matrix.shape[1]):
1164  self.axis.text(x + 0.5, y + 0.5, '%.0f' % importance_matrix[y, x],
1165  size=14,
1166  horizontalalignment='center',
1167  verticalalignment='center')
1168 
1169  cb = self.figure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
1170  cb.ax.set_yticklabels(['low', 'high'])
1171 
1172  self.axis.set_aspect('equal')
1173 
1174  return self
1175 
1176  def finish(self):
1177  """
1178  Sets limits, title, axis-labels and legend of the plot
1179  """
1180  return self
1181 
1182 
1184  """
1185  Plots correlation matrix
1186  """
1187 
1188  figure = None
1189 
1190  signal_axis = None
1191 
1192  bckgrd_axis = None
1193 
1194  def __init__(self, figure=None):
1195  """
1196  Creates a new figure if None is given, sets the default plot parameters
1197  @param figure default draw figure which is used
1198  """
1199  if figure is None:
1200  self.figure = matplotlib.figure.Figure(figsize=(32, 18))
1201  self.figure.set_tight_layout(True)
1202  else:
1203  self.figure = figure
1204 
1205  gs = matplotlib.gridspec.GridSpec(8, 2)
1206  self.signal_axis = self.figure.add_subplot(gs[:6, 0])
1207  self.bckgrd_axis = self.figure.add_subplot(gs[:6, 1], sharey=self.signal_axis)
1208 
1209  self.colorbar_axis = self.figure.add_subplot(gs[7, :])
1210 
1211  self.axis = self.signal_axis
1212 
1213  super(CorrelationMatrix, self).__init__(self.figure, self.axis)
1214 
1215  def add(self, data, columns, signal_mask, bckgrd_mask):
1216  """
1217  Add a new correlation plot.
1218  @param data pandas.DataFrame containing all data
1219  @param columns which are used to calculate the correlations
1220  """
1221  signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
1222  bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100
1223 
1224  signal_heatmap = self.signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1225 
1226  bckgrd_heatmap = self.bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1227 
1228  self.signal_axis.invert_yaxis()
1229  self.signal_axis.xaxis.tick_top()
1230  self.bckgrd_axis.invert_yaxis()
1231  self.bckgrd_axis.xaxis.tick_top()
1232 
1233  # put the major ticks at the middle of each cell
1234  self.signal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=False)
1235  self.signal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=False)
1236 
1237  self.signal_axis.set_xticklabels(columns, minor=False, rotation=90)
1238  self.signal_axis.set_yticklabels(columns, minor=False)
1239 
1240  # put the major ticks at the middle of each cell
1241  self.bckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=False)
1242  self.bckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=False)
1243 
1244  self.bckgrd_axis.set_xticklabels(columns, minor=False, rotation=90)
1245  self.bckgrd_axis.set_yticklabels(columns, minor=False)
1246 
1247  for y in range(signal_corr.shape[0]):
1248  for x in range(signal_corr.shape[1]):
1249  self.signal_axis.text(x + 0.5, y + 0.5, '%.0f' % signal_corr[y, x],
1250  size=14,
1251  horizontalalignment='center',
1252  verticalalignment='center')
1253 
1254  for y in range(bckgrd_corr.shape[0]):
1255  for x in range(bckgrd_corr.shape[1]):
1256  self.bckgrd_axis.text(x + 0.5, y + 0.5, '%.0f' % bckgrd_corr[y, x],
1257  size=14,
1258  horizontalalignment='center',
1259  verticalalignment='center')
1260 
1261  cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-100, 0, 100], orientation='horizontal')
1262  cb.solids.set_rasterized(True)
1263  cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])
1264 
1265  self.signal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
1266  self.bckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')
1267 
1268  return self
1269 
1270  def finish(self):
1271  """
1272  Sets limits, title, axis-labels and legend of the plot
1273  """
1274  matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
1275  return self
1276 
1277 
if __name__ == '__main__':
    # Example script which creates one example of several plot types from fake data.
    # pandas and seaborn are needed by this example only, so they are imported here.
    import pandas
    import seaborn

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last column is filled with 0 for the first half and 1 for the second
        @return pandas.DataFrame with randomly permuted rows
        """
        # Integer division: N is used as an array size below
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # First half of the (already shuffled) rows is training data, second half is test data
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test')
    p.add(data, 'NeuroBayes', data['type'] == 'Train', data['type'] == 'Test')
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation.add combines and indexes with both masks, so both have to be given
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # CorrelationMatrix.add requires the signal and the background mask
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_matrix.png')
plotting.VerboseDistribution
Definition: plotting.py:937
plotting.CorrelationMatrix.signal_axis
signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1190
plotting.Importance.add
def add(self, data, columns, variables)
Definition: plotting.py:1135
plotting.Difference
Definition: plotting.py:724
plotting.Overtraining.axis_d2
axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:825
plotting.Correlation
Definition: plotting.py:1012
plotting.RejectionOverEfficiency.finish
def finish(self)
Definition: plotting.py:432
plotting.Plotter.set_fill_options
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:155
plotting.PurityAndEfficiencyOverCut
Definition: plotting.py:241
plotting.RejectionOverEfficiency
Definition: plotting.py:396
plotting.CorrelationMatrix.bckgrd_axis
bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1192
plotting.PurityOverEfficiency
Definition: plotting.py:350
plotting.TSNE.add
def add(self, data, columns, *masks)
Definition: plotting.py:1102
plotting.Plotter.errorbar_kwargs
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:97
plotting.Plotter.errorband_kwargs
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:99
plotting.Diagonal
Definition: plotting.py:495
plotting.Plotter.xmax
xmax
Maximum x value.
Definition: plotting.py:55
plotting.Plotter.xscale
float xscale
Definition: plotting.py:61
plotting.Correlation.add
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1043
plotting.RejectionOverEfficiency.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:405
plotting.Plotter.ymin
ymin
Minimum y value.
Definition: plotting.py:57
plotting.Plotter.axis
axis
Main axis which is used to draw.
Definition: plotting.py:65
plotting.Overtraining.__init__
def __init__(self, figure=None)
Definition: plotting.py:827
plotting.Plotter.__init__
def __init__(self, figure=None, axis=None)
Definition: plotting.py:67
plotting.CorrelationMatrix
Definition: plotting.py:1183
plotting.Plotter.scale_limits
def scale_limits(self)
Definition: plotting.py:230
plotting.Distribution.range_in_std
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:560
plotting.TSNE.finish
def finish(self)
Definition: plotting.py:1123
plotting.Plotter.add_subplot
def add_subplot(self, gridspecs)
Definition: plotting.py:108
plot
Definition: plot.py:1
plotting.VerboseDistribution.distribution
distribution
The distribution plot.
Definition: plotting.py:960
plotting.SignalToNoiseOverCut
Definition: plotting.py:303
histogram.Histograms
Definition: histogram.py:38
plotting.CorrelationMatrix.__init__
def __init__(self, figure=None)
Definition: plotting.py:1194
plotting.VerboseDistribution.__init__
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:945
plotting.Difference.__init__
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:741
plotting.Distribution.finish
def finish(self)
Definition: plotting.py:625
plotting.Box.x_axis_label
x_axis_label
Label on x axis.
Definition: plotting.py:673
plotting.Plotter.plots
plots
Plots added to the axis so far.
Definition: plotting.py:49
plotting.Plotter._plot_datapoints
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:163
plotting.Plotter.xmin
xmin
Minimum x value.
Definition: plotting.py:53
plotting.Difference.finish
def finish(self, line_color='black')
Definition: plotting.py:797
plotting.Plotter.labels
labels
Labels of the plots added so far.
Definition: plotting.py:51
plotting.Plotter.fill_kwargs
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:101
plotting.Distribution.add
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:578
plotting.Plotter.yscale
float yscale
Definition: plotting.py:60
plotting.CorrelationMatrix.colorbar_axis
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1209
plotting.Correlation.finish
def finish(self)
Definition: plotting.py:1090
plotting.PurityOverEfficiency.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:359
plotting.Importance
Definition: plotting.py:1130
plotting.Correlation.axis_d1
axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1021
plotting.Multiplot.add
def add(self, i, *args, **kwargs)
Definition: plotting.py:479
plotting.Distribution.first_binning
first_binning
first binning
Definition: plotting.py:573
plotting.Correlation.axis_d2
axis_d2
Axis which shows shape of background.
Definition: plotting.py:1023
plotting.Multiplot.__init__
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:454
plotting.Difference.normed
normed
Definition: plotting.py:750
plotting.Plotter
Definition: plotting.py:33
plotting.Overtraining
Definition: plotting.py:813
plotting.Plotter.set_errorband_options
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:147
plotting.CorrelationMatrix.add
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1215
plotting.Diagonal.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:504
plotting.Distribution.x_axis_label
x_axis_label
x axis label
Definition: plotting.py:575
plotting.PurityAndEfficiencyOverCut.finish
def finish(self)
Definition: plotting.py:291
plotting.Box.__init__
def __init__(self, figure=None, axis=None)
Definition: plotting.py:664
plotting.Plotter.save
def save(self, filename)
Definition: plotting.py:120
plotting.SignalToNoiseOverCut.finish
def finish(self)
Definition: plotting.py:338
plotting.VerboseDistribution.add
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:962
plotting.Plotter.figure
figure
figure which is used to draw
Definition: plotting.py:63
plotting.Plotter.ymax
ymax
Maximum y value.
Definition: plotting.py:59
plotting.Distribution.__init__
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:545
plotting.Difference.x_axis_label
x_axis_label
Definition: plotting.py:794
plotting.Multiplot
Definition: plotting.py:445
plotting.Box
Definition: plotting.py:657
plotting.TSNE
Definition: plotting.py:1097
plotting.Distribution.keep_first_binning
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:571
plotting.Multiplot.finish
def finish(self)
Definition: plotting.py:486
histogram.weighted_mean_and_std
def weighted_mean_and_std(x, w)
Definition: histogram.py:27
plotting.PurityAndEfficiencyOverCut.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:250
plotting.VerboseDistribution.range_in_std
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:957
plotting.SignalToNoiseOverCut.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:312
plotting.Overtraining.finish
def finish(self)
Definition: plotting.py:922
plotting.Difference.add
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:759
plotting.VerboseDistribution.normed
normed
Normalize histograms before drawing them.
Definition: plotting.py:955
plotting.Distribution.normed_to_all_entries
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:556
plotting.Box.add
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:675
plotting.VerboseDistribution.finish
def finish(self)
Definition: plotting.py:994
plotting.Multiplot.sub_plots
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:475
plotting.CorrelationMatrix.finish
def finish(self)
Definition: plotting.py:1270
plotting.Plotter.set_errorbar_options
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Definition: plotting.py:139
plotting.Plotter.plot_kwargs
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:95
plotting.Plotter.add
def add(self, *args, **kwargs)
Definition: plotting.py:218
histogram.poisson_error
def poisson_error(n_tot)
Definition: histogram.py:20
plotting.PurityOverEfficiency.finish
def finish(self)
Definition: plotting.py:383
plotting.Correlation.__init__
def __init__(self, figure=None)
Definition: plotting.py:1025
plotting.Diagonal.finish
def finish(self)
Definition: plotting.py:525
plotting.Overtraining.axis_d1
axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:823
plotting.Importance.finish
def finish(self)
Definition: plotting.py:1176
plotting.Overtraining.add
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:845
plotting.Plotter.set_plot_options
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:131
plotting.Distribution
Definition: plotting.py:540
basf2_mva_util.calculate_flatness
def calculate_flatness(f, p, w=None)
Definition: basf2_mva_util.py:54
plotting.Distribution.normed_to_bin_width
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:558
plotting.Plotter.finish
def finish(self, *args, **kwargs)
Definition: plotting.py:224
plotting.VerboseDistribution.box_axes
box_axes
Axes for the boxplots.
Definition: plotting.py:943
plotting.Box.finish
def finish(self)
Definition: plotting.py:714
plotting.Difference.shift_to_zero
shift_to_zero
Definition: plotting.py:751