Belle II Software  release-06-01-15
ftPlotting.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 # @cond SUPPRESS_DOXYGEN
13 
14 import basf2_mva_util
15 from basf2 import B2INFO, B2WARNING
16 import basf2_mva_evaluation.histogram as histogram
17 import matplotlib.ticker
18 import matplotlib.patches
19 import matplotlib.colors
20 import matplotlib.gridspec
21 import matplotlib.figure
22 import matplotlib.artist
23 import matplotlib.pyplot as plt
24 import copy
25 import math
26 import pandas
27 import numpy
28 import numpy as np
29 import matplotlib
# Do not use the standard TkAgg backend, because it is NOT thread-safe.
# Otherwise you will get a "RuntimeError: main thread is not in main loop"!
matplotlib.use("svg")
matplotlib.rcParams.update({'font.size': 40})
matplotlib.rcParams['text.usetex'] = True
# The preamble is given as a single string; the list form is deprecated in
# newer matplotlib releases.
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath}"
36 
37 
class Plotter(object):
    """
    Base class for all Plotters.

    Stores the figure and axis that are drawn on, the artists and legend labels
    collected so far, the current axis limits and the keyword arguments used by
    _plot_datapoints. Subclasses provide add() and finish().
    """

    #: artists of all curves added so far (one entry per _plot_datapoints call)
    plots = None
    #: legend labels belonging to the entries of plots
    labels = None
    #: lower limit of the x axis
    xmin = None
    #: upper limit of the x axis
    xmax = None
    #: lower limit of the y axis
    ymin = None
    #: upper limit of the y axis
    ymax = None
    #: relative margin applied to the y limits by scale_limits
    yscale = 0.0
    #: relative margin applied to the x limits by scale_limits
    xscale = 0.0
    #: figure which is used to draw
    figure = None
    #: main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        #: keyword arguments for axis.plot (None disables the plot call)
        self.plot_kwargs = None
        #: keyword arguments for axis.errorbar (None disables the errorbars)
        self.errorbar_kwargs = None
        #: keyword arguments for the error band (None disables the band)
        self.errorband_kwargs = None
        #: keyword arguments for the fill between curve and zero (None disables it)
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with self.axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults of the set_*_options methods are intentionally
    # kept. None is a meaningful argument (it disables the corresponding
    # drawing step) and the defaults are only ever copied, never mutated.
    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function
        @return self
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for datapoint errorband
        @param fill_kwargs keyword arguments for the fill_between function
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis on which the datapoints are drawn
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (legend handles, plot artist, errorbar artist, errorband artist);
                artists of disabled steps are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is None or 'color' not in plot_kwargs:
            # No color requested: take the next one from the axis color cycle
            color = next(axis._get_lines.prop_cycler)['color']
            # plot_kwargs may be None (plot step disabled); only store the
            # color when there is a dict to store it in
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        patch = matplotlib.patches.Patch(color=color, alpha=0.7)
        # Give the patch a get_color accessor like the line artists have
        # (presumably needed by the legend handling; verify against caller)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the curve color
                errorbar_kwargs['ecolor'] = [0.4 * channel for channel in color]
            # The line width of the error bars is always forced to 5, this
            # overrides any value given via set_errorbar_options
            errorbar_kwargs['elinewidth'] = 5
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # One rectangle per data point spanning +- the errors
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter.
        The base class does nothing and returns NotImplemented.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff.
        The base class does nothing and returns NotImplemented.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        @return self
        """
        # copysign keeps the margin pointing away from the data for both
        # positive and negative limits
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
238 
239 
class PurityAndEfficiencyOverCut(Plotter):
    """
    Draws purity and efficiency as a function of the cut value (useful to choose a cut)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Draw the two curves belonging to one classifier column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed if True efficiency and purity are drawn, otherwise the
               true positives and false positives returned by the histograms
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        if normed:
            first, first_error = hists.get_efficiency(['Signal'])
            second, second_error = hists.get_purity(['Signal'], ['Background'])
            legend_entries = ("Efficiency", "Purity")
        else:
            first, first_error = hists.get_true_positives(['Signal'])
            second, second_error = hists.get_false_positives(['Background'])
            legend_entries = ("True positive", "False positive")

        cuts = hists.bin_centers

        # Grow the stored limits so that they contain the new curves
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        lowest = numpy.nanmin([numpy.nanmin(first), numpy.nanmin(second), self.ymin])
        highest = numpy.nanmax([numpy.nanmax(first), numpy.nanmax(second), self.ymax])
        self.ymin, self.ymax = lowest, highest

        curves = ((first, first_error), (second, second_error))
        for (values, errors), entry in zip(curves, legend_entries):
            artists = self._plot_datapoints(self.axis, cuts, values, xerr=0, yerr=errors)
            self.plots.append(artists)
            self.labels.append(entry)

        return self

    def finish(self):
        """
        Apply limits, title, x-axis label and legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Classification Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
300 
301 
class SignalToNoiseOverCut(Plotter):
    """
    Draws the signal to noise ratio as a function of the cut value (useful to choose a cut)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Draw the signal to noise curve of one classifier column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate signal to noise ratio for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed accepted for signature compatibility, not used by this plotter
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        ratio, ratio_error = hists.get_signal_to_noise(['Signal'], ['Background'])

        cuts = hists.bin_centers

        # Grow the stored limits so that they contain the new curve
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        lowest = numpy.nanmin([numpy.nanmin(ratio), self.ymin])
        highest = numpy.nanmax([numpy.nanmax(ratio), self.ymax])
        self.ymin, self.ymax = lowest, highest

        artists = self._plot_datapoints(self.axis, cuts, ratio, xerr=0, yerr=ratio_error)
        self.plots.append(artists)
        self.labels.append(column)

        return self

    def finish(self):
        """
        Apply limits, title, x-axis label and legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Signal to Noise Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
347 
348 
class PurityOverEfficiency(Plotter):
    """
    Draws the purity against the efficiency (purity ROC curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Draw the purity over efficiency curve of one classifier column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label legend entry of the curve, the column name is used if None
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])

        # Grow the stored limits so that they contain the new curve
        left = numpy.nanmin([efficiency.min(), self.xmin])
        right = numpy.nanmax([efficiency.max(), self.xmax])
        self.xmin, self.xmax = left, right
        bottom = numpy.nanmin([numpy.nanmin(purity), self.ymin])
        top = numpy.nanmax([numpy.nanmax(purity), self.ymax])
        self.ymin, self.ymax = bottom, top

        artists = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
        self.plots.append(artists)
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Apply limits, title, axis labels and legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("ROC Purity Plot")
        axis.get_xaxis().set_label_text('Efficiency')
        axis.get_yaxis().set_label_text('Purity')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
393 
394 
class RejectionOverEfficiency(Plotter):
    """
    Draws the background rejection against the signal efficiency (ROC curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Draw the ROC curve of one classifier column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and rejection for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label prefix of the legend entry (only its first 10 characters are used)
        @return the area under the drawn curve (note: not self)
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        bckgrd_efficiency, rejection_error = hists.get_efficiency(['Background'])
        # Rejection is the complement of the background efficiency
        rejection = 1 - bckgrd_efficiency

        # Grow the stored limits so that they contain the new curve
        left = numpy.nanmin([efficiency.min(), self.xmin])
        right = numpy.nanmax([efficiency.max(), self.xmax])
        self.xmin, self.xmax = left, right
        bottom = numpy.nanmin([rejection.min(), self.ymin])
        top = numpy.nanmax([rejection.max(), self.ymax])
        self.ymin, self.ymax = bottom, top

        # Trapezoidal integral of the curve; abs() makes the sign independent
        # of the ordering of the efficiency values
        auc = numpy.abs(numpy.trapz(rejection, efficiency))

        artists = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
        self.plots.append(artists)

        auc_text = r"${:.2f}$".format(auc)
        if label is None:
            legend_entry = r"${\rm AUC}\ =\ $" + auc_text
        else:
            legend_entry = label[:10] + r"$\ {\rm AUC}\ =\ $" + auc_text
        self.labels.append(legend_entry)

        return auc

    def finish(self):
        """
        Apply limits, grid, tick and label formatting and the legend to the axis
        """
        axis = self.axis
        x_axis = axis.get_xaxis()
        y_axis = axis.get_yaxis()

        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        x_axis.set_tick_params(labelsize=60)
        y_axis.set_tick_params(labelsize=60)
        axis.grid(True)
        x_axis.labelpad = 20
        y_axis.labelpad = 20
        x_axis.set_label_text(r'${\rm Signal\ Efficiency}$', fontsize=65)
        y_axis.set_label_text(r'${\rm Background\ Rejection}$', fontsize=65)
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, fancybox=True, framealpha=0.5, fontsize=60, loc=3)
        return self
448 
449 
class Multiplot(Plotter):
    """
    Plots multiple other plots into a grid with at most 3 columns
    """

    #: figure which is used to draw
    figure = None
    #: axis of the first sub plot
    axis = None

    def __init__(self, cls, number_of_plots, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param cls Plotter class which is instantiated once per sub plot
        @param number_of_plots number of sub plots to create (at least 1)
        @param figure default draw figure which is used
        """
        if number_of_plots < 1:
            raise ValueError("Multiplot needs at least one plot, got " + str(number_of_plots))

        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Up to three plots share a single row; more plots are wrapped into
        # rows of three
        columns = min(number_of_plots, 3)
        rows = int(numpy.ceil(number_of_plots / 3))
        gs = matplotlib.gridspec.GridSpec(rows, columns)

        #: the plotters drawn into the individual grid cells
        self.sub_plots = [cls(self.figure, self.figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
        self.axis = self.sub_plots[0].axis
        super(Multiplot, self).__init__(self.figure, self.axis)

    def add(self, i, *args, **kwargs):
        """
        Call add function of ith subplot
        @param i position of the subplot
        @return self, so that calls can be chained like for the other plotters
        """
        self.sub_plots[i].add(*args, **kwargs)
        return self

    def finish(self):
        """
        Calls finish on every sub plot
        """
        for plot in self.sub_plots:
            plot.finish()
        return self
498 
499 
class Diagonal(Plotter):
    """
    Draws the purity of each bin against the classifier output.
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
        """
        Draw the purity per bin of one classifier column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

        centers = hists.bin_centers
        left = min(centers.min(), self.xmin)
        right = max(centers.max(), self.xmax)
        self.xmin, self.xmax = left, right
        # The y range is fixed to [0, 1] instead of following the data
        self.ymin, self.ymax = 0, 1

        half_widths = hists.bin_widths / 2.0
        artists = self._plot_datapoints(self.axis, hists.bin_centers, purity, xerr=half_widths, yerr=purity_error)
        self.plots.append(artists)
        self.labels.append(column)
        return self

    def finish(self):
        """
        Draw the reference diagonal and apply limits, title, axis labels and legend
        """
        axis = self.axis
        self.scale_limits()
        axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Diagonal Plot")
        axis.get_xaxis().set_label_text('Classifier Output')
        axis.get_yaxis().set_label_text('Purity Per Bin')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
543 
544 
class Distribution(Plotter):
    """
    Plots distribution of a quantity
    """

    def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
                 keep_first_binning=False, range_in_std=None, logScale=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed_to_all_entries true if the histogram should be divided by the sum of its entries
        @param normed_to_bin_width true if the histogram should be divided by the bin widths
        @param keep_first_binning use the binning of the first distribution for further plots
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        @param logScale true if the y axis should be drawn with a logarithmic scale
        """
        super(Distribution, self).__init__(figure, axis)

        #: divide the histogram by the sum of its entries
        self.normed_to_all_entries = normed_to_all_entries
        #: divide the histogram by the bin widths
        self.normed_to_bin_width = normed_to_bin_width
        #: passed on to the histogram to restrict the shown range
        self.range_in_std = range_in_std

        # The limits start "empty" so that the first added histogram defines them
        self.ymin = float(0)
        self.ymax = float('-inf')
        self.xmin = float('inf')
        self.xmax = float('-inf')

        #: reuse the binning of the first distribution for all further ones
        self.keep_first_binning = keep_first_binning
        #: binning of the first distribution (set by add if keep_first_binning is true)
        self.first_binning = None
        #: label of the x axis, set to the column name by add
        self.x_axis_label = ''
        #: draw the y axis with a logarithmic scale
        #: (this used to be hard-coded to False, which ignored the argument)
        self.logScale = logScale
        #: width of a bin of the last added histogram, shown in the y-axis label
        self.binWidth = 0.02

    def add(self, data, column, mask=None, weight_column=None, label=None, bins=50):
        """
        Add a new distribution to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @param label legend entry of the distribution, the column name is used if None
        @param bins binning passed to the histogram (ignored once a first binning is kept)
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')

        if self.keep_first_binning and self.first_binning is not None:
            bins = self.first_binning
        hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                     bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
        if self.keep_first_binning and self.first_binning is None:
            self.first_binning = hists.bins
        hist, hist_error = hists.get_hist('Total')
        # Use the second bin as before, but fall back to the only bin when
        # the histogram has a single bin
        self.binWidth = hists.bin_widths[min(1, len(hists.bin_widths) - 1)]

        if self.normed_to_all_entries:
            normalization = float(numpy.sum(hist))
            hist = hist / normalization
            hist_error = hist_error / normalization

        if self.normed_to_bin_width:
            hist = hist / hists.bin_widths
            hist_error = hist_error / hists.bin_widths

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin, self.ymax = numpy.nanmin([hist.min(), self.ymin]), numpy.nanmax([(hist + hist_error).max(), self.ymax])

        p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
        self.plots.append(p)
        self.x_axis_label = column
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))

        if self.logScale:
            self.axis.set_yscale('log', nonposy='clip')
        else:
            self.axis.set_ylim((self.ymin, self.ymax))
        # Format into a local variable so that self.binWidth stays numeric
        bin_width_text = '{:8.2f}'.format(self.binWidth)

        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        if self.normed_to_all_entries and self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / (# Entries * Bin Width)')
        elif self.normed_to_all_entries:
            self.axis.get_yaxis().set_label_text(
                r'{$\frac{\rm Entries\hspace{0.25em} per\hspace{0.25em} Bin}{\rm Entries}\, /\, (' +
                bin_width_text + r'\,)$}', fontsize=65)
            self.axis.get_yaxis().labelpad = 20
            self.axis.get_yaxis().set_tick_params(labelsize=60)
        elif self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / Bin Width')
        else:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin')
        return self
659 
660 
class Box(Plotter):
    """
    Draws a horizontal boxplot of a quantity
    """

    def __init__(self, figure=None, axis=None):
        """
        Set up figure and axis via the base class and start with an empty x-axis label
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)

        #: label of the x axis, set to the column name by add
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Draw the boxplot of one column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot
        @param weight_column column in data containing the weights for each event (currently ignored)
        """
        if mask is None:
            mask = numpy.ones(len(data), dtype=bool)
        values = data[column][mask]

        if weight_column is not None:
            B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(values) == 0:
            B2WARNING("Ignore empty boxplot.")
            return self

        box_style = {'facecolor': 'blue', 'alpha': 0.5}
        artists = self.axis.boxplot(values, sym='k.', whis=1.5, vert=False, patch_artist=True,
                                    showmeans=True, widths=1, boxprops=box_style)
        self.plots.append(artists)
        self.labels.append(column)
        self.x_axis_label = column

        return self

    def finish(self):
        """
        Hide the y axis and apply the x-axis label and title
        """
        axis = self.axis
        matplotlib.artist.setp(axis.get_yaxis(), visible=False)
        axis.get_xaxis().set_label_text(self.x_axis_label)
        axis.set_title("Box Plot")
        return self
726 
727 
class Difference(Plotter):
    """
    Plots the difference between two histograms
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
        """
        super(Difference, self).__init__(figure, axis)
        #: normalize minuend and subtrahend before comparing them
        self.normed = normed
        #: subtract the mean difference from the difference
        self.shift_to_zero = shift_to_zero
        #: label of the x axis, set to the column name by add
        #: (initialised here so that finish works before the first add)
        self.x_axis_label = ''
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
        """
        Add a new difference plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        """
        bins = 50
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        minuend, minuend_error = hists.get_hist('Minuend')
        subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

        difference_error = histogram.poisson_error(minuend + subtrahend)
        if self.normed:
            difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        difference = minuend - subtrahend

        if self.shift_to_zero:
            difference = difference - numpy.mean(difference)

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin = min((difference - difference_error).min(), self.ymin)
        self.ymax = max((difference + difference_error).max(), self.ymax)

        p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
        self.plots.append(p)
        # Fall back to the column name when no label is given (the two
        # branches used to be swapped, which stored None in that case)
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param line_color color of the horizontal reference line at zero
        """
        self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4)
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((self.ymin, self.ymax))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'{\rm Difference}', fontsize=40, labelpad=20)
        self.axis.get_xaxis().grid(True)
        return self
817 
818 
class normalizedResiduals(Plotter):
    """
    Plots the normalized residuals between two histograms, i.e. their
    difference divided by the error of the difference
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero stored for interface compatibility; it has no effect on the plotted residuals
        """
        super(normalizedResiduals, self).__init__(figure, axis)
        #: normalize minuend and subtrahend before comparing them
        self.normed = normed
        #: kept for interface compatibility, not used for the residuals
        self.shift_to_zero = shift_to_zero
        #: label of the x axis, set to the column name by add
        #: (initialised here so that finish works before the first add)
        self.x_axis_label = ''
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None, bins=50, isNN=False):
        """
        Add a new normalized residuals plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        @param bins binning passed to the histogram
        @param isNN if true the lower x limit is set to -1
        """
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        minuend, minuend_error = hists.get_hist('Minuend')
        subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

        # Report the bin widths through the framework logger instead of a bare print
        B2INFO("Here BinWidths Norm " + str(hists.bin_widths))
        difference_error = histogram.poisson_error(minuend + subtrahend)

        if self.normed:
            # Errors of the two normalized histograms added in quadrature
            difference_error = numpy.sqrt((minuend_error / numpy.sum(minuend))**2 + (subtrahend_error / numpy.sum(subtrahend))**2)
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        normalizedRes = (minuend - subtrahend) / difference_error

        # The x limits are not derived from the data; only the lower limit is
        # overridden when isNN is set
        if isNN:
            self.xmin = float(-1.0)

        # Residuals are expressed in units of their error, hence yerr=1
        p = self._plot_datapoints(self.axis, hists.bin_centers, normalizedRes, xerr=hists.bin_widths / 2, yerr=1)
        self.plots.append(p)
        # Fall back to the column name when no label is given (the two
        # branches used to be swapped, which stored None in that case)
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels and the reference lines of the plot
        @param line_color accepted for signature compatibility, not used by this plotter
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((-5, 5))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'${\rm Normalized}$' + '\n' + r'${\rm Residuals}$', fontsize=40, labelpad=20)
        self.axis.get_yaxis().set_ticks([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5])
        self.axis.get_yaxis().set_ticklabels([r'', r'$-4$', r'', r'$-2$', r'', r'$0$', r'', r'$2$', r'', r'$4$', r''], fontsize=45)
        self.axis.get_xaxis().grid(True)
        # Horizontal reference lines at +-1 (blue) and +-3 (green)
        self.axis.plot((self.xmin, self.xmax), (3, 3), linewidth=4, color='#006600', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (1, 1), linewidth=4, color='b', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (-1, -1), linewidth=4, color='b', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (-3, -3), linewidth=4, color='#006600', linestyle='-')

        return self
925 
926 
class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training.

    The upper panel shows the distributions of the test and training samples for background and
    signal, the two lower panels show the normalized residuals (training - test) of the background
    and of the signal.
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the distributions
    axis = None
    #: Axis below the main axis which shows the normalized residuals of the background
    axis_d1 = None
    #: Lowest axis which shows the normalized residuals of the signal
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figure = figure
        else:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)

        # Five rows: three for the distributions, one for each residual panel
        grid = matplotlib.gridspec.GridSpec(5, 1)
        self.axis = self.figure.add_subplot(grid[:3, :])
        self.axis_d1 = self.figure.add_subplot(grid[3, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(grid[4, :], sharex=self.axis)

        super(Overtraining, self).__init__(self.figure, self.axis)

    def _add_residuals(self, axis, color, data, column, minuend_mask, subtrahend_mask, weight_column, bins, isNN):
        """
        Draws one normalized residuals panel
        @param axis axis the residuals are drawn on
        @param color color which is written into self.plot_kwargs and passed to finish
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining the events of the minuend histogram
        @param subtrahend_mask boolean numpy.array defining the events of the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param bins bin edges which are used for the residuals
        @param isNN passed on to normalizedResiduals.add
        """
        self.plot_kwargs['color'] = color
        residuals = normalizedResiduals(self.figure, axis, shift_to_zero=True, normed=True)
        residuals.set_plot_options(self.plot_kwargs)
        residuals.set_errorbar_options(self.errorbar_kwargs)
        residuals.set_errorband_options(self.errorband_kwargs)
        residuals.add(data, column, minuend_mask, subtrahend_mask, weight_column, None, bins, isNN)
        axis.set_xlim((residuals.xmin, residuals.xmax))
        axis.set_ylim((residuals.ymin, residuals.ymax))
        residuals.plots = residuals.labels = []
        residuals.finish(line_color=color)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None, bkgrOutput=0, isNN=False):
        """
        Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
        otherwise there are too many curves in the plot to recognize anything in the plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param bkgrOutput if 0 the legend uses signal/background labels and the distribution is drawn
               in log scale, otherwise B0/anti-B0 labels are used
        @param isNN if True the bin edges are divided by 25 instead of 50, also passed on to the residuals
        @return self
        """
        distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

        # Bin edges are the integers -51 ... 54 divided by 25 (NN) or by 50 (otherwise)
        divisor = 25 if isNN else 50
        bins = [float(edge) / divisor for edge in range(-51, 55)]

        if bkgrOutput == 0:
            distribution.logScale = True
            distribution.labels = [r'{\rm Test-Bkgr.}', r'{\rm Train-Bkgr.}', r'{\rm Test-Signal}', r'{\rm Train-Signal}']
        else:
            distribution.labels = [
                r'{\rm Test-$\bar{B}^{0}$}',
                r'{\rm Train-$\bar{B}^{0}$}',
                r'{\rm Test-$B^{0}$}',
                r'{\rm Train-$B^{0}$}']

        def color_of(index):
            # Color of the plot which was added to the distribution at the given position
            return distribution.plots[index][0][0].get_color()

        test_bckgrd = test_mask & bckgrd_mask
        test_signal = test_mask & signal_mask
        train_bckgrd = train_mask & bckgrd_mask
        train_signal = train_mask & signal_mask

        # The test samples are drawn first, as markers with error bars
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options({'fmt': 'o', 'elinewidth': 5, 'alpha': 1, 'markersize': 20, 'ecolor': 'w'})
        distribution.set_errorband_options(None)
        distribution.add(data, column, test_bckgrd, weight_column, None, bins)
        distribution.add(data, column, test_signal, weight_column, None, bins)

        # The training samples follow as step lines in the color of the corresponding test sample
        distribution.set_errorbar_options(None)
        distribution.set_plot_options({'color': color_of(0), 'drawstyle': 'steps-mid', 'linestyle': 'dashed', 'lw': 5})
        distribution.set_fill_options(None)
        distribution.add(data, column, train_bckgrd, weight_column, None, bins)
        distribution.set_plot_options({'color': color_of(1), 'drawstyle': 'steps-mid', 'linestyle': 'solid', 'lw': 5})
        distribution.add(data, column, train_signal, weight_column, None, bins)

        distribution.finish()

        # Empty artists which only serve as handles for the legend
        marker_style = dict(xerr=0, yerr=0, elinewidth=5, mew=2, ecolor='w', fmt='o', markersize=20)
        p1 = distribution.axis.errorbar([], [], mfc=color_of(0), mec=color_of(0),
                                        label=r'${\rm Test-Bkgr.}$', **marker_style)
        p2, = distribution.axis.plot([], label=r'${\rm Train-Bkgr.}$', linewidth=5,
                                     linestyle='dashed', c=color_of(0))
        p3 = distribution.axis.errorbar([], [], mfc=color_of(1), mec=color_of(1),
                                        label=r'${\rm Test-Signal}$', **marker_style)
        p4, = distribution.axis.plot([], label=r'${\rm Train-Signal}$', linewidth=5,
                                     linestyle='solid', alpha=0.9, c=color_of(1))

        distribution.axis.legend([p1, p2, p3, p4], distribution.labels, loc='best', fancybox=True, framealpha=0.5, fontsize=60)

        # Residuals (training - test), background first, then signal
        self._add_residuals(self.axis_d1, color_of(0), data, column, train_bckgrd, test_bckgrd, weight_column, bins, isNN)
        self._add_residuals(self.axis_d2, color_of(1), data, column, train_signal, test_signal, weight_column, bins, isNN)

        # No Kolmogorov-Smirnov p-value is written onto the residual panels; the
        # scipy.stats.ks_2samp based code for that was commented out in the original source.

        return self

    def finish(self, xLabel=r'${\rm Classifier\ Output}$'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param xLabel label of the x axis of the lowest panel
        @return self
        """
        for residual_axis in (self.axis_d1, self.axis_d2):
            residual_axis.set_title("")

        # Only the lowest panel keeps its x tick labels and its x axis label
        for upper_axis in (self.axis, self.axis_d1):
            matplotlib.artist.setp(upper_axis.get_xticklabels(), visible=False)
        for upper_axis in (self.axis, self.axis_d1):
            upper_axis.get_xaxis().set_label_text('')

        lowest_x_axis = self.axis_d2.get_xaxis()
        lowest_x_axis.set_label_text(xLabel, fontsize=85)
        lowest_x_axis.labelpad = 20
        lowest_x_axis.set_tick_params(labelsize=60)
        return self
1094 
1095 
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes for the boxplots
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super(VerboseDistribution, self).__init__(figure, axis)

        #: Normalize the histograms before drawing
        self.normed = normed
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        self.box_axes = []

        #: The distribution plot which is drawn on the main axis
        self.distribution = Distribution(self.figure, self.axis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label which is passed on to the distribution plot
        @return self
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # With n boxplots the grid has 4 * n rows: 3 * n for the distribution, one for each boxplot
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            # Without an explicit mask all events are used; `None & ...` would fail below
            if mask is None:
                mask = numpy.ones(len(data), dtype=bool)
            values = data[column]
            mean, std = histogram.weighted_mean_and_std(values, None if weight_column is None else data[weight_column])
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            half_window = self.range_in_std * std
            mask = mask & (values > (mean - half_window)) & (values < (mean + half_window))
        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if len(box.plots) > 0:
            # The box gets the color of the distribution which was just added
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # Only the lowest boxplot keeps its x tick labels and its x axis label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        # Nothing to reset if add was never called
        if self.box_axes:
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1169 
1170 
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the events selected by signal_mask or bckgrd_mask
    axis = None
    #: Axis which shows the events selected by signal_mask
    axis_d1 = None
    #: Axis which shows the events selected by bckgrd_mask
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        gs = matplotlib.gridspec.GridSpec(3, 2)
        self.axis = self.figure.add_subplot(gs[0, :])
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super(Correlation, self).__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles not used at the moment: the cuts are always placed at the
               5, 10, ..., 95 percent quantiles of cut_column
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @return self

        Both masks are combined with | and used for indexing without a check for None.
        """
        if len(data[cut_column]) == 0:
            B2WARNING("Ignore empty Correlation.")
            return self

        colormap = plt.get_cmap('coolwarm')
        panels = [
            (self.axis, '.', signal_mask | bckgrd_mask),
            (self.axis_d1, 'S', signal_mask),
            (self.axis_d2, 'B', bckgrd_mask),
        ]

        for axis, tag, mask in panels:
            values = data[column][mask]
            cut_values = data[cut_column][mask]

            if weight_column is not None:
                weights = numpy.array(data[weight_column][mask])
            else:
                weights = numpy.ones(len(values))

            # The cast to float32 is a workaround for the following numpy issue:
            # https://github.com/numpy/numpy/issues/8123
            value_range = np.percentile(values, [5, 95]).astype(np.float32)

            # density=True replaces the removed normed=True keyword, for the equal-width bins
            # used here both give the same normalization
            previous, edges = np.histogram(values, bins=100,
                                           range=value_range, density=True, weights=weights)
            bin_center = (edges[1:] + edges[:-1]) / 2
            axis.plot(bin_center, previous, color='black', lw=1)

            # Fill the area between the distributions of two successive, tighter cuts on cut_column
            for quantil in np.arange(5, 100, 5):
                cut = np.percentile(cut_values, quantil)
                sel = cut_values >= cut
                current, edges = np.histogram(values[sel], bins=100,
                                              range=value_range, density=True, weights=weights[sel])
                bin_center = (edges[1:] + edges[:-1]) / 2
                axis.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
                previous = current

            axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            axis.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        return self
1254 
1255 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot: the given columns are embedded into two dimensions and every
        mask is drawn as its own scatter plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input of the TSNE algorithm
        @param masks different classes to show in TSNE
        @return self
        """
        # Only the import is guarded, so that an ImportError raised elsewhere is not swallowed
        try:
            import sklearn.manifold
        except ImportError:
            print("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE provides no transform method for separate samples, so all events
        # are embedded once and the masks select rows of the embedding afterwards
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = numpy.asarray(mask, dtype=bool)
            self.axis.scatter(embedding[selected, 0], embedding[selected, 1])
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        return self
1287 
1288 
class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables, displayHeatMap):
        """
        Add a new importance heat map. Every column is rescaled to the range 0 to 100 and
        drawn sorted in ascending order.
        @param data pandas.DataFrame containing all data
        @param columns columns of data which are rescaled and drawn
        @param variables names which are paired with the rows of the matrix and used as y tick labels
        @param displayHeatMap if true a colorbar is drawn next to the heat map
        @return self
        """
        self.figure.set_tight_layout(True)

        def norm(x):
            # Rescale x linearly to the range [0, 100], a constant input gives zeros
            lowest = numpy.min(x)
            width = numpy.max(x) - lowest
            if width <= 0:
                return numpy.zeros(x.shape)
            return (x - lowest) / width * 100

        importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
        n_rows, n_columns = importance_matrix.shape

        new_RdBu = truncate_colormap(plt.get_cmap('RdBu'), 0.5, 0.85)

        # Pairs of [value, name], sorted by value; numpy turns both entries into strings
        row_names = list(variables)
        value_name_pairs = [[importance_matrix[row, col], row_names[row]]
                            for row in range(n_rows)
                            for col in range(n_columns)]
        labelsValues = np.array(sorted(value_name_pairs))

        sorted_matrix = np.array(np.sort(importance_matrix, axis=0))
        importance_heatmap = self.axis.pcolor(sorted_matrix, cmap=new_RdBu, vmin=0, vmax=100)

        # put the major ticks at the middle of each cell
        self.axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
        self.axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

        self.axis.set_xticklabels(columns, minor=False, rotation=90)

        # Few entries are drawn with larger fonts
        if labelsValues.shape[0] < 6:
            CoeffSize = 50
            self.axis.set_yticklabels(labelsValues[:, 1], minor=False, size=58)
        else:
            CoeffSize = 33
            self.axis.set_yticklabels(labelsValues[:, 1], minor=False)

        self.axis.set_xticklabels([''])

        # All values are written into the last column of the matrix, one below the other
        last_column = n_columns - 1
        for row in range(labelsValues.shape[0]):
            self.axis.text(last_column + 0.5, row + 0.5, r'$%.0f$' % float(labelsValues[row][0]),
                           size=CoeffSize,
                           horizontalalignment='center',
                           verticalalignment='center')

        if displayHeatMap:
            cb = self.figure.colorbar(importance_heatmap, ticks=[2, 98], orientation='vertical')
            cb.ax.tick_params(length=0)
            cb.ax.set_yticklabels([r'${\rm low}$', r'${\rm high}$'], size=60)

        self.axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        return self
1372 
1373 
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """
    Creates a new colormap from the colors of cmap between minval and maxval
    @param cmap colormap the colors are taken from
    @param minval lower end of the interval of cmap which is sampled
    @param maxval upper end of the interval of cmap which is sampled
    @param n number of equidistant points at which cmap is sampled
    @return matplotlib.colors.LinearSegmentedColormap built from the sampled colors
    """
    truncated_name = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
    sampled_colors = cmap(np.linspace(minval, maxval, n))
    return matplotlib.colors.LinearSegmentedColormap.from_list(truncated_name, sampled_colors)
1379 
1380 
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the correlation of the signal samples
    signal_axis = None
    #: Axis which shows the correlation of the background samples
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(38, 24))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        gs = matplotlib.gridspec.GridSpec(16, 2)
        self.signal_axis = self.figure.add_subplot(gs[:14, 0])
        self.bckgrd_axis = self.figure.add_subplot(gs[:14, 1], sharey=self.signal_axis)
        #: Axis at the bottom of the figure which holds the colorbar
        self.colorbar_axis = self.figure.add_subplot(gs[15, :])
        self.axis = self.signal_axis

        super(CorrelationMatrix, self).__init__(self.figure, self.axis)

    @staticmethod
    def _clear_negative_zero(matrix):
        """
        Sets every entry which the '%.0f' format would print as '-0' to 0 (in place)
        @param matrix two dimensional numpy.array of coefficients
        """
        for index, value in numpy.ndenumerate(matrix):
            if '%.0f' % value == '-0':
                matrix[index] = 0

    @staticmethod
    def _write_coefficients(axis, matrix, coeff_size):
        """
        Writes every coefficient of the matrix into the center of its cell
        @param axis axis the text is drawn on
        @param matrix two dimensional numpy.array of coefficients
        @param coeff_size font size of the coefficients
        """
        # Matrices with more than 24 rows get a smaller font and a text-mode minus sign
        many_variables = matrix.shape[0] > 24
        for (y, x), value in numpy.ndenumerate(matrix):
            if many_variables and value < 0:
                axis.text(x + 0.5, y + 0.5, '-' + r'$%.0f$' % abs(value),
                          size=25,
                          horizontalalignment='center',
                          verticalalignment='center')
            else:
                axis.text(x + 0.5, y + 0.5, r'$%.0f$' % value,
                          size=coeff_size,
                          horizontalalignment='center',
                          verticalalignment='center')

    def add(self, data, columns, signal_mask, bckgrd_mask, bkgrOutput):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param bkgrOutput if -1 the panels are labelled as B0 and anti-B0, otherwise as signal and background
        @return self
        """
        signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
        bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

        # The matrices are mirrored left-right, the x tick labels are reversed accordingly below
        mirrored_signal_corr = signal_corr[:, ::-1].copy()
        mirrored_bckgrd_corr = bckgrd_corr[:, ::-1].copy()

        cRdBu = plt.get_cmap('RdBu')
        new_RdBu = truncate_colormap(cRdBu, 0.15, 0.85)
        signal_heatmap = self.signal_axis.pcolor(mirrored_signal_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)
        # NOTE(review): only the signal panel gets a pcolor heat map. The equivalent call for
        # bckgrd_axis is commented out in the original source, so the background panel shows the
        # numbers only - confirm whether that is intended.

        self._clear_negative_zero(mirrored_signal_corr)
        self._clear_negative_zero(mirrored_bckgrd_corr)

        self.signal_axis.invert_yaxis()
        self.signal_axis.xaxis.tick_top()
        self.bckgrd_axis.invert_yaxis()
        self.bckgrd_axis.xaxis.tick_top()

        # put the major ticks at the middle of each cell
        self.signal_axis.set_xticks(numpy.arange(mirrored_signal_corr.shape[0]) + 0.5, minor=False)
        self.signal_axis.set_yticks(numpy.arange(mirrored_signal_corr.shape[1]) + 0.5, minor=False)
        self.bckgrd_axis.set_xticks(numpy.arange(mirrored_bckgrd_corr.shape[0]) + 0.5, minor=False)
        self.bckgrd_axis.set_yticks(numpy.arange(mirrored_bckgrd_corr.shape[1]) + 0.5, minor=False)

        # Small matrices are drawn with larger fonts
        few_variables = mirrored_signal_corr.shape[0] < 8
        CoeffSize = 50 if few_variables else 30
        label_options = {'size': 58} if few_variables else {}
        reversed_columns = list(reversed(columns))
        for axis in (self.bckgrd_axis, self.signal_axis):
            axis.set_xticklabels(reversed_columns, minor=False, rotation=90, **label_options)
            axis.set_yticklabels(columns, minor=False, **label_options)

        # Every panel gets its own coefficients; the original wrote the negative background
        # coefficients of large matrices onto the signal panel
        self._write_coefficients(self.signal_axis, mirrored_signal_corr, CoeffSize)
        self._write_coefficients(self.bckgrd_axis, mirrored_bckgrd_corr, CoeffSize)

        cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-92.3, 0, 92.5], orientation='horizontal')
        cb.ax.tick_params(length=0)
        cb.ax.set_xticklabels([r'${\rm negative}$', r'${\rm uncorrelated}$', r'${\rm positive}$'], fontsize=60)

        if bkgrOutput == -1:
            self.figure.text(0.30, 0.11, r'$B^0\,(q_{\rm MC} = +1)$', horizontalalignment='center', size=65)
            self.figure.text(0.74, 0.11, r'$\bar{B}^0\,(q_{\rm MC} = -1)$', horizontalalignment='center', size=65)

        else:
            self.figure.text(0.27, 0.115, r'${\rm Signal}$', horizontalalignment='center', size=65)
            self.figure.text(0.73, 0.115, r'${\rm Background}$', horizontalalignment='center', size=65)

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        # The y axis is shared, so its tick labels are only shown on the signal panel
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1529 
1530 
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last one holds the 0/1 flag of the two halves
        @return pandas.DataFrame with the rows in random order
        """
        # Integer division: the value is used as an array size
        half = N // 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(half, n))
        xb = numpy.random.normal(1, size=(half, n))
        ys = numpy.zeros(half)
        yb = numpy.ones(half)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # The first half of the (shuffled) rows is used for training, the second half for testing
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation.add combines both masks, so the signal and the background mask are required
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    # CorrelationMatrix.add has no defaults for the masks and bkgrOutput
    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          is_signal, is_bckgrd, 0)
    p.finish()
    p.save('correlation_matrix.png')
1617 
1618 # @endcond
def calculate_flatness(f, p, w=None)
def weighted_mean_and_std(x, w)
Definition: histogram.py:32
def poisson_error(n_tot)
Definition: histogram.py:25
Definition: plot.py:1