Belle II Software  release-05-01-25
ftPlotting.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # @cond SUPPRESS_DOXYGEN
5 
6 # Thomas Keck 2015
7 
8 import copy
9 import math
10 
11 import numpy
12 import numpy as np
13 import matplotlib
14 # Do not use the standard backend TkAgg, because it is NOT thread-safe.
15 # Otherwise you will get a "RuntimeError: main thread is not in main loop"!
16 matplotlib.use("svg")
17 matplotlib.rcParams.update({'font.size': 40})
18 matplotlib.rcParams['text.usetex'] = True
19 matplotlib.rcParams['text.latex.preamble'] = [r"\usepackage{amsmath}"]
20 import matplotlib.pyplot as plt
21 import matplotlib.artist
22 import matplotlib.figure
23 import matplotlib.gridspec
24 import matplotlib.colors
25 import matplotlib.patches
26 import matplotlib.ticker
27 
28 import basf2_mva_evaluation.histogram as histogram
29 
30 from basf2 import B2INFO, B2WARNING
31 
32 import basf2_mva_util
33 
34 
class Plotter(object):
    """
    Base class for all Plotters.

    Holds the matplotlib figure and axis that are drawn on, the handles and legend labels of
    everything added so far, the axis limits, and the keyword arguments used to draw data points.
    """

    #: handles of the plots added so far; replaced by a fresh list in __init__
    plots = None
    #: legend labels, one per entry in plots; replaced by a fresh list in __init__
    labels = None
    #: lower limit of the x axis
    xmin = None
    #: upper limit of the x axis
    xmax = None
    #: lower limit of the y axis
    ymin = None
    #: upper limit of the y axis
    ymax = None
    #: relative margin applied to the y limits by scale_limits
    yscale = 0.0
    #: relative margin applied to the x limits by scale_limits
    xscale = 0.0
    #: figure which is used to draw
    figure = None
    #: main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        #: relative margin applied to the y limits by scale_limits
        self.yscale = 0.1
        #: relative margin applied to the x limits by scale_limits
        self.xscale = 0.0

        #: keyword arguments for axis.plot; None disables the marker plot
        self.plot_kwargs = None
        #: keyword arguments for axis.errorbar; None disables the error bars
        self.errorbar_kwargs = None
        #: keyword arguments for the error band; None disables the error band
        self.errorband_kwargs = None
        #: keyword arguments for the fill below the data points; None disables the fill
        self.fill_kwargs = None

        # Install the default options declared in the setter signatures
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        return self.figure.add_subplot(gridspecs[-1], sharex=self.axis)

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        B2INFO("Save figure for class " + str(type(self)))
        # Imported here so the Agg canvas is only loaded when a figure is actually written
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults of the setters below are never mutated, because only a copy is stored.
    # They cannot be replaced by None, since None is a meaningful value (it disables the feature).
    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        @return self
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the area between the data points and zero
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis on which the datapoints are drawn
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (handles, p, e, f): all legend handles, the plot line, the errorbar container
                and the fill_between collection; entries which were not drawn are None
        """
        p = e = f = None
        # Work on copies, so the defaults stored on the instance are never modified
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            # No explicit color: take the next one from the color cycle of the axis
            color = next(axis._get_lines.prop_cycler)['color']
            # plot_kwargs is None if the marker plot is disabled, then there is nothing to update
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        # Proxy patch which always exists, so the legend has a handle even if nothing else is drawn
        patch = matplotlib.patches.Patch(color=color, alpha=0.7)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the marker color
                errorbar_kwargs['ecolor'] = [0.4 * channel for channel in color]
            # The line width of the error bars is always forced to 5, whatever was configured
            errorbar_kwargs['elinewidth'] = 5
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # One rectangle per data point spanning the x and y uncertainties
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        The base class does not draw anything and returns NotImplemented
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        The base class does nothing and returns NotImplemented
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        Every limit is moved away from the plotted range by the relative amount xscale / yscale,
        copysign takes care that this also works for negative limits
        @return self
        """
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
235 
236 
class PurityAndEfficiencyOverCut(Plotter):
    """
    Draws two curves over the cut value (for cut choosing): efficiency and purity,
    or the true and false positives if the curves are not normed
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a pair of curves to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed if True efficiency and purity are drawn, otherwise true and false positives
        @return self
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        if normed:
            first_label, second_label = "Efficiency", "Purity"
            first, first_error = hists.get_efficiency(['Signal'])
            second, second_error = hists.get_purity(['Signal'], ['Background'])
        else:
            first_label, second_label = "True positive", "False positive"
            first, first_error = hists.get_true_positives(['Signal'])
            second, second_error = hists.get_false_positives(['Background'])

        cuts = hists.bin_centers

        # Grow the axis ranges so they contain the new curves, NaN entries are ignored
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        self.ymin = numpy.nanmin([numpy.nanmin(first), numpy.nanmin(second), self.ymin])
        self.ymax = numpy.nanmax([numpy.nanmax(first), numpy.nanmax(second), self.ymax])

        curves = ((first, first_error, first_label), (second, second_error, second_label))
        for values, errors, legend_entry in curves:
            self.plots.append(self._plot_datapoints(self.axis, cuts, values, xerr=0, yerr=errors))
            self.labels.append(legend_entry)

        return self

    def finish(self):
        """
        Applies the collected limits and sets title, x-axis label and legend
        @return self
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Classification Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
297 
298 
class SignalToNoiseOverCut(Plotter):
    """
    Draws the signal to noise ratio as a function of the cut value (for cut choosing)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a new signal to noise curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate signal to noise ratio for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed accepted for a signature parallel to the other plotters, not used here
        @return self
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        ratio, ratio_error = hists.get_signal_to_noise(['Signal'], ['Background'])
        cuts = hists.bin_centers

        # Grow the axis ranges so they contain the new curve, NaN entries are ignored
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        self.ymin = numpy.nanmin([numpy.nanmin(ratio), self.ymin])
        self.ymax = numpy.nanmax([numpy.nanmax(ratio), self.ymax])

        handle = self._plot_datapoints(self.axis, cuts, ratio, xerr=0, yerr=ratio_error)
        self.plots.append(handle)
        self.labels.append(column)

        return self

    def finish(self):
        """
        Applies the collected limits and sets title, x-axis label and legend
        @return self
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Signal to Noise Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
344 
345 
class PurityOverEfficiency(Plotter):
    """
    Draws the purity as a function of the efficiency (a ROC-like curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new purity over efficiency curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used
        @return self
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])

        # Grow the axis ranges so they contain the new curve
        self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
        self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
        self.ymin = numpy.nanmin([numpy.nanmin(purity), self.ymin])
        self.ymax = numpy.nanmax([numpy.nanmax(purity), self.ymax])

        handle = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
        self.plots.append(handle)
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Applies the collected limits and sets title, axis labels and legend
        @return self
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("ROC Purity Plot")
        axis.get_xaxis().set_label_text('Efficiency')
        axis.get_yaxis().set_label_text('Purity')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
390 
391 
class RejectionOverEfficiency(Plotter):
    """
    Draws the background rejection as a function of the signal efficiency (ROC curve)
    and quotes the area under the curve in the legend
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and rejection for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label prefix for the legend entry, only its first 10 characters are used
        @return the area under the curve (unlike the other plotters, which return self)
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        bckgrd_efficiency, rejection_error = hists.get_efficiency(['Background'])
        # Rejection is the complement of the background efficiency
        rejection = 1 - bckgrd_efficiency

        # Grow the axis ranges so they contain the new curve
        self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
        self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
        self.ymin = numpy.nanmin([rejection.min(), self.ymin])
        self.ymax = numpy.nanmax([rejection.max(), self.ymax])

        # Trapezoidal integral; the absolute value makes it independent of the ordering of the points
        auc = numpy.abs(numpy.trapz(rejection, efficiency))

        handle = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
        self.plots.append(handle)

        auc_text = r"${:.2f}$".format(auc)
        if label is None:
            legend_entry = r"${\rm AUC}\ =\ $" + auc_text
        else:
            legend_entry = label[:10] + r"$\ {\rm AUC}\ =\ $" + auc_text
        self.labels.append(legend_entry)

        return auc

    def finish(self):
        """
        Applies the collected limits and sets grid, tick sizes, axis labels and legend (no title is set)
        @return self
        """
        axis = self.axis
        x_axis, y_axis = axis.get_xaxis(), axis.get_yaxis()

        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        x_axis.set_tick_params(labelsize=60)
        y_axis.set_tick_params(labelsize=60)
        axis.grid(True)
        x_axis.labelpad = 20
        y_axis.labelpad = 20
        x_axis.set_label_text(r'${\rm Signal\ Efficiency}$', fontsize=65)
        y_axis.set_label_text(r'${\rm Background\ Rejection}$', fontsize=65)
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, fancybox=True, framealpha=0.5, fontsize=60, loc=3)
        return self
445 
446 
class Multiplot(Plotter):
    """
    Arranges several plotters of the same class in a grid with up to three columns
    """

    #: figure which is used to draw
    figure = None
    #: main axis which is used to draw (the axis of the first sub plot)
    axis = None

    def __init__(self, cls, number_of_plots, figure=None):
        """
        Creates a new figure if None is given and one sub plot per requested plot
        @param cls plotter class which is instantiated for every sub plot
        @param number_of_plots number of sub plots to create
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Up to three plots share a single row, everything else is wrapped into rows of three
        single_row_shapes = {1: (1, 1), 2: (1, 2), 3: (1, 3)}
        if number_of_plots in single_row_shapes:
            rows, columns = single_row_shapes[number_of_plots]
        else:
            rows, columns = int(numpy.ceil(number_of_plots / 3)), 3
        gs = matplotlib.gridspec.GridSpec(rows, columns)

        #: the plotters drawn in the individual grid cells
        self.sub_plots = []
        for i in range(number_of_plots):
            row, column = divmod(i, 3)
            self.sub_plots.append(cls(self.figure, self.figure.add_subplot(gs[row, column])))
        self.axis = self.sub_plots[0].axis
        super().__init__(self.figure, self.axis)

    def add(self, i, *args, **kwargs):
        """
        Call add function of ith subplot, all further arguments are passed on
        @param i position of the subplot
        """
        target = self.sub_plots[i]
        target.add(*args, **kwargs)

    def finish(self):
        """
        Calls finish on every sub plot
        @return self
        """
        for sub_plot in self.sub_plots:
            sub_plot.finish()
        return self
495 
496 
class Diagonal(Plotter):
    """
    Draws the purity in each bin as a function of the classifier output
    together with the diagonal as reference.
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add a new curve to the Diagonal plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @return self
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

        centers = hists.bin_centers
        self.xmin = min(centers.min(), self.xmin)
        self.xmax = max(centers.max(), self.xmax)
        # The y range is fixed to [0, 1] and not derived from the data
        self.ymin, self.ymax = 0, 1

        half_widths = hists.bin_widths / 2.0
        handle = self._plot_datapoints(self.axis, centers, purity, xerr=half_widths, yerr=purity_error)
        self.plots.append(handle)
        self.labels.append(column)
        return self

    def finish(self):
        """
        Draws the reference diagonal, applies the scaled limits and sets title, axis labels and legend
        @return self
        """
        self.scale_limits()
        axis = self.axis
        axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Diagonal Plot")
        axis.get_xaxis().set_label_text('Classifier Output')
        axis.get_yaxis().set_label_text('Purity Per Bin')
        handles = [entry[0] for entry in self.plots]
        axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
540 
541 
class Distribution(Plotter):
    """
    Plots distribution of a quantity
    """

    def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
                 keep_first_binning=False, range_in_std=None, logScale=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed_to_all_entries true if the histogram should be divided by its total number of entries
        @param normed_to_bin_width true if the histogram should be divided by the bin widths
        @param keep_first_binning use the binning of the first distribution for further plots
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        @param logScale true if the y axis should be drawn in log scale
        """
        super(Distribution, self).__init__(figure, axis)
        #: normalize the histogram to the total number of entries
        self.normed_to_all_entries = normed_to_all_entries
        #: normalize the histogram to the bin width
        self.normed_to_bin_width = normed_to_bin_width
        #: range passed on to the histogram, in units of standard deviations
        self.range_in_std = range_in_std

        # The limits start "empty", so the first added distribution defines them
        self.ymin = float(0)
        self.ymax = float('-inf')
        self.xmin = float('inf')
        self.xmax = float('-inf')

        #: keep the binning of the first distribution for all further ones
        self.keep_first_binning = keep_first_binning
        #: binning of the first distribution, None until the first distribution was added
        self.first_binning = None
        #: label of the x axis, the column of the distribution added last
        self.x_axis_label = ''
        #: draw the y axis in log scale (the constructor argument was ignored before)
        self.logScale = logScale
        #: bin width of the distribution added last, quoted in the y-axis label
        self.binWidth = 0.02

    def add(self, data, column, mask=None, weight_column=None, label=None, bins=50):
        """
        Add a new distribution to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used
        @param bins binning passed on to the histogram, replaced by the first binning if keep_first_binning is set
        @return self
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')

        if self.keep_first_binning and self.first_binning is not None:
            bins = self.first_binning
        hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                     bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
        if self.keep_first_binning and self.first_binning is None:
            self.first_binning = hists.bins
        hist, hist_error = hists.get_hist('Total')
        # NOTE(review): takes the width of the second bin, so this presumably needs at least two bins
        self.binWidth = hists.bin_widths[1]

        if self.normed_to_all_entries:
            normalization = float(numpy.sum(hist))
            hist = hist / normalization
            hist_error = hist_error / normalization

        if self.normed_to_bin_width:
            hist = hist / hists.bin_widths
            hist_error = hist_error / hists.bin_widths

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin, self.ymax = numpy.nanmin([hist.min(), self.ymin]), numpy.nanmax([(hist + hist_error).max(), self.ymax])

        p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
        self.plots.append(p)
        self.x_axis_label = column
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Sets limits and axis-labels of the plot (neither title nor legend are drawn)
        @return self
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))

        if self.logScale:
            self.axis.set_yscale('log', nonposy='clip')
        else:
            self.axis.set_ylim((self.ymin, self.ymax))

        # Format into a local instead of overwriting self.binWidth with a string:
        # this keeps finish repeatable and makes the text available in every branch below
        bin_width_text = '{:8.2f}'.format(self.binWidth)

        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        y_axis = self.axis.get_yaxis()
        if self.normed_to_all_entries and self.normed_to_bin_width:
            y_axis.set_label_text(r'# Entries per Bin / (# Entries * Bin Width)')
        elif self.normed_to_all_entries:
            y_axis.set_label_text(
                r'{$\frac{\rm Entries\hspace{0.25em} per\hspace{0.25em} Bin}{\rm Entries}\, /\, (' +
                bin_width_text + r'\,)$}', fontsize=65)
            y_axis.labelpad = 20
            y_axis.set_tick_params(labelsize=60)
        elif self.normed_to_bin_width:
            y_axis.set_label_text(r'# Entries per Bin / Bin Width')
        else:
            y_axis.set_label_text(r'# Entries per Bin')
        return self
656 
657 
class Box(Plotter):
    """
    Draws a horizontal boxplot of a quantity
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super(Box, self).__init__(figure=figure, axis=axis)
        #: label of the x axis, the column of the boxplot added last
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot
        @param weight_column column in data containing the weights for each event (looked up, but not used)
        @return self
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')
        values = data[column][mask]
        if weight_column is not None:
            # The weights are still looked up, so a wrong column name fails as before
            _ = data[weight_column][mask]
            B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if not len(values):
            B2WARNING("Ignore empty boxplot.")
            return self

        box_style = {'facecolor': 'blue', 'alpha': 0.5}
        handle = self.axis.boxplot(values, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True,
                                   widths=1, boxprops=box_style)
        self.plots.append(handle)
        self.labels.append(column)
        self.x_axis_label = column

        return self

    def finish(self):
        """
        Hides the y axis and sets the x-axis label and the title
        @return self
        """
        axis = self.axis
        matplotlib.artist.setp(axis.get_yaxis(), visible=False)
        axis.get_xaxis().set_label_text(self.x_axis_label)
        axis.set_title("Box Plot")
        return self
723 
724 
class Difference(Plotter):
    """
    Plots the difference between two histograms
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
        """
        super(Difference, self).__init__(figure, axis)
        #: normalize minuend and subtrahend before comparing them
        self.normed = normed
        #: shift the mean difference to zero
        self.shift_to_zero = shift_to_zero
        # Minimal y range, which is only extended by the added differences
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1
        #: label of the x axis; initialized here, so finish also works before anything was added
        self.x_axis_label = ''

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
        """
        Add a new difference plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        @return self
        """
        bins = 50
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        # Only the bin contents are needed, the error is derived from their sum below
        minuend, _ = hists.get_hist('Minuend')
        subtrahend, _ = hists.get_hist('Subtrahend')

        difference_error = histogram.poisson_error(minuend + subtrahend)
        if self.normed:
            difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        difference = minuend - subtrahend

        if self.shift_to_zero:
            difference = difference - numpy.mean(difference)

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin = min((difference - difference_error).min(), self.ymin)
        self.ymax = max((difference + difference_error).max(), self.ymax)

        p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
        self.plots.append(p)
        # Fall back to the column name only if no label was given (this condition used to be inverted)
        self.labels.append(column if label is None else label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title and axis-labels of the plot and draws the zero line (no legend is drawn)
        @param line_color color of the horizontal line at zero
        @return self
        """
        # The zero line is drawn first, so it spans the unscaled x range
        self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4)
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((self.ymin, self.ymax))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'{\rm Difference}', fontsize=40, labelpad=20)
        self.axis.get_xaxis().grid(True)
        return self
814 
815 
816 class normalizedResiduals(Plotter):
817  """
818  Plots the difference between two histograms
819  """
820 
834 
835  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
836  """
837  Creates a new figure and axis if None is given, sets the default plot parameters
838  @param figure default draw figure which is used
839  @param axis default draw axis which is used
840  @param normed normalize minuend and subtrahend before comparing them
841  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
842  """
843  super(normalizedResiduals, self).__init__(figure, axis)
844  self.normed = normed
845  self.shift_to_zero = shift_to_zero
846  if self.normed:
847  self.ymin = -0.01
848  self.ymax = 0.01
849  else:
850  self.ymin = -1
851  self.ymax = 1
852 
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None, bins=50, isNN=False):
    """
    Add a new normalized-residuals plot, i.e. (minuend - subtrahend) / error for every bin
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
    @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
    @param weight_column column in data containing the weights for each event
    @param label label for the legend if None, the column name is used
    @param bins binning which is handed to histogram.Histograms
    @param isNN if True the lower x-limit of the plot is set to -1
    @return self
    """
    hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                 weight_column=weight_column, equal_frequency=False)
    minuend, minuend_error = hists.get_hist('Minuend')
    subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

    difference_error = histogram.poisson_error(minuend + subtrahend)

    if self.normed:
        # Normalize both histograms to unit sum and propagate their errors accordingly
        minuend_sum = numpy.sum(minuend)
        subtrahend_sum = numpy.sum(subtrahend)
        difference_error = numpy.sqrt((minuend_error / minuend_sum)**2 + (subtrahend_error / subtrahend_sum)**2)
        minuend = minuend / minuend_sum
        subtrahend = subtrahend / subtrahend_sum

    # NOTE(review): self.shift_to_zero is not applied to the residuals. The previous code only
    # shifted an intermediate 'difference' array which was never plotted, so that dead
    # computation (and a left-over debug print of the bin widths) has been removed.
    normalizedRes = (minuend - subtrahend) / difference_error

    if isNN:
        self.xmin = -1.0

    # The residuals are already in units of their error, hence the constant yerr of 1
    p = self._plot_datapoints(self.axis, hists.bin_centers, normalizedRes, xerr=hists.bin_widths / 2, yerr=1)
    self.plots.append(p)
    # Fall back to the column name if no explicit label was given (this branch used to be inverted)
    self.labels.append(column if label is None else label)
    self.x_axis_label = column
    return self
898 
def finish(self, line_color='black'):
    """
    Sets limits, title, axis-labels, ticks and the horizontal reference lines of the plot
    @param line_color kept in the signature, it is currently not used by this method
    @return self
    """
    self.scale_limits()

    axis = self.axis
    x_axis = axis.get_xaxis()
    y_axis = axis.get_yaxis()

    axis.set_xlim((self.xmin, self.xmax))
    axis.set_ylim((-5, 5))
    axis.set_title("Difference Plot")
    y_axis.set_major_locator(matplotlib.ticker.MaxNLocator(5))
    x_axis.set_label_text(self.x_axis_label)
    axis.set_ylabel(r'${\rm Normalized}$' + '\n' + r'${\rm Residuals}$', fontsize=40, labelpad=20)

    # One tick per integer residual, but only the even ones get a visible label
    tick_positions = list(range(-5, 6))
    tick_labels = [r'$%d$' % tick if tick % 2 == 0 else r'' for tick in tick_positions]
    y_axis.set_ticks(tick_positions)
    y_axis.set_ticklabels(tick_labels, fontsize=45)
    x_axis.grid(True)

    # Horizontal guide lines at +-3 (green) and +-1 (blue)
    x_span = (self.xmin, self.xmax)
    for level, color in ((3, '#006600'), (1, 'b'), (-1, 'b'), (-3, '#006600')):
        axis.plot(x_span, (level, level), linewidth=4, color=color, linestyle='-')

    return self
922 
923 
class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training
    """

    #: figure which is used to draw
    figure = None
    #: main axis, holds the train and test distributions
    axis = None
    #: axis below the main axis, holds the train-test normalized residuals of the background
    axis_d1 = None
    #: lowest axis, holds the train-test normalized residuals of the signal
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figure = figure
        else:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)

        # Five rows: the distribution takes the upper three, each residual panel one row
        grid = matplotlib.gridspec.GridSpec(5, 1)
        self.axis = self.figure.add_subplot(grid[:3, :])
        self.axis_d1 = self.figure.add_subplot(grid[3, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(grid[4, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None, bkgrOutput=0, isNN=False):
        """
        Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
        otherwise there are too many curves in the plot to recognize anything in the plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param bkgrOutput if 0, distribution.logScale is enabled and signal/background legend labels
               are used, otherwise B0/anti-B0 legend labels are used
        @param isNN if True the bin edges are range(-51, 55) / 25 instead of / 50, the flag is also
               handed to the residual plots
        @return self
        """
        distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

        # Both binnings use the same integer grid, only the scaling differs
        divisor = 25 if isNN else 50
        bins = [edge / divisor for edge in range(-51, 55)]

        if bkgrOutput == 0:
            distribution.logScale = True
            distribution.labels = [r'{\rm Test-Bkgr.}', r'{\rm Train-Bkgr.}', r'{\rm Test-Signal}', r'{\rm Train-Signal}']
        else:
            distribution.labels = [
                r'{\rm Test-$\bar{B}^{0}$}',
                r'{\rm Train-$\bar{B}^{0}$}',
                r'{\rm Test-$B^{0}$}',
                r'{\rm Train-$B^{0}$}']

        # Test samples are drawn as data points without error band
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options({'fmt': 'o', 'elinewidth': 5, 'alpha': 1, 'markersize': 20, 'ecolor': 'w'})
        distribution.set_errorband_options(None)
        distribution.add(data, column, test_mask & bckgrd_mask, weight_column, None, bins)
        distribution.add(data, column, test_mask & signal_mask, weight_column, None, bins)

        # The colors chosen for the two test samples are reused for everything that follows
        bckgrd_color = distribution.plots[0][0][0].get_color()
        signal_color = distribution.plots[1][0][0].get_color()

        # Training samples are drawn as step lines in the color of the matching test sample
        distribution.set_errorbar_options(None)
        distribution.set_plot_options({'color': bckgrd_color, 'drawstyle': 'steps-mid', 'linestyle': 'dashed', 'lw': 5})
        distribution.set_fill_options(None)
        distribution.add(data, column, train_mask & bckgrd_mask, weight_column, None, bins)
        distribution.set_plot_options({'color': signal_color, 'drawstyle': 'steps-mid', 'linestyle': 'solid', 'lw': 5})
        distribution.add(data, column, train_mask & signal_mask, weight_column, None, bins)

        distribution.finish()

        # Empty artists which only serve as handles for the legend
        marker_style = dict(xerr=0, yerr=0, elinewidth=5, mew=2, ecolor='w', fmt='o', markersize=20)
        p1 = distribution.axis.errorbar([], [], mfc=bckgrd_color, mec=bckgrd_color,
                                        label=r'${\rm Test-Bkgr.}$', **marker_style)
        p2, = distribution.axis.plot([], label=r'${\rm Train-Bkgr.}$', linewidth=5,
                                     linestyle='dashed', c=bckgrd_color)
        p3 = distribution.axis.errorbar([], [], mfc=signal_color, mec=signal_color,
                                        label=r'${\rm Test-Signal}$', **marker_style)
        p4, = distribution.axis.plot([], label=r'${\rm Train-Signal}$', linewidth=5,
                                     linestyle='solid', alpha=0.9, c=signal_color)

        distribution.axis.legend([p1, p2, p3, p4], distribution.labels, loc='best', fancybox=True, framealpha=0.5, fontsize=60)

        # Train-minus-test normalized residuals: background on axis_d1 first, then signal on axis_d2
        residual_panels = ((self.axis_d1, bckgrd_mask, bckgrd_color),
                           (self.axis_d2, signal_mask, signal_color))
        for residual_axis, class_mask, color in residual_panels:
            self.plot_kwargs['color'] = color
            residuals = normalizedResiduals(self.figure, residual_axis, shift_to_zero=True, normed=True)
            residuals.set_plot_options(self.plot_kwargs)
            residuals.set_errorbar_options(self.errorbar_kwargs)
            residuals.set_errorband_options(self.errorband_kwargs)
            residuals.add(data, column, train_mask & class_mask, test_mask & class_mask, weight_column, None, bins, isNN)
            residual_axis.set_xlim((residuals.xmin, residuals.xmax))
            residual_axis.set_ylim((residuals.ymin, residuals.ymax))
            # Forget plots and labels again before finishing the residual panel
            residuals.plots = residuals.labels = []
            residuals.finish(line_color=color)

        # NOTE: a commented-out Kolmogorov-Smirnov test (scipy.stats.ks_2samp), which wrote the
        # train/test p-values for signal and background into the residual panels, used to live here.

        return self

    def finish(self, xLabel=r'${\rm Classifier\ Output}$'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param xLabel label of the shared x-axis, it is only drawn below the lowest panel
        @return self
        """
        for residual_axis in (self.axis_d1, self.axis_d2):
            residual_axis.set_title("")

        # Only the lowest panel shows x tick labels and the x-axis label
        for upper_axis in (self.axis, self.axis_d1):
            matplotlib.artist.setp(upper_axis.get_xticklabels(), visible=False)
        for upper_axis in (self.axis, self.axis_d1):
            upper_axis.get_xaxis().set_label_text('')

        bottom_x_axis = self.axis_d2.get_xaxis()
        bottom_x_axis.set_label_text(xLabel, fontsize=85)
        bottom_x_axis.labelpad = 20
        bottom_x_axis.set_tick_params(labelsize=60)
        return self
1091 
1092 
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes for the boxplots, one per call of add
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super(VerboseDistribution, self).__init__(figure, axis)

        #: Normalize histograms before drawing them
        self.normed = normed
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        self.box_axes = []
        #: The distribution plot which is drawn on the main axis
        self.distribution = Distribution(self.figure, self.axis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, handed to the Distribution plot
        @return self
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # Re-layout the figure: 3/4 of the rows for the distribution, one row per boxplot
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            if mask is None:
                # Without an explicit mask all events take part in the range cut,
                # 'None & condition' is not a valid selection
                mask = numpy.ones(len(data), dtype=bool)
            mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            mask = mask & (data[column] > (mean - self.range_in_std * std)) & (data[column] < (mean + self.range_in_std * std))
        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if box.plots:
            # Give the box the color of the distribution which was just drawn
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # All but the last boxplot lose their x tick labels and x-axis label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        if self.box_axes:
            # Guarded, because finish may be called before anything was added
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1166 
1167 
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: top axis, drawn for the combined (signal | background) selection
    axis = None
    #: middle axis, drawn for the signal selection
    axis_d1 = None
    #: bottom axis, drawn for the background selection
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        gs = matplotlib.gridspec.GridSpec(3, 2)
        self.axis = self.figure.add_subplot(gs[0, :])
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super(Correlation, self).__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100, defining the different cuts.
               NOTE(review): currently ignored, the cuts are fixed to the quantiles 5, 10, ..., 95
        @param signal_mask boolean numpy.array defining which events are signal events.
               NOTE(review): despite the None default both masks are combined with '|' and used
               for indexing, so callers presumably have to pass both - confirm
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @return self
        """
        if len(data[cut_column]) == 0:
            B2WARNING("Ignore empty Correlation.")
            return self

        colormap = plt.get_cmap('coolwarm')
        panels = [(self.axis, '.', signal_mask | bckgrd_mask),
                  (self.axis_d1, 'S', signal_mask),
                  (self.axis_d2, 'B', bckgrd_mask)]

        for axis, tag, mask in panels:
            values = data[column][mask]
            cut_values = data[cut_column][mask]

            if weight_column is not None:
                weights = numpy.array(data[weight_column][mask])
            else:
                weights = numpy.ones(len(values))

            # The cast to float32 is a workaround for the following numpy issue:
            # https://github.com/numpy/numpy/issues/8123
            value_range = np.percentile(values, [5, 95]).astype(np.float32)

            # 'density' replaces the 'normed' keyword, which newer numpy versions no longer accept;
            # for the equal-width bins used here both give the same result
            previous, edges = np.histogram(values, bins=100, range=value_range, density=True, weights=weights)
            # All histograms below share bins and range, so the bin centers are computed only once
            bin_center = (edges[1:] + edges[:-1]) / 2
            axis.plot(bin_center, previous, color='black', lw=1)

            # Tighten the cut step by step and fill the area between consecutive distributions
            for quantil in np.arange(5, 100, 5):
                cut = np.percentile(cut_values, quantil)
                sel = cut_values >= cut
                current, _ = np.histogram(values[sel], bins=100, range=value_range, density=True, weights=weights[sel])
                axis.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
                previous = current

            axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            axis.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
        return self

    def finish(self):
        """
        Nothing is left to do here, everything is already drawn in add
        @return self
        """
        return self
1251 
1252 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot: the given columns are embedded into two dimensions and
        one scatter plot is drawn per mask.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the embedding
        @param masks different classes to show in TSNE, boolean arrays with one entry per row of data
        @return self
        """
        # Only the optional import is guarded, errors of the embedding itself are not swallowed
        try:
            import sklearn.manifold
        except ImportError:
            print("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        # Keep 'data' untouched: it was previously overwritten by this array and
        # then indexed with column names again inside the loop
        features = numpy.array([data[column] for column in columns]).T
        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        # sklearn's TSNE offers no transform() for already fitted models, so all events
        # are embedded once and the classes are selected afterwards
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = numpy.asarray(mask, dtype=bool)
            self.axis.scatter(embedding[selected, 0], embedding[selected, 1])
        return self

    def finish(self):
        """
        Nothing is left to do here, everything is already drawn in add
        @return self
        """
        return self
1284 
1285 
class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables, displayHeatMap):
        """
        Add a new importance plot: a heat map of the importances, sorted in ascending order
        and rescaled to the range 0 to 100 for each column.
        @param data pandas.DataFrame containing all data
        @param columns which hold the importances, each one becomes a column of the matrix
        @param variables names belonging to the rows of data, used as y tick labels
        @param displayHeatMap if true a colorbar labelled low/high is drawn next to the matrix
        @return self
        """
        self.figure.set_tight_layout(True)

        def norm(x):
            """Rescale x linearly to the range 0 to 100, all zeros if x is constant"""
            width = (numpy.max(x) - numpy.min(x))
            if width <= 0:
                return numpy.zeros(x.shape)
            return (x - numpy.min(x)) / width * 100

        importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
        n_rows, n_columns = importance_matrix.shape

        cRdBu = plt.get_cmap('RdBu')
        new_RdBu = truncate_colormap(cRdBu, 0.5, 0.85)

        # Pairs of [importance, variable name], sorted ascending by importance.
        # The conversion to an array turns the importances into strings.
        labels = list(variables)
        labelsValues = np.array(sorted([importance_matrix[y, x], labels[y]]
                                       for y in range(n_rows) for x in range(n_columns)))

        sorted_matrix = np.sort(importance_matrix, axis=0)
        importance_heatmap = self.axis.pcolor(sorted_matrix, cmap=new_RdBu, vmin=0, vmax=100)

        # put the major ticks at the middle of each cell
        self.axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
        self.axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

        self.axis.set_xticklabels(columns, minor=False, rotation=90)

        # Few entries allow for larger fonts
        if labelsValues.shape[0] < 6:
            CoeffSize = 50
            self.axis.set_yticklabels(labelsValues[:, 1], minor=False, size=58)
        else:
            CoeffSize = 33
            self.axis.set_yticklabels(labelsValues[:, 1], minor=False)

        self.axis.set_xticklabels([''])

        # The values are written into the last column. This used to rely on the loop
        # variable 'x' leaking out of the nested loop above, now it is explicit.
        last_column = n_columns - 1
        for y in range(labelsValues.shape[0]):
            self.axis.text(last_column + 0.5, y + 0.5, r'$%.0f$' % float(labelsValues[y][0]),
                           size=CoeffSize,
                           horizontalalignment='center',
                           verticalalignment='center')

        if displayHeatMap:
            cb = self.figure.colorbar(importance_heatmap, ticks=[2, 98], orientation='vertical')
            cb.ax.tick_params(length=0)
            cb.ax.set_yticklabels([r'${\rm low}$', r'${\rm high}$'], size=60)

        self.axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Nothing is left to do here, everything is already drawn in add
        @return self
        """
        return self
1369 
1370 
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """
    Creates a new colormap out of a sub-range of an existing colormap
    @param cmap colormap which is sampled, it has to be callable and provide a name
    @param minval lower end of the sampled range
    @param maxval upper end of the sampled range
    @param n number of equidistant points at which cmap is sampled
    @return matplotlib.colors.LinearSegmentedColormap named 'trunc(<name>,<minval>,<maxval>)'
    """
    truncated_name = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
    sampled_colors = cmap(np.linspace(minval, maxval, n))
    return matplotlib.colors.LinearSegmentedColormap.from_list(truncated_name, sampled_colors)
1376 
1377 
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: left axis, holds the correlation matrix of the signal events
    signal_axis = None
    #: right axis, holds the correlation matrix of the background events
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(38, 24))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        gs = matplotlib.gridspec.GridSpec(16, 2)
        self.signal_axis = self.figure.add_subplot(gs[:14, 0])
        self.bckgrd_axis = self.figure.add_subplot(gs[:14, 1], sharey=self.signal_axis)
        #: axis at the bottom which holds the shared colorbar
        self.colorbar_axis = self.figure.add_subplot(gs[15, :])
        self.axis = self.signal_axis

        super(CorrelationMatrix, self).__init__(self.figure, self.axis)

    @staticmethod
    def _mirrored_correlation(data, columns, mask):
        """Returns the correlation matrix in percent of the masked columns, with reversed column order"""
        # atleast_2d because corrcoef returns a scalar if only one column is given
        corr = numpy.atleast_2d(numpy.corrcoef(numpy.vstack([data[column][mask] for column in columns]))) * 100
        return corr[:, ::-1].copy()

    @staticmethod
    def _remove_negative_zeros(matrix):
        """Sets all entries which would be printed as '-0' to 0, the matrix is changed in place"""
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                if '%.0f' % matrix[y, x] == '-0':
                    matrix[y, x] = 0

    @staticmethod
    def _write_coefficients(axis, matrix, size):
        """Writes the rounded value of each matrix entry into the center of its cell on the given axis"""
        # For large matrices negative values get a plain-text minus sign and a smaller font
        compact = matrix.shape[0] > 24
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                value = matrix[y, x]
                if compact and value < 0:
                    text, text_size = '-' + r'$%.0f$' % abs(value), 25
                else:
                    text, text_size = r'$%.0f$' % value, size
                axis.text(x + 0.5, y + 0.5, text,
                          size=text_size,
                          horizontalalignment='center',
                          verticalalignment='center')

    def add(self, data, columns, signal_mask, bckgrd_mask, bkgrOutput):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param bkgrOutput if -1 the two matrices are captioned as B0 and anti-B0,
               otherwise as Signal and Background
        @return self
        """
        mirrored_signal_corr = self._mirrored_correlation(data, columns, signal_mask)
        mirrored_bckgrd_corr = self._mirrored_correlation(data, columns, bckgrd_mask)

        cRdBu = plt.get_cmap('RdBu')
        new_RdBu = truncate_colormap(cRdBu, 0.15, 0.85)
        signal_heatmap = self.signal_axis.pcolor(mirrored_signal_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)
        self.bckgrd_axis.pcolor(mirrored_bckgrd_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)

        # Avoid printing '-0' for tiny negative correlations
        self._remove_negative_zeros(mirrored_signal_corr)
        self._remove_negative_zeros(mirrored_bckgrd_corr)

        self.signal_axis.invert_yaxis()
        self.signal_axis.xaxis.tick_top()
        self.bckgrd_axis.invert_yaxis()
        self.bckgrd_axis.xaxis.tick_top()

        # put the major ticks at the middle of each cell
        self.signal_axis.set_xticks(numpy.arange(mirrored_signal_corr.shape[0]) + 0.5, minor=False)
        self.signal_axis.set_yticks(numpy.arange(mirrored_signal_corr.shape[1]) + 0.5, minor=False)
        self.bckgrd_axis.set_xticks(numpy.arange(mirrored_bckgrd_corr.shape[0]) + 0.5, minor=False)
        self.bckgrd_axis.set_yticks(numpy.arange(mirrored_bckgrd_corr.shape[1]) + 0.5, minor=False)

        # Small matrices allow for larger fonts
        if mirrored_signal_corr.shape[0] < 8:
            CoeffSize = 50
            label_options = {'size': 58}
        else:
            CoeffSize = 30
            label_options = {}

        # The columns of the matrices are mirrored, hence the reversed x labels
        reversed_columns = list(reversed(columns))
        for axis in (self.bckgrd_axis, self.signal_axis):
            axis.set_xticklabels(reversed_columns, minor=False, rotation=90, **label_options)
            axis.set_yticklabels(columns, minor=False, **label_options)

        # Each matrix is annotated on its own axis. Negative background values of large
        # matrices were written onto the signal axis before.
        self._write_coefficients(self.signal_axis, mirrored_signal_corr, CoeffSize)
        self._write_coefficients(self.bckgrd_axis, mirrored_bckgrd_corr, CoeffSize)

        cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-92.3, 0, 92.5], orientation='horizontal')
        cb.ax.tick_params(length=0)
        cb.ax.set_xticklabels([r'${\rm negative}$', r'${\rm uncorrelated}$', r'${\rm positive}$'], fontsize=60)

        if bkgrOutput == -1:
            self.figure.text(0.30, 0.11, r'$B^0\,(q_{\rm MC} = +1)$', horizontalalignment='center', size=65)
            self.figure.text(0.74, 0.11, r'$\bar{B}^0\,(q_{\rm MC} = -1)$', horizontalalignment='center', size=65)
        else:
            self.figure.text(0.27, 0.115, r'${\rm Signal}$', horizontalalignment='center', size=65)
            self.figure.text(0.73, 0.115, r'${\rm Background}$', horizontalalignment='center', size=65)

        return self

    def finish(self):
        """
        Hides the y tick labels of the background matrix, they are shared with the signal matrix
        @return self
        """
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1526 
1527 
if __name__ == '__main__':
    # pandas is only needed to create the fake data of these examples
    import pandas

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last one holds the 0/1 class flag
        @return shuffled pandas.DataFrame
        """
        # Integer division: sizes of numpy arrays have to be integers
        half = N // 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(half, n))
        xb = numpy.random.normal(1, size=(half, n))
        ys = numpy.zeros(half)
        yb = numpy.ones(half)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # First half of the shuffled events is used for training, the rest for testing.
    # Assigned in one step, a chained 'data.type.iloc[...] = ...' may act on a copy.
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation.add combines and indexes with both masks, so both are passed
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    # CorrelationMatrix.add requires both masks and bkgrOutput (0: Signal/Background captions)
    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          is_signal, is_bckgrd, 0)
    p.finish()
    p.save('correlation_matrix.png')
1614 
1615 # @endcond
plot
Definition: plot.py:1
histogram.Histograms
Definition: histogram.py:38
defaultEvaluationParameters.Quiet.__init__
def __init__(self, level=ROOT.kInfo+1)
Definition: defaultEvaluationParameters.py:21
histogram.weighted_mean_and_std
def weighted_mean_and_std(x, w)
Definition: histogram.py:27
histogram.poisson_error
def poisson_error(n_tot)
Definition: histogram.py:20
basf2_mva_util.calculate_flatness
def calculate_flatness(f, p, w=None)
Definition: basf2_mva_util.py:54