Belle II Software  release-08-01-10
ftPlotting.py
1 #!/usr/bin/env python3
2 
3 
10 
11 # @cond SUPPRESS_DOXYGEN
12 
13 import basf2_mva_util
14 from basf2 import B2INFO, B2WARNING
15 import basf2_mva_evaluation.histogram as histogram
16 import matplotlib.ticker
17 import matplotlib.patches
18 import matplotlib.colors
19 import matplotlib.gridspec
20 import matplotlib.figure
21 import matplotlib.artist
22 import matplotlib.pyplot as plt
23 import copy
24 import math
25 import pandas
26 import numpy
27 import numpy as np
28 import matplotlib
# Do not use the standard TkAgg backend, because it is NOT thread-safe.
# Otherwise you get "RuntimeError: main thread is not in main loop".
matplotlib.use("svg")
matplotlib.rcParams.update({'font.size': 40})
matplotlib.rcParams['text.usetex'] = True
# The preamble is given as a plain string: newer matplotlib releases no longer
# accept a list of strings here, while older releases accept a single string too.
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath}"
35 
36 
class Plotter:
    """
    Base class for all Plotters.

    Stores the figure and axis to draw on, the running axis limits and the
    keyword arguments that are forwarded to the matplotlib drawing calls.
    Subclasses are expected to override add() and finish().
    """

    #: List of plot entries collected by add(), used to build the legend
    plots = None
    #: List of legend labels, one per entry in plots
    labels = None
    #: Lower limit of the x axis
    xmin = None
    #: Upper limit of the x axis
    xmax = None
    #: Lower limit of the y axis
    ymin = None
    #: Upper limit of the y axis
    ymax = None
    #: Relative margin applied to the y limits by scale_limits()
    yscale = 0.0
    #: Relative margin applied to the x limits by scale_limits()
    xscale = 0.0
    #: Figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        #: Keyword arguments for axis.plot (None disables the plain plot)
        self.plot_kwargs = None
        #: Keyword arguments for axis.errorbar (None disables the error bars)
        self.errorbar_kwargs = None
        #: Keyword arguments for the error band (None disables the error band)
        self.errorband_kwargs = None
        #: Keyword arguments for the fill below the curve (None disables the fill)
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with self.axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        """
        B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults of the set_*_options methods are never mutated,
    # because only a copy of the argument is stored. None is a meaningful
    # argument (it disables the corresponding drawing step), so it cannot be
    # used as a "use the default" sentinel here.

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for datapoint errorband
        @param fill_kwargs keyword arguments for the fill_between function
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is used to draw
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (legend handles, plot line, errorbar container, errorband);
                entries of steps which were not drawn are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            # No color requested: take the next one from the color cycle of the axis
            color = next(axis._get_lines.prop_cycler)['color']
            # plot_kwargs may be None (plain plot disabled); only store the
            # color if there is a dictionary to store it in.
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)

        # The patch serves as legend handle; the legend code asks for get_color
        patch = matplotlib.patches.Patch(color=color, alpha=0.7)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the marker color
                errorbar_kwargs['ecolor'] = [0.4 * channel for channel in color]
            # The error bar line width is always forced to 5, overriding any
            # 'elinewidth' passed via set_errorbar_options.
            errorbar_kwargs['elinewidth'] = 5
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # One rectangle per data point spanning +- the errors
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        """
        # copysign makes the margin grow away from zero for either sign of the limit
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
237 
238 
class PurityAndEfficiencyOverCut(Plotter):
    """
    Draws purity and efficiency as a function of the cut value, to help choosing a cut
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add one pair of curves to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed if True efficiency and purity are drawn, otherwise true and false positives
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        if normed:
            first, first_error = hists.get_efficiency(['Signal'])
            second, second_error = hists.get_purity(['Signal'], ['Background'])
            first_label, second_label = "Efficiency", "Purity"
        else:
            first, first_error = hists.get_true_positives(['Signal'])
            second, second_error = hists.get_false_positives(['Background'])
            first_label, second_label = "True positive", "False positive"

        cuts = hists.bin_centers

        # Extend the stored limits so that they enclose the new curves
        lower_x = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        upper_x = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        lower_y = numpy.nanmin([numpy.nanmin(first), numpy.nanmin(second), self.ymin])
        upper_y = numpy.nanmax([numpy.nanmax(first), numpy.nanmax(second), self.ymax])
        self.xmin, self.xmax = lower_x, upper_x
        self.ymin, self.ymax = lower_y, upper_y

        curves = ((first, first_error, first_label), (second, second_error, second_label))
        for values, errors, legend_label in curves:
            self.plots.append(self._plot_datapoints(self.axis, cuts, values, xerr=0, yerr=errors))
            self.labels.append(legend_label)

        return self

    def finish(self):
        """
        Applies limits, title, x axis label and legend to the axis
        """
        ax = self.axis
        ax.set_xlim((self.xmin, self.xmax))
        ax.set_ylim((self.ymin, self.ymax))
        ax.set_title("Classification Plot")
        ax.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        ax.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
299 
300 
class SignalToNoiseOverCut(Plotter):
    """
    Draws the signal to noise ratio as a function of the cut value, to help choosing a cut
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add one curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate signal to noise ratio for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed accepted for a signature parallel to the other plotters, not used here
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        ratio, ratio_error = hists.get_signal_to_noise(['Signal'], ['Background'])
        cuts = hists.bin_centers

        # Extend the stored limits so that they enclose the new curve
        lower_x = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        upper_x = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        lower_y = numpy.nanmin([numpy.nanmin(ratio), self.ymin])
        upper_y = numpy.nanmax([numpy.nanmax(ratio), self.ymax])
        self.xmin, self.xmax = lower_x, upper_x
        self.ymin, self.ymax = lower_y, upper_y

        entry = self._plot_datapoints(self.axis, cuts, ratio, xerr=0, yerr=ratio_error)
        self.plots.append(entry)
        self.labels.append(column)

        return self

    def finish(self):
        """
        Applies limits, title, x axis label and legend to the axis
        """
        ax = self.axis
        ax.set_xlim((self.xmin, self.xmax))
        ax.set_ylim((self.ymin, self.ymax))
        ax.set_title("Signal to Noise Plot")
        ax.get_xaxis().set_label_text('Cut Value')
        handles = [entry[0] for entry in self.plots]
        ax.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
346 
347 
class PurityOverEfficiency(Plotter):
    """
    Draws the purity as a function of the efficiency (ROC-like curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add one curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label legend label of the curve, the column name is used if None
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])

        # Extend the stored limits so that they enclose the new curve
        lower_x = numpy.nanmin([efficiency.min(), self.xmin])
        upper_x = numpy.nanmax([efficiency.max(), self.xmax])
        lower_y = numpy.nanmin([numpy.nanmin(purity), self.ymin])
        upper_y = numpy.nanmax([numpy.nanmax(purity), self.ymax])
        self.xmin, self.xmax = lower_x, upper_x
        self.ymin, self.ymax = lower_y, upper_y

        entry = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
        self.plots.append(entry)
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Applies limits, title, axis labels and legend to the axis
        """
        ax = self.axis
        ax.set_xlim((self.xmin, self.xmax))
        ax.set_ylim((self.ymin, self.ymax))
        ax.set_title("ROC Purity Plot")
        ax.get_xaxis().set_label_text('Efficiency')
        ax.get_yaxis().set_label_text('Purity')
        handles = [entry[0] for entry in self.plots]
        ax.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
392 
393 
class RejectionOverEfficiency(Plotter):
    """
    Draws the background rejection as a function of the signal efficiency (ROC curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add one curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and rejection for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label prefix of the legend entry (only its first 10 characters are used)
        @return the area under the curve (note: not self, unlike the other plotters)
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        bckgrd_efficiency, rejection_error = hists.get_efficiency(['Background'])
        # Rejection is the complement of the background efficiency
        rejection = 1 - bckgrd_efficiency

        # Extend the stored limits so that they enclose the new curve
        lower_x = numpy.nanmin([efficiency.min(), self.xmin])
        upper_x = numpy.nanmax([efficiency.max(), self.xmax])
        lower_y = numpy.nanmin([rejection.min(), self.ymin])
        upper_y = numpy.nanmax([rejection.max(), self.ymax])
        self.xmin, self.xmax = lower_x, upper_x
        self.ymin, self.ymax = lower_y, upper_y

        auc = numpy.abs(numpy.trapz(rejection, efficiency))

        entry = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
        self.plots.append(entry)

        auc_text = r"${:.2f}$".format(auc)
        if label is None:
            legend_label = r"${\rm AUC}\ =\ $" + auc_text
        else:
            legend_label = label[:10] + r"$\ {\rm AUC}\ =\ $" + auc_text
        self.labels.append(legend_label)

        return auc

    def finish(self):
        """
        Applies limits, grid, tick sizes, axis labels and legend to the axis
        """
        ax = self.axis
        x_axis = ax.get_xaxis()
        y_axis = ax.get_yaxis()

        ax.set_xlim((self.xmin, self.xmax))
        ax.set_ylim((self.ymin, self.ymax))
        x_axis.set_tick_params(labelsize=60)
        y_axis.set_tick_params(labelsize=60)
        ax.grid(True)
        x_axis.labelpad = 20
        y_axis.labelpad = 20
        x_axis.set_label_text(r'${\rm Signal\ Efficiency}$', fontsize=65)
        y_axis.set_label_text(r'${\rm Background\ Rejection}$', fontsize=65)
        handles = [entry[0] for entry in self.plots]
        ax.legend(handles, self.labels, fancybox=True, framealpha=0.5, fontsize=60, loc=3)
        return self
447 
448 
class Multiplot(Plotter):
    """
    Arranges several plotters of the same class in a grid with up to 3 columns
    """

    #: Figure which is used to draw
    figure = None
    #: Axis of the first sub plot
    axis = None

    def __init__(self, cls, number_of_plots, figure=None):
        """
        Creates a new figure if None is given and one sub plotter per grid cell
        @param cls plotter class which is instantiated for every sub plot
        @param number_of_plots number of sub plots to create
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figure = figure
        else:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)

        # Up to three plots share a single row, more plots are wrapped into rows of three
        single_row_shapes = {1: (1, 1), 2: (1, 2), 3: (1, 3)}
        shape = single_row_shapes.get(number_of_plots)
        if shape is None:
            shape = (int(numpy.ceil(number_of_plots / 3)), 3)
        gs = matplotlib.gridspec.GridSpec(*shape)

        #: the sub plotters, one per grid cell, filled row by row
        self.sub_plots = []
        for index in range(number_of_plots):
            row, col = divmod(index, 3)
            sub_axis = self.figure.add_subplot(gs[row, col])
            self.sub_plots.append(cls(self.figure, sub_axis))
        self.axis = self.sub_plots[0].axis
        super().__init__(self.figure, self.axis)

    def add(self, i, *args, **kwargs):
        """
        Forward all arguments to the add function of the ith sub plot
        @param i position of the subplot
        """
        target = self.sub_plots[i]
        target.add(*args, **kwargs)

    def finish(self):
        """
        Calls finish on every sub plot
        """
        for sub_plot in self.sub_plots:
            sub_plot.finish()
        return self
497 
498 
class Diagonal(Plotter):
    """
    Draws the purity in each bin as a function of the classifier output.
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add one curve to the Diagonal plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

        centers = hists.bin_centers
        lower_x = min(centers.min(), self.xmin)
        upper_x = max(centers.max(), self.xmax)
        self.xmin, self.xmax = lower_x, upper_x
        # The y range is fixed to [0, 1] and does not follow the data
        self.ymin, self.ymax = 0, 1

        half_widths = hists.bin_widths / 2.0
        entry = self._plot_datapoints(self.axis, hists.bin_centers, purity, xerr=half_widths, yerr=purity_error)
        self.plots.append(entry)
        self.labels.append(column)
        return self

    def finish(self):
        """
        Draws the reference diagonal and applies limits, title, axis labels and legend
        """
        self.scale_limits()
        ax = self.axis
        ax.plot((0.0, 1.0), (0.0, 1.0), color='black')
        ax.set_xlim((self.xmin, self.xmax))
        ax.set_ylim((self.ymin, self.ymax))
        ax.set_title("Diagonal Plot")
        ax.get_xaxis().set_label_text('Classifier Output')
        ax.get_yaxis().set_label_text('Purity Per Bin')
        handles = [entry[0] for entry in self.plots]
        ax.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
542 
543 
class Distribution(Plotter):
    """
    Plots distribution of a quantity
    """

    def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
                 keep_first_binning=False, range_in_std=None, logScale=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed_to_all_entries true if histograms should be divided by their total content before drawing
        @param normed_to_bin_width true if histograms should be divided by the bin widths before drawing
        @param keep_first_binning use the binning of the first distribution for further plots
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        @param logScale true if the y axis should use a logarithmic scale
        """
        super().__init__(figure, axis)
        #: Normalize histograms to their total content before drawing
        self.normed_to_all_entries = normed_to_all_entries
        #: Normalize histograms to the bin width before drawing
        self.normed_to_bin_width = normed_to_bin_width
        #: Shown range in units of the standard deviation around the mean
        self.range_in_std = range_in_std

        # Start from "empty" limits so that the first add() defines them
        self.ymin = float(0)
        self.ymax = float('-inf')
        self.xmin = float('inf')
        self.xmax = float('-inf')

        #: Keep the binning of the first distribution for all following ones
        self.keep_first_binning = keep_first_binning
        #: Binning of the first distribution (only filled if keep_first_binning is set)
        self.first_binning = None
        #: Label of the x axis (name of the last added column)
        self.x_axis_label = ''
        #: Draw the y axis with a logarithmic scale
        #  (previously the constructor argument was ignored and this was always False)
        self.logScale = logScale
        #: Bin width quoted in the y axis label
        self.binWidth = 0.02

    def add(self, data, column, mask=None, weight_column=None, label=None, bins=50):
        """
        Add a new distribution to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, the column name is used if None
        @param bins binning passed to the histogram; replaced by the first binning if keep_first_binning is set
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')

        if self.keep_first_binning and self.first_binning is not None:
            bins = self.first_binning
        hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                     bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
        if self.keep_first_binning and self.first_binning is None:
            self.first_binning = hists.bins
        hist, hist_error = hists.get_hist('Total')
        self.binWidth = hists.bin_widths[1]

        if self.normed_to_all_entries:
            normalization = float(numpy.sum(hist))
            hist = hist / normalization
            hist_error = hist_error / normalization

        if self.normed_to_bin_width:
            hist = hist / hists.bin_widths
            hist_error = hist_error / hists.bin_widths

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin, self.ymax = numpy.nanmin([hist.min(), self.ymin]), numpy.nanmax([(hist + hist_error).max(), self.ymax])

        p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
        self.plots.append(p)
        self.x_axis_label = column
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))

        if self.logScale:
            # Non-positive values are clipped by matplotlib's log scale by
            # default; the keyword selecting this was renamed between
            # matplotlib versions, so it is not passed explicitly.
            self.axis.set_yscale('log')
        else:
            self.axis.set_ylim((self.ymin, self.ymax))

        # Format into a local variable: self.binWidth must stay numeric so
        # that finish() can be called more than once.
        bin_width_text = '{:8.2f}'.format(self.binWidth)

        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        if self.normed_to_all_entries and self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / (# Entries * Bin Width)')
        elif self.normed_to_all_entries:
            self.axis.get_yaxis().set_label_text(
                r'{$\frac{\rm Entries\hspace{0.25em} per\hspace{0.25em} Bin}{\rm Entries}\, /\, (' +
                bin_width_text + r'\,)$}', fontsize=65)
            self.axis.get_yaxis().labelpad = 20
            self.axis.get_yaxis().set_tick_params(labelsize=60)
        elif self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / Bin Width')
        else:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin')
        return self
658 
659 
class Box(Plotter):
    """
    Draws a horizontal boxplot of a quantity
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)
        #: Label of the x axis (name of the last added column)
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add one boxplot to the axis
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot
        @param weight_column column in data containing the weights for each event (currently ignored)
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')
        values = data[column][mask]

        if weight_column is not None:
            B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(values) == 0:
            B2WARNING("Ignore empty boxplot.")
            return self

        box_style = dict(facecolor='blue', alpha=0.5)
        artists = self.axis.boxplot(values, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True,
                                    widths=1, boxprops=box_style)
        self.plots.append(artists)
        self.labels.append(column)
        self.x_axis_label = column

        return self

    def finish(self):
        """
        Hides the y axis and sets the x axis label and the title
        """
        ax = self.axis
        matplotlib.artist.setp(ax.get_yaxis(), visible=False)
        ax.get_xaxis().set_label_text(self.x_axis_label)
        ax.set_title("Box Plot")
        return self
725 
726 
class Difference(Plotter):
    """
    Plots the difference between two histograms
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
        """
        super().__init__(figure, axis)
        #: Normalize minuend and subtrahend before comparing them
        self.normed = normed
        #: Subtract the mean difference, so that the difference is centered around zero
        self.shift_to_zero = shift_to_zero
        #: Label of the x axis; initialised here so that finish() works before add()
        self.x_axis_label = ''
        # Minimal y range, extended by add() if the data exceeds it
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
        """
        Add a new difference plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        """
        bins = 50
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        # The per-histogram errors are not needed, the error is derived from the summed content
        minuend, _ = hists.get_hist('Minuend')
        subtrahend, _ = hists.get_hist('Subtrahend')

        difference_error = histogram.poisson_error(minuend + subtrahend)
        if self.normed:
            difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        difference = minuend - subtrahend

        if self.shift_to_zero:
            difference = difference - numpy.mean(difference)

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin = min((difference - difference_error).min(), self.ymin)
        self.ymax = max((difference + difference_error).max(), self.ymax)

        p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
        self.plots.append(p)
        # Fall back to the column name only if no label was given
        # (the two branches were swapped before, which stored None as label).
        self.labels.append(column if label is None else label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param line_color color of the horizontal reference line at zero
        """
        # The zero line is drawn before scale_limits, i.e. over the unscaled x range
        self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4)
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((self.ymin, self.ymax))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'{\rm Difference}', fontsize=40, labelpad=20)
        self.axis.get_xaxis().grid(True)
        return self
816 
817 
818 class normalizedResiduals(Plotter):
819  """
820  Plots the difference between two histograms
821  """
822 
836 
837  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
838  """
839  Creates a new figure and axis if None is given, sets the default plot parameters
840  @param figure default draw figure which is used
841  @param axis default draw axis which is used
842  @param normed normalize minuend and subtrahend before comparing them
843  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
844  """
845  super().__init__(figure, axis)
846  self.normed = normed
847  self.shift_to_zero = shift_to_zero
848  if self.normed:
849  self.ymin = -0.01
850  self.ymax = 0.01
851  else:
852  self.ymin = -1
853  self.ymax = 1
854 
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None, bins=50, isNN=False):
    """
    Add a new normalized-residuals plot, i.e. (minuend - subtrahend) / error for every bin
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
    @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
    @param weight_column column in data containing the weights for each event
    @param label label for the legend if None, the column name is used
    @param bins binning which is passed on to histogram.Histograms
    @param isNN if True the lower x-limit of the plot is set to -1
    """
    hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                 weight_column=weight_column, equal_frequency=False)
    minuend, minuend_error = hists.get_hist('Minuend')
    subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

    if self.normed:
        # The errors are scaled with the totals of the raw histograms,
        # so the totals have to be taken before the histograms are normalised.
        minuend_total = numpy.sum(minuend)
        subtrahend_total = numpy.sum(subtrahend)
        difference_error = numpy.sqrt((minuend_error / minuend_total)**2 + (subtrahend_error / subtrahend_total)**2)
        minuend = minuend / minuend_total
        subtrahend = subtrahend / subtrahend_total
    else:
        difference_error = histogram.poisson_error(minuend + subtrahend)

    # Bins with a vanishing error yield nan/inf, exactly as before;
    # only the numpy runtime warnings for those bins are silenced.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        normalized_residuals = (minuend - subtrahend) / difference_error

    # NOTE(review): self.shift_to_zero is stored by the constructor but was never applied
    # to the plotted residuals (only to an unused intermediate); that behaviour is kept.

    if isNN:
        self.xmin = -1.0

    p = self._plot_datapoints(self.axis, hists.bin_centers, normalized_residuals, xerr=hists.bin_widths / 2, yerr=1)
    self.plots.append(p)
    # Fall back to the column name if no explicit label was given
    self.labels.append(column if label is None else label)
    self.x_axis_label = column
    return self
900 
def finish(self, line_color='black'):
    """
    Sets limits, title, axis-labels and the horizontal guide lines of the plot
    @param line_color accepted for interface compatibility, not used by this method
    """
    self.scale_limits()

    axis = self.axis
    x_axis = axis.get_xaxis()
    y_axis = axis.get_yaxis()

    axis.set_xlim((self.xmin, self.xmax))
    axis.set_ylim((-5, 5))
    axis.set_title("Difference Plot")
    y_axis.set_major_locator(matplotlib.ticker.MaxNLocator(5))
    x_axis.set_label_text(self.x_axis_label)
    axis.set_ylabel(r'${\rm Normalized}$' + '\n' + r'${\rm Residuals}$', fontsize=40, labelpad=20)

    # One tick per unit between -5 and 5, only the even ones inside the frame are labelled
    ticks = list(range(-5, 6))
    tick_labels = [r'$%d$' % tick if tick % 2 == 0 else r'' for tick in ticks]
    y_axis.set_ticks(ticks)
    y_axis.set_ticklabels(tick_labels, fontsize=45)
    x_axis.grid(True)

    # Guide lines at +-1 (blue) and +-3 (green), drawn from top to bottom
    x_span = (self.xmin, self.xmax)
    for level, color in ((3, '#006600'), (1, 'b'), (-1, 'b'), (-3, '#006600')):
        axis.plot(x_span, (level, level), linewidth=4, color=color, linestyle='-')

    return self
924 
925 
class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training
    """

    # figure which is used to draw
    figure = None
    # main axis which shows the train/test distributions
    axis = None
    # axis which shows the normalized residuals (train vs. test) of the background
    axis_d1 = None
    # axis which shows the normalized residuals (train vs. test) of the signal
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # The distribution takes the upper three fifths, each residual plot one fifth
        gs = matplotlib.gridspec.GridSpec(5, 1)
        self.axis = self.figure.add_subplot(gs[:3, :])
        self.axis_d1 = self.figure.add_subplot(gs[3, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[4, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None, bkgrOutput=0, isNN=False):
        """
        Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
        otherwise there are too many curves in the plot to recognize anything in the plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param bkgrOutput if 0 the curves are labelled as signal/background and distribution.logScale is switched on,
               otherwise the curves are labelled as B0/anti-B0
        @param isNN if True the binning spans [-2.04, 2.16] instead of [-1.02, 1.08];
               the flag is also passed on to the residual plots
        """
        distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

        # 106 equidistant values, in steps of 1/25 for isNN and 1/50 otherwise
        divisor = 25 if isNN else 50
        bins = [edge / divisor for edge in range(-51, 55)]

        if bkgrOutput == 0:
            distribution.logScale = True
            distribution.labels = [r'{\rm Test-Bkgr.}', r'{\rm Train-Bkgr.}', r'{\rm Test-Signal}', r'{\rm Train-Signal}']
        else:
            distribution.labels = [
                r'{\rm Test-$\bar{B}^{0}$}',
                r'{\rm Train-$\bar{B}^{0}$}',
                r'{\rm Test-$B^{0}$}',
                r'{\rm Train-$B^{0}$}']

        # Test samples are drawn as markers ...
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options({'fmt': 'o', 'elinewidth': 5, 'alpha': 1, 'markersize': 20, 'ecolor': 'w'})
        distribution.set_errorband_options(None)
        distribution.add(data, column, test_mask & bckgrd_mask, weight_column, None, bins)
        distribution.add(data, column, test_mask & signal_mask, weight_column, None, bins)

        # The train curves and the residual plots reuse the colors picked for the test samples
        bckgrd_color = distribution.plots[0][0][0].get_color()
        signal_color = distribution.plots[1][0][0].get_color()

        # ... training samples as step lines in the matching color
        distribution.set_errorbar_options(None)
        distribution.set_plot_options({'color': bckgrd_color, 'drawstyle': 'steps-mid', 'linestyle': 'dashed', 'lw': 5})
        distribution.set_fill_options(None)
        distribution.add(data, column, train_mask & bckgrd_mask, weight_column, None, bins)
        distribution.set_plot_options({'color': signal_color, 'drawstyle': 'steps-mid', 'linestyle': 'solid', 'lw': 5})
        distribution.add(data, column, train_mask & signal_mask, weight_column, None, bins)

        distribution.finish()

        # Empty proxy artists which are only used to build the legend
        p1 = distribution.axis.errorbar([], [], xerr=0, yerr=0, elinewidth=5, mew=2, ecolor='w',
                                        fmt='o', mfc=bckgrd_color, mec=bckgrd_color,
                                        markersize=20, label=r'${\rm Test-Bkgr.}$')
        p2, = distribution.axis.plot([], label=r'${\rm Train-Bkgr.}$', linewidth=5,
                                     linestyle='dashed', c=bckgrd_color)
        p3 = distribution.axis.errorbar([], [], xerr=0, yerr=0, elinewidth=5, mew=2, ecolor='w',
                                        fmt='o', mfc=signal_color, mec=signal_color,
                                        markersize=20, label=r'${\rm Test-Signal}$')
        p4, = distribution.axis.plot([], label=r'${\rm Train-Signal}$', linewidth=5,
                                     linestyle='solid', alpha=0.9, c=signal_color)

        handles = [p1, p2, p3, p4]
        # Only the leading labels belong to the handles; the slice makes this pairing explicit
        # in case distribution.labels holds more entries than there are handles.
        distribution.axis.legend(handles, distribution.labels[:len(handles)],
                                 loc='best', fancybox=True, framealpha=0.5, fontsize=60)

        self._add_residuals(self.axis_d1, data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask,
                            weight_column, bins, isNN, bckgrd_color)
        self._add_residuals(self.axis_d2, data, column, train_mask & signal_mask, test_mask & signal_mask,
                            weight_column, bins, isNN, signal_color)

        # NOTE: an annotation with the p-value of a Kolmogorov-Smirnov test between the train
        # and test samples (scipy.stats.ks_2samp) is currently not drawn on the residual axes.

        return self

    def _add_residuals(self, axis, data, column, minuend_mask, subtrahend_mask, weight_column, bins, isNN, color):
        """
        Draws the normalized residuals between two samples on the given axis
        @param axis axis which is used to draw the residuals
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining the events of the minuend histogram
        @param subtrahend_mask boolean numpy.array defining the events of the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param bins binning which is used for both histograms
        @param isNN passed on to normalizedResiduals.add
        @param color color which is used to draw the residuals
        """
        # As before the color is stored in the plot options of this plotter;
        # the guard only avoids a TypeError if no plot options were set yet.
        if self.plot_kwargs is None:
            self.plot_kwargs = {}
        self.plot_kwargs['color'] = color

        residuals = normalizedResiduals(self.figure, axis, shift_to_zero=True, normed=True)
        residuals.set_plot_options(self.plot_kwargs)
        residuals.set_errorbar_options(self.errorbar_kwargs)
        residuals.set_errorband_options(self.errorband_kwargs)
        residuals.add(data, column, minuend_mask, subtrahend_mask, weight_column, None, bins, isNN)
        axis.set_xlim((residuals.xmin, residuals.xmax))
        axis.set_ylim((residuals.ymin, residuals.ymax))
        # Drop the bookkeeping of the residual plot; two separate lists avoid aliasing plots and labels
        residuals.plots = []
        residuals.labels = []
        residuals.finish(line_color=color)

    def finish(self, xLabel=r'${\rm Classifier\ Output}$'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param xLabel label of the common x-axis, drawn below the lowest residual plot
        """
        self.axis_d1.set_title("")
        self.axis_d2.set_title("")
        # The three axes share their x-axis, only the lowest one shows tick labels and the axis label
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        matplotlib.artist.setp(self.axis_d1.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        self.axis_d1.get_xaxis().set_label_text('')
        self.axis_d2.get_xaxis().set_label_text(xLabel, fontsize=85)
        self.axis_d2.get_xaxis().labelpad = 20
        self.axis_d2.get_xaxis().set_tick_params(labelsize=60)
        return self
1093 
1094 
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    # Axes of the boxplots, one for every added distribution
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super().__init__(figure, axis)

        self.normed = normed
        self.range_in_std = range_in_std
        self.box_axes = []
        # The histograms themselves are drawn by an ordinary Distribution plot on the main axis
        self.distribution = Distribution(self.figure, self.axis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, passed on to the Distribution plot
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # Re-partition the figure: three quarters for the distribution, the rest is shared by the boxplots
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            in_range = (data[column] > (mean - self.range_in_std * std)) & (data[column] < (mean + self.range_in_std * std))
            # mask defaults to None, which cannot be combined with '&'
            mask = in_range if mask is None else mask & in_range

        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if box.plots:
            # Give the box the color of the distribution it belongs to
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # Only the lowest boxplot keeps its x tick labels and x-axis label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        if self.box_axes:
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1168 
1169 
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    # figure which is used to draw
    figure = None
    # axis showing all selected events (signal or background)
    axis = None
    # axis showing the signal events only
    axis_d1 = None
    # axis showing the background events only
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        gs = matplotlib.gridspec.GridSpec(3, 2)
        self.axis = self.figure.add_subplot(gs[0, :])
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100; currently not used,
               the cuts are always placed at the 5%, 10%, ..., 95% quantiles of cut_column
        @param signal_mask boolean numpy.array defining which events are signal events,
               if None the signal panel is not drawn
        @param bckgrd_mask boolean numpy.array defining which events are background events,
               if None the background panel is not drawn
        @param weight_column column in data containing the weights for each event
        """
        if len(data[cut_column]) == 0:
            B2WARNING("Ignore empty Correlation.")
            return self

        # The upper panel shows the union of the given masks, or all events if no mask is given
        if signal_mask is None and bckgrd_mask is None:
            combined_mask = numpy.ones(len(data[cut_column]), dtype=bool)
        elif signal_mask is None:
            combined_mask = bckgrd_mask
        elif bckgrd_mask is None:
            combined_mask = signal_mask
        else:
            combined_mask = signal_mask | bckgrd_mask

        panels = [(self.axis, '.', combined_mask), (self.axis_d1, 'S', signal_mask), (self.axis_d2, 'B', bckgrd_mask)]
        colormap = plt.get_cmap('coolwarm')

        for axis, tag, mask in panels:
            if mask is None:
                continue

            values = data[column][mask]
            cut_values = data[cut_column][mask]
            if len(values) == 0:
                B2WARNING("Ignore empty selection '" + tag + "' in Correlation.")
                continue

            if weight_column is not None:
                weights = numpy.array(data[weight_column][mask])
            else:
                weights = numpy.ones(len(values))

            # The cast to float32 is a workaround for the following numpy issue:
            # https://github.com/numpy/numpy/issues/8123
            value_range = np.percentile(values, [5, 95]).astype(np.float32)

            # 'density' replaces the 'normed' keyword, which newer numpy versions no longer accept;
            # both give the same result for the equal-width bins used here.
            previous, edges = np.histogram(values, bins=100, range=value_range, density=True, weights=weights)
            bin_center = (edges[1:] + edges[:-1]) / 2
            axis.plot(bin_center, previous, color='black', lw=1)

            # Tighten the cut step by step and fill the area between consecutive distributions
            for quantil in np.arange(5, 100, 5):
                cut = np.percentile(cut_values, quantil)
                selected = numpy.asarray(cut_values >= cut)
                current, edges = np.histogram(values[selected], bins=100, range=value_range,
                                              density=True, weights=weights[selected])
                bin_center = (edges[1:] + edges[:-1]) / 2
                axis.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
                previous = current

            axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            axis.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        return self
1253 
1254 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input of the TSNE embedding
        @param masks different classes to show in TSNE, each one a boolean numpy.array
               selecting the events of the class; nothing is drawn if no mask is given
        """
        # Only the import is allowed to fail silently, errors of the embedding itself are not hidden
        try:
            import sklearn.manifold
        except ImportError:
            B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        features = numpy.array([data[column] for column in columns]).T
        # TSNE cannot project new points after the fit, therefore all events are
        # embedded once and the classes are selected from the result afterwards.
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axis.scatter(selected[:, 0], selected[:, 1])
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        return self
1286 
1287 
class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables, displayHeatMap):
        """
        Add a new importance plot.
        @param data pandas.DataFrame containing all data
        @param columns which contain the importances, each one is scaled to the range [0, 100]
        @param variables names of the variables, one for every row of data
        @param displayHeatMap if true a colorbar is drawn next to the heatmap

        NOTE(review): labels and numbers are built from all entries of the matrix but are placed on
        one tick per row, which only lines up if columns has a single entry - confirm the intended usage.
        """
        self.figure.set_tight_layout(True)

        def norm(x):
            """Scales x linearly to [0, 100], a constant x is mapped to zeros"""
            lowest = numpy.min(x)
            width = numpy.max(x) - lowest
            if width <= 0:
                return numpy.zeros(x.shape)
            return (x - lowest) / width * 100

        importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
        n_rows, n_columns = importance_matrix.shape

        new_RdBu = truncate_colormap(plt.get_cmap('RdBu'), 0.5, 0.85)

        # (importance, variable name) pairs in ascending order of the importance
        labels = list(variables)
        ranked = sorted((importance_matrix[y, x], labels[y]) for y in range(n_rows) for x in range(n_columns))
        ranked_labels = [str(label) for _, label in ranked]

        # Every column of the heatmap is sorted in ascending order as well
        importance_heatmap = self.axis.pcolor(np.sort(importance_matrix, axis=0), cmap=new_RdBu, vmin=0, vmax=100)

        # put the major ticks at the middle of each cell
        self.axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
        self.axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

        # Larger fonts if there are only a few entries
        if len(ranked) < 6:
            coeff_size = 50
            self.axis.set_yticklabels(ranked_labels, minor=False, size=58)
        else:
            coeff_size = 33
            self.axis.set_yticklabels(ranked_labels, minor=False)

        # The x-axis is left without tick labels
        self.axis.set_xticklabels([''] * n_columns)

        # The numbers are written into the last column of the heatmap
        text_x = n_columns - 1 + 0.5
        for y, (value, _) in enumerate(ranked):
            self.axis.text(text_x, y + 0.5, r'$%.0f$' % float(value),
                           size=coeff_size,
                           horizontalalignment='center',
                           verticalalignment='center')

        if displayHeatMap:
            cb = self.figure.colorbar(importance_heatmap, ticks=[2, 98], orientation='vertical')
            cb.ax.tick_params(length=0)
            cb.ax.set_yticklabels([r'${\rm low}$', r'${\rm high}$'], size=60)

        self.axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        return self
1371 
1372 
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """
    Creates a new colormap from a section of an existing one
    @param cmap colormap which is sampled
    @param minval lower end of the sampled section
    @param maxval upper end of the sampled section
    @param n number of equidistant samples taken between minval and maxval
    @return matplotlib.colors.LinearSegmentedColormap built from the sampled colors
    """
    sampled_colors = cmap(np.linspace(minval, maxval, n))
    truncated_name = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
    return matplotlib.colors.LinearSegmentedColormap.from_list(truncated_name, sampled_colors)
1378 
1379 
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    # figure which is used to draw
    figure = None
    # axis which shows the correlation matrix of the signal events
    signal_axis = None
    # axis which shows the correlation matrix of the background events
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(38, 24))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Both matrices side by side, the common colorbar in the last row below them
        gs = matplotlib.gridspec.GridSpec(16, 2)
        self.signal_axis = self.figure.add_subplot(gs[:14, 0])
        self.bckgrd_axis = self.figure.add_subplot(gs[:14, 1], sharey=self.signal_axis)

        self.colorbar_axis = self.figure.add_subplot(gs[15, :])

        self.axis = self.signal_axis

        super().__init__(self.figure, self.axis)

    @staticmethod
    def _zero_out_negative_zeros(matrix):
        """
        Sets all entries which would be printed as '-0' to 0 (in place)
        @param matrix numpy.array containing the coefficients in percent
        """
        for index, value in numpy.ndenumerate(matrix):
            if '%.0f' % value == '-0':
                matrix[index] = 0

    @staticmethod
    def _write_coefficients(axis, matrix, coeff_size):
        """
        Writes every coefficient of the matrix into the center of its cell
        @param axis axis which shows the heatmap of the matrix
        @param matrix numpy.array containing the coefficients in percent
        @param coeff_size font size which is used for the numbers
        """
        # Matrices with more than 24 variables get a smaller font and a text-mode minus sign
        compact = matrix.shape[0] > 24
        for (y, x), value in numpy.ndenumerate(matrix):
            if compact and value < 0:
                text, size = '-' + r'$%.0f$' % abs(value), 25
            else:
                text, size = r'$%.0f$' % value, coeff_size
            axis.text(x + 0.5, y + 0.5, text,
                      size=size,
                      horizontalalignment='center',
                      verticalalignment='center')

    def add(self, data, columns, signal_mask, bckgrd_mask, bkgrOutput):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param bkgrOutput if -1 the panels are captioned as B0 and anti-B0, otherwise as Signal and Background
        """
        signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
        bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

        # Mirror the matrices left-right; copies are needed because they are modified below
        mirrored_signal_corr = numpy.fliplr(signal_corr).copy()
        mirrored_bckgrd_corr = numpy.fliplr(bckgrd_corr).copy()

        new_RdBu = truncate_colormap(plt.get_cmap('RdBu'), 0.15, 0.85)
        signal_heatmap = self.signal_axis.pcolor(mirrored_signal_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)
        # The background panel gets its heatmap as well; its return value is not needed
        # because the common colorbar is created from the signal heatmap.
        self.bckgrd_axis.pcolor(mirrored_bckgrd_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)

        self._zero_out_negative_zeros(mirrored_signal_corr)
        self._zero_out_negative_zeros(mirrored_bckgrd_corr)

        self.signal_axis.invert_yaxis()
        self.signal_axis.xaxis.tick_top()
        self.bckgrd_axis.invert_yaxis()
        self.bckgrd_axis.xaxis.tick_top()

        # put the major ticks at the middle of each cell
        self.signal_axis.set_xticks(numpy.arange(mirrored_signal_corr.shape[0]) + 0.5, minor=False)
        self.signal_axis.set_yticks(numpy.arange(mirrored_signal_corr.shape[1]) + 0.5, minor=False)
        self.bckgrd_axis.set_xticks(numpy.arange(mirrored_bckgrd_corr.shape[0]) + 0.5, minor=False)
        self.bckgrd_axis.set_yticks(numpy.arange(mirrored_bckgrd_corr.shape[1]) + 0.5, minor=False)

        # Larger fonts if there are only a few variables
        if mirrored_signal_corr.shape[0] < 8:
            coeff_size = 50
            label_options = {'size': 58}
        else:
            coeff_size = 30
            label_options = {}

        # The columns run from right to left because the matrices are mirrored
        reversed_columns = list(reversed(columns))
        for axis in (self.bckgrd_axis, self.signal_axis):
            axis.set_xticklabels(reversed_columns, minor=False, rotation=90, **label_options)
            axis.set_yticklabels(columns, minor=False, **label_options)

        # Each matrix is annotated on its own axis
        self._write_coefficients(self.signal_axis, mirrored_signal_corr, coeff_size)
        self._write_coefficients(self.bckgrd_axis, mirrored_bckgrd_corr, coeff_size)

        cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-92.3, 0, 92.5], orientation='horizontal')
        cb.ax.tick_params(length=0)
        cb.ax.set_xticklabels([r'${\rm negative}$', r'${\rm uncorrelated}$', r'${\rm positive}$'], fontsize=60)

        if bkgrOutput == -1:
            self.figure.text(0.30, 0.11, r'$B^0\,(q_{\rm MC} = +1)$', horizontalalignment='center', size=65)
            self.figure.text(0.74, 0.11, r'$\bar{B}^0\,(q_{\rm MC} = -1)$', horizontalalignment='center', size=65)

        else:
            self.figure.text(0.27, 0.115, r'${\rm Signal}$', horizontalalignment='center', size=65)
            self.figure.text(0.73, 0.115, r'${\rm Background}$', horizontalalignment='center', size=65)

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        # The y-axis is shared, therefore the labels are only shown on the signal panel
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1528 
1529 
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last one holds the value 0 or 1 identifying the two halves
        """
        # Integer division, the result is used as an array size
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        # Shuffle the events
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # The first half of the (shuffled) events is used for training, the second half for testing
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    # Both masks are given, the upper panel of the plot shows their union
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # The masks are computed here because reusing the ones from above would rely on index alignment
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0, 0)
    p.finish()
    p.save('correlation_matrix.png')
1616 
1617 # @endcond
def calculate_flatness(f, p, w=None)
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1