Belle II Software  light-2212-foldex
plotting.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 import copy
13 import math
14 
15 import pandas
16 import numpy
17 import matplotlib.pyplot as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
24 
25 from basf2_mva_evaluation import histogram
26 
27 import basf2 as b2
28 
29 import basf2_mva_util
30 import matplotlib
31 
# Do not use the standard backend TkAgg, because it is NOT thread-safe.
# Otherwise you will get a "RuntimeError: main thread is not in main loop"!
matplotlib.use("svg")
matplotlib.rcParams.update({'font.size': 36})

# Use the Belle II style while producing the plots
plt.style.use("belle2")
39 
40 
class Plotter(object):
    """
    Base class for all Plotters.

    Holds a matplotlib figure/axis pair, the list of drawn plots with their
    legend labels, the axis limits accumulated while plots are added, and the
    keyword-argument sets used by _plot_datapoints.
    """

    # stupid workaround for doxygen refusing to document things

    # Plots added to the axis so far
    plots = None
    # Labels of the plots added so far
    labels = None
    # Minimum x value
    xmin = None
    # Maximum x value
    xmax = None
    # Minimum y value
    ymin = None
    # Maximum y value
    ymax = None
    # Relative padding applied to the y limits by scale_limits
    yscale = 0.0
    # Relative padding applied to the x limits by scale_limits
    xscale = 0.0
    # Figure which is used to draw
    figure = None
    # Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        # The four option sets are filled with their defaults by the setters below.
        self.plot_kwargs = None
        self.errorbar_kwargs = None
        self.errorband_kwargs = None
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        return self.figure.add_subplot(gridspecs[-1], sharex=self.axis)

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults of the setters below are never mutated (they are
    # copied before being stored). They cannot be replaced by None, because
    # None is a meaningful argument: it switches the corresponding layer off.

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        @return self
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the area between the datapoints and zero
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    @staticmethod
    def _next_cycle_color(axis):
        """
        Returns the next color of the property cycle of the given axis
        @param axis matplotlib axis whose color cycle is advanced
        """
        lines = axis._get_lines
        # Newer matplotlib versions no longer expose prop_cycler;
        # get_next_color is used whenever it is available.
        if hasattr(lines, 'get_next_color'):
            return lines.get_next_color()
        return next(lines.prop_cycler)['color']

    @staticmethod
    def _broadcast_error(err, size):
        """
        Returns the symmetric error as a float array of the given length, or None if no error was given
        @param err None, a scalar or a sequence with size entries
        @param size number of data points
        """
        if err is None:
            return None
        return numpy.broadcast_to(numpy.asarray(err, dtype=float), (size,))

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is used to draw
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (legend handles, plot artist, errorbar artist, fill_between artist),
                artists which were not drawn are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            color = self._next_cycle_color(axis)
            # plot_kwargs may be None (plot disabled); only store the color if there is a dict.
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)

        # The patch is the first legend handle; legend() is given an object with get_color.
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            errorbar_kwargs.setdefault('color', color)
            errorbar_kwargs.setdefault('ecolor', [0.5 * channel for channel in color])

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            xerr_array = self._broadcast_error(xerr, len(x))
            yerr_array = self._broadcast_error(yerr, len(x))
            finite = [numpy.isfinite(v) for v in (x, y, xerr_array, yerr_array) if v is not None]
            mask = numpy.logical_and.reduce(finite)

            e = axis.errorbar(x[mask], y[mask],
                              xerr=None if xerr_array is None else xerr_array[mask],
                              yerr=None if yerr_array is None else yerr_array[mask],
                              rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            errorband_kwargs.setdefault('color', color)
            if xerr is not None:
                # One rectangle per point, spanning +- the errors in both directions
                xerr_array = self._broadcast_error(xerr, len(x))
                yerr_array = self._broadcast_error(yerr, len(x))
                for _x, _y, _xe, _ye in zip(x, y, xerr_array, yerr_array):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter, subclasses are expected to override this.
        The base implementation returns NotImplemented.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff, subclasses are expected to override this.
        The base implementation returns NotImplemented.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries.
        Each limit is multiplied by (1 -+ scale), where the sign of the scale follows
        the sign of the limit, so lower limits decrease and upper limits increase.
        A limit which is exactly zero is left unchanged.
        @return self
        """
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
255 
256 
258  """
259  Plots the purity and the efficiency over the cut value (for cut choosing)
260  """
261 
265 
266  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
267  """
268  Add a new curve to the plot
269  @param data pandas.DataFrame containing all data
270  @param column which is used to calculate efficiency and purity for different cuts
271  @param signal_mask boolean numpy.array defining which events are signal events
272  @param bckgrd_mask boolean numpy.array defining which events are background events
273  @param weight_column column in data containing the weights for each event
274  """
275 
276  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
277 
278  if normed:
279  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
280  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
281  else:
282  efficiency, efficiency_error = hists.get_true_positives(['Signal'])
283  purity, purity_error = hists.get_false_positives(['Background'])
284 
285  cuts = hists.bin_centers
286 
287  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
288  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.yminymin]), \
289  numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymaxymaxymax])
290 
291  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
292 
293  if normed:
294  self.labelslabels.append("Efficiency")
295  else:
296  self.labelslabels.append("True positive")
297 
298  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, purity, xerr=0, yerr=purity_error))
299 
300  if normed:
301  self.labelslabels.append("Purity")
302  else:
303  self.labelslabels.append("False positive")
304 
305  return self
306 
307  def finish(self):
308  """
309  Sets limits, title, axis-labels and legend of the plot
310  """
311  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
312  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
313  self.axisaxis.set_title("Classification Plot")
314  self.axisaxis.get_xaxis().set_label_text('Cut Value')
315  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
316  return self
317 
318 
320  """
321  Plots the signal to noise ratio over the cut value (for cut choosing)
322  """
323 
327 
328  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
329  """
330  Add a new curve to the plot
331  @param data pandas.DataFrame containing all data
332  @param column which is used to calculate signal to noise ratio for different cuts
333  @param signal_mask boolean numpy.array defining which events are signal events
334  @param bckgrd_mask boolean numpy.array defining which events are background events
335  @param weight_column column in data containing the weights for each event
336  """
337 
338  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
339 
340  signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])
341 
342  cuts = hists.bin_centers
343 
344  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
345  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.yminymin]), \
346  numpy.nanmax([numpy.nanmax(signal2noise), self.ymaxymaxymax])
347 
348  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
349 
350  self.labelslabels.append(column)
351 
352  return self
353 
354  def finish(self):
355  """
356  Sets limits, title, axis-labels and legend of the plot
357  """
358  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
359  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
360  self.axisaxis.set_title("Signal to Noise Plot")
361  self.axisaxis.get_xaxis().set_label_text('Cut Value')
362  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
363  return self
364 
365 
367  """
368  Plots the purity over the efficiency also known as ROC curve
369  """
370 
374 
375  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
376  """
377  Add a new curve to the ROC plot
378  @param data pandas.DataFrame containing all data
379  @param column which is used to calculate efficiency and purity for different cuts
380  @param signal_mask boolean numpy.array defining which events are signal events
381  @param bckgrd_mask boolean numpy.array defining which events are background events
382  @param weight_column column in data containing the weights for each event
383  """
384  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
385  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
386  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
387 
388  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
389  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.yminymin]), numpy.nanmax([numpy.nanmax(purity), self.ymaxymaxymax])
390 
391  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
392  self.plotsplots.append(p)
393  if label is not None:
394  self.labelslabels.append(label)
395  else:
396  self.labelslabels.append(column)
397  return self
398 
399  def finish(self):
400  """
401  Sets limits, title, axis-labels and legend of the plot
402  """
403  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
404  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
405  self.axisaxis.set_title("ROC Purity Plot")
406  self.axisaxis.get_xaxis().set_label_text('Efficiency')
407  self.axisaxis.get_yaxis().set_label_text('Purity')
408  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
409  return self
410 
411 
413  """
414  Plots the rejection over the efficiency also known as ROC curve
415  """
416 
420 
421  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
422  """
423  Add a new curve to the ROC plot
424  @param data pandas.DataFrame containing all data
425  @param column which is used to calculate efficiency and purity for different cuts
426  @param signal_mask boolean numpy.array defining which events are signal events
427  @param bckgrd_mask boolean numpy.array defining which events are background events
428  @param weight_column column in data containing the weights for each event
429  """
430  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
431  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
432  rejection, rejection_error = hists.get_efficiency(['Background'])
433  rejection = 1 - rejection
434  if isinstance(efficiency, int) and not isinstance(rejection, int):
435  efficiency = numpy.array([efficiency] * len(rejection))
436  elif isinstance(rejection, int) and not isinstance(efficiency, int):
437  rejection = numpy.array([rejection] * len(efficiency))
438  elif isinstance(rejection, int) and isinstance(efficiency, int):
439  efficiency = numpy.array([efficiency])
440  rejection = numpy.array([rejection])
441 
442  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
443  self.yminymin, self.ymaxymaxymax = numpy.nanmin([rejection.min(), self.yminymin]), numpy.nanmax([rejection.max(), self.ymaxymaxymax])
444 
445  auc = numpy.abs(numpy.trapz(rejection, efficiency))
446 
447  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
448  self.plotsplots.append(p)
449  if label is not None:
450  self.labelslabels.append(label[:10] + " ({:.2f})".format(auc))
451  else:
452  self.labelslabels.append(column[:10] + " ({:.2f})".format(auc))
453  return self
454 
455  def finish(self):
456  """
457  Sets limits, title, axis-labels and legend of the plot
458  """
459  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
460  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
461  self.axisaxis.set_title("ROC Rejection Plot")
462  self.axisaxis.get_xaxis().set_label_text('Signal Efficiency')
463  self.axisaxis.get_yaxis().set_label_text('Background Rejection')
464  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
465  return self
466 
467 
469  """
470  Plots multiple other plots into a grid 3x?
471  """
472 
473  figure = None
474 
475  axis = None
476 
477  def __init__(self, cls, number_of_plots, figure=None):
478  """
479  Creates a new figure if None is given, sets the default plot parameters
480  @param figure default draw figure which is used
481  """
482  if figure is None:
483  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
484  self.figurefigurefigure.set_tight_layout(True)
485  else:
486  self.figurefigurefigure = figure
487 
488  if number_of_plots == 1:
489  gs = matplotlib.gridspec.GridSpec(1, 1)
490  elif number_of_plots == 2:
491  gs = matplotlib.gridspec.GridSpec(1, 2)
492  elif number_of_plots == 3:
493  gs = matplotlib.gridspec.GridSpec(1, 3)
494  else:
495  gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
496 
497 
498  self.sub_plotssub_plots = [cls(self.figurefigurefigure, self.figurefigurefigure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
499  self.axisaxisaxis = self.sub_plotssub_plots[0].axis
500  super(Multiplot, self).__init__(self.figurefigurefigure, self.axisaxisaxis)
501 
502  def add(self, i, *args, **kwargs):
503  """
504  Call add function of ith subplot
505  @param i position of the subplot
506  """
507  self.sub_plotssub_plots[i].add(*args, **kwargs)
508 
509  def finish(self):
510  """
511  Sets limits, title, axis-labels and legend of the plot
512  """
513  for plot in self.sub_plotssub_plots:
514  plot.finish()
515  return self
516 
517 
519  """
520  Plots the purity in each bin over the classifier output.
521  """
522 
526 
527  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
528  """
529  Add a new curve to the Diagonal plot
530  @param data pandas.DataFrame containing all data
531  @param column which is used to calculate purity for different cuts
532  @param signal_mask boolean numpy.array defining which events are signal events
533  @param bckgrd_mask boolean numpy.array defining which events are background events
534  @param weight_column column in data containing the weights for each event
535  """
536  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
537  purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])
538 
539  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
540  # self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
541  self.yminymin, self.ymaxymaxymax = 0, 1
542 
543  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
544  self.plotsplots.append(p)
545  self.labelslabels.append(column)
546  return self
547 
548  def finish(self):
549  """
550  Sets limits, title, axis-labels and legend of the plot
551  """
552  self.scale_limitsscale_limits()
553  self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
554  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
555  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
556  self.axisaxis.set_title("Diagonal Plot")
557  self.axisaxis.get_xaxis().set_label_text('Classifier Output')
558  self.axisaxis.get_yaxis().set_label_text('Purity Per Bin')
559  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
560  return self
561 
562 
564  """
565  Plots distribution of a quantity
566  """
567 
568  def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
569  keep_first_binning=False, range_in_std=None):
570  """
571  Creates a new figure and axis if None is given, sets the default plot parameters
572  @param figure default draw figure which is used
573  @param axis default draw axis which is used
574  @param normed true if histograms should be normed before drawing
575  @param keep_first_binning use the binning of the first distribution for further plots
576  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
577  """
578  super(Distribution, self).__init__(figure, axis)
579 
580  self.normed_to_all_entriesnormed_to_all_entries = normed_to_all_entries
581 
582  self.normed_to_bin_widthnormed_to_bin_width = normed_to_bin_width
583 
584  self.range_in_stdrange_in_std = range_in_std
585  # if self.normed_to_all_entries or self.normed_to_bin_width:
586 
587  self.yminyminymin = float(0)
588 
589  self.ymaxymaxymax = float('-inf')
590 
591  self.xminxminxmin = float('inf')
592 
593  self.xmaxxmaxxmax = float('-inf')
594 
595  self.keep_first_binningkeep_first_binning = keep_first_binning
596 
597  self.first_binningfirst_binning = None
598 
599  self.x_axis_labelx_axis_label = ''
600 
601  def add(self, data, column, mask=None, weight_column=None, label=None):
602  """
603  Add a new distribution to the plots
604  @param data pandas.DataFrame containing all data
605  @param column which is used to calculate distribution histogram
606  @param mask boolean numpy.array defining which events are used for the histogram
607  @param weight_column column in data containing the weights for each event
608  """
609  if mask is None:
610  mask = numpy.ones(len(data)).astype('bool')
611 
612  bins = 100
613  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is not None:
614  bins = self.first_binningfirst_binning
615  hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
616  bins=bins, equal_frequency=False, range_in_std=self.range_in_stdrange_in_std)
617  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is None:
618  self.first_binningfirst_binning = hists.bins
619  hist, hist_error = hists.get_hist('Total')
620 
621  if self.normed_to_all_entriesnormed_to_all_entries:
622  normalization = float(numpy.sum(hist))
623  hist = hist / normalization
624  hist_error = hist_error / normalization
625 
626  if self.normed_to_bin_widthnormed_to_bin_width:
627  hist = hist / hists.bin_widths
628  hist_error = hist_error / hists.bin_widths
629 
630  self.xminxminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
631  self.yminyminymin = numpy.nanmin([hist.min(), self.yminyminymin])
632  self.ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.ymaxymaxymax])
633 
634  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
635  self.plotsplots.append(p)
636  self.x_axis_labelx_axis_label = column
637 
638  appendix = ''
639  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
640  appendix = ' No data to plot!'
641 
642  if label is None:
643  self.labelslabels.append(column + appendix)
644  else:
645  self.labelslabels.append(label + appendix)
646  return self
647 
648  def finish(self):
649  """
650  Sets limits, title, axis-labels and legend of the plot
651  """
652  self.axisaxis.set_title("Distribution Plot")
653  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
654 
655  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
656 
657  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
658  self.axisaxis.set_xlim((0., 1.))
659  self.axisaxis.set_ylim((0., 1.))
660  self.axisaxis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
661  return self
662 
663  self.scale_limitsscale_limits()
664 
665  self.axisaxis.set_xlim((self.xminxminxmin, self.xmaxxmaxxmax))
666  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
667 
668  if self.normed_to_all_entriesnormed_to_all_entries and self.normed_to_bin_widthnormed_to_bin_width:
669  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
670  elif self.normed_to_all_entriesnormed_to_all_entries:
671  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
672  elif self.normed_to_bin_widthnormed_to_bin_width:
673  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
674  else:
675  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin')
676 
677  return self
678 
679 
class Box(Plotter):
    """
    Create a boxplot
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)

        # Label of the x axis, set to the column of the most recently added boxplot
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot, if None all events are used
        @param weight_column column in data containing the weights for each event, currently ignored with a warning
        @return self
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')
        values = data[column][mask]

        if weight_column is not None:
            b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(values) == 0:
            b2.B2WARNING("Ignore empty boxplot.")
            return self

        # Horizontal box with whiskers at 1.5 IQR, outliers drawn as black dots, mean shown
        drawn = self.axis.boxplot(values, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
                                  boxprops=dict(facecolor='blue', alpha=0.5))
        self.plots.append(drawn)
        self.labels.append(column)
        self.x_axis_label = column
        return self

    def finish(self):
        """
        Sets title and axis-labels of the plot and hides the y axis
        @return self
        """
        matplotlib.artist.setp(self.axis.get_yaxis(), visible=False)
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_title("Box Plot")
        return self
745 
746 
748  """
749  Plots the difference between two histograms
750  """
751 
763 
764  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
765  """
766  Creates a new figure and axis if None is given, sets the default plot parameters
767  @param figure default draw figure which is used
768  @param axis default draw axis which is used
769  @param normed normalize minuend and subtrahend before comparing them
770  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
771  """
772  super(Difference, self).__init__(figure, axis)
773  self.normednormed = normed
774  self.shift_to_zeroshift_to_zero = shift_to_zero
775  if self.normednormed:
776  self.yminyminymin = -0.01
777  self.ymaxymaxymax = 0.01
778  else:
779  self.yminyminymin = -1
780  self.ymaxymaxymax = 1
781 
782  def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
783  """
784  Add a new difference plot
785  @param data pandas.DataFrame containing all data
786  @param column which is used to calculate distribution histogram
787  @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
788  @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
789  @param weight_column column in data containing the weights for each event
790  @param label label for the legend if None, the column name is used
791  """
792  hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
793  weight_column=weight_column, equal_frequency=False)
794  minuend, minuend_error = hists.get_hist('Minuend')
795  subtrahend, subtrahend_error = hists.get_hist('Subtrahend')
796 
797  difference_error = histogram.poisson_error(minuend + subtrahend)
798  if self.normednormed:
799  difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
800  minuend = minuend / numpy.sum(minuend)
801  subtrahend = subtrahend / numpy.sum(subtrahend)
802  difference = minuend - subtrahend
803 
804  if self.shift_to_zeroshift_to_zero:
805  difference = difference - numpy.mean(difference)
806 
807  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
808  self.yminyminymin = min((difference - difference_error).min(), self.yminyminymin)
809  self.ymaxymaxymax = max((difference + difference_error).max(), self.ymaxymaxymax)
810 
811  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
812  self.plotsplots.append(p)
813  if label is None:
814  self.labelslabels.append(label)
815  else:
816  self.labelslabels.append(column)
817  self.x_axis_labelx_axis_label = column
818  return self
819 
820  def finish(self, line_color='black'):
821  """
822  Sets limits, title, axis-labels and legend of the plot
823  """
824  self.axisaxis.plot((self.xminxmin, self.xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
825  self.scale_limitsscale_limits()
826  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
827  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
828  self.axisaxis.set_title("Difference Plot")
829  self.axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
830  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
831  self.axisaxis.get_yaxis().set_label_text('Difference')
832  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
833  return self
834 
835 
837  """
838  Create TMVA-like overtraining control plot for a classification training
839  """
840 
841 
842  figure = None
843 
844  axis = None
845 
846  axis_d1 = None
847 
848  axis_d2 = None
849 
850  def __init__(self, figure=None):
851  """
852  Creates a new figure if None is given, sets the default plot parameters
853  @param figure default draw figure which is used
854  """
855  if figure is None:
856  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
857  self.figurefigurefigure.set_tight_layout(True)
858  else:
859  self.figurefigurefigure = figure
860 
861  gs = matplotlib.gridspec.GridSpec(5, 1)
862  self.axisaxisaxis = self.figurefigurefigure.add_subplot(gs[:3, :])
863  self.axis_d1axis_d1 = self.figurefigurefigure.add_subplot(gs[3, :], sharex=self.axisaxisaxis)
864  self.axis_d2axis_d2 = self.figurefigurefigure.add_subplot(gs[4, :], sharex=self.axisaxisaxis)
865 
866  super(Overtraining, self).__init__(self.figurefigurefigure, self.axisaxisaxis)
867 
868  def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
869  """
870  Add a new overtraining plot, I recommend to raw only one overtraining plot at the time,
871  otherwise there are too many curves in the plot to recognize anything in the plot.
872  @param data pandas.DataFrame containing all data
873  @param column which is used to calculate distribution histogram
874  @param train_mask boolean numpy.array defining which events are training events
875  @param test_mask boolean numpy.array defining which events are test events
876  @param signal_mask boolean numpy.array defining which events are signal events
877  @param bckgrd_mask boolean numpy.array defining which events are background events
878  @param weight_column column in data containing the weights for each event
879  """
880  distribution = Distribution(self.figurefigurefigure, self.axisaxisaxis, normed_to_all_entries=True)
881 
882  distribution.set_plot_options(self.plot_kwargsplot_kwargs)
883  distribution.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
884  distribution.set_errorband_options(self.errorband_kwargserrorband_kwargs)
885  distribution.add(data, column, test_mask & signal_mask, weight_column)
886  distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
887 
888  distribution.set_plot_options(
889  {'color': distribution.plots[0][0][0].get_color(), 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
890  distribution.set_fill_options({'color': distribution.plots[0][0][0].get_color(), 'alpha': 0.5, 'step': 'mid'})
891  distribution.set_errorbar_options(None)
892  distribution.set_errorband_options(None)
893  distribution.add(data, column, train_mask & signal_mask, weight_column)
894  distribution.set_plot_options(
895  {'color': distribution.plots[1][0][0].get_color(), 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
896  distribution.set_fill_options({'color': distribution.plots[1][0][0].get_color(), 'alpha': 0.5, 'step': 'mid'})
897  distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
898 
899  distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
900  distribution.finish()
901 
902  self.plot_kwargsplot_kwargs['color'] = distribution.plots[0][0][0].get_color()
903  difference_signal = Difference(self.figurefigurefigure, self.axis_d1axis_d1, shift_to_zero=True, normed=True)
904  difference_signal.set_plot_options(self.plot_kwargsplot_kwargs)
905  difference_signal.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
906  difference_signal.set_errorband_options(self.errorband_kwargserrorband_kwargs)
907  difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
908  self.axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
909  self.axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
910  difference_signal.plots = difference_signal.labels = []
911  difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
912 
913  self.plot_kwargsplot_kwargs['color'] = distribution.plots[1][0][0].get_color()
914  difference_bckgrd = Difference(self.figurefigurefigure, self.axis_d2axis_d2, shift_to_zero=True, normed=True)
915  difference_bckgrd.set_plot_options(self.plot_kwargsplot_kwargs)
916  difference_bckgrd.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
917  difference_bckgrd.set_errorband_options(self.errorband_kwargserrorband_kwargs)
918  difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
919  self.axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
920  self.axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
921  difference_bckgrd.plots = difference_bckgrd.labels = []
922  difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
923 
924  try:
925  import scipy.stats
926  # Kolmogorov smirnov test
927  if len(data[column][train_mask & signal_mask]) == 0 or len(data[column][test_mask & signal_mask]) == 0:
928  b2.B2WARNING("Cannot calculate kolmogorov smirnov test for signal due to missing data")
929  else:
930  ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
931  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
932  self.axis_d1axis_d1.text(0.1, 0.9, r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
933  verticalalignment='top', horizontalalignment='left', transform=self.axis_d1axis_d1.transAxes)
934  if len(data[column][train_mask & bckgrd_mask]) == 0 or len(data[column][test_mask & bckgrd_mask]) == 0:
935  b2.B2WARNING("Cannot calculate kolmogorov smirnov test for background due to missing data")
936  else:
937  ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
938  props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
939  self.axis_d2axis_d2.text(0.1, 0.9, r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
940  bbox=props,
941  verticalalignment='top', horizontalalignment='left', transform=self.axis_d2axis_d2.transAxes)
942  except ImportError:
943  b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
944 
945  return self
946 
947  def finish(self):
948  """
949  Sets limits, title, axis-labels and legend of the plot
950  """
951  self.axisaxisaxis.set_title("Overtraining Plot")
952  self.axis_d1axis_d1.set_title("")
953  self.axis_d2axis_d2.set_title("")
954  matplotlib.artist.setp(self.axisaxisaxis.get_xticklabels(), visible=False)
955  matplotlib.artist.setp(self.axis_d1axis_d1.get_xticklabels(), visible=False)
956  self.axisaxisaxis.get_xaxis().set_label_text('')
957  self.axis_d1axis_d1.get_xaxis().set_label_text('')
958  self.axis_d2axis_d2.get_xaxis().set_label_text('Classifier Output')
959  return self
960 
961 
963  """
964  Plots distribution of a quantity including boxplots
965  """
966 
967 
968  box_axes = None
969 
970  def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
971  """
972  Creates a new figure and axis if None is given, sets the default plot parameters
973  @param figure default draw figure which is used
974  @param axis default draw axis which is used
975  @param normed true if the histograms should be normed before drawing
976  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
977  """
978  super(VerboseDistribution, self).__init__(figure, axis)
979 
980  self.normednormed = normed
981 
982  self.range_in_stdrange_in_std = range_in_std
983  self.box_axesbox_axes = []
984 
985  self.distributiondistribution = Distribution(self.figurefigure, self.axisaxis, normed_to_all_entries=self.normednormed, range_in_std=self.range_in_stdrange_in_std)
986 
987  def add(self, data, column, mask=None, weight_column=None, label=None):
988  """
989  Add a new distribution plot, with additional information like a boxplot compared to
990  the ordinary Distribution plot.
991  @param data pandas.DataFrame containing all data
992  @param column which is used to calculate distribution histogram
993  @param mask boolean numpy.array defining which events are used for the distribution histogram
994  @param weight_column column in data containing the weights for each event
995  """
996  self.distributiondistribution.set_plot_options(self.plot_kwargsplot_kwargs)
997  self.distributiondistribution.set_errorbar_options(self.errorbar_kwargserrorbar_kwargs)
998  self.distributiondistribution.set_errorband_options(self.errorband_kwargserrorband_kwargs)
999  self.distributiondistribution.add(data, column, mask, weight_column, label=label)
1000 
1001  n = len(self.box_axesbox_axes) + 1
1002  gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1003  gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
1004  box_axis = self.add_subplotadd_subplot(gridspecs)
1005 
1006  if self.range_in_stdrange_in_std is not None:
1007  mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
1008  # Everything outside mean +- range_in_std * std is considered not inside the mask
1009  mask = mask & (data[column] > (mean - self.range_in_stdrange_in_std * std)) & (data[column] < (mean + self.range_in_stdrange_in_std * std))
1010  box = Box(self.figurefigure, box_axis)
1011  box.add(data, column, mask, weight_column)
1012  if len(box.plots) > 0:
1013  box.plots[0]['boxes'][0].set_facecolor(self.distributiondistribution.plots[-1][0][0].get_color())
1014  box.finish()
1015 
1016  self.box_axesbox_axes.append(box_axis)
1017  return self
1018 
1019  def finish(self):
1020  """
1021  Sets limits, title, axis-labels and legend of the plot
1022  """
1023  self.distributiondistribution.finish()
1024  matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
1025  self.axisaxis.get_xaxis().set_label_text('')
1026  for box_axis in self.box_axesbox_axes[:-1]:
1027  matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
1028  box_axis.set_title("")
1029  box_axis.get_xaxis().set_label_text('')
1030  self.box_axesbox_axes[-1].set_title("")
1031  self.axisaxis.set_title("Distribution Plot")
1032  self.axisaxis.legend([x[0] for x in self.distributiondistribution.plots], self.distributiondistribution.labels,
1033  loc='best', fancybox=True, framealpha=0.5)
1034  return self
1035 
1036 
1038  """
1039  Plots change of a distribution of a quantity depending on the cut on a classifier
1040  """
1041 
1042  figure = None
1043 
1044  axis = None
1045 
1046  axis_d1 = None
1047 
1048  axis_d2 = None
1049 
1050  def __init__(self, figure=None):
1051  """
1052  Creates a new figure if None is given, sets the default plot parameters
1053  @param figure default draw figure which is used
1054  """
1055  if figure is None:
1056  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1057  self.figurefigurefigure.set_tight_layout(True)
1058  else:
1059  self.figurefigurefigure = figure
1060 
1061  gs = matplotlib.gridspec.GridSpec(3, 2)
1062  self.axisaxisaxis = self.figurefigurefigure.add_subplot(gs[0, :])
1063  self.axis_d1axis_d1 = self.figurefigurefigure.add_subplot(gs[1, :], sharex=self.axisaxisaxis)
1064  self.axis_d2axis_d2 = self.figurefigurefigure.add_subplot(gs[2, :], sharex=self.axisaxisaxis)
1065 
1066  super(Correlation, self).__init__(self.figurefigurefigure, self.axisaxisaxis)
1067 
1068  def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1069  """
1070  Add a new correlation plot.
1071  @param data pandas.DataFrame containing all data
1072  @param column which is used to calculate distribution histogram
1073  @param cut_column which is used to calculate cut on the other quantity defined by column
1074  @param quantiles list of quantiles between 0 and 100, defining the different cuts
1075  @param weight_column column in data containing the weights for each event
1076  """
1077  if len(data[cut_column]) == 0:
1078  b2.B2WARNING("Ignore empty Correlation.")
1079  return self
1080 
1081  axes = [self.axisaxisaxis, self.axis_d1axis_d1, self.axis_d2axis_d2]
1082 
1083  for i, (l, m) in enumerate([('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask)]):
1084 
1085  if weight_column is not None:
1086  weights = numpy.array(data[weight_column][m])
1087  else:
1088  weights = numpy.ones(len(data[column][m]))
1089 
1090  xrange = numpy.percentile(data[column][m], [5, 95])
1091 
1092  colormap = plt.get_cmap('coolwarm')
1093  tmp, x = numpy.histogram(data[column][m], bins=100,
1094  range=xrange, normed=True, weights=weights)
1095  bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1096  axes[i].plot(bin_center, tmp, color='black', lw=1)
1097 
1098  for quantil in numpy.arange(5, 100, 5):
1099  cut = numpy.percentile(data[cut_column][m], quantil)
1100  sel = data[cut_column][m] >= cut
1101  y, x = numpy.histogram(data[column][m][sel], bins=100,
1102  range=xrange, normed=True, weights=weights[sel])
1103  bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1104  axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1105  tmp = y
1106 
1107  axes[i].set_ylim(bottom=0)
1108 
1109  flatness_score = basf2_mva_util.calculate_flatness(data[column][m], data[cut_column][m], weights)
1110  axes[i].set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1111  return self
1112 
    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot.
        This implementation performs no work and only returns the plotter itself.
        """
        return self
1118 
1119 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the embedding
        @param masks different classes to show in TSNE, one boolean mask over the rows of data per class
        """
        try:
            import sklearn.manifold
        except ImportError:
            b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        # Keep the DataFrame untouched, the feature matrix has one row per event
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE cannot transform new points after fitting, so all events are
        # embedded once and the rows of each class are selected afterwards
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = numpy.asarray(mask, dtype=bool)
            self.axis.scatter(embedding[selected, 0], embedding[selected, 1], rasterized=True)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot.
        This implementation performs no work and only returns the plotter itself.
        """
        return self
1151 
1152 
1154  """
1155  Plots importance matrix
1156  """
1157 
1158  def add(self, data, columns, variables):
1159  """
1160  Add a new correlation plot.
1161  @param data pandas.DataFrame containing all data
1162  @param columns which are used to calculate the correlations
1163  """
1164  self.figurefigure.set_tight_layout(True)
1165 
1166  def norm(x):
1167  width = (numpy.max(x) - numpy.min(x))
1168  if width <= 0:
1169  return numpy.zeros(x.shape)
1170  return (x - numpy.min(x)) / width * 100
1171 
1172  importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
1173  importance_heatmap = self.axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1174  rasterized=True)
1175 
1176  # put the major ticks at the middle of each cell
1177  self.axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=False)
1178  self.axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=False)
1179 
1180  self.axisaxis.set_xticklabels(columns, minor=False, rotation=90)
1181  self.axisaxis.set_yticklabels(variables, minor=False)
1182 
1183  self.axisaxis.xaxis.tick_top()
1184 
1185  for y in range(importance_matrix.shape[0]):
1186  for x in range(importance_matrix.shape[1]):
1187  self.axisaxis.text(x + 0.5, y + 0.5, '%.0f' % importance_matrix[y, x],
1188  size=14,
1189  horizontalalignment='center',
1190  verticalalignment='center')
1191 
1192  cb = self.figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
1193  cb.ax.set_yticklabels(['low', 'high'])
1194 
1195  self.axisaxis.set_aspect('equal')
1196 
1197  return self
1198 
    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot.
        This implementation performs no work and only returns the plotter itself.
        """
        return self
1204 
1205 
1207  """
1208  Plots correlation matrix
1209  """
1210 
1211  figure = None
1212 
1213  signal_axis = None
1214 
1215  bckgrd_axis = None
1216 
1217  def __init__(self, figure=None):
1218  """
1219  Creates a new figure if None is given, sets the default plot parameters
1220  @param figure default draw figure which is used
1221  """
1222  if figure is None:
1223  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1224  self.figurefigurefigure.set_tight_layout(True)
1225  else:
1226  self.figurefigurefigure = figure
1227 
1228  gs = matplotlib.gridspec.GridSpec(8, 2)
1229  self.signal_axissignal_axis = self.figurefigurefigure.add_subplot(gs[:6, 0])
1230  self.bckgrd_axisbckgrd_axis = self.figurefigurefigure.add_subplot(gs[:6, 1], sharey=self.signal_axissignal_axis)
1231 
1232  self.colorbar_axiscolorbar_axis = self.figurefigurefigure.add_subplot(gs[7, :])
1233 
1234  self.axisaxisaxis = self.signal_axissignal_axis
1235 
1236  super(CorrelationMatrix, self).__init__(self.figurefigurefigure, self.axisaxisaxis)
1237 
1238  def add(self, data, columns, signal_mask, bckgrd_mask):
1239  """
1240  Add a new correlation plot.
1241  @param data pandas.DataFrame containing all data
1242  @param columns which are used to calculate the correlations
1243  """
1244  signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
1245  bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100
1246 
1247  signal_heatmap = self.signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1248  self.bckgrd_axisbckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1249 
1250  self.signal_axissignal_axis.invert_yaxis()
1251  self.signal_axissignal_axis.xaxis.tick_top()
1252  self.bckgrd_axisbckgrd_axis.invert_yaxis()
1253  self.bckgrd_axisbckgrd_axis.xaxis.tick_top()
1254 
1255  # put the major ticks at the middle of each cell
1256  self.signal_axissignal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=False)
1257  self.signal_axissignal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=False)
1258 
1259  self.signal_axissignal_axis.set_xticklabels(columns, minor=False, rotation=90)
1260  self.signal_axissignal_axis.set_yticklabels(columns, minor=False)
1261 
1262  # put the major ticks at the middle of each cell
1263  self.bckgrd_axisbckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=False)
1264  self.bckgrd_axisbckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=False)
1265 
1266  self.bckgrd_axisbckgrd_axis.set_xticklabels(columns, minor=False, rotation=90)
1267  self.bckgrd_axisbckgrd_axis.set_yticklabels(columns, minor=False)
1268 
1269  for y in range(signal_corr.shape[0]):
1270  for x in range(signal_corr.shape[1]):
1271  self.signal_axissignal_axis.text(x + 0.5, y + 0.5, '%.0f' % signal_corr[y, x],
1272  size=14,
1273  horizontalalignment='center',
1274  verticalalignment='center')
1275 
1276  for y in range(bckgrd_corr.shape[0]):
1277  for x in range(bckgrd_corr.shape[1]):
1278  self.bckgrd_axisbckgrd_axis.text(x + 0.5, y + 0.5, '%.0f' % bckgrd_corr[y, x],
1279  size=14,
1280  horizontalalignment='center',
1281  verticalalignment='center')
1282 
1283  cb = self.figurefigurefigure.colorbar(signal_heatmap, cax=self.colorbar_axiscolorbar_axis, ticks=[-100, 0, 100], orientation='horizontal')
1284  cb.solids.set_rasterized(True)
1285  cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])
1286 
1287  self.signal_axissignal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
1288  self.bckgrd_axisbckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')
1289 
1290  return self
1291 
1292  def finish(self):
1293  """
1294  Sets limits, title, axis-labels and legend of the plot
1295  """
1296  matplotlib.artist.setp(self.bckgrd_axisbckgrd_axis.get_yticklabels(), visible=False)
1297  return self
1298 
1299 
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of rows, half of them are drawn for each of the two classes
        @param columns names of the columns, the last one receives the class label (0 or 1)
        """
        # Integer division: array sizes have to be integers
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        # Shuffle the rows so that both classes are mixed
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # The first half of the (shuffled) rows is used as training sample, the second half as test sample
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    # NOTE(review): the construction of this plotter was missing from the listing; the class name is
    # assumed from the output file name - confirm against the plotter classes defined above
    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test')
    p.add(data, 'NeuroBayes', data['type'] == 'Train', data['type'] == 'Test')
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test', is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation.add combines both masks, so signal and background mask have to be passed
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # CorrelationMatrix.add requires the signal and the background mask
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:696
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:698
def __init__(self, figure=None, axis=None)
Definition: plotting.py:687
def finish(self)
Definition: plotting.py:737
signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1213
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1238
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1232
figure
figure which is used to draw
Definition: plotting.py:1211
def __init__(self, figure=None)
Definition: plotting.py:1217
bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1215
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1234
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1068
axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1046
figure
figure which is used to draw
Definition: plotting.py:1042
def __init__(self, figure=None)
Definition: plotting.py:1050
axis_d2
Axis which shows shape of background.
Definition: plotting.py:1048
axis
Main axis which is used to draw.
Definition: plotting.py:1044
ymax
Maximum y value.
Definition: plotting.py:541
xmax
Maximum x value.
Definition: plotting.py:539
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:527
def finish(self)
Definition: plotting.py:548
x_axis_label
Label on x axis.
Definition: plotting.py:817
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:774
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:764
ymax
Maximum y value.
Definition: plotting.py:777
xmax
Maximum x value.
Definition: plotting.py:807
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:782
ymin
min y value
Definition: plotting.py:776
def finish(self, line_color='black')
Definition: plotting.py:820
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:773
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:569
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:601
x_axis_label
x axis label
Definition: plotting.py:599
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:595
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:580
first_binning
first binning
Definition: plotting.py:597
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:584
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:582
def add(self, data, columns, variables)
Definition: plotting.py:1158
def finish(self)
Definition: plotting.py:1199
def add(self, i, *args, **kwargs)
Definition: plotting.py:502
figure
figure which is used to draw
Definition: plotting.py:473
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:477
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:498
axis
Main axis.
Definition: plotting.py:475
def finish(self)
Definition: plotting.py:509
axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:846
figure
figure which is used to draw
Definition: plotting.py:842
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:868
def __init__(self, figure=None)
Definition: plotting.py:850
axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:848
axis
Main axis which is used to draw.
Definition: plotting.py:844
def finish(self, *args, **kwargs)
Definition: plotting.py:240
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:109
xmin
Minimum x value.
Definition: plotting.py:61
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:155
plots
Plots added to the axis so far.
Definition: plotting.py:57
float xscale
limit scale
Definition: plotting.py:69
figure
figure which is used to draw
Definition: plotting.py:71
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:107
def scale_limits(self)
Definition: plotting.py:246
def add(self, *args, **kwargs)
Definition: plotting.py:234
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:163
def save(self, filename)
Definition: plotting.py:128
def __init__(self, figure=None, axis=None)
Definition: plotting.py:75
ymax
Maximum y value.
Definition: plotting.py:67
xmax
Maximum x value.
Definition: plotting.py:63
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:105
labels
Labels of the plots added so far.
Definition: plotting.py:59
axis
Main axis which is used to draw.
Definition: plotting.py:73
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:171
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:147
float yscale
limit scale
Definition: plotting.py:68
ymin
Minimum y value.
Definition: plotting.py:65
def add_subplot(self, gridspecs)
Definition: plotting.py:116
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:139
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:103
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:266
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:375
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:421
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:328
def add(self, data, columns, *masks)
Definition: plotting.py:1125
def finish(self)
Definition: plotting.py:1146
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:987
distribution
The distribution plot.
Definition: plotting.py:985
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:982
box_axes
Axes for the boxplots.
Definition: plotting.py:968
normed
Normalize histograms before drawing them.
Definition: plotting.py:980
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:970
def weighted_mean_and_std(x, w)
Definition: histogram.py:32
def poisson_error(n_tot)
Definition: histogram.py:25
Definition: plot.py:1