Belle II Software  release-06-01-15
plotting.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 import copy
13 import math
14 
15 import pandas
16 import numpy
17 import matplotlib.pyplot as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
24 
25 from basf2_mva_evaluation import histogram
26 
27 import basf2 as b2
28 
29 import basf2_mva_util
30 import matplotlib
31 
32 # Do not use the standard TkAgg backend, because it is NOT thread-safe.
33 # Otherwise you will get "RuntimeError: main thread is not in main loop".
34 matplotlib.use("svg")
35 matplotlib.rcParams.update({'font.size': 36})
36 
37 # Use the Belle II style while producing the plots
38 plt.style.use("belle2")
39 
40 
class Plotter(object):
    """
    Base class for all Plotters.

    Holds the matplotlib figure/axis, the list of drawn plots with their
    legend labels, the current axis limits and the keyword arguments used
    to draw data points, error bars, error bands and filled areas.
    """

    # Class-level defaults, every instance overrides them in __init__.

    # Plots added to the axis so far
    plots = None
    # Labels of the plots added so far
    labels = None
    # Minimum x value
    xmin = None
    # Maximum x value
    xmax = None
    # Minimum y value
    ymin = None
    # Maximum y value
    ymax = None
    # Relative margin added to the y limits by scale_limits
    yscale = 0.0
    # Relative margin added to the x limits by scale_limits
    xscale = 0.0
    # Figure which is used to draw
    figure = None
    # Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        # Relative y margin applied by scale_limits
        self.yscale = 0.1
        # Relative x margin applied by scale_limits
        self.xscale = 0.0

        # Keyword arguments for the plot / errorbar / fill_between calls,
        # populated with their defaults by the setters below
        self.plot_kwargs = None
        self.errorbar_kwargs = None
        self.errorband_kwargs = None
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        return self.figure.add_subplot(gridspecs[-1], sharex=self.axis)

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dict defaults below are never mutated, only a copy is stored.

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot call
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the area between the data points and zero
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis the data points are drawn on
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (legend handles, plot result, errorbar result, fill_between result),
                entries of steps which were not drawn are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is None or 'color' not in plot_kwargs:
            # NOTE(review): prop_cycler is a private matplotlib attribute,
            # confirm it exists in the deployed matplotlib version.
            color = next(axis._get_lines.prop_cycler)['color']
            # plot_kwargs may be None (plot call disabled), only store the color in a real dict
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)

        # The first legend handle is always a patch in the plot color; it gets a
        # get_color method so callers can query the color like on a line.
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darker shade of the plot color
                errorbar_kwargs['ecolor'] = [0.5 * channel for channel in color]
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # One rectangle per data point spanning +- the errors in x and y
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                rasterized=True, **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True,
                                      **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter, the base class implementation only returns NotImplemented
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff,
        the base class implementation only returns NotImplemented
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        """
        # copysign makes sure the limits always move away from the data,
        # independent of the sign of the limit
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
248 
250  """
251  Plots the purity and the efficiency over the cut value (for cut choosing)
252  """
253 
257 
258  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
259  """
260  Add a new curve to the plot
261  @param data pandas.DataFrame containing all data
262  @param column which is used to calculate efficiency and purity for different cuts
263  @param signal_mask boolean numpy.array defining which events are signal events
264  @param bckgrd_mask boolean numpy.array defining which events are background events
265  @param weight_column column in data containing the weights for each event
266  """
267 
268  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
269 
270  if normed:
271  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
272  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
273  else:
274  efficiency, efficiency_error = hists.get_true_positives(['Signal'])
275  purity, purity_error = hists.get_false_positives(['Background'])
276 
277  cuts = hists.bin_centers
278 
279  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
280  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.yminymin]), \
281  numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymaxymaxymax])
282 
283  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
284 
285  if normed:
286  self.labelslabels.append("Efficiency")
287  else:
288  self.labelslabels.append("True positive")
289 
290  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, purity, xerr=0, yerr=purity_error))
291 
292  if normed:
293  self.labelslabels.append("Purity")
294  else:
295  self.labelslabels.append("False positive")
296 
297  return self
298 
299  def finish(self):
300  """
301  Sets limits, title, axis-labels and legend of the plot
302  """
303  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
304  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
305  self.axisaxis.set_title("Classification Plot")
306  self.axisaxis.get_xaxis().set_label_text('Cut Value')
307  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
308  return self
309 
310 
312  """
313  Plots the signal to noise ratio over the cut value (for cut choosing)
314  """
315 
319 
320  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
321  """
322  Add a new curve to the plot
323  @param data pandas.DataFrame containing all data
324  @param column which is used to calculate signal to noise ratio for different cuts
325  @param signal_mask boolean numpy.array defining which events are signal events
326  @param bckgrd_mask boolean numpy.array defining which events are background events
327  @param weight_column column in data containing the weights for each event
328  """
329 
330  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
331 
332  signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])
333 
334  cuts = hists.bin_centers
335 
336  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
337  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.yminymin]), \
338  numpy.nanmax([numpy.nanmax(signal2noise), self.ymaxymaxymax])
339 
340  self.plotsplots.append(self._plot_datapoints_plot_datapoints(self.axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
341 
342  self.labelslabels.append(column)
343 
344  return self
345 
346  def finish(self):
347  """
348  Sets limits, title, axis-labels and legend of the plot
349  """
350  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
351  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
352  self.axisaxis.set_title("Signal to Noise Plot")
353  self.axisaxis.get_xaxis().set_label_text('Cut Value')
354  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
355  return self
356 
357 
359  """
360  Plots the purity over the efficiency also known as ROC curve
361  """
362 
366 
367  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
368  """
369  Add a new curve to the ROC plot
370  @param data pandas.DataFrame containing all data
371  @param column which is used to calculate efficiency and purity for different cuts
372  @param signal_mask boolean numpy.array defining which events are signal events
373  @param bckgrd_mask boolean numpy.array defining which events are background events
374  @param weight_column column in data containing the weights for each event
375  """
376  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
377  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
378  purity, purity_error = hists.get_purity(['Signal'], ['Background'])
379 
380  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
381  self.yminymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.yminymin]), numpy.nanmax([numpy.nanmax(purity), self.ymaxymaxymax])
382 
383  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
384  self.plotsplots.append(p)
385  if label is not None:
386  self.labelslabels.append(label)
387  else:
388  self.labelslabels.append(column)
389  return self
390 
391  def finish(self):
392  """
393  Sets limits, title, axis-labels and legend of the plot
394  """
395  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
396  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
397  self.axisaxis.set_title("ROC Purity Plot")
398  self.axisaxis.get_xaxis().set_label_text('Efficiency')
399  self.axisaxis.get_yaxis().set_label_text('Purity')
400  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
401  return self
402 
403 
405  """
406  Plots the rejection over the efficiency also known as ROC curve
407  """
408 
412 
413  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
414  """
415  Add a new curve to the ROC plot
416  @param data pandas.DataFrame containing all data
417  @param column which is used to calculate efficiency and purity for different cuts
418  @param signal_mask boolean numpy.array defining which events are signal events
419  @param bckgrd_mask boolean numpy.array defining which events are background events
420  @param weight_column column in data containing the weights for each event
421  """
422  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
423  efficiency, efficiency_error = hists.get_efficiency(['Signal'])
424  rejection, rejection_error = hists.get_efficiency(['Background'])
425  rejection = 1 - rejection
426  if isinstance(efficiency, int) and not isinstance(rejection, int):
427  efficiency = numpy.array([efficiency] * len(rejection))
428  elif isinstance(rejection, int) and not isinstance(efficiency, int):
429  rejection = numpy.array([rejection] * len(efficiency))
430  elif isinstance(rejection, int) and isinstance(efficiency, int):
431  efficiency = numpy.array([efficiency])
432  rejection = numpy.array([rejection])
433 
434  self.xminxmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xminxmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
435  self.yminymin, self.ymaxymaxymax = numpy.nanmin([rejection.min(), self.yminymin]), numpy.nanmax([rejection.max(), self.ymaxymaxymax])
436 
437  auc = numpy.abs(numpy.trapz(rejection, efficiency))
438 
439  p = self._plot_datapoints_plot_datapoints(self.axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
440  self.plotsplots.append(p)
441  if label is not None:
442  self.labelslabels.append(label[:10] + " ({:.2f})".format(auc))
443  else:
444  self.labelslabels.append(column[:10] + " ({:.2f})".format(auc))
445  return self
446 
447  def finish(self):
448  """
449  Sets limits, title, axis-labels and legend of the plot
450  """
451  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
452  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
453  self.axisaxis.set_title("ROC Rejection Plot")
454  self.axisaxis.get_xaxis().set_label_text('Signal Efficiency')
455  self.axisaxis.get_yaxis().set_label_text('Background Rejection')
456  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
457  return self
458 
459 
461  """
462  Plots multiple other plots into a grid 3x?
463  """
464 
465  figure = None
466 
467  axis = None
468 
469  def __init__(self, cls, number_of_plots, figure=None):
470  """
471  Creates a new figure if None is given, sets the default plot parameters
472  @param figure default draw figure which is used
473  """
474  if figure is None:
475  self.figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
476  self.figurefigurefigure.set_tight_layout(True)
477  else:
478  self.figurefigurefigure = figure
479 
480  if number_of_plots == 1:
481  gs = matplotlib.gridspec.GridSpec(1, 1)
482  elif number_of_plots == 2:
483  gs = matplotlib.gridspec.GridSpec(1, 2)
484  elif number_of_plots == 3:
485  gs = matplotlib.gridspec.GridSpec(1, 3)
486  else:
487  gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
488 
489 
490  self.sub_plotssub_plots = [cls(self.figurefigurefigure, self.figurefigurefigure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
491  self.axisaxisaxis = self.sub_plotssub_plots[0].axis
492  super(Multiplot, self).__init__(self.figurefigurefigure, self.axisaxisaxis)
493 
494  def add(self, i, *args, **kwargs):
495  """
496  Call add function of ith subplot
497  @param i position of the subplot
498  """
499  self.sub_plotssub_plots[i].add(*args, **kwargs)
500 
501  def finish(self):
502  """
503  Sets limits, title, axis-labels and legend of the plot
504  """
505  for plot in self.sub_plotssub_plots:
506  plot.finish()
507  return self
508 
509 
511  """
512  Plots the purity in each bin over the classifier output.
513  """
514 
518 
519  def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
520  """
521  Add a new curve to the Diagonal plot
522  @param data pandas.DataFrame containing all data
523  @param column which is used to calculate purity for different cuts
524  @param signal_mask boolean numpy.array defining which events are signal events
525  @param bckgrd_mask boolean numpy.array defining which events are background events
526  @param weight_column column in data containing the weights for each event
527  """
528  hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
529  purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])
530 
531  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
532  # self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
533  self.yminymin, self.ymaxymaxymax = 0, 1
534 
535  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
536  self.plotsplots.append(p)
537  self.labelslabels.append(column)
538  return self
539 
540  def finish(self):
541  """
542  Sets limits, title, axis-labels and legend of the plot
543  """
544  self.scale_limitsscale_limits()
545  self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
546  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
547  self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
548  self.axisaxis.set_title("Diagonal Plot")
549  self.axisaxis.get_xaxis().set_label_text('Classifier Output')
550  self.axisaxis.get_yaxis().set_label_text('Purity Per Bin')
551  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
552  return self
553 
554 
556  """
557  Plots distribution of a quantity
558  """
559 
560  def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
561  keep_first_binning=False, range_in_std=None):
562  """
563  Creates a new figure and axis if None is given, sets the default plot parameters
564  @param figure default draw figure which is used
565  @param axis default draw axis which is used
566  @param normed true if histograms should be normed before drawing
567  @param keep_first_binning use the binning of the first distribution for further plots
568  @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
569  """
570  super(Distribution, self).__init__(figure, axis)
571 
572  self.normed_to_all_entriesnormed_to_all_entries = normed_to_all_entries
573 
574  self.normed_to_bin_widthnormed_to_bin_width = normed_to_bin_width
575 
576  self.range_in_stdrange_in_std = range_in_std
577  # if self.normed_to_all_entries or self.normed_to_bin_width:
578 
579  self.yminyminymin = float(0)
580 
581  self.ymaxymaxymax = float('-inf')
582 
583  self.xminxminxmin = float('inf')
584 
585  self.xmaxxmaxxmax = float('-inf')
586 
587  self.keep_first_binningkeep_first_binning = keep_first_binning
588 
589  self.first_binningfirst_binning = None
590 
591  self.x_axis_labelx_axis_label = ''
592 
593  def add(self, data, column, mask=None, weight_column=None, label=None):
594  """
595  Add a new distribution to the plots
596  @param data pandas.DataFrame containing all data
597  @param column which is used to calculate distribution histogram
598  @param mask boolean numpy.array defining which events are used for the histogram
599  @param weight_column column in data containing the weights for each event
600  """
601  if mask is None:
602  mask = numpy.ones(len(data)).astype('bool')
603 
604  bins = 100
605  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is not None:
606  bins = self.first_binningfirst_binning
607  hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
608  bins=bins, equal_frequency=False, range_in_std=self.range_in_stdrange_in_std)
609  if self.keep_first_binningkeep_first_binning and self.first_binningfirst_binning is None:
610  self.first_binningfirst_binning = hists.bins
611  hist, hist_error = hists.get_hist('Total')
612 
613  if self.normed_to_all_entriesnormed_to_all_entries:
614  normalization = float(numpy.sum(hist))
615  hist = hist / normalization
616  hist_error = hist_error / normalization
617 
618  if self.normed_to_bin_widthnormed_to_bin_width:
619  hist = hist / hists.bin_widths
620  hist_error = hist_error / hists.bin_widths
621 
622  self.xminxminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
623  self.yminyminymin = numpy.nanmin([hist.min(), self.yminyminymin])
624  self.ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.ymaxymaxymax])
625 
626  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
627  self.plotsplots.append(p)
628  self.x_axis_labelx_axis_label = column
629 
630  appendix = ''
631  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
632  appendix = ' No data to plot!'
633 
634  if label is None:
635  self.labelslabels.append(column + appendix)
636  else:
637  self.labelslabels.append(label + appendix)
638  return self
639 
640  def finish(self):
641  """
642  Sets limits, title, axis-labels and legend of the plot
643  """
644  self.axisaxis.set_title("Distribution Plot")
645  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
646 
647  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
648 
649  if self.ymaxymaxymax <= self.yminyminymin or self.xmaxxmaxxmax <= self.xminxminxmin:
650  self.axisaxis.set_xlim((0., 1.))
651  self.axisaxis.set_ylim((0., 1.))
652  self.axisaxis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
653  return self
654 
655  self.scale_limitsscale_limits()
656 
657  self.axisaxis.set_xlim((self.xminxminxmin, self.xmaxxmaxxmax))
658  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
659 
660  if self.normed_to_all_entriesnormed_to_all_entries and self.normed_to_bin_widthnormed_to_bin_width:
661  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
662  elif self.normed_to_all_entriesnormed_to_all_entries:
663  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
664  elif self.normed_to_bin_widthnormed_to_bin_width:
665  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
666  else:
667  self.axisaxis.get_yaxis().set_label_text('# Entries per Bin')
668 
669  return self
670 
671 
class Box(Plotter):
    """
    Create a boxplot
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)

        # Label of the x axis, set by add() to the name of the last added column
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot, None selects all events
        @param weight_column column in data containing the weights for each event, currently ignored with a warning
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')
        x = data[column][mask]
        if weight_column is not None:
            b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        # Nothing is drawn and no label is added for an empty selection
        if len(x) == 0:
            b2.B2WARNING("Ignore empty boxplot.")
            return self

        p = self.axis.boxplot(x, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
                              boxprops=dict(facecolor='blue', alpha=0.5))
        self.plots.append(p)
        self.labels.append(column)
        self.x_axis_label = column

        return self

    def finish(self):
        """
        Sets title and axis-labels of the plot, hides the y axis
        """
        matplotlib.artist.setp(self.axis.get_yaxis(), visible=False)
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_title("Box Plot")
        return self
738 
740  """
741  Plots the difference between two histograms
742  """
743 
755 
756  def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
757  """
758  Creates a new figure and axis if None is given, sets the default plot parameters
759  @param figure default draw figure which is used
760  @param axis default draw axis which is used
761  @param normed normalize minuend and subtrahend before comparing them
762  @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
763  """
764  super(Difference, self).__init__(figure, axis)
765  self.normednormed = normed
766  self.shift_to_zeroshift_to_zero = shift_to_zero
767  if self.normednormed:
768  self.yminyminymin = -0.01
769  self.ymaxymaxymax = 0.01
770  else:
771  self.yminyminymin = -1
772  self.ymaxymaxymax = 1
773 
774  def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
775  """
776  Add a new difference plot
777  @param data pandas.DataFrame containing all data
778  @param column which is used to calculate distribution histogram
779  @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
780  @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
781  @param weight_column column in data containing the weights for each event
782  @param label label for the legend if None, the column name is used
783  """
784  hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
785  weight_column=weight_column, equal_frequency=False)
786  minuend, minuend_error = hists.get_hist('Minuend')
787  subtrahend, subtrahend_error = hists.get_hist('Subtrahend')
788 
789  difference_error = histogram.poisson_error(minuend + subtrahend)
790  if self.normednormed:
791  difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
792  minuend = minuend / numpy.sum(minuend)
793  subtrahend = subtrahend / numpy.sum(subtrahend)
794  difference = minuend - subtrahend
795 
796  if self.shift_to_zeroshift_to_zero:
797  difference = difference - numpy.mean(difference)
798 
799  self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
800  self.yminyminymin = min((difference - difference_error).min(), self.yminyminymin)
801  self.ymaxymaxymax = max((difference + difference_error).max(), self.ymaxymaxymax)
802 
803  p = self._plot_datapoints_plot_datapoints(self.axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
804  self.plotsplots.append(p)
805  if label is None:
806  self.labelslabels.append(label)
807  else:
808  self.labelslabels.append(column)
809  self.x_axis_labelx_axis_label = column
810  return self
811 
812  def finish(self, line_color='black'):
813  """
814  Sets limits, title, axis-labels and legend of the plot
815  """
816  self.axisaxis.plot((self.xminxmin, self.xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
817  self.scale_limitsscale_limits()
818  self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
819  self.axisaxis.set_ylim((self.yminyminymin, self.ymaxymaxymax))
820  self.axisaxis.set_title("Difference Plot")
821  self.axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
822  self.axisaxis.get_xaxis().set_label_text(self.x_axis_labelx_axis_label)
823  self.axisaxis.get_yaxis().set_label_text('Difference')
824  self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
825  return self
826 
827 
class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None
    #: Axis which shows the difference between training and test signal
    axis_d1 = None
    #: Axis which shows the difference between training and test background
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Five rows: the upper three hold the distributions, the lower two
        # hold the signal and the background difference plot.
        gs = matplotlib.gridspec.GridSpec(5, 1)
        self.axis = self.figure.add_subplot(gs[:3, :])
        self.axis_d1 = self.figure.add_subplot(gs[3, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[4, :], sharex=self.axis)

        super(Overtraining, self).__init__(self.figure, self.axis)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
        otherwise there are too many curves in the plot to recognize anything in the plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        """
        distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

        # Test samples are drawn with the user supplied options
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options(self.errorbar_kwargs)
        distribution.set_errorband_options(self.errorband_kwargs)
        distribution.add(data, column, test_mask & signal_mask, weight_column)
        distribution.add(data, column, test_mask & bckgrd_mask, weight_column)

        # Reuse the colors picked for the test samples for everything belonging to the same class
        signal_color = distribution.plots[0][0][0].get_color()
        bckgrd_color = distribution.plots[1][0][0].get_color()

        # Training samples are drawn as filled step histograms without errorbars or errorbands
        distribution.set_errorbar_options(None)
        distribution.set_errorband_options(None)
        for color, sample_mask in ((signal_color, signal_mask), (bckgrd_color, bckgrd_mask)):
            distribution.set_plot_options({'color': color, 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
            distribution.set_fill_options({'color': color, 'alpha': 0.5, 'step': 'mid'})
            distribution.add(data, column, train_mask & sample_mask, weight_column)

        distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
        distribution.finish()

        difference_panels = ((self.axis_d1, signal_color, signal_mask),
                             (self.axis_d2, bckgrd_color, bckgrd_mask))
        for axis, color, sample_mask in difference_panels:
            # Work on a copy, so the options stored in this plotter are not modified
            plot_kwargs = dict(self.plot_kwargs or {})
            plot_kwargs['color'] = color

            difference = Difference(self.figure, axis, shift_to_zero=True, normed=True)
            difference.set_plot_options(plot_kwargs)
            difference.set_errorbar_options(self.errorbar_kwargs)
            difference.set_errorband_options(self.errorband_kwargs)
            difference.add(data, column, train_mask & sample_mask, test_mask & sample_mask, weight_column)
            axis.set_xlim((difference.xmin, difference.xmax))
            axis.set_ylim((difference.ymin, difference.ymax))
            # Empty plots and labels, so finish() has no entries to put into a legend
            difference.plots = []
            difference.labels = []
            difference.finish(line_color=color)

        try:
            import scipy.stats
        except ImportError:
            b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
            return self

        # Kolmogorov smirnov test
        # NOTE: the test uses the raw values of column, weight_column is not taken into account
        props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
        ks_panels = (('signal', self.axis_d1, signal_mask),
                     ('background', self.axis_d2, bckgrd_mask))
        for name, axis, sample_mask in ks_panels:
            train_values = data[column][train_mask & sample_mask]
            test_values = data[column][test_mask & sample_mask]
            if len(train_values) == 0 or len(test_values) == 0:
                b2.B2WARNING("Cannot calculate kolmogorov smirnov test for " + name + " due to missing data")
                continue
            p_value = scipy.stats.ks_2samp(train_values, test_values).pvalue
            axis.text(0.1, 0.9, name + r' (train - test) difference $p={:.2f}$'.format(p_value), fontsize=36,
                      bbox=props,
                      verticalalignment='top', horizontalalignment='left', transform=axis.transAxes)

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.axis.set_title("Overtraining Plot")
        self.axis_d1.set_title("")
        self.axis_d2.set_title("")
        # All three axes share the x axis, so only the lowest one shows tick labels and the axis label
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        matplotlib.artist.setp(self.axis_d1.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        self.axis_d1.get_xaxis().set_label_text('')
        self.axis_d2.get_xaxis().set_label_text('Classifier Output')
        return self
952 
953 
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes for the boxplots
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super(VerboseDistribution, self).__init__(figure, axis)
        #: Normalize histograms before drawing them
        self.normed = normed
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        self.box_axes = []
        #: The distribution plot
        self.distribution = Distribution(self.figure, self.axis,
                                         normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label passed on to the distribution plot
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # The distribution occupies the upper three quarters of the grid,
        # below it there is one row per boxplot (including the new one).
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
            if mask is None:
                # No mask given means all events, None itself cannot be combined with the range selection
                mask = numpy.ones(len(data[column]), dtype=bool)
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            lower = mean - self.range_in_std * std
            upper = mean + self.range_in_std * std
            mask = mask & (data[column] > lower) & (data[column] < upper)

        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if len(box.plots) > 0:
            # Boxplot gets the color of the distribution which was just added
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # Only the last boxplot keeps its x tick labels and its x axis label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        if self.box_axes:
            # Guarded, because there is no boxplot if add() was never called
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1027 
1028 
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None
    #: Axis which shows shape of signal
    axis_d1 = None
    #: Axis which shows shape of background
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Three rows spanning both columns: all events, signal, background
        gs = matplotlib.gridspec.GridSpec(3, 2)
        self.axis = self.figure.add_subplot(gs[0, :])
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super(Correlation, self).__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100, defining the different cuts
               (currently not used, the cuts are always placed at the 5%, 10%, ..., 95% quantiles)
        @param signal_mask boolean numpy.array defining which events are signal events, the signal panel is skipped if None
        @param bckgrd_mask boolean numpy.array defining which events are background events,
               the background panel is skipped if None
        @param weight_column column in data containing the weights for each event
        """
        if len(data[cut_column]) == 0:
            b2.B2WARNING("Ignore empty Correlation.")
            return self

        # The first panel shows signal and background together. A missing mask
        # cannot be combined with "|", so fall back to whatever is available.
        if signal_mask is None and bckgrd_mask is None:
            combined_mask = numpy.ones(len(data[column]), dtype=bool)
        elif signal_mask is None:
            combined_mask = bckgrd_mask
        elif bckgrd_mask is None:
            combined_mask = signal_mask
        else:
            combined_mask = signal_mask | bckgrd_mask

        colormap = plt.get_cmap('coolwarm')
        panels = [(self.axis, '.', combined_mask),
                  (self.axis_d1, 'S', signal_mask),
                  (self.axis_d2, 'B', bckgrd_mask)]

        for axis, suffix, mask in panels:
            if mask is None:
                continue

            values = data[column][mask]
            cut_values = data[cut_column][mask]
            if len(values) == 0:
                # Percentiles of an empty selection are not defined
                b2.B2WARNING("Ignore empty Correlation.")
                continue

            if weight_column is not None:
                weights = numpy.array(data[weight_column][mask])
            else:
                weights = numpy.ones(len(values))

            # Restrict the histogram to the central 90% of the distribution
            value_range = numpy.percentile(values, [5, 95])

            # density=True replaces the normed=True argument, which was removed from numpy
            previous, edges = numpy.histogram(values, bins=100,
                                              range=value_range, density=True, weights=weights)
            bin_center = (edges[1:] + edges[:-1]) / 2
            axis.plot(bin_center, previous, color='black', lw=1)

            # Tighten the cut on cut_column step by step and fill the area
            # between two consecutive shapes with the color of the quantile.
            for quantil in numpy.arange(5, 100, 5):
                cut = numpy.percentile(cut_values, quantil)
                sel = numpy.asarray(cut_values >= cut)
                current, edges = numpy.histogram(values[sel], bins=100,
                                                 range=value_range, density=True, weights=weights[sel])
                bin_center = (edges[1:] + edges[:-1]) / 2
                axis.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
                previous = current

            axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            axis.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(suffix, flatness_score))
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        # Everything is already set up in add()
        return self
1104 
1105  def finish(self):
1106  """
1107  Sets limits, title, axis-labels and legend of the plot
1108  """
1109  return self
1110 
1111 
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input of the TSNE embedding
        @param masks different classes to show in TSNE, one scatter plot is drawn per mask
        """
        try:
            import sklearn.manifold
        except ImportError:
            b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        # One row per event, one column per feature
        features = numpy.array([data[column] for column in columns]).T

        # sklearn's TSNE cannot transform new points after the fit, therefore all
        # events are embedded at once and the classes are selected afterwards.
        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        embedding = model.fit_transform(features)

        for mask in masks:
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        # Everything is already set up in add()
        return self
1143 
1144 
1146  """
1147  Plots importance matrix
1148  """
1149 
def add(self, data, columns, variables):
    """
    Add a new importance matrix plot.
    @param data pandas.DataFrame containing all data
    @param columns of data which are shown, one column of the matrix per entry, also used as x tick labels
    @param variables labels of the rows of the matrix, used as y tick labels
    """
    self.figure.set_tight_layout(True)

    def norm(x):
        """Scale x linearly to the range 0 to 100, all zeros if x has no spread."""
        minimum = numpy.min(x)
        width = numpy.max(x) - minimum
        if width <= 0:
            return numpy.zeros(x.shape)
        return (x - minimum) / width * 100

    # Every column is normalized on its own, the transpose puts the columns on the x axis
    importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
    importance_heatmap = self.axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
                                          rasterized=True)

    n_rows, n_columns = importance_matrix.shape

    # put the major ticks at the middle of each cell
    self.axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
    self.axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

    self.axis.set_xticklabels(columns, minor=False, rotation=90)
    self.axis.set_yticklabels(variables, minor=False)

    self.axis.xaxis.tick_top()

    # Write the normalized value into the center of each cell
    for (row, col), value in numpy.ndenumerate(importance_matrix):
        self.axis.text(col + 0.5, row + 0.5, '%.0f' % value,
                       size=14,
                       horizontalalignment='center',
                       verticalalignment='center')

    cb = self.figure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
    cb.ax.set_yticklabels(['low', 'high'])

    self.axis.set_aspect('equal')

    return self
1190 
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    # Everything is already set up in add(), only return self to allow chaining
    return self
1196 
1197 
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the correlation of the signal samples
    signal_axis = None
    #: Axis which shows the correlation of the background samples
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Signal and background matrix side by side, the colorbar in the last row below them
        gs = matplotlib.gridspec.GridSpec(8, 2)
        self.signal_axis = self.figure.add_subplot(gs[:6, 0])
        self.bckgrd_axis = self.figure.add_subplot(gs[:6, 1], sharey=self.signal_axis)
        #: Colorbar axis contains the colorbar
        self.colorbar_axis = self.figure.add_subplot(gs[7, :])
        #: Usual axis object which every Plotter object needs, here it is just a dummy
        self.axis = self.signal_axis

        super(CorrelationMatrix, self).__init__(self.figure, self.axis)

    def add(self, data, columns, signal_mask, bckgrd_mask):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        """
        # Correlation coefficients in percent
        signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
        bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

        signal_heatmap = self.signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
        # The background matrix uses the same color scale, so the colorbar of the signal is valid for both
        self.bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)

        self.signal_axis.invert_yaxis()
        self.signal_axis.xaxis.tick_top()
        self.bckgrd_axis.invert_yaxis()
        self.bckgrd_axis.xaxis.tick_top()

        for axis, corr in ((self.signal_axis, signal_corr), (self.bckgrd_axis, bckgrd_corr)):
            # put the major ticks at the middle of each cell
            axis.set_xticks(numpy.arange(corr.shape[0]) + 0.5, minor=False)
            axis.set_yticks(numpy.arange(corr.shape[1]) + 0.5, minor=False)

            axis.set_xticklabels(columns, minor=False, rotation=90)
            axis.set_yticklabels(columns, minor=False)

            # Write the correlation in percent into the center of each cell
            for (row, col), value in numpy.ndenumerate(corr):
                axis.text(col + 0.5, row + 0.5, '%.0f' % value,
                          size=14,
                          horizontalalignment='center',
                          verticalalignment='center')

        cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-100, 0, 100], orientation='horizontal')
        cb.solids.set_rasterized(True)
        cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])

        self.signal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
        self.bckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        # The y axis is shared, so its tick labels are only shown next to the signal matrix
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1289 
1290 
if __name__ == '__main__':
    # Example script: creates fake data and draws one example of every plot type.

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is generated for each of the two classes
        @param columns names of the columns, the last one holds the class label
        """
        # Floor division, because array sizes have to be integers
        half = N // 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(half, n))
        xb = numpy.random.normal(1, size=(half, n))
        ys = numpy.zeros(half)
        yb = numpy.ones(half)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        # Shuffle the events, so the classes are not ordered
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    data['type'] = ''
    # Positional assignment on the frame itself: first half is training, second half is test
    type_position = data.columns.get_loc('type')
    data.iloc[:N // 2, type_position] = 'Train'
    data.iloc[N // 2:, type_position] = 'Test'

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    # NOTE(review): the statement creating this plotter is missing in the listing,
    # RejectionOverEfficiency is assumed from the output file name - confirm.
    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    p = Correlation()
    # Both masks are passed, so all three panels of the plot are filled
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # The masks are taken after the new columns were added
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:688
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:690
def __init__(self, figure=None, axis=None)
Definition: plotting.py:679
def finish(self)
Definition: plotting.py:729
signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1205
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1230
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1224
figure
figure which is used to draw
Definition: plotting.py:1203
def __init__(self, figure=None)
Definition: plotting.py:1209
bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1207
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1226
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1060
axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1038
figure
figure which is used to draw
Definition: plotting.py:1034
def __init__(self, figure=None)
Definition: plotting.py:1042
axis_d2
Axis which shows shape of background.
Definition: plotting.py:1040
axis
Main axis which is used to draw.
Definition: plotting.py:1036
ymax
Maximum y value.
Definition: plotting.py:533
xmax
Maximum x value.
Definition: plotting.py:531
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:519
def finish(self)
Definition: plotting.py:540
x_axis_label
Label on x axis.
Definition: plotting.py:809
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:766
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:756
ymax
Maximum y value.
Definition: plotting.py:769
xmax
Maximum x value.
Definition: plotting.py:799
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:774
ymin
min y value
Definition: plotting.py:768
def finish(self, line_color='black')
Definition: plotting.py:812
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:765
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:561
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:593
x_axis_label
x axis label
Definition: plotting.py:591
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:587
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:572
first_binning
first binning
Definition: plotting.py:589
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:576
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:574
def add(self, data, columns, variables)
Definition: plotting.py:1150
def finish(self)
Definition: plotting.py:1191
def add(self, i, *args, **kwargs)
Definition: plotting.py:494
figure
figure which is used to draw
Definition: plotting.py:465
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:469
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:490
axis
Main axis.
Definition: plotting.py:467
def finish(self)
Definition: plotting.py:501
axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:838
figure
figure which is used to draw
Definition: plotting.py:834
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:860
def __init__(self, figure=None)
Definition: plotting.py:842
axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:840
axis
Main axis which is used to draw.
Definition: plotting.py:836
def finish(self, *args, **kwargs)
Definition: plotting.py:232
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:109
xmin
Minimum x value.
Definition: plotting.py:61
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:155
plots
Plots added to the axis so far.
Definition: plotting.py:57
float xscale
limit scale
Definition: plotting.py:69
figure
figure which is used to draw
Definition: plotting.py:71
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:107
def scale_limits(self)
Definition: plotting.py:238
def add(self, *args, **kwargs)
Definition: plotting.py:226
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:163
def save(self, filename)
Definition: plotting.py:128
def __init__(self, figure=None, axis=None)
Definition: plotting.py:75
ymax
Maximum y value.
Definition: plotting.py:67
xmax
Maximum x value.
Definition: plotting.py:63
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:105
labels
Labels of the plots added so far.
Definition: plotting.py:59
axis
Main axis which is used to draw.
Definition: plotting.py:73
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:171
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:147
float yscale
limit scale
Definition: plotting.py:68
ymin
Minimum y value.
Definition: plotting.py:65
def add_subplot(self, gridspecs)
Definition: plotting.py:116
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:139
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:103
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:258
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:367
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:413
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:320
def add(self, data, columns, *masks)
Definition: plotting.py:1117
def finish(self)
Definition: plotting.py:1138
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:979
distribution
The distribution plot.
Definition: plotting.py:977
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:974
box_axes
Axes for the boxplots.
Definition: plotting.py:960
normed
Normalize histograms before drawing them.
Definition: plotting.py:972
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:962
def weighted_mean_and_std(x, w)
Definition: histogram.py:32
def poisson_error(n_tot)
Definition: histogram.py:25
Definition: plot.py:1