Belle II Software development
plotting.py
1#!/usr/bin/env python3
2
3
4
11
12import copy
13import math
14
15import pandas
16import numpy
17import itertools
18import matplotlib.pyplot as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects as PathEffects
26
27
28from basf2_mva_evaluation import histogram
29
30import basf2 as b2
31
32import basf2_mva_util
33import matplotlib
34
# Do not use the standard TkAgg backend, because it is NOT thread-safe.
# Otherwise you will get a "RuntimeError: main thread is not in main loop"!
matplotlib.use("svg")
matplotlib.rcParams.update({'font.size': 36})

# Use the Belle II style while producing the plots
plt.style.use("belle2")
42
43
class Plotter:
    """
    Base class for all Plotters.

    A plotter owns a matplotlib figure and axis, collects the artists and
    labels of everything that was added, and keeps track of the axis limits.
    Subclasses implement add() and finish().
    """

    # stupid workaround for doxygen refusing to document things

    #: Plots added to the axis so far
    plots = None
    #: Labels of the plots added to the axis so far
    labels = None
    #: Minimum x value
    xmin = None
    #: Maximum x value
    xmax = None
    #: Minimum y value
    ymin = None
    #: Maximum y value
    ymax = None
    #: Relative margin added to the y limits by scale_limits
    yscale = 0.0
    #: Relative margin added to the x limits by scale_limits
    xscale = 0.0
    #: Figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        #: Default keyword arguments for plot function
        self.plot_kwargs = None
        #: Default keyword arguments for errorbar function
        self.errorbar_kwargs = None
        #: Default keyword arguments for errorband function
        self.errorband_kwargs = None
        #: Default keyword arguments for fill_between function
        self.fill_kwargs = None

        # Apply the defaults of all four option groups; _plot_datapoints reads
        # every one of them, so none may be left undefined.
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

        #: Property cycler used to give plots unique colors
        self.prop_cycler = itertools.cycle(plt.rcParams["axes.prop_cycle"])

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot call
        """
        # The argument is copied, so the shared default dict is never mutated.
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for datapoint errorband
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is used to draw
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points (scalar or one value per point)
        @param yerr symmetric error on y data points (scalar or one value per point)
        @return tuple (legend handles, plot artist, errorbar container, errorband artist),
                entries which were not drawn are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        # Normalise the inputs once: boolean masking and [-1] indexing below
        # need numpy arrays, and scalar errors need one value per data point.
        x = numpy.asarray(x)
        y = numpy.asarray(y)
        if xerr is not None:
            xerr = numpy.asarray(xerr, dtype=float) * numpy.ones(len(x))
        if yerr is not None:
            yerr = numpy.asarray(yerr, dtype=float) * numpy.ones(len(x))

        if plot_kwargs is None or 'color' not in plot_kwargs:
            color = next(self.prop_cycler)['color']
            # The plot call may be disabled (None); then there is nothing to store the color in.
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                errorbar_kwargs['ecolor'] = [0.5 * channel for channel in color]

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            mask = numpy.logical_and.reduce(
                [numpy.isfinite(v) for v in (x, y, xerr, yerr) if v is not None])

            def _masked_non_negative(err):
                """Apply the mask and replace negative errors by zero, pass None through."""
                if err is None:
                    return None
                err = err[mask]
                return numpy.where(err < 0, 0.0, err)

            e = axis.errorbar(x[mask], y[mask], xerr=_masked_non_negative(xerr), yerr=_masked_non_negative(yerr),
                              rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # One rectangle per data point spanning +-xerr and +-yerr
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None and len(x) > 0:
            half_widths = xerr if xerr is not None else numpy.zeros(len(x))
            # to fill the last bin of a histogram
            fill_x = numpy.append(x, x[-1] + 2 * half_widths[-1])
            fill_y = numpy.append(y, y[-1])
            fill_xerr = numpy.append(half_widths, half_widths[-1])

            axis.fill_between(fill_x - fill_xerr, fill_y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        """
        # copysign makes the margin grow away from zero for negative limits too
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
275
276
278 """
279 Plots the purity and the efficiency over the cut value (for cut choosing)
280 """
281
285
286 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
287 """
288 Add a new curve to the plot
289 @param data pandas.DataFrame containing all data
290 @param column which is used to calculate efficiency and purity for different cuts
291 @param signal_mask boolean numpy.array defining which events are signal events
292 @param bckgrd_mask boolean numpy.array defining which events are background events
293 @param weight_column column in data containing the weights for each event
294 """
295
296 hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
297
298 if normed:
299 efficiency, efficiency_error = hists.get_efficiency(['Signal'])
300 purity, purity_error = hists.get_purity(['Signal'], ['Background'])
301 else:
302 efficiency, efficiency_error = hists.get_true_positives(['Signal'])
303 purity, purity_error = hists.get_false_positives(['Background'])
304
305 cuts = hists.bin_centers
306
307 self.xmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
308 self.ymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.ymin]), \
309 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymaxymaxymax])
310
311 self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
312
313 if normed:
314 self.labelslabels.append("Efficiency")
315 else:
316 self.labelslabels.append("True positive")
317
318 self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, purity, xerr=0, yerr=purity_error))
319
320 if normed:
321 self.labelslabels.append("Purity")
322 else:
323 self.labelslabels.append("False positive")
324
325 return self
326
327 def finish(self):
328 """
329 Sets limits, title, axis-labels and legend of the plot
330 """
331 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
332 self.axisaxis.set_ylim((self.ymin, self.ymaxymaxymax))
333 self.axisaxis.set_title("Classification Plot")
334 self.axisaxis.get_xaxis().set_label_text('Cut Value')
335 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
336 return self
337
338
340 """
341 Plots the signal to noise ratio over the cut value (for cut choosing)
342 """
343
347
348 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
349 """
350 Add a new curve to the plot
351 @param data pandas.DataFrame containing all data
352 @param column which is used to calculate signal to noise ratio for different cuts
353 @param signal_mask boolean numpy.array defining which events are signal events
354 @param bckgrd_mask boolean numpy.array defining which events are background events
355 @param weight_column column in data containing the weights for each event
356 """
357
358 hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
359
360 signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])
361
362 cuts = hists.bin_centers
363
364 self.xmin, self.xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.xmin]), numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmaxxmax])
365 self.ymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.ymin]), \
366 numpy.nanmax([numpy.nanmax(signal2noise), self.ymaxymaxymax])
367
368 self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
369
370 self.labelslabels.append(column)
371
372 return self
373
374 def finish(self):
375 """
376 Sets limits, title, axis-labels and legend of the plot
377 """
378 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
379 self.axisaxis.set_ylim((self.ymin, self.ymaxymaxymax))
380 self.axisaxis.set_title("Signal to Noise Plot")
381 self.axisaxis.get_xaxis().set_label_text('Cut Value')
382 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
383 return self
384
385
387 """
388 Plots the purity over the efficiency also known as ROC curve
389 """
390
394
395 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
396 """
397 Add a new curve to the ROC plot
398 @param data pandas.DataFrame containing all data
399 @param column which is used to calculate efficiency and purity for different cuts
400 @param signal_mask boolean numpy.array defining which events are signal events
401 @param bckgrd_mask boolean numpy.array defining which events are background events
402 @param weight_column column in data containing the weights for each event
403 """
404 hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
405 efficiency, efficiency_error = hists.get_efficiency(['Signal'])
406 purity, purity_error = hists.get_purity(['Signal'], ['Background'])
407
408 self.xmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
409 self.ymin, self.ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymaxymaxymax])
410
411 p = self._plot_datapoints(self.axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
412 self.plotsplots.append(p)
413 if label is not None:
414 self.labelslabels.append(label)
415 else:
416 self.labelslabels.append(column)
417 return self
418
419 def finish(self):
420 """
421 Sets limits, title, axis-labels and legend of the plot
422 """
423 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
424 self.axisaxis.set_ylim((self.ymin, self.ymaxymaxymax))
425 self.axisaxis.set_title("ROC Purity Plot")
426 self.axisaxis.get_xaxis().set_label_text('Efficiency')
427 self.axisaxis.get_yaxis().set_label_text('Purity')
428 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
429 return self
430
431
433 """
434 Plots the rejection over the efficiency also known as ROC curve
435 """
436
440
441 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
442 """
443 Add a new curve to the ROC plot
444 @param data pandas.DataFrame containing all data
445 @param column which is used to calculate efficiency and purity for different cuts
446 @param signal_mask boolean numpy.array defining which events are signal events
447 @param bckgrd_mask boolean numpy.array defining which events are background events
448 @param weight_column column in data containing the weights for each event
449 """
450 hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
451 efficiency, efficiency_error = hists.get_efficiency(['Signal'])
452 rejection, rejection_error = hists.get_efficiency(['Background'])
453 rejection = 1 - rejection
454 if isinstance(efficiency, int) and not isinstance(rejection, int):
455 efficiency = numpy.array([efficiency] * len(rejection))
456 elif isinstance(rejection, int) and not isinstance(efficiency, int):
457 rejection = numpy.array([rejection] * len(efficiency))
458 elif isinstance(rejection, int) and isinstance(efficiency, int):
459 efficiency = numpy.array([efficiency])
460 rejection = numpy.array([rejection])
461
462 self.xmin, self.xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.xmin]), numpy.nanmax([efficiency.max(), self.xmaxxmaxxmax])
463 self.ymin, self.ymaxymaxymax = numpy.nanmin([rejection.min(), self.ymin]), numpy.nanmax([rejection.max(), self.ymaxymaxymax])
464
465 auc = numpy.abs(numpy.trapz(rejection, efficiency))
466
467 p = self._plot_datapoints(self.axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
468 self.plotsplots.append(p)
469 if label is not None:
470 self.labelslabels.append(label[:10] + f" ({auc:.2f})")
471 else:
472 self.labelslabels.append(column[:10] + f" ({auc:.2f})")
473 return self
474
475 def finish(self):
476 """
477 Sets limits, title, axis-labels and legend of the plot
478 """
479 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
480 self.axisaxis.set_ylim((self.ymin, self.ymaxymaxymax))
481 self.axisaxis.set_title("ROC Rejection Plot")
482 self.axisaxis.get_xaxis().set_label_text('Signal Efficiency')
483 self.axisaxis.get_yaxis().set_label_text('Background Rejection')
484 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
485 return self
486
487
489 """
490 Plots multiple other plots into a grid 3x?
491 """
492
    #: Figure which is used to draw, shared by all sub plots
    figure = None
    #: Main axis, set to the axis of the first sub plot
    axis = None
496
497 def __init__(self, cls, number_of_plots, figure=None):
498 """
499 Creates a new figure if None is given, sets the default plot parameters
500 @param figure default draw figure which is used
501 """
502 if figure is None:
503
504 self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
505 self.figurefigurefigurefigure.set_tight_layout(True)
506 else:
507 self.figurefigurefigurefigure = figure
508
509 if number_of_plots == 1:
510 gs = matplotlib.gridspec.GridSpec(1, 1)
511 elif number_of_plots == 2:
512 gs = matplotlib.gridspec.GridSpec(1, 2)
513 elif number_of_plots == 3:
514 gs = matplotlib.gridspec.GridSpec(1, 3)
515 else:
516 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
517
518
519 self.sub_plots = [cls(self.figurefigurefigurefigure, self.figurefigurefigurefigure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
520
521 self.axisaxisaxisaxis = self.sub_plots[0].axis
523
524 def add(self, i, *args, **kwargs):
525 """
526 Call add function of ith subplot
527 @param i position of the subplot
528 """
529 self.sub_plots[i].add(*args, **kwargs)
530
531 def finish(self):
532 """
533 Sets limits, title, axis-labels and legend of the plot
534 """
535 for plot in self.sub_plots:
536 plot.finish()
537 return self
538
539
541 """
542 Plots the purity in each bin over the classifier output.
543 """
544
548
549 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
550 """
551 Add a new curve to the Diagonal plot
552 @param data pandas.DataFrame containing all data
553 @param column which is used to calculate purity for different cuts
554 @param signal_mask boolean numpy.array defining which events are signal events
555 @param bckgrd_mask boolean numpy.array defining which events are background events
556 @param weight_column column in data containing the weights for each event
557 """
558 hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
559 purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])
560
561 self.xmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
562 # self.ymin, self.ymax = numpy.nanmin([numpy.nanmin(purity), self.ymin]), numpy.nanmax([numpy.nanmax(purity), self.ymax])
563 self.ymin, self.ymaxymaxymax = 0, 1
564
565 p = self._plot_datapoints(self.axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
566 self.plotsplots.append(p)
567 self.labelslabels.append(column)
568 return self
569
570 def finish(self):
571 """
572 Sets limits, title, axis-labels and legend of the plot
573 """
574 self.scale_limits()
575 self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
576 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
577 self.axisaxis.set_ylim((self.ymin, self.ymaxymaxymax))
578 self.axisaxis.set_title("Diagonal Plot")
579 self.axisaxis.get_xaxis().set_label_text('Classifier Output')
580 self.axisaxis.get_yaxis().set_label_text('Purity Per Bin')
581 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
582 return self
583
584
586 """
587 Plots distribution of a quantity
588 """
589
590 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
591 keep_first_binning=False, range_in_std=None):
592 """
593 Creates a new figure and axis if None is given, sets the default plot parameters
594 @param figure default draw figure which is used
595 @param axis default draw axis which is used
596 @param normed true if histograms should be normed before drawing
597 @param keep_first_binning use the binning of the first distribution for further plots
598 @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
599 """
600 super().__init__(figure, axis)
601
602 self.normed_to_all_entries = normed_to_all_entries
603
604 self.normed_to_bin_width = normed_to_bin_width
605
606 self.range_in_std = range_in_std
607 # if self.normed_to_all_entries or self.normed_to_bin_width:
608
609 self.yminymin = float(0)
610
611 self.ymaxymaxymax = float('-inf')
612
613 self.xminxmin = float('inf')
614
615 self.xmaxxmaxxmax = float('-inf')
616
617 self.keep_first_binning = keep_first_binning
618
619 self.first_binning = None
620
621 self.x_axis_label = ''
622
623 def add(self, data, column, mask=None, weight_column=None, label=None):
624 """
625 Add a new distribution to the plots
626 @param data pandas.DataFrame containing all data
627 @param column which is used to calculate distribution histogram
628 @param mask boolean numpy.array defining which events are used for the histogram
629 @param weight_column column in data containing the weights for each event
630 """
631 if mask is None:
632 mask = numpy.ones(len(data)).astype('bool')
633
634 bins = 100
635 if self.keep_first_binning and self.first_binning is not None:
636 bins = self.first_binning
637 hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
638 bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
639 if self.keep_first_binning and self.first_binning is None:
640 self.first_binning = hists.bins
641 hist, hist_error = hists.get_hist('Total')
642
643 if self.normed_to_all_entries:
644 normalization = float(numpy.sum(hist))
645 hist = hist / normalization
646 hist_error = hist_error / normalization
647
648 if self.normed_to_bin_width:
649 hist = hist / hists.bin_widths
650 hist_error = hist_error / hists.bin_widths
651
652 self.xminxmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xminxmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
653 self.yminymin = numpy.nanmin([hist.min(), self.yminymin])
654 self.ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.ymaxymaxymax])
655
656 p = self._plot_datapoints(self.axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
657 self.plotsplots.append(p)
658 self.x_axis_label = column
659
660 appendix = ''
661 if self.ymaxymaxymax <= self.yminymin or self.xmaxxmaxxmax <= self.xminxmin:
662 appendix = ' No data to plot!'
663
664 if label is None:
665 self.labelslabels.append(column + appendix)
666 else:
667 self.labelslabels.append(label + appendix)
668 return self
669
670 def finish(self):
671 """
672 Sets limits, title, axis-labels and legend of the plot
673 """
674 self.axisaxis.set_title("Distribution Plot")
675 self.axisaxis.get_xaxis().set_label_text(self.x_axis_label)
676
677 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
678
679 if self.ymaxymaxymax <= self.yminymin or self.xmaxxmaxxmax <= self.xminxmin:
680 self.axisaxis.set_xlim((0., 1.))
681 self.axisaxis.set_ylim((0., 1.))
682 self.axisaxis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
683 return self
684
685 self.scale_limits()
686
687 self.axisaxis.set_xlim((self.xminxmin, self.xmaxxmaxxmax))
688 self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
689
691 self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
692 elif self.normed_to_all_entries:
693 self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
694 elif self.normed_to_bin_width:
695 self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
696 else:
697 self.axisaxis.get_yaxis().set_label_text('# Entries per Bin')
698
699 return self
700
701
703 """
704 Create a boxplot
705 """
706
708
709 def __init__(self, figure=None, axis=None):
710 """
711 Creates a new figure and axis if None is given, sets the default plot parameters
712 @param figure default draw figure which is used
713 @param axis default draw axis which is used
714 """
715 super().__init__(figure=figure, axis=axis)
716
717
718 self.x_axis_label = ""
719
720 def add(self, data, column, mask=None, weight_column=None):
721 """
722 Add a new boxplot to the plots
723 @param data pandas.DataFrame containing all data
724 @param column which is used to calculate boxplot quantities
725 @param mask boolean numpy.array defining which events are used for the histogram
726 @param weight_column column in data containing the weights for each event
727 """
728 if mask is None:
729 mask = numpy.ones(len(data)).astype('bool')
730 x = data[column][mask]
731 if weight_column is not None:
732 # weight = data[weight_column][mask]
733 b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")
734
735 if len(x) == 0:
736 b2.B2WARNING("Ignore empty boxplot.")
737 return self
738
739 # we don't plot outliers as they cause the file size to explode if large datasets are used
740 p = self.axisaxis.boxplot(x, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
741 boxprops=dict(facecolor='blue', alpha=0.5), showfliers=False,
742 # medianprobs=dict(color='blue'),
743 # meanprobs=dict(color='red'),
744 )
745 self.plotsplots.append(p)
746 self.labelslabels.append(column)
747 self.x_axis_label = column
748 r"""
749 self.axisaxis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
750 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
751 self.axisaxis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
752 x.quantile(0.75) - x.quantile(0.25)),
753 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
754 self.axisaxis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
755 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
756 """
757
758 return self
759
760 def finish(self):
761 """
762 Sets limits, title, axis-labels and legend of the plot
763 """
764 matplotlib.artist.setp(self.axisaxis.get_yaxis(), visible=False)
765 self.axisaxis.get_xaxis().set_label_text(self.x_axis_label)
766 self.axisaxis.set_title("Box Plot")
767 return self
768
769
771 """
772 Plots the difference between two histograms
773 """
774
786
787 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
788 """
789 Creates a new figure and axis if None is given, sets the default plot parameters
790 @param figure default draw figure which is used
791 @param axis default draw axis which is used
792 @param normed normalize minuend and subtrahend before comparing them
793 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
794 """
795 super().__init__(figure, axis)
796 self.normed = normed
797 self.shift_to_zero = shift_to_zero
798 if self.normed:
799 self.yminymin = -0.01
800 self.ymaxymaxymax = 0.01
801 else:
802 self.yminymin = -1
803 self.ymaxymaxymax = 1
804
805 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
806 """
807 Add a new difference plot
808 @param data pandas.DataFrame containing all data
809 @param column which is used to calculate distribution histogram
810 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
811 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
812 @param weight_column column in data containing the weights for each event
813 @param label label for the legend if None, the column name is used
814 """
815 hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
816 weight_column=weight_column, equal_frequency=False)
817 minuend, minuend_error = hists.get_hist('Minuend')
818 subtrahend, subtrahend_error = hists.get_hist('Subtrahend')
819
820 difference_error = histogram.poisson_error(minuend + subtrahend)
821 if self.normed:
822 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
823 minuend = minuend / numpy.sum(minuend)
824 subtrahend = subtrahend / numpy.sum(subtrahend)
825 difference = minuend - subtrahend
826
827 if self.shift_to_zero:
828 difference = difference - numpy.mean(difference)
829
830 self.xmin, self.xmaxxmaxxmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmaxxmaxxmax)
831 self.yminymin = min((difference - difference_error).min(), self.yminymin)
832 self.ymaxymaxymax = max((difference + difference_error).max(), self.ymaxymaxymax)
833
834 p = self._plot_datapoints(self.axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
835 self.plotsplots.append(p)
836 if label is None:
837 self.labelslabels.append(label)
838 else:
839 self.labelslabels.append(column)
840 self.x_axis_label = column
841 return self
842
843 def finish(self, line_color='black'):
844 """
845 Sets limits, title, axis-labels and legend of the plot
846 """
847 self.axisaxis.plot((self.xmin, self.xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
848 self.scale_limits()
849 self.axisaxis.set_xlim((self.xmin, self.xmaxxmaxxmax))
850 self.axisaxis.set_ylim((self.yminymin, self.ymaxymaxymax))
851 self.axisaxis.set_title("Difference Plot")
852 self.axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
853 self.axisaxis.get_xaxis().set_label_text(self.x_axis_label)
854 self.axisaxis.get_yaxis().set_label_text('Difference')
855 self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
856 return self
857
858
860 """
861 Create TMVA-like overtraining control plot for a classification training
862 """
863
864
    #: Figure which is used to draw
    figure = None
    #: Main axis, used for the distribution plot
    axis = None
    #: Axis used for the difference plot of the signal (train vs. test)
    axis_d1 = None
    #: Axis used for the difference plot of the background
    axis_d2 = None
872
873 def __init__(self, figure=None):
874 """
875 Creates a new figure if None is given, sets the default plot parameters
876 @param figure default draw figure which is used
877 """
878 if figure is None:
879
880 self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
881 self.figurefigurefigurefigure.set_tight_layout(True)
882 else:
883 self.figurefigurefigurefigure = figure
884
885 gs = matplotlib.gridspec.GridSpec(5, 1)
886
888
890
892
894
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
    """
    Draws one overtraining plot: the test and training distributions of signal and background on
    the main axis, and the (train - test) difference of signal and of background on the two lower axes.
    Drawing more than one overtraining plot into the same object is not recommended, the panels
    become too crowded to read.
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param train_mask boolean numpy.array defining which events are training events
    @param test_mask boolean numpy.array defining which events are test events
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    """
    figure = self.figurefigurefigurefigure
    distribution = Distribution(figure, self.axisaxisaxisaxis, normed_to_all_entries=True)

    # Test samples come first and use the plotter-wide default options
    distribution.set_plot_options(self.plot_kwargs)
    distribution.set_errorbar_options(self.errorbar_kwargs)
    distribution.set_errorband_options(self.errorband_kwargs)
    distribution.add(data, column, test_mask & signal_mask, weight_column)
    distribution.add(data, column, test_mask & bckgrd_mask, weight_column)

    # Training samples are drawn as filled steps in the color of the matching test sample
    signal_color = distribution.plots[0][0][0].get_color()
    distribution.set_plot_options(dict(color=signal_color, linestyle='-', lw=4, drawstyle='steps-mid'))
    distribution.set_fill_options(dict(color=signal_color, alpha=0.5, step='post'))
    distribution.set_errorbar_options(None)
    distribution.set_errorband_options(None)
    distribution.add(data, column, train_mask & signal_mask, weight_column)

    bckgrd_color = distribution.plots[1][0][0].get_color()
    distribution.set_plot_options(dict(color=bckgrd_color, linestyle='-', lw=4, drawstyle='steps-mid'))
    distribution.set_fill_options(dict(color=bckgrd_color, alpha=0.5, step='post'))
    distribution.add(data, column, train_mask & bckgrd_mask, weight_column)

    distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
    distribution.finish()

    # One difference panel per class: signal on axis_d1, background on axis_d2
    difference_panels = (
        (0, self.axis_d1axis_d1, signal_mask),
        (1, self.axis_d2axis_d2, bckgrd_mask),
    )
    for plot_index, difference_axis, class_mask in difference_panels:
        color = distribution.plots[plot_index][0][0].get_color()
        # The shared plot options are modified on purpose so the difference uses the class color
        self.plot_kwargs['color'] = color
        difference = Difference(figure, difference_axis, shift_to_zero=True, normed=True)
        difference.set_plot_options(self.plot_kwargs)
        difference.set_errorbar_options(self.errorbar_kwargs)
        difference.set_errorband_options(self.errorband_kwargs)
        difference.add(data, column, train_mask & class_mask, test_mask & class_mask, weight_column)
        difference_axis.set_xlim((difference.xmin, difference.xmax))
        difference_axis.set_ylim((difference.ymin, difference.ymax))
        # plots and labels are emptied (both names refer to the same list) before finishing the panel
        difference.plots = difference.labels = []
        difference.finish(line_color=color)

    try:
        import scipy.stats
        # Kolmogorov smirnov test between training and test sample, the event weights are not used here
        ks_panels = (
            ('signal', self.axis_d1axis_d1, signal_mask),
            ('background', self.axis_d2axis_d2, bckgrd_mask),
        )
        for class_name, ks_axis, class_mask in ks_panels:
            train_values = data[column][train_mask & class_mask]
            test_values = data[column][test_mask & class_mask]
            if len(train_values) == 0 or len(test_values) == 0:
                b2.B2WARNING("Cannot calculate kolmogorov smirnov test for {} due to missing data".format(class_name))
                continue
            ks = scipy.stats.ks_2samp(train_values, test_values)
            props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
            ks_axis.text(0.1, 0.9, r'{} (train - test) difference $p={:.2f}$'.format(class_name, ks[1]),
                         fontsize=36, bbox=props,
                         verticalalignment='top', horizontalalignment='left', transform=ks_axis.transAxes)
    except ImportError:
        b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")

    return self
973
def finish(self):
    """
    Sets the titles and the x axis labelling of the three panels.
    Only the lowest panel keeps its x tick labels and carries the x axis label.
    """
    main_axis = self.axisaxisaxisaxis
    signal_axis = self.axis_d1axis_d1
    bckgrd_axis = self.axis_d2axis_d2

    for panel, title in ((main_axis, "Overtraining Plot"), (signal_axis, ""), (bckgrd_axis, "")):
        panel.set_title(title)

    for panel in (main_axis, signal_axis):
        matplotlib.artist.setp(panel.get_xticklabels(), visible=False)

    for panel, label in ((main_axis, ''), (signal_axis, ''), (bckgrd_axis, 'Classifier Output')):
        panel.get_xaxis().set_label_text(label)
    return self
987
988
990 """
991 Plots distribution of a quantity including boxplots
992 """
993
994
# Axes for the boxplots
box_axes = None
996
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
    """
    Sets up the base plotter and the distribution plot which is drawn on the main axis
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param normed true if the histograms should be normed before drawing
    @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
    """
    super().__init__(figure, axis)

    # Normalize histograms before drawing them
    self.normed = normed
    # Show only a certain range in terms of standard deviations of the data
    self.range_in_std = range_in_std

    # NOTE(review): add() and finish() use a per-instance list of box axes; its initialisation
    # is not visible in this listing - confirm it is created here.

    # The distribution plot
    self.distribution = Distribution(
        self.figurefigure,
        self.axisaxis,
        normed_to_all_entries=normed,
        range_in_std=range_in_std,
    )
1014
def add(self, data, column, mask=None, weight_column=None, label=None):
    """
    Add a new distribution plot, with additional information like a boxplot compared to
    the ordinary Distribution plot.
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param mask boolean numpy.array defining which events are used for the distribution histogram,
           None applies no preselection
    @param weight_column column in data containing the weights for each event
    @param label passed on to the distribution plot as label of the new entry
    """
    self.distribution.add(data, column, mask, weight_column, label=label)

    # Re-divide the figure: the upper three quarters for the distribution,
    # the lowest quarter split into one row per boxplot (including the new one)
    n = len(self.box_axesbox_axes) + 1
    gs = matplotlib.gridspec.GridSpec(4 * n, 1)
    gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
    box_axis = self.add_subplot(gridspecs)

    if self.range_in_std is not None:
        mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
        # Everything outside mean +- range_in_std * std is considered not inside the mask
        lower = mean - self.range_in_std * std
        upper = mean + self.range_in_std * std
        in_range = (data[column] > lower) & (data[column] < upper)
        # Without a user mask the range cut alone selects the events, None cannot be combined with &
        mask = in_range if mask is None else mask & in_range

    box = Box(self.figurefigure, box_axis)
    box.add(data, column, mask, weight_column)
    if len(box.plots) > 0:
        # Give the box the color of the distribution which was just added
        box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
    box.finish()

    self.box_axesbox_axes.append(box_axis)
    return self
1046
def finish(self):
    """
    Finishes the distribution plot, removes the x labelling from every axis except the
    lowest boxplot, and sets title and legend of the main axis
    """
    self.distribution.finish()

    main_axis = self.axisaxis
    matplotlib.artist.setp(main_axis.get_xticklabels(), visible=False)
    main_axis.get_xaxis().set_label_text('')

    box_axes = self.box_axesbox_axes
    # Every boxplot except the lowest one loses its tick labels, title and axis label
    for upper_box_axis in box_axes[:-1]:
        matplotlib.artist.setp(upper_box_axis.get_xticklabels(), visible=False)
        upper_box_axis.set_title("")
        upper_box_axis.get_xaxis().set_label_text('')
    # The lowest boxplot keeps its x labelling and only loses the title
    box_axes[-1].set_title("")

    main_axis.set_title("Distribution Plot")
    handles = [x[0] for x in self.distribution.plots]
    main_axis.legend(handles, self.distribution.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
1063
1064
1066 """
1067 Plots change of a distribution of a quantity depending on the cut on a classifier
1068 """
1069
# figure which is used to draw
figure = None
# Main axis which is used to draw
axis = None
# Axis which shows shape of signal
axis_d1 = None
# Axis which shows shape of background
axis_d2 = None
1077
def __init__(self, figure=None):
    """
    Prepares the figure the correlation plot is drawn on
    @param figure figure to draw on, a new 32x18 figure with tight layout is created if None is given
    """
    if figure is not None:
        self.figurefigurefigurefigure = figure
    else:
        self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
        self.figurefigurefigurefigure.set_tight_layout(True)

    # Grid with three rows and two columns.
    # NOTE(review): nothing visible in this listing consumes gs; the three subplot axes used by
    # add() are presumably created from it right below - confirm against the full file.
    gs = matplotlib.gridspec.GridSpec(3, 2)
1091
1093
1095
1097
1099
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
    """
    Add a new correlation plot.
    Three panels are filled: all selected events (signal or background), signal only and
    background only. Each panel shows the distribution of column, and how it changes when
    events below increasing percentiles of cut_column are removed.
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param cut_column which is used to calculate cut on the other quantity defined by column
    @param quantiles list of quantiles between 0 and 100, defining the different cuts;
           currently not read, the cuts are fixed to the percentiles 5, 10, ..., 95
    @param signal_mask boolean mask defining which events are signal events;
           combined with bckgrd_mask using |, so both masks have to be given
    @param bckgrd_mask boolean mask defining which events are background events
    @param weight_column column in data containing the weights for each event
    """
    if len(data[cut_column]) == 0:
        b2.B2WARNING("Ignore empty Correlation.")
        return self

    panel_axes = (self.axisaxisaxisaxis, self.axis_d1axis_d1, self.axis_d2axis_d2)
    selections = (('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask))

    for panel, (tag, selection) in zip(panel_axes, selections):
        values = data[column][selection]
        cut_values = data[cut_column][selection]

        if weight_column is None:
            weights = numpy.ones(len(values))
        else:
            weights = numpy.array(data[weight_column][selection])

        # Histogram range: central 90 percent of the selected values
        value_range = numpy.percentile(values, [5, 95])

        colormap = plt.get_cmap('coolwarm')
        previous, edges = numpy.histogram(values, bins=100,
                                          range=value_range, density=True, weights=weights)
        bin_center = (edges[1:] + edges[:-1]) / 2
        panel.plot(bin_center, previous, color='black', lw=1)

        for quantil in numpy.arange(5, 100, 5):
            cut = numpy.percentile(cut_values, quantil)
            passed = cut_values >= cut
            current, edges = numpy.histogram(values[passed], bins=100,
                                             range=value_range, density=True, weights=weights[passed])
            bin_center = (edges[1:] + edges[:-1]) / 2
            # Fill the area between the histograms of two consecutive cuts
            panel.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
            previous = current

        panel.set_ylim(bottom=0)

        flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
        panel.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
    return self
1144
def finish(self):
    """
    Nothing is left to finalise for this plot, the plotter itself is returned unchanged
    """
    return self
1150
1151
1153 """
1154 Plots multivariate distribution using TSNE algorithm
1155 """
1156
def add(self, data, columns, *masks):
    """
    Add a new TSNE plot: all events are embedded into two dimensions once,
    afterwards the events of each class are drawn as a separate scatter plot.
    @param data pandas.DataFrame containing all data
    @param columns which are used as input dimensions of the TSNE
    @param masks boolean masks with one entry per row of data, one mask per class to show in TSNE
    """
    try:
        import sklearn.manifold
    except ImportError:
        print("Cannot create TSNE plot. Install sklearn if you want it")
        return self

    model = sklearn.manifold.TSNE(n_components=2, random_state=0)
    # Keep the input frame under its own name, the masks select rows of it below
    features = numpy.array([data[column] for column in columns]).T
    # TSNE cannot project additional samples after the fit (it provides no transform method),
    # therefore the full sample is embedded once and each class is selected from the result
    embedding = model.fit_transform(features)
    for mask in masks:
        selected = embedding[numpy.asarray(mask, dtype=bool)]
        self.axisaxis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
    return self
1177
def finish(self):
    """
    Nothing is left to finalise for this plot, the plotter itself is returned unchanged
    """
    return self
1183
1184
1186 """
1187 Plots importance matrix
1188 """
1189
def add(self, data, columns, variables):
    """
    Add a new importance matrix, drawn as a heat map with the value written into every cell.
    Each column is rescaled on its own to the range 0 to 100; a column without any spread is shown as 0.
    @param data pandas.DataFrame containing all data
    @param columns columns of data which are shown, also used as tick labels of the x axis
    @param variables tick labels of the y axis, one per row of data
    """
    self.figurefigure.set_tight_layout(True)

    def norm(x):
        lowest = numpy.min(x)
        width = numpy.max(x) - lowest
        if width <= 0:
            return numpy.zeros(x.shape)
        return (x - lowest) / width * 100

    importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
    n_rows, n_cols = importance_matrix.shape

    axis = self.axisaxis
    importance_heatmap = axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
                                     rasterized=True)

    # put the major ticks at the middle of each cell
    axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
    axis.set_xticks(numpy.arange(n_cols) + 0.5, minor=False)

    axis.set_xticklabels(columns, minor=False, rotation=90)
    axis.set_yticklabels(variables, minor=False)

    axis.xaxis.tick_top()

    # White numbers with a black outline stay readable on every cell color
    for y, x in itertools.product(range(n_rows), range(n_cols)):
        txt = axis.text(x + 0.5, y + 0.5, f'{importance_matrix[y, x]:.0f}',
                        size=14,
                        horizontalalignment='center',
                        verticalalignment='center',
                        color='w')
        txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

    cb = self.figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
    cb.ax.set_yticklabels(['low', 'high'])

    # remove whitespace
    axis.set_ylim(0, n_rows)

    axis.set_aspect('equal')

    return self
1235
def finish(self):
    """
    Nothing is left to finalise for this plot, the plotter itself is returned unchanged
    """
    return self
1241
1242
1244 """
1245 Plots correlation matrix
1246 """
1247
# figure which is used to draw
figure = None
# Main axis which shows the correlation of the signal samples
signal_axis = None
# Axis which shows the correlation of the background samples
bckgrd_axis = None
1253
def __init__(self, figure=None):
    """
    Prepares the figure the correlation matrices are drawn on
    @param figure figure to draw on, a new 32x18 figure with tight layout is created if None is given
    """
    if figure is not None:
        self.figurefigurefigurefigure = figure
    else:
        self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
        self.figurefigurefigurefigure.set_tight_layout(True)

    # Grid with eight rows and two columns.
    # NOTE(review): nothing visible in this listing consumes gs; the signal, background and
    # colorbar axes used by add() are presumably created from it right below - confirm.
    gs = matplotlib.gridspec.GridSpec(8, 2)
1267
1269
1271
1273
1275
1277
def add(self, data, columns, signal_mask, bckgrd_mask):
    """
    Add a new correlation plot: one annotated heat map of the correlation coefficients
    (in percent) for the signal events, one for the background events, and a shared colorbar.
    @param data pandas.DataFrame containing all data
    @param columns which are used to calculate the correlations
    @param signal_mask boolean mask defining which events are signal events
    @param bckgrd_mask boolean mask defining which events are background events
    """
    signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
    bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

    signal_axis = self.signal_axissignal_axis
    bckgrd_axis = self.bckgrd_axisbckgrd_axis

    signal_heatmap = signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
    bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)

    # Both panels are treated the same way, signal always first
    panels = ((signal_axis, signal_corr), (bckgrd_axis, bckgrd_corr))

    for panel, _ in panels:
        panel.invert_yaxis()
        panel.xaxis.tick_top()

    for panel, corr in panels:
        # put the major ticks at the middle of each cell
        panel.set_xticks(numpy.arange(corr.shape[0]) + 0.5, minor=False)
        panel.set_yticks(numpy.arange(corr.shape[1]) + 0.5, minor=False)

        panel.set_xticklabels(columns, minor=False, rotation=90)
        panel.set_yticklabels(columns, minor=False)

    # White numbers with a black outline stay readable on every cell color
    for panel, corr in panels:
        for y, x in itertools.product(range(corr.shape[0]), range(corr.shape[1])):
            txt = panel.text(x + 0.5, y + 0.5, f'{corr[y, x]:.0f}',
                             size=14,
                             horizontalalignment='center',
                             verticalalignment='center',
                             color='w')
            txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

    # The colorbar is built from the signal heat map, both heat maps use the same color range
    cb = self.figurefigurefigurefigure.colorbar(signal_heatmap, cax=self.colorbar_axis,
                                                ticks=[-100, 0, 100], orientation='horizontal')
    cb.solids.set_rasterized(True)
    cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])

    for panel, caption in ((signal_axis, "Signal"), (bckgrd_axis, "Background")):
        panel.text(0.5, -1.0, caption, horizontalalignment='center')

    # remove whitespace
    for panel, corr in panels:
        panel.set_xlim(0, corr.shape[0])
        panel.set_ylim(0, corr.shape[1])
    return self
1340
def finish(self):
    """
    Hides the y tick labels of the background matrix
    """
    bckgrd_tick_labels = self.bckgrd_axisbckgrd_axis.get_yticklabels()
    matplotlib.artist.setp(bckgrd_tick_labels, visible=False)
    return self
1347
1348
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots.
        The first half of the events is drawn from a normal distribution around 0 and gets 0 in
        the last column, the second half is drawn around 1 and gets 1; the rows are shuffled.
        @param N total number of events
        @param columns names of the columns, the last one is the class column
        """
        # Integer division: the array sizes below have to be integers
        half = N // 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(half, n))
        xb = numpy.random.normal(1, size=(half, n))
        ys = numpy.zeros(half)
        yb = numpy.ones(half)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # First half of the (shuffled) rows is used as training sample, the rest as test sample.
    # Assigned in one step by position instead of a chained assignment with a float slice.
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    # NOTE(review): no constructor call is visible in this listing for the next three plots,
    # as written p still refers to the previous plotter - confirm against the full file.
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_purity_plot.png')

    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test')
    p.add(data, 'NeuroBayes', data['type'] == 'Train', data['type'] == 'Test')
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('overtraining_plot.png')

    p = Correlation()
    # Both masks are needed, add() combines them with | for the first panel
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # add() requires the signal and the background mask
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:718
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:720
def __init__(self, figure=None, axis=None)
Definition: plotting.py:709
def finish(self)
Definition: plotting.py:760
signal_axis
add signal subplot
Definition: plotting.py:1268
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1278
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1272
None bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1252
def __init__(self, figure=None)
Definition: plotting.py:1254
None figure
figure which is used to draw
Definition: plotting.py:1248
None signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1250
bckgrd_axis
add background subplot
Definition: plotting.py:1270
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1274
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1100
axis_d1
define second subplot
Definition: plotting.py:1094
figure
create figure
Definition: plotting.py:1085
None axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1074
None axis
Main axis which is used to draw.
Definition: plotting.py:1072
def __init__(self, figure=None)
Definition: plotting.py:1078
axis_d2
define third subplot
Definition: plotting.py:1096
None figure
figure which is used to draw
Definition: plotting.py:1070
None axis_d2
Axis which shows shape of background.
Definition: plotting.py:1076
axis
define first subplot
Definition: plotting.py:1092
ymax
Maximum y value.
Definition: plotting.py:563
xmax
Maximum x value.
Definition: plotting.py:561
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:549
def finish(self)
Definition: plotting.py:570
x_axis_label
Label on x axis.
Definition: plotting.py:840
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:797
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:787
ymax
Maximum y value.
Definition: plotting.py:800
xmax
Maximum x value.
Definition: plotting.py:830
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:805
ymin
min y value
Definition: plotting.py:799
def finish(self, line_color='black')
Definition: plotting.py:843
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:796
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:591
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:623
x_axis_label
x axis label
Definition: plotting.py:621
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:617
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:602
first_binning
first binning
Definition: plotting.py:619
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:606
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:604
def add(self, data, columns, variables)
Definition: plotting.py:1190
def finish(self)
Definition: plotting.py:1236
def add(self, i, *args, **kwargs)
Definition: plotting.py:524
figure
create figure
Definition: plotting.py:504
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:497
None axis
Main axis.
Definition: plotting.py:495
None figure
figure which is used to draw
Definition: plotting.py:493
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:519
axis
the axis of the first subplot
Definition: plotting.py:521
def finish(self)
Definition: plotting.py:531
axis_d1
define second subplot
Definition: plotting.py:889
figure
create figure
Definition: plotting.py:880
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:895
None axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:869
None axis
Main axis which is used to draw.
Definition: plotting.py:867
def __init__(self, figure=None)
Definition: plotting.py:873
axis_d2
define third subplot
Definition: plotting.py:891
None figure
figure which is used to draw
Definition: plotting.py:865
None axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:871
axis
define first subplot
Definition: plotting.py:887
def finish(self, *args, **kwargs)
Definition: plotting.py:260
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:118
None ymin
Minimum y value.
Definition: plotting.py:68
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:167
plots
create empty list for plots
Definition: plotting.py:99
float xscale
limit scale
Definition: plotting.py:72
figure
create figure
Definition: plotting.py:87
None ymax
Maximum y value.
Definition: plotting.py:70
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:116
None axis
Main axis which is used to draw.
Definition: plotting.py:76
def scale_limits(self)
Definition: plotting.py:266
def add(self, *args, **kwargs)
Definition: plotting.py:254
None xmin
Minimum x value.
Definition: plotting.py:64
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:175
def save(self, filename)
Definition: plotting.py:140
def __init__(self, figure=None, axis=None)
Definition: plotting.py:78
None figure
figure which is used to draw
Definition: plotting.py:74
ymax
set y limits
Definition: plotting.py:105
None plots
Plots added to the axis so far.
Definition: plotting.py:60
prop_cycler
Property cycler used to give plots unique colors.
Definition: plotting.py:126
xmax
set x limits
Definition: plotting.py:103
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:114
labels
create empty list for labels
Definition: plotting.py:101
axis
divide figure into subplots
Definition: plotting.py:94
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:183
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:159
float yscale
limit scale
Definition: plotting.py:71
None labels
Labels of the plots added so far.
Definition: plotting.py:62
def add_subplot(self, gridspecs)
Definition: plotting.py:128
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:151
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:112
None xmax
Maximum x value.
Definition: plotting.py:66
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:286
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:395
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:441
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:348
def add(self, data, columns, *masks)
Definition: plotting.py:1157
def finish(self)
Definition: plotting.py:1178
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:1015
distribution
The distribution plot.
Definition: plotting.py:1013
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:1009
None box_axes
Axes for the boxplots.
Definition: plotting.py:995
box_axes
create empty list for box axes
Definition: plotting.py:1011
normed
Normalize histograms before drawing them.
Definition: plotting.py:1007
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:997
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1