Belle II Software light-2406-ragdoll
plotting.py
1#!/usr/bin/env python3
2
3
4
11
12import copy
13import math
14
15import pandas
16import numpy
17import itertools
18import matplotlib.pyplot as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects as PathEffects
26
27
28from basf2_mva_evaluation import histogram
29
30import basf2 as b2
31
32import basf2_mva_util
33import matplotlib
34
35# Do not use the standard TkAgg backend, because it is NOT thread-safe.
36# Otherwise you will get a "RuntimeError: main thread is not in main loop".
37matplotlib.use("svg")
38matplotlib.rcParams.update({'font.size': 36})
39
40# Use the Belle II style while producing the plots
41plt.style.use("belle2")
42
43
class Plotter:
    """
    Base class for all Plotters.

    Holds the figure and axis that are drawn on, the list of already drawn
    plots with their legend labels, the axis limits and the keyword arguments
    forwarded to the matplotlib drawing functions.
    """

    # Class-level defaults; declared here so that doxygen documents the attributes.

    #: Plots added to the axis so far
    plots = None
    #: Legend labels of the plots added so far
    labels = None
    #: Minimum x value
    xmin = None
    #: Maximum x value
    xmax = None
    #: Minimum y value
    ymin = None
    #: Maximum y value
    ymax = None
    #: Relative margin added to the y limits by scale_limits
    yscale = 0.0
    #: Relative margin added to the x limits by scale_limits
    xscale = 0.0
    #: Figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        #: Default keyword arguments for plot function
        self.plot_kwargs = None
        #: Default keyword arguments for errorbar function
        self.errorbar_kwargs = None
        #: Default keyword arguments for errorband function
        self.errorband_kwargs = None
        #: Default keyword arguments for fill_between function
        self.fill_kwargs = None

        # Install the defaults of all four option groups. _plot_datapoints reads
        # every one of them, so none may be left undefined.
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

        #: Endless iterator over the style property cycle, used to pick colors
        self.prop_cycler = itertools.cycle(plt.rcParams["axes.prop_cycle"])

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        return self.figure.add_subplot(gridspecs[-1], sharex=self.axis)

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function
        @return self
        """
        # The argument is copied, so neither the shared default nor the caller's dict is modified later.
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for filling the area below the datapoints
        @param fill_kwargs keyword arguments for the fill_between function
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    @staticmethod
    def _as_error_array(err, size):
        """
        Broadcast an error specification to a float array of the given size
        @param err None, a scalar or a sequence of length size
        @param size number of data points
        @return None if err is None, otherwise a numpy array with size entries
        """
        if err is None:
            return None
        return numpy.asarray(err, dtype=float) * numpy.ones(size)

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is used to draw
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (legend handles, plot artist, errorbar artist, errorband artist);
                artists which were not drawn are None
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            color = next(self.prop_cycler)['color']
            # plot_kwargs may be None (plot disabled); only store the color if there is a dict.
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)

        # Semi-transparent patch used as first legend handle; get_color is aliased
        # because callers read the color back via get_color().
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                errorbar_kwargs['ecolor'] = [0.5 * c for c in color]

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            xerr = self._as_error_array(xerr, len(x))
            yerr = self._as_error_array(yerr, len(x))
            # Only one of the two errors may be given, None entries are skipped.
            mask = numpy.logical_and.reduce([numpy.isfinite(v) for v in (x, y, xerr, yerr) if v is not None])

            def _clipped(err):
                # Negative errors are clipped to zero before drawing.
                if err is None:
                    return None
                return numpy.where(err[mask] < 0, 0.0, err[mask])

            e = axis.errorbar(x[mask], y[mask], xerr=_clipped(xerr), yerr=_clipped(yerr),
                              rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = self._as_error_array(xerr, len(x))
                yerr = self._as_error_array(yerr, len(x))
                # One rectangle per data point spanning +- the errors in both directions.
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                rasterized=True, **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None:
            # A missing or scalar xerr is broadcast so that the last bin can be extended below.
            xerr = self._as_error_array(0.0 if xerr is None else xerr, len(x))
            # to fill the last bin of a histogram
            x = numpy.append(x, x[-1] + 2 * xerr[-1])
            y = numpy.append(y, y[-1])
            xerr = numpy.append(xerr, xerr[-1])

            axis.fill_between(x - xerr, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        @return self
        """
        # copysign makes sure that the limits move away from the data for both signs.
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
275
276
278 """
279 Plots the purity and the efficiency over the cut value (for cut choosing)
280 """
281
285
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
    """
    Add a new pair of curves to the plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param normed if True efficiency and purity are drawn, otherwise true positives and false positives
    @return self
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask},
                                 weight_column=weight_column)

    if normed:
        first, first_error = hists.get_efficiency(['Signal'])
        second, second_error = hists.get_purity(['Signal'], ['Background'])
        first_label, second_label = "Efficiency", "Purity"
    else:
        first, first_error = hists.get_true_positives(['Signal'])
        second, second_error = hists.get_false_positives(['Background'])
        first_label, second_label = "True positive", "False positive"

    cuts = hists.bin_centers

    # Grow the limits so that they contain the new curves; nan entries are ignored.
    self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
    self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(first), numpy.nanmin(second), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(first), numpy.nanmax(second), self.ymax])

    for values, errors, label in ((first, first_error, first_label), (second, second_error, second_label)):
        self.plots.append(self._plot_datapoints(self.axis, cuts, values, xerr=0, yerr=errors))
        self.labels.append(label)

    return self
326
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Classification Plot")
    self.axis.get_xaxis().set_label_text('Cut Value')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
337
338
340 """
341 Plots the signal to noise ratio over the cut value (for cut choosing)
342 """
343
347
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
    """
    Add a new curve to the plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate signal to noise ratio for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param normed accepted for interface compatibility, not used by this plot
    @return self
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask},
                                 weight_column=weight_column)

    signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])

    cuts = hists.bin_centers

    # Grow the limits so that they contain the new curve; nan entries are ignored.
    self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
    self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(signal2noise), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(signal2noise), self.ymax])

    self.plots.append(self._plot_datapoints(self.axis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
    self.labels.append(column)

    return self
373
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Signal to Noise Plot")
    self.axis.get_xaxis().set_label_text('Cut Value')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
384
385
387 """
388 Plots the purity over the efficiency also known as ROC curve
389 """
390
394
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used
    @return self
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask},
                                 weight_column=weight_column)
    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    purity, purity_error = hists.get_purity(['Signal'], ['Background'])

    # Grow the limits so that they contain the new curve; nan entries are ignored.
    self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
    self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(purity), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(purity), self.ymax])

    self.plots.append(self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error))
    self.labels.append(column if label is None else label)
    return self
418
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("ROC Purity Plot")
    self.axis.get_xaxis().set_label_text('Efficiency')
    self.axis.get_yaxis().set_label_text('Purity')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
430
431
433 """
434 Plots the rejection over the efficiency also known as ROC curve
435 """
436
440
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and rejection for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used; truncated to 10 characters
    @return self
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask},
                                 weight_column=weight_column)
    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    rejection, rejection_error = hists.get_efficiency(['Background'])
    rejection = 1 - rejection

    # Either quantity may come back as a plain int instead of an array;
    # promote it to an array matching the other one.
    efficiency_is_int = isinstance(efficiency, int)
    rejection_is_int = isinstance(rejection, int)
    if efficiency_is_int and rejection_is_int:
        efficiency = numpy.array([efficiency])
        rejection = numpy.array([rejection])
    elif efficiency_is_int:
        efficiency = numpy.array([efficiency] * len(rejection))
    elif rejection_is_int:
        rejection = numpy.array([rejection] * len(efficiency))

    # Grow the limits so that they contain the new curve; nan entries are ignored.
    self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
    self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
    self.ymin = numpy.nanmin([rejection.min(), self.ymin])
    self.ymax = numpy.nanmax([rejection.max(), self.ymax])

    # numpy.trapz is deprecated in NumPy 2 in favour of numpy.trapezoid; use whichever is available.
    trapezoid = getattr(numpy, 'trapezoid', None)
    if trapezoid is None:
        trapezoid = numpy.trapz
    # abs() because the sign of the integral depends on the ordering of the efficiency values.
    auc = numpy.abs(trapezoid(rejection, efficiency))

    self.plots.append(self._plot_datapoints(self.axis, efficiency, rejection,
                                            xerr=efficiency_error, yerr=rejection_error))
    name = column if label is None else label
    self.labels.append(name[:10] + f" ({auc:.2f})")
    return self
474
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("ROC Rejection Plot")
    self.axis.get_xaxis().set_label_text('Signal Efficiency')
    self.axis.get_yaxis().set_label_text('Background Rejection')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
486
487
489 """
490 Plots multiple other plots into a grid 3x?
491 """
492
#: Class-level default for the figure; __init__ replaces it per instance
493 figure = None
494
#: Class-level default for the main axis; __init__ sets it to the axis of the first sub-plot
495 axis = None
496
def __init__(self, cls, number_of_plots, figure=None):
    """
    Creates a new figure if None is given and one sub-plotter per requested plot
    @param cls plotter class which is instantiated for every sub-plot, called as cls(figure, axis)
    @param number_of_plots number of sub-plots, arranged in a grid with at most 3 columns
    @param figure default draw figure which is used
    """
    if number_of_plots < 1:
        raise ValueError("number_of_plots must be at least 1")

    if figure is None:
        self.figure = matplotlib.figure.Figure(figsize=(32, 18))
        self.figure.set_tight_layout(True)
    else:
        self.figure = figure

    # Up to three plots share a single row, afterwards rows of three are used.
    n_columns = min(number_of_plots, 3)
    n_rows = int(numpy.ceil(number_of_plots / 3))
    gs = matplotlib.gridspec.GridSpec(n_rows, n_columns)

    #: the sub-plotters, filled row by row
    self.sub_plots = [cls(self.figure, self.figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
    #: the axis of the first sub-plot
    self.axis = self.sub_plots[0].axis
    # NOTE(review): the source listing skips one line after this assignment
    # (presumably the base-class initialiser call) - confirm against the repository.
523
def add(self, i, *args, **kwargs):
    """
    Call add function of ith subplot
    @param i position of the subplot
    @param args positional arguments forwarded to the add function of the subplot
    @param kwargs keyword arguments forwarded to the add function of the subplot
    @return self
    """
    self.sub_plots[i].add(*args, **kwargs)
    return self
530
def finish(self):
    """
    Calls finish on every sub-plot, in the order the sub-plots were created
    @return self
    """
    for sub_plot in self.sub_plots:
        sub_plot.finish()
    return self
538
539
541 """
542 Plots the purity in each bin over the classifier output.
543 """
544
548
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
    """
    Add a new curve to the Diagonal plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @return self
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask},
                                 weight_column=weight_column)
    purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

    centers = hists.bin_centers
    self.xmin = min(centers.min(), self.xmin)
    self.xmax = max(centers.max(), self.xmax)
    # The y range is fixed to [0, 1] instead of being derived from the data.
    self.ymin, self.ymax = 0, 1

    self.plots.append(self._plot_datapoints(self.axis, centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error))
    self.labels.append(column)
    return self
569
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.scale_limits()
    # Reference diagonal from (0, 0) to (1, 1).
    self.axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Diagonal Plot")
    self.axis.get_xaxis().set_label_text('Classifier Output')
    self.axis.get_yaxis().set_label_text('Purity Per Bin')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
583
584
586 """
587 Plots distribution of a quantity
588 """
589
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
             keep_first_binning=False, range_in_std=None):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param normed_to_all_entries true if histograms should be divided by their total number of entries
    @param normed_to_bin_width true if histograms should be divided by the bin widths
    @param keep_first_binning use the binning of the first distribution for further plots
    @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
    """
    super().__init__(figure, axis)
    #: Normalize histograms to the total number of entries before drawing
    self.normed_to_all_entries = normed_to_all_entries
    #: Normalize histograms to the bin width before drawing
    self.normed_to_bin_width = normed_to_bin_width
    #: Window around the mean, in units of the standard deviation, passed on to the histogram
    self.range_in_std = range_in_std

    # Start with inverted (empty) x and upper y limits so that the first added
    # distribution defines them; the lower y limit starts at zero.
    self.ymin = float(0)
    self.ymax = float('-inf')
    self.xmin = float('inf')
    self.xmax = float('-inf')

    #: Keep first binning if user wants so
    self.keep_first_binning = keep_first_binning
    #: first binning
    self.first_binning = None
    #: x axis label
    self.x_axis_label = ''
622
def add(self, data, column, mask=None, weight_column=None, label=None):
    """
    Add a new distribution to the plots
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param mask boolean numpy.array defining which events are used for the histogram, all events if None
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used
    @return self
    """
    if mask is None:
        mask = numpy.ones(len(data)).astype('bool')

    # Reuse the binning of the first distribution if requested, otherwise use 100 bins.
    reuse_binning = self.keep_first_binning and self.first_binning is not None
    bins = self.first_binning if reuse_binning else 100
    hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                 bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
    if self.keep_first_binning and self.first_binning is None:
        self.first_binning = hists.bins
    hist, hist_error = hists.get_hist('Total')

    if self.normed_to_all_entries:
        normalization = float(numpy.sum(hist))
        hist = hist / normalization
        hist_error = hist_error / normalization

    if self.normed_to_bin_width:
        hist = hist / hists.bin_widths
        hist_error = hist_error / hists.bin_widths

    centers = hists.bin_centers
    self.xmin = min(centers.min(), self.xmin)
    self.xmax = max(centers.max(), self.xmax)
    self.ymin = numpy.nanmin([hist.min(), self.ymin])
    # The upper limit includes the error bars.
    self.ymax = numpy.nanmax([(hist + hist_error).max(), self.ymax])

    self.plots.append(self._plot_datapoints(self.axis, centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error))
    self.x_axis_label = column

    # Degenerate limits mean that nothing drawable was added so far; flag it in the legend.
    appendix = ''
    if self.ymax <= self.ymin or self.xmax <= self.xmin:
        appendix = ' No data to plot!'

    self.labels.append((column if label is None else label) + appendix)
    return self
669
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    @return self
    """
    self.axis.set_title("Distribution Plot")
    self.axis.get_xaxis().set_label_text(self.x_axis_label)

    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)

    # Degenerate limits: show a placeholder text on a unit axis instead.
    if self.ymax <= self.ymin or self.xmax <= self.xmin:
        self.axis.set_xlim((0., 1.))
        self.axis.set_ylim((0., 1.))
        self.axis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
        return self

    self.scale_limits()

    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))

    # The y label states which normalizations were applied.
    if self.normed_to_all_entries and self.normed_to_bin_width:
        y_label = '# Entries per Bin / (# Entries * Bin Width)'
    elif self.normed_to_all_entries:
        y_label = '# Entries per Bin / # Entries'
    elif self.normed_to_bin_width:
        y_label = '# Entries per Bin / Bin Width'
    else:
        y_label = '# Entries per Bin'
    self.axis.get_yaxis().set_label_text(y_label)

    return self
700
701
703 """
704 Create a boxplot
705 """
706
708
def __init__(self, figure=None, axis=None):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    """
    super().__init__(figure=figure, axis=axis)

    #: Label on x axis, set to the column name by add
    self.x_axis_label = ""
719
def add(self, data, column, mask=None, weight_column=None):
    """
    Add a new boxplot to the plots
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate boxplot quantities
    @param mask boolean numpy.array defining which events are used for the boxplot, all events if None
    @param weight_column column in data containing the weights for each event, currently ignored
    @return self
    """
    if mask is None:
        mask = numpy.ones(len(data)).astype('bool')
    x = data[column][mask]
    if weight_column is not None:
        b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

    if len(x) == 0:
        b2.B2WARNING("Ignore empty boxplot.")
        return self

    # Horizontal box with whiskers at 1.5 IQR, black dots for outliers and a marker for the mean.
    p = self.axis.boxplot(x, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
                          boxprops=dict(facecolor='blue', alpha=0.5))
    self.plots.append(p)
    self.labels.append(column)
    self.x_axis_label = column

    return self
758
def finish(self):
    """
    Sets title and x axis label of the plot and hides the y axis
    @return self
    """
    matplotlib.artist.setp(self.axis.get_yaxis(), visible=False)
    self.axis.get_xaxis().set_label_text(self.x_axis_label)
    self.axis.set_title("Box Plot")
    return self
767
768
770 """
771 Plots the difference between two histograms
772 """
773
785
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param normed normalize minuend and subtrahend before comparing them
    @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
    """
    super().__init__(figure, axis)
    #: Minuend and subtrahend are normalized before comparing them if this is true
    self.normed = normed
    #: Mean difference is shifted to zero if this is true
    self.shift_to_zero = shift_to_zero
    # Initial y range; add() only ever widens it. Normalized differences are much smaller.
    if self.normed:
        self.ymin = -0.01
        self.ymax = 0.01
    else:
        self.ymin = -1
        self.ymax = 1
    #: Label on x axis; finish() reads it, so it must exist even if add() was never called
    self.x_axis_label = ''
803
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
    """
    Add a new difference plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
    @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
    @param weight_column column in data containing the weights for each event
    @param label label for the legend if None, the column name is used
    @return self
    """
    hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
                                 weight_column=weight_column, equal_frequency=False)
    minuend, minuend_error = hists.get_hist('Minuend')
    subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

    # The error is computed from the combined (unnormalized) bin contents.
    difference_error = histogram.poisson_error(minuend + subtrahend)
    if self.normed:
        difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
        minuend = minuend / numpy.sum(minuend)
        subtrahend = subtrahend / numpy.sum(subtrahend)
    difference = minuend - subtrahend

    if self.shift_to_zero:
        difference = difference - numpy.mean(difference)

    centers = hists.bin_centers
    self.xmin = min(centers.min(), self.xmin)
    self.xmax = max(centers.max(), self.xmax)
    # The y limits include the error bars.
    self.ymin = min((difference - difference_error).min(), self.ymin)
    self.ymax = max((difference + difference_error).max(), self.ymax)

    self.plots.append(self._plot_datapoints(self.axis, centers, difference,
                                            xerr=hists.bin_widths / 2, yerr=difference_error))
    # Fall back to the column name only if no explicit label was given.
    self.labels.append(column if label is None else label)
    self.x_axis_label = column
    return self
841
def finish(self, line_color='black'):
    """
    Sets limits, title, axis-labels and legend of the plot
    @param line_color color of the horizontal reference line at zero
    @return self
    """
    # The zero line is drawn over the unscaled x range, before the limits are widened.
    self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
    self.scale_limits()
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Difference Plot")
    self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
    self.axis.get_xaxis().set_label_text(self.x_axis_label)
    self.axis.get_yaxis().set_label_text('Difference')
    # The first entry of every stored plot is the tuple of legend handles.
    handles = [plot[0] for plot in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
856
857
859 """
860 Create TMVA-like overtraining control plot for a classification training
861 """
862
863
#: Class-level default for the figure; __init__ replaces it per instance
864 figure = None
865
#: Axis on which add() draws the train/test distributions
866 axis = None
867
#: Axis on which add() draws the train-test difference of the signal
868 axis_d1 = None
869
#: Axis on which add() draws the train-test difference of the background
870 axis_d2 = None
871
def __init__(self, figure=None):
    """
    Creates a new figure if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    """
    if figure is None:
        self.figure = matplotlib.figure.Figure(figsize=(32, 18))
        self.figure.set_tight_layout(True)
    else:
        self.figure = figure

    gs = matplotlib.gridspec.GridSpec(5, 1)
    # NOTE(review): the statements consuming `gs` are missing from the source listing.
    # They are restored here from how add() uses the three axes and the plot options;
    # the 3/1/1 row split is an assumption - confirm against the repository.
    #: Main axis which shows the distributions
    self.axis = self.figure.add_subplot(gs[:3, :])
    #: Axis which shows the difference between training and test signal
    self.axis_d1 = self.figure.add_subplot(gs[3, :], sharex=self.axis)
    #: Axis which shows the difference between training and test background
    self.axis_d2 = self.figure.add_subplot(gs[4, :], sharex=self.axis)

    # Base initialisation provides the plot/errorbar/errorband options read by add().
    super().__init__(self.figure, self.axis)
893
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
    """
    Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
    otherwise there are too many curves in the plot to recognize anything.
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param train_mask boolean numpy.array defining which events are training events
    @param test_mask boolean numpy.array defining which events are test events
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @return self, so that calls can be chained
    """
    distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

    # Test samples are drawn first, using the default plot/errorbar/errorband options
    distribution.set_plot_options(self.plot_kwargs)
    distribution.set_errorbar_options(self.errorbar_kwargs)
    distribution.set_errorband_options(self.errorband_kwargs)
    distribution.add(data, column, test_mask & signal_mask, weight_column)
    distribution.add(data, column, test_mask & bckgrd_mask, weight_column)

    # The training samples and the difference panels reuse the colours of the test samples
    signal_color = distribution.plots[0][0][0].get_color()
    bckgrd_color = distribution.plots[1][0][0].get_color()

    # Training samples are drawn as filled step histograms without errorbars/errorbands
    distribution.set_errorbar_options(None)
    distribution.set_errorband_options(None)
    for color, class_mask in ((signal_color, signal_mask), (bckgrd_color, bckgrd_mask)):
        distribution.set_plot_options({'color': color, 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
        distribution.set_fill_options({'color': color, 'alpha': 0.5, 'step': 'post'})
        distribution.add(data, column, train_mask & class_mask, weight_column)

    distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
    distribution.finish()

    # One (train - test) difference panel per class, drawn in the colour of that class
    for diff_axis, color, class_mask in ((self.axis_d1, signal_color, signal_mask),
                                         (self.axis_d2, bckgrd_color, bckgrd_mask)):
        self.plot_kwargs['color'] = color
        difference = Difference(self.figure, diff_axis, shift_to_zero=True, normed=True)
        difference.set_plot_options(self.plot_kwargs)
        difference.set_errorbar_options(self.errorbar_kwargs)
        difference.set_errorband_options(self.errorband_kwargs)
        difference.add(data, column, train_mask & class_mask, test_mask & class_mask, weight_column)
        diff_axis.set_xlim((difference.xmin, difference.xmax))
        diff_axis.set_ylim((difference.ymin, difference.ymax))
        # Clear the recorded plots and labels before finish() is called
        difference.plots = []
        difference.labels = []
        difference.finish(line_color=color)

    # scipy is optional: without it the plot is produced without the p-value boxes
    try:
        import scipy.stats
    except ImportError:
        b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
        return self

    # Kolmogorov smirnov test between the training and the test sample of each class
    props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
    for diff_axis, name, class_mask in ((self.axis_d1, 'signal', signal_mask),
                                        (self.axis_d2, 'background', bckgrd_mask)):
        train_values = data[column][train_mask & class_mask]
        test_values = data[column][test_mask & class_mask]
        if len(train_values) == 0 or len(test_values) == 0:
            b2.B2WARNING(f"Cannot calculate kolmogorov smirnov test for {name} due to missing data")
            continue
        ks = scipy.stats.ks_2samp(train_values, test_values)
        # ks[1] is the p-value of the two-sample test
        diff_axis.text(0.1, 0.9, rf'{name} (train - test) difference $p={ks[1]:.2f}$', fontsize=36, bbox=props,
                       verticalalignment='top', horizontalalignment='left', transform=diff_axis.transAxes)

    return self
972
def finish(self):
    """
    Sets the title and the axis labels of the plot.
    Only the lowest panel keeps its x tick labels and gets an x axis label.
    @return self, so that calls can be chained
    """
    self.axis.set_title("Overtraining Plot")
    self.axis_d1.set_title("")
    self.axis_d2.set_title("")
    # Hide the x tick labels of the two upper panels
    matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
    matplotlib.artist.setp(self.axis_d1.get_xticklabels(), visible=False)
    self.axis.get_xaxis().set_label_text('')
    self.axis_d1.get_xaxis().set_label_text('')
    self.axis_d2.get_xaxis().set_label_text('Classifier Output')
    return self
986
987
# NOTE(review): the class header line was missing from this listing; the name and
# base class are restored -- confirm against the upstream file.
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes for the boxplots
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super().__init__(figure, axis)
        #: Normalize histograms before drawing them
        self.normed = normed
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        #: create empty list for box axes
        self.box_axes = []
        #: The distribution plot
        self.distribution = Distribution(self.figure, self.axis,
                                         normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label which is passed on to the distribution plot
        @return self, so that calls can be chained
        """
        self.distribution.add(data, column, mask, weight_column, label=label)

        # Re-divide the figure: the distribution keeps the upper three quarters,
        # the lower quarter is split evenly between all boxplots added so far.
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            if mask is None:
                # mask defaults to None, which cannot be combined with the range selection below
                mask = numpy.ones(len(data), dtype=bool)
            mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            lower = mean - self.range_in_std * std
            upper = mean + self.range_in_std * std
            mask = mask & (data[column] > lower) & (data[column] < upper)

        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if len(box.plots) > 0:
            # Draw the box in the colour of the distribution which was just added
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self, so that calls can be chained
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # Only the last boxplot keeps its x tick labels and x axis label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        if self.box_axes:
            # box_axes is empty if finish() is called before any add()
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1062
1063
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None
    #: Axis which shows shape of signal
    axis_d1 = None
    #: Axis which shows shape of background
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            #: create figure
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # NOTE(review): the subplot assignments and the base-class call were missing
        # from this listing. add() needs self.axis, self.axis_d1 and self.axis_d2, so
        # they are restored here; the grid slices and the shared x axis are
        # reconstructed -- confirm against the upstream file.
        gs = matplotlib.gridspec.GridSpec(3, 2)
        #: define first subplot
        self.axis = self.figure.add_subplot(gs[0, :])
        #: define second subplot
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        #: define third subplot
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100, defining the different cuts.
               Currently not used: the cuts are always taken at the 5%, 10%, ..., 95% quantiles.
        @param signal_mask boolean numpy.array defining which events are signal events;
               the signal panel is skipped if None
        @param bckgrd_mask boolean numpy.array defining which events are background events;
               the background panel is skipped if None
        @param weight_column column in data containing the weights for each event
        @return self, so that calls can be chained
        """
        if len(data[cut_column]) == 0:
            b2.B2WARNING("Ignore empty Correlation.")
            return self

        # The first panel shows signal and background together. A missing mask cannot
        # be combined with '|', so fall back to the one given mask or to all events.
        if signal_mask is not None and bckgrd_mask is not None:
            inclusive_mask = signal_mask | bckgrd_mask
        elif signal_mask is not None:
            inclusive_mask = signal_mask
        elif bckgrd_mask is not None:
            inclusive_mask = bckgrd_mask
        else:
            inclusive_mask = numpy.ones(len(data), dtype=bool)

        colormap = plt.get_cmap('coolwarm')
        panels = [
            (self.axis, '.', inclusive_mask),
            (self.axis_d1, 'S', signal_mask),
            (self.axis_d2, 'B', bckgrd_mask),
        ]

        for axis, panel_label, mask in panels:
            if mask is None:
                continue

            values = data[column][mask]
            cut_values = data[cut_column][mask]
            if len(values) == 0:
                # numpy.percentile cannot handle an empty selection
                b2.B2WARNING("Ignore empty selection in Correlation.")
                continue

            if weight_column is not None:
                weights = numpy.array(data[weight_column][mask])
            else:
                weights = numpy.ones(len(values))

            # Restrict the histogram to the central 90% of the selected events
            value_range = numpy.percentile(values, [5, 95])

            previous, edges = numpy.histogram(values, bins=100, range=value_range, density=True, weights=weights)
            bin_center = (edges[1:] + edges[:-1]) / 2
            axis.plot(bin_center, previous, color='black', lw=1)

            # Tighten the cut step by step and fill the area between consecutive
            # normalised histograms with a colour which encodes the quantile.
            for quantil in numpy.arange(5, 100, 5):
                cut = numpy.percentile(cut_values, quantil)
                selected = numpy.asarray(cut_values >= cut)
                current, edges = numpy.histogram(values[selected], bins=100, range=value_range,
                                                 density=True, weights=weights[selected])
                bin_center = (edges[1:] + edges[:-1]) / 2
                axis.fill_between(bin_center, previous, current, color=colormap(quantil / 100.0))
                previous = current

            axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            axis.set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(
                panel_label, flatness_score))
        return self

    def finish(self):
        """
        Nothing is left to do here, all panels are completely drawn in add()
        @return self, so that calls can be chained
        """
        return self
1149
1150
# NOTE(review): the class header line was missing from this listing; the name and
# base class are restored -- confirm against the upstream file.
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input of the TSNE embedding
        @param masks different classes to show in TSNE, one scatter series is drawn per mask
        @return self, so that calls can be chained
        """
        try:
            import sklearn.manifold
        except ImportError:
            b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        # One row per event, one column per input variable
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE provides no transform() for already fitted models, therefore
        # all events are embedded once and each class is selected from the result.
        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        embedding = model.fit_transform(features)
        for mask in masks:
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
        return self

    def finish(self):
        """
        Nothing is left to do here, the plot is completely drawn in add()
        @return self, so that calls can be chained
        """
        return self
1182
1183
# NOTE(review): the class header line was missing from this listing; the name and
# base class are restored -- confirm against the upstream file.
class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables):
        """
        Add a new importance matrix plot.
        @param data pandas.DataFrame containing all data
        @param columns columns of data which are drawn, one matrix column each; also used as x tick labels
        @param variables labels of the rows of data, used as y tick labels
        @return self, so that calls can be chained
        """
        self.figure.set_tight_layout(True)

        def norm(x):
            """Scale x linearly to the range [0, 100]; constant input is mapped to zeros."""
            lowest = numpy.min(x)
            width = numpy.max(x) - lowest
            if width <= 0:
                return numpy.zeros(x.shape)
            return (x - lowest) / width * 100

        # Each column is normalised on its own, rows correspond to the rows of data
        importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
        importance_heatmap = self.axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
                                              rasterized=True)

        # put the major ticks at the middle of each cell
        self.axis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=False)
        self.axis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=False)

        self.axis.set_xticklabels(columns, minor=False, rotation=90)
        self.axis.set_yticklabels(variables, minor=False)

        self.axis.xaxis.tick_top()

        # Write the rounded value into each cell, white text with a black outline
        for (y, x), value in numpy.ndenumerate(importance_matrix):
            txt = self.axis.text(x + 0.5, y + 0.5, f'{value:.0f}',
                                 size=14,
                                 horizontalalignment='center',
                                 verticalalignment='center',
                                 color='w')
            txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

        cb = self.figure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
        cb.ax.set_yticklabels(['low', 'high'])

        # remove whitespace
        self.axis.set_ylim(0, importance_matrix.shape[0])

        self.axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Nothing is left to do here, the plot is completely drawn in add()
        @return self, so that calls can be chained
        """
        return self
1240
1241
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the correlation of the signal samples
    signal_axis = None
    #: Axis which shows the correlation of the background samples
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            #: create figure
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # NOTE(review): the axis assignments and the base-class call were missing from
        # this listing. add() and finish() need signal_axis, bckgrd_axis and
        # colorbar_axis, so they are restored here; the grid slices and the shared
        # y axis are reconstructed -- confirm against the upstream file.
        gs = matplotlib.gridspec.GridSpec(8, 2)
        #: add signal subplot
        self.signal_axis = self.figure.add_subplot(gs[:6, 0])
        #: add background subplot
        self.bckgrd_axis = self.figure.add_subplot(gs[:6, 1], sharey=self.signal_axis)
        #: Colorbar axis contains the colorbar
        self.colorbar_axis = self.figure.add_subplot(gs[7, :])
        #: Usual axis object which every Plotter object needs, here it is just a dummy
        self.axis = self.signal_axis

        super().__init__(self.figure, self.axis)

    def add(self, data, columns, signal_mask, bckgrd_mask):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @return self, so that calls can be chained
        """
        panels = ((self.signal_axis, signal_mask), (self.bckgrd_axis, bckgrd_mask))
        heatmaps = []
        correlations = []

        for axis, mask in panels:
            # Correlation coefficients between all columns in percent
            corr = numpy.corrcoef(numpy.vstack([data[column][mask] for column in columns])) * 100
            correlations.append(corr)
            heatmaps.append(axis.pcolor(corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0))

            axis.invert_yaxis()
            axis.xaxis.tick_top()

            # put the major ticks at the middle of each cell
            axis.set_xticks(numpy.arange(corr.shape[0]) + 0.5, minor=False)
            axis.set_yticks(numpy.arange(corr.shape[1]) + 0.5, minor=False)

            axis.set_xticklabels(columns, minor=False, rotation=90)
            axis.set_yticklabels(columns, minor=False)

            # Write the rounded value into each cell, white text with a black outline
            for (y, x), value in numpy.ndenumerate(corr):
                txt = axis.text(x + 0.5, y + 0.5, f'{value:.0f}',
                                size=14,
                                horizontalalignment='center',
                                verticalalignment='center',
                                color='w')
                txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

        # Both panels use the same colour scale, the colorbar is built from the signal heatmap
        cb = self.figure.colorbar(heatmaps[0], cax=self.colorbar_axis, ticks=[-100, 0, 100], orientation='horizontal')
        cb.solids.set_rasterized(True)
        cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])

        self.signal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
        self.bckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')

        # remove whitespace (done last, after all drawing calls)
        for (axis, _), corr in zip(panels, correlations):
            axis.set_xlim(0, corr.shape[0])
            axis.set_ylim(0, corr.shape[1])
        return self

    def finish(self):
        """
        Hides the y tick labels of the background panel
        @return self, so that calls can be chained
        """
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1346
1347
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last one holds the class label (0 or 1)
        """
        # Integer division: array sizes have to be integers
        half = N // 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(half, n))
        xb = numpy.random.normal(1, size=(half, n))
        ys = numpy.zeros(half)
        yb = numpy.ones(half)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        # Shuffle the events
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # The first half of the (shuffled) events is used for training, the second half for testing
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    is_signal = data['isSignal'] == 1
    is_bckgrd = data['isSignal'] == 0
    is_train = data['type'] == 'Train'
    is_test = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    # NOTE(review): the constructor lines of the next three plots were missing from
    # this listing; the class names are restored to match the output file names --
    # confirm against the upstream file.
    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', is_signal, is_bckgrd)
    p.add(data, 'NeuroBayes', is_signal, is_bckgrd)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', is_train, is_test)
    p.add(data, 'NeuroBayes', is_train, is_test)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', is_train, is_test, is_signal, is_bckgrd)
    p.finish()
    p.save('overtraining_plot.png')

    p = Correlation()
    # Both masks are needed, they are combined for the inclusive panel
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], signal_mask=is_signal, bckgrd_mask=is_bckgrd)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # add() requires the signal and the background mask
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'], is_signal, is_bckgrd)
    p.finish()
    p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:718
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:720
def __init__(self, figure=None, axis=None)
Definition: plotting.py:709
def finish(self)
Definition: plotting.py:759
signal_axis
add signal subplot
Definition: plotting.py:1267
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1277
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1271
None bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1251
def __init__(self, figure=None)
Definition: plotting.py:1253
None figure
figure which is used to draw
Definition: plotting.py:1247
None signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1249
bckgrd_axis
add background subplot
Definition: plotting.py:1269
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1273
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1099
axis_d1
define second subplot
Definition: plotting.py:1093
figure
create figure
Definition: plotting.py:1084
None axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1073
None axis
Main axis which is used to draw.
Definition: plotting.py:1071
def __init__(self, figure=None)
Definition: plotting.py:1077
axis_d2
define third subplot
Definition: plotting.py:1095
None figure
figure which is used to draw
Definition: plotting.py:1069
None axis_d2
Axis which shows shape of background.
Definition: plotting.py:1075
axis
define first subplot
Definition: plotting.py:1091
ymax
Maximum y value.
Definition: plotting.py:563
xmax
Maximum x value.
Definition: plotting.py:561
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:549
def finish(self)
Definition: plotting.py:570
x_axis_label
Label on x axis.
Definition: plotting.py:839
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:796
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:786
ymax
Maximum y value.
Definition: plotting.py:799
xmax
Maximum x value.
Definition: plotting.py:829
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:804
ymin
min y value
Definition: plotting.py:798
def finish(self, line_color='black')
Definition: plotting.py:842
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:795
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:591
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:623
x_axis_label
x axis label
Definition: plotting.py:621
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:617
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:602
first_binning
first binning
Definition: plotting.py:619
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:606
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:604
def add(self, data, columns, variables)
Definition: plotting.py:1189
def finish(self)
Definition: plotting.py:1235
def add(self, i, *args, **kwargs)
Definition: plotting.py:524
figure
create figure
Definition: plotting.py:504
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:497
None axis
Main axis.
Definition: plotting.py:495
None figure
figure which is used to draw
Definition: plotting.py:493
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:519
axis
the axis of the first subplot
Definition: plotting.py:521
def finish(self)
Definition: plotting.py:531
axis_d1
define second subplot
Definition: plotting.py:888
figure
create figure
Definition: plotting.py:879
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:894
None axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:868
None axis
Main axis which is used to draw.
Definition: plotting.py:866
def __init__(self, figure=None)
Definition: plotting.py:872
axis_d2
define third subplot
Definition: plotting.py:890
None figure
figure which is used to draw
Definition: plotting.py:864
None axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:870
axis
define first subplot
Definition: plotting.py:886
def finish(self, *args, **kwargs)
Definition: plotting.py:260
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:118
None ymin
Minimum y value.
Definition: plotting.py:68
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:167
plots
create empty list for plots
Definition: plotting.py:99
float xscale
limit scale
Definition: plotting.py:72
figure
create figure
Definition: plotting.py:87
None ymax
Maximum y value.
Definition: plotting.py:70
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:116
None axis
Main axis which is used to draw.
Definition: plotting.py:76
def scale_limits(self)
Definition: plotting.py:266
def add(self, *args, **kwargs)
Definition: plotting.py:254
None xmin
Minimum x value.
Definition: plotting.py:64
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:175
def save(self, filename)
Definition: plotting.py:140
def __init__(self, figure=None, axis=None)
Definition: plotting.py:78
None figure
figure which is used to draw
Definition: plotting.py:74
ymax
set y limits
Definition: plotting.py:105
None plots
Plots added to the axis so far.
Definition: plotting.py:60
prop_cycler
Property cycler used to give plots unique colors.
Definition: plotting.py:126
xmax
set x limits
Definition: plotting.py:103
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:114
labels
create empty list for labels
Definition: plotting.py:101
axis
divide figure into subplots
Definition: plotting.py:94
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:183
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:159
float yscale
limit scale
Definition: plotting.py:71
None labels
Labels of the plots added so far.
Definition: plotting.py:62
def add_subplot(self, gridspecs)
Definition: plotting.py:128
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:151
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:112
None xmax
Maximum x value.
Definition: plotting.py:66
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:286
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:395
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:441
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:348
def add(self, data, columns, *masks)
Definition: plotting.py:1156
def finish(self)
Definition: plotting.py:1177
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:1014
distribution
The distribution plot.
Definition: plotting.py:1012
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:1008
None box_axes
Axes for the boxplots.
Definition: plotting.py:994
box_axes
create empty list for box axes
Definition: plotting.py:1010
normed
Normalize histograms before drawing them.
Definition: plotting.py:1006
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
Definition: plotting.py:996
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1