Belle II Software development
plotting.py
1#!/usr/bin/env python3
2
3
4
11
12import copy
13import math
14
15import pandas
16import numpy
17import itertools
18import matplotlib.pyplot as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects as PathEffects
26
27
28from basf2_mva_evaluation import histogram
29
30import basf2 as b2
31
32import basf2_mva_util
33import matplotlib
34
# Do not use the standard TkAgg backend, because it is NOT thread-safe.
# Otherwise you will get "RuntimeError: main thread is not in main loop".
matplotlib.use("svg")

# Use the Belle II style while producing the plots
plt.style.use("belle2")
41
42
class Plotter:
    """
    Base class for all Plotters.

    Owns a matplotlib figure and axis, collects the handles and legend labels of everything
    drawn so far, tracks the axis limits and stores the keyword arguments used for drawing.
    """

    # Handles of all plots added so far (one entry per _plot_datapoints call)
    plots = None
    # Legend labels, parallel to plots
    labels = None
    # Lower x-axis limit
    xmin = None
    # Upper x-axis limit
    xmax = None
    # Lower y-axis limit
    ymin = None
    # Upper y-axis limit
    ymax = None
    # Relative margin applied to the y limits by scale_limits
    yscale = 0.0
    # Relative margin applied to the x limits by scale_limits
    xscale = 0.0
    # Figure which is drawn into
    figure = None
    # Main axis which is drawn into
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = float(0), float(1)
        self.ymin, self.ymax = float(0), float(1)
        self.yscale = 0.1
        self.xscale = 0.0

        # Keyword arguments for the individual drawing primitives; None disables the primitive.
        self.plot_kwargs = None
        self.errorbar_kwargs = None
        self.errorband_kwargs = None
        self.fill_kwargs = None

        # Install the defaults. All four must be set, because _plot_datapoints reads every one of them.
        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

        # Endless iterator over the style's property cycle, used to pick a colour per curve
        self.prop_cycler = itertools.cycle(plt.rcParams["axes.prop_cycle"])

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x-axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        return self.figure.add_subplot(gridspecs[-1], sharex=self.axis)

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        # Imported locally so that the Agg canvas is only loaded when a figure is actually written.
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # The dict defaults of the setters below are never mutated (they are always copied), and None
    # must remain a valid explicit argument meaning "disable", so no None-sentinel is used.

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the errorband patches, None disables the errorband
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the filled area below the datapoints
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    @staticmethod
    def _as_error_array(err, size):
        """
        Normalise an error specification so that it can be indexed element-wise
        @param err None, a scalar, a list or an array-like of errors
        @param size number of data points a scalar is broadcast to
        @return None if err is None, otherwise an indexable array-like
        """
        if err is None:
            return None
        if isinstance(err, numpy.ndarray):
            return err
        if isinstance(err, list):
            return numpy.asarray(err)
        return err * numpy.ones(size)

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis which is drawn into
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points (scalar or one value per point)
        @param yerr symmetric error on y data points (scalar or one value per point)
        @return tuple (legend handles, plot line, errorbar container, fill_between artist),
                entries are None for primitives which were not drawn
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is not None and 'color' in plot_kwargs:
            color = plot_kwargs['color']
        else:
            color = next(self.prop_cycler)['color']
            # plot_kwargs may be None (plot disabled); only store the colour if there is a dict.
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        color = matplotlib.colors.ColorConverter().to_rgb(color)

        # Proxy patch used as legend handle; get_color is aliased so callers can treat it like a line.
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        # Broadcast scalar errors once so that every branch below can index them.
        xerr = self._as_error_array(xerr, len(x))
        yerr = self._as_error_array(yerr, len(x))

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            errorbar_kwargs.setdefault('color', color)
            errorbar_kwargs.setdefault('ecolor', [0.5 * channel for channel in color])

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            mask = numpy.logical_and.reduce([numpy.isfinite(v) for v in (x, y, xerr, yerr) if v is not None])

            # Negative errors are clipped to zero before they are handed to errorbar.
            e = axis.errorbar(
                x[mask], y[mask],
                xerr=None if xerr is None else numpy.where(xerr[mask] < 0, 0.0, xerr[mask]),
                yerr=None if yerr is None else numpy.where(yerr[mask] < 0, 0.0, yerr[mask]),
                rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            errorband_kwargs.setdefault('color', color)
            if xerr is not None:
                # One rectangle per point, spanning +- the error in both directions
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None and len(x) > 0:
            half_widths = numpy.zeros(len(x)) if xerr is None else xerr
            # to fill the last bin of a histogram
            x = numpy.append(x, x[-1] + 2 * half_widths[-1])
            y = numpy.append(y, y[-1])
            half_widths = numpy.append(half_widths, half_widths[-1])

            axis.fill_between(x - half_widths, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        """
        # copysign makes the margin point away from the data for negative limits as well.
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
274
275
277 """
278 Plots the purity and the efficiency over the cut value (for cut choosing)
279 """
280
284
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
    """
    Add a new pair of curves to the plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param normed if True efficiency and purity are drawn, otherwise the true positives of the
                  signal and the false positives of the background
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)

    if normed:
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])
        first_label, second_label = "Efficiency", "Purity"
    else:
        efficiency, efficiency_error = hists.get_true_positives(['Signal'])
        purity, purity_error = hists.get_false_positives(['Background'])
        first_label, second_label = "True positive", "False positive"

    cuts = hists.bin_centers

    # Grow the axis limits so that they contain both new curves (nan entries are ignored).
    self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
    self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymax])

    self.plots.append(self._plot_datapoints(self.axis, cuts, efficiency, xerr=0, yerr=efficiency_error))
    self.labels.append(first_label)

    self.plots.append(self._plot_datapoints(self.axis, cuts, purity, xerr=0, yerr=purity_error))
    self.labels.append(second_label)

    return self
325
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Classification Plot")
    self.axis.get_xaxis().set_label_text('Cut Value')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
336
337
339 """
340 Plots the signal to noise ratio over the cut value (for cut choosing)
341 """
342
346
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
    """
    Add a new curve to the plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate signal to noise ratio for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param normed not used by this plot, only accepted to keep the signature
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)

    signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])

    cuts = hists.bin_centers

    # Grow the axis limits so that they contain the new curve (nan entries are ignored).
    self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
    self.xmax = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(signal2noise), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(signal2noise), self.ymax])

    self.plots.append(self._plot_datapoints(self.axis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
    self.labels.append(column)

    return self
372
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Signal to Noise Plot")
    self.axis.get_xaxis().set_label_text('Cut Value')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
383
384
386 """
387 Plots the purity over the efficiency also known as ROC curve
388 """
389
393
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    purity, purity_error = hists.get_purity(['Signal'], ['Background'])

    # Grow the axis limits so that they contain the new curve.
    self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
    self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
    self.ymin = numpy.nanmin([numpy.nanmin(purity), self.ymin])
    self.ymax = numpy.nanmax([numpy.nanmax(purity), self.ymax])

    p = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
    self.plots.append(p)
    self.labels.append(column if label is None else label)
    return self
417
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("ROC Purity Plot")
    self.axis.get_xaxis().set_label_text('Efficiency')
    self.axis.get_yaxis().set_label_text('Purity')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
429
430
432 """
433 Plots the rejection over the efficiency also known as ROC curve
434 """
435
439
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and rejection for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used; in both cases the text is
                 cut to 10 characters and the area under the curve is appended
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    rejection, rejection_error = hists.get_efficiency(['Background'])
    # Rejection is the complement of the background efficiency.
    rejection = 1 - rejection

    # Either quantity may be a plain int instead of an array; expand it so that both have the same
    # length. NOTE(review): presumably a degenerate return value of the histogram helper - confirm.
    efficiency_is_int = isinstance(efficiency, int)
    rejection_is_int = isinstance(rejection, int)
    if efficiency_is_int and rejection_is_int:
        efficiency = numpy.array([efficiency])
        rejection = numpy.array([rejection])
    elif efficiency_is_int:
        efficiency = numpy.array([efficiency] * len(rejection))
    elif rejection_is_int:
        rejection = numpy.array([rejection] * len(efficiency))

    # Grow the axis limits so that they contain the new curve.
    self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
    self.xmax = numpy.nanmax([efficiency.max(), self.xmax])
    self.ymin = numpy.nanmin([rejection.min(), self.ymin])
    self.ymax = numpy.nanmax([rejection.max(), self.ymax])

    # numpy.trapz is deprecated in NumPy 2 in favour of numpy.trapezoid; use whichever exists.
    integrate = getattr(numpy, 'trapezoid', None) or numpy.trapz
    # abs() because the sign of the integral depends on the ordering of the efficiency values.
    auc = numpy.abs(integrate(rejection, efficiency))

    p = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
    self.plots.append(p)
    legend_text = column if label is None else label
    self.labels.append(legend_text[:10] + f" ({auc:.2f})")
    return self
473
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("ROC Rejection Plot")
    self.axis.get_xaxis().set_label_text('Signal Efficiency')
    self.axis.get_yaxis().set_label_text('Background Rejection')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
485
486
488 """
489 Plots multiple other plots into a grid 3x?
490 """
491
492 figure = None
493
494 axis = None
495
def __init__(self, cls, number_of_plots, figure=None):
    """
    Creates a new figure if None is given and fills it with a grid of sub-plots
    @param cls plotter class which is instantiated for every cell, called as cls(figure, axis)
    @param number_of_plots number of sub-plots; up to three are placed in one row,
                           more are arranged in rows of three
    @param figure default draw figure which is used
    """
    if figure is None:
        self.figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
        self.figure.set_tight_layout(True)
    else:
        self.figure = figure

    if 1 <= number_of_plots <= 3:
        gs = matplotlib.gridspec.GridSpec(1, number_of_plots)
    else:
        gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)

    # One plotter per grid cell, filled row by row
    self.sub_plots = [cls(self.figure, self.figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]

    # The axis of the first sub-plot serves as the main axis of the whole grid
    self.axis = self.sub_plots[0].axis
    # NOTE(review): the listing this code was recovered from skips one line here; if the enclosing
    # class derives from Plotter a super().__init__(self.figure, self.axis) call may belong here.
522
def add(self, i, *args, **kwargs):
    """
    Call add function of ith subplot
    @param i position of the subplot
    @param args positional arguments forwarded to the add function of the subplot
    @param kwargs keyword arguments forwarded to the add function of the subplot
    """
    self.sub_plots[i].add(*args, **kwargs)
    # Return self like the add functions of the other plotters, so that calls can be chained.
    return self
529
def finish(self):
    """
    Calls the finish function of every subplot
    """
    for sub_plot in self.sub_plots:
        sub_plot.finish()
    return self
537
538
540 """
541 Plots the purity in each bin over the classifier output.
542 """
543
547
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
    """
    Add a new curve to the Diagonal plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    """
    hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
    purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

    centers = hists.bin_centers
    self.xmin, self.xmax = min(centers.min(), self.xmin), max(centers.max(), self.xmax)
    # The y-range is fixed to [0, 1] instead of being derived from the data.
    self.ymin, self.ymax = 0, 1

    p = self._plot_datapoints(self.axis, centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
    self.plots.append(p)
    self.labels.append(column)
    return self
568
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.scale_limits()
    # Black reference diagonal from (0, 0) to (1, 1)
    self.axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Diagonal Plot")
    self.axis.get_xaxis().set_label_text('Classifier Output')
    self.axis.get_yaxis().set_label_text('Purity Per Bin')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
582
583
585 """
586 Plots distribution of a quantity
587 """
588
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
             keep_first_binning=False, range_in_std=None):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param normed_to_all_entries true if histograms should be divided by their total number of entries
    @param normed_to_bin_width true if histograms should be divided by the bin widths
    @param keep_first_binning use the binning of the first distribution for further plots
    @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
    """
    super().__init__(figure, axis)

    # Normalize histograms to the total number of entries before drawing
    self.normed_to_all_entries = normed_to_all_entries
    # Normalize histograms to the bin widths before drawing
    self.normed_to_bin_width = normed_to_bin_width
    # Passed on to the histogram helper as range_in_std
    self.range_in_std = range_in_std

    # Start from an "empty" range: the first added distribution then defines the x limits and the
    # upper y limit, while the lower y limit never rises above zero.
    self.ymin = float(0)
    self.ymax = float('-inf')
    self.xmin = float('inf')
    self.xmax = float('-inf')

    # Reuse the binning of the first distribution for all further ones
    self.keep_first_binning = keep_first_binning
    # Binning of the first distribution, filled by add if keep_first_binning is set
    self.first_binning = None
    # Label of the x-axis, set to the column name by add
    self.x_axis_label = ''
621
def add(self, data, column, mask=None, weight_column=None, label=None):
    """
    Add a new distribution to the plots
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param mask boolean numpy.array defining which events are used for the histogram, all events if None
    @param weight_column column in data containing the weights for each event
    @param label label for the legend, if None the column name is used
    """
    if mask is None:
        mask = numpy.ones(len(data)).astype('bool')

    # Default is 100 bins, unless the binning of the first distribution is to be reused.
    bins = 100
    if self.keep_first_binning and self.first_binning is not None:
        bins = self.first_binning
    hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                 bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
    if self.keep_first_binning and self.first_binning is None:
        self.first_binning = hists.bins
    hist, hist_error = hists.get_hist('Total')

    if self.normed_to_all_entries:
        normalization = float(numpy.sum(hist))
        hist = hist / normalization
        hist_error = hist_error / normalization

    if self.normed_to_bin_width:
        hist = hist / hists.bin_widths
        hist_error = hist_error / hists.bin_widths

    # Grow the axis limits so that they contain the new distribution including its error.
    self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
    self.ymin = numpy.nanmin([hist.min(), self.ymin])
    self.ymax = numpy.nanmax([(hist + hist_error).max(), self.ymax])

    p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
    self.plots.append(p)
    self.x_axis_label = column

    # Flag the legend entry if the limits still describe an empty range.
    appendix = ''
    if self.ymax <= self.ymin or self.xmax <= self.xmin:
        appendix = ' No data to plot!'

    self.labels.append((column if label is None else label) + appendix)
    return self
668
def finish(self):
    """
    Sets limits, title, axis-labels and legend of the plot
    """
    self.axis.set_title("Distribution Plot")
    self.axis.get_xaxis().set_label_text(self.x_axis_label)

    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)

    # Empty range: show a placeholder text on a unit axis instead of the data.
    if self.ymax <= self.ymin or self.xmax <= self.xmin:
        self.axis.set_xlim((0., 1.))
        self.axis.set_ylim((0., 1.))
        self.axis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
        return self

    self.scale_limits()

    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))

    # The y-axis label spells out which normalizations were applied.
    if self.normed_to_all_entries and self.normed_to_bin_width:
        self.axis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
    elif self.normed_to_all_entries:
        self.axis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
    elif self.normed_to_bin_width:
        self.axis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
    else:
        self.axis.get_yaxis().set_label_text('# Entries per Bin')

    return self
699
700
702 """
703 Create a boxplot
704 """
705
707
def __init__(self, figure=None, axis=None, x_axis_label=None):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param x_axis_label label of the x-axis, if empty the column name of the first added boxplot is used
    """
    super().__init__(figure=figure, axis=axis)

    # Label of the x-axis
    self.x_axis_label = x_axis_label
718
def add(self, data, column, mask=None, weight_column=None):
    """
    Add a new boxplot to the plots
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate boxplot quantities
    @param mask boolean numpy.array defining which events are used for the boxplot, all events if None
    @param weight_column column in data containing the weights for each event (currently ignored)
    """
    if mask is None:
        mask = numpy.ones(len(data)).astype('bool')
    x = data[column][mask]
    if weight_column is not None:
        b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

    if len(x) == 0:
        b2.B2WARNING("Ignore empty boxplot.")
        return self

    # we don't plot outliers as they cause the file size to explode if large datasets are used
    p = self.axis.boxplot(x, sym='k.', whis=1.5, vert=False, patch_artist=True, showmeans=True, widths=1,
                          boxprops=dict(facecolor='blue', alpha=0.5), showfliers=False)
    self.plots.append(p)
    self.labels.append(column)
    # Fall back to the column name if no x-axis label was set so far.
    if not self.x_axis_label:
        self.x_axis_label = column

    return self
759
def finish(self):
    """
    Hides the y-axis and sets the x-axis label and the title of the plot
    """
    matplotlib.artist.setp(self.axis.get_yaxis(), visible=False)
    self.axis.get_xaxis().set_label_text(self.x_axis_label)
    self.axis.set_title("Box Plot")
    return self
768
769
771 """
772 Plots the difference between two histograms
773 """
774
786
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
    """
    Creates a new figure and axis if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    @param axis default draw axis which is used
    @param normed normalize minuend and subtrahend before comparing them
    @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
    """
    super().__init__(figure, axis)
    # Normalize minuend and subtrahend before comparing them
    self.normed = normed
    # Shift the mean difference to zero
    self.shift_to_zero = shift_to_zero
    # Initial y-range; add only ever widens it. Normalized differences start with a smaller range.
    if self.normed:
        self.ymin = -0.01
        self.ymax = 0.01
    else:
        self.ymin = -1
        self.ymax = 1
    # Label of the x-axis, overwritten with the column name by add; initialized here so that
    # finish does not fail if it is called before add.
    self.x_axis_label = ''
804
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
    """
    Add a new difference plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
    @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
    @param weight_column column in data containing the weights for each event
    @param label label for the legend if None, the column name is used
    """
    hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
                                 weight_column=weight_column, equal_frequency=False)
    minuend, minuend_error = hists.get_hist('Minuend')
    subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

    # The error of the difference is taken from the summed bin contents, not from the two
    # per-histogram errors returned above.
    difference_error = histogram.poisson_error(minuend + subtrahend)
    if self.normed:
        # Must be computed before minuend and subtrahend are normalized below.
        difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
        minuend = minuend / numpy.sum(minuend)
        subtrahend = subtrahend / numpy.sum(subtrahend)
    difference = minuend - subtrahend

    if self.shift_to_zero:
        difference = difference - numpy.mean(difference)

    # Grow the axis limits so that they contain the new curve including its error.
    self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
    self.ymin = min((difference - difference_error).min(), self.ymin)
    self.ymax = max((difference + difference_error).max(), self.ymax)

    p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
    self.plots.append(p)
    # Use the explicit label if one is given and fall back to the column name otherwise
    # (the condition used to be inverted, which put None into the legend).
    self.labels.append(column if label is None else label)
    self.x_axis_label = column
    return self
842
def finish(self, line_color='black'):
    """
    Sets limits, title, axis-labels and legend of the plot
    @param line_color color of the horizontal reference line at zero
    """
    # The zero line is drawn with the unscaled x limits, i.e. before scale_limits is applied.
    self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
    self.scale_limits()
    self.axis.set_xlim((self.xmin, self.xmax))
    self.axis.set_ylim((self.ymin, self.ymax))
    self.axis.set_title("Difference Plot")
    self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
    self.axis.get_xaxis().set_label_text(self.x_axis_label)
    self.axis.get_yaxis().set_label_text('Diff.')
    # The first element of every stored plot entry is the tuple of legend handles.
    handles = [entry[0] for entry in self.plots]
    self.axis.legend(handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
    return self
857
858
860 """
861 Create TMVA-like overtraining control plot for a classification training
862 """
863
864
865 figure = None
866
867 axis = None
868
869 axis_d1 = None
870
871 axis_d2 = None
872
def __init__(self, figure=None):
    """
    Creates a new figure if None is given, sets the default plot parameters
    @param figure default draw figure which is used
    """
    if figure is None:
        self.figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
        self.figure.set_tight_layout(True)
    else:
        self.figure = figure

    gs = matplotlib.gridspec.GridSpec(5, 1)

    # NOTE(review): everything below was missing from the listing this code was recovered from.
    # It is reconstructed from the class attributes axis, axis_d1 and axis_d2 and from the way
    # add uses them; the 3/1/1 row split and the shared x-axis are assumptions - confirm.
    # Main axis which shows the distributions
    self.axis = self.figure.add_subplot(gs[:3, :])
    # Axis which shows the difference between training and test signal
    self.axis_d1 = self.figure.add_subplot(gs[3, :], sharex=self.axis)
    # Axis which shows the difference between training and test background
    self.axis_d2 = self.figure.add_subplot(gs[4, :], sharex=self.axis)

    # add reads self.plot_kwargs, self.errorbar_kwargs and self.errorband_kwargs, which are set up
    # by Plotter.__init__ (assumes the enclosing class derives from Plotter - confirm).
    super().__init__(self.figure, self.axis)
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
    """
    Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
    otherwise there are too many curves in the plot to recognize anything in the plot.

    The main axis receives four distributions of column: test signal and test background
    drawn with the plotter's own plot, errorbar and errorband options, and train signal and
    train background drawn as filled step histograms in the colour of the matching test
    sample. The two lower axes receive the normed, zero-shifted (train - test) difference
    for signal and for background. If scipy can be imported, each lower axis is also
    annotated with the p-value of a two-sample Kolmogorov-Smirnov test (unweighted).

    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param train_mask boolean numpy.array defining which events are training events
    @param test_mask boolean numpy.array defining which events are test events
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @return self
    """
    train_signal = train_mask & signal_mask
    train_bckgrd = train_mask & bckgrd_mask
    test_signal = test_mask & signal_mask
    test_bckgrd = test_mask & bckgrd_mask

    distribution = Distribution(self.figurefigurefigurefigure, self.axisaxisaxisaxis, normed_to_all_entries=True)

    # Test samples: drawn with whatever options were configured on this plotter.
    distribution.set_plot_options(self.plot_kwargs)
    distribution.set_errorbar_options(self.errorbar_kwargs)
    distribution.set_errorband_options(self.errorband_kwargs)
    distribution.add(data, column, test_signal, weight_column)
    distribution.add(data, column, test_bckgrd, weight_column)

    signal_color = distribution.plots[0][0][0].get_color()
    bckgrd_color = distribution.plots[1][0][0].get_color()

    # Train samples: filled step histograms without errorbars, reusing the test colours.
    distribution.set_errorbar_options(None)
    distribution.set_errorband_options(None)
    for color, train_selection in ((signal_color, train_signal), (bckgrd_color, train_bckgrd)):
        distribution.set_plot_options({'color': color, 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
        distribution.set_fill_options({'color': color, 'alpha': 0.5, 'step': 'post'})
        distribution.add(data, column, train_selection, weight_column)

    distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
    distribution.finish()

    panels = (
        ('signal', self.axis_d1axis_d1, signal_color, train_signal, test_signal),
        ('background', self.axis_d2axis_d2, bckgrd_color, train_bckgrd, test_bckgrd),
    )

    for _, axis, color, train_selection, test_selection in panels:
        # Work on a copy: writing the colour into self.plot_kwargs would leak it into the
        # next add() call (and fails outright when no plot options are set).
        difference_kwargs = dict(self.plot_kwargs or {}, color=color)
        difference = Difference(self.figurefigurefigurefigure, axis, shift_to_zero=True, normed=True)
        difference.set_plot_options(difference_kwargs)
        difference.set_errorbar_options(self.errorbar_kwargs)
        difference.set_errorband_options(self.errorband_kwargs)
        difference.add(data, column, train_selection, test_selection, weight_column)
        axis.set_xlim((difference.xmin, difference.xmax))
        axis.set_ylim((difference.ymin, difference.ymax))
        # Drop the legend entries of the difference plot; two separate lists so that
        # later appends to one cannot show up in the other.
        difference.plots, difference.labels = [], []
        difference.finish(line_color=color)

    # Only the import is optional; errors of the test itself should not be swallowed.
    try:
        import scipy.stats
    except ImportError:
        b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")
        return self

    # Kolmogorov smirnov test
    for name, axis, _, train_selection, test_selection in panels:
        train_values = data[column][train_selection]
        test_values = data[column][test_selection]
        if len(train_values) == 0 or len(test_values) == 0:
            b2.B2WARNING(f"Cannot calculate kolmogorov smirnov test for {name} due to missing data")
            continue
        # ks_2samp returns (statistic, p-value); only the p-value is shown.
        p_value = scipy.stats.ks_2samp(train_values, test_values)[1]
        props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
        axis.text(0.1, 0.9, rf'{name} (train - test) difference $p={p_value:.2f}$', bbox=props,
                  verticalalignment='top', horizontalalignment='left', transform=axis.transAxes)

    return self
973
def finish(self):
    """
    Set the titles and the x-axis labelling of the three stacked axes.

    Only the main axis gets a title and only the lowest axis keeps its x tick labels and
    an x-axis label.
    @return self
    """
    main_axis = self.axisaxisaxisaxis
    signal_axis = self.axis_d1axis_d1
    bckgrd_axis = self.axis_d2axis_d2

    for axis, title in ((main_axis, "Overtraining Plot"), (signal_axis, ""), (bckgrd_axis, "")):
        axis.set_title(title)

    for axis in (main_axis, signal_axis):
        matplotlib.artist.setp(axis.get_xticklabels(), visible=False)

    for axis, label in ((main_axis, ''), (signal_axis, ''), (bckgrd_axis, 'Classifier Output')):
        axis.get_xaxis().set_label_text(label)

    return self
987
988
990 """
991 Plots distribution of a quantity including boxplots
992 """
993
994
995 box_axes = None
996
997 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None):
998 """
999 Creates a new figure and axis if None is given, sets the default plot parameters
1000 @param figure default draw figure which is used
1001 @param axis default draw axis which is used
1002 @param normed true if the histograms should be normed before drawing
1003 @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
1004 """
# x_axis_label is stored unchanged below and handed to every Box created in add().
1005 super().__init__(figure, axis)
1006
1007 self.normed = normed
1008
1009 self.range_in_std = range_in_std
1010
# NOTE(review): the embedded listing numbers skip 1011 here. add() calls len() and
# append() on the box-axes attribute whose class-level default is None, so the skipped
# line presumably initialises it to an empty list -- confirm against the original file.
1012
1013 self.distribution = Distribution(self.figurefigure, self.axisaxis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)
1014
1015 self.x_axis_label = x_axis_label
1016
1017 def add(self, data, column, mask=None, weight_column=None, label=None):
1018 """
1019 Add a new distribution plot, with additional information like a boxplot compared to
1020 the ordinary Distribution plot.
1021 @param data pandas.DataFrame containing all data
1022 @param column which is used to calculate distribution histogram
1023 @param mask boolean numpy.array defining which events are used for the distribution histogram
1024 @param weight_column column in data containing the weights for each event
1025 """
# label is forwarded unchanged to the wrapped distribution plot.
# NOTE(review): the embedded listing numbers skip 1026-1028 here, so up to three
# statements are missing from this dump -- restore them from the original file.
1029 self.distribution.add(data, column, mask, weight_column, label=label)
1030
# Re-divide the figure for n box plots: 4*n grid rows, the top 3*n rows for the
# distribution and one row for each box plot.
1031 n = len(self.box_axesbox_axes) + 1
1032 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1033 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
1034 box_axis = self.add_subplot(gridspecs)
1035
# NOTE(review): with the default mask=None the `mask & (...)` below cannot work unless
# the skipped lines replace None by a real mask -- verify.
1036 if self.range_in_std is not None:
1037 mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
1038 # Everything outside mean +- range_in_std * std is considered not inside the mask
1039 mask = mask & (data[column] > (mean - self.range_in_std * std)) & (data[column] < (mean + self.range_in_std * std))
1040 box = Box(self.figurefigure, box_axis, x_axis_label=self.x_axis_label)
1041 box.add(data, column, mask, weight_column)
# Colour the box like the distribution curve that was just added.
1042 if len(box.plots) > 0:
1043 box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
1044 box.finish()
1045
1046 self.box_axesbox_axes.append(box_axis)
1047 return self
1048
def finish(self):
    """
    Finish the wrapped distribution plot and tidy up the stacked axes.

    The x tick labels and x-axis label are removed from the main axis and from every box
    axis except the last one; all box axes lose their title. The main axis gets the title
    "Distribution Plot" and a legend built from the distribution's plots and labels.
    Calling this before any add() is allowed: there are simply no box axes to tidy.
    @return self
    """
    self.distribution.finish()
    matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
    self.axisaxis.get_xaxis().set_label_text('')

    box_axes = self.box_axesbox_axes
    for box_axis in box_axes[:-1]:
        matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
        box_axis.set_title("")
        box_axis.get_xaxis().set_label_text('')
    # Guard the last-axis access: indexing [-1] on an empty list raises IndexError.
    if box_axes:
        box_axes[-1].set_title("")

    self.axisaxis.set_title("Distribution Plot")
    self.axisaxis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
    return self
1065
1066
1068 """
1069 Plots change of a distribution of a quantity depending on the cut on a classifier
1070 """
1071
1072 figure = None
1073
1074 axis = None
1075
1076 axis_d1 = None
1077
1078 axis_d2 = None
1079
1080 def __init__(self, figure=None):
1081 """
1082 Creates a new figure if None is given, sets the default plot parameters
1083 @param figure default draw figure which is used
1084 """
1085 if figure is None:
1086
1087 self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
1088 self.figurefigurefigurefigure.set_tight_layout(True)
1089 else:
1090 self.figurefigurefigurefigure = figure
1091
1092 gs = matplotlib.gridspec.GridSpec(3, 2)
# NOTE(review): the embedded listing numbers skip 1094, 1096, 1098 and 1100 below, so
# statements are missing from this dump. `gs` is never used in what is visible, while
# add() of this class draws on the main axis, axis_d1 and axis_d2 -- presumably the
# skipped lines create those subplots from `gs`. Restore them from the original file
# before relying on this constructor.
1093
1095
1097
1099
1101
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
    """
    Add a new correlation plot.

    Three panels are filled: all selected events (signal or background), signal only and
    background only. Each panel shows the density of column between its 5% and 95%
    percentiles, overlaid with the densities that remain after cutting on cut_column at
    its 5%, 10%, ..., 95% percentiles. The panel title reports the flatness score.

    @param data pandas.DataFrame containing all data
    @param column which is used to calculate distribution histogram
    @param cut_column which is used to calculate cut on the other quantity defined by column
    @param quantiles list of quantiles between 0 and 100, defining the different cuts
    @param signal_mask boolean mask selecting the signal events
    @param bckgrd_mask boolean mask selecting the background events
    @param weight_column column in data containing the weights for each event
    @return self
    """
    # NOTE(review): `quantiles` is accepted but never read; the cuts are fixed below.
    # NOTE(review): both masks default to None, yet `signal_mask | bckgrd_mask` is always
    # evaluated, so callers effectively have to pass both.
    if len(data[cut_column]) == 0:
        b2.B2WARNING("Ignore empty Correlation.")
        return self

    panels = zip(
        [self.axisaxisaxisaxis, self.axis_d1axis_d1, self.axis_d2axis_d2],
        [('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask)],
    )

    for panel_axis, (tag, selection) in panels:

        if weight_column is None:
            weights = numpy.ones(len(data[column][selection]))
        else:
            weights = numpy.array(data[weight_column][selection])

        values = data[column][selection]
        cut_values = data[cut_column][selection]

        # Histogram window: central 90% of the selected values.
        limits = numpy.percentile(values, [5, 95])

        colormap = plt.get_cmap('coolwarm')
        previous, edges = numpy.histogram(values, bins=100,
                                          range=limits, density=True, weights=weights)
        panel_axis.plot((edges[1:] + edges[:-1]) / 2, previous, color='black', lw=1)

        for quantil in numpy.arange(5, 100, 5):
            threshold = numpy.percentile(cut_values, quantil)
            passed = cut_values >= threshold
            current, edges = numpy.histogram(values[passed], bins=100,
                                             range=limits, density=True, weights=weights[passed])
            centers = (edges[1:] + edges[:-1]) / 2
            # Colour the band between two consecutive cuts by the tighter cut's quantile.
            panel_axis.fill_between(centers, previous, current, color=colormap(quantil / 100.0))
            previous = current

        panel_axis.set_ylim(bottom=0)

        flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
        panel_axis.set_title(
            r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
    return self
1146
def finish(self):
    """
    Nothing is left to finalise for this plot; the plotter itself is returned.
    @return self
    """
    return self
1152
1153
1155 """
1156 Plots multivariate distribution using TSNE algorithm
1157 """
1158
def add(self, data, columns, *masks):
    """
    Add a new t-SNE plot: one scatter cloud per mask, all in one common 2D embedding.

    The embedding is computed once from all rows of data (t-SNE cannot project new
    points afterwards), then the embedded rows selected by each mask are drawn.
    If sklearn cannot be imported a warning is issued and nothing is drawn.

    @param data pandas.DataFrame containing all data
    @param columns which are used as input features of the embedding
    @param masks different classes to show in TSNE, each a boolean mask over the rows of data
    @return self
    """
    # Only the import is optional; errors of the embedding itself should surface.
    try:
        import sklearn.manifold
    except ImportError:
        b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
        return self

    # Keep `data` untouched: one row per event, one column per requested feature.
    features = numpy.array([data[column] for column in columns]).T
    embedding = sklearn.manifold.TSNE(n_components=2, random_state=0).fit_transform(features)

    for mask in masks:
        # Masks are applied by position: entry i of the mask belongs to row i of data.
        points = embedding[numpy.asarray(mask, dtype=bool)]
        self.axisaxis.scatter(points[:, 0], points[:, 1], rasterized=True)
    return self
1179
def finish(self):
    """
    Nothing is left to finalise for this plot; the plotter itself is returned.
    @return self
    """
    return self
1185
1186
1188 """
1189 Plots importance matrix
1190 """
1191
def add(self, data, columns, variables):
    """
    Draw the importance matrix as an annotated heat map.

    Every column of data listed in columns is rescaled on its own to the range 0..100
    (a column without spread becomes all zeros) and shown as one heat-map column; each
    cell is annotated with its rounded value and a low/high colour bar is attached.

    @param data pandas.DataFrame containing all data
    @param columns columns of data to show, also used as x tick labels
    @param variables labels for the rows, used as y tick labels
    @return self
    """
    self.figurefigure.set_tight_layout(True)

    def rescale(values):
        lowest = numpy.min(values)
        spread = numpy.max(values) - lowest
        if spread <= 0:
            return numpy.zeros(values.shape)
        return (values - lowest) / spread * 100

    importance_matrix = numpy.vstack([rescale(data[column]) for column in columns]).T
    n_rows, n_cols = importance_matrix.shape[0], importance_matrix.shape[1]
    importance_heatmap = self.axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
                                              rasterized=True)

    # put the major ticks at the middle of each cell
    self.axisaxis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
    self.axisaxis.set_xticks(numpy.arange(n_cols) + 0.5, minor=False)

    self.axisaxis.set_xticklabels(columns, minor=False, rotation=90)
    self.axisaxis.set_yticklabels(variables, minor=False)

    self.axisaxis.xaxis.tick_top()

    # White number with a black outline in the centre of every cell.
    outline = dict(linewidth=3, foreground='k')
    for (row, col), value in numpy.ndenumerate(importance_matrix):
        cell_text = self.axisaxis.text(col + 0.5, row + 0.5, f'{value:.0f}',
                                       size=14,
                                       horizontalalignment='center',
                                       verticalalignment='center',
                                       color='w')
        cell_text.set_path_effects([PathEffects.withStroke(**outline)])

    colorbar = self.figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
    colorbar.ax.set_yticklabels(['low', 'high'])

    # remove whitespace
    self.axisaxis.set_ylim(0, n_rows)

    self.axisaxis.set_aspect('equal')

    return self
1237
def finish(self):
    """
    Nothing is left to finalise for this plot; the plotter itself is returned.
    @return self
    """
    return self
1243
1244
1246 """
1247 Plots correlation matrix
1248 """
1249
1250 figure = None
1251
1252 signal_axis = None
1253
1254 bckgrd_axis = None
1255
1256 def __init__(self, figure=None):
1257 """
1258 Creates a new figure if None is given, sets the default plot parameters
1259 @param figure default draw figure which is used
1260 """
1261 if figure is None:
1262
1263 self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
1264 self.figurefigurefigurefigure.set_tight_layout(True)
1265 else:
1266 self.figurefigurefigurefigure = figure
1267
1268 gs = matplotlib.gridspec.GridSpec(8, 2)
# NOTE(review): the embedded listing numbers skip 1270, 1272, 1274, 1276 and 1278 below,
# so statements are missing from this dump. `gs` is never used in what is visible, while
# add() of this class draws on the signal axis, the background axis and
# self.colorbar_axis -- presumably the skipped lines create them from `gs`. Restore them
# from the original file before relying on this constructor.
1269
1271
1273
1275
1277
1279
def add(self, data, columns, signal_mask, bckgrd_mask):
    """
    Add a new correlation plot.

    The correlation coefficients (in percent) between the given columns are computed
    separately for the signal and the background events and drawn as two annotated heat
    maps sharing one horizontal colour bar.

    @param data pandas.DataFrame containing all data
    @param columns which are used to calculate the correlations
    @param signal_mask boolean mask selecting the signal events
    @param bckgrd_mask boolean mask selecting the background events
    @return self
    """
    signal_axis = self.signal_axissignal_axis
    bckgrd_axis = self.bckgrd_axisbckgrd_axis

    signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
    bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

    # Signal first, background second everywhere below.
    panels = ((signal_axis, signal_corr), (bckgrd_axis, bckgrd_corr))

    signal_heatmap = signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
    bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)

    for axis, _ in panels:
        axis.invert_yaxis()
        axis.xaxis.tick_top()

    for axis, corr in panels:
        # put the major ticks at the middle of each cell
        axis.set_xticks(numpy.arange(corr.shape[0]) + 0.5, minor=False)
        axis.set_yticks(numpy.arange(corr.shape[1]) + 0.5, minor=False)

        axis.set_xticklabels(columns, minor=False, rotation=90)
        axis.set_yticklabels(columns, minor=False)

    # White number with a black outline in the centre of every cell.
    for axis, corr in panels:
        for (row, col), value in numpy.ndenumerate(corr):
            cell_text = axis.text(col + 0.5, row + 0.5, f'{value:.0f}',
                                  size=14,
                                  horizontalalignment='center',
                                  verticalalignment='center',
                                  color='w')
            cell_text.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

    colorbar = self.figurefigurefigurefigure.colorbar(signal_heatmap, cax=self.colorbar_axis,
                                                      ticks=[-100, 0, 100], orientation='horizontal')
    colorbar.solids.set_rasterized(True)
    colorbar.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])

    for axis, caption in ((signal_axis, "Signal"), (bckgrd_axis, "Background")):
        axis.text(0.5, -1.0, caption, horizontalalignment='center')

    # remove whitespace
    for axis, corr in panels:
        axis.set_xlim(0, corr.shape[0])
        axis.set_ylim(0, corr.shape[1])
    return self
1342
def finish(self):
    """
    Hide the y tick labels of the background axis.
    @return self
    """
    background_tick_labels = self.bckgrd_axisbckgrd_axis.get_yticklabels()
    matplotlib.artist.setp(background_tick_labels, visible=False)
    return self
1349
1350
1351if __name__ == '__main__':
1352
def get_data(N, columns):
    """
    Creates fake data for example plots

    Half of the rows are drawn from a unit normal centred at 0 and get 0 in the last
    column, the other half from a unit normal centred at 1 and get 1 in the last column.
    The rows are returned in random order.

    @param N total number of rows to create (an odd N yields N - 1 rows)
    @param columns column names; all but the last are feature columns, the last is the label
    @return pandas.DataFrame with the shuffled fake data
    """
    # Integer division: array sizes must be ints, and `N / 2` is a float in Python 3.
    half = N // 2
    n_features = len(columns) - 1
    xs = numpy.random.normal(0, size=(half, n_features))
    xb = numpy.random.normal(1, size=(half, n_features))
    ys = numpy.zeros(half)
    yb = numpy.ones(half)
    data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
    return data.reindex(numpy.random.permutation(data.index))
1365
# Example script: builds fake classifier outputs and writes one example image per plot type.
1366 import seaborn
1367 # Set nice searborn settings
1368 seaborn.set(font_scale=3)
1369 seaborn.set_style('whitegrid')
1370
1371 # Standard plots
1372 N = 100000
1373 data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
1374 data['type'] = ''
# NOTE(review): `N / 2` is a float in Python 3 and is used as a positional slice bound
# on the next two lines -- probably meant `N // 2`; confirm.
1375 data.type.iloc[:N / 2] = 'Train'
1376 data.type.iloc[N / 2:] = 'Test'
1377
1378 p = Box()
1379 p.add(data, 'FastBDT')
1380 p.finish()
1381 p.save('box_plot.png')
1382
# NOTE(review): the embedded listing numbers skip 1383 here; the line that creates the
# plotter for 'verbose_distribution_plot.png' is missing from this dump.
1384 p.add(data, 'FastBDT')
1385 p.add(data, 'NeuroBayes')
1386 p.finish()
1387 p.save('verbose_distribution_plot.png')
1388
# NOTE(review): listing line 1389 is missing (plotter for 'roc_purity_plot.png').
1390 p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
1391 p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
1392 p.finish()
1393 p.save('roc_purity_plot.png')
1394
# NOTE(review): listing line 1395 is missing (plotter for 'roc_rejection_plot.png').
1396 p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
1397 p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
1398 p.finish()
1399 p.save('roc_rejection_plot.png')
1400
1401 p = Diagonal()
1402 p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
1403 p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
1404 p.finish()
1405 p.save('diagonal_plot.png')
1406
1407 p = Distribution()
1408 p.add(data, 'FastBDT')
1409 p.add(data, 'NeuroBayes')
1410 p.finish()
1411 p.save('distribution_plot.png')
1412
1413 p = Difference()
1414 p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test')
1415 p.add(data, 'NeuroBayes', data['type'] == 'Train', data['type'] == 'Test')
1416 p.finish()
1417 p.save('difference_plot.png')
1418
1419 p = Overtraining()
1420 p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test', data['isSignal'] == 1, data['isSignal'] == 0)
1421 p.finish()
1422 p.save('overtraining_plot.png')
1423
1424 p = Correlation()
# NOTE(review): only one mask is passed here; presumably this is the add() taking
# (data, column, cut_column, quantiles, signal_mask, bckgrd_mask), which combines both
# masks with `|` -- verify that a background mask is not required.
1425 p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], data['isSignal'] == 0)
1426 p.finish()
1427 p.save('correlation_plot.png')
1428
1429 p = CorrelationMatrix()
1430 data['FastBDT2'] = data['FastBDT']**2
1431 data['NeuroBayes2'] = data['NeuroBayes']**2
1432 data['FastBDT3'] = data['FastBDT']**3
1433 data['NeuroBayes3'] = data['NeuroBayes']**3
# NOTE(review): no masks are passed here; presumably this is the add() taking
# (data, columns, signal_mask, bckgrd_mask) without defaults -- verify the call.
1434 p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'])
1435 p.finish()
1436 p.save('correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
Definition: plotting.py:717
def __init__(self, figure=None, axis=None, x_axis_label=None)
Definition: plotting.py:708
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:719
def finish(self)
Definition: plotting.py:760
signal_axis
add signal subplot
Definition: plotting.py:1270
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1280
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1274
None bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1254
def __init__(self, figure=None)
Definition: plotting.py:1256
None figure
figure which is used to draw
Definition: plotting.py:1250
None signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1252
bckgrd_axis
add background subplot
Definition: plotting.py:1272
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1276
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1102
axis_d1
define second subplot
Definition: plotting.py:1096
figure
create figure
Definition: plotting.py:1087
None axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1076
None axis
Main axis which is used to draw.
Definition: plotting.py:1074
def __init__(self, figure=None)
Definition: plotting.py:1080
axis_d2
define third subplot
Definition: plotting.py:1098
None figure
figure which is used to draw
Definition: plotting.py:1072
None axis_d2
Axis which shows shape of background.
Definition: plotting.py:1078
axis
define first subplot
Definition: plotting.py:1094
ymax
Maximum y value.
Definition: plotting.py:562
xmax
Maximum x value.
Definition: plotting.py:560
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:548
def finish(self)
Definition: plotting.py:569
x_axis_label
Label on x axis.
Definition: plotting.py:840
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:797
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:787
ymax
Maximum y value.
Definition: plotting.py:800
xmax
Maximum x value.
Definition: plotting.py:830
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:805
ymin
min y value
Definition: plotting.py:799
def finish(self, line_color='black')
Definition: plotting.py:843
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:796
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:590
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:622
x_axis_label
x axis label
Definition: plotting.py:620
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:616
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:601
first_binning
first binning
Definition: plotting.py:618
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:605
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:603
def add(self, data, columns, variables)
Definition: plotting.py:1192
def finish(self)
Definition: plotting.py:1238
def add(self, i, *args, **kwargs)
Definition: plotting.py:523
figure
create figure
Definition: plotting.py:503
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:496
None axis
Main axis.
Definition: plotting.py:494
None figure
figure which is used to draw
Definition: plotting.py:492
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:518
axis
the axis of the first subplot
Definition: plotting.py:520
def finish(self)
Definition: plotting.py:530
axis_d1
define second subplot
Definition: plotting.py:889
figure
create figure
Definition: plotting.py:880
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:895
None axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:869
None axis
Main axis which is used to draw.
Definition: plotting.py:867
def __init__(self, figure=None)
Definition: plotting.py:873
axis_d2
define third subplot
Definition: plotting.py:891
None figure
figure which is used to draw
Definition: plotting.py:865
None axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:871
axis
define first subplot
Definition: plotting.py:887
def finish(self, *args, **kwargs)
Definition: plotting.py:259
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:117
None ymin
Minimum y value.
Definition: plotting.py:67
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:166
plots
create empty list for plots
Definition: plotting.py:98
float xscale
limit scale
Definition: plotting.py:71
figure
create figure
Definition: plotting.py:86
None ymax
Maximum y value.
Definition: plotting.py:69
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:115
None axis
Main axis which is used to draw.
Definition: plotting.py:75
def scale_limits(self)
Definition: plotting.py:265
def add(self, *args, **kwargs)
Definition: plotting.py:253
None xmin
Minimum x value.
Definition: plotting.py:63
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:174
def save(self, filename)
Definition: plotting.py:139
def __init__(self, figure=None, axis=None)
Definition: plotting.py:77
None figure
figure which is used to draw
Definition: plotting.py:73
ymax
set y limits
Definition: plotting.py:104
None plots
Plots added to the axis so far.
Definition: plotting.py:59
prop_cycler
Property cycler used to give plots unique colors.
Definition: plotting.py:125
xmax
set x limits
Definition: plotting.py:102
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:113
labels
create empty list for labels
Definition: plotting.py:100
axis
divide figure into subplots
Definition: plotting.py:93
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:182
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:158
float yscale
limit scale
Definition: plotting.py:70
None labels
Labels of the plots added so far.
Definition: plotting.py:61
def add_subplot(self, gridspecs)
Definition: plotting.py:127
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:150
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:111
None xmax
Maximum x value.
Definition: plotting.py:65
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:285
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:394
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:440
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:347
def add(self, data, columns, *masks)
Definition: plotting.py:1159
def finish(self)
Definition: plotting.py:1180
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:1017
distribution
The distribution plot.
Definition: plotting.py:1013
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:1009
None box_axes
Axes for the boxplots.
Definition: plotting.py:995
box_axes
create empty list for box axes
Definition: plotting.py:1011
normed
Normalize histograms before drawing them.
Definition: plotting.py:1007
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None)
Definition: plotting.py:997
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1