Belle II Software development
ftPlotting.py
1#!/usr/bin/env python3
2
3
10
11# @cond SUPPRESS_DOXYGEN
12
13import basf2_mva_util
14from basf2 import B2INFO, B2WARNING
15import basf2_mva_evaluation.histogram as histogram
16import matplotlib.ticker
17import matplotlib.patches
18import matplotlib.colors
19import matplotlib.gridspec
20import matplotlib.figure
21import matplotlib.artist
22import matplotlib.pyplot as plt
23import copy
24import math
25import pandas
26import numpy
27import numpy as np
28import matplotlib
# Do not use the standard TkAgg backend, because it is NOT thread-safe.
# Otherwise you get "RuntimeError: main thread is not in main loop".
matplotlib.use("svg")
matplotlib.rcParams.update({'font.size': 40})
matplotlib.rcParams['text.usetex'] = True
# The preamble has to be a plain string: list values were deprecated in matplotlib 3.3
# and are rejected by later releases, while a string is accepted by old and new versions.
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath}"
35
36
class Plotter:
    """
    Base class for all Plotters.

    Holds the matplotlib figure and axis, the handles and legend labels of everything
    drawn so far, the tracked axis limits and the keyword arguments used when drawing
    data points. Subclasses are expected to override add() and finish().
    """

    #: Handles of the plots added so far
    plots = None
    #: Legend labels of the plots added so far
    labels = None
    #: Minimum x value
    xmin = None
    #: Maximum x value
    xmax = None
    #: Minimum y value
    ymin = None
    #: Maximum y value
    ymax = None
    #: Relative amount by which scale_limits() widens the y limits
    yscale = 0.0
    #: Relative amount by which scale_limits() widens the x limits
    xscale = 0.0
    #: Figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(False)
        else:
            self.figure = figure

        if axis is None:
            self.axis = self.figure.add_subplot(1, 1, 1)
        else:
            self.axis = axis

        self.plots = []
        self.labels = []
        self.xmin, self.xmax = 0.0, 1.0
        self.ymin, self.ymax = 0.0, 1.0
        #: Relative amount by which scale_limits() widens the y limits
        self.yscale = 0.1
        #: Relative amount by which scale_limits() widens the x limits
        self.xscale = 0.0

        #: Keyword arguments for axis.plot, None disables the plain plot
        self.plot_kwargs = None
        #: Keyword arguments for axis.errorbar, None disables the errorbars
        self.errorbar_kwargs = None
        #: Keyword arguments for the errorband, None disables the errorband
        self.errorband_kwargs = None
        #: Keyword arguments for the area fill below the curve, None disables the fill
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figure.axes):
            ax.set_position(gs.get_position(self.figure))
            ax.set_subplotspec(gs)
        axis = self.figure.add_subplot(gridspecs[-1], sharex=self.axis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        """
        B2INFO("Save figure for class " + str(type(self)))
        # The Agg canvas is imported locally and attached only for writing the file
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figure)
        canvas.print_figure(filename, dpi=50)
        return self

    # NOTE: the dictionary defaults of the set_*_options methods are never mutated because
    # they are always copied. They are kept as dictionaries on purpose: None is a meaningful
    # argument ("disable this layer") and therefore cannot serve as the default marker.
    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        """
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for the area between the curve and zero
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis on which the datapoints are drawn
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points
        @param yerr symmetric error on y data points
        @return tuple (handles, plot, errorbar, errorband); handles is a tuple usable as a legend
                handle, the other entries are None if the corresponding layer was not drawn
        """
        p = e = f = None
        # Work on copies so that the colors filled in below do not leak into the stored options
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is None or 'color' not in plot_kwargs:
            # get_next_color() advances the color cycle of the axis; the prop_cycler attribute
            # which was read here before does not exist anymore in recent matplotlib releases
            color = axis._get_lines.get_next_color()
            # Only store the color if the plain plot is enabled at all
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        patch = matplotlib.patches.Patch(color=color, alpha=0.7)
        # The legend asks its handles for get_color, which a Patch does not provide
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                # Error bars are drawn in a darkened version of the marker color
                errorbar_kwargs['ecolor'] = [0.4 * channel for channel in color]
            # The line width of the error bars is always forced to 5, also if one was configured
            errorbar_kwargs['elinewidth'] = 5
            e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # One rectangle per data point spanning the error in both directions
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, **errorband_kwargs)

        if fill_kwargs is not None:
            axis.fill_between(x, y, 0, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter.
        The base class implementation only returns NotImplemented.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff.
        The base class implementation only returns NotImplemented.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        """
        # copysign makes sure that the limits always move away from the data,
        # independent of the sign of the limit itself
        self.ymin *= 1.0 - math.copysign(self.yscale, self.ymin)
        self.ymax *= 1.0 + math.copysign(self.yscale, self.ymax)
        self.xmin *= 1.0 - math.copysign(self.xscale, self.xmin)
        self.xmax *= 1.0 + math.copysign(self.xscale, self.xmax)
        return self
237
238
class PurityAndEfficiencyOverCut(Plotter):
    """
    Draws purity and efficiency as a function of the cut value (helps choosing a cut)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a new pair of curves to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed if True efficiency and purity are drawn, otherwise the true positives
               and false positives returned by the histograms are drawn
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        if normed:
            first, first_error = hists.get_efficiency(['Signal'])
            second, second_error = hists.get_purity(['Signal'], ['Background'])
            first_label, second_label = "Efficiency", "Purity"
        else:
            first, first_error = hists.get_true_positives(['Signal'])
            second, second_error = hists.get_false_positives(['Background'])
            first_label, second_label = "True positive", "False positive"

        cuts = hists.bin_centers

        # Grow the tracked limits so that they contain both new curves, ignoring NaN entries
        lowest_x = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        highest_x = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        self.xmin, self.xmax = lowest_x, highest_x
        lowest_y = numpy.nanmin([numpy.nanmin(first), numpy.nanmin(second), self.ymin])
        highest_y = numpy.nanmax([numpy.nanmax(first), numpy.nanmax(second), self.ymax])
        self.ymin, self.ymax = lowest_y, highest_y

        curves = ((first, first_error, first_label), (second, second_error, second_label))
        for values, errors, name in curves:
            handles = self._plot_datapoints(self.axis, cuts, values, xerr=0, yerr=errors)
            self.plots.append(handles)
            self.labels.append(name)

        return self

    def finish(self):
        """
        Applies the limits, the title, the x axis label and the legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Classification Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        legend_handles = [handles[0] for handles in self.plots]
        axis.legend(legend_handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
299
300
class SignalToNoiseOverCut(Plotter):
    """
    Draws the signal to noise ratio as a function of the cut value (helps choosing a cut)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a new curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate signal to noise ratio for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed accepted but not used by this plotter
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

        ratio, ratio_error = hists.get_signal_to_noise(['Signal'], ['Background'])
        cuts = hists.bin_centers

        # Grow the tracked limits so that they contain the new curve, ignoring NaN entries
        lowest_x = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        highest_x = numpy.nanmax([numpy.nanmax(cuts), self.xmax])
        self.xmin, self.xmax = lowest_x, highest_x
        lowest_y = numpy.nanmin([numpy.nanmin(ratio), self.ymin])
        highest_y = numpy.nanmax([numpy.nanmax(ratio), self.ymax])
        self.ymin, self.ymax = lowest_y, highest_y

        handles = self._plot_datapoints(self.axis, cuts, ratio, xerr=0, yerr=ratio_error)
        self.plots.append(handles)
        self.labels.append(column)

        return self

    def finish(self):
        """
        Applies the limits, the title, the x axis label and the legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Signal to Noise Plot")
        axis.get_xaxis().set_label_text('Cut Value')
        legend_handles = [handles[0] for handles in self.plots]
        axis.legend(legend_handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
346
347
class PurityOverEfficiency(Plotter):
    """
    Draws the purity as a function of the efficiency (a ROC-like curve)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label legend label of the curve, the column name is used if None
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])

        # Grow the tracked limits so that they contain the new curve
        lowest_x = numpy.nanmin([efficiency.min(), self.xmin])
        highest_x = numpy.nanmax([efficiency.max(), self.xmax])
        self.xmin, self.xmax = lowest_x, highest_x
        lowest_y = numpy.nanmin([numpy.nanmin(purity), self.ymin])
        highest_y = numpy.nanmax([numpy.nanmax(purity), self.ymax])
        self.ymin, self.ymax = lowest_y, highest_y

        handles = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
        self.plots.append(handles)
        self.labels.append(column if label is None else label)
        return self

    def finish(self):
        """
        Applies the limits, the title, the axis labels and the legend to the axis
        """
        axis = self.axis
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("ROC Purity Plot")
        axis.get_xaxis().set_label_text('Efficiency')
        axis.get_yaxis().set_label_text('Purity')
        legend_handles = [handles[0] for handles in self.plots]
        axis.legend(legend_handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
392
393
class RejectionOverEfficiency(Plotter):
    """
    Plots the rejection over the efficiency also known as ROC curve
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and rejection for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label prefix of the legend entry, only its first 10 characters are used
        @return the area under the curve (note: not self as for most other plotters)
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        # The rejection is the complement of the background efficiency,
        # the absolute error is unaffected by taking the complement
        rejection, rejection_error = hists.get_efficiency(['Background'])
        rejection = 1 - rejection

        self.xmin, self.xmax = numpy.nanmin([efficiency.min(), self.xmin]), numpy.nanmax([efficiency.max(), self.xmax])
        self.ymin, self.ymax = numpy.nanmin([rejection.min(), self.ymin]), numpy.nanmax([rejection.max(), self.ymax])

        # numpy.trapz is deprecated since NumPy 2.0 in favour of numpy.trapezoid,
        # fall back to the old name for older NumPy versions
        integrate = getattr(numpy, 'trapezoid', None) or numpy.trapz
        # The absolute value makes the result independent of the ordering of the efficiency values
        auc = numpy.abs(integrate(rejection, efficiency))

        p = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
        self.plots.append(p)
        if label is not None:
            self.labels.append(label[:10] + r"$\ {\rm AUC}\ =\ $" + r"${:.2f}$".format(auc))
        else:
            self.labels.append(r"${\rm AUC}\ =\ $" + r"${:.2f}$".format(auc))

        return auc

    def finish(self):
        """
        Sets limits, grid, tick sizes, axis-labels and legend of the plot
        """
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((self.ymin, self.ymax))
        self.axis.get_xaxis().set_tick_params(labelsize=60)
        self.axis.get_yaxis().set_tick_params(labelsize=60)
        self.axis.grid(True)
        self.axis.get_xaxis().labelpad = 20
        self.axis.get_yaxis().labelpad = 20
        self.axis.get_xaxis().set_label_text(r'${\rm Signal\ Efficiency}$', fontsize=65)
        self.axis.get_yaxis().set_label_text(r'${\rm Background\ Rejection}$', fontsize=65)
        self.axis.legend([x[0] for x in self.plots], self.labels, fancybox=True, framealpha=0.5, fontsize=60, loc=3)
        return self
447
448
class Multiplot(Plotter):
    """
    Plots multiple other plots into a grid with up to three columns
    """

    #: Figure which is used to draw
    figure = None
    #: Main axis which is used to draw (the axis of the first sub-plot)
    axis = None

    def __init__(self, cls, number_of_plots, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param cls plotter class which is instantiated once per sub-plot, it is called as cls(figure, axis)
        @param number_of_plots number of sub-plots which are created, has to be at least 1
        @param figure default draw figure which is used
        """
        if number_of_plots < 1:
            raise ValueError("Multiplot requires at least one plot, got " + str(number_of_plots))

        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Up to three plots share one row, further plots fill additional rows of three columns
        n_columns = min(number_of_plots, 3)
        n_rows = int(numpy.ceil(number_of_plots / 3))
        gs = matplotlib.gridspec.GridSpec(n_rows, n_columns)

        #: the sub-plots which are shown in the grid
        self.sub_plots = [cls(self.figure, self.figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]
        self.axis = self.sub_plots[0].axis
        super().__init__(self.figure, self.axis)

    def add(self, i, *args, **kwargs):
        """
        Call add function of ith subplot
        @param i position of the subplot
        @return self, the return value of the sub-plot is not forwarded
        """
        self.sub_plots[i].add(*args, **kwargs)
        return self

    def finish(self):
        """
        Calls finish on all sub-plots
        """
        for plot in self.sub_plots:
            plot.finish()
        return self
497
498
class Diagonal(Plotter):
    """
    Draws the purity in each bin as a function of the classifier output.
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add a new curve to the Diagonal plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        """
        masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
        hists = histogram.Histograms(data, column, masks, weight_column=weight_column)
        purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

        centers = hists.bin_centers
        lowest_x = min(centers.min(), self.xmin)
        highest_x = max(centers.max(), self.xmax)
        self.xmin, self.xmax = lowest_x, highest_x
        # The y range is fixed to [0, 1] instead of being derived from the purity values
        self.ymin, self.ymax = 0, 1

        half_widths = hists.bin_widths / 2.0
        handles = self._plot_datapoints(self.axis, centers, purity, xerr=half_widths, yerr=purity_error)
        self.plots.append(handles)
        self.labels.append(column)
        return self

    def finish(self):
        """
        Draws the diagonal and applies the limits, the title, the axis labels and the legend
        """
        self.scale_limits()
        axis = self.axis
        # Reference line from (0, 0) to (1, 1)
        axis.plot((0.0, 1.0), (0.0, 1.0), color='black')
        axis.set_xlim((self.xmin, self.xmax))
        axis.set_ylim((self.ymin, self.ymax))
        axis.set_title("Diagonal Plot")
        axis.get_xaxis().set_label_text('Classifier Output')
        axis.get_yaxis().set_label_text('Purity Per Bin')
        legend_handles = [handles[0] for handles in self.plots]
        axis.legend(legend_handles, self.labels, loc='best', fancybox=True, framealpha=0.5)
        return self
542
543
class Distribution(Plotter):
    """
    Plots distribution of a quantity
    """

    def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
                 keep_first_binning=False, range_in_std=None, logScale=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed_to_all_entries true if the histograms should be divided by their total number of entries
        @param normed_to_bin_width true if the histograms should be divided by the bin widths
        @param keep_first_binning use the binning of the first distribution for further plots
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        @param logScale true if the y axis should use a logarithmic scale
        """
        super().__init__(figure, axis)
        #: Normalize histograms to the total number of entries before drawing
        self.normed_to_all_entries = normed_to_all_entries
        #: Normalize histograms to the bin width before drawing
        self.normed_to_bin_width = normed_to_bin_width
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        # The limits start at values which are replaced by the first added distribution
        self.ymin = float(0)
        self.ymax = float('-inf')
        self.xmin = float('inf')
        self.xmax = float('-inf')
        #: Keep first binning if user wants so
        self.keep_first_binning = keep_first_binning
        #: first binning
        self.first_binning = None
        #: x axis label
        self.x_axis_label = ''
        #: Use a logarithmic y axis; the constructor argument used to be ignored here
        self.logScale = logScale
        #: Bin width of the most recently added distribution
        self.binWidth = 0.02

    def add(self, data, column, mask=None, weight_column=None, label=None, bins=50):
        """
        Add a new distribution to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used
        @param bins binning passed to the histogram, replaced by the first binning if keep_first_binning is set
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')

        if self.keep_first_binning and self.first_binning is not None:
            bins = self.first_binning
        hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                     bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
        if self.keep_first_binning and self.first_binning is None:
            self.first_binning = hists.bins
        hist, hist_error = hists.get_hist('Total')
        # The width of the second bin is quoted in the axis label; fall back to the
        # first bin if the histogram consists of a single bin only
        widths = hists.bin_widths
        self.binWidth = widths[1] if len(widths) > 1 else widths[0]

        if self.normed_to_all_entries:
            normalization = float(numpy.sum(hist))
            hist = hist / normalization
            hist_error = hist_error / normalization

        if self.normed_to_bin_width:
            hist = hist / hists.bin_widths
            hist_error = hist_error / hists.bin_widths

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin, self.ymax = numpy.nanmin([hist.min(), self.ymin]), numpy.nanmax([(hist + hist_error).max(), self.ymax])

        p = self._plot_datapoints(self.axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
        self.plots.append(p)
        self.x_axis_label = column
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        return self

    def finish(self):
        """
        Sets limits, y scale and axis-labels of the plot
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))

        if self.logScale:
            try:
                self.axis.set_yscale('log', nonpositive='clip')
            except (TypeError, ValueError):
                # matplotlib versions before 3.3 only know the old keyword
                self.axis.set_yscale('log', nonposy='clip')
        else:
            self.axis.set_ylim((self.ymin, self.ymax))
        # Format into a local variable: overwriting self.binWidth with a string
        # made a second call of finish fail
        bin_width_text = f'{float(self.binWidth):8.2f}'

        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        if self.normed_to_all_entries and self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / (# Entries * Bin Width)')
        elif self.normed_to_all_entries:
            self.axis.get_yaxis().set_label_text(
                r'{$\frac{\rm Entries\hspace{0.25em} per\hspace{0.25em} Bin}{\rm Entries}\, /\, (' +
                bin_width_text + r'\,)$}', fontsize=65)
            self.axis.get_yaxis().labelpad = 20
            self.axis.get_yaxis().set_tick_params(labelsize=60)
        elif self.normed_to_bin_width:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin / Bin Width')
        else:
            self.axis.get_yaxis().set_label_text(r'# Entries per Bin')
        return self
658
659
class Box(Plotter):
    """
    Draws a boxplot of a quantity
    """

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        super().__init__(figure=figure, axis=axis)
        #: Label of the x axis, set to the column name by add
        self.x_axis_label = ""

    def add(self, data, column, mask=None, weight_column=None):
        """
        Add a new boxplot to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the boxplot
        @param weight_column column in data containing the weights for each event, currently ignored
        """
        if mask is None:
            mask = numpy.ones(len(data), dtype='bool')
        selected = data[column][mask]

        if weight_column is not None:
            B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if not len(selected):
            B2WARNING("Ignore empty boxplot.")
            return self

        box_style = {'facecolor': 'blue', 'alpha': 0.5}
        handles = self.axis.boxplot(selected, sym='k.', whis=1.5, vert=False, patch_artist=True,
                                    showmeans=True, widths=1, boxprops=box_style)
        self.plots.append(handles)
        self.labels.append(column)
        self.x_axis_label = column

        return self

    def finish(self):
        """
        Hides the y axis and sets the x axis label and the title of the plot
        """
        axis = self.axis
        matplotlib.artist.setp(axis.get_yaxis(), visible=False)
        axis.get_xaxis().set_label_text(self.x_axis_label)
        axis.set_title("Box Plot")
        return self
725
726
class Difference(Plotter):
    """
    Plots the difference between two histograms
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
        """
        super().__init__(figure, axis)
        #: Minuend and subtrahend are normalized before comparing them if true
        self.normed = normed
        #: Mean difference is shifted to zero (removes constant offset) if true
        self.shift_to_zero = shift_to_zero
        #: Label of the x axis; initialised here so that finish also works before the first add
        self.x_axis_label = ''
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
        """
        Add a new difference plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        """
        bins = 50
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        minuend, minuend_error = hists.get_hist('Minuend')
        subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

        difference_error = histogram.poisson_error(minuend + subtrahend)
        if self.normed:
            # The error has to be scaled with the unnormalized sums, therefore it is handled first
            difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        difference = minuend - subtrahend

        if self.shift_to_zero:
            difference = difference - numpy.mean(difference)

        self.xmin, self.xmax = min(hists.bin_centers.min(), self.xmin), max(hists.bin_centers.max(), self.xmax)
        self.ymin = min((difference - difference_error).min(), self.ymin)
        self.ymax = max((difference + difference_error).max(), self.ymax)

        p = self._plot_datapoints(self.axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
        self.plots.append(p)
        # The branches used to be swapped, which stored None if no label was given
        # and silently ignored a given label
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels and the zero line of the plot
        @param line_color color of the horizontal line at zero
        """
        # The zero line is drawn before the limits are scaled, so it spans the unscaled x range
        self.axis.plot((self.xmin, self.xmax), (0, 0), color=line_color, linewidth=4)
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((self.ymin, self.ymax))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'{\rm Difference}', fontsize=40, labelpad=20)
        self.axis.get_xaxis().grid(True)
        return self
816
817
class normalizedResiduals(Plotter):
    """
    Plots the normalized residuals between two histograms,
    i.e. their bin-wise difference divided by its uncertainty
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero stored for interface compatibility, it has no influence on the drawn residuals
        """
        super().__init__(figure, axis)
        #: Minuend and subtrahend are normalized before comparing them if true
        self.normed = normed
        #: Stored for interface compatibility, not used for the drawn residuals
        self.shift_to_zero = shift_to_zero
        #: Label of the x axis; initialised here so that finish also works before the first add
        self.x_axis_label = ''
        if self.normed:
            self.ymin = -0.01
            self.ymax = 0.01
        else:
            self.ymin = -1
            self.ymax = 1

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None, bins=50, isNN=False):
        """
        Add a new normalized residuals plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        @param bins binning passed to the histogram
        @param isNN if true the lower x limit is set to -1
        """
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask}, bins=bins,
                                     weight_column=weight_column, equal_frequency=False)
        minuend, minuend_error = hists.get_hist('Minuend')
        subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

        if self.normed:
            # Combine the relative errors of both histograms in quadrature; this needs the
            # unnormalized sums and therefore has to happen before the normalization
            difference_error = numpy.sqrt((minuend_error / numpy.sum(minuend))**2 + (subtrahend_error / numpy.sum(subtrahend))**2)
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        else:
            difference_error = histogram.poisson_error(minuend + subtrahend)
        normalizedRes = (minuend - subtrahend) / difference_error

        # The x limits are deliberately not derived from the bin centers; only the
        # lower limit is adjusted on request
        if isNN:
            self.xmin = float(-1.0)

        # The residuals are given in units of their uncertainty, hence the y error is 1
        p = self._plot_datapoints(self.axis, hists.bin_centers, normalizedRes, xerr=hists.bin_widths / 2, yerr=1)
        self.plots.append(p)
        # The branches used to be swapped, which stored None if no label was given
        # and silently ignored a given label
        if label is None:
            self.labels.append(column)
        else:
            self.labels.append(label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels, ticks and the guide lines of the plot
        @param line_color accepted for compatibility with Difference.finish, currently unused
        """
        self.scale_limits()
        self.axis.set_xlim((self.xmin, self.xmax))
        self.axis.set_ylim((-5, 5))
        self.axis.set_title("Difference Plot")
        self.axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axis.get_xaxis().set_label_text(self.x_axis_label)
        self.axis.set_ylabel(r'${\rm Normalized}$' + '\n' + r'${\rm Residuals}$', fontsize=40, labelpad=20)
        # A tick at every integer, but only every second one gets a label
        self.axis.get_yaxis().set_ticks([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5])
        self.axis.get_yaxis().set_ticklabels([r'', r'$-4$', r'', r'$-2$', r'', r'$0$', r'', r'$2$', r'', r'$4$', r''], fontsize=45)
        self.axis.get_xaxis().grid(True)
        # Horizontal guide lines at +-1 (blue) and +-3 (green)
        self.axis.plot((self.xmin, self.xmax), (3, 3), linewidth=4, color='#006600', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (1, 1), linewidth=4, color='b', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (-1, -1), linewidth=4, color='b', linestyle='-')
        self.axis.plot((self.xmin, self.xmax), (-3, -3), linewidth=4, color='#006600', linestyle='-')
        return self
924
925
class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training
    """

    #: figure which is used to draw
    figure = None
    #: Main axis showing the train and test distributions
    axis = None
    #: Axis showing the train/test normalized residuals of the background events
    axis_d1 = None
    #: Axis showing the train/test normalized residuals of the signal events
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given and lays out one large panel with
        two smaller panels below it, all sharing the x-axis.
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figure = figure
        else:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)

        # Five rows: the upper three for the distributions, one each for the residual panels
        grid = matplotlib.gridspec.GridSpec(5, 1)
        self.axis = self.figure.add_subplot(grid[:3, :])
        self.axis_d1 = self.figure.add_subplot(grid[3, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(grid[4, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None, bkgrOutput=0, isNN=False):
        """
        Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
        otherwise there are too many curves in the plot to recognize anything.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param bkgrOutput if 0, Bkgr./Signal legend labels are used and distribution.logScale is set;
                          for any other value anti-B0/B0 legend labels are used
        @param isNN selects the bin edges (steps of 0.04 instead of 0.02) and is forwarded
                    to the residual plots
        """
        distribution = Distribution(self.figure, self.axis, normed_to_all_entries=True)

        # 106 bin edges: -51/d ... 54/d with d = 25 (isNN) or d = 50
        divisor = 25 if isNN else 50
        bins = [float(edge) / divisor for edge in range(-51, 55, 1)]

        if bkgrOutput == 0:
            distribution.logScale = True
            distribution.labels = [r'{\rm Test-Bkgr.}', r'{\rm Train-Bkgr.}', r'{\rm Test-Signal}', r'{\rm Train-Signal}']
        else:
            distribution.labels = [
                r'{\rm Test-$\bar{B}^{0}$}',
                r'{\rm Train-$\bar{B}^{0}$}',
                r'{\rm Test-$B^{0}$}',
                r'{\rm Train-$B^{0}$}']

        # Test samples: markers with error bars, no error band
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options({'fmt': 'o', 'elinewidth': 5, 'alpha': 1, 'markersize': 20, 'ecolor': 'w'})
        distribution.set_errorband_options(None)
        distribution.add(data, column, test_mask & bckgrd_mask, weight_column, None, bins)
        distribution.add(data, column, test_mask & signal_mask, weight_column, None, bins)

        # Training samples: step lines in the colour of the corresponding test sample
        distribution.set_errorbar_options(None)

        bckgrd_color = distribution.plots[0][0][0].get_color()
        distribution.set_plot_options({'color': bckgrd_color, 'drawstyle': 'steps-mid', 'linestyle': 'dashed', 'lw': 5})
        distribution.set_fill_options(None)
        distribution.add(data, column, train_mask & bckgrd_mask, weight_column, None, bins)

        signal_color = distribution.plots[1][0][0].get_color()
        distribution.set_plot_options({'color': signal_color, 'drawstyle': 'steps-mid', 'linestyle': 'solid', 'lw': 5})
        distribution.add(data, column, train_mask & signal_mask, weight_column, None, bins)

        distribution.finish()

        # Empty proxy artists which only serve as legend handles
        legend_axis = distribution.axis
        marker_style = dict(xerr=0, yerr=0, elinewidth=5, mew=2, ecolor='w', fmt='o', markersize=20)
        p1 = legend_axis.errorbar([], [], mfc=bckgrd_color, mec=bckgrd_color,
                                  label=r'${\rm Test-Bkgr.}$', **marker_style)
        p2, = legend_axis.plot([], label=r'${\rm Train-Bkgr.}$', linewidth=5,
                               linestyle='dashed', c=bckgrd_color)
        p3 = legend_axis.errorbar([], [], mfc=signal_color, mec=signal_color,
                                  label=r'${\rm Test-Signal}$', **marker_style)
        p4, = legend_axis.plot([], label=r'${\rm Train-Signal}$', linewidth=5,
                               linestyle='solid', alpha=0.9, c=signal_color)

        legend_axis.legend([p1, p2, p3, p4], distribution.labels, loc='best', fancybox=True, framealpha=0.5, fontsize=60)

        # Train-minus-test normalized residuals: background on axis_d1, then signal on axis_d2
        panels = ((self.axis_d1, bckgrd_mask, bckgrd_color),
                  (self.axis_d2, signal_mask, signal_color))
        for panel_axis, class_mask, class_color in panels:
            self.plot_kwargs['color'] = class_color
            residuals = normalizedResiduals(self.figure, panel_axis, shift_to_zero=True, normed=True)
            residuals.set_plot_options(self.plot_kwargs)
            residuals.set_errorbar_options(self.errorbar_kwargs)
            residuals.set_errorband_options(self.errorband_kwargs)
            residuals.add(data, column, train_mask & class_mask, test_mask & class_mask, weight_column, None, bins, isNN)
            panel_axis.set_xlim((residuals.xmin, residuals.xmax))
            panel_axis.set_ylim((residuals.ymin, residuals.ymax))
            residuals.plots = residuals.labels = []
            residuals.finish(line_color=class_color)

        # NOTE: a Kolmogorov-Smirnov p-value annotation (scipy.stats.ks_2samp) for the
        # signal and background train/test samples used to be drawn here; it is disabled.

        return self

    def finish(self, xLabel=r'${\rm Classifier\ Output}$'):
        """
        Clears the titles of the residual panels, hides the x tick labels and x labels
        of the two upper panels and labels the x-axis of the lowest panel.
        @param xLabel label of the x-axis of the lowest panel
        """
        for residual_axis in (self.axis_d1, self.axis_d2):
            residual_axis.set_title("")

        # Only the lowest panel keeps its x tick labels and x label
        for upper_axis in (self.axis, self.axis_d1):
            matplotlib.artist.setp(upper_axis.get_xticklabels(), visible=False)
        for upper_axis in (self.axis, self.axis_d1):
            upper_axis.get_xaxis().set_label_text('')

        bottom_x_axis = self.axis_d2.get_xaxis()
        bottom_x_axis.set_label_text(xLabel, fontsize=85)
        bottom_x_axis.labelpad = 20
        bottom_x_axis.set_tick_params(labelsize=60)
        return self
1093
1094
class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes of the boxplots, one per added distribution
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super().__init__(figure, axis)

        #: Forwarded to the Distribution plot as normed_to_all_entries
        self.normed = normed
        #: Width of the shown window in units of the standard deviation (None: no restriction)
        self.range_in_std = range_in_std
        self.box_axes = []
        #: The distribution plot drawn on the main axis
        self.distribution = Distribution(self.figure, self.axis, normed_to_all_entries=self.normed, range_in_std=self.range_in_std)

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, forwarded to the Distribution plot
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # Re-layout: the main axis gets 3/4 of the height, every boxplot one row below it
        n = len(self.box_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            mean, std = histogram.weighted_mean_and_std(data[column], None if weight_column is None else data[weight_column])
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            in_window = (data[column] > (mean - self.range_in_std * std)) & (data[column] < (mean + self.range_in_std * std))
            # mask defaults to None, which cannot be combined with '&'; in that
            # case the window alone defines the selection
            mask = in_window if mask is None else mask & in_window
        box = Box(self.figure, box_axis)
        box.add(data, column, mask, weight_column)
        if len(box.plots) > 0:
            # Give the box the colour of the distribution which was just drawn
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axis.get_xticklabels(), visible=False)
        self.axis.get_xaxis().set_label_text('')
        # Only the lowest boxplot keeps its x tick labels and x label
        for box_axis in self.box_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        # Guard against finish() being called before any add()
        if self.box_axes:
            self.box_axes[-1].set_title("")
        self.axis.set_title("Distribution Plot")
        self.axis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                         loc='best', fancybox=True, framealpha=0.5)
        return self
1168
1169
class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: Axis showing the distributions of all (signal or background) events
    axis = None
    #: Axis showing the distributions of the signal events
    axis_d1 = None
    #: Axis showing the distributions of the background events
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(32, 18))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Three stacked panels sharing the x-axis
        gs = matplotlib.gridspec.GridSpec(3, 2)
        self.axis = self.figure.add_subplot(gs[0, :])
        self.axis_d1 = self.figure.add_subplot(gs[1, :], sharex=self.axis)
        self.axis_d2 = self.figure.add_subplot(gs[2, :], sharex=self.axis)

        super().__init__(self.figure, self.axis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100, defining the different cuts.
                         Currently unused: the cuts are always taken at the 5%, 10%, ..., 95% quantiles
        @param signal_mask boolean mask defining which events are signal events
        @param bckgrd_mask boolean mask defining which events are background events
        @param weight_column column in data containing the weights for each event
        """
        # NOTE(review): both masks are combined with '|' and used for indexing below,
        # so the None defaults do not look usable -- confirm and consider making them required.
        if len(data[cut_column]) == 0:
            B2WARNING("Ignore empty Correlation.")
            return self

        axes = [self.axis, self.axis_d1, self.axis_d2]
        colormap = plt.get_cmap('coolwarm')

        for i, (tag, m) in enumerate([('.', signal_mask | bckgrd_mask), ('S', signal_mask), ('B', bckgrd_mask)]):

            if weight_column is not None:
                weights = numpy.array(data[weight_column][m])
            else:
                weights = numpy.ones(len(data[column][m]))

            # The cast to float32 is a workaround for the following numpy issue:
            # https://github.com/numpy/numpy/issues/8123
            value_range = np.percentile(data[column][m], [5, 95]).astype(np.float32)

            # 'density' replaces the 'normed' keyword, which newer NumPy versions no
            # longer accept; for the equal-width bins used here both are equivalent.
            tmp, x = np.histogram(data[column][m], bins=100,
                                  range=value_range, density=True, weights=weights)
            bin_center = (x[1:] + x[:-1]) / 2
            axes[i].plot(bin_center, tmp, color='black', lw=1)

            # Successively tighter cuts on cut_column; the area between two consecutive
            # distributions is filled with a colour encoding the quantile
            for quantil in np.arange(5, 100, 5):
                cut = np.percentile(data[cut_column][m], quantil)
                sel = data[cut_column][m] >= cut
                y, x = np.histogram(data[column][m][sel], bins=100,
                                    range=value_range, density=True, weights=weights[sel])
                bin_center = (x[1:] + x[:-1]) / 2
                axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
                tmp = y

            axes[i].set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(data[column][m], data[cut_column][m], weights)
            axes[i].set_title(r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(tag, flatness_score))
        return self

    def finish(self):
        """
        Nothing to finalise for this plot; returns the plotter itself.
        """
        return self
1253
1254
class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot: all events are embedded into two dimensions once and
        the embedded points of every mask are drawn as a separate scatter plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input of the TSNE embedding
        @param masks different classes to show in TSNE
        """
        # Only the import is allowed to fail gracefully
        try:
            import sklearn.manifold
        except ImportError:
            print("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        # One row per event, one column per variable
        features = numpy.array([data[column] for column in columns]).T
        # sklearn's TSNE offers no transform() for a fitted model, so the
        # embedding is computed once and the rows of each class are picked from it
        embedding = model.fit_transform(features)
        for mask in masks:
            # assumes each mask has one entry per row of data, in row order -- TODO confirm
            selected = numpy.asarray(mask, dtype=bool)
            self.axis.scatter(embedding[selected, 0], embedding[selected, 1])
        return self

    def finish(self):
        """
        Nothing to finalise for this plot; returns the plotter itself.
        """
        return self
1286
1287
class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables, displayHeatMap):
        """
        Add a new importance plot. Every column is rescaled to the range [0, 100],
        the values are sorted in ascending order and drawn as a heatmap with the
        rounded value printed into the cells.
        @param data pandas.DataFrame containing all data
        @param columns columns of data holding the importances
        @param variables labels of the rows of data, used as y tick labels
        @param displayHeatMap if true, a colorbar (low/high) is added to the figure
        """
        self.figure.set_tight_layout(True)

        def norm(x):
            """Rescale x linearly to [0, 100]; a constant input yields zeros."""
            lowest = numpy.min(x)
            width = numpy.max(x) - lowest
            if width <= 0:
                return numpy.zeros(x.shape)
            return (x - lowest) / width * 100

        importance_matrix = numpy.vstack([norm(data[column]) for column in columns]).T
        n_rows, n_columns = importance_matrix.shape

        heat_colors = truncate_colormap(plt.get_cmap('RdBu'), 0.5, 0.85)

        # (value, label) pairs sorted by value; numpy turns both entries into strings
        labels = list(variables)
        labelsValues = np.array(sorted(
            [importance_matrix[row, col], labels[row]]
            for row in range(n_rows) for col in range(n_columns)))

        importance_heatmap = self.axis.pcolor(np.sort(importance_matrix, axis=0), cmap=heat_colors, vmin=0, vmax=100)

        # put the major ticks at the middle of each cell
        self.axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
        self.axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

        self.axis.set_xticklabels(columns, minor=False, rotation=90)

        # Few entries: larger fonts for the tick labels and the cell texts
        sorted_labels = labelsValues[:, 1]
        if labelsValues.shape[0] >= 6:
            CoeffSize = 33
            self.axis.set_yticklabels(sorted_labels, minor=False)
        else:
            CoeffSize = 50
            self.axis.set_yticklabels(sorted_labels, minor=False, size=58)

        self.axis.set_xticklabels([''])

        # The values are printed into the last column of the heatmap
        text_x = (n_columns - 1) + 0.5
        for row in range(labelsValues.shape[0]):
            self.axis.text(text_x, row + 0.5, r'$%.0f$' % float(labelsValues[row][0]),
                           size=CoeffSize,
                           horizontalalignment='center',
                           verticalalignment='center')

        if displayHeatMap:
            cb = self.figure.colorbar(importance_heatmap, ticks=[2, 98], orientation='vertical')
            cb.ax.tick_params(length=0)
            cb.ax.set_yticklabels([r'${\rm low}$', r'${\rm high}$'], size=60)

        self.axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Nothing to finalise for this plot; returns the plotter itself.
        """
        return self
1371
1372
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """
    Build a new colormap from a sub-range of an existing colormap.
    @param cmap colormap which is sampled; it is called with an array of positions and its name is used
    @param minval position in cmap at which the new colormap starts
    @param maxval position in cmap at which the new colormap ends
    @param n number of equally spaced positions which are sampled between minval and maxval
    @return the colormap created by LinearSegmentedColormap.from_list from the sampled colours
    """
    truncated_name = f'trunc({cmap.name},{minval:.2f},{maxval:.2f})'
    sampled_colors = cmap(np.linspace(minval, maxval, n))
    return matplotlib.colors.LinearSegmentedColormap.from_list(truncated_name, sampled_colors)
1378
1379
class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: Axis showing the correlation matrix of the signal events
    signal_axis = None
    #: Axis showing the correlation matrix of the background events
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is None:
            self.figure = matplotlib.figure.Figure(figsize=(38, 24))
            self.figure.set_tight_layout(True)
        else:
            self.figure = figure

        # Two matrices side by side; row 14 is left free and the last row holds the colorbar
        gs = matplotlib.gridspec.GridSpec(16, 2)
        self.signal_axis = self.figure.add_subplot(gs[:14, 0])
        self.bckgrd_axis = self.figure.add_subplot(gs[:14, 1], sharey=self.signal_axis)
        #: Axis which holds the colorbar
        self.colorbar_axis = self.figure.add_subplot(gs[15, :])
        self.axis = self.signal_axis

        super().__init__(self.figure, self.axis)

    @staticmethod
    def _clear_negative_zeros(matrix):
        """
        Set every entry which would be printed as '-0' to 0 (in place).
        """
        for index in numpy.ndindex(matrix.shape):
            if f'{matrix[index]:.0f}' == '-0':
                matrix[index] = 0

    @staticmethod
    def _annotate_cells(axis, matrix, coeff_size):
        """
        Write the rounded value of every matrix entry into the centre of its cell on axis.
        For matrices with more than 24 rows, negative values use a plain-text minus sign
        and a fixed font size of 25.
        """
        compact = matrix.shape[0] > 24
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                value = matrix[y, x]
                if compact and value < 0:
                    text, size = '-' + r'$%.0f$' % abs(value), 25
                else:
                    text, size = r'$%.0f$' % value, coeff_size
                axis.text(x + 0.5, y + 0.5, text,
                          size=size,
                          horizontalalignment='center',
                          verticalalignment='center')

    def add(self, data, columns, signal_mask, bckgrd_mask, bkgrOutput):
        """
        Add a new correlation plot.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean mask defining which events are signal events
        @param bckgrd_mask boolean mask defining which events are background events
        @param bkgrOutput if -1, the panels are captioned B0 / anti-B0, otherwise Signal / Background
        """
        signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column in columns])) * 100
        bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask] for column in columns])) * 100

        # Mirror the column order; the x tick labels below are reversed accordingly
        mirrored_signal_corr = signal_corr[:, ::-1].copy()
        mirrored_bckgrd_corr = bckgrd_corr[:, ::-1].copy()

        cRdBu = plt.get_cmap('RdBu')
        new_RdBu = truncate_colormap(cRdBu, 0.15, 0.85)
        signal_heatmap = self.signal_axis.pcolor(mirrored_signal_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)
        # NOTE(review): the corresponding heatmap for the background axis is disabled,
        # so only the numbers are drawn there -- confirm that this is intended:
        # self.bckgrd_axis.pcolor(mirrored_bckgrd_corr, cmap=new_RdBu, vmin=-100.0, vmax=100.0)

        # Avoid printing '-0' for tiny negative correlations
        self._clear_negative_zeros(mirrored_signal_corr)
        self._clear_negative_zeros(mirrored_bckgrd_corr)

        self.signal_axis.invert_yaxis()
        self.signal_axis.xaxis.tick_top()
        self.bckgrd_axis.invert_yaxis()
        self.bckgrd_axis.xaxis.tick_top()

        # put the major ticks at the middle of each cell
        self.signal_axis.set_xticks(numpy.arange(mirrored_signal_corr.shape[0]) + 0.5, minor=False)
        self.signal_axis.set_yticks(numpy.arange(mirrored_signal_corr.shape[1]) + 0.5, minor=False)

        # put the major ticks at the middle of each cell
        self.bckgrd_axis.set_xticks(numpy.arange(mirrored_bckgrd_corr.shape[0]) + 0.5, minor=False)
        self.bckgrd_axis.set_yticks(numpy.arange(mirrored_bckgrd_corr.shape[1]) + 0.5, minor=False)

        # Small matrices get larger fonts for the tick labels and the coefficients
        if mirrored_signal_corr.shape[0] < 8:
            CoeffSize = 50
            label_options = {'size': 58}
        else:
            CoeffSize = 30
            label_options = {}
        for axis in (self.bckgrd_axis, self.signal_axis):
            axis.set_xticklabels(list(reversed(columns)), minor=False, rotation=90, **label_options)
            axis.set_yticklabels(columns, minor=False, **label_options)

        # Each matrix is annotated on its own axis (negative background values
        # of large matrices were previously written onto the signal axis)
        self._annotate_cells(self.signal_axis, mirrored_signal_corr, CoeffSize)
        self._annotate_cells(self.bckgrd_axis, mirrored_bckgrd_corr, CoeffSize)

        cb = self.figure.colorbar(signal_heatmap, cax=self.colorbar_axis, ticks=[-92.3, 0, 92.5], orientation='horizontal')
        cb.ax.tick_params(length=0)
        cb.ax.set_xticklabels([r'${\rm negative}$', r'${\rm uncorrelated}$', r'${\rm positive}$'], fontsize=60)

        if bkgrOutput == -1:
            self.figure.text(0.30, 0.11, r'$B^0\,(q_{\rm MC} = +1)$', horizontalalignment='center', size=65)
            self.figure.text(0.74, 0.11, r'$\bar{B}^0\,(q_{\rm MC} = -1)$', horizontalalignment='center', size=65)

        else:
            self.figure.text(0.27, 0.115, r'${\rm Signal}$', horizontalalignment='center', size=65)
            self.figure.text(0.73, 0.115, r'${\rm Background}$', horizontalalignment='center', size=65)

        return self

    def finish(self):
        """
        Hides the y tick labels of the background axis, which shares its y-axis with the signal axis.
        """
        matplotlib.artist.setp(self.bckgrd_axis.get_yticklabels(), visible=False)
        return self
1528
1529
if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events; one half is drawn around 0, the other half around 1
        @param columns names of the columns, the last one holds the class label (0 or 1)
        """
        # Integer division: the result is used as an array dimension
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        # Shuffle the events
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    data['type'] = ''
    # Positional assignment on the frame itself (no chained indexing, integer bounds)
    type_position = data.columns.get_loc('type')
    data.iloc[:N // 2, type_position] = 'Train'
    data.iloc[N // 2:, type_position] = 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    p = PurityOverEfficiency()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_purity_plot.png')

    p = RejectionOverEfficiency()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('roc_rejection_plot.png')

    p = Diagonal()
    p.add(data, 'FastBDT', data['isSignal'] == 1, data['isSignal'] == 0)
    p.add(data, 'NeuroBayes', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('diagonal_plot.png')

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test')
    p.add(data, 'NeuroBayes', data['type'] == 'Train', data['type'] == 'Test')
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', data['type'] == 'Train', data['type'] == 'Test', data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation.add combines and indexes with both masks, so both are passed
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], data['isSignal'] == 1, data['isSignal'] == 0)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # signal_mask, bckgrd_mask and bkgrOutput are required arguments of CorrelationMatrix.add
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          data['isSignal'] == 1, data['isSignal'] == 0, 0)
    p.finish()
    p.save('correlation_matrix.png')
1616
1617# @endcond
def calculate_flatness(f, p, w=None)
def weighted_mean_and_std(x, w)
Definition: histogram.py:31
def poisson_error(n_tot)
Definition: histogram.py:24
Definition: plot.py:1