17 import matplotlib.pyplot
as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
25 from basf2_mva_evaluation
import histogram
35 matplotlib.rcParams.update({
'font.size': 36})
38 plt.style.use(
"belle2")
43 Base class for all Plotters.
77 Creates a new figure and axis if None is given, sets the default plot parameters
78 @param figure default draw figure which is used
79 @param axis default draw axis which is used
81 b2.B2INFO(
"Create new figure for class " + str(type(self)))
83 self.
figurefigure = matplotlib.figure.Figure(figsize=(32, 18))
84 self.
figurefigure.set_tight_layout(
False)
95 self.
xminxmin, self.
xmaxxmax = float(0), float(1)
96 self.
yminymin, self.
ymaxymax = float(0), float(1)
118 Adds a new subplot to the figure, updates all other axes
119 according to the given gridspec
120 @param gridspecs gridspecs for all axes including the new one
122 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
123 ax.set_position(gs.get_position(self.
figurefigure))
124 ax.set_subplotspec(gs)
130 Save the figure into a file
131 @param filename of the file
133 b2.B2INFO(
"Save figure for class " + str(type(self)))
134 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
135 canvas = FigureCanvas(self.
figurefigure)
136 canvas.print_figure(filename, dpi=50)
141 Overrides default plot options for datapoint plot
142 @param plot_kwargs keyword arguments for the plot function
144 self.
plot_kwargsplot_kwargs = copy.copy(plot_kwargs)
149 Overrides default errorbar options for datapoint errorbars
150 @param errorbar_kwargs keyword arguments for the errorbar function
157 Overrides default errorband options for datapoint errorband
158 @param errorband_kwargs keyword arguments for the fill_between function
165 Overrides default fill_between options for datapoint errorband
166 @param fill_kwargs keyword arguments for the fill_between function
168 self.
fill_kwargsfill_kwargs = copy.copy(fill_kwargs)
173 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
174 @param x coordinates of the data points
175 @param y coordinates of the data points
176 @param xerr symmetric error on x data points
177 @param yerr symmetric error on y data points
180 plot_kwargs = copy.copy(self.
plot_kwargsplot_kwargs)
183 fill_kwargs = copy.copy(self.
fill_kwargsfill_kwargs)
185 if plot_kwargs
is None or 'color' not in plot_kwargs:
186 color = next(axis._get_lines.prop_cycler)
187 color = color[
'color']
188 plot_kwargs[
'color'] = color
190 color = plot_kwargs[
'color']
191 color = matplotlib.colors.ColorConverter().to_rgb(color)
192 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
193 patch.get_color = patch.get_facecolor
196 if plot_kwargs
is not None:
197 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
200 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
201 if 'color' not in errorbar_kwargs:
202 errorbar_kwargs[
'color'] = color
203 if 'ecolor' not in errorbar_kwargs:
204 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
209 if not isinstance(xerr, (numpy.ndarray, list)):
210 xerr = xerr*numpy.ones(len(x))
211 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
213 e = axis.errorbar(x[mask], y[mask], xerr=xerr[mask], yerr=yerr[mask], rasterized=
True, **errorbar_kwargs)
216 if errorband_kwargs
is not None and yerr
is not None:
217 if 'color' not in errorband_kwargs:
218 errorband_kwargs[
'color'] = color
223 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
224 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
227 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
229 if fill_kwargs
is not None:
230 axis.fill_between(x, y, 0, rasterized=
True, **fill_kwargs)
232 return (tuple(patches), p, e, f)
234 def add(self, *args, **kwargs):
236 Add a new plot to this plotter
238 return NotImplemented
242 Finish plotting and set labels, legends and stuff
244 return NotImplemented
248 Scale limits to increase distance to boundaries
259 Plots the purity and the efficiency over the cut value (for cut choosing)
266 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
268 Add a new curve to the plot
269 @param data pandas.DataFrame containing all data
270 @param column which is used to calculate efficiency and purity for different cuts
271 @param signal_mask boolean numpy.array defining which events are signal events
272 @param bckgrd_mask boolean numpy.array defining which events are background events
273 @param weight_column column in data containing the weights for each event
276 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
279 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
280 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
282 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
283 purity, purity_error = hists.get_false_positives([
'Background'])
285 cuts = hists.bin_centers
287 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.
xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmaxxmaxxmax])
288 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.
yminymin]), \
289 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
291 self.
plotsplots.append(self.
_plot_datapoints_plot_datapoints(self.
axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
294 self.
labelslabels.append(
"Efficiency")
296 self.
labelslabels.append(
"True positive")
301 self.
labelslabels.append(
"Purity")
303 self.
labelslabels.append(
"False positive")
309 Sets limits, title, axis-labels and legend of the plot
313 self.
axisaxis.set_title(
"Classification Plot")
314 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
315 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
321 Plots the signal to noise ratio over the cut value (for cut choosing)
328 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
330 Add a new curve to the plot
331 @param data pandas.DataFrame containing all data
332 @param column which is used to calculate signal to noise ratio for different cuts
333 @param signal_mask boolean numpy.array defining which events are signal events
334 @param bckgrd_mask boolean numpy.array defining which events are background events
335 @param weight_column column in data containing the weights for each event
338 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
340 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
342 cuts = hists.bin_centers
344 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.
xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmaxxmaxxmax])
345 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.
yminymin]), \
346 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymaxymaxymax])
348 self.
plotsplots.append(self.
_plot_datapoints_plot_datapoints(self.
axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
350 self.
labelslabels.append(column)
356 Sets limits, title, axis-labels and legend of the plot
360 self.
axisaxis.set_title(
"Signal to Noise Plot")
361 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
362 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
368 Plots the purity over the efficiency also known as ROC curve
375 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
377 Add a new curve to the ROC plot
378 @param data pandas.DataFrame containing all data
379 @param column which is used to calculate efficiency and purity for different cuts
380 @param signal_mask boolean numpy.array defining which events are signal events
381 @param bckgrd_mask boolean numpy.array defining which events are background events
382 @param weight_column column in data containing the weights for each event
384 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
385 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
386 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
388 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.
xminxmin]), numpy.nanmax([efficiency.max(), self.
xmaxxmaxxmax])
389 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.
yminymin]), numpy.nanmax([numpy.nanmax(purity), self.
ymaxymaxymax])
391 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
392 self.
plotsplots.append(p)
393 if label
is not None:
394 self.
labelslabels.append(label)
396 self.
labelslabels.append(column)
401 Sets limits, title, axis-labels and legend of the plot
405 self.
axisaxis.set_title(
"ROC Purity Plot")
406 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
407 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
408 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
414 Plots the rejection over the efficiency also known as ROC curve
421 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
423 Add a new curve to the ROC plot
424 @param data pandas.DataFrame containing all data
425 @param column which is used to calculate efficiency and rejection for different cuts
426 @param signal_mask boolean numpy.array defining which events are signal events
427 @param bckgrd_mask boolean numpy.array defining which events are background events
428 @param weight_column column in data containing the weights for each event
430 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
431 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
432 rejection, rejection_error = hists.get_efficiency([
'Background'])
433 rejection = 1 - rejection
434 if isinstance(efficiency, int)
and not isinstance(rejection, int):
435 efficiency = numpy.array([efficiency] * len(rejection))
436 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
437 rejection = numpy.array([rejection] * len(efficiency))
438 elif isinstance(rejection, int)
and isinstance(efficiency, int):
439 efficiency = numpy.array([efficiency])
440 rejection = numpy.array([rejection])
442 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.
xminxmin]), numpy.nanmax([efficiency.max(), self.
xmaxxmaxxmax])
443 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([rejection.min(), self.
yminymin]), numpy.nanmax([rejection.max(), self.
ymaxymaxymax])
445 auc = numpy.abs(numpy.trapz(rejection, efficiency))
447 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
448 self.
plotsplots.append(p)
449 if label
is not None:
450 self.
labelslabels.append(label[:10] +
" ({:.2f})".format(auc))
452 self.
labelslabels.append(column[:10] +
" ({:.2f})".format(auc))
457 Sets limits, title, axis-labels and legend of the plot
461 self.
axisaxis.set_title(
"ROC Rejection Plot")
462 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
463 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
464 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
470 Plots multiple other plots into a grid with up to three columns
477 def __init__(self, cls, number_of_plots, figure=None):
479 Creates a new figure if None is given, sets the default plot parameters
480 @param figure default draw figure which is used
483 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
488 if number_of_plots == 1:
489 gs = matplotlib.gridspec.GridSpec(1, 1)
490 elif number_of_plots == 2:
491 gs = matplotlib.gridspec.GridSpec(1, 2)
492 elif number_of_plots == 3:
493 gs = matplotlib.gridspec.GridSpec(1, 3)
495 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
502 def add(self, i, *args, **kwargs):
504 Call add function of ith subplot
505 @param i position of the subplot
511 Sets limits, title, axis-labels and legend of the plot
520 Plots the purity in each bin over the classifier output.
527 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
529 Add a new curve to the Diagonal plot
530 @param data pandas.DataFrame containing all data
531 @param column which is used to calculate purity for different cuts
532 @param signal_mask boolean numpy.array defining which events are signal events
533 @param bckgrd_mask boolean numpy.array defining which events are background events
534 @param weight_column column in data containing the weights for each event
536 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
537 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
543 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
544 self.
plotsplots.append(p)
545 self.
labelslabels.append(column)
550 Sets limits, title, axis-labels and legend of the plot
553 self.
axisaxis.
plot((0.0, 1.0), (0.0, 1.0), color=
'black')
556 self.
axisaxis.set_title(
"Diagonal Plot")
557 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
558 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
559 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
565 Plots distribution of a quantity
568 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
569 keep_first_binning=False, range_in_std=None):
571 Creates a new figure and axis if None is given, sets the default plot parameters
572 @param figure default draw figure which is used
573 @param axis default draw axis which is used
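574 @param normed_to_all_entries true if histograms should be normalized to the total number of entries before drawing
@param normed_to_bin_width true if histograms should be divided by the bin width before drawing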
575 @param keep_first_binning use the binning of the first distribution for further plots
576 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
578 super(Distribution, self).
__init__(figure, axis)
601 def add(self, data, column, mask=None, weight_column=None, label=None):
603 Add a new distribution to the plots
604 @param data pandas.DataFrame containing all data
605 @param column which is used to calculate distribution histogram
606 @param mask boolean numpy.array defining which events are used for the histogram
607 @param weight_column column in data containing the weights for each event
610 mask = numpy.ones(len(data)).astype(
'bool')
616 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_stdrange_in_std)
619 hist, hist_error = hists.get_hist(
'Total')
622 normalization = float(numpy.sum(hist))
623 hist = hist / normalization
624 hist_error = hist_error / normalization
627 hist = hist / hists.bin_widths
628 hist_error = hist_error / hists.bin_widths
632 self.
ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.
ymaxymaxymax])
634 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
635 self.
plotsplots.append(p)
640 appendix =
' No data to plot!'
643 self.
labelslabels.append(column + appendix)
645 self.
labelslabels.append(label + appendix)
650 Sets limits, title, axis-labels and legend of the plot
652 self.
axisaxis.set_title(
"Distribution Plot")
655 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
658 self.
axisaxis.set_xlim((0., 1.))
659 self.
axisaxis.set_ylim((0., 1.))
660 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
669 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
671 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
673 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
675 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
689 Creates a new figure and axis if None is given, sets the default plot parameters
690 @param figure default draw figure which is used
691 @param axis default draw axis which is used
693 super().
__init__(figure=figure, axis=axis)
698 def add(self, data, column, mask=None, weight_column=None):
700 Add a new boxplot to the plots
701 @param data pandas.DataFrame containing all data
702 @param column which is used to calculate boxplot quantities
703 @param mask boolean numpy.array defining which events are used for the histogram
704 @param weight_column column in data containing the weights for each event
707 mask = numpy.ones(len(data)).astype(
'bool')
708 x = data[column][mask]
709 if weight_column
is not None:
711 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
714 b2.B2WARNING(
"Ignore empty boxplot.")
717 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
718 boxprops=dict(facecolor=
'blue', alpha=0.5),
722 self.
plotsplots.append(p)
723 self.
labelslabels.append(column)
726 self.axis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
727 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
728 self.axis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
729 x.quantile(0.75) - x.quantile(0.25)),
730 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
731 self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
732 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
739 Sets limits, title, axis-labels and legend of the plot
741 matplotlib.artist.setp(self.
axisaxis.get_yaxis(), visible=
False)
743 self.
axisaxis.set_title(
"Box Plot")
749 Plots the difference between two histograms
764 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
766 Creates a new figure and axis if None is given, sets the default plot parameters
767 @param figure default draw figure which is used
768 @param axis default draw axis which is used
769 @param normed normalize minuend and subtrahend before comparing them
770 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
772 super(Difference, self).
__init__(figure, axis)
782 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
784 Add a new difference plot
785 @param data pandas.DataFrame containing all data
786 @param column which is used to calculate distribution histogram
787 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
788 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
789 @param weight_column column in data containing the weights for each event
790 @param label label for the legend if None, the column name is used
792 hists =
histogram.Histograms(data, column, {
'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
793 weight_column=weight_column, equal_frequency=
False)
794 minuend, minuend_error = hists.get_hist(
'Minuend')
795 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
799 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
800 minuend = minuend / numpy.sum(minuend)
801 subtrahend = subtrahend / numpy.sum(subtrahend)
802 difference = minuend - subtrahend
805 difference = difference - numpy.mean(difference)
808 self.
yminyminymin = min((difference - difference_error).min(), self.
yminyminymin)
809 self.
ymaxymaxymax = max((difference + difference_error).max(), self.
ymaxymaxymax)
811 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
812 self.
plotsplots.append(p)
814 self.
labelslabels.append(label)
816 self.
labelslabels.append(column)
822 Sets limits, title, axis-labels and legend of the plot
824 self.
axisaxis.
plot((self.
xminxmin, self.
xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=
True)
828 self.
axisaxis.set_title(
"Difference Plot")
829 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
831 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
832 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
838 Create TMVA-like overtraining control plot for a classification training
852 Creates a new figure if None is given, sets the default plot parameters
853 @param figure default draw figure which is used
856 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
861 gs = matplotlib.gridspec.GridSpec(5, 1)
868 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
870 Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
871 otherwise there are too many curves in the plot to recognize anything.
872 @param data pandas.DataFrame containing all data
873 @param column which is used to calculate distribution histogram
874 @param train_mask boolean numpy.array defining which events are training events
875 @param test_mask boolean numpy.array defining which events are test events
876 @param signal_mask boolean numpy.array defining which events are signal events
877 @param bckgrd_mask boolean numpy.array defining which events are background events
878 @param weight_column column in data containing the weights for each event
882 distribution.set_plot_options(self.
plot_kwargsplot_kwargs)
885 distribution.add(data, column, test_mask & signal_mask, weight_column)
886 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
888 distribution.set_plot_options(
889 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
890 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
891 distribution.set_errorbar_options(
None)
892 distribution.set_errorband_options(
None)
893 distribution.add(data, column, train_mask & signal_mask, weight_column)
894 distribution.set_plot_options(
895 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
896 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
897 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
899 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
900 distribution.finish()
902 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
904 difference_signal.set_plot_options(self.
plot_kwargsplot_kwargs)
905 difference_signal.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
906 difference_signal.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
907 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
908 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
909 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
910 difference_signal.plots = difference_signal.labels = []
911 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
913 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
915 difference_bckgrd.set_plot_options(self.
plot_kwargsplot_kwargs)
916 difference_bckgrd.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
917 difference_bckgrd.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
918 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
919 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
920 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
921 difference_bckgrd.plots = difference_bckgrd.labels = []
922 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
927 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
928 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
930 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
931 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
932 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
933 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
934 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
935 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
937 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
938 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
939 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
941 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
943 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
949 Sets limits, title, axis-labels and legend of the plot
951 self.
axisaxisaxis.set_title(
"Overtraining Plot")
952 self.
axis_d1axis_d1.set_title(
"")
953 self.
axis_d2axis_d2.set_title(
"")
954 matplotlib.artist.setp(self.
axisaxisaxis.get_xticklabels(), visible=
False)
955 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
956 self.
axisaxisaxis.get_xaxis().set_label_text(
'')
957 self.
axis_d1axis_d1.get_xaxis().set_label_text(
'')
958 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
964 Plots distribution of a quantity including boxplots
970 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
972 Creates a new figure and axis if None is given, sets the default plot parameters
973 @param figure default draw figure which is used
974 @param axis default draw axis which is used
975 @param normed true if the histograms should be normed before drawing
976 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
978 super(VerboseDistribution, self).
__init__(figure, axis)
987 def add(self, data, column, mask=None, weight_column=None, label=None):
989 Add a new distribution plot, with additional information like a boxplot compared to
990 the ordinary Distribution plot.
991 @param data pandas.DataFrame containing all data
992 @param column which is used to calculate distribution histogram
993 @param mask boolean numpy.array defining which events are used for the distribution histogram
994 @param weight_column column in data containing the weights for each event
999 self.
distributiondistribution.
add(data, column, mask, weight_column, label=label)
1002 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1003 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :]
for i
in range(n)]
1009 mask = mask & (data[column] > (mean - self.
range_in_stdrange_in_std * std)) & (data[column] < (mean + self.
range_in_stdrange_in_std * std))
1011 box.add(data, column, mask, weight_column)
1012 if len(box.plots) > 0:
1013 box.plots[0][
'boxes'][0].set_facecolor(self.
distributiondistribution.plots[-1][0][0].get_color())
1016 self.
box_axesbox_axes.append(box_axis)
1021 Sets limits, title, axis-labels and legend of the plot
1024 matplotlib.artist.setp(self.
axisaxis.get_xticklabels(), visible=
False)
1025 self.
axisaxis.get_xaxis().set_label_text(
'')
1026 for box_axis
in self.
box_axesbox_axes[:-1]:
1027 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1028 box_axis.set_title(
"")
1029 box_axis.get_xaxis().set_label_text(
'')
1030 self.
box_axesbox_axes[-1].set_title(
"")
1031 self.
axisaxis.set_title(
"Distribution Plot")
1033 loc=
'best', fancybox=
True, framealpha=0.5)
1039 Plots change of a distribution of a quantity depending on the cut on a classifier
1052 Creates a new figure if None is given, sets the default plot parameters
1053 @param figure default draw figure which is used
1056 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1061 gs = matplotlib.gridspec.GridSpec(3, 2)
1068 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1070 Add a new correlation plot.
1071 @param data pandas.DataFrame containing all data
1072 @param column which is used to calculate distribution histogram
1073 @param cut_column which is used to calculate cut on the other quantity defined by column
1074 @param quantiles list of quantiles between 0 and 100, defining the different cuts
1075 @param weight_column column in data containing the weights for each event
1077 if len(data[cut_column]) == 0:
1078 b2.B2WARNING(
"Ignore empty Correlation.")
1083 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1085 if weight_column
is not None:
1086 weights = numpy.array(data[weight_column][m])
1088 weights = numpy.ones(len(data[column][m]))
1090 xrange = numpy.percentile(data[column][m], [5, 95])
1092 colormap = plt.get_cmap(
'coolwarm')
1093 tmp, x = numpy.histogram(data[column][m], bins=100,
1094 range=xrange, normed=
True, weights=weights)
1095 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1096 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1098 for quantil
in numpy.arange(5, 100, 5):
1099 cut = numpy.percentile(data[cut_column][m], quantil)
1100 sel = data[cut_column][m] >= cut
1101 y, x = numpy.histogram(data[column][m][sel], bins=100,
1102 range=xrange, normed=
True, weights=weights[sel])
1103 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1104 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1107 axes[i].set_ylim(bottom=0)
1110 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1115 Sets limits, title, axis-labels and legend of the plot
1122 Plots multivariate distribution using TSNE algorithm
1125 def add(self, data, columns, *masks):
1127 Add a new TSNE plot.
1128 @param data pandas.DataFrame containing all data
1129 @param columns which are used as input features of the TSNE model
1130 @param masks different classes to show in TSNE
1134 import sklearn.manifold
1135 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1136 data = numpy.array([data[column]
for column
in columns]).T
1139 data = numpy.array([data[column][mask]
for column
in columns]).T
1140 data = model.transform(data)
1141 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1143 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1148 Sets limits, title, axis-labels and legend of the plot
1155 Plots importance matrix
1158 def add(self, data, columns, variables):
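1160 Add a new importance matrix plot.
1161 @param data pandas.DataFrame containing all data
1162 @param columns columns of data which are each rescaled to the range 0 to 100 and drawn as the columns of the heatmap (x tick labels)
@param variables labels of the rows of the heatmap (y tick labels), one per entry of data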
1164 self.
figurefigure.set_tight_layout(
True)
1167 width = (numpy.max(x) - numpy.min(x))
1169 return numpy.zeros(x.shape)
1170 return (x - numpy.min(x)) / width * 100
1172 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1173 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1177 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1178 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1180 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1181 self.
axisaxis.set_yticklabels(variables, minor=
False)
1183 self.
axisaxis.xaxis.tick_top()
1185 for y
in range(importance_matrix.shape[0]):
1186 for x
in range(importance_matrix.shape[1]):
1187 self.
axisaxis.text(x + 0.5, y + 0.5,
'%.0f' % importance_matrix[y, x],
1189 horizontalalignment=
'center',
1190 verticalalignment=
'center')
1192 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1193 cb.ax.set_yticklabels([
'low',
'high'])
1195 self.
axisaxis.set_aspect(
'equal')
1201 Sets limits, title, axis-labels and legend of the plot
1208 Plots correlation matrix
1219 Creates a new figure if None is given, sets the default plot parameters
1220 @param figure default draw figure which is used
1223 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1228 gs = matplotlib.gridspec.GridSpec(8, 2)
1238 def add(self, data, columns, signal_mask, bckgrd_mask):
1240 Add a new correlation plot.
1241 @param data pandas.DataFrame containing all data
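1242 @param columns which are used to calculate the correlations
@param signal_mask selection of the entries used for the signal correlation matrix
@param bckgrd_mask selection of the entries used for the background correlation matrix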
1244 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask]
for column
in columns])) * 100
1245 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1247 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1248 self.
bckgrd_axisbckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1256 self.
signal_axissignal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=
False)
1257 self.
signal_axissignal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=
False)
1259 self.
signal_axissignal_axis.set_xticklabels(columns, minor=
False, rotation=90)
1260 self.
signal_axissignal_axis.set_yticklabels(columns, minor=
False)
1263 self.
bckgrd_axisbckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=
False)
1264 self.
bckgrd_axisbckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=
False)
1266 self.
bckgrd_axisbckgrd_axis.set_xticklabels(columns, minor=
False, rotation=90)
1267 self.
bckgrd_axisbckgrd_axis.set_yticklabels(columns, minor=
False)
1269 for y
in range(signal_corr.shape[0]):
1270 for x
in range(signal_corr.shape[1]):
1271 self.
signal_axissignal_axis.text(x + 0.5, y + 0.5,
'%.0f' % signal_corr[y, x],
1273 horizontalalignment=
'center',
1274 verticalalignment=
'center')
1276 for y
in range(bckgrd_corr.shape[0]):
1277 for x
in range(bckgrd_corr.shape[1]):
1278 self.
bckgrd_axisbckgrd_axis.text(x + 0.5, y + 0.5,
'%.0f' % bckgrd_corr[y, x],
1280 horizontalalignment=
'center',
1281 verticalalignment=
'center')
1283 cb = self.
figurefigurefigure.colorbar(signal_heatmap, cax=self.
colorbar_axiscolorbar_axis, ticks=[-100, 0, 100], orientation=
'horizontal')
1284 cb.solids.set_rasterized(
True)
1285 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1287 self.
signal_axissignal_axis.text(0.5, -1.0,
"Signal", horizontalalignment=
'center')
1288 self.
bckgrd_axisbckgrd_axis.text(0.5, -1.0,
"Background", horizontalalignment=
'center')
1294 Sets limits, title, axis-labels and legend of the plot
1296 matplotlib.artist.setp(self.
bckgrd_axisbckgrd_axis.get_yticklabels(), visible=
False)
1300 if __name__ ==
'__main__':
1302 def get_data(N, columns):
1304 Creates fake data for example plots
1307 n = len(columns) - 1
1308 xs = numpy.random.normal(0, size=(N, n))
1309 xb = numpy.random.normal(1, size=(N, n))
1312 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1313 return data.reindex(numpy.random.permutation(data.index))
1317 seaborn.set(font_scale=3)
1318 seaborn.set_style(
'whitegrid')
1322 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
# Label the first half of the sample as training data and the rest as test data.
# The split index uses integer division because positional slices must be ints
# in Python 3 (N / 2 is a float there). Writing through DataFrame.iloc modifies
# `data` itself instead of a temporary Series (chained assignment).
type_position = data.columns.get_loc('type')
data.iloc[:N // 2, type_position] = 'Train'
data.iloc[N // 2:, type_position] = 'Test'
1328 p.add(data,
'FastBDT')
1330 p.save(
'box_plot.png')
1333 p.add(data,
'FastBDT')
1334 p.add(data,
'NeuroBayes')
1336 p.save(
'verbose_distribution_plot.png')
1339 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1340 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1342 p.save(
'roc_purity_plot.png')
1345 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1346 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1348 p.save(
'roc_rejection_plot.png')
1351 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1352 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1354 p.save(
'diagonal_plot.png')
1357 p.add(data,
'FastBDT')
1358 p.add(data,
'NeuroBayes')
1360 p.save(
'distribution_plot.png')
1363 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1364 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1366 p.save(
'difference_plot.png')
1369 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1371 p.save(
'overtraining_plot.png')
1374 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1376 p.save(
'correlation_plot.png')
1379 data[
'FastBDT2'] = data[
'FastBDT']**2
1380 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1381 data[
'FastBDT3'] = data[
'FastBDT']**3
1382 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1383 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1385 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
Main axis which shows the correlation of the signal samples.
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
figure
figure which is used to draw
def __init__(self, figure=None)
bckgrd_axis
Axis which shows the correlation of the background samples.
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
Axis which shows shape of signal.
figure
figure which is used to draw
def __init__(self, figure=None)
axis_d2
Axis which shows shape of background.
axis
Main axis which is used to draw.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Keep first binning if user wants so.
normed_to_all_entries
Normalize histograms before drawing them.
first_binning
first binning
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
figure
figure which is used to draw
def __init__(self, cls, number_of_plots, figure=None)
sub_plots
the subplots which are displayed in the grid
axis_d1
Axis which shows the difference between training and test signal.
figure
figure which is used to draw
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
def __init__(self, figure=None)
axis_d2
Axis which shows the difference between training and test background.
axis
Main axis which is used to draw.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
Plots added to the axis so far.
figure
figure which is used to draw
errorband_kwargs
Default keyword arguments for errorband function.
def add(self, *args, **kwargs)
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
Labels of the plots added so far.
axis
Main axis which is used to draw.
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
box_axes
Axes for the boxplots.
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)