17 import matplotlib.pyplot
as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
25 from basf2_mva_evaluation
import histogram
35 matplotlib.rcParams.update({
'font.size': 36})
38 plt.style.use(
"belle2")
43 Base class for all Plotters.
77 Creates a new figure and axis if None is given, sets the default plot parameters
78 @param figure default draw figure which is used
79 @param axis default draw axis which is used
81 b2.B2INFO(
"Create new figure for class " + str(type(self)))
83 self.
figurefigure = matplotlib.figure.Figure(figsize=(32, 18))
84 self.
figurefigure.set_tight_layout(
False)
95 self.
xminxmin, self.
xmaxxmax = float(0), float(1)
96 self.
yminymin, self.
ymaxymax = float(0), float(1)
118 Adds a new subplot to the figure, updates all other axes
119 according to the given gridspec
120 @param gridspecs gridspecs for all axes including the new one
122 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
123 ax.set_position(gs.get_position(self.
figurefigure))
124 ax.set_subplotspec(gs)
130 Save the figure into a file
131 @param filename of the file
133 b2.B2INFO(
"Save figure for class " + str(type(self)))
134 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
135 canvas = FigureCanvas(self.
figurefigure)
136 canvas.print_figure(filename, dpi=50)
141 Overrides default plot options for datapoint plot
142 @param plot_kwargs keyword arguments for the plot function
144 self.
plot_kwargsplot_kwargs = copy.copy(plot_kwargs)
149 Overrides default errorbar options for datapoint errorbars
150 @param errorbar_kwargs keyword arguments for the errorbar function
157 Overrides default errorband options for datapoint errorband
158 @param errorbar_kwargs keyword arguments for the fill_between function
165 Overrides default fill_between options for datapoint errorband
166 @param fill_kwargs keyword arguments for the fill_between function
168 self.
fill_kwargsfill_kwargs = copy.copy(fill_kwargs)
173 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
174 @param x coordinates of the data points
175 @param y coordinates of the data points
176 @param xerr symmetric error on x data points
177 @param yerr symmetric error on y data points
180 plot_kwargs = copy.copy(self.
plot_kwargsplot_kwargs)
183 fill_kwargs = copy.copy(self.
fill_kwargsfill_kwargs)
185 if plot_kwargs
is None or 'color' not in plot_kwargs:
186 color = next(axis._get_lines.prop_cycler)
187 color = color[
'color']
188 plot_kwargs[
'color'] = color
190 color = plot_kwargs[
'color']
191 color = matplotlib.colors.ColorConverter().to_rgb(color)
192 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
193 patch.get_color = patch.get_facecolor
196 if plot_kwargs
is not None:
197 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
200 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
201 if 'color' not in errorbar_kwargs:
202 errorbar_kwargs[
'color'] = color
203 if 'ecolor' not in errorbar_kwargs:
204 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
205 e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, rasterized=
True, **errorbar_kwargs)
208 if errorband_kwargs
is not None and yerr
is not None:
209 if 'color' not in errorband_kwargs:
210 errorband_kwargs[
'color'] = color
215 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
216 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
219 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
221 if fill_kwargs
is not None:
222 axis.fill_between(x, y, 0, rasterized=
True, **fill_kwargs)
224 return (tuple(patches), p, e, f)
def add(self, *args, **kwargs):
    """
    Add a new plot to this plotter.

    Placeholder implementation: accepts arbitrary positional and keyword
    arguments, ignores them, and returns the NotImplemented constant.
    """
    # NOTE(review): NotImplemented is a constant, not an exception, so a call that reaches
    # this method does not fail loudly - presumably subclasses are meant to override it; confirm.
    return NotImplemented
234 Finish plotting and set labels, legends and stuff
236 return NotImplemented
240 Scale limits to increase distance to boundaries
251 Plots the purity and the efficiency over the cut value (for cut choosing)
258 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
260 Add a new curve to the plot
261 @param data pandas.DataFrame containing all data
262 @param column which is used to calculate efficiency and purity for different cuts
263 @param signal_mask boolean numpy.array defining which events are signal events
264 @param bckgrd_mask boolean numpy.array defining which events are background events
265 @param weight_column column in data containing the weights for each event
268 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
271 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
272 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
274 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
275 purity, purity_error = hists.get_false_positives([
'Background'])
277 cuts = hists.bin_centers
279 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.
xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmaxxmaxxmax])
280 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.
yminymin]), \
281 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
283 self.
plotsplots.append(self.
_plot_datapoints_plot_datapoints(self.
axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
286 self.
labelslabels.append(
"Efficiency")
288 self.
labelslabels.append(
"True positive")
293 self.
labelslabels.append(
"Purity")
295 self.
labelslabels.append(
"False positive")
301 Sets limits, title, axis-labels and legend of the plot
305 self.
axisaxis.set_title(
"Classification Plot")
306 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
307 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
313 Plots the signal to noise ratio over the cut value (for cut choosing)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
    """
    Add a new signal-to-noise curve to the plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate signal to noise ratio for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param normed accepted in the signature but not read by this method
    """
    masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
    hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

    ratio, ratio_error = hists.get_signal_to_noise(['Signal'], ['Background'])
    cut_values = hists.bin_centers

    # Widen the stored limits so they also cover this curve; the nan-aware
    # reductions skip NaN entries instead of propagating them.
    lower_x = numpy.nanmin([numpy.nanmin(cut_values), self.xmin])
    upper_x = numpy.nanmax([numpy.nanmax(cut_values), self.xmax])
    lower_y = numpy.nanmin([numpy.nanmin(ratio), self.ymin])
    upper_y = numpy.nanmax([numpy.nanmax(ratio), self.ymax])
    self.xmin, self.xmax = lower_x, upper_x
    self.ymin, self.ymax = lower_y, upper_y

    drawn = self._plot_datapoints(self.axis, cut_values, ratio, xerr=0, yerr=ratio_error)
    self.plots.append(drawn)

    # The legend entry of this curve is the column name.
    self.labels.append(column)
348 Sets limits, title, axis-labels and legend of the plot
352 self.
axisaxis.set_title(
"Signal to Noise Plot")
353 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
354 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
360 Plots the purity over the efficiency also known as ROC curve
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new purity-over-efficiency curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and purity for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label legend entry for the curve; if None, the column name is used
    """
    masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
    hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    purity, purity_error = hists.get_purity(['Signal'], ['Background'])

    # Widen the stored limits so they also cover this curve (x: efficiency, y: purity).
    lower_x = numpy.nanmin([efficiency.min(), self.xmin])
    upper_x = numpy.nanmax([efficiency.max(), self.xmax])
    lower_y = numpy.nanmin([numpy.nanmin(purity), self.ymin])
    upper_y = numpy.nanmax([numpy.nanmax(purity), self.ymax])
    self.xmin, self.xmax = lower_x, upper_x
    self.ymin, self.ymax = lower_y, upper_y

    drawn = self._plot_datapoints(self.axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
    self.plots.append(drawn)

    # An explicit label wins; otherwise fall back to the column name.
    self.labels.append(column if label is None else label)
393 Sets limits, title, axis-labels and legend of the plot
397 self.
axisaxis.set_title(
"ROC Purity Plot")
398 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
399 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
400 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
406 Plots the rejection over the efficiency also known as ROC curve
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
    """
    Add a new background-rejection-over-signal-efficiency curve to the ROC plot
    @param data pandas.DataFrame containing all data
    @param column which is used to calculate efficiency and rejection for different cuts
    @param signal_mask boolean numpy.array defining which events are signal events
    @param bckgrd_mask boolean numpy.array defining which events are background events
    @param weight_column column in data containing the weights for each event
    @param label legend entry for the curve; if None, the column name is used.
                 Only the first 10 characters are shown, followed by the area under the curve.
    """
    masks = {'Signal': signal_mask, 'Background': bckgrd_mask}
    hists = histogram.Histograms(data, column, masks, weight_column=weight_column)

    efficiency, efficiency_error = hists.get_efficiency(['Signal'])
    rejection, rejection_error = hists.get_efficiency(['Background'])
    # Rejection is the complement of the background efficiency.
    rejection = 1 - rejection

    # Either quantity may arrive as a plain int instead of an array; expand it
    # so that both end up as arrays of matching length.
    efficiency_is_int = isinstance(efficiency, int)
    rejection_is_int = isinstance(rejection, int)
    if efficiency_is_int and rejection_is_int:
        efficiency = numpy.array([efficiency])
        rejection = numpy.array([rejection])
    elif efficiency_is_int:
        efficiency = numpy.array([efficiency] * len(rejection))
    elif rejection_is_int:
        rejection = numpy.array([rejection] * len(efficiency))

    # Widen the stored limits so they also cover this curve (x: efficiency, y: rejection).
    lower_x = numpy.nanmin([efficiency.min(), self.xmin])
    upper_x = numpy.nanmax([efficiency.max(), self.xmax])
    lower_y = numpy.nanmin([rejection.min(), self.ymin])
    upper_y = numpy.nanmax([rejection.max(), self.ymax])
    self.xmin, self.xmax = lower_x, upper_x
    self.ymin, self.ymax = lower_y, upper_y

    # numpy.trapz is deprecated since NumPy 2.0 in favour of numpy.trapezoid; prefer the
    # new name when it exists and only touch the old one on older NumPy versions.
    integrate = getattr(numpy, 'trapezoid', None) or numpy.trapz
    # abs() makes the area independent of the direction in which efficiency is ordered.
    auc = numpy.abs(integrate(rejection, efficiency))

    drawn = self._plot_datapoints(self.axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
    self.plots.append(drawn)

    legend_name = column if label is None else label
    self.labels.append(legend_name[:10] + " ({:.2f})".format(auc))
449 Sets limits, title, axis-labels and legend of the plot
453 self.
axisaxis.set_title(
"ROC Rejection Plot")
454 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
455 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
456 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
462 Plots multiple other plots into a grid 3x?
469 def __init__(self, cls, number_of_plots, figure=None):
471 Creates a new figure if None is given, sets the default plot parameters
472 @param figure default draw figure which is used
475 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
480 if number_of_plots == 1:
481 gs = matplotlib.gridspec.GridSpec(1, 1)
482 elif number_of_plots == 2:
483 gs = matplotlib.gridspec.GridSpec(1, 2)
484 elif number_of_plots == 3:
485 gs = matplotlib.gridspec.GridSpec(1, 3)
487 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
494 def add(self, i, *args, **kwargs):
496 Call add function of ith subplot
497 @param i position of the subplot
503 Sets limits, title, axis-labels and legend of the plot
512 Plots the purity in each bin over the classifier output.
519 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
521 Add a new curve to the Diagonal plot
522 @param data pandas.DataFrame containing all data
523 @param column which is used to calculate purity for different cuts
524 @param signal_mask boolean numpy.array defining which events are signal events
525 @param bckgrd_mask boolean numpy.array defining which events are background events
526 @param weight_column column in data containing the weights for each event
528 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
529 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
535 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
536 self.
plotsplots.append(p)
537 self.
labelslabels.append(column)
542 Sets limits, title, axis-labels and legend of the plot
545 self.
axisaxis.
plot((0.0, 1.0), (0.0, 1.0), color=
'black')
548 self.
axisaxis.set_title(
"Diagonal Plot")
549 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
550 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
551 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
557 Plots distribution of a quantity
560 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
561 keep_first_binning=False, range_in_std=None):
563 Creates a new figure and axis if None is given, sets the default plot parameters
564 @param figure default draw figure which is used
565 @param axis default draw axis which is used
566 @param normed true if histograms should be normed before drawing
567 @param keep_first_binning use the binning of the first distribution for further plots
568 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
570 super(Distribution, self).
__init__(figure, axis)
593 def add(self, data, column, mask=None, weight_column=None, label=None):
595 Add a new distribution to the plots
596 @param data pandas.DataFrame containing all data
597 @param column which is used to calculate distribution histogram
598 @param mask boolean numpy.array defining which events are used for the histogram
599 @param weight_column column in data containing the weights for each event
602 mask = numpy.ones(len(data)).astype(
'bool')
608 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_stdrange_in_std)
611 hist, hist_error = hists.get_hist(
'Total')
614 normalization = float(numpy.sum(hist))
615 hist = hist / normalization
616 hist_error = hist_error / normalization
619 hist = hist / hists.bin_widths
620 hist_error = hist_error / hists.bin_widths
624 self.
ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.
ymaxymaxymax])
626 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
627 self.
plotsplots.append(p)
632 appendix =
' No data to plot!'
635 self.
labelslabels.append(column + appendix)
637 self.
labelslabels.append(label + appendix)
642 Sets limits, title, axis-labels and legend of the plot
644 self.
axisaxis.set_title(
"Distribution Plot")
647 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
650 self.
axisaxis.set_xlim((0., 1.))
651 self.
axisaxis.set_ylim((0., 1.))
652 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
661 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
663 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
665 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
667 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
681 Creates a new figure and axis if None is given, sets the default plot parameters
682 @param figure default draw figure which is used
683 @param axis default draw axis which is used
685 super().
__init__(figure=figure, axis=axis)
690 def add(self, data, column, mask=None, weight_column=None):
692 Add a new boxplot to the plots
693 @param data pandas.DataFrame containing all data
694 @param column which is used to calculate boxplot quantities
695 @param mask boolean numpy.array defining which events are used for the histogram
696 @param weight_column column in data containing the weights for each event
699 mask = numpy.ones(len(data)).astype(
'bool')
700 x = data[column][mask]
701 if weight_column
is not None:
703 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
706 b2.B2WARNING(
"Ignore empty boxplot.")
709 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
710 boxprops=dict(facecolor=
'blue', alpha=0.5),
714 self.
plotsplots.append(p)
715 self.
labelslabels.append(column)
718 self.axis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
719 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
720 self.axis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
721 x.quantile(0.75) - x.quantile(0.25)),
722 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
723 self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
724 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
731 Sets limits, title, axis-labels and legend of the plot
733 matplotlib.artist.setp(self.
axisaxis.get_yaxis(), visible=
False)
735 self.
axisaxis.set_title(
"Box Plot")
741 Plots the difference between two histograms
756 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
758 Creates a new figure and axis if None is given, sets the default plot parameters
759 @param figure default draw figure which is used
760 @param axis default draw axis which is used
761 @param normed normalize minuend and subtrahend before comparing them
762 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
764 super(Difference, self).
__init__(figure, axis)
774 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
776 Add a new difference plot
777 @param data pandas.DataFrame containing all data
778 @param column which is used to calculate distribution histogram
779 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
780 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
781 @param weight_column column in data containing the weights for each event
782 @param label label for the legend if None, the column name is used
784 hists =
histogram.Histograms(data, column, {
'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
785 weight_column=weight_column, equal_frequency=
False)
786 minuend, minuend_error = hists.get_hist(
'Minuend')
787 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
791 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
792 minuend = minuend / numpy.sum(minuend)
793 subtrahend = subtrahend / numpy.sum(subtrahend)
794 difference = minuend - subtrahend
797 difference = difference - numpy.mean(difference)
800 self.
yminyminymin = min((difference - difference_error).min(), self.
yminyminymin)
801 self.
ymaxymaxymax = max((difference + difference_error).max(), self.
ymaxymaxymax)
803 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
804 self.
plotsplots.append(p)
806 self.
labelslabels.append(label)
808 self.
labelslabels.append(column)
814 Sets limits, title, axis-labels and legend of the plot
816 self.
axisaxis.
plot((self.
xminxmin, self.
xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=
True)
820 self.
axisaxis.set_title(
"Difference Plot")
821 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
823 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
824 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
830 Create TMVA-like overtraining control plot for a classification training
844 Creates a new figure if None is given, sets the default plot parameters
845 @param figure default draw figure which is used
848 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
853 gs = matplotlib.gridspec.GridSpec(5, 1)
860 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
862 Add a new overtraining plot. I recommend drawing only one overtraining plot at a time,
863 otherwise there are too many curves in the plot to recognize anything in the plot.
864 @param data pandas.DataFrame containing all data
865 @param column which is used to calculate distribution histogram
866 @param train_mask boolean numpy.array defining which events are training events
867 @param test_mask boolean numpy.array defining which events are test events
868 @param signal_mask boolean numpy.array defining which events are signal events
869 @param bckgrd_mask boolean numpy.array defining which events are background events
870 @param weight_column column in data containing the weights for each event
874 distribution.set_plot_options(self.
plot_kwargsplot_kwargs)
877 distribution.add(data, column, test_mask & signal_mask, weight_column)
878 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
880 distribution.set_plot_options(
881 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
882 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
883 distribution.set_errorbar_options(
None)
884 distribution.set_errorband_options(
None)
885 distribution.add(data, column, train_mask & signal_mask, weight_column)
886 distribution.set_plot_options(
887 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
888 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
889 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
891 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
892 distribution.finish()
894 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
896 difference_signal.set_plot_options(self.
plot_kwargsplot_kwargs)
897 difference_signal.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
898 difference_signal.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
899 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
900 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
901 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
902 difference_signal.plots = difference_signal.labels = []
903 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
905 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
907 difference_bckgrd.set_plot_options(self.
plot_kwargsplot_kwargs)
908 difference_bckgrd.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
909 difference_bckgrd.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
910 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
911 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
912 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
913 difference_bckgrd.plots = difference_bckgrd.labels = []
914 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
919 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
920 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
922 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
923 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
924 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
925 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
926 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
927 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
929 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
930 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
931 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
933 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
935 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
941 Sets limits, title, axis-labels and legend of the plot
943 self.
axisaxisaxis.set_title(
"Overtraining Plot")
944 self.
axis_d1axis_d1.set_title(
"")
945 self.
axis_d2axis_d2.set_title(
"")
946 matplotlib.artist.setp(self.
axisaxisaxis.get_xticklabels(), visible=
False)
947 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
948 self.
axisaxisaxis.get_xaxis().set_label_text(
'')
949 self.
axis_d1axis_d1.get_xaxis().set_label_text(
'')
950 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
956 Plots distribution of a quantity including boxplots
962 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
964 Creates a new figure and axis if None is given, sets the default plot parameters
965 @param figure default draw figure which is used
966 @param axis default draw axis which is used
967 @param normed true if the histograms should be normed before drawing
968 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
970 super(VerboseDistribution, self).
__init__(figure, axis)
979 def add(self, data, column, mask=None, weight_column=None, label=None):
981 Add a new distribution plot, with additional information like a boxplot compared to
982 the ordinary Distribution plot.
983 @param data pandas.DataFrame containing all data
984 @param column which is used to calculate distribution histogram
985 @param mask boolean numpy.array defining which events are used for the distribution histogram
986 @param weight_column column in data containing the weights for each event
991 self.
distributiondistribution.
add(data, column, mask, weight_column, label=label)
994 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
995 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :]
for i
in range(n)]
1001 mask = mask & (data[column] > (mean - self.
range_in_stdrange_in_std * std)) & (data[column] < (mean + self.
range_in_stdrange_in_std * std))
1003 box.add(data, column, mask, weight_column)
1004 if len(box.plots) > 0:
1005 box.plots[0][
'boxes'][0].set_facecolor(self.
distributiondistribution.plots[-1][0][0].get_color())
1008 self.
box_axesbox_axes.append(box_axis)
1013 Sets limits, title, axis-labels and legend of the plot
1016 matplotlib.artist.setp(self.
axisaxis.get_xticklabels(), visible=
False)
1017 self.
axisaxis.get_xaxis().set_label_text(
'')
1018 for box_axis
in self.
box_axesbox_axes[:-1]:
1019 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1020 box_axis.set_title(
"")
1021 box_axis.get_xaxis().set_label_text(
'')
1022 self.
box_axesbox_axes[-1].set_title(
"")
1023 self.
axisaxis.set_title(
"Distribution Plot")
1025 loc=
'best', fancybox=
True, framealpha=0.5)
1031 Plots change of a distribution of a quantity depending on the cut on a classifier
1044 Creates a new figure if None is given, sets the default plot parameters
1045 @param figure default draw figure which is used
1048 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1053 gs = matplotlib.gridspec.GridSpec(3, 2)
1060 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1062 Add a new correlation plot.
1063 @param data pandas.DataFrame containing all data
1064 @param column which is used to calculate distribution histogram
1065 @param cut_column which is used to calculate cut on the other quantity defined by column
1066 @param quantiles list of quantiles between 0 and 100, defining the different cuts
1067 @param weight_column column in data containing the weights for each event
1069 if len(data[cut_column]) == 0:
1070 b2.B2WARNING(
"Ignore empty Correlation.")
1075 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1077 if weight_column
is not None:
1078 weights = numpy.array(data[weight_column][m])
1080 weights = numpy.ones(len(data[column][m]))
1082 xrange = numpy.percentile(data[column][m], [5, 95])
1084 colormap = plt.get_cmap(
'coolwarm')
1085 tmp, x = numpy.histogram(data[column][m], bins=100,
1086 range=xrange, normed=
True, weights=weights)
1087 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1088 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1090 for quantil
in numpy.arange(5, 100, 5):
1091 cut = numpy.percentile(data[cut_column][m], quantil)
1092 sel = data[cut_column][m] >= cut
1093 y, x = numpy.histogram(data[column][m][sel], bins=100,
1094 range=xrange, normed=
True, weights=weights[sel])
1095 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1096 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1099 axes[i].set_ylim(bottom=0)
1102 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1107 Sets limits, title, axis-labels and legend of the plot
1114 Plots multivariate distribution using TSNE algorithm
1117 def add(self, data, columns, *masks):
1119 Add a new correlation plot.
1120 @param data pandas.DataFrame containing all data
1121 @param columns which are used to calculate the correlations
1122 @param masks different classes to show in TSNE
1126 import sklearn.manifold
1127 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1128 data = numpy.array([data[column]
for column
in columns]).T
1131 data = numpy.array([data[column][mask]
for column
in columns]).T
1132 data = model.transform(data)
1133 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1135 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1140 Sets limits, title, axis-labels and legend of the plot
1147 Plots importance matrix
1150 def add(self, data, columns, variables):
1152 Add a new correlation plot.
1153 @param data pandas.DataFrame containing all data
1154 @param columns which are used to calculate the correlations
1156 self.
figurefigure.set_tight_layout(
True)
1159 width = (numpy.max(x) - numpy.min(x))
1161 return numpy.zeros(x.shape)
1162 return (x - numpy.min(x)) / width * 100
1164 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1165 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1169 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1170 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1172 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1173 self.
axisaxis.set_yticklabels(variables, minor=
False)
1175 self.
axisaxis.xaxis.tick_top()
1177 for y
in range(importance_matrix.shape[0]):
1178 for x
in range(importance_matrix.shape[1]):
1179 self.
axisaxis.text(x + 0.5, y + 0.5,
'%.0f' % importance_matrix[y, x],
1181 horizontalalignment=
'center',
1182 verticalalignment=
'center')
1184 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1185 cb.ax.set_yticklabels([
'low',
'high'])
1187 self.
axisaxis.set_aspect(
'equal')
1193 Sets limits, title, axis-labels and legend of the plot
1200 Plots correlation matrix
1211 Creates a new figure if None is given, sets the default plot parameters
1212 @param figure default draw figure which is used
1215 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1220 gs = matplotlib.gridspec.GridSpec(8, 2)
1230 def add(self, data, columns, signal_mask, bckgrd_mask):
1232 Add a new correlation plot.
1233 @param data pandas.DataFrame containing all data
1234 @param columns which are used to calculate the correlations
1236 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask]
for column
in columns])) * 100
1237 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1239 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1247 self.
signal_axissignal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=
False)
1248 self.
signal_axissignal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=
False)
1250 self.
signal_axissignal_axis.set_xticklabels(columns, minor=
False, rotation=90)
1251 self.
signal_axissignal_axis.set_yticklabels(columns, minor=
False)
1254 self.
bckgrd_axisbckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=
False)
1255 self.
bckgrd_axisbckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=
False)
1257 self.
bckgrd_axisbckgrd_axis.set_xticklabels(columns, minor=
False, rotation=90)
1258 self.
bckgrd_axisbckgrd_axis.set_yticklabels(columns, minor=
False)
1260 for y
in range(signal_corr.shape[0]):
1261 for x
in range(signal_corr.shape[1]):
1262 self.
signal_axissignal_axis.text(x + 0.5, y + 0.5,
'%.0f' % signal_corr[y, x],
1264 horizontalalignment=
'center',
1265 verticalalignment=
'center')
1267 for y
in range(bckgrd_corr.shape[0]):
1268 for x
in range(bckgrd_corr.shape[1]):
1269 self.
bckgrd_axisbckgrd_axis.text(x + 0.5, y + 0.5,
'%.0f' % bckgrd_corr[y, x],
1271 horizontalalignment=
'center',
1272 verticalalignment=
'center')
1274 cb = self.
figurefigurefigure.colorbar(signal_heatmap, cax=self.
colorbar_axiscolorbar_axis, ticks=[-100, 0, 100], orientation=
'horizontal')
1275 cb.solids.set_rasterized(
True)
1276 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1278 self.
signal_axissignal_axis.text(0.5, -1.0,
"Signal", horizontalalignment=
'center')
1279 self.
bckgrd_axisbckgrd_axis.text(0.5, -1.0,
"Background", horizontalalignment=
'center')
1285 Sets limits, title, axis-labels and legend of the plot
1287 matplotlib.artist.setp(self.
bckgrd_axisbckgrd_axis.get_yticklabels(), visible=
False)
1291 if __name__ ==
'__main__':
1293 def get_data(N, columns):
1295 Creates fake data for example plots
1298 n = len(columns) - 1
1299 xs = numpy.random.normal(0, size=(N, n))
1300 xb = numpy.random.normal(1, size=(N, n))
1303 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1304 return data.reindex(numpy.random.permutation(data.index))
1308 seaborn.set(font_scale=3)
1309 seaborn.set_style(
'whitegrid')
1313 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1315 data.type.iloc[:N / 2] =
'Train'
1316 data.type.iloc[N / 2:] =
'Test'
1319 p.add(data,
'FastBDT')
1321 p.save(
'box_plot.png')
1324 p.add(data,
'FastBDT')
1325 p.add(data,
'NeuroBayes')
1327 p.save(
'verbose_distribution_plot.png')
1330 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1331 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1333 p.save(
'roc_purity_plot.png')
1336 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1337 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1339 p.save(
'roc_rejection_plot.png')
1342 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1343 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1345 p.save(
'diagonal_plot.png')
1348 p.add(data,
'FastBDT')
1349 p.add(data,
'NeuroBayes')
1351 p.save(
'distribution_plot.png')
1354 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1355 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1357 p.save(
'difference_plot.png')
1360 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1362 p.save(
'overtraining_plot.png')
1365 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1367 p.save(
'correlation_plot.png')
1370 data[
'FastBDT2'] = data[
'FastBDT']**2
1371 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1372 data[
'FastBDT3'] = data[
'FastBDT']**3
1373 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1374 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1376 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
Main axis which shows the correlation of the signal samples.
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
figure
figure which is used to draw
def __init__(self, figure=None)
bckgrd_axis
Axis which shows the correlation of the background samples.
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
Axis which shows shape of signal.
figure
figure which is used to draw
def __init__(self, figure=None)
axis_d2
Axis which shows shape of background.
axis
Main axis which is used to draw.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Keep first binning if user wants so.
normed_to_all_entries
Normalize histograms before drawing them.
first_binning
first binning
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
figure
figure which is used to draw
def __init__(self, cls, number_of_plots, figure=None)
sub_plots
the subplots which are displayed in the grid
axis_d1
Axis which shows the difference between training and test signal.
figure
figure which is used to draw
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
def __init__(self, figure=None)
axis_d2
Axis which shows the difference between training and test background.
axis
Main axis which is used to draw.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
Plots added to the axis so far.
figure
figure which is used to draw
errorband_kwargs
Default keyword arguments for errorband function.
def add(self, *args, **kwargs)
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
Labels of the plots added so far.
axis
Main axis which is used to draw.
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
box_axes
Axes for the boxplots.
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)