17 import matplotlib.pyplot
as plt
18 import matplotlib.artist
19 import matplotlib.figure
20 import matplotlib.gridspec
21 import matplotlib.colors
22 import matplotlib.patches
23 import matplotlib.ticker
24 import matplotlib.patheffects
as PathEffects
27 from basf2_mva_evaluation
import histogram
37 matplotlib.rcParams.update({
'font.size': 36})
40 plt.style.use(
"belle2")
45 Base class for all Plotters.
79 Creates a new figure and axis if None is given, sets the default plot parameters
80 @param figure default draw figure which is used
81 @param axis default draw axis which is used
83 b2.B2INFO(
"Create new figure for class " + str(type(self)))
85 self.
figurefigure = matplotlib.figure.Figure(figsize=(32, 18))
86 self.
figurefigure.set_tight_layout(
False)
97 self.
xminxmin, self.
xmaxxmax = float(0), float(1)
98 self.
yminymin, self.
ymaxymax = float(0), float(1)
120 Adds a new subplot to the figure, updates all other axes
121 according to the given gridspec
122 @param gridspecs gridspecs for all axes including the new one
124 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
125 ax.set_position(gs.get_position(self.
figurefigure))
126 ax.set_subplotspec(gs)
132 Save the figure into a file
133 @param filename of the file
135 b2.B2INFO(
"Save figure for class " + str(type(self)))
136 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
137 canvas = FigureCanvas(self.
figurefigure)
138 canvas.print_figure(filename, dpi=50)
143 Overrides default plot options for datapoint plot
144 @param plot_kwargs keyword arguments for the plot function
146 self.
plot_kwargsplot_kwargs = copy.copy(plot_kwargs)
151 Overrides default errorbar options for datapoint errorbars
152 @param errorbar_kwargs keyword arguments for the errorbar function
159 Overrides default errorband options for datapoint errorband
160 @param errorbar_kwargs keyword arguments for the fill_between function
167 Overrides default fill_between options for datapoint errorband
168 @param fill_kwargs keyword arguments for the fill_between function
170 self.
fill_kwargsfill_kwargs = copy.copy(fill_kwargs)
175 Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
176 @param x coordinates of the data points
177 @param y coordinates of the data points
178 @param xerr symmetric error on x data points
179 @param yerr symmetric error on y data points
182 plot_kwargs = copy.copy(self.
plot_kwargsplot_kwargs)
185 fill_kwargs = copy.copy(self.
fill_kwargsfill_kwargs)
187 if plot_kwargs
is None or 'color' not in plot_kwargs:
188 color = next(axis._get_lines.prop_cycler)
189 color = color[
'color']
190 plot_kwargs[
'color'] = color
192 color = plot_kwargs[
'color']
193 color = matplotlib.colors.ColorConverter().to_rgb(color)
194 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
195 patch.get_color = patch.get_facecolor
198 if plot_kwargs
is not None:
199 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
202 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
203 if 'color' not in errorbar_kwargs:
204 errorbar_kwargs[
'color'] = color
205 if 'ecolor' not in errorbar_kwargs:
206 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
211 if not isinstance(xerr, (numpy.ndarray, list)):
212 xerr = xerr*numpy.ones(len(x))
213 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
215 e = axis.errorbar(x[mask], y[mask], xerr=xerr[mask], yerr=yerr[mask], rasterized=
True, **errorbar_kwargs)
218 if errorband_kwargs
is not None and yerr
is not None:
219 if 'color' not in errorband_kwargs:
220 errorband_kwargs[
'color'] = color
225 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
226 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
229 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
231 if fill_kwargs
is not None:
233 x = numpy.append(x, x[-1]+2*xerr[-1])
234 y = numpy.append(y, y[-1])
235 xerr = numpy.append(xerr, xerr[-1])
237 axis.fill_between(x-xerr, y, 0, rasterized=
True, **fill_kwargs)
239 return (tuple(patches), p, e, f)
241 def add(self, *args, **kwargs):
243 Add a new plot to this plotter
245 return NotImplemented
249 Finish plotting and set labels, legends and stuff
251 return NotImplemented
255 Scale limits to increase distance to boundaries
266 Plots the purity and the efficiency over the cut value (for cut choosing)
273 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
275 Add a new curve to the plot
276 @param data pandas.DataFrame containing all data
277 @param column which is used to calculate efficiency and purity for different cuts
278 @param signal_mask boolean numpy.array defining which events are signal events
279 @param bckgrd_mask boolean numpy.array defining which events are background events
280 @param weight_column column in data containing the weights for each event
283 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
286 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
287 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
289 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
290 purity, purity_error = hists.get_false_positives([
'Background'])
292 cuts = hists.bin_centers
294 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.
xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmaxxmaxxmax])
295 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.
yminymin]), \
296 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
298 self.
plotsplots.append(self.
_plot_datapoints_plot_datapoints(self.
axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
301 self.
labelslabels.append(
"Efficiency")
303 self.
labelslabels.append(
"True positive")
308 self.
labelslabels.append(
"Purity")
310 self.
labelslabels.append(
"False positive")
316 Sets limits, title, axis-labels and legend of the plot
320 self.
axisaxis.set_title(
"Classification Plot")
321 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
322 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
328 Plots the signal to noise ratio over the cut value (for cut choosing)
335 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
337 Add a new curve to the plot
338 @param data pandas.DataFrame containing all data
339 @param column which is used to calculate signal to noise ratio for different cuts
340 @param signal_mask boolean numpy.array defining which events are signal events
341 @param bckgrd_mask boolean numpy.array defining which events are background events
342 @param weight_column column in data containing the weights for each event
345 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
347 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
349 cuts = hists.bin_centers
351 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([numpy.nanmin(cuts), self.
xminxmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmaxxmaxxmax])
352 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(signal2noise), self.
yminymin]), \
353 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymaxymaxymax])
355 self.
plotsplots.append(self.
_plot_datapoints_plot_datapoints(self.
axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
357 self.
labelslabels.append(column)
363 Sets limits, title, axis-labels and legend of the plot
367 self.
axisaxis.set_title(
"Signal to Noise Plot")
368 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
369 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
375 Plots the purity over the efficiency also known as ROC curve
382 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
384 Add a new curve to the ROC plot
385 @param data pandas.DataFrame containing all data
386 @param column which is used to calculate efficiency and purity for different cuts
387 @param signal_mask boolean numpy.array defining which events are signal events
388 @param bckgrd_mask boolean numpy.array defining which events are background events
389 @param weight_column column in data containing the weights for each event
391 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
392 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
393 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
395 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.
xminxmin]), numpy.nanmax([efficiency.max(), self.
xmaxxmaxxmax])
396 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([numpy.nanmin(purity), self.
yminymin]), numpy.nanmax([numpy.nanmax(purity), self.
ymaxymaxymax])
398 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
399 self.
plotsplots.append(p)
400 if label
is not None:
401 self.
labelslabels.append(label)
403 self.
labelslabels.append(column)
408 Sets limits, title, axis-labels and legend of the plot
412 self.
axisaxis.set_title(
"ROC Purity Plot")
413 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
414 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
415 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
421 Plots the rejection over the efficiency also known as ROC curve
428 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
430 Add a new curve to the ROC plot
431 @param data pandas.DataFrame containing all data
432 @param column which is used to calculate efficiency and purity for different cuts
433 @param signal_mask boolean numpy.array defining which events are signal events
434 @param bckgrd_mask boolean numpy.array defining which events are background events
435 @param weight_column column in data containing the weights for each event
437 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
438 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
439 rejection, rejection_error = hists.get_efficiency([
'Background'])
440 rejection = 1 - rejection
441 if isinstance(efficiency, int)
and not isinstance(rejection, int):
442 efficiency = numpy.array([efficiency] * len(rejection))
443 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
444 rejection = numpy.array([rejection] * len(efficiency))
445 elif isinstance(rejection, int)
and isinstance(efficiency, int):
446 efficiency = numpy.array([efficiency])
447 rejection = numpy.array([rejection])
449 self.
xminxmin, self.
xmaxxmaxxmax = numpy.nanmin([efficiency.min(), self.
xminxmin]), numpy.nanmax([efficiency.max(), self.
xmaxxmaxxmax])
450 self.
yminymin, self.
ymaxymaxymax = numpy.nanmin([rejection.min(), self.
yminymin]), numpy.nanmax([rejection.max(), self.
ymaxymaxymax])
452 auc = numpy.abs(numpy.trapz(rejection, efficiency))
454 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
455 self.
plotsplots.append(p)
456 if label
is not None:
457 self.
labelslabels.append(label[:10] + f
" ({auc:.2f})")
459 self.
labelslabels.append(column[:10] + f
" ({auc:.2f})")
464 Sets limits, title, axis-labels and legend of the plot
468 self.
axisaxis.set_title(
"ROC Rejection Plot")
469 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
470 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
471 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
477 Plots multiple other plots into a grid 3x?
484 def __init__(self, cls, number_of_plots, figure=None):
486 Creates a new figure if None is given, sets the default plot parameters
487 @param figure default draw figure which is used
490 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
495 if number_of_plots == 1:
496 gs = matplotlib.gridspec.GridSpec(1, 1)
497 elif number_of_plots == 2:
498 gs = matplotlib.gridspec.GridSpec(1, 2)
499 elif number_of_plots == 3:
500 gs = matplotlib.gridspec.GridSpec(1, 3)
502 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
509 def add(self, i, *args, **kwargs):
511 Call add function of ith subplot
512 @param i position of the subplot
518 Sets limits, title, axis-labels and legend of the plot
527 Plots the purity in each bin over the classifier output.
534 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
536 Add a new curve to the Diagonal plot
537 @param data pandas.DataFrame containing all data
538 @param column which is used to calculate purity for different cuts
539 @param signal_mask boolean numpy.array defining which events are signal events
540 @param bckgrd_mask boolean numpy.array defining which events are background events
541 @param weight_column column in data containing the weights for each event
543 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
544 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
550 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
551 self.
plotsplots.append(p)
552 self.
labelslabels.append(column)
557 Sets limits, title, axis-labels and legend of the plot
560 self.
axisaxis.
plot((0.0, 1.0), (0.0, 1.0), color=
'black')
563 self.
axisaxis.set_title(
"Diagonal Plot")
564 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
565 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
566 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
572 Plots distribution of a quantity
575 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
576 keep_first_binning=False, range_in_std=None):
578 Creates a new figure and axis if None is given, sets the default plot parameters
579 @param figure default draw figure which is used
580 @param axis default draw axis which is used
581 @param normed true if histograms should be normed before drawing
582 @param keep_first_binning use the binning of the first distribution for further plots
583 @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
608 def add(self, data, column, mask=None, weight_column=None, label=None):
610 Add a new distribution to the plots
611 @param data pandas.DataFrame containing all data
612 @param column which is used to calculate distribution histogram
613 @param mask boolean numpy.array defining which events are used for the histogram
614 @param weight_column column in data containing the weights for each event
617 mask = numpy.ones(len(data)).astype(
'bool')
623 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_stdrange_in_std)
626 hist, hist_error = hists.get_hist(
'Total')
629 normalization = float(numpy.sum(hist))
630 hist = hist / normalization
631 hist_error = hist_error / normalization
634 hist = hist / hists.bin_widths
635 hist_error = hist_error / hists.bin_widths
639 self.
ymaxymaxymax = numpy.nanmax([(hist + hist_error).max(), self.
ymaxymaxymax])
641 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
642 self.
plotsplots.append(p)
647 appendix =
' No data to plot!'
650 self.
labelslabels.append(column + appendix)
652 self.
labelslabels.append(label + appendix)
657 Sets limits, title, axis-labels and legend of the plot
659 self.
axisaxis.set_title(
"Distribution Plot")
662 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
665 self.
axisaxis.set_xlim((0., 1.))
666 self.
axisaxis.set_ylim((0., 1.))
667 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
676 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
678 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
680 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
682 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
696 Creates a new figure and axis if None is given, sets the default plot parameters
697 @param figure default draw figure which is used
698 @param axis default draw axis which is used
700 super().
__init__(figure=figure, axis=axis)
705 def add(self, data, column, mask=None, weight_column=None):
707 Add a new boxplot to the plots
708 @param data pandas.DataFrame containing all data
709 @param column which is used to calculate boxplot quantities
710 @param mask boolean numpy.array defining which events are used for the histogram
711 @param weight_column column in data containing the weights for each event
714 mask = numpy.ones(len(data)).astype(
'bool')
715 x = data[column][mask]
716 if weight_column
is not None:
718 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
721 b2.B2WARNING(
"Ignore empty boxplot.")
724 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
725 boxprops=dict(facecolor=
'blue', alpha=0.5),
729 self.
plotsplots.append(p)
730 self.
labelslabels.append(column)
733 self.axis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
734 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
735 self.axis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
736 x.quantile(0.75) - x.quantile(0.25)),
737 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
738 self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
739 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
746 Sets limits, title, axis-labels and legend of the plot
748 matplotlib.artist.setp(self.
axisaxis.get_yaxis(), visible=
False)
750 self.
axisaxis.set_title(
"Box Plot")
756 Plots the difference between two histograms
771 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
773 Creates a new figure and axis if None is given, sets the default plot parameters
774 @param figure default draw figure which is used
775 @param axis default draw axis which is used
776 @param normed normalize minuend and subtrahend before comparing them
777 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
789 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
791 Add a new difference plot
792 @param data pandas.DataFrame containing all data
793 @param column which is used to calculate distribution histogram
794 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
795 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
796 @param weight_column column in data containing the weights for each event
797 @param label label for the legend if None, the column name is used
799 hists =
histogram.Histograms(data, column, {
'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
800 weight_column=weight_column, equal_frequency=
False)
801 minuend, minuend_error = hists.get_hist(
'Minuend')
802 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
806 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
807 minuend = minuend / numpy.sum(minuend)
808 subtrahend = subtrahend / numpy.sum(subtrahend)
809 difference = minuend - subtrahend
812 difference = difference - numpy.mean(difference)
815 self.
yminyminymin = min((difference - difference_error).min(), self.
yminyminymin)
816 self.
ymaxymaxymax = max((difference + difference_error).max(), self.
ymaxymaxymax)
818 p = self.
_plot_datapoints_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
819 self.
plotsplots.append(p)
821 self.
labelslabels.append(label)
823 self.
labelslabels.append(column)
829 Sets limits, title, axis-labels and legend of the plot
831 self.
axisaxis.
plot((self.
xminxmin, self.
xmaxxmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=
True)
835 self.
axisaxis.set_title(
"Difference Plot")
836 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
838 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
839 self.
axisaxis.legend([x[0]
for x
in self.
plotsplots], self.
labelslabels, loc=
'best', fancybox=
True, framealpha=0.5)
845 Create TMVA-like overtraining control plot for a classification training
859 Creates a new figure if None is given, sets the default plot parameters
860 @param figure default draw figure which is used
863 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
868 gs = matplotlib.gridspec.GridSpec(5, 1)
875 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
877 Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
878 otherwise there are too many curves in the plot to recognize anything in the plot.
879 @param data pandas.DataFrame containing all data
880 @param column which is used to calculate distribution histogram
881 @param train_mask boolean numpy.array defining which events are training events
882 @param test_mask boolean numpy.array defining which events are test events
883 @param signal_mask boolean numpy.array defining which events are signal events
884 @param bckgrd_mask boolean numpy.array defining which events are background events
885 @param weight_column column in data containing the weights for each event
889 distribution.set_plot_options(self.
plot_kwargsplot_kwargs)
892 distribution.add(data, column, test_mask & signal_mask, weight_column)
893 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
895 distribution.set_plot_options(
896 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
897 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
898 distribution.set_errorbar_options(
None)
899 distribution.set_errorband_options(
None)
900 distribution.add(data, column, train_mask & signal_mask, weight_column)
901 distribution.set_plot_options(
902 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
903 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
904 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
906 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
907 distribution.finish()
909 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
911 difference_signal.set_plot_options(self.
plot_kwargsplot_kwargs)
912 difference_signal.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
913 difference_signal.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
914 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
915 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
916 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
917 difference_signal.plots = difference_signal.labels = []
918 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
920 self.
plot_kwargsplot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
922 difference_bckgrd.set_plot_options(self.
plot_kwargsplot_kwargs)
923 difference_bckgrd.set_errorbar_options(self.
errorbar_kwargserrorbar_kwargs)
924 difference_bckgrd.set_errorband_options(self.
errorband_kwargserrorband_kwargs)
925 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
926 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
927 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
928 difference_bckgrd.plots = difference_bckgrd.labels = []
929 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
934 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
935 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
937 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
938 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
939 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
940 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
941 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
942 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
944 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
945 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
946 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
948 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
950 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
956 Sets limits, title, axis-labels and legend of the plot
958 self.
axisaxisaxis.set_title(
"Overtraining Plot")
959 self.
axis_d1axis_d1.set_title(
"")
960 self.
axis_d2axis_d2.set_title(
"")
961 matplotlib.artist.setp(self.
axisaxisaxis.get_xticklabels(), visible=
False)
962 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
963 self.
axisaxisaxis.get_xaxis().set_label_text(
'')
964 self.
axis_d1axis_d1.get_xaxis().set_label_text(
'')
965 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
971 Plots distribution of a quantity including boxplots
977 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
979 Creates a new figure and axis if None is given, sets the default plot parameters
980 @param figure default draw figure which is used
981 @param axis default draw axis which is used
982 @param normed true if the histograms should be normed before drawing
983 @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
994 def add(self, data, column, mask=None, weight_column=None, label=None):
996 Add a new distribution plot, with additional information like a boxplot compared to
997 the ordinary Distribution plot.
998 @param data pandas.DataFrame containing all data
999 @param column which is used to calculate distribution histogram
1000 @param mask boolean numpy.array defining which events are used for the distribution histogram
1001 @param weight_column column in data containing the weights for each event
1006 self.
distributiondistribution.
add(data, column, mask, weight_column, label=label)
1009 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1010 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :]
for i
in range(n)]
1016 mask = mask & (data[column] > (mean - self.
range_in_stdrange_in_std * std)) & (data[column] < (mean + self.
range_in_stdrange_in_std * std))
1018 box.add(data, column, mask, weight_column)
1019 if len(box.plots) > 0:
1020 box.plots[0][
'boxes'][0].set_facecolor(self.
distributiondistribution.plots[-1][0][0].get_color())
1023 self.
box_axesbox_axes.append(box_axis)
1028 Sets limits, title, axis-labels and legend of the plot
1031 matplotlib.artist.setp(self.
axisaxis.get_xticklabels(), visible=
False)
1032 self.
axisaxis.get_xaxis().set_label_text(
'')
1033 for box_axis
in self.
box_axesbox_axes[:-1]:
1034 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1035 box_axis.set_title(
"")
1036 box_axis.get_xaxis().set_label_text(
'')
1037 self.
box_axesbox_axes[-1].set_title(
"")
1038 self.
axisaxis.set_title(
"Distribution Plot")
1040 loc=
'best', fancybox=
True, framealpha=0.5)
1046 Plots change of a distribution of a quantity depending on the cut on a classifier
1059 Creates a new figure if None is given, sets the default plot parameters
1060 @param figure default draw figure which is used
1063 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1068 gs = matplotlib.gridspec.GridSpec(3, 2)
1075 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1077 Add a new correlation plot.
1078 @param data pandas.DataFrame containing all data
1079 @param column which is used to calculate distribution histogram
1080 @param cut_column which is used to calculate cut on the other quantity defined by column
1081 @param quantiles list of quantiles between 0 and 100, defining the different cuts
1082 @param weight_column column in data containing the weights for each event
1084 if len(data[cut_column]) == 0:
1085 b2.B2WARNING(
"Ignore empty Correlation.")
1090 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1092 if weight_column
is not None:
1093 weights = numpy.array(data[weight_column][m])
1095 weights = numpy.ones(len(data[column][m]))
1097 xrange = numpy.percentile(data[column][m], [5, 95])
1099 colormap = plt.get_cmap(
'coolwarm')
1100 tmp, x = numpy.histogram(data[column][m], bins=100,
1101 range=xrange, density=
True, weights=weights)
1102 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1103 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1105 for quantil
in numpy.arange(5, 100, 5):
1106 cut = numpy.percentile(data[cut_column][m], quantil)
1107 sel = data[cut_column][m] >= cut
1108 y, x = numpy.histogram(data[column][m][sel], bins=100,
1109 range=xrange, density=
True, weights=weights[sel])
1110 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1111 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1114 axes[i].set_ylim(bottom=0)
1117 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1122 Sets limits, title, axis-labels and legend of the plot
1129 Plots multivariate distribution using TSNE algorithm
1132 def add(self, data, columns, *masks):
1134 Add a new correlation plot.
1135 @param data pandas.DataFrame containing all data
1136 @param columns which are used to calculate the correlations
1137 @param masks different classes to show in TSNE
1141 import sklearn.manifold
1142 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1143 data = numpy.array([data[column]
for column
in columns]).T
1146 data = numpy.array([data[column][mask]
for column
in columns]).T
1147 data = model.transform(data)
1148 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1150 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1155 Sets limits, title, axis-labels and legend of the plot
1162 Plots importance matrix
1165 def add(self, data, columns, variables):
1167 Add a new correlation plot.
1168 @param data pandas.DataFrame containing all data
1169 @param columns which are used to calculate the correlations
1171 self.
figurefigure.set_tight_layout(
True)
1174 width = (numpy.max(x) - numpy.min(x))
1176 return numpy.zeros(x.shape)
1177 return (x - numpy.min(x)) / width * 100
1179 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1180 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1184 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1185 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1187 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1188 self.
axisaxis.set_yticklabels(variables, minor=
False)
1190 self.
axisaxis.xaxis.tick_top()
1192 for y
in range(importance_matrix.shape[0]):
1193 for x
in range(importance_matrix.shape[1]):
1194 txt = self.
axisaxis.text(x + 0.5, y + 0.5, f
'{importance_matrix[y, x]:.0f}',
1196 horizontalalignment=
'center',
1197 verticalalignment=
'center',
1199 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1201 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1202 cb.ax.set_yticklabels([
'low',
'high'])
1205 self.
axisaxis.set_ylim(0, importance_matrix.shape[0])
1207 self.
axisaxis.set_aspect(
'equal')
1213 Sets limits, title, axis-labels and legend of the plot
1220 Plots correlation matrix
1231 Creates a new figure if None is given, sets the default plot parameters
1232 @param figure default draw figure which is used
1235 self.
figurefigurefigure = matplotlib.figure.Figure(figsize=(32, 18))
1240 gs = matplotlib.gridspec.GridSpec(8, 2)
1250 def add(self, data, columns, signal_mask, bckgrd_mask):
1252 Add a new correlation plot.
1253 @param data pandas.DataFrame containing all data
1254 @param columns which are used to calculate the correlations
1256 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask]
for column
in columns])) * 100
1257 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1259 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1260 self.
bckgrd_axisbckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1268 self.
signal_axissignal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=
False)
1269 self.
signal_axissignal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=
False)
1271 self.
signal_axissignal_axis.set_xticklabels(columns, minor=
False, rotation=90)
1272 self.
signal_axissignal_axis.set_yticklabels(columns, minor=
False)
1275 self.
bckgrd_axisbckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=
False)
1276 self.
bckgrd_axisbckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=
False)
1278 self.
bckgrd_axisbckgrd_axis.set_xticklabels(columns, minor=
False, rotation=90)
1279 self.
bckgrd_axisbckgrd_axis.set_yticklabels(columns, minor=
False)
1281 for y
in range(signal_corr.shape[0]):
1282 for x
in range(signal_corr.shape[1]):
1283 txt = self.
signal_axissignal_axis.text(x + 0.5, y + 0.5, f
'{signal_corr[y, x]:.0f}',
1285 horizontalalignment=
'center',
1286 verticalalignment=
'center',
1288 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1290 for y
in range(bckgrd_corr.shape[0]):
1291 for x
in range(bckgrd_corr.shape[1]):
1292 txt = self.
bckgrd_axisbckgrd_axis.text(x + 0.5, y + 0.5, f
'{bckgrd_corr[y, x]:.0f}',
1294 horizontalalignment=
'center',
1295 verticalalignment=
'center',
1297 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1299 cb = self.
figurefigurefigure.colorbar(signal_heatmap, cax=self.
colorbar_axiscolorbar_axis, ticks=[-100, 0, 100], orientation=
'horizontal')
1300 cb.solids.set_rasterized(
True)
1301 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1303 self.
signal_axissignal_axis.text(0.5, -1.0,
"Signal", horizontalalignment=
'center')
1304 self.
bckgrd_axisbckgrd_axis.text(0.5, -1.0,
"Background", horizontalalignment=
'center')
1307 self.
signal_axissignal_axis.set_xlim(0, signal_corr.shape[0])
1308 self.
signal_axissignal_axis.set_ylim(0, signal_corr.shape[1])
1309 self.
bckgrd_axisbckgrd_axis.set_xlim(0, bckgrd_corr.shape[0])
1310 self.
bckgrd_axisbckgrd_axis.set_ylim(0, bckgrd_corr.shape[1])
1315 Sets limits, title, axis-labels and legend of the plot
1317 matplotlib.artist.setp(self.
bckgrd_axisbckgrd_axis.get_yticklabels(), visible=
False)
1321 if __name__ ==
'__main__':
1323 def get_data(N, columns):
1325 Creates fake data for example plots
1328 n = len(columns) - 1
1329 xs = numpy.random.normal(0, size=(N, n))
1330 xb = numpy.random.normal(1, size=(N, n))
1333 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1334 return data.reindex(numpy.random.permutation(data.index))
1338 seaborn.set(font_scale=3)
1339 seaborn.set_style(
'whitegrid')
1343 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1345 data.type.iloc[:N / 2] =
'Train'
1346 data.type.iloc[N / 2:] =
'Test'
1349 p.add(data,
'FastBDT')
1351 p.save(
'box_plot.png')
1354 p.add(data,
'FastBDT')
1355 p.add(data,
'NeuroBayes')
1357 p.save(
'verbose_distribution_plot.png')
1360 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1361 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1363 p.save(
'roc_purity_plot.png')
1366 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1367 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1369 p.save(
'roc_rejection_plot.png')
1372 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1373 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1375 p.save(
'diagonal_plot.png')
1378 p.add(data,
'FastBDT')
1379 p.add(data,
'NeuroBayes')
1381 p.save(
'distribution_plot.png')
1384 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1385 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1387 p.save(
'difference_plot.png')
1390 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1392 p.save(
'overtraining_plot.png')
1395 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1397 p.save(
'correlation_plot.png')
1400 data[
'FastBDT2'] = data[
'FastBDT']**2
1401 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1402 data[
'FastBDT3'] = data[
'FastBDT']**3
1403 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1404 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1406 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
Main axis which shows the correlation of the signal samples.
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
figure
figure which is used to draw
def __init__(self, figure=None)
bckgrd_axis
Axis which shows the correlation of the background samples.
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
Axis which shows shape of signal.
figure
figure which is used to draw
def __init__(self, figure=None)
axis_d2
Axis which shows shape of background.
axis
Main axis which is used to draw.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Keep first binning if user wants so.
normed_to_all_entries
Normalize histograms before drawing them.
first_binning
first binning
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
figure
figure which is used to draw
def __init__(self, cls, number_of_plots, figure=None)
sub_plots
the subplots which are displayed in the grid
axis_d1
Axis which shows the difference between training and test signal.
figure
figure which is used to draw
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
def __init__(self, figure=None)
axis_d2
Axis which shows the difference between training and test background.
axis
Main axis which is used to draw.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
Plots added to the axis so far.
figure
figure which is used to draw
errorband_kwargs
Default keyword arguments for errorband function.
def add(self, *args, **kwargs)
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
Labels of the plots added so far.
axis
Main axis which is used to draw.
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
box_axes
Axes for the boxplots.
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)