16 matplotlib.rcParams.update({
'font.size': 36})
18 import matplotlib.pyplot
as plt
19 import matplotlib.artist
20 import matplotlib.figure
21 import matplotlib.gridspec
22 import matplotlib.colors
23 import matplotlib.patches
24 import matplotlib.ticker
26 from .
import histogram
35 Base class for all Plotters.
69 Creates a new figure and axis if None is given, sets the default plot parameters
70 @param figure default draw figure which is used
71 @param axis default draw axis which is used
73 B2INFO(
"Create new figure for class " + str(type(self)))
75 self.
figure = matplotlib.figure.Figure(figsize=(32, 18))
76 self.
figure.set_tight_layout(
False)
87 self.
xmin, self.
xmax = float(0), float(1)
88 self.
ymin, self.
ymax = float(0), float(1)
110 Adds a new subplot to the figure, updates all other axes
111 according to the given gridspec
112 @param gridspecs gridspecs for all axes including the new one
114 for gs, ax
in zip(gridspecs[:-1], self.
figure.axes):
115 ax.set_position(gs.get_position(self.
figure))
116 ax.set_subplotspec(gs)
122 Save the figure into a file
123 @param filename of the file
125 B2INFO(
"Save figure for class " + str(type(self)))
126 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
127 canvas = FigureCanvas(self.
figure)
128 canvas.print_figure(filename, dpi=50)
133 Overrides default plot options for datapoint plot
134 @param plot_kwargs keyword arguments for the plot function
141 Overrides default errorbar options for datapoint errorbars
142 @param errorbar_kwargs keyword arguments for the errorbar function
149 Overrides default errorband options for datapoint errorband
150 @param errorband_kwargs keyword arguments for the fill_between function
157 Overrides default fill_between options for datapoint errorband
158 @param fill_kwargs keyword arguments for the fill_between function
165 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
166 @param x coordinates of the data points
167 @param y coordinates of the data points
168 @param xerr symmetric error on x data points
169 @param yerr symmetric error on y data points
177 if plot_kwargs
is None or 'color' not in plot_kwargs:
178 color = next(axis._get_lines.prop_cycler)
179 color = color[
'color']
180 plot_kwargs[
'color'] = color
182 color = plot_kwargs[
'color']
183 color = matplotlib.colors.ColorConverter().to_rgb(color)
184 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
185 patch.get_color = patch.get_facecolor
188 if plot_kwargs
is not None:
189 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
192 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
193 if 'color' not in errorbar_kwargs:
194 errorbar_kwargs[
'color'] = color
195 if 'ecolor' not in errorbar_kwargs:
196 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
197 e = axis.errorbar(x, y, xerr=xerr, yerr=yerr, rasterized=
True, **errorbar_kwargs)
200 if errorband_kwargs
is not None and yerr
is not None:
201 if 'color' not in errorband_kwargs:
202 errorband_kwargs[
'color'] = color
207 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
208 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
211 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
213 if fill_kwargs
is not None:
214 axis.fill_between(x, y, 0, rasterized=
True, **fill_kwargs)
216 return (tuple(patches), p, e, f)
218 def add(self, *args, **kwargs):
220 Add a new plot to this plotter
222 return NotImplemented
226 Finish plotting and set labels, legends and stuff
228 return NotImplemented
232 Scale limits to increase distance to boundaries
243 Plots the purity and the efficiency over the cut value (for cut choosing)
250 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
252 Add a new curve to the plot
253 @param data pandas.DataFrame containing all data
254 @param column which is used to calculate efficiency and purity for different cuts
255 @param signal_mask boolean numpy.array defining which events are signal events
256 @param bckgrd_mask boolean numpy.array defining which events are background events
257 @param weight_column column in data containing the weights for each event
260 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
263 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
264 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
266 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
267 purity, purity_error = hists.get_false_positives([
'Background'])
269 cuts = hists.bin_centers
271 self.
xmin, self.
xmax = numpy.nanmin([numpy.nanmin(cuts), self.
xmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmax])
272 self.
ymin, self.
ymax = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.
ymin]), \
273 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymax])
278 self.
labels.append(
"Efficiency")
280 self.
labels.append(
"True positive")
285 self.
labels.append(
"Purity")
287 self.
labels.append(
"False positive")
293 Sets limits, title, axis-labels and legend of the plot
297 self.
axis.set_title(
"Classification Plot")
298 self.
axis.get_xaxis().set_label_text(
'Cut Value')
299 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
305 Plots the signal to noise ratio over the cut value (for cut choosing)
312 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
314 Add a new curve to the plot
315 @param data pandas.DataFrame containing all data
316 @param column which is used to calculate signal to noise ratio for different cuts
317 @param signal_mask boolean numpy.array defining which events are signal events
318 @param bckgrd_mask boolean numpy.array defining which events are background events
319 @param weight_column column in data containing the weights for each event
322 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
324 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
326 cuts = hists.bin_centers
328 self.
xmin, self.
xmax = numpy.nanmin([numpy.nanmin(cuts), self.
xmin]), numpy.nanmax([numpy.nanmax(cuts), self.
xmax])
329 self.
ymin, self.
ymax = numpy.nanmin([numpy.nanmin(signal2noise), self.
ymin]), \
330 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymax])
334 self.
labels.append(column)
340 Sets limits, title, axis-labels and legend of the plot
344 self.
axis.set_title(
"Signal to Noise Plot")
345 self.
axis.get_xaxis().set_label_text(
'Cut Value')
346 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
352 Plots the purity over the efficiency also known as ROC curve
359 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
361 Add a new curve to the ROC plot
362 @param data pandas.DataFrame containing all data
363 @param column which is used to calculate efficiency and purity for different cuts
364 @param signal_mask boolean numpy.array defining which events are signal events
365 @param bckgrd_mask boolean numpy.array defining which events are background events
366 @param weight_column column in data containing the weights for each event
368 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
369 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
370 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
372 self.
xmin, self.
xmax = numpy.nanmin([efficiency.min(), self.
xmin]), numpy.nanmax([efficiency.max(), self.
xmax])
373 self.
ymin, self.
ymax = numpy.nanmin([numpy.nanmin(purity), self.
ymin]), numpy.nanmax([numpy.nanmax(purity), self.
ymax])
375 p = self.
_plot_datapoints(self.
axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
377 if label
is not None:
380 self.
labels.append(column)
385 Sets limits, title, axis-labels and legend of the plot
389 self.
axis.set_title(
"ROC Purity Plot")
390 self.
axis.get_xaxis().set_label_text(
'Efficiency')
391 self.
axis.get_yaxis().set_label_text(
'Purity')
392 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
398 Plots the rejection over the efficiency also known as ROC curve
405 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
407 Add a new curve to the ROC plot
408 @param data pandas.DataFrame containing all data
409 @param column which is used to calculate efficiency and rejection for different cuts
410 @param signal_mask boolean numpy.array defining which events are signal events
411 @param bckgrd_mask boolean numpy.array defining which events are background events
412 @param weight_column column in data containing the weights for each event
414 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
415 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
416 rejection, rejection_error = hists.get_efficiency([
'Background'])
417 rejection = 1 - rejection
419 self.
xmin, self.
xmax = numpy.nanmin([efficiency.min(), self.
xmin]), numpy.nanmax([efficiency.max(), self.
xmax])
420 self.
ymin, self.
ymax = numpy.nanmin([rejection.min(), self.
ymin]), numpy.nanmax([rejection.max(), self.
ymax])
422 auc = numpy.abs(numpy.trapz(rejection, efficiency))
424 p = self.
_plot_datapoints(self.
axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
426 if label
is not None:
427 self.
labels.append(label[:10] +
" ({:.2f})".format(auc))
429 self.
labels.append(column[:10] +
" ({:.2f})".format(auc))
434 Sets limits, title, axis-labels and legend of the plot
438 self.
axis.set_title(
"ROC Rejection Plot")
439 self.
axis.get_xaxis().set_label_text(
'Signal Efficiency')
440 self.
axis.get_yaxis().set_label_text(
'Background Rejection')
441 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
447 Plots multiple other plots into a grid 3x?
454 def __init__(self, cls, number_of_plots, figure=None):
456 Creates a new figure if None is given, sets the default plot parameters
457 @param figure default draw figure which is used
460 self.
figure = matplotlib.figure.Figure(figsize=(32, 18))
461 self.
figure.set_tight_layout(
True)
465 if number_of_plots == 1:
466 gs = matplotlib.gridspec.GridSpec(1, 1)
467 elif number_of_plots == 2:
468 gs = matplotlib.gridspec.GridSpec(1, 2)
469 elif number_of_plots == 3:
470 gs = matplotlib.gridspec.GridSpec(1, 3)
472 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
479 def add(self, i, *args, **kwargs):
481 Call add function of ith subplot
482 @param i position of the subplot
488 Sets limits, title, axis-labels and legend of the plot
497 Plots the purity in each bin over the classifier output.
504 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
506 Add a new curve to the Diagonal plot
507 @param data pandas.DataFrame containing all data
508 @param column which is used to calculate purity for different cuts
509 @param signal_mask boolean numpy.array defining which events are signal events
510 @param bckgrd_mask boolean numpy.array defining which events are background events
511 @param weight_column column in data containing the weights for each event
513 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
514 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
516 self.
xmin, self.
xmax = min(hists.bin_centers.min(), self.
xmin), max(hists.bin_centers.max(), self.
xmax)
520 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
522 self.
labels.append(column)
527 Sets limits, title, axis-labels and legend of the plot
530 self.
axis.
plot((0.0, 1.0), (0.0, 1.0), color=
'black')
533 self.
axis.set_title(
"Diagonal Plot")
534 self.
axis.get_xaxis().set_label_text(
'Classifier Output')
535 self.
axis.get_yaxis().set_label_text(
'Purity Per Bin')
536 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
542 Plots distribution of a quantity
545 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
546 keep_first_binning=False, range_in_std=None):
548 Creates a new figure and axis if None is given, sets the default plot parameters
549 @param figure default draw figure which is used
550 @param axis default draw axis which is used
551 @param normed_to_all_entries, normed_to_bin_width true if histograms should be normed (to the total number of entries and/or to the bin width) before drawing
552 @param keep_first_binning use the binning of the first distribution for further plots
553 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
555 super(Distribution, self).
__init__(figure, axis)
566 self.
ymax = float(
'-inf')
568 self.
xmin = float(
'inf')
570 self.
xmax = float(
'-inf')
578 def add(self, data, column, mask=None, weight_column=None, label=None):
580 Add a new distribution to the plots
581 @param data pandas.DataFrame containing all data
582 @param column which is used to calculate distribution histogram
583 @param mask boolean numpy.array defining which events are used for the histogram
584 @param weight_column column in data containing the weights for each event
587 mask = numpy.ones(len(data)).astype(
'bool')
593 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_std)
596 hist, hist_error = hists.get_hist(
'Total')
599 normalization = float(numpy.sum(hist))
600 hist = hist / normalization
601 hist_error = hist_error / normalization
604 hist = hist / hists.bin_widths
605 hist_error = hist_error / hists.bin_widths
607 self.
xmin, self.
xmax = min(hists.bin_centers.min(), self.
xmin), max(hists.bin_centers.max(), self.
xmax)
608 self.
ymin = numpy.nanmin([hist.min(), self.
ymin])
609 self.
ymax = numpy.nanmax([(hist + hist_error).max(), self.
ymax])
611 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
617 appendix =
' No data to plot!'
620 self.
labels.append(column + appendix)
622 self.
labels.append(label + appendix)
627 Sets limits, title, axis-labels and legend of the plot
629 self.
axis.set_title(
"Distribution Plot")
632 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
635 self.
axis.set_xlim((0., 1.))
636 self.
axis.set_ylim((0., 1.))
637 self.
axis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
646 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
648 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
650 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
652 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin')
666 Creates a new figure and axis if None is given, sets the default plot parameters
667 @param figure default draw figure which is used
668 @param axis default draw axis which is used
670 super().
__init__(figure=figure, axis=axis)
675 def add(self, data, column, mask=None, weight_column=None):
677 Add a new boxplot to the plots
678 @param data pandas.DataFrame containing all data
679 @param column which is used to calculate boxplot quantities
680 @param mask boolean numpy.array defining which events are used for the histogram
681 @param weight_column column in data containing the weights for each event
684 mask = numpy.ones(len(data)).astype(
'bool')
685 x = data[column][mask]
686 if weight_column
is not None:
687 weight = data[weight_column][mask]
688 B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
691 B2WARNING(
"Ignore empty boxplot.")
694 p = self.
axis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
695 boxprops=dict(facecolor=
'blue', alpha=0.5),
700 self.
labels.append(column)
703 self.axis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
704 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
705 self.axis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
706 x.quantile(0.75) - x.quantile(0.25)),
707 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
708 self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
709 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
716 Sets limits, title, axis-labels and legend of the plot
718 matplotlib.artist.setp(self.
axis.get_yaxis(), visible=
False)
720 self.
axis.set_title(
"Box Plot")
726 Plots the difference between two histograms
741 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
743 Creates a new figure and axis if None is given, sets the default plot parameters
744 @param figure default draw figure which is used
745 @param axis default draw axis which is used
746 @param normed normalize minuend and subtrahend before comparing them
747 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
749 super(Difference, self).
__init__(figure, axis)
759 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
761 Add a new difference plot
762 @param data pandas.DataFrame containing all data
763 @param column which is used to calculate distribution histogram
764 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
765 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
766 @param weight_column column in data containing the weights for each event
767 @param label label for the legend if None, the column name is used
769 hists =
histogram.Histograms(data, column, {
'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
770 weight_column=weight_column, equal_frequency=
False)
771 minuend, minuend_error = hists.get_hist(
'Minuend')
772 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
776 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
777 minuend = minuend / numpy.sum(minuend)
778 subtrahend = subtrahend / numpy.sum(subtrahend)
779 difference = minuend - subtrahend
782 difference = difference - numpy.mean(difference)
784 self.
xmin, self.
xmax = min(hists.bin_centers.min(), self.
xmin), max(hists.bin_centers.max(), self.
xmax)
785 self.
ymin = min((difference - difference_error).min(), self.
ymin)
786 self.
ymax = max((difference + difference_error).max(), self.
ymax)
788 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
793 self.
labels.append(column)
799 Sets limits, title, axis-labels and legend of the plot
801 self.
axis.
plot((self.
xmin, self.
xmax), (0, 0), color=line_color, linewidth=4, rasterized=
True)
805 self.
axis.set_title(
"Difference Plot")
806 self.
axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
808 self.
axis.get_yaxis().set_label_text(
'Difference')
809 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
815 Create TMVA-like overtraining control plot for a classification training
829 Creates a new figure if None is given, sets the default plot parameters
830 @param figure default draw figure which is used
833 self.
figure = matplotlib.figure.Figure(figsize=(32, 18))
834 self.
figure.set_tight_layout(
True)
838 gs = matplotlib.gridspec.GridSpec(5, 1)
845 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
847 Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
848 otherwise there are too many curves in the plot to recognize anything.
849 @param data pandas.DataFrame containing all data
850 @param column which is used to calculate distribution histogram
851 @param train_mask boolean numpy.array defining which events are training events
852 @param test_mask boolean numpy.array defining which events are test events
853 @param signal_mask boolean numpy.array defining which events are signal events
854 @param bckgrd_mask boolean numpy.array defining which events are background events
855 @param weight_column column in data containing the weights for each event
862 distribution.add(data, column, test_mask & signal_mask, weight_column)
863 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
865 distribution.set_plot_options({
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'steps-mid-',
'lw': 4})
866 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
867 distribution.set_errorbar_options(
None)
868 distribution.set_errorband_options(
None)
869 distribution.add(data, column, train_mask & signal_mask, weight_column)
870 distribution.set_plot_options({
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'steps-mid-',
'lw': 4})
871 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'mid'})
872 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
874 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
875 distribution.finish()
877 self.
plot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
879 difference_signal.set_plot_options(self.
plot_kwargs)
882 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
883 self.
axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
884 self.
axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
885 difference_signal.plots = difference_signal.labels = []
886 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
888 self.
plot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
890 difference_bckgrd.set_plot_options(self.
plot_kwargs)
893 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
894 self.
axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
895 self.
axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
896 difference_bckgrd.plots = difference_bckgrd.labels = []
897 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
902 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
903 B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
905 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
906 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
907 self.
axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
908 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1.transAxes)
909 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
910 B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
912 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
913 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
914 self.
axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
916 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2.transAxes)
918 B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
924 Sets limits, title, axis-labels and legend of the plot
926 self.
axis.set_title(
"Overtraining Plot")
929 matplotlib.artist.setp(self.
axis.get_xticklabels(), visible=
False)
930 matplotlib.artist.setp(self.
axis_d1.get_xticklabels(), visible=
False)
931 self.
axis.get_xaxis().set_label_text(
'')
932 self.
axis_d1.get_xaxis().set_label_text(
'')
933 self.
axis_d2.get_xaxis().set_label_text(
'Classifier Output')
939 Plots distribution of a quantity including boxplots
945 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
947 Creates a new figure and axis if None is given, sets the default plot parameters
948 @param figure default draw figure which is used
949 @param axis default draw axis which is used
950 @param normed true if the histograms should be normed before drawing
951 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
953 super(VerboseDistribution, self).
__init__(figure, axis)
962 def add(self, data, column, mask=None, weight_column=None, label=None):
964 Add a new distribution plot, with additional information like a boxplot compared to
965 the ordinary Distribution plot.
966 @param data pandas.DataFrame containing all data
967 @param column which is used to calculate distribution histogram
968 @param mask boolean numpy.array defining which events are used for the distribution histogram
969 @param weight_column column in data containing the weights for each event
977 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
978 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :]
for i
in range(n)]
986 box.add(data, column, mask, weight_column)
987 if len(box.plots) > 0:
988 box.plots[0][
'boxes'][0].set_facecolor(self.
distribution.plots[-1][0][0].get_color())
996 Sets limits, title, axis-labels and legend of the plot
999 matplotlib.artist.setp(self.
axis.get_xticklabels(), visible=
False)
1000 self.
axis.get_xaxis().set_label_text(
'')
1001 for box_axis
in self.
box_axes[:-1]:
1002 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1003 box_axis.set_title(
"")
1004 box_axis.get_xaxis().set_label_text(
'')
1006 self.
axis.set_title(
"Distribution Plot")
1008 loc=
'best', fancybox=
True, framealpha=0.5)
1014 Plots change of a distribution of a quantity depending on the cut on a classifier
1027 Creates a new figure if None is given, sets the default plot parameters
1028 @param figure default draw figure which is used
1031 self.
figure = matplotlib.figure.Figure(figsize=(32, 18))
1032 self.
figure.set_tight_layout(
True)
1036 gs = matplotlib.gridspec.GridSpec(3, 2)
1043 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1045 Add a new correlation plot.
1046 @param data pandas.DataFrame containing all data
1047 @param column which is used to calculate distribution histogram
1048 @param cut_column which is used to calculate cut on the other quantity defined by column
1049 @param quantiles list of quantiles between 0 and 100, defining the different cuts
1050 @param weight_column column in data containing the weights for each event
1052 if len(data[cut_column]) == 0:
1053 B2WARNING(
"Ignore empty Correlation.")
1058 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1060 if weight_column
is not None:
1061 weights = numpy.array(data[weight_column][m])
1063 weights = numpy.ones(len(data[column][m]))
1067 xrange = np.percentile(data[column][m], [5, 95]).astype(np.float32)
1069 colormap = plt.get_cmap(
'coolwarm')
1070 tmp, x = np.histogram(data[column][m], bins=100,
1071 range=xrange, normed=
True, weights=weights)
1072 bin_center = ((x + np.roll(x, 1)) / 2)[1:]
1073 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1075 for quantil
in np.arange(5, 100, 5):
1076 cut = np.percentile(data[cut_column][m], quantil)
1077 sel = data[cut_column][m] >= cut
1078 y, x = np.histogram(data[column][m][sel], bins=100,
1079 range=xrange, normed=
True, weights=weights[sel])
1080 bin_center = ((x + np.roll(x, 1)) / 2)[1:]
1081 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1084 axes[i].set_ylim(bottom=0)
1087 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1092 Sets limits, title, axis-labels and legend of the plot
1099 Plots multivariate distribution using TSNE algorithm
1102 def add(self, data, columns, *masks):
1104 Add a new TSNE plot.
1105 @param data pandas.DataFrame containing all data
1106 @param columns which are used as input dimensions for the TSNE embedding
1107 @param masks different classes to show in TSNE
1111 import sklearn.manifold
1112 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1113 data = numpy.array([data[column]
for column
in columns]).T
1116 data = numpy.array([data[column][mask]
for column
in columns]).T
1117 data = model.transform(data)
1118 self.
axis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1120 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1125 Sets limits, title, axis-labels and legend of the plot
1132 Plots importance matrix
1135 def add(self, data, columns, variables):
1137 Add a new importance plot.
1138 @param data pandas.DataFrame containing all data
1139 @param columns which contain the importance values; each column is scaled to the range 0 to 100
1141 self.
figure.set_tight_layout(
True)
1144 width = (numpy.max(x) - numpy.min(x))
1146 return numpy.zeros(x.shape)
1147 return (x - numpy.min(x)) / width * 100
1149 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1150 importance_heatmap = self.
axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1154 self.
axis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1155 self.
axis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1157 self.
axis.set_xticklabels(columns, minor=
False, rotation=90)
1158 self.
axis.set_yticklabels(variables, minor=
False)
1160 self.
axis.xaxis.tick_top()
1162 for y
in range(importance_matrix.shape[0]):
1163 for x
in range(importance_matrix.shape[1]):
1164 self.
axis.text(x + 0.5, y + 0.5,
'%.0f' % importance_matrix[y, x],
1166 horizontalalignment=
'center',
1167 verticalalignment=
'center')
1169 cb = self.
figure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1170 cb.ax.set_yticklabels([
'low',
'high'])
1172 self.
axis.set_aspect(
'equal')
1178 Sets limits, title, axis-labels and legend of the plot
1185 Plots correlation matrix
1196 Creates a new figure if None is given, sets the default plot parameters
1197 @param figure default draw figure which is used
1200 self.
figure = matplotlib.figure.Figure(figsize=(32, 18))
1201 self.
figure.set_tight_layout(
True)
1205 gs = matplotlib.gridspec.GridSpec(8, 2)
1215 def add(self, data, columns, signal_mask, bckgrd_mask):
1217 Add a new correlation plot.
1218 @param data pandas.DataFrame containing all data
1219 @param columns which are used to calculate the correlations
1221 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask]
for column
in columns])) * 100
1222 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1224 signal_heatmap = self.
signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1226 bckgrd_heatmap = self.
bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1234 self.
signal_axis.set_xticks(numpy.arange(signal_corr.shape[0]) + 0.5, minor=
False)
1235 self.
signal_axis.set_yticks(numpy.arange(signal_corr.shape[1]) + 0.5, minor=
False)
1237 self.
signal_axis.set_xticklabels(columns, minor=
False, rotation=90)
1238 self.
signal_axis.set_yticklabels(columns, minor=
False)
1241 self.
bckgrd_axis.set_xticks(numpy.arange(bckgrd_corr.shape[0]) + 0.5, minor=
False)
1242 self.
bckgrd_axis.set_yticks(numpy.arange(bckgrd_corr.shape[1]) + 0.5, minor=
False)
1244 self.
bckgrd_axis.set_xticklabels(columns, minor=
False, rotation=90)
1245 self.
bckgrd_axis.set_yticklabels(columns, minor=
False)
1247 for y
in range(signal_corr.shape[0]):
1248 for x
in range(signal_corr.shape[1]):
1249 self.
signal_axis.text(x + 0.5, y + 0.5,
'%.0f' % signal_corr[y, x],
1251 horizontalalignment=
'center',
1252 verticalalignment=
'center')
1254 for y
in range(bckgrd_corr.shape[0]):
1255 for x
in range(bckgrd_corr.shape[1]):
1256 self.
bckgrd_axis.text(x + 0.5, y + 0.5,
'%.0f' % bckgrd_corr[y, x],
1258 horizontalalignment=
'center',
1259 verticalalignment=
'center')
1261 cb = self.
figure.colorbar(signal_heatmap, cax=self.
colorbar_axis, ticks=[-100, 0, 100], orientation=
'horizontal')
1262 cb.solids.set_rasterized(
True)
1263 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1265 self.
signal_axis.text(0.5, -1.0,
"Signal", horizontalalignment=
'center')
1266 self.
bckgrd_axis.text(0.5, -1.0,
"Background", horizontalalignment=
'center')
1272 Sets limits, title, axis-labels and legend of the plot
1274 matplotlib.artist.setp(self.
bckgrd_axis.get_yticklabels(), visible=
False)
1278 if __name__ ==
'__main__':
1280 def get_data(N, columns):
1282 Creates fake data for example plots
1285 n = len(columns) - 1
1286 xs = numpy.random.normal(0, size=(N, n))
1287 xb = numpy.random.normal(1, size=(N, n))
1290 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1291 return data.reindex(numpy.random.permutation(data.index))
1295 seaborn.set(font_scale=3)
1296 seaborn.set_style(
'whitegrid')
1300 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1302 data.type.iloc[:N / 2] =
'Train'
1303 data.type.iloc[N / 2:] =
'Test'
1306 p.add(data,
'FastBDT')
1308 p.save(
'box_plot.png')
1311 p.add(data,
'FastBDT')
1312 p.add(data,
'NeuroBayes')
1314 p.save(
'verbose_distribution_plot.png')
1317 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1318 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1320 p.save(
'roc_purity_plot.png')
1323 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1324 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1326 p.save(
'roc_rejection_plot.png')
1329 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1330 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1332 p.save(
'diagonal_plot.png')
1335 p.add(data,
'FastBDT')
1336 p.add(data,
'NeuroBayes')
1338 p.save(
'distribution_plot.png')
1341 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1342 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1344 p.save(
'difference_plot.png')
1347 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1349 p.save(
'overtraining_plot.png')
1352 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1354 p.save(
'correlation_plot.png')
1357 data[
'FastBDT2'] = data[
'FastBDT']**2
1358 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1359 data[
'FastBDT3'] = data[
'FastBDT']**3
1360 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1361 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1363 p.save(
'correlation_matrix.png')