17import matplotlib.pyplot
as plt
18import matplotlib.artist
19import matplotlib.figure
20import matplotlib.gridspec
21import matplotlib.colors
22import matplotlib.patches
23import matplotlib.ticker
24import matplotlib.patheffects
as PathEffects
27from basf2_mva_evaluation
import histogram
37matplotlib.rcParams.update({
'font.size': 36})
40plt.style.use(
"belle2")
45 Base class for all Plotters.
79 Creates a new figure and axis
if None is given, sets the default plot parameters
80 @param figure default draw figure which
is used
81 @param axis default draw axis which
is used
83 b2.B2INFO("Create new figure for class " + str(type(self)))
126 Adds a new subplot to the figure, updates all other axes
127 according to the given gridspec
128 @param gridspecs gridspecs
for all axes including the new one
130 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
132 ax.set_subplotspec(gs)
138 Save the figure into a file
139 @param filename of the file
141 b2.B2INFO("Save figure for class " + str(type(self)))
142 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
144 canvas.print_figure(filename, dpi=50)
149 Overrides default plot options for datapoint plot
150 @param plot_kwargs keyword arguments
for the plot function
157 Overrides default errorbar options for datapoint errorbars
158 @param errorbar_kwargs keyword arguments
for the errorbar function
165 Overrides default errorband options for datapoint errorband
166 @param errorband_kwargs keyword arguments for the fill_between function
173 Overrides default fill_between options for datapoint errorband
174 @param fill_kwargs keyword arguments
for the fill_between function
181 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
182 @param x coordinates of the data points
183 @param y coordinates of the data points
184 @param xerr symmetric error on x data points
185 @param yerr symmetric error on y data points
193 if plot_kwargs
is None or 'color' not in plot_kwargs:
194 color = next(axis._get_lines.prop_cycler)
195 color = color[
'color']
196 plot_kwargs[
'color'] = color
198 color = plot_kwargs[
'color']
199 color = matplotlib.colors.ColorConverter().to_rgb(color)
200 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
201 patch.get_color = patch.get_facecolor
204 if plot_kwargs
is not None:
205 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
208 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
209 if 'color' not in errorbar_kwargs:
210 errorbar_kwargs[
'color'] = color
211 if 'ecolor' not in errorbar_kwargs:
212 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
217 if not isinstance(xerr, (numpy.ndarray, list)):
218 xerr = xerr*numpy.ones(len(x))
219 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
221 e = axis.errorbar(x[mask], y[mask], xerr=xerr[mask], yerr=yerr[mask], rasterized=
True, **errorbar_kwargs)
224 if errorband_kwargs
is not None and yerr
is not None:
225 if 'color' not in errorband_kwargs:
226 errorband_kwargs[
'color'] = color
231 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
232 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
235 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
237 if fill_kwargs
is not None:
239 x = numpy.append(x, x[-1]+2*xerr[-1])
240 y = numpy.append(y, y[-1])
241 xerr = numpy.append(xerr, xerr[-1])
243 axis.fill_between(x-xerr, y, 0, rasterized=
True, **fill_kwargs)
245 return (tuple(patches), p, e, f)
def add(self, *args, **kwargs):
    """
    Add a new plot to this plotter.

    This implementation draws nothing: it accepts arbitrary positional and
    keyword arguments, ignores them and hands back the NotImplemented
    sentinel.
    NOTE(review): presumably every concrete plotter overrides this method
    with its own signature -- confirm against the subclasses.

    @param args positional arguments (ignored here)
    @param kwargs keyword arguments (ignored here)
    @return the NotImplemented singleton
    """
    return NotImplemented
255 Finish plotting and set labels, legends
and stuff
257 return NotImplemented
261 Scale limits to increase distance to boundaries
272 Plots the purity and the efficiency over the cut value (
for cut choosing)
279 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
281 Add a new curve to the plot
282 @param data pandas.DataFrame containing all data
283 @param column which
is used to calculate efficiency
and purity
for different cuts
284 @param signal_mask boolean numpy.array defining which events are signal events
285 @param bckgrd_mask boolean numpy.array defining which events are background events
286 @param weight_column column
in data containing the weights
for each event
289 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
292 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
293 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
295 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
296 purity, purity_error = hists.get_false_positives([
'Background'])
298 cuts = hists.bin_centers
302 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
322 Sets limits, title, axis-labels and legend of the plot
326 self.axisaxis.set_title("Classification Plot")
327 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
334 Plots the signal to noise ratio over the cut value (for cut choosing)
341 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
343 Add a new curve to the plot
344 @param data pandas.DataFrame containing all data
345 @param column which
is used to calculate signal to noise ratio
for different cuts
346 @param signal_mask boolean numpy.array defining which events are signal events
347 @param bckgrd_mask boolean numpy.array defining which events are background events
348 @param weight_column column
in data containing the weights
for each event
351 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
353 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
355 cuts = hists.bin_centers
359 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymaxymaxymax])
369 Sets limits, title, axis-labels and legend of the plot
373 self.axisaxis.set_title("Signal to Noise Plot")
374 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
381 Plots the purity over the efficiency also known as ROC curve
388 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
390 Add a new curve to the ROC plot
391 @param data pandas.DataFrame containing all data
392 @param column which
is used to calculate efficiency
and purity
for different cuts
393 @param signal_mask boolean numpy.array defining which events are signal events
394 @param bckgrd_mask boolean numpy.array defining which events are background events
395 @param weight_column column
in data containing the weights
for each event
397 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
398 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
399 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
406 if label
is not None:
414 Sets limits, title, axis-labels and legend of the plot
418 self.axisaxis.set_title("ROC Purity Plot")
419 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
420 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
427 Plots the rejection over the efficiency also known as ROC curve
434 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
436 Add a new curve to the ROC plot
437 @param data pandas.DataFrame containing all data
438 @param column which is used to calculate efficiency and rejection for different cuts
439 @param signal_mask boolean numpy.array defining which events are signal events
440 @param bckgrd_mask boolean numpy.array defining which events are background events
441 @param weight_column column
in data containing the weights
for each event
443 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
444 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
445 rejection, rejection_error = hists.get_efficiency([
'Background'])
446 rejection = 1 - rejection
447 if isinstance(efficiency, int)
and not isinstance(rejection, int):
448 efficiency = numpy.array([efficiency] * len(rejection))
449 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
450 rejection = numpy.array([rejection] * len(efficiency))
451 elif isinstance(rejection, int)
and isinstance(efficiency, int):
452 efficiency = numpy.array([efficiency])
453 rejection = numpy.array([rejection])
458 auc = numpy.abs(numpy.trapz(rejection, efficiency))
462 if label
is not None:
470 Sets limits, title, axis-labels and legend of the plot
474 self.axisaxis.set_title("ROC Rejection Plot")
475 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
476 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
483 Plots multiple other plots into a grid 3x?
490 def __init__(self, cls, number_of_plots, figure=None):
492 Creates a new figure if None is given, sets the default plot parameters
493 @param figure default draw figure which
is used
502 if number_of_plots == 1:
503 gs = matplotlib.gridspec.GridSpec(1, 1)
504 elif number_of_plots == 2:
505 gs = matplotlib.gridspec.GridSpec(1, 2)
506 elif number_of_plots == 3:
507 gs = matplotlib.gridspec.GridSpec(1, 3)
509 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
517 def add(self, i, *args, **kwargs):
519 Call add function of ith subplot
520 @param i position of the subplot
526 Sets limits, title, axis-labels and legend of the plot
535 Plots the purity in each bin over the classifier output.
542 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
544 Add a new curve to the Diagonal plot
545 @param data pandas.DataFrame containing all data
546 @param column which
is used to calculate purity
for different cuts
547 @param signal_mask boolean numpy.array defining which events are signal events
548 @param bckgrd_mask boolean numpy.array defining which events are background events
549 @param weight_column column
in data containing the weights
for each event
551 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
552 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
565 Sets limits, title, axis-labels and legend of the plot
568 self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
571 self.
axisaxis.set_title(
"Diagonal Plot")
572 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
573 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
580 Plots distribution of a quantity
583 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
584 keep_first_binning=False, range_in_std=None):
586 Creates a new figure and axis
if None is given, sets the default plot parameters
587 @param figure default draw figure which
is used
588 @param axis default draw axis which
is used
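589 @param normed_to_all_entries true if histograms should be divided by their total number of entries before drawing
@param normed_to_bin_width true if histograms should be divided by the bin widths before drawing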
590 @param keep_first_binning use the binning of the first distribution
for further plots
591 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
616 def add(self, data, column, mask=None, weight_column=None, label=None):
618 Add a new distribution to the plots
619 @param data pandas.DataFrame containing all data
620 @param column which
is used to calculate distribution histogram
621 @param mask boolean numpy.array defining which events are used
for the histogram
622 @param weight_column column
in data containing the weights
for each event
625 mask = numpy.ones(len(data)).astype(
'bool')
631 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_std)
634 hist, hist_error = hists.get_hist(
'Total')
637 normalization = float(numpy.sum(hist))
638 hist = hist / normalization
639 hist_error = hist_error / normalization
642 hist = hist / hists.bin_widths
643 hist_error = hist_error / hists.bin_widths
655 appendix =
' No data to plot!'
665 Sets limits, title, axis-labels and legend of the plot
667 self.axisaxis.set_title("Distribution Plot")
675 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
684 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
686 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
688 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
690 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
704 Creates a new figure and axis
if None is given, sets the default plot parameters
705 @param figure default draw figure which
is used
706 @param axis default draw axis which
is used
708 super().__init__(figure=figure, axis=axis)
713 def add(self, data, column, mask=None, weight_column=None):
715 Add a new boxplot to the plots
716 @param data pandas.DataFrame containing all data
717 @param column which
is used to calculate boxplot quantities
718 @param mask boolean numpy.array defining which events are used
for the histogram
719 @param weight_column column
in data containing the weights
for each event
722 mask = numpy.ones(len(data)).astype(
'bool')
723 x = data[column][mask]
724 if weight_column
is not None:
726 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
729 b2.B2WARNING(
"Ignore empty boxplot.")
732 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
733 boxprops=dict(facecolor=
'blue', alpha=0.5),
741 self.axisaxis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' +
'\n' +
r'$median = {:.2f}$').format(x.mean(), x.median()),
742 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
743 self.
axisaxis.text(0.4, 0.9, (
r'$ \sigma = {:.2f}$' +
'\n' +
r'$IQD = {:.2f}$').format(x.std(),
744 x.quantile(0.75) - x.quantile(0.25)),
745 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
746 self.
axisaxis.text(0.7, 0.9, (
r'$min = {:.2f}$' +
'\n' +
r'$max = {:.2f}$').format(x.min(), x.max()),
747 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
754 Sets limits, title, axis-labels and legend of the plot
756 matplotlib.artist.setp(self.axisaxis.get_yaxis(), visible=False)
764 Plots the difference between two histograms
779 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
781 Creates a new figure and axis
if None is given, sets the default plot parameters
782 @param figure default draw figure which
is used
783 @param axis default draw axis which
is used
784 @param normed normalize minuend
and subtrahend before comparing them
785 @param shift_to_zero mean difference
is shifted to zero, to remove constant offset due to e.g. different sample sizes
797 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
799 Add a new difference plot
800 @param data pandas.DataFrame containing all data
801 @param column which
is used to calculate distribution histogram
802 @param minuend_mask boolean numpy.array defining which events are
for the minuend histogram
803 @param subtrahend_mask boolean numpy.array defining which events are
for the subtrahend histogram
804 @param weight_column column
in data containing the weights
for each event
805 @param label label
for the legend
if None, the column name
is used
807 hists = histogram.Histograms(data, column, {'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
808 weight_column=weight_column, equal_frequency=
False)
809 minuend, minuend_error = hists.get_hist(
'Minuend')
810 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
814 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
815 minuend = minuend / numpy.sum(minuend)
816 subtrahend = subtrahend / numpy.sum(subtrahend)
817 difference = minuend - subtrahend
820 difference = difference - numpy.mean(difference)
826 p = self.
_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
837 Sets limits, title, axis-labels and legend of the plot
843 self.
axisaxis.set_title(
"Difference Plot")
844 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
846 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
853 Create TMVA-like overtraining control plot for a classification training
867 Creates a new figure if None is given, sets the default plot parameters
868 @param figure default draw figure which
is used
877 gs = matplotlib.gridspec.GridSpec(5, 1)
887 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
889 Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
890 otherwise there are too many curves in the plot to recognize anything.
891 @param data pandas.DataFrame containing all data
892 @param column which
is used to calculate distribution histogram
893 @param train_mask boolean numpy.array defining which events are training events
894 @param test_mask boolean numpy.array defining which events are test events
895 @param signal_mask boolean numpy.array defining which events are signal events
896 @param bckgrd_mask boolean numpy.array defining which events are background events
897 @param weight_column column
in data containing the weights
for each event
904 distribution.add(data, column, test_mask & signal_mask, weight_column)
905 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
907 distribution.set_plot_options(
908 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
909 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
910 distribution.set_errorbar_options(
None)
911 distribution.set_errorband_options(
None)
912 distribution.add(data, column, train_mask & signal_mask, weight_column)
913 distribution.set_plot_options(
914 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
915 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
916 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
918 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
919 distribution.finish()
921 self.
plot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
923 difference_signal.set_plot_options(self.
plot_kwargs)
926 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
927 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
928 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
929 difference_signal.plots = difference_signal.labels = []
930 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
932 self.
plot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
934 difference_bckgrd.set_plot_options(self.
plot_kwargs)
937 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
938 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
939 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
940 difference_bckgrd.plots = difference_bckgrd.labels = []
941 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
946 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
947 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
949 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
950 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
951 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
952 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
953 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
954 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
956 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
957 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
958 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
960 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
962 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
968 Sets limits, title, axis-labels and legend of the plot
973 matplotlib.artist.setp(self.
axisaxisaxisaxis.get_xticklabels(), visible=
False)
974 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
977 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
983 Plots distribution of a quantity including boxplots
989 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
991 Creates a new figure and axis
if None is given, sets the default plot parameters
992 @param figure default draw figure which
is used
993 @param axis default draw axis which
is used
994 @param normed true
if the histograms should be normed before drawing
995 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
1007 def add(self, data, column, mask=None, weight_column=None, label=None):
1009 Add a new distribution plot, with additional information like a boxplot compared to
1010 the ordinary Distribution plot.
1011 @param data pandas.DataFrame containing all data
1012 @param column which
is used to calculate distribution histogram
1013 @param mask boolean numpy.array defining which events are used
for the distribution histogram
1014 @param weight_column column
in data containing the weights
for each event
1022 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1023 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i
in range(n)]
1029 mask = mask & (data[column] > (mean - self.
range_in_std * std)) & (data[column] < (mean + self.
range_in_std * std))
1031 box.add(data, column, mask, weight_column)
1032 if len(box.plots) > 0:
1033 box.plots[0][
'boxes'][0].set_facecolor(self.
distribution.plots[-1][0][0].get_color())
1041 Sets limits, title, axis-labels and legend of the plot
1044 matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
1045 self.
axisaxis.get_xaxis().set_label_text(
'')
1047 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1048 box_axis.set_title(
"")
1049 box_axis.get_xaxis().set_label_text(
'')
1051 self.
axisaxis.set_title(
"Distribution Plot")
1053 loc=
'best', fancybox=
True, framealpha=0.5)
1059 Plots change of a distribution of a quantity depending on the cut on a classifier
1072 Creates a new figure if None is given, sets the default plot parameters
1073 @param figure default draw figure which
is used
1082 gs = matplotlib.gridspec.GridSpec(3, 2)
1092 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1094 Add a new correlation plot.
1095 @param data pandas.DataFrame containing all data
1096 @param column which
is used to calculate distribution histogram
1097 @param cut_column which
is used to calculate cut on the other quantity defined by column
1098 @param quantiles list of quantiles between 0
and 100, defining the different cuts
1099 @param weight_column column
in data containing the weights
for each event
1101 if len(data[cut_column]) == 0:
1102 b2.B2WARNING(
"Ignore empty Correlation.")
1107 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1109 if weight_column
is not None:
1110 weights = numpy.array(data[weight_column][m])
1112 weights = numpy.ones(len(data[column][m]))
1114 xrange = numpy.percentile(data[column][m], [5, 95])
1116 colormap = plt.get_cmap(
'coolwarm')
1117 tmp, x = numpy.histogram(data[column][m], bins=100,
1118 range=xrange, density=
True, weights=weights)
1119 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1120 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1122 for quantil
in numpy.arange(5, 100, 5):
1123 cut = numpy.percentile(data[cut_column][m], quantil)
1124 sel = data[cut_column][m] >= cut
1125 y, x = numpy.histogram(data[column][m][sel], bins=100,
1126 range=xrange, density=
True, weights=weights[sel])
1127 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1128 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1131 axes[i].set_ylim(bottom=0)
1134 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1139 Sets limits, title, axis-labels and legend of the plot
1146 Plots multivariate distribution using TSNE algorithm
1149 def add(self, data, columns, *masks):
1151 Add a new TSNE plot.
1152 @param data pandas.DataFrame containing all data
1153 @param columns which are used as input dimensions for the TSNE model
1154 @param masks different classes to show
in TSNE
1158 import sklearn.manifold
1159 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1160 data = numpy.array([data[column]
for column
in columns]).T
1163 data = numpy.array([data[column][mask]
for column
in columns]).T
1164 data = model.transform(data)
1165 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1167 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1172 Sets limits, title, axis-labels and legend of the plot
1179 Plots importance matrix
1182 def add(self, data, columns, variables):
1184 Add a new importance matrix plot.
1185 @param data pandas.DataFrame containing all data
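1186 @param columns columns of data which are normalized and drawn as the columns of the importance matrix (x tick labels)
@param variables labels for the rows of the importance matrix (y tick labels)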
1191 width = (numpy.max(x) - numpy.min(x))
1193 return numpy.zeros(x.shape)
1194 return (x - numpy.min(x)) / width * 100
1196 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1197 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1201 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1202 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1204 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1205 self.
axisaxis.set_yticklabels(variables, minor=
False)
1209 for y
in range(importance_matrix.shape[0]):
1210 for x
in range(importance_matrix.shape[1]):
1211 txt = self.
axisaxis.text(x + 0.5, y + 0.5, f
'{importance_matrix[y, x]:.0f}',
1213 horizontalalignment=
'center',
1214 verticalalignment=
'center',
1216 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1218 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1219 cb.ax.set_yticklabels([
'low',
'high'])
1222 self.
axisaxis.set_ylim(0, importance_matrix.shape[0])
1230 Sets limits, title, axis-labels and legend of the plot
1237 Plots correlation matrix
1248 Creates a new figure if None is given, sets the default plot parameters
1249 @param figure default draw figure which
is used
1258 gs = matplotlib.gridspec.GridSpec(8, 2)
1270 def add(self, data, columns, signal_mask, bckgrd_mask):
1272 Add a new correlation plot.
1273 @param data pandas.DataFrame containing all data
1274 @param columns which are used to calculate the correlations
1276 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column
in columns])) * 100
1277 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1279 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1301 for y
in range(signal_corr.shape[0]):
1302 for x
in range(signal_corr.shape[1]):
1305 horizontalalignment=
'center',
1306 verticalalignment=
'center',
1308 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1310 for y
in range(bckgrd_corr.shape[0]):
1311 for x
in range(bckgrd_corr.shape[1]):
1314 horizontalalignment=
'center',
1315 verticalalignment=
'center',
1317 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1320 cb.solids.set_rasterized(
True)
1321 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1335 Sets limits, title, axis-labels and legend of the plot
1341if __name__ ==
'__main__':
1343 def get_data(N, columns):
1345 Creates fake data for example plots
1348 n = len(columns) - 1
1349 xs = numpy.random.normal(0, size=(N, n))
1350 xb = numpy.random.normal(1, size=(N, n))
1353 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1354 return data.reindex(numpy.random.permutation(data.index))
1358 seaborn.set(font_scale=3)
1359 seaborn.set_style(
'whitegrid')
1363 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1365 data.type.iloc[:N / 2] =
'Train'
1366 data.type.iloc[N / 2:] =
'Test'
1369 p.add(data,
'FastBDT')
1371 p.save(
'box_plot.png')
1374 p.add(data,
'FastBDT')
1375 p.add(data,
'NeuroBayes')
1377 p.save(
'verbose_distribution_plot.png')
1380 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1381 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1383 p.save(
'roc_purity_plot.png')
1386 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1387 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1389 p.save(
'roc_rejection_plot.png')
1392 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1393 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1395 p.save(
'diagonal_plot.png')
1398 p.add(data,
'FastBDT')
1399 p.add(data,
'NeuroBayes')
1401 p.save(
'distribution_plot.png')
1404 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1405 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1407 p.save(
'difference_plot.png')
1410 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1412 p.save(
'overtraining_plot.png')
1415 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1417 p.save(
'correlation_plot.png')
1420 data[
'FastBDT2'] = data[
'FastBDT']**2
1421 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1422 data[
'FastBDT3'] = data[
'FastBDT']**3
1423 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1424 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1426 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
add signal subplot
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
None bckgrd_axis
Axis which shows the correlation of the background samples.
def __init__(self, figure=None)
None figure
figure which is used to draw
None signal_axis
Main axis which shows the correlation of the signal samples.
bckgrd_axis
add background subplot
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
define second subplot
None axis_d1
Axis which shows shape of signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows shape of background.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Keep first binning if user wants so.
normed_to_all_entries
Normalize histograms to the total number of entries before drawing them.
first_binning
first binning
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms to the bin width before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
def __init__(self, cls, number_of_plots, figure=None)
None figure
figure which is used to draw
sub_plots
the subplots which are displayed in the grid
axis
the axis of the first subplot
axis_d1
define second subplot
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
None axis_d1
Axis which shows the difference between training and test signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows the difference between training and test background.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
None ymin
Minimum y value.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
create empty list for plots
None ymax
Maximum y value.
errorband_kwargs
Default keyword arguments for errorband function.
None axis
Main axis which is used to draw.
def add(self, *args, **kwargs)
None xmin
Minimum x value.
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
None figure
figure which is used to draw
None plots
Plots added to the axis so far.
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
create empty list for labels
axis
divide figure into subplots
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
None labels
Labels of the plots added so far.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
None xmax
Maximum x value.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
None box_axes
Axes for the boxplots.
box_axes
create empty list for box axes
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)