18import matplotlib.pyplot
as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects
as PathEffects
28from basf2_mva_evaluation
import histogram
38matplotlib.rcParams.update({
'font.size': 36})
41plt.style.use(
"belle2")
46 Base class for all Plotters.
80 Creates a new figure and axis
if None is given, sets the default plot parameters
81 @param figure default draw figure which
is used
82 @param axis default draw axis which
is used
84 b2.B2INFO("Create new figure for class " + str(type(self)))
126 self.
prop_cycler = itertools.cycle(plt.rcParams[
"axes.prop_cycle"])
130 Adds a new subplot to the figure, updates all other axes
131 according to the given gridspec
132 @param gridspecs gridspecs
for all axes including the new one
134 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
136 ax.set_subplotspec(gs)
142 Save the figure into a file
143 @param filename of the file
145 b2.B2INFO("Save figure for class " + str(type(self)))
146 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
148 canvas.print_figure(filename, dpi=50)
153 Overrides default plot options for datapoint plot
154 @param plot_kwargs keyword arguments
for the plot function
161 Overrides default errorbar options for datapoint errorbars
162 @param errorbar_kwargs keyword arguments
for the errorbar function
169 Overrides default errorband options for datapoint errorband
170 @param errorband_kwargs keyword arguments for the fill_between function
177 Overrides default fill_between options for datapoint errorband
178 @param fill_kwargs keyword arguments
for the fill_between function
185 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
186 @param x coordinates of the data points
187 @param y coordinates of the data points
188 @param xerr symmetric error on x data points
189 @param yerr symmetric error on y data points
197 if plot_kwargs
is None or 'color' not in plot_kwargs:
199 color = color[
'color']
200 plot_kwargs[
'color'] = color
202 color = plot_kwargs[
'color']
203 color = matplotlib.colors.ColorConverter().to_rgb(color)
204 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
205 patch.get_color = patch.get_facecolor
208 if plot_kwargs
is not None:
209 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
212 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
213 if 'color' not in errorbar_kwargs:
214 errorbar_kwargs[
'color'] = color
215 if 'ecolor' not in errorbar_kwargs:
216 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
221 if not isinstance(xerr, (numpy.ndarray, list)):
222 xerr = xerr*numpy.ones(len(x))
223 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
226 x[mask], y[mask], xerr=numpy.where(
227 xerr[mask] < 0, 0.0, xerr[mask]), yerr=numpy.where(
228 yerr[mask] < 0, 0.0, yerr[mask]), rasterized=
True, **errorbar_kwargs)
231 if errorband_kwargs
is not None and yerr
is not None:
232 if 'color' not in errorband_kwargs:
233 errorband_kwargs[
'color'] = color
238 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
239 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
242 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
244 if fill_kwargs
is not None:
246 x = numpy.append(x, x[-1]+2*xerr[-1])
247 y = numpy.append(y, y[-1])
248 xerr = numpy.append(xerr, xerr[-1])
250 axis.fill_between(x-xerr, y, 0, rasterized=
True, **fill_kwargs)
252 return (tuple(patches), p, e, f)
254 def add(self, *args, **kwargs):
256 Add a new plot to this plotter
258 return NotImplemented
262 Finish plotting and set labels, legends and other decorations
264 return NotImplemented
268 Scale limits to increase distance to boundaries
279 Plots the purity and the efficiency over the cut value (
for cut choosing)
286 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
288 Add a new curve to the plot
289 @param data pandas.DataFrame containing all data
290 @param column which
is used to calculate efficiency
and purity
for different cuts
291 @param signal_mask boolean numpy.array defining which events are signal events
292 @param bckgrd_mask boolean numpy.array defining which events are background events
293 @param weight_column column
in data containing the weights
for each event
296 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
299 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
300 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
302 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
303 purity, purity_error = hists.get_false_positives([
'Background'])
305 cuts = hists.bin_centers
309 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
329 Sets limits, title, axis-labels and legend of the plot
333 self.axisaxis.set_title("Classification Plot")
334 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
341 Plots the signal to noise ratio over the cut value (for cut choosing)
348 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
350 Add a new curve to the plot
351 @param data pandas.DataFrame containing all data
352 @param column which
is used to calculate signal to noise ratio
for different cuts
353 @param signal_mask boolean numpy.array defining which events are signal events
354 @param bckgrd_mask boolean numpy.array defining which events are background events
355 @param weight_column column
in data containing the weights
for each event
358 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
360 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
362 cuts = hists.bin_centers
366 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymaxymaxymax])
376 Sets limits, title, axis-labels and legend of the plot
380 self.axisaxis.set_title("Signal to Noise Plot")
381 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
388 Plots the purity over the efficiency also known as ROC curve
395 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
397 Add a new curve to the ROC plot
398 @param data pandas.DataFrame containing all data
399 @param column which
is used to calculate efficiency
and purity
for different cuts
400 @param signal_mask boolean numpy.array defining which events are signal events
401 @param bckgrd_mask boolean numpy.array defining which events are background events
402 @param weight_column column
in data containing the weights
for each event
404 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
405 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
406 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
413 if label
is not None:
421 Sets limits, title, axis-labels and legend of the plot
425 self.axisaxis.set_title("ROC Purity Plot")
426 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
427 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
434 Plots the rejection over the efficiency also known as ROC curve
441 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
443 Add a new curve to the ROC plot
444 @param data pandas.DataFrame containing all data
445 @param column which is used to calculate efficiency and rejection for different cuts
446 @param signal_mask boolean numpy.array defining which events are signal events
447 @param bckgrd_mask boolean numpy.array defining which events are background events
448 @param weight_column column
in data containing the weights
for each event
450 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
451 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
452 rejection, rejection_error = hists.get_efficiency([
'Background'])
453 rejection = 1 - rejection
454 if isinstance(efficiency, int)
and not isinstance(rejection, int):
455 efficiency = numpy.array([efficiency] * len(rejection))
456 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
457 rejection = numpy.array([rejection] * len(efficiency))
458 elif isinstance(rejection, int)
and isinstance(efficiency, int):
459 efficiency = numpy.array([efficiency])
460 rejection = numpy.array([rejection])
465 auc = numpy.abs(numpy.trapz(rejection, efficiency))
469 if label
is not None:
477 Sets limits, title, axis-labels and legend of the plot
481 self.axisaxis.set_title("ROC Rejection Plot")
482 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
483 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
490 Plots multiple other plots into a grid with up to 3 columns and as many rows as needed
497 def __init__(self, cls, number_of_plots, figure=None):
499 Creates a new figure if None is given, sets the default plot parameters
500 @param figure default draw figure which
is used
509 if number_of_plots == 1:
510 gs = matplotlib.gridspec.GridSpec(1, 1)
511 elif number_of_plots == 2:
512 gs = matplotlib.gridspec.GridSpec(1, 2)
513 elif number_of_plots == 3:
514 gs = matplotlib.gridspec.GridSpec(1, 3)
516 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
524 def add(self, i, *args, **kwargs):
526 Call add function of ith subplot
527 @param i position of the subplot
533 Sets limits, title, axis-labels and legend of the plot
542 Plots the purity in each bin over the classifier output.
549 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
551 Add a new curve to the Diagonal plot
552 @param data pandas.DataFrame containing all data
553 @param column which
is used to calculate purity
for different cuts
554 @param signal_mask boolean numpy.array defining which events are signal events
555 @param bckgrd_mask boolean numpy.array defining which events are background events
556 @param weight_column column
in data containing the weights
for each event
558 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
559 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
572 Sets limits, title, axis-labels and legend of the plot
575 self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
578 self.
axisaxis.set_title(
"Diagonal Plot")
579 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
580 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
587 Plots distribution of a quantity
590 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
591 keep_first_binning=False, range_in_std=None):
593 Creates a new figure and axis
if None is given, sets the default plot parameters
594 @param figure default draw figure which
is used
595 @param axis default draw axis which
is used
596 @param normed_to_all_entries,normed_to_bin_width true if histograms should be normalized (to the total number of entries / to the bin width) before drawing
597 @param keep_first_binning use the binning of the first distribution
for further plots
598 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
623 def add(self, data, column, mask=None, weight_column=None, label=None):
625 Add a new distribution to the plots
626 @param data pandas.DataFrame containing all data
627 @param column which
is used to calculate distribution histogram
628 @param mask boolean numpy.array defining which events are used
for the histogram
629 @param weight_column column
in data containing the weights
for each event
632 mask = numpy.ones(len(data)).astype(
'bool')
638 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_std)
641 hist, hist_error = hists.get_hist(
'Total')
644 normalization = float(numpy.sum(hist))
645 hist = hist / normalization
646 hist_error = hist_error / normalization
649 hist = hist / hists.bin_widths
650 hist_error = hist_error / hists.bin_widths
662 appendix =
' No data to plot!'
672 Sets limits, title, axis-labels and legend of the plot
674 self.axisaxis.set_title("Distribution Plot")
682 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
691 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
693 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
695 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
697 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
711 Creates a new figure and axis
if None is given, sets the default plot parameters
712 @param figure default draw figure which
is used
713 @param axis default draw axis which
is used
715 super().__init__(figure=figure, axis=axis)
720 def add(self, data, column, mask=None, weight_column=None):
722 Add a new boxplot to the plots
723 @param data pandas.DataFrame containing all data
724 @param column which
is used to calculate boxplot quantities
725 @param mask boolean numpy.array defining which events are used
for the histogram
726 @param weight_column column
in data containing the weights
for each event
729 mask = numpy.ones(len(data)).astype(
'bool')
730 x = data[column][mask]
731 if weight_column
is not None:
733 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
736 b2.B2WARNING(
"Ignore empty boxplot.")
739 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
740 boxprops=dict(facecolor=
'blue', alpha=0.5),
748 self.axisaxis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' +
'\n' +
r'$median = {:.2f}$').format(x.mean(), x.median()),
749 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
750 self.
axisaxis.text(0.4, 0.9, (
r'$ \sigma = {:.2f}$' +
'\n' +
r'$IQD = {:.2f}$').format(x.std(),
751 x.quantile(0.75) - x.quantile(0.25)),
752 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
753 self.
axisaxis.text(0.7, 0.9, (
r'$min = {:.2f}$' +
'\n' +
r'$max = {:.2f}$').format(x.min(), x.max()),
754 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
761 Sets limits, title, axis-labels and legend of the plot
763 matplotlib.artist.setp(self.axisaxis.get_yaxis(), visible=False)
771 Plots the difference between two histograms
786 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
788 Creates a new figure and axis
if None is given, sets the default plot parameters
789 @param figure default draw figure which
is used
790 @param axis default draw axis which
is used
791 @param normed normalize minuend
and subtrahend before comparing them
792 @param shift_to_zero mean difference
is shifted to zero, to remove constant offset due to e.g. different sample sizes
804 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
806 Add a new difference plot
807 @param data pandas.DataFrame containing all data
808 @param column which
is used to calculate distribution histogram
809 @param minuend_mask boolean numpy.array defining which events are
for the minuend histogram
810 @param subtrahend_mask boolean numpy.array defining which events are
for the subtrahend histogram
811 @param weight_column column
in data containing the weights
for each event
812 @param label label
for the legend
if None, the column name
is used
814 hists = histogram.Histograms(data, column, {'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
815 weight_column=weight_column, equal_frequency=
False)
816 minuend, minuend_error = hists.get_hist(
'Minuend')
817 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
821 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
822 minuend = minuend / numpy.sum(minuend)
823 subtrahend = subtrahend / numpy.sum(subtrahend)
824 difference = minuend - subtrahend
827 difference = difference - numpy.mean(difference)
833 p = self.
_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
844 Sets limits, title, axis-labels and legend of the plot
850 self.
axisaxis.set_title(
"Difference Plot")
851 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
853 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
860 Create TMVA-like overtraining control plot for a classification training
874 Creates a new figure if None is given, sets the default plot parameters
875 @param figure default draw figure which
is used
884 gs = matplotlib.gridspec.GridSpec(5, 1)
894 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
896 Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
897 otherwise there are too many curves in the plot to recognize anything.
898 @param data pandas.DataFrame containing all data
899 @param column which
is used to calculate distribution histogram
900 @param train_mask boolean numpy.array defining which events are training events
901 @param test_mask boolean numpy.array defining which events are test events
902 @param signal_mask boolean numpy.array defining which events are signal events
903 @param bckgrd_mask boolean numpy.array defining which events are background events
904 @param weight_column column
in data containing the weights
for each event
911 distribution.add(data, column, test_mask & signal_mask, weight_column)
912 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
914 distribution.set_plot_options(
915 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
916 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
917 distribution.set_errorbar_options(
None)
918 distribution.set_errorband_options(
None)
919 distribution.add(data, column, train_mask & signal_mask, weight_column)
920 distribution.set_plot_options(
921 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
922 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
923 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
925 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
926 distribution.finish()
928 self.
plot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
930 difference_signal.set_plot_options(self.
plot_kwargs)
933 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
934 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
935 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
936 difference_signal.plots = difference_signal.labels = []
937 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
939 self.
plot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
941 difference_bckgrd.set_plot_options(self.
plot_kwargs)
944 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
945 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
946 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
947 difference_bckgrd.plots = difference_bckgrd.labels = []
948 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
953 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
954 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
956 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
957 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
958 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
959 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
960 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
961 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
963 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
964 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
965 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
967 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
969 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
975 Sets limits, title, axis-labels and legend of the plot
980 matplotlib.artist.setp(self.
axisaxisaxisaxis.get_xticklabels(), visible=
False)
981 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
984 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
990 Plots distribution of a quantity including boxplots
996 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
998 Creates a new figure and axis
if None is given, sets the default plot parameters
999 @param figure default draw figure which
is used
1000 @param axis default draw axis which
is used
1001 @param normed true
if the histograms should be normed before drawing
1002 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
1014 def add(self, data, column, mask=None, weight_column=None, label=None):
1016 Add a new distribution plot, with additional information like a boxplot compared to
1017 the ordinary Distribution plot.
1018 @param data pandas.DataFrame containing all data
1019 @param column which
is used to calculate distribution histogram
1020 @param mask boolean numpy.array defining which events are used
for the distribution histogram
1021 @param weight_column column
in data containing the weights
for each event
1029 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1030 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i
in range(n)]
1036 mask = mask & (data[column] > (mean - self.
range_in_std * std)) & (data[column] < (mean + self.
range_in_std * std))
1038 box.add(data, column, mask, weight_column)
1039 if len(box.plots) > 0:
1040 box.plots[0][
'boxes'][0].set_facecolor(self.
distribution.plots[-1][0][0].get_color())
1048 Sets limits, title, axis-labels and legend of the plot
1051 matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
1052 self.
axisaxis.get_xaxis().set_label_text(
'')
1054 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1055 box_axis.set_title(
"")
1056 box_axis.get_xaxis().set_label_text(
'')
1058 self.
axisaxis.set_title(
"Distribution Plot")
1060 loc=
'best', fancybox=
True, framealpha=0.5)
1066 Plots change of a distribution of a quantity depending on the cut on a classifier
1079 Creates a new figure if None is given, sets the default plot parameters
1080 @param figure default draw figure which
is used
1089 gs = matplotlib.gridspec.GridSpec(3, 2)
1099 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1101 Add a new correlation plot.
1102 @param data pandas.DataFrame containing all data
1103 @param column which
is used to calculate distribution histogram
1104 @param cut_column which
is used to calculate cut on the other quantity defined by column
1105 @param quantiles list of quantiles between 0
and 100, defining the different cuts
1106 @param weight_column column
in data containing the weights
for each event
1108 if len(data[cut_column]) == 0:
1109 b2.B2WARNING(
"Ignore empty Correlation.")
1114 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1116 if weight_column
is not None:
1117 weights = numpy.array(data[weight_column][m])
1119 weights = numpy.ones(len(data[column][m]))
1121 xrange = numpy.percentile(data[column][m], [5, 95])
1123 colormap = plt.get_cmap(
'coolwarm')
1124 tmp, x = numpy.histogram(data[column][m], bins=100,
1125 range=xrange, density=
True, weights=weights)
1126 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1127 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1129 for quantil
in numpy.arange(5, 100, 5):
1130 cut = numpy.percentile(data[cut_column][m], quantil)
1131 sel = data[cut_column][m] >= cut
1132 y, x = numpy.histogram(data[column][m][sel], bins=100,
1133 range=xrange, density=
True, weights=weights[sel])
1134 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1135 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1138 axes[i].set_ylim(bottom=0)
1141 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1146 Sets limits, title, axis-labels and legend of the plot
1153 Plots multivariate distribution using TSNE algorithm
1156 def add(self, data, columns, *masks):
1158 Add a new TSNE plot.
1159 @param data pandas.DataFrame containing all data
1160 @param columns which are used as input dimensions for the TSNE embedding
1161 @param masks different classes to show in TSNE
1165 import sklearn.manifold
1166 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1167 data = numpy.array([data[column]
for column
in columns]).T
1170 data = numpy.array([data[column][mask]
for column
in columns]).T
1171 data = model.transform(data)
1172 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1174 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1179 Sets limits, title, axis-labels and legend of the plot
1186 Plots importance matrix
1189 def add(self, data, columns, variables):
1191 Add a new importance matrix plot.
1192 @param data pandas.DataFrame containing all data
1193 @param columns which are normalized and shown as the columns of the importance matrix
1198 width = (numpy.max(x) - numpy.min(x))
1200 return numpy.zeros(x.shape)
1201 return (x - numpy.min(x)) / width * 100
1203 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1204 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1208 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1209 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1211 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1212 self.
axisaxis.set_yticklabels(variables, minor=
False)
1216 for y
in range(importance_matrix.shape[0]):
1217 for x
in range(importance_matrix.shape[1]):
1218 txt = self.
axisaxis.text(x + 0.5, y + 0.5, f
'{importance_matrix[y, x]:.0f}',
1220 horizontalalignment=
'center',
1221 verticalalignment=
'center',
1223 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1225 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1226 cb.ax.set_yticklabels([
'low',
'high'])
1229 self.
axisaxis.set_ylim(0, importance_matrix.shape[0])
1237 Sets limits, title, axis-labels and legend of the plot
1244 Plots correlation matrix
1255 Creates a new figure if None is given, sets the default plot parameters
1256 @param figure default draw figure which
is used
1265 gs = matplotlib.gridspec.GridSpec(8, 2)
1277 def add(self, data, columns, signal_mask, bckgrd_mask):
1279 Add a new correlation plot.
1280 @param data pandas.DataFrame containing all data
1281 @param columns which are used to calculate the correlations
1283 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column
in columns])) * 100
1284 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1286 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1308 for y
in range(signal_corr.shape[0]):
1309 for x
in range(signal_corr.shape[1]):
1312 horizontalalignment=
'center',
1313 verticalalignment=
'center',
1315 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1317 for y
in range(bckgrd_corr.shape[0]):
1318 for x
in range(bckgrd_corr.shape[1]):
1321 horizontalalignment=
'center',
1322 verticalalignment=
'center',
1324 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1327 cb.solids.set_rasterized(
True)
1328 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1342 Sets limits, title, axis-labels and legend of the plot
1348if __name__ ==
'__main__':
1350 def get_data(N, columns):
1352 Creates fake data for example plots
1355 n = len(columns) - 1
1356 xs = numpy.random.normal(0, size=(N, n))
1357 xb = numpy.random.normal(1, size=(N, n))
1360 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1361 return data.reindex(numpy.random.permutation(data.index))
1365 seaborn.set(font_scale=3)
1366 seaborn.set_style(
'whitegrid')
1370 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1372 data.type.iloc[:N / 2] =
'Train'
1373 data.type.iloc[N / 2:] =
'Test'
1376 p.add(data,
'FastBDT')
1378 p.save(
'box_plot.png')
1381 p.add(data,
'FastBDT')
1382 p.add(data,
'NeuroBayes')
1384 p.save(
'verbose_distribution_plot.png')
1387 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1388 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1390 p.save(
'roc_purity_plot.png')
1393 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1394 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1396 p.save(
'roc_rejection_plot.png')
1399 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1400 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1402 p.save(
'diagonal_plot.png')
1405 p.add(data,
'FastBDT')
1406 p.add(data,
'NeuroBayes')
1408 p.save(
'distribution_plot.png')
1411 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1412 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1414 p.save(
'difference_plot.png')
1417 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1419 p.save(
'overtraining_plot.png')
1422 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1424 p.save(
'correlation_plot.png')
1427 data[
'FastBDT2'] = data[
'FastBDT']**2
1428 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1429 data[
'FastBDT3'] = data[
'FastBDT']**3
1430 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1431 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1433 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
add signal subplot
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
None bckgrd_axis
Axis which shows the correlation of the background samples.
def __init__(self, figure=None)
None figure
figure which is used to draw
None signal_axis
Main axis which shows the correlation of the signal samples.
bckgrd_axis
add background subplot
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
define second subplot
None axis_d1
Axis which shows shape of signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows shape of background.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Use the binning of the first distribution for further plots if this is true.
normed_to_all_entries
Normalize histograms to the total number of entries before drawing them.
first_binning
Binning of the first distribution, kept for further plots if keep_first_binning is set.
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms to the bin width before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
def __init__(self, cls, number_of_plots, figure=None)
None figure
figure which is used to draw
sub_plots
the subplots which are displayed in the grid
axis
the axis of the first subplot
axis_d1
define second subplot
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
None axis_d1
Axis which shows the difference between training and test signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows the difference between training and test background.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
None ymin
Minimum y value.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
create empty list for plots
None ymax
Maximum y value.
errorband_kwargs
Default keyword arguments for errorband function.
None axis
Main axis which is used to draw.
def add(self, *args, **kwargs)
None xmin
Minimum x value.
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
None figure
figure which is used to draw
None plots
Plots added to the axis so far.
prop_cycler
Property cycler used to give plots unique colors.
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
create empty list for labels
axis
divide figure into subplots
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
None labels
Labels of the plots added so far.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
None xmax
Maximum x value.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
None box_axes
Axes for the boxplots.
box_axes
create empty list for box axes
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)