18import matplotlib.pyplot
as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects
as PathEffects
28from basf2_mva_evaluation
import histogram
38matplotlib.rcParams.update({
'font.size': 36})
41plt.style.use(
"belle2")
46 Base class for all Plotters.
80 Creates a new figure and axis
if None is given, sets the default plot parameters
81 @param figure default draw figure which
is used
82 @param axis default draw axis which
is used
84 b2.B2INFO("Create new figure for class " + str(type(self)))
126 self.
prop_cycler = itertools.cycle(plt.rcParams[
"axes.prop_cycle"])
130 Adds a new subplot to the figure, updates all other axes
131 according to the given gridspec
132 @param gridspecs gridspecs
for all axes including the new one
134 for gs, ax
in zip(gridspecs[:-1], self.
figurefigure.axes):
136 ax.set_subplotspec(gs)
142 Save the figure into a file
143 @param filename of the file
145 b2.B2INFO("Save figure for class " + str(type(self)))
146 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
148 canvas.print_figure(filename, dpi=50)
153 Overrides default plot options for datapoint plot
154 @param plot_kwargs keyword arguments
for the plot function
161 Overrides default errorbar options for datapoint errorbars
162 @param errorbar_kwargs keyword arguments
for the errorbar function
169 Overrides default errorband options for datapoint errorband
170 @param errorbar_kwargs keyword arguments
for the fill_between function
177 Overrides default fill_between options for datapoint errorband
178 @param fill_kwargs keyword arguments
for the fill_between function
185 Plot the given datapoints, with plot, errorbar
and make a errorband
with fill_between
186 @param x coordinates of the data points
187 @param y coordinates of the data points
188 @param xerr symmetric error on x data points
189 @param yerr symmetric error on y data points
197 if plot_kwargs
is None or 'color' not in plot_kwargs:
199 color = color[
'color']
200 plot_kwargs[
'color'] = color
202 color = plot_kwargs[
'color']
203 color = matplotlib.colors.ColorConverter().to_rgb(color)
204 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
205 patch.get_color = patch.get_facecolor
208 if plot_kwargs
is not None:
209 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
212 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
213 if 'color' not in errorbar_kwargs:
214 errorbar_kwargs[
'color'] = color
215 if 'ecolor' not in errorbar_kwargs:
216 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
221 if not isinstance(xerr, (numpy.ndarray, list)):
222 xerr = xerr*numpy.ones(len(x))
223 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
226 x[mask], y[mask], xerr=numpy.where(
227 xerr[mask] < 0, 0.0, xerr[mask]), yerr=numpy.where(
228 yerr[mask] < 0, 0.0, yerr[mask]), rasterized=
True, **errorbar_kwargs)
231 if errorband_kwargs
is not None and yerr
is not None:
232 if 'color' not in errorband_kwargs:
233 errorband_kwargs[
'color'] = color
238 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
239 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
242 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
244 if fill_kwargs
is not None:
246 x = numpy.append(x, x[-1]+2*xerr[-1])
247 y = numpy.append(y, y[-1])
248 xerr = numpy.append(xerr, xerr[-1])
250 axis.fill_between(x-xerr, y, 0, rasterized=
True, **fill_kwargs)
252 return (tuple(patches), p, e, f)
254 def add(self, *args, **kwargs):
256 Add a new plot to this plotter
258 return NotImplemented
262 Finish plotting and set labels, legends
and stuff
264 return NotImplemented
268 Scale limits to increase distance to boundaries
279 Plots the purity and the efficiency over the cut value (
for cut choosing)
286 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
288 Add a new curve to the plot
289 @param data pandas.DataFrame containing all data
290 @param column which
is used to calculate efficiency
and purity
for different cuts
291 @param signal_mask boolean numpy.array defining which events are signal events
292 @param bckgrd_mask boolean numpy.array defining which events are background events
293 @param weight_column column
in data containing the weights
for each event
296 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
299 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
300 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
302 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
303 purity, purity_error = hists.get_false_positives([
'Background'])
305 cuts = hists.bin_centers
309 numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.
ymaxymaxymax])
329 Sets limits, title, axis-labels and legend of the plot
333 self.axisaxis.set_title("Classification Plot")
334 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
341 Plots the signal to noise ratio over the cut value (for cut choosing)
348 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
350 Add a new curve to the plot
351 @param data pandas.DataFrame containing all data
352 @param column which
is used to calculate signal to noise ratio
for different cuts
353 @param signal_mask boolean numpy.array defining which events are signal events
354 @param bckgrd_mask boolean numpy.array defining which events are background events
355 @param weight_column column
in data containing the weights
for each event
358 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
360 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
362 cuts = hists.bin_centers
366 numpy.nanmax([numpy.nanmax(signal2noise), self.
ymaxymaxymax])
376 Sets limits, title, axis-labels and legend of the plot
380 self.axisaxis.set_title("Signal to Noise Plot")
381 self.
axisaxis.get_xaxis().set_label_text(
'Cut Value')
388 Plots the purity over the efficiency also known as ROC curve
395 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
397 Add a new curve to the ROC plot
398 @param data pandas.DataFrame containing all data
399 @param column which
is used to calculate efficiency
and purity
for different cuts
400 @param signal_mask boolean numpy.array defining which events are signal events
401 @param bckgrd_mask boolean numpy.array defining which events are background events
402 @param weight_column column
in data containing the weights
for each event
404 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
405 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
406 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
413 if label
is not None:
421 Sets limits, title, axis-labels and legend of the plot
425 self.axisaxis.set_title("ROC Purity Plot")
426 self.
axisaxis.get_xaxis().set_label_text(
'Efficiency')
427 self.
axisaxis.get_yaxis().set_label_text(
'Purity')
434 Plots the rejection over the efficiency also known as ROC curve
441 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
443 Add a new curve to the ROC plot
444 @param data pandas.DataFrame containing all data
445 @param column which
is used to calculate efficiency
and purity
for different cuts
446 @param signal_mask boolean numpy.array defining which events are signal events
447 @param bckgrd_mask boolean numpy.array defining which events are background events
448 @param weight_column column
in data containing the weights
for each event
450 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
451 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
452 rejection, rejection_error = hists.get_efficiency([
'Background'])
453 rejection = 1 - rejection
454 if isinstance(efficiency, int)
and not isinstance(rejection, int):
455 efficiency = numpy.array([efficiency] * len(rejection))
456 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
457 rejection = numpy.array([rejection] * len(efficiency))
458 elif isinstance(rejection, int)
and isinstance(efficiency, int):
459 efficiency = numpy.array([efficiency])
460 rejection = numpy.array([rejection])
465 auc = numpy.abs(numpy.trapz(rejection, efficiency))
469 if label
is not None:
477 Sets limits, title, axis-labels and legend of the plot
481 self.axisaxis.set_title("ROC Rejection Plot")
482 self.
axisaxis.get_xaxis().set_label_text(
'Signal Efficiency')
483 self.
axisaxis.get_yaxis().set_label_text(
'Background Rejection')
490 Plots multiple other plots into a grid 3x?
497 def __init__(self, cls, number_of_plots, figure=None):
499 Creates a new figure if None is given, sets the default plot parameters
500 @param figure default draw figure which
is used
509 if number_of_plots == 1:
510 gs = matplotlib.gridspec.GridSpec(1, 1)
511 elif number_of_plots == 2:
512 gs = matplotlib.gridspec.GridSpec(1, 2)
513 elif number_of_plots == 3:
514 gs = matplotlib.gridspec.GridSpec(1, 3)
516 gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)
524 def add(self, i, *args, **kwargs):
526 Call add function of ith subplot
527 @param i position of the subplot
533 Sets limits, title, axis-labels and legend of the plot
542 Plots the purity in each bin over the classifier output.
549 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
551 Add a new curve to the Diagonal plot
552 @param data pandas.DataFrame containing all data
553 @param column which
is used to calculate purity
for different cuts
554 @param signal_mask boolean numpy.array defining which events are signal events
555 @param bckgrd_mask boolean numpy.array defining which events are background events
556 @param weight_column column
in data containing the weights
for each event
558 hists = histogram.Histograms(data, column, {'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
559 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
572 Sets limits, title, axis-labels and legend of the plot
575 self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
578 self.
axisaxis.set_title(
"Diagonal Plot")
579 self.
axisaxis.get_xaxis().set_label_text(
'Classifier Output')
580 self.
axisaxis.get_yaxis().set_label_text(
'Purity Per Bin')
587 Plots distribution of a quantity
590 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
591 keep_first_binning=False, range_in_std=None):
593 Creates a new figure and axis
if None is given, sets the default plot parameters
594 @param figure default draw figure which
is used
595 @param axis default draw axis which
is used
596 @param normed true
if histograms should be normed before drawing
597 @param keep_first_binning use the binning of the first distribution
for further plots
598 @param range_in_std show only the data
in a windows around +- range_in_std * standard_deviation around the mean
623 def add(self, data, column, mask=None, weight_column=None, label=None):
625 Add a new distribution to the plots
626 @param data pandas.DataFrame containing all data
627 @param column which
is used to calculate distribution histogram
628 @param mask boolean numpy.array defining which events are used
for the histogram
629 @param weight_column column
in data containing the weights
for each event
632 mask = numpy.ones(len(data)).astype(
'bool')
638 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_std)
641 hist, hist_error = hists.get_hist(
'Total')
644 normalization = float(numpy.sum(hist))
645 hist = hist / normalization
646 hist_error = hist_error / normalization
649 hist = hist / hists.bin_widths
650 hist_error = hist_error / hists.bin_widths
662 appendix =
' No data to plot!'
672 Sets limits, title, axis-labels and legend of the plot
674 self.axisaxis.set_title("Distribution Plot")
682 self.
axisaxis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
691 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
693 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
695 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
697 self.
axisaxis.get_yaxis().set_label_text(
'# Entries per Bin')
711 Creates a new figure and axis
if None is given, sets the default plot parameters
712 @param figure default draw figure which
is used
713 @param axis default draw axis which
is used
715 super().__init__(figure=figure, axis=axis)
720 def add(self, data, column, mask=None, weight_column=None):
722 Add a new boxplot to the plots
723 @param data pandas.DataFrame containing all data
724 @param column which
is used to calculate boxplot quantities
725 @param mask boolean numpy.array defining which events are used
for the histogram
726 @param weight_column column
in data containing the weights
for each event
729 mask = numpy.ones(len(data)).astype(
'bool')
730 x = data[column][mask]
731 if weight_column
is not None:
733 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
736 b2.B2WARNING(
"Ignore empty boxplot.")
740 p = self.
axisaxis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
741 boxprops=dict(facecolor=
'blue', alpha=0.5), showfliers=
False,
749 self.axisaxis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' +
'\n' +
r'$median = {:.2f}$').format(x.mean(), x.median()),
750 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
751 self.
axisaxis.text(0.4, 0.9, (
r'$ \sigma = {:.2f}$' +
'\n' +
r'$IQD = {:.2f}$').format(x.std(),
752 x.quantile(0.75) - x.quantile(0.25)),
753 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
754 self.
axisaxis.text(0.7, 0.9, (
r'$min = {:.2f}$' +
'\n' +
r'$max = {:.2f}$').format(x.min(), x.max()),
755 fontsize=28, verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axisaxis.transAxes)
762 Sets limits, title, axis-labels and legend of the plot
764 matplotlib.artist.setp(self.axisaxis.get_yaxis(), visible=False)
772 Plots the difference between two histograms
787 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
789 Creates a new figure and axis
if None is given, sets the default plot parameters
790 @param figure default draw figure which
is used
791 @param axis default draw axis which
is used
792 @param normed normalize minuend
and subtrahend before comparing them
793 @param shift_to_zero mean difference
is shifted to zero, to remove constant offset due to e.g. different sample sizes
805 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
807 Add a new difference plot
808 @param data pandas.DataFrame containing all data
809 @param column which
is used to calculate distribution histogram
810 @param minuend_mask boolean numpy.array defining which events are
for the minuend histogram
811 @param subtrahend_mask boolean numpy.array defining which events are
for the subtrahend histogram
812 @param weight_column column
in data containing the weights
for each event
813 @param label label
for the legend
if None, the column name
is used
815 hists = histogram.Histograms(data, column, {'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
816 weight_column=weight_column, equal_frequency=
False)
817 minuend, minuend_error = hists.get_hist(
'Minuend')
818 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
822 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
823 minuend = minuend / numpy.sum(minuend)
824 subtrahend = subtrahend / numpy.sum(subtrahend)
825 difference = minuend - subtrahend
828 difference = difference - numpy.mean(difference)
834 p = self.
_plot_datapoints(self.
axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
845 Sets limits, title, axis-labels and legend of the plot
851 self.
axisaxis.set_title(
"Difference Plot")
852 self.
axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
854 self.
axisaxis.get_yaxis().set_label_text(
'Difference')
861 Create TMVA-like overtraining control plot for a classification training
875 Creates a new figure if None is given, sets the default plot parameters
876 @param figure default draw figure which
is used
885 gs = matplotlib.gridspec.GridSpec(5, 1)
895 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
897 Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
898 otherwise there are too many curves in the plot to recognize anything
in the plot.
899 @param data pandas.DataFrame containing all data
900 @param column which
is used to calculate distribution histogram
901 @param train_mask boolean numpy.array defining which events are training events
902 @param test_mask boolean numpy.array defining which events are test events
903 @param signal_mask boolean numpy.array defining which events are signal events
904 @param bckgrd_mask boolean numpy.array defining which events are background events
905 @param weight_column column
in data containing the weights
for each event
912 distribution.add(data, column, test_mask & signal_mask, weight_column)
913 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
915 distribution.set_plot_options(
916 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
917 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
918 distribution.set_errorbar_options(
None)
919 distribution.set_errorband_options(
None)
920 distribution.add(data, column, train_mask & signal_mask, weight_column)
921 distribution.set_plot_options(
922 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
923 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
924 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
926 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
927 distribution.finish()
929 self.
plot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
931 difference_signal.set_plot_options(self.
plot_kwargs)
934 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
935 self.
axis_d1axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
936 self.
axis_d1axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
937 difference_signal.plots = difference_signal.labels = []
938 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
940 self.
plot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
942 difference_bckgrd.set_plot_options(self.
plot_kwargs)
945 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
946 self.
axis_d2axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
947 self.
axis_d2axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
948 difference_bckgrd.plots = difference_bckgrd.labels = []
949 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
954 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
955 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
957 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
958 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
959 self.
axis_d1axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36, bbox=props,
960 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1axis_d1.transAxes)
961 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
962 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
964 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
965 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
966 self.
axis_d2axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]), fontsize=36,
968 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2axis_d2.transAxes)
970 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
976 Sets limits, title, axis-labels and legend of the plot
981 matplotlib.artist.setp(self.
axisaxisaxisaxis.get_xticklabels(), visible=
False)
982 matplotlib.artist.setp(self.
axis_d1axis_d1.get_xticklabels(), visible=
False)
985 self.
axis_d2axis_d2.get_xaxis().set_label_text(
'Classifier Output')
991 Plots distribution of a quantity including boxplots
997 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None):
999 Creates a new figure and axis
if None is given, sets the default plot parameters
1000 @param figure default draw figure which
is used
1001 @param axis default draw axis which
is used
1002 @param normed true
if the histograms should be normed before drawing
1003 @param range_in_std show only the data
in a windows around +- range_in_std * standard_deviation around the mean
1015 def add(self, data, column, mask=None, weight_column=None, label=None):
1017 Add a new distribution plot, with additional information like a boxplot compared to
1018 the ordinary Distribution plot.
1019 @param data pandas.DataFrame containing all data
1020 @param column which
is used to calculate distribution histogram
1021 @param mask boolean numpy.array defining which events are used
for the distribution histogram
1022 @param weight_column column
in data containing the weights
for each event
1030 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1031 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i
in range(n)]
1037 mask = mask & (data[column] > (mean - self.
range_in_std * std)) & (data[column] < (mean + self.
range_in_std * std))
1039 box.add(data, column, mask, weight_column)
1040 if len(box.plots) > 0:
1041 box.plots[0][
'boxes'][0].set_facecolor(self.
distribution.plots[-1][0][0].get_color())
1049 Sets limits, title, axis-labels and legend of the plot
1052 matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
1053 self.
axisaxis.get_xaxis().set_label_text(
'')
1055 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1056 box_axis.set_title(
"")
1057 box_axis.get_xaxis().set_label_text(
'')
1059 self.
axisaxis.set_title(
"Distribution Plot")
1061 loc=
'best', fancybox=
True, framealpha=0.5)
1067 Plots change of a distribution of a quantity depending on the cut on a classifier
1080 Creates a new figure if None is given, sets the default plot parameters
1081 @param figure default draw figure which
is used
1090 gs = matplotlib.gridspec.GridSpec(3, 2)
1100 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1102 Add a new correlation plot.
1103 @param data pandas.DataFrame containing all data
1104 @param column which
is used to calculate distribution histogram
1105 @param cut_column which
is used to calculate cut on the other quantity defined by column
1106 @param quantiles list of quantiles between 0
and 100, defining the different cuts
1107 @param weight_column column
in data containing the weights
for each event
1109 if len(data[cut_column]) == 0:
1110 b2.B2WARNING(
"Ignore empty Correlation.")
1115 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1117 if weight_column
is not None:
1118 weights = numpy.array(data[weight_column][m])
1120 weights = numpy.ones(len(data[column][m]))
1122 xrange = numpy.percentile(data[column][m], [5, 95])
1124 colormap = plt.get_cmap(
'coolwarm')
1125 tmp, x = numpy.histogram(data[column][m], bins=100,
1126 range=xrange, density=
True, weights=weights)
1127 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1128 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1130 for quantil
in numpy.arange(5, 100, 5):
1131 cut = numpy.percentile(data[cut_column][m], quantil)
1132 sel = data[cut_column][m] >= cut
1133 y, x = numpy.histogram(data[column][m][sel], bins=100,
1134 range=xrange, density=
True, weights=weights[sel])
1135 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1136 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1139 axes[i].set_ylim(bottom=0)
1142 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1147 Sets limits, title, axis-labels and legend of the plot
1154 Plots multivariate distribution using TSNE algorithm
1157 def add(self, data, columns, *masks):
1159 Add a new correlation plot.
1160 @param data pandas.DataFrame containing all data
1161 @param columns which are used to calculate the correlations
1162 @param masks different classes to show
in TSNE
1166 import sklearn.manifold
1167 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1168 data = numpy.array([data[column]
for column
in columns]).T
1171 data = numpy.array([data[column][mask]
for column
in columns]).T
1172 data = model.transform(data)
1173 self.
axisaxis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1175 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1180 Sets limits, title, axis-labels and legend of the plot
1187 Plots importance matrix
1190 def add(self, data, columns, variables):
1192 Add a new correlation plot.
1193 @param data pandas.DataFrame containing all data
1194 @param columns which are used to calculate the correlations
1199 width = (numpy.max(x) - numpy.min(x))
1201 return numpy.zeros(x.shape)
1202 return (x - numpy.min(x)) / width * 100
1204 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1205 importance_heatmap = self.
axisaxis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
1209 self.
axisaxis.set_yticks(numpy.arange(importance_matrix.shape[0]) + 0.5, minor=
False)
1210 self.
axisaxis.set_xticks(numpy.arange(importance_matrix.shape[1]) + 0.5, minor=
False)
1212 self.
axisaxis.set_xticklabels(columns, minor=
False, rotation=90)
1213 self.
axisaxis.set_yticklabels(variables, minor=
False)
1217 for y
in range(importance_matrix.shape[0]):
1218 for x
in range(importance_matrix.shape[1]):
1219 txt = self.
axisaxis.text(x + 0.5, y + 0.5, f
'{importance_matrix[y, x]:.0f}',
1221 horizontalalignment=
'center',
1222 verticalalignment=
'center',
1224 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1226 cb = self.
figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation=
'vertical')
1227 cb.ax.set_yticklabels([
'low',
'high'])
1230 self.
axisaxis.set_ylim(0, importance_matrix.shape[0])
1238 Sets limits, title, axis-labels and legend of the plot
1245 Plots correlation matrix
1256 Creates a new figure if None is given, sets the default plot parameters
1257 @param figure default draw figure which
is used
1266 gs = matplotlib.gridspec.GridSpec(8, 2)
1278 def add(self, data, columns, signal_mask, bckgrd_mask):
1280 Add a new correlation plot.
1281 @param data pandas.DataFrame containing all data
1282 @param columns which are used to calculate the correlations
1284 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask] for column
in columns])) * 100
1285 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1287 signal_heatmap = self.
signal_axissignal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
1309 for y
in range(signal_corr.shape[0]):
1310 for x
in range(signal_corr.shape[1]):
1313 horizontalalignment=
'center',
1314 verticalalignment=
'center',
1316 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1318 for y
in range(bckgrd_corr.shape[0]):
1319 for x
in range(bckgrd_corr.shape[1]):
1322 horizontalalignment=
'center',
1323 verticalalignment=
'center',
1325 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1328 cb.solids.set_rasterized(
True)
1329 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1343 Sets limits, title, axis-labels and legend of the plot
1349if __name__ ==
'__main__':
1351 def get_data(N, columns):
1353 Creates fake data for example plots
1356 n = len(columns) - 1
1357 xs = numpy.random.normal(0, size=(N, n))
1358 xb = numpy.random.normal(1, size=(N, n))
1361 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1362 return data.reindex(numpy.random.permutation(data.index))
1366 seaborn.set(font_scale=3)
1367 seaborn.set_style(
'whitegrid')
1371 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1373 data.type.iloc[:N / 2] =
'Train'
1374 data.type.iloc[N / 2:] =
'Test'
1377 p.add(data,
'FastBDT')
1379 p.save(
'box_plot.png')
1382 p.add(data,
'FastBDT')
1383 p.add(data,
'NeuroBayes')
1385 p.save(
'verbose_distribution_plot.png')
1388 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1389 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1391 p.save(
'roc_purity_plot.png')
1394 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1395 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1397 p.save(
'roc_rejection_plot.png')
1400 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1401 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1403 p.save(
'diagonal_plot.png')
1406 p.add(data,
'FastBDT')
1407 p.add(data,
'NeuroBayes')
1409 p.save(
'distribution_plot.png')
1412 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1413 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1415 p.save(
'difference_plot.png')
1418 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1420 p.save(
'overtraining_plot.png')
1423 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1425 p.save(
'correlation_plot.png')
1428 data[
'FastBDT2'] = data[
'FastBDT']**2
1429 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1430 data[
'FastBDT3'] = data[
'FastBDT']**3
1431 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1432 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1434 p.save(
'correlation_matrix.png')
def calculate_flatness(f, p, w=None)
x_axis_label
Label on x axis.
def add(self, data, column, mask=None, weight_column=None)
def __init__(self, figure=None, axis=None)
signal_axis
add signal subplot
def add(self, data, columns, signal_mask, bckgrd_mask)
colorbar_axis
Colorbar axis contains the colorbar.
None bckgrd_axis
Axis which shows the correlation of the background samples.
def __init__(self, figure=None)
None figure
figure which is used to draw
None signal_axis
Main axis which shows the correlation of the signal samples.
bckgrd_axis
add background subplot
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
axis_d1
define second subplot
None axis_d1
Axis which shows shape of signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows shape of background.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
def finish(self, line_color='black')
normed
Minuend and subtrahend are normed before comparing them if this is true.
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
def add(self, data, column, mask=None, weight_column=None, label=None)
keep_first_binning
Keep the first binning if the user requests it.
normed_to_all_entries
Normalize histograms to the total number of entries before drawing them.
first_binning
first binning
range_in_std
Show only a certain range in terms of standard deviations of the data.
normed_to_bin_width
Normalize histograms to the bin width before drawing them.
def add(self, data, columns, variables)
def add(self, i, *args, **kwargs)
def __init__(self, cls, number_of_plots, figure=None)
None figure
figure which is used to draw
sub_plots
the subplots which are displayed in the grid
axis
the axis of the first subplot
axis_d1
define second subplot
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
None axis_d1
Axis which shows the difference between training and test signal.
None axis
Main axis which is used to draw.
def __init__(self, figure=None)
axis_d2
define third subplot
None figure
figure which is used to draw
None axis_d2
Axis which shows the difference between training and test background.
def finish(self, *args, **kwargs)
fill_kwargs
Default keyword arguments for fill_between function.
None ymin
Minimum y value.
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
plots
create empty list for plots
None ymax
Maximum y value.
errorband_kwargs
Default keyword arguments for errorband function.
None axis
Main axis which is used to draw.
def add(self, *args, **kwargs)
None xmin
Minimum x value.
def set_fill_options(self, fill_kwargs=None)
def __init__(self, figure=None, axis=None)
None figure
figure which is used to draw
None plots
Plots added to the axis so far.
prop_cycler
Property cycler used to give plots unique colors.
errorbar_kwargs
Default keyword arguments for errorbar function.
labels
create empty list for labels
axis
divide figure into subplots
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
None labels
Labels of the plots added so far.
def add_subplot(self, gridspecs)
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
plot_kwargs
Default keyword arguments for plot function.
None xmax
Maximum x value.
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
def add(self, data, columns, *masks)
def add(self, data, column, mask=None, weight_column=None, label=None)
distribution
The distribution plot.
range_in_std
Show only a certain range in terms of standard deviations of the data.
None box_axes
Axes for the boxplots.
box_axes
create empty list for box axes
normed
Normalize histograms before drawing them.
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None)
def weighted_mean_and_std(x, w)