development/doxygen/mva_2scripts_2basf2__mva__evaluation_2plotting_8py_source.html

#!/usr/bin/env python3


import copy

import math


import pandas

import numpy

import itertools

import matplotlib.pyplot as plt

import matplotlib.artist

import matplotlib.figure

import matplotlib.gridspec

import matplotlib.colors

import matplotlib.patches

import matplotlib.ticker

import matplotlib.patheffects as PathEffects


from basf2_mva_evaluation import histogram


import basf2 as b2


import basf2_mva_util

import matplotlib


# Do not use the standard TkAgg backend, because it is NOT thread-safe.

# Otherwise you will get "RuntimeError: main thread is not in main loop"!

matplotlib.use("svg")


# Use the Belle II style while producing the plots

plt.style.use("belle2")


class Plotter:
    """
    Base class for all Plotters.

    Holds the figure and axis to draw on, the collected plot handles and legend
    labels, the axis limits and the keyword-argument sets used by
    _plot_datapoints. Subclasses provide add() and finish().
    """

    # stupid workaround for doxygen refusing to document things
    # NOTE(review): __init__ stores the per-instance state under the doubled names
    # (plotsplots, labelslabels, xmaxxmax, ymaxymax, yscaleyscale, xscalexscale,
    # figurefigure, axisaxis). Apart from xmin and ymin, the class-level names
    # below are never assigned on instances by the code in this class.

    # placeholder for the list of plot handles
    plots = None

    # placeholder for the list of legend labels
    labels = None

    # lower x limit
    xmin = None

    # placeholder for the upper x limit
    xmax = None

    # lower y limit
    ymin = None

    # placeholder for the upper y limit
    ymax = None

    # placeholder for the relative y margin
    yscale = 0.0

    # placeholder for the relative x margin
    xscale = 0.0

    # placeholder for the figure
    figure = None

    # placeholder for the main axis
    axis = None

    def __init__(self, figure=None, axis=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        """
        b2.B2INFO("Create new figure for class " + str(type(self)))
        if figure is None:
            self.figurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            self.figurefigure.set_tight_layout(True)
        else:
            self.figurefigure = figure

        if axis is None:
            self.axisaxis = self.figurefigure.add_subplot(1, 1, 1)
        else:
            self.axisaxis = axis

        # handles returned by _plot_datapoints and the matching legend labels
        self.plotsplots = []
        self.labelslabels = []

        # axis limits, default to the unit square
        self.xmin, self.xmaxxmax = float(0), float(1)
        self.ymin, self.ymaxymax = float(0), float(1)

        # relative margins applied by scale_limits
        self.yscaleyscale = 0.1
        self.xscalexscale = 0.0

        # keyword-argument sets; filled with their defaults by the setters below
        self.plot_kwargs = None
        self.errorbar_kwargs = None
        self.errorband_kwargs = None
        self.fill_kwargs = None

        self.set_plot_options()
        self.set_errorbar_options()
        self.set_errorband_options()
        self.set_fill_options()

        # endless iterator over the property cycle of the active style, used to pick colours
        self.prop_cycler = itertools.cycle(plt.rcParams["axes.prop_cycle"])

    def add_subplot(self, gridspecs):
        """
        Adds a new subplot to the figure, updates all other axes
        according to the given gridspec
        @param gridspecs gridspecs for all axes including the new one
        @return the newly created axis, which shares its x axis with the main axis
        """
        for gs, ax in zip(gridspecs[:-1], self.figurefigure.axes):
            ax.set_position(gs.get_position(self.figurefigure))
            ax.set_subplotspec(gs)
        axis = self.figurefigure.add_subplot(gridspecs[-1], sharex=self.axisaxis)
        return axis

    def save(self, filename):
        """
        Save the figure into a file
        @param filename of the file
        @return self
        """
        b2.B2INFO("Save figure for class " + str(type(self)))
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        canvas = FigureCanvas(self.figurefigure)
        canvas.print_figure(filename, dpi=50)
        return self

    def set_plot_options(self, plot_kwargs={'linestyle': ''}):
        """
        Overrides default plot options for datapoint plot
        @param plot_kwargs keyword arguments for the plot function, None disables the plot
        @return self
        """
        # the argument (including the shared default dict) is copied, never mutated
        self.plot_kwargs = copy.copy(plot_kwargs)
        return self

    def set_errorbar_options(self, errorbar_kwargs={'fmt': '.', 'elinewidth': 3, 'alpha': 1}):
        """
        Overrides default errorbar options for datapoint errorbars
        @param errorbar_kwargs keyword arguments for the errorbar function, None disables the errorbars
        @return self
        """
        self.errorbar_kwargs = copy.copy(errorbar_kwargs)
        return self

    def set_errorband_options(self, errorband_kwargs={'alpha': 0.5}):
        """
        Overrides default errorband options for datapoint errorband
        @param errorband_kwargs keyword arguments for the fill_between function, None disables the errorband
        @return self
        """
        self.errorband_kwargs = copy.copy(errorband_kwargs)
        return self

    def set_fill_options(self, fill_kwargs=None):
        """
        Overrides default fill_between options for datapoint errorband
        @param fill_kwargs keyword arguments for the fill_between function, None disables the fill
        @return self
        """
        self.fill_kwargs = copy.copy(fill_kwargs)
        return self

    def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None):
        """
        Plot the given datapoints, with plot, errorbar and make a errorband with fill_between
        @param axis axis to draw on
        @param x coordinates of the data points
        @param y coordinates of the data points
        @param xerr symmetric error on x data points (None, scalar or sequence)
        @param yerr symmetric error on y data points (None, scalar or sequence)
        @return tuple (handles, p, e, f): handles is a tuple starting with a colour patch followed by
                the plot and errorbar handles that were drawn; p, e and f are the results of plot,
                errorbar and fill_between, or None if the respective part was not drawn
        """
        p = e = f = None
        plot_kwargs = copy.copy(self.plot_kwargs)
        errorbar_kwargs = copy.copy(self.errorbar_kwargs)
        errorband_kwargs = copy.copy(self.errorband_kwargs)
        fill_kwargs = copy.copy(self.fill_kwargs)

        if plot_kwargs is None or 'color' not in plot_kwargs:
            color = next(self.prop_cycler)['color']
            # plot_kwargs is None when the line plot is disabled; only a real dict can store the colour
            if plot_kwargs is not None:
                plot_kwargs['color'] = color
        else:
            color = plot_kwargs['color']
        color = matplotlib.colors.ColorConverter().to_rgb(color)
        # the first handle is always a half-transparent patch in the chosen colour
        patch = matplotlib.patches.Patch(color=color, alpha=0.5)
        patch.get_color = patch.get_facecolor
        patches = [patch]

        if plot_kwargs is not None:
            p, = axis.plot(x, y, rasterized=True, **plot_kwargs)
            patches.append(p)

        if errorbar_kwargs is not None and (xerr is not None or yerr is not None):
            if 'color' not in errorbar_kwargs:
                errorbar_kwargs['color'] = color
            if 'ecolor' not in errorbar_kwargs:
                errorbar_kwargs['ecolor'] = [0.5 * channel for channel in color]

            # fully mask nan values.
            # Needed until https://github.com/matplotlib/matplotlib/pull/23333 makes it into the externals.
            # TODO: remove in release 8.
            finite = [numpy.isfinite(x), numpy.isfinite(y)]
            # each error may be missing, a scalar or a sequence: broadcast what is given to one value per point
            if xerr is not None:
                xerr = numpy.asarray(xerr, dtype=float) * numpy.ones(len(x))
                finite.append(numpy.isfinite(xerr))
            if yerr is not None:
                yerr = numpy.asarray(yerr, dtype=float) * numpy.ones(len(x))
                finite.append(numpy.isfinite(yerr))
            mask = numpy.logical_and.reduce(finite)

            # negative errors are clipped to zero before drawing
            e = axis.errorbar(
                x[mask], y[mask],
                xerr=None if xerr is None else numpy.where(xerr[mask] < 0, 0.0, xerr[mask]),
                yerr=None if yerr is None else numpy.where(yerr[mask] < 0, 0.0, yerr[mask]),
                rasterized=True, **errorbar_kwargs)
            patches.append(e)

        if errorband_kwargs is not None and yerr is not None:
            if 'color' not in errorband_kwargs:
                errorband_kwargs['color'] = color
            if xerr is not None:
                # Ensure that xerr and yerr are iterable numpy arrays
                xerr = x + xerr - x
                yerr = y + yerr - y
                # one rectangle per point, spanning the error in both directions
                for _x, _y, _xe, _ye in zip(x, y, xerr, yerr):
                    axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=True,
                                                                **errorband_kwargs))
            else:
                f = axis.fill_between(x, y - yerr, y + yerr, interpolate=True, rasterized=True, **errorband_kwargs)

        if fill_kwargs is not None and len(x) > 0:
            # a missing x error is treated as a bin half-width of zero
            if xerr is None:
                half_width = numpy.zeros(len(x))
            else:
                half_width = numpy.asarray(xerr, dtype=float) * numpy.ones(len(x))
            # to fill the last bin of a histogram
            x = numpy.append(x, x[-1] + 2 * half_width[-1])
            y = numpy.append(y, y[-1])
            half_width = numpy.append(half_width, half_width[-1])

            axis.fill_between(x - half_width, y, 0, rasterized=True, **fill_kwargs)

        return (tuple(patches), p, e, f)

    def add(self, *args, **kwargs):
        """
        Add a new plot to this plotter
        The base implementation draws nothing and returns NotImplemented.
        """
        return NotImplemented

    def finish(self, *args, **kwargs):
        """
        Finish plotting and set labels, legends and stuff
        The base implementation does nothing and returns NotImplemented.
        """
        return NotImplemented

    def scale_limits(self):
        """
        Scale limits to increase distance to boundaries
        @return self
        """
        # copysign keeps the margin pointing outwards for limits of either sign
        self.ymin *= 1.0 - math.copysign(self.yscaleyscale, self.ymin)
        self.ymaxymax *= 1.0 + math.copysign(self.yscaleyscale, self.ymaxymax)
        self.xmin *= 1.0 - math.copysign(self.xscalexscale, self.xmin)
        self.xmaxxmax *= 1.0 + math.copysign(self.xscalexscale, self.xmaxxmax)
        return self


class PurityAndEfficiencyOverCut(Plotter):
    """
    Plots the purity and the efficiency over the cut value (for cut choosing)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a new curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed if True plot efficiency and purity, otherwise true positives and false positives
        @return self
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)

        if normed:
            efficiency, efficiency_error = hists.get_efficiency(['Signal'])
            purity, purity_error = hists.get_purity(['Signal'], ['Background'])
        else:
            efficiency, efficiency_error = hists.get_true_positives(['Signal'])
            purity, purity_error = hists.get_false_positives(['Background'])

        cuts = hists.bin_centers

        # Grow the limits initialised by Plotter.__init__ (xmin/xmaxxmax, ymin/ymaxymax) to include the new curves
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmaxxmax = numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmax])
        self.ymin = numpy.nanmin([numpy.nanmin(efficiency), numpy.nanmin(purity), self.ymin])
        self.ymaxymax = numpy.nanmax([numpy.nanmax(efficiency), numpy.nanmax(purity), self.ymaxymax])

        self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, efficiency, xerr=0, yerr=efficiency_error))
        self.labelslabels.append("Efficiency" if normed else "True positive")

        self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, purity, xerr=0, yerr=purity_error))
        self.labelslabels.append("Purity" if normed else "False positive")

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("Classification Plot")
        self.axisaxis.get_xaxis().set_label_text('Cut Value')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class SignalToNoiseOverCut(Plotter):
    """
    Plots the signal to noise ratio over the cut value (for cut choosing)
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
        """
        Add a new curve to the plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate signal to noise ratio for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param normed accepted for signature compatibility, not used by this plotter
        @return self
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)

        signal2noise, signal2noise_error = hists.get_signal_to_noise(['Signal'], ['Background'])

        cuts = hists.bin_centers

        # Grow the limits initialised by Plotter.__init__ (xmin/xmaxxmax, ymin/ymaxymax) to include the new curve
        self.xmin = numpy.nanmin([numpy.nanmin(cuts), self.xmin])
        self.xmaxxmax = numpy.nanmax([numpy.nanmax(cuts), self.xmaxxmax])
        self.ymin = numpy.nanmin([numpy.nanmin(signal2noise), self.ymin])
        self.ymaxymax = numpy.nanmax([numpy.nanmax(signal2noise), self.ymaxymax])

        self.plotsplots.append(self._plot_datapoints(self.axisaxis, cuts, signal2noise, xerr=0, yerr=signal2noise_error))
        self.labelslabels.append(column)

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("Signal to Noise Plot")
        self.axisaxis.get_xaxis().set_label_text('Cut Value')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class PurityOverEfficiency(Plotter):
    """
    Plots the purity over the efficiency also known as ROC curve
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used
        @return self
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        purity, purity_error = hists.get_purity(['Signal'], ['Background'])

        # Grow the limits initialised by Plotter.__init__ (xmin/xmaxxmax, ymin/ymaxymax) to include the new curve
        self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
        self.xmaxxmax = numpy.nanmax([efficiency.max(), self.xmaxxmax])
        self.ymin = numpy.nanmin([numpy.nanmin(purity), self.ymin])
        self.ymaxymax = numpy.nanmax([numpy.nanmax(purity), self.ymaxymax])

        p = self._plot_datapoints(self.axisaxis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
        self.plotsplots.append(p)
        self.labelslabels.append(label if label is not None else column)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("ROC Purity Plot")
        self.axisaxis.get_xaxis().set_label_text('Efficiency')
        self.axisaxis.get_yaxis().set_label_text('Purity')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class RejectionOverEfficiency(Plotter):
    """
    Plots the rejection over the efficiency also known as ROC curve
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
        """
        Add a new curve to the ROC plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate efficiency and purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used; only the first 10
               characters are shown, followed by the area under the curve
        @return self
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
        efficiency, efficiency_error = hists.get_efficiency(['Signal'])
        rejection, rejection_error = hists.get_efficiency(['Background'])
        rejection = 1 - rejection

        # either quantity may come back as a plain int; expand it so both are arrays of equal length
        if isinstance(efficiency, int) and not isinstance(rejection, int):
            efficiency = numpy.array([efficiency] * len(rejection))
        elif isinstance(rejection, int) and not isinstance(efficiency, int):
            rejection = numpy.array([rejection] * len(efficiency))
        elif isinstance(rejection, int) and isinstance(efficiency, int):
            efficiency = numpy.array([efficiency])
            rejection = numpy.array([rejection])

        # Grow the limits initialised by Plotter.__init__ (xmin/xmaxxmax, ymin/ymaxymax) to include the new curve
        self.xmin = numpy.nanmin([efficiency.min(), self.xmin])
        self.xmaxxmax = numpy.nanmax([efficiency.max(), self.xmaxxmax])
        self.ymin = numpy.nanmin([rejection.min(), self.ymin])
        self.ymaxymax = numpy.nanmax([rejection.max(), self.ymaxymax])

        # area under the curve by trapezoidal integration; abs() makes it independent of the ordering
        auc = numpy.abs(numpy.trapz(rejection, efficiency))

        p = self._plot_datapoints(self.axisaxis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
        self.plotsplots.append(p)
        if label is not None:
            self.labelslabels.append(label[:10] + f" ({auc:.2f})")
        else:
            self.labelslabels.append(column[:10] + f" ({auc:.2f})")
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("ROC Rejection Plot")
        self.axisaxis.get_xaxis().set_label_text('Signal Efficiency')
        self.axisaxis.get_yaxis().set_label_text('Background Rejection')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class Multiplot(Plotter):
    """
    Plots multiple other plots into a grid 3x?
    """

    # placeholder for the figure
    figure = None

    # placeholder for the main axis
    axis = None

    def __init__(self, cls, number_of_plots, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param cls Plotter class which is instantiated once per grid cell
        @param number_of_plots number of sub-plots to create
        @param figure default draw figure which is used
        """
        if figure is None:
            figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            figure.set_tight_layout(True)

        # up to three plots share a single row, more plots are wrapped into rows of three
        if 1 <= number_of_plots <= 3:
            gs = matplotlib.gridspec.GridSpec(1, number_of_plots)
        else:
            gs = matplotlib.gridspec.GridSpec(int(numpy.ceil(number_of_plots / 3)), 3)

        # one plotter of the requested class per grid cell, all drawing into the same figure
        self.sub_plots = [cls(figure, figure.add_subplot(gs[i // 3, i % 3])) for i in range(number_of_plots)]

        # Plotter.__init__ keeps the axis in `axisaxis`; reuse the first sub-plot's axis as the main axis
        # so that the base class does not add another subplot on top of the grid.
        super().__init__(figure, self.sub_plots[0].axisaxis)

    def add(self, i, *args, **kwargs):
        """
        Call add function of ith subplot
        @param i position of the subplot
        @return self
        """
        self.sub_plots[i].add(*args, **kwargs)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        for plot in self.sub_plots:
            plot.finish()
        return self


class Diagonal(Plotter):
    """
    Plots the purity in each bin over the classifier output.
    """

    def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add a new curve to the Diagonal plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate purity for different cuts
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @return self
        """
        hists = histogram.Histograms(data, column, {'Signal': signal_mask, 'Background': bckgrd_mask}, weight_column=weight_column)
        purity, purity_error = hists.get_purity_per_bin(['Signal'], ['Background'])

        # Limits live in the attributes initialised by Plotter.__init__ and scaled by scale_limits()
        self.xmin = min(hists.bin_centers.min(), self.xmin)
        self.xmaxxmax = max(hists.bin_centers.max(), self.xmaxxmax)
        # the y range is fixed to [0, 1] instead of being derived from the purity values
        self.ymin, self.ymaxymax = 0, 1

        p = self._plot_datapoints(self.axisaxis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
        self.plotsplots.append(p)
        self.labelslabels.append(column)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.scale_limits()
        # reference diagonal from (0, 0) to (1, 1)
        self.axisaxis.plot((0.0, 1.0), (0.0, 1.0), color='black')
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("Diagonal Plot")
        self.axisaxis.get_xaxis().set_label_text('Classifier Output')
        self.axisaxis.get_yaxis().set_label_text('Purity Per Bin')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class Distribution(Plotter):
    """
    Plots distribution of a quantity
    """

    def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
                 keep_first_binning=False, range_in_std=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed_to_all_entries true if histograms should be divided by their total number of entries
        @param normed_to_bin_width true if histograms should be divided by the bin widths
        @param keep_first_binning use the binning of the first distribution for further plots
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        """
        super().__init__(figure, axis)

        self.normed_to_all_entries = normed_to_all_entries
        self.normed_to_bin_width = normed_to_bin_width
        self.range_in_std = range_in_std

        # Start from an empty range so that the first added histogram defines the limits.
        # The attributes are the ones initialised by Plotter.__init__, so that the inherited
        # scale_limits() acts on the limits that finish() applies.
        self.ymin = float(0)
        self.ymaxymax = float('-inf')
        self.xmin = float('inf')
        self.xmaxxmax = float('-inf')

        self.keep_first_binning = keep_first_binning
        # binning of the first added distribution, only filled if keep_first_binning is set
        self.first_binning = None
        self.x_axis_label = ''

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution to the plots
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend, if None the column name is used
        @return self
        """
        if mask is None:
            mask = numpy.ones(len(data)).astype('bool')

        bins = 100
        if self.keep_first_binning and self.first_binning is not None:
            bins = self.first_binning
        hists = histogram.Histograms(data, column, {'Total': mask}, weight_column=weight_column,
                                     bins=bins, equal_frequency=False, range_in_std=self.range_in_std)
        if self.keep_first_binning and self.first_binning is None:
            self.first_binning = hists.bins
        hist, hist_error = hists.get_hist('Total')

        if self.normed_to_all_entries:
            normalization = float(numpy.sum(hist))
            hist = hist / normalization
            hist_error = hist_error / normalization

        if self.normed_to_bin_width:
            hist = hist / hists.bin_widths
            hist_error = hist_error / hists.bin_widths

        self.xmin = min(hists.bin_centers.min(), self.xmin)
        self.xmaxxmax = max(hists.bin_centers.max(), self.xmaxxmax)
        self.ymin = numpy.nanmin([hist.min(), self.ymin])
        self.ymaxymax = numpy.nanmax([(hist + hist_error).max(), self.ymaxymax])

        p = self._plot_datapoints(self.axisaxis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
        self.plotsplots.append(p)
        self.x_axis_label = column

        # an empty or degenerate range means there was nothing to draw
        appendix = ''
        if self.ymaxymax <= self.ymin or self.xmaxxmax <= self.xmin:
            appendix = ' No data to plot!'

        if label is None:
            self.labelslabels.append(column + appendix)
        else:
            self.labelslabels.append(label + appendix)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return self
        """
        self.axisaxis.set_title("Distribution Plot")
        self.axisaxis.get_xaxis().set_label_text(self.x_axis_label)

        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)

        if self.ymaxymax <= self.ymin or self.xmaxxmax <= self.xmin:
            self.axisaxis.set_xlim((0., 1.))
            self.axisaxis.set_ylim((0., 1.))
            self.axisaxis.text(0.36, 0.5, 'No data to plot', fontsize=60, color='black')
            return self

        self.scale_limits()

        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))

        if self.normed_to_all_entries and self.normed_to_bin_width:
            self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / (# Entries * Bin Width)')
        elif self.normed_to_all_entries:
            self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / # Entries')
        elif self.normed_to_bin_width:
            self.axisaxis.get_yaxis().set_label_text('# Entries per Bin / Bin Width')
        else:
            self.axisaxis.get_yaxis().set_label_text('# Entries per Bin')

        return self


class Box(Plotter):
    """
    Draws a horizontal boxplot of one column
    """

    def __init__(self, figure=None, axis=None, x_axis_label=None):
        """
        Sets up figure and axis via the base class and remembers the x axis label
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param x_axis_label label of the x axis, if empty the first added column name is used
        """
        super().__init__(figure=figure, axis=axis)

        self.x_axis_label = x_axis_label

    def add(self, data, column, mask=None, weight_column=None):
        """
        Draw the boxplot of the selected entries of one column
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate boxplot quantities
        @param mask boolean numpy.array defining which events are used for the histogram
        @param weight_column column in data containing the weights for each event
        @return self
        """
        selection = numpy.ones(len(data), dtype=bool) if mask is None else mask
        values = data[column][selection]

        if weight_column is not None:
            # weight = data[weight_column][mask]
            b2.B2WARNING("Weights are currently not used in boxplot, due to limitations in matplotlib")

        if len(values) == 0:
            b2.B2WARNING("Ignore empty boxplot.")
            return self

        # we don't plot outliers as they cause the file size to explode if large datasets are used
        box_style = {'facecolor': 'blue', 'alpha': 0.5}
        artists = self.axisaxis.boxplot(
            values,
            sym='k.',
            whis=1.5,
            vert=False,
            patch_artist=True,
            showmeans=True,
            widths=1,
            boxprops=box_style,
            showfliers=False,
            # medianprobs=dict(color='blue'),
            # meanprobs=dict(color='red'),
        )
        self.plotsplots.append(artists)
        self.labelslabels.append(column)
        if not self.x_axis_label:
            self.x_axis_label = column
        # disabled statistics annotations, kept as an inert string
        r"""
        self.axisaxis.text(0.1, 0.9, (r'$     \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
        self.axisaxis.text(0.4, 0.9, (r'$  \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
                                                                                            x.quantile(0.75) - x.quantile(0.25)),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
        self.axisaxis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
                       fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axisaxis.transAxes)
        """

        return self

    def finish(self):
        """
        Hides the y axis and sets the x axis label and the title
        @return self
        """
        y_axis = self.axisaxis.get_yaxis()
        matplotlib.artist.setp(y_axis, visible=False)
        x_axis = self.axisaxis.get_xaxis()
        x_axis.set_label_text(self.x_axis_label)
        self.axisaxis.set_title("Box Plot")
        return self


class Difference(Plotter):
    """
    Plots the difference between two histograms
    """

    def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed normalize minuend and subtrahend before comparing them
        @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
        """
        super().__init__(figure, axis)
        self.normed = normed
        self.shift_to_zero = shift_to_zero
        # Minimal y range; stored in the attributes initialised by Plotter.__init__ so that the
        # inherited scale_limits() acts on the limits that finish() applies.
        if self.normed:
            self.ymin = -0.01
            self.ymaxymax = 0.01
        else:
            self.ymin = -1
            self.ymaxymax = 1
        # set by add(); initialised here so that finish() also works before anything was added
        self.x_axis_label = ''

    def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
        """
        Add a new difference plot
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
        @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
        @param weight_column column in data containing the weights for each event
        @param label label for the legend if None, the column name is used
        @return self
        """
        hists = histogram.Histograms(data, column, {'Minuend': minuend_mask, 'Subtrahend': subtrahend_mask},
                                     weight_column=weight_column, equal_frequency=False)
        minuend, minuend_error = hists.get_hist('Minuend')
        subtrahend, subtrahend_error = hists.get_hist('Subtrahend')

        # the error is taken from the summed bin contents of both histograms
        difference_error = histogram.poisson_error(minuend + subtrahend)
        if self.normed:
            difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
            minuend = minuend / numpy.sum(minuend)
            subtrahend = subtrahend / numpy.sum(subtrahend)
        difference = minuend - subtrahend

        if self.shift_to_zero:
            difference = difference - numpy.mean(difference)

        self.xmin = min(hists.bin_centers.min(), self.xmin)
        self.xmaxxmax = max(hists.bin_centers.max(), self.xmaxxmax)
        self.ymin = min((difference - difference_error).min(), self.ymin)
        self.ymaxymax = max((difference + difference_error).max(), self.ymaxymax)

        p = self._plot_datapoints(self.axisaxis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
        self.plotsplots.append(p)
        # fall back to the column name only when no explicit label was given
        if label is None:
            self.labelslabels.append(column)
        else:
            self.labelslabels.append(label)
        self.x_axis_label = column
        return self

    def finish(self, line_color='black'):
        """
        Sets limits, title, axis-labels and legend of the plot
        @param line_color color of the horizontal line drawn at zero
        @return self
        """
        # the zero line is drawn over the unscaled x range, before the limits are enlarged
        self.axisaxis.plot((self.xmin, self.xmaxxmax), (0, 0), color=line_color, linewidth=4, rasterized=True)
        self.scale_limits()
        self.axisaxis.set_xlim((self.xmin, self.xmaxxmax))
        self.axisaxis.set_ylim((self.ymin, self.ymaxymax))
        self.axisaxis.set_title("Difference Plot")
        self.axisaxis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
        self.axisaxis.get_xaxis().set_label_text(self.x_axis_label)
        self.axisaxis.get_yaxis().set_label_text('Diff.')
        # the first element of every stored plot tuple holds the legend handles
        self.axisaxis.legend([x[0] for x in self.plotsplots], self.labelslabels, loc='best', fancybox=True, framealpha=0.5)
        return self


class Overtraining(Plotter):
    """
    Create TMVA-like overtraining control plot for a classification training.

    The figure is split into a main axis showing the test and training distributions
    of signal and background, and two smaller axes showing the (train - test)
    difference for signal and for background.
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None
    #: Axis which shows the difference between training and test signal
    axis_d1 = None
    #: Axis which shows the difference between training and test background
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figurefigurefigurefigure = figure
        else:
            self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            self.figurefigurefigurefigure.set_tight_layout(True)
        canvas = self.figurefigurefigurefigure

        # Five grid rows: the first three for the main plot, one for each difference plot
        grid = matplotlib.gridspec.GridSpec(5, 1)
        self.axisaxisaxisaxis = canvas.add_subplot(grid[:3, :])
        self.axis_d1axis_d1 = canvas.add_subplot(grid[3, :], sharex=self.axisaxisaxisaxis)
        self.axis_d2axis_d2 = canvas.add_subplot(grid[4, :], sharex=self.axisaxisaxisaxis)

        super().__init__(canvas, self.axisaxisaxisaxis)

    def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
        """
        Add a new overtraining plot, I recommend to draw only one overtraining plot at the time,
        otherwise there are too many curves in the plot to recognize anything in the plot.

        The test samples are added with the options of this plotter; the training samples are added
        with line and fill options in the color of the matching test sample and with the errorbar and
        errorband options set to None. If scipy is available, the p-value of a two-sample
        Kolmogorov-Smirnov test (train vs. test) is written into each difference axis; the event
        weights are not passed to that test.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param train_mask boolean numpy.array defining which events are training events
        @param test_mask boolean numpy.array defining which events are test events
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @param weight_column column in data containing the weights for each event
        @return this plotter, so that calls can be chained
        """
        canvas = self.figurefigurefigurefigure
        test_signal = test_mask & signal_mask
        test_bckgrd = test_mask & bckgrd_mask
        train_signal = train_mask & signal_mask
        train_bckgrd = train_mask & bckgrd_mask

        # Main axis, part 1: test samples with the options of this plotter
        distribution = Distribution(canvas, self.axisaxisaxisaxis, normed_to_all_entries=True)
        distribution.set_plot_options(self.plot_kwargs)
        distribution.set_errorbar_options(self.errorbar_kwargs)
        distribution.set_errorband_options(self.errorband_kwargs)
        for test_selection in (test_signal, test_bckgrd):
            distribution.add(data, column, test_selection, weight_column)

        # Main axis, part 2: training samples in the color of the corresponding test sample
        signal_color = distribution.plots[0][0][0].get_color()
        distribution.set_plot_options(
            {'color': signal_color, 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
        distribution.set_fill_options({'color': signal_color, 'alpha': 0.5, 'step': 'post'})
        distribution.set_errorbar_options(None)
        distribution.set_errorband_options(None)
        distribution.add(data, column, train_signal, weight_column)

        bckgrd_color = distribution.plots[1][0][0].get_color()
        distribution.set_plot_options(
            {'color': bckgrd_color, 'linestyle': '-', 'lw': 4, 'drawstyle': 'steps-mid'})
        distribution.set_fill_options({'color': bckgrd_color, 'alpha': 0.5, 'step': 'post'})
        distribution.add(data, column, train_bckgrd, weight_column)

        distribution.labels = ['Test-Signal', 'Test-Background', 'Train-Signal', 'Train-Background']
        distribution.finish()

        def draw_difference(axis, minuend_mask, subtrahend_mask, color):
            # Draws the normed, zero-shifted (minuend - subtrahend) difference on the given axis.
            # Note that this stores the color in the plot options of the enclosing plotter.
            self.plot_kwargs['color'] = color
            difference = Difference(canvas, axis, shift_to_zero=True, normed=True)
            difference.set_plot_options(self.plot_kwargs)
            difference.set_errorbar_options(self.errorbar_kwargs)
            difference.set_errorband_options(self.errorband_kwargs)
            difference.add(data, column, minuend_mask, subtrahend_mask, weight_column)
            axis.set_xlim((difference.xmin, difference.xmax))
            axis.set_ylim((difference.ymin, difference.ymax))
            # Empty plots and labels (one shared list) so that finish() draws an empty legend
            difference.plots = difference.labels = []
            difference.finish(line_color=color)

        draw_difference(self.axis_d1axis_d1, train_signal, test_signal, signal_color)
        draw_difference(self.axis_d2axis_d2, train_bckgrd, test_bckgrd, bckgrd_color)

        try:
            import scipy.stats

            # Kolmogorov smirnov test, once per difference axis
            ks_panels = (
                (self.axis_d1axis_d1, train_signal, test_signal,
                 "Cannot calculate kolmogorov smirnov test for signal due to missing data",
                 r'signal (train - test) difference $p={:.2f}$'),
                (self.axis_d2axis_d2, train_bckgrd, test_bckgrd,
                 "Cannot calculate kolmogorov smirnov test for background due to missing data",
                 r'background (train - test) difference $p={:.2f}$'),
            )
            for axis, train_selection, test_selection, warning, template in ks_panels:
                train_values = data[column][train_selection]
                test_values = data[column][test_selection]
                if len(train_values) == 0 or len(test_values) == 0:
                    b2.B2WARNING(warning)
                    continue
                ks = scipy.stats.ks_2samp(train_values, test_values)
                props = dict(boxstyle='round', edgecolor='gray', facecolor='white', linewidth=0.1, alpha=0.5)
                # ks[1] is the p-value of the test
                axis.text(0.1, 0.9, template.format(ks[1]), bbox=props,
                          verticalalignment='top', horizontalalignment='left', transform=axis.transAxes)
        except ImportError:
            b2.B2WARNING("Cannot calculate kolmogorov smirnov test please install scipy!")

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return this plotter, so that calls can be chained
        """
        main_axis = self.axisaxisaxisaxis
        signal_axis = self.axis_d1axis_d1
        bckgrd_axis = self.axis_d2axis_d2

        main_axis.set_title("Overtraining Plot")
        signal_axis.set_title("")
        bckgrd_axis.set_title("")

        # Only the lowest axis keeps its x tick labels and x-axis label
        matplotlib.artist.setp(main_axis.get_xticklabels(), visible=False)
        matplotlib.artist.setp(signal_axis.get_xticklabels(), visible=False)
        main_axis.get_xaxis().set_label_text('')
        signal_axis.get_xaxis().set_label_text('')
        bckgrd_axis.get_xaxis().set_label_text('Classifier Output')
        return self


class VerboseDistribution(Plotter):
    """
    Plots distribution of a quantity including boxplots
    """

    #: Axes of the boxplots, one per added distribution
    box_axes = None

    def __init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None):
        """
        Creates a new figure and axis if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        @param axis default draw axis which is used
        @param normed true if the histograms should be normed before drawing
        @param range_in_std show only the data in a windows around +- range_in_std * standard_deviation around the mean
        @param x_axis_label label of the x axis, handed to every box plot
        """
        super().__init__(figure, axis)

        #: Normalize histograms before drawing them
        self.normed = normed
        #: Show only a certain range in terms of standard deviations of the data
        self.range_in_std = range_in_std
        #: Axes of the boxplots, one per added distribution
        self.box_axesbox_axes = []
        #: The distribution plot drawn on the main axis
        self.distribution = Distribution(self.figurefigure, self.axisaxis,
                                         normed_to_all_entries=self.normed, range_in_std=self.range_in_std)
        #: Label on x axis
        self.x_axis_label = x_axis_label

    def add(self, data, column, mask=None, weight_column=None, label=None):
        """
        Add a new distribution plot, with additional information like a boxplot compared to
        the ordinary Distribution plot.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param mask boolean numpy.array defining which events are used for the distribution histogram,
                    None is passed on unchanged to the distribution and box plots
        @param weight_column column in data containing the weights for each event
        @param label label of the distribution, passed on to the distribution plot
        @return this plotter, so that calls can be chained
        """
        self.distribution.set_plot_options(self.plot_kwargs)
        self.distribution.set_errorbar_options(self.errorbar_kwargs)
        self.distribution.set_errorband_options(self.errorband_kwargs)
        self.distribution.add(data, column, mask, weight_column, label=label)

        # Grid of 4 * n rows: the first 3 * n rows for the main axis, then one row per box plot
        n = len(self.box_axesbox_axes) + 1
        gs = matplotlib.gridspec.GridSpec(4 * n, 1)
        gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :] for i in range(n)]
        box_axis = self.add_subplot(gridspecs)

        if self.range_in_std is not None:
            weights = None if weight_column is None else data[weight_column]
            mean, std = histogram.weighted_mean_and_std(data[column], weights)
            # Everything outside mean +- range_in_std * std is considered not inside the mask
            half_width = self.range_in_std * std
            in_range = (data[column] > (mean - half_width)) & (data[column] < (mean + half_width))
            # mask defaults to None, and `None & in_range` is not a valid selection:
            # without a user mask the range selection alone is used
            mask = in_range if mask is None else mask & in_range

        box = Box(self.figurefigure, box_axis, x_axis_label=self.x_axis_label)
        box.add(data, column, mask, weight_column)
        if len(box.plots) > 0:
            # Draw the box in the color of the distribution which was just added
            box.plots[0]['boxes'][0].set_facecolor(self.distribution.plots[-1][0][0].get_color())
        box.finish()

        self.box_axesbox_axes.append(box_axis)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return this plotter, so that calls can be chained
        """
        self.distribution.finish()
        matplotlib.artist.setp(self.axisaxis.get_xticklabels(), visible=False)
        self.axisaxis.get_xaxis().set_label_text('')

        # Only the last box plot keeps its x tick labels and x-axis label
        for box_axis in self.box_axesbox_axes[:-1]:
            matplotlib.artist.setp(box_axis.get_xticklabels(), visible=False)
            box_axis.set_title("")
            box_axis.get_xaxis().set_label_text('')
        # There is no box plot at all if add() was never called
        if self.box_axesbox_axes:
            self.box_axesbox_axes[-1].set_title("")

        self.axisaxis.set_title("Distribution Plot")
        self.axisaxis.legend([x[0] for x in self.distribution.plots], self.distribution.labels,
                             loc='best', fancybox=True, framealpha=0.5)
        return self


class Correlation(Plotter):
    """
    Plots change of a distribution of a quantity depending on the cut on a classifier
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which is used to draw
    axis = None
    #: Axis which shows shape of signal
    axis_d1 = None
    #: Axis which shows shape of background
    axis_d2 = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figurefigurefigurefigure = figure
        else:
            self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            self.figurefigurefigurefigure.set_tight_layout(True)
        canvas = self.figurefigurefigurefigure

        # Three rows spanning the full width: all events, signal, background
        grid = matplotlib.gridspec.GridSpec(3, 2)
        self.axisaxisaxisaxis = canvas.add_subplot(grid[0, :])
        self.axis_d1axis_d1 = canvas.add_subplot(grid[1, :], sharex=self.axisaxisaxisaxis)
        self.axis_d2axis_d2 = canvas.add_subplot(grid[2, :], sharex=self.axisaxisaxisaxis)

        super().__init__(canvas, self.axisaxisaxisaxis)

    def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
        """
        Add a new correlation plot.

        Three panels are filled: signal or background ('.'), signal only ('S') and background only ('B').
        Each panel shows the distribution of column between its 5% and 95% percentile, and how it changes
        when only events above the 5%, 10%, ..., 95% percentile of cut_column are kept.
        @param data pandas.DataFrame containing all data
        @param column which is used to calculate distribution histogram
        @param cut_column which is used to calculate cut on the other quantity defined by column
        @param quantiles list of quantiles between 0 and 100, defining the different cuts
                         (currently not used: the cuts are fixed to 5, 10, ..., 95)
        @param signal_mask boolean numpy.array defining which events are signal events
                           (combined with bckgrd_mask using `|`, so the None default cannot be used)
        @param bckgrd_mask boolean numpy.array defining which events are background events
                           (combined with signal_mask using `|`, so the None default cannot be used)
        @param weight_column column in data containing the weights for each event
        @return this plotter, so that calls can be chained
        """
        if len(data[cut_column]) == 0:
            b2.B2WARNING("Ignore empty Correlation.")
            return self

        # (axis, subscript shown in the title, event selection) of each panel
        panels = (
            (self.axisaxisaxisaxis, '.', signal_mask | bckgrd_mask),
            (self.axis_d1axis_d1, 'S', signal_mask),
            (self.axis_d2axis_d2, 'B', bckgrd_mask),
        )
        colormap = plt.get_cmap('coolwarm')

        for panel_axis, panel_label, selection in panels:
            values = data[column][selection]
            cut_values = data[cut_column][selection]
            if weight_column is None:
                weights = numpy.ones(len(values))
            else:
                weights = numpy.array(data[weight_column][selection])

            # Histogram range: central 90% of the selected values
            value_range = numpy.percentile(values, [5, 95])

            # Distribution without any cut, drawn as black line
            previous, edges = numpy.histogram(values, bins=100,
                                              range=value_range, density=True, weights=weights)
            centers = (edges[1:] + edges[:-1]) / 2
            panel_axis.plot(centers, previous, color='black', lw=1)

            # Fill the area between the distributions of successive cuts
            for percent in numpy.arange(5, 100, 5):
                threshold = numpy.percentile(cut_values, percent)
                passed = cut_values >= threshold
                current, edges = numpy.histogram(values[passed], bins=100,
                                                 range=value_range, density=True, weights=weights[passed])
                centers = (edges[1:] + edges[:-1]) / 2
                panel_axis.fill_between(centers, previous, current, color=colormap(percent / 100.0))
                previous = current

            panel_axis.set_ylim(bottom=0)

            flatness_score = basf2_mva_util.calculate_flatness(values, cut_values, weights)
            panel_axis.set_title(
                r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(panel_label, flatness_score))
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        (nothing is left to do here, everything is already drawn in add)
        @return this plotter, so that calls can be chained
        """
        return self


class TSNE(Plotter):
    """
    Plots multivariate distribution using TSNE algorithm
    """

    def add(self, data, columns, *masks):
        """
        Add a new TSNE plot.

        All events are embedded into two dimensions in a single fit, afterwards
        the embedded events of each mask are drawn as separate scatter plot.
        If sklearn is not installed a warning is emitted and nothing is drawn.
        @param data pandas.DataFrame containing all data
        @param columns which are used as input dimensions of the TSNE embedding
        @param masks different classes to show in TSNE, each a boolean array with one entry per row of data
        @return this plotter, so that calls can be chained
        """
        try:
            import sklearn
            import sklearn.manifold
        except ImportError:
            b2.B2WARNING("Cannot create TSNE plot. Install sklearn if you want it")
            return self

        model = sklearn.manifold.TSNE(n_components=2, random_state=0)
        # One row per event, one column per requested variable.
        # The DataFrame itself must stay untouched, it is not needed again but
        # rebinding `data` here used to break the per-mask selection below.
        features = numpy.array([data[column] for column in columns]).T
        # TSNE cannot transform new samples after the fit (it has no transform method),
        # so all events are embedded at once and the classes are selected afterwards
        embedding = model.fit_transform(features)

        for mask in masks:
            selected = embedding[numpy.asarray(mask, dtype=bool)]
            self.axisaxis.scatter(selected[:, 0], selected[:, 1], rasterized=True)
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        (nothing is left to do here, everything is already drawn in add)
        @return this plotter, so that calls can be chained
        """
        return self


class Importance(Plotter):
    """
    Plots importance matrix
    """

    def add(self, data, columns, variables):
        """
        Add a new importance matrix plot.

        Every column is scaled linearly to the range 0 to 100 on its own and drawn as
        one column of a heatmap, every cell is annotated with its rounded value.
        @param data pandas.DataFrame containing all data
        @param columns columns of data which are drawn, used as labels of the x axis
        @param variables labels of the y axis, one per row of data
        @return this plotter, so that calls can be chained
        """
        axis = self.axisaxis
        self.figurefigure.set_tight_layout(True)

        def scale_to_percent(values):
            # Maps the smallest value to 0 and the largest to 100, constant input yields zeros
            lowest = numpy.min(values)
            width = numpy.max(values) - lowest
            if width <= 0:
                return numpy.zeros(values.shape)
            return (values - lowest) / width * 100

        # Shape is (number of rows in data, number of columns)
        importance_matrix = numpy.vstack([scale_to_percent(data[column]) for column in columns]).T
        n_rows, n_columns = importance_matrix.shape[0], importance_matrix.shape[1]
        importance_heatmap = axis.pcolor(importance_matrix, cmap=plt.cm.RdBu, vmin=0.0, vmax=100,
                                         rasterized=True)

        # put the major ticks at the middle of each cell
        axis.set_yticks(numpy.arange(n_rows) + 0.5, minor=False)
        axis.set_xticks(numpy.arange(n_columns) + 0.5, minor=False)

        axis.set_xticklabels(columns, minor=False, rotation=90)
        axis.set_yticklabels(variables, minor=False)

        axis.xaxis.tick_top()

        # White number with black outline in the center of every cell
        outline = dict(linewidth=3, foreground='k')
        for y, x in numpy.ndindex(n_rows, n_columns):
            txt = axis.text(x + 0.5, y + 0.5, f'{importance_matrix[y, x]:.0f}',
                            size=14,
                            horizontalalignment='center',
                            verticalalignment='center',
                            color='w')
            txt.set_path_effects([PathEffects.withStroke(**outline)])

        cb = self.figurefigure.colorbar(importance_heatmap, ticks=[0.0, 100], orientation='vertical')
        cb.ax.set_yticklabels(['low', 'high'])

        # remove whitespace
        axis.set_ylim(0, n_rows)

        axis.set_aspect('equal')

        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        (nothing is left to do here, everything is already drawn in add)
        @return this plotter, so that calls can be chained
        """
        return self


class CorrelationMatrix(Plotter):
    """
    Plots correlation matrix
    """

    #: figure which is used to draw
    figure = None
    #: Main axis which shows the correlation of the signal samples
    signal_axis = None
    #: Axis which shows the correlation of the background samples
    bckgrd_axis = None

    def __init__(self, figure=None):
        """
        Creates a new figure if None is given, sets the default plot parameters
        @param figure default draw figure which is used
        """
        if figure is not None:
            self.figurefigurefigurefigure = figure
        else:
            self.figurefigurefigurefigure = matplotlib.figure.Figure(figsize=(12, 8), dpi=120)
            self.figurefigurefigurefigure.set_tight_layout(True)
        canvas = self.figurefigurefigurefigure

        # Eight grid rows: six for the two matrices side by side, the last one for the colorbar
        grid = matplotlib.gridspec.GridSpec(8, 2)
        self.signal_axissignal_axis = canvas.add_subplot(grid[:6, 0])
        self.bckgrd_axisbckgrd_axis = canvas.add_subplot(grid[:6, 1], sharey=self.signal_axissignal_axis)
        # Colorbar axis contains the colorbar
        self.colorbar_axis = canvas.add_subplot(grid[7, :])
        # Usual axis object which every Plotter object needs, here it is just a dummy
        self.axisaxisaxis = self.signal_axissignal_axis

        super().__init__(canvas, self.axisaxisaxis)

    def add(self, data, columns, signal_mask, bckgrd_mask):
        """
        Add a new correlation plot.

        The correlation coefficients between all given columns are calculated separately for
        signal and background events, and drawn in percent as two annotated heatmaps.
        @param data pandas.DataFrame containing all data
        @param columns which are used to calculate the correlations
        @param signal_mask boolean numpy.array defining which events are signal events
        @param bckgrd_mask boolean numpy.array defining which events are background events
        @return this plotter, so that calls can be chained
        """
        signal_axis = self.signal_axissignal_axis
        bckgrd_axis = self.bckgrd_axisbckgrd_axis

        def correlation_in_percent(mask):
            # Correlation coefficients between the columns for the selected events, scaled to percent
            return numpy.corrcoef(numpy.vstack([data[column][mask] for column in columns])) * 100

        def annotate_cells(axis, matrix):
            # Writes the rounded value of every cell into its center, white with black outline
            for y, x in numpy.ndindex(matrix.shape[0], matrix.shape[1]):
                txt = axis.text(x + 0.5, y + 0.5, f'{matrix[y, x]:.0f}',
                                size=14,
                                horizontalalignment='center',
                                verticalalignment='center',
                                color='w')
                txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='k')])

        signal_corr = correlation_in_percent(signal_mask)
        bckgrd_corr = correlation_in_percent(bckgrd_mask)
        # The signal matrix is always handled first, the call order is kept as both axes share their y axis
        panels = ((signal_axis, signal_corr), (bckgrd_axis, bckgrd_corr))

        signal_heatmap = signal_axis.pcolor(signal_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)
        bckgrd_axis.pcolor(bckgrd_corr, cmap=plt.cm.RdBu, vmin=-100.0, vmax=100.0)

        for axis, _ in panels:
            axis.invert_yaxis()
            axis.xaxis.tick_top()

        for axis, corr in panels:
            # put the major ticks at the middle of each cell
            axis.set_xticks(numpy.arange(corr.shape[0]) + 0.5, minor=False)
            axis.set_yticks(numpy.arange(corr.shape[1]) + 0.5, minor=False)

            axis.set_xticklabels(columns, minor=False, rotation=90)
            axis.set_yticklabels(columns, minor=False)

        for axis, corr in panels:
            annotate_cells(axis, corr)

        # One common colorbar, created from the signal heatmap
        cb = self.figurefigurefigurefigure.colorbar(signal_heatmap, cax=self.colorbar_axis,
                                                    ticks=[-100, 0, 100], orientation='horizontal')
        cb.solids.set_rasterized(True)
        cb.ax.set_xticklabels(['negative', 'uncorrelated', 'positive'])

        signal_axis.text(0.5, -1.0, "Signal", horizontalalignment='center')
        bckgrd_axis.text(0.5, -1.0, "Background", horizontalalignment='center')

        # remove whitespace
        for axis, corr in panels:
            axis.set_xlim(0, corr.shape[0])
            axis.set_ylim(0, corr.shape[1])
        return self

    def finish(self):
        """
        Sets limits, title, axis-labels and legend of the plot
        @return this plotter, so that calls can be chained
        """
        # The y tick labels are only shown once, next to the signal matrix
        matplotlib.artist.setp(self.bckgrd_axisbckgrd_axis.get_yticklabels(), visible=False)
        return self


if __name__ == '__main__':

    def get_data(N, columns):
        """
        Creates fake data for example plots
        @param N total number of events, one half is drawn around 0 and the other half around 1
        @param columns names of the columns, the last column contains the label (0 or 1) of the event
        @return pandas.DataFrame with N randomly ordered events
        """
        # Integer division: the array sizes below have to be integers
        N //= 2
        n = len(columns) - 1
        xs = numpy.random.normal(0, size=(N, n))
        xb = numpy.random.normal(1, size=(N, n))
        ys = numpy.zeros(N)
        yb = numpy.ones(N)
        data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
        return data.reindex(numpy.random.permutation(data.index))

    import seaborn
    # Set nice seaborn settings
    seaborn.set(font_scale=3)
    seaborn.set_style('whitegrid')

    # Standard plots
    N = 100000
    data = get_data(N, columns=['FastBDT', 'NeuroBayes', 'isSignal'])
    # The first half of the (shuffled) events is used as training sample, the second half as test sample.
    # The column is assigned as a whole, a chained `data.type.iloc[...] = ...` may write to a copy.
    data['type'] = numpy.where(numpy.arange(len(data)) < N // 2, 'Train', 'Test')

    signal_mask = data['isSignal'] == 1
    bckgrd_mask = data['isSignal'] == 0
    train_mask = data['type'] == 'Train'
    test_mask = data['type'] == 'Test'

    p = Box()
    p.add(data, 'FastBDT')
    p.finish()
    p.save('box_plot.png')

    p = VerboseDistribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('verbose_distribution_plot.png')

    # These plotters share the same interface: one curve per classifier
    for plotter_class, filename in ((PurityOverEfficiency, 'roc_purity_plot.png'),
                                    (RejectionOverEfficiency, 'roc_rejection_plot.png'),
                                    (Diagonal, 'diagonal_plot.png')):
        p = plotter_class()
        p.add(data, 'FastBDT', signal_mask, bckgrd_mask)
        p.add(data, 'NeuroBayes', signal_mask, bckgrd_mask)
        p.finish()
        p.save(filename)

    p = Distribution()
    p.add(data, 'FastBDT')
    p.add(data, 'NeuroBayes')
    p.finish()
    p.save('distribution_plot.png')

    p = Difference()
    p.add(data, 'FastBDT', train_mask, test_mask)
    p.add(data, 'NeuroBayes', train_mask, test_mask)
    p.finish()
    p.save('difference_plot.png')

    p = Overtraining()
    p.add(data, 'FastBDT', train_mask, test_mask, signal_mask, bckgrd_mask)
    p.finish()
    p.save('overtraining_plot.png')

    # Correlation needs both masks, they are combined for the panel showing all events
    p = Correlation()
    p.add(data, 'FastBDT', 'NeuroBayes', [0, 20, 40, 60, 80, 100], signal_mask, bckgrd_mask)
    p.finish()
    p.save('correlation_plot.png')

    p = CorrelationMatrix()
    data['FastBDT2'] = data['FastBDT']**2
    data['NeuroBayes2'] = data['NeuroBayes']**2
    data['FastBDT3'] = data['FastBDT']**3
    data['NeuroBayes3'] = data['NeuroBayes']**3
    # signal_mask and bckgrd_mask are required arguments of CorrelationMatrix.add
    p.add(data, ['FastBDT', 'NeuroBayes', 'FastBDT2', 'NeuroBayes2', 'FastBDT3', 'NeuroBayes3'],
          signal_mask, bckgrd_mask)
    p.finish()
    p.save('correlation_matrix.png')

basf2_mva_util.calculate_flatness
def calculate_flatness(f, p, w=None)
Definition: basf2_mva_util.py:102

histogram.Histograms
Definition: histogram.py:42

plotting.Box
Definition: plotting.py:701

plotting.Box.x_axis_label
x_axis_label
Label on x axis.
Definition: plotting.py:717

plotting.Box.__init__
def __init__(self, figure=None, axis=None, x_axis_label=None)
Definition: plotting.py:708

plotting.Box.add
def add(self, data, column, mask=None, weight_column=None)
Definition: plotting.py:719

plotting.Box.finish
def finish(self)
Definition: plotting.py:760

plotting.CorrelationMatrix
Definition: plotting.py:1245

plotting.CorrelationMatrix.signal_axis
signal_axis
add signal subplot
Definition: plotting.py:1270

plotting.CorrelationMatrix.add
def add(self, data, columns, signal_mask, bckgrd_mask)
Definition: plotting.py:1280

plotting.CorrelationMatrix.colorbar_axis
colorbar_axis
Colorbar axis contains the colorbar.
Definition: plotting.py:1274

plotting.CorrelationMatrix.bckgrd_axis
None bckgrd_axis
Axis which shows the correlation of the background samples.
Definition: plotting.py:1254

plotting.CorrelationMatrix.figure
figure
create figure
Definition: plotting.py:1263

plotting.CorrelationMatrix.__init__
def __init__(self, figure=None)
Definition: plotting.py:1256

plotting.CorrelationMatrix.figure
None figure
figure which is used to draw
Definition: plotting.py:1250

plotting.CorrelationMatrix.signal_axis
None signal_axis
Main axis which shows the correlation of the signal samples.
Definition: plotting.py:1252

plotting.CorrelationMatrix.bckgrd_axis
bckgrd_axis
add background subplot
Definition: plotting.py:1272

plotting.CorrelationMatrix.axis
axis
Usual axis object which every Plotter object needs, here it is just a dummy.
Definition: plotting.py:1276

plotting.CorrelationMatrix.finish
def finish(self)
Definition: plotting.py:1343

plotting.Correlation
Definition: plotting.py:1067

plotting.Correlation.add
def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
Definition: plotting.py:1102

plotting.Correlation.axis_d1
axis_d1
define second subplot
Definition: plotting.py:1096

plotting.Correlation.figure
figure
create figure
Definition: plotting.py:1087

plotting.Correlation.axis_d1
None axis_d1
Axis which shows shape of signal.
Definition: plotting.py:1076

plotting.Correlation.axis
None axis
Main axis which is used to draw.
Definition: plotting.py:1074

plotting.Correlation.__init__
def __init__(self, figure=None)
Definition: plotting.py:1080

plotting.Correlation.axis_d2
axis_d2
define third subplot
Definition: plotting.py:1098

plotting.Correlation.figure
None figure
figure which is used to draw
Definition: plotting.py:1072

plotting.Correlation.axis_d2
None axis_d2
Axis which shows shape of background.
Definition: plotting.py:1078

plotting.Correlation.axis
axis
define first subplot
Definition: plotting.py:1094

plotting.Correlation.finish
def finish(self)
Definition: plotting.py:1147

plotting.Diagonal
Definition: plotting.py:539

plotting.Diagonal.ymax
ymax
Maximum y value.
Definition: plotting.py:562

plotting.Diagonal.xmax
xmax
Maximum x value.
Definition: plotting.py:560

plotting.Diagonal.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:548

plotting.Diagonal.finish
def finish(self)
Definition: plotting.py:569

plotting.Difference
Definition: plotting.py:770

plotting.Difference.x_axis_label
x_axis_label
Label on x axis.
Definition: plotting.py:840

plotting.Difference.shift_to_zero
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
Definition: plotting.py:797

plotting.Difference.__init__
def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
Definition: plotting.py:787

plotting.Difference.ymax
ymax
Maximum y value.
Definition: plotting.py:800

plotting.Difference.xmax
xmax
Maximum x value.
Definition: plotting.py:830

plotting.Difference.add
def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
Definition: plotting.py:805

plotting.Difference.ymin
ymin
min y value
Definition: plotting.py:799

plotting.Difference.finish
def finish(self, line_color='black')
Definition: plotting.py:843

plotting.Difference.normed
normed
Minuend and subtrahend are normed before comparing them if this is true.
Definition: plotting.py:796

plotting.Distribution
Definition: plotting.py:584

plotting.Distribution.__init__
def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
Definition: plotting.py:590

plotting.Distribution.add
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:622

plotting.Distribution.xmin
xmin
size in x/y
Definition: plotting.py:612

plotting.Distribution.x_axis_label
x_axis_label
x axis label
Definition: plotting.py:620

plotting.Distribution.keep_first_binning
keep_first_binning
Keep first binning if user wants so.
Definition: plotting.py:616

plotting.Distribution.normed_to_all_entries
normed_to_all_entries
Normalize histograms before drawing them.
Definition: plotting.py:601

plotting.Distribution.first_binning
first_binning
first binning
Definition: plotting.py:618

plotting.Distribution.ymax
ymax
size in x/y
Definition: plotting.py:610

plotting.Distribution.range_in_std
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:605

plotting.Distribution.xmax
xmax
size in x/y
Definition: plotting.py:614

plotting.Distribution.ymin
ymin
size in x/y
Definition: plotting.py:608

plotting.Distribution.normed_to_bin_width
normed_to_bin_width
Normalize histograms before drawing them.
Definition: plotting.py:603

plotting.Distribution.finish
def finish(self)
Definition: plotting.py:669

plotting.Importance
Definition: plotting.py:1187

plotting.Importance.add
def add(self, data, columns, variables)
Definition: plotting.py:1192

plotting.Importance.finish
def finish(self)
Definition: plotting.py:1238

plotting.Multiplot
Definition: plotting.py:487

plotting.Multiplot.add
def add(self, i, *args, **kwargs)
Definition: plotting.py:523

plotting.Multiplot.figure
figure
create figure
Definition: plotting.py:503

plotting.Multiplot.__init__
def __init__(self, cls, number_of_plots, figure=None)
Definition: plotting.py:496

plotting.Multiplot.axis
None axis
Main axis.
Definition: plotting.py:494

plotting.Multiplot.figure
None figure
figure which is used to draw
Definition: plotting.py:492

plotting.Multiplot.sub_plots
sub_plots
the subplots which are displayed in the grid
Definition: plotting.py:518

plotting.Multiplot.axis
axis
the axis of the first subplot
Definition: plotting.py:520

plotting.Multiplot.finish
def finish(self)
Definition: plotting.py:530

plotting.Overtraining
Definition: plotting.py:859

plotting.Overtraining.axis_d1
axis_d1
define second subplot
Definition: plotting.py:889

plotting.Overtraining.figure
figure
create figure
Definition: plotting.py:880

plotting.Overtraining.add
def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
Definition: plotting.py:895

plotting.Overtraining.axis_d1
None axis_d1
Axis which shows the difference between training and test signal.
Definition: plotting.py:869

plotting.Overtraining.axis
None axis
Main axis which is used to draw.
Definition: plotting.py:867

plotting.Overtraining.__init__
def __init__(self, figure=None)
Definition: plotting.py:873

plotting.Overtraining.axis_d2
axis_d2
define third subplot
Definition: plotting.py:891

plotting.Overtraining.figure
None figure
figure which is used to draw
Definition: plotting.py:865

plotting.Overtraining.axis_d2
None axis_d2
Axis which shows the difference between training and test background.
Definition: plotting.py:871

plotting.Overtraining.axis
axis
define first subplot
Definition: plotting.py:887

plotting.Overtraining.finish
def finish(self)
Definition: plotting.py:974

plotting.Plotter
Definition: plotting.py:43

plotting.Plotter.finish
def finish(self, *args, **kwargs)
Definition: plotting.py:259

plotting.Plotter.fill_kwargs
fill_kwargs
Default keyword arguments for fill_between function.
Definition: plotting.py:117

plotting.Plotter.ymin
None ymin
Minimum y value.
Definition: plotting.py:67

plotting.Plotter.set_errorband_options
def set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
Definition: plotting.py:166

plotting.Plotter.plots
plots
create empty list for plots
Definition: plotting.py:98

plotting.Plotter.xscale
float xscale
limit scale
Definition: plotting.py:71

plotting.Plotter.figure
figure
create figure
Definition: plotting.py:86

plotting.Plotter.ymax
None ymax
Maximum y value.
Definition: plotting.py:69

plotting.Plotter.errorband_kwargs
errorband_kwargs
Default keyword arguments for errorband function.
Definition: plotting.py:115

plotting.Plotter.axis
None axis
Main axis which is used to draw.
Definition: plotting.py:75

plotting.Plotter.scale_limits
def scale_limits(self)
Definition: plotting.py:265

plotting.Plotter.add
def add(self, *args, **kwargs)
Definition: plotting.py:253

plotting.Plotter.xmin
None xmin
Minimum x value.
Definition: plotting.py:63

plotting.Plotter.set_fill_options
def set_fill_options(self, fill_kwargs=None)
Definition: plotting.py:174

plotting.Plotter.save
def save(self, filename)
Definition: plotting.py:139

plotting.Plotter.__init__
def __init__(self, figure=None, axis=None)
Definition: plotting.py:77

plotting.Plotter.figure
None figure
figure which is used to draw
Definition: plotting.py:73

plotting.Plotter.ymax
ymax
set y limits
Definition: plotting.py:104

plotting.Plotter.plots
None plots
Plots added to the axis so far.
Definition: plotting.py:59

plotting.Plotter.prop_cycler
prop_cycler
Property cycler used to give plots unique colors.
Definition: plotting.py:125

plotting.Plotter.xmax
xmax
set x limits
Definition: plotting.py:102

plotting.Plotter.errorbar_kwargs
errorbar_kwargs
Default keyword arguments for errorbar function.
Definition: plotting.py:113

plotting.Plotter.labels
labels
create empty list for labels
Definition: plotting.py:100

plotting.Plotter.axis
axis
divide figure into subplots
Definition: plotting.py:93

plotting.Plotter._plot_datapoints
def _plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
Definition: plotting.py:182

plotting.Plotter.set_errorbar_options
def set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
Definition: plotting.py:158

plotting.Plotter.yscale
float yscale
limit scale
Definition: plotting.py:70

plotting.Plotter.labels
None labels
Labels of the plots added so far.
Definition: plotting.py:61

plotting.Plotter.add_subplot
def add_subplot(self, gridspecs)
Definition: plotting.py:127

plotting.Plotter.set_plot_options
def set_plot_options(self, plot_kwargs={ 'linestyle':''})
Definition: plotting.py:150

plotting.Plotter.plot_kwargs
plot_kwargs
Default keyword arguments for plot function.
Definition: plotting.py:111

plotting.Plotter.xmax
None xmax
Maximum x value.
Definition: plotting.py:65

plotting.PurityAndEfficiencyOverCut
Definition: plotting.py:276

plotting.PurityAndEfficiencyOverCut.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:285

plotting.PurityAndEfficiencyOverCut.ymax
ymax
Maximum y value.
Definition: plotting.py:307

plotting.PurityAndEfficiencyOverCut.xmax
xmax
Maximum x value.
Definition: plotting.py:306

plotting.PurityAndEfficiencyOverCut.finish
def finish(self)
Definition: plotting.py:326

plotting.PurityOverEfficiency
Definition: plotting.py:385

plotting.PurityOverEfficiency.ymax
ymax
Maximum y value.
Definition: plotting.py:408

plotting.PurityOverEfficiency.xmax
xmax
Maximum x value.
Definition: plotting.py:407

plotting.PurityOverEfficiency.finish
def finish(self)
Definition: plotting.py:418

plotting.PurityOverEfficiency.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:394

plotting.RejectionOverEfficiency
Definition: plotting.py:431

plotting.RejectionOverEfficiency.ymax
ymax
Maximum y value.
Definition: plotting.py:462

plotting.RejectionOverEfficiency.xmax
xmax
Maximum x value.
Definition: plotting.py:461

plotting.RejectionOverEfficiency.finish
def finish(self)
Definition: plotting.py:474

plotting.RejectionOverEfficiency.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
Definition: plotting.py:440

plotting.SignalToNoiseOverCut
Definition: plotting.py:338

plotting.SignalToNoiseOverCut.add
def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
Definition: plotting.py:347

plotting.SignalToNoiseOverCut.ymax
ymax
Maximum y value.
Definition: plotting.py:364

plotting.SignalToNoiseOverCut.xmax
xmax
Maximum x value.
Definition: plotting.py:363

plotting.SignalToNoiseOverCut.finish
def finish(self)
Definition: plotting.py:373

plotting.TSNE
Definition: plotting.py:1154

plotting.TSNE.add
def add(self, data, columns, *masks)
Definition: plotting.py:1159

plotting.TSNE.finish
def finish(self)
Definition: plotting.py:1180

plotting.VerboseDistribution
Definition: plotting.py:989

plotting.VerboseDistribution.add
def add(self, data, column, mask=None, weight_column=None, label=None)
Definition: plotting.py:1017

plotting.VerboseDistribution.x_axis_label
x_axis_label
x axis label
Definition: plotting.py:1015

plotting.VerboseDistribution.distribution
distribution
The distribution plot.
Definition: plotting.py:1013

plotting.VerboseDistribution.range_in_std
range_in_std
Show only a certain range in terms of standard deviations of the data.
Definition: plotting.py:1009

plotting.VerboseDistribution.box_axes
None box_axes
Axes for the boxplots.
Definition: plotting.py:995

plotting.VerboseDistribution.box_axes
box_axes
create empty list for box axes
Definition: plotting.py:1011

plotting.VerboseDistribution.normed
normed
Normalize histograms before drawing them.
Definition: plotting.py:1007

plotting.VerboseDistribution.finish
def finish(self)
Definition: plotting.py:1049

plotting.VerboseDistribution.__init__
def __init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None)
Definition: plotting.py:997

histogram.weighted_mean_and_std
def weighted_mean_and_std(x, w)
Definition: histogram.py:31

histogram.poisson_error
def poisson_error(n_tot)
Definition: histogram.py:24

plot
Definition: plot.py:1