development/doxygen/sysvar_8py_source.html

#!/usr/bin/env python3


import pandas as pd

import numpy as np

from dataclasses import dataclass

import matplotlib.pyplot as plt

import pdg

import warnings

from pandas.errors import PerformanceWarning


"""

A module that adds corrections to analysis dataframe.

It adds weight variations according to the total uncertainty for easier error propagation.

"""


_weight_cols = ['data_MC_ratio',

                'data_MC_uncertainty_stat_dn',

                'data_MC_uncertainty_stat_up',

                'data_MC_uncertainty_sys_dn',

                'data_MC_uncertainty_sys_up'

                ]


_correction_types = ['PID', 'FEI']

_fei_mode_col = 'dec_mode'


@dataclass

class ReweighterParticle:

    """

    Class that stores the information of a particle.

    """


    prefix: str


    type: str


    merged_table: pd.DataFrame


    pdg_binning: dict


    variable_aliases: dict


    weight_name: str


    column_names: list = None


    sys_seed: int = None


    cov: np.ndarray = None


    syscorr: bool = True


    coverage: float = None


    plot_values: dict = None


    def get_varname(self, varname: str) -> str:

        """

        Returns the variable name with the prefix and use alias if defined.

        """

        name = varname

        if self.variable_aliases and varname in self.variable_aliases:

            name = self.variable_aliases[varname]

        if name.startswith(self.prefix):

            return name

        return f'{self.prefix}{name}'


    def get_binning_variables(self) -> list:

        """

        Returns the list of variables that are used for the binning

        """

        variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))

        return [f'{self.get_varname(var)}' for var in variables]


    def get_pdg_variables(self) -> list:

        """

        Returns the list of variables that are used for the PDG codes

        """

        pdg_vars = ['PDG']


        if self.typetype == "PID":

            pdg_vars += ['mcPDG']

        return [f'{self.get_varname(var)}' for var in pdg_vars]


    def generate_variations(self,

                            n_variations: int,

                            rho_sys: np.ndarray = None,

                            rho_stat: np.ndarray = None) -> None:

        """

        Generates variations of weights according to the uncertainties

        """

        self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",

                                                             "data_MC_uncertainty_stat_dn"]].max(axis=1)

        self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",

                                                            "data_MC_uncertainty_sys_dn"]].max(axis=1)

        self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)

        means = self.merged_table["data_MC_ratio"].values


        self.column_namescolumn_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]

        cov = self.get_covariance(n_variations, rho_sys, rho_stat)

        weights = cov + means

        self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]

        self.merged_table[self.column_namescolumn_names] = weights.T

        self.column_namescolumn_names.insert(0, self.weight_name)


    def get_covariance(self,

                       n_variations: int,

                       rho_sys: np.ndarray = None,

                       rho_stat: np.ndarray = None) -> np.ndarray:

        """

        Returns the covariance matrix of the weights

        """

        len_means = len(self.merged_table["data_MC_ratio"])

        zeros = np.zeros(len_means)

        if self.cov is None:

            if rho_sys is None:

                if self.syscorr:

                    rho_sys = np.ones((len_means, len_means))

                else:

                    rho_sys = np.identity(len_means)

            if rho_stat is None:

                rho_stat = np.identity(len_means)

            sys_cov = np.matmul(

                np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])

            )

            stat_cov = np.matmul(

                np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])

            )

            np.random.seed(self.sys_seed)

            sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)

            np.random.seed(None)

            stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)

            return sys + stat

        errors = np.random.multivariate_normal(zeros, self.cov, n_variations)

        return errors


    def __str__(self) -> str:

        """

        Converts the object to a string.

        """

        separator = '------------------'

        title = 'ReweighterParticle'

        prefix_str = f'Type: {self.type} Prefix: {self.prefix}'

        columns = _weight_cols

        merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'

        pdg_binning_str = 'PDG binning:\n'

        for pdgs in self.pdg_binning:

            pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'

        return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator


    def plot_coverage(self, fig=None, axs=None):

        """

        Plots the coverage of the ntuple.

        """

        if self.plot_values is None:

            return

        vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))

        if fig is None:

            fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)

        axs = np.array(axs)

        if len(axs.shape) < 1:

            axs = axs.reshape(len(self.plot_values), len(vars))

        bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}

        fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')

        for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):

            for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):

                ymin = 0

                ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1

                # Plot binning

                if self.typetype == 'PID':

                    ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,

                              label='Binning',

                              alpha=0.8,

                              **bin_plt)

                elif self.typetype == 'FEI':

                    values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])

                    ax.bar(values+0.5,

                           np.ones(len(values))*ymax,

                           width=1,

                           alpha=0.5,

                           label='Binning',

                           **bin_plt)

                    rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)

                    ax.bar(rest+0.5,

                           np.ones(len(rest))*ymax,

                           width=1,

                           alpha=0.2,

                           label='Rest category',

                           **bin_plt)

                # Plot values

                widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])

                centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2

                ax.bar(centers,

                       self.plot_values[(reco_pdg, mc_pdg)][var][1],

                       width=widths,

                       label='Values',

                       alpha=0.8)

                ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')

                ax.set_xlabel(var)

        axs[-1][-1].legend()

        fig.tight_layout()

        return fig, axs


class Reweighter:

    """

    Class that reweights the dataframe.


    Args:

        n_variations (int): Number of weight variations to generate.

        weight_name (str): Name of the weight column.

        evaluate_plots (bool): Flag to indicate if the plots should be evaluated.

        nbins (int): Number of bins for the plots.

    """


    def __init__(self,

                 n_variations: int = 100,

                 weight_name: str = "Weight",

                 evaluate_plots: bool = True,

                 nbins: int = 50,

                 fillna: float = 1.0) -> None:

        """

        Initializes the Reweighter class.

        """


        self.n_variations = n_variations


        self.particles = []


        self.correlations = []


        self.weight_name = weight_name


        self.weights_generated = False


        self.evaluate_plots = evaluate_plots


        self.nbins = nbins


        self.fillna = fillna


    def get_bin_columns(self, weight_df) -> list:

        """

        Returns the kinematic bin columns of the dataframe.

        """

        return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]


    def get_binning(self, weight_df) -> dict:

        """

        Returns the kinematic binning of the dataframe.

        """

        columns = self.get_bin_columns(weight_df)

        var_names = {'_'.join(col.split('_')[:-1]) for col in columns}

        bin_dict = {}

        for var_name in var_names:

            bin_dict[var_name] = []

            for col in columns:

                if col.startswith(var_name):

                    bin_dict[var_name] += list(weight_df[col].values)

            bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))

        return bin_dict


    def get_fei_binning(self, weight_df) -> dict:

        """

        Returns the irregular binning of the dataframe.

        """

        return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),

                                             _fei_mode_col].value_counts().index.to_list()}


    def get_ntuple_variables(self,

                             ntuple_df: pd.DataFrame,

                             particle: ReweighterParticle) -> None:

        """

        Checks if the variables are in the ntuple and returns them.


        Args:

            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.

            particle (ReweighterParticle): Particle object containing the necessary variables.

        """

        ntuple_variables = particle.get_binning_variables()

        ntuple_variables += particle.get_pdg_variables()

        for var in ntuple_variables:

            if var not in ntuple_df.columns:

                raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')

        return ntuple_variables


    def merge_pid_weight_tables(self,

                                weights_dict: dict,

                                pdg_pid_variable_dict: dict) -> pd.DataFrame:

        """

        Merges the efficiency and fake rate weight tables.


        Args:

            weights_dict (dict): Dictionary containing the weight tables.

            pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

        """

        weight_dfs = []

        for reco_pdg, mc_pdg in weights_dict:

            if reco_pdg not in pdg_pid_variable_dict:

                raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')

            weight_df = weights_dict[(reco_pdg, mc_pdg)]

            weight_df['mcPDG'] = mc_pdg

            weight_df['PDG'] = reco_pdg

            # Check if these are legacy tables:

            if 'charge' in weight_df.columns:

                charge_dict = {'+': [0, 2], '-': [-2, 0]}

                weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]

                weight_df = weight_df.drop(columns=['charge'])

                # If iso_score is a single value, drop the min and max columns

                if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:

                    weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])

            pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]

            threshold = pdg_pid_variable_dict[reco_pdg][1]

            selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')

            if len(selected_weights) == 0:

                available_variables = weight_df['variable'].unique()

                available_thresholds = weight_df['threshold'].unique()

                raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'

                                 f' variable {pid_variable_name} and threshold {threshold}!\n'

                                 f' Available variables: {available_variables}\n'

                                 f' Available thresholds: {available_thresholds}')

            weight_dfs.append(selected_weights)

        return pd.concat(weight_dfs, ignore_index=True)


    def add_pid_weight_columns(self,

                               ntuple_df: pd.DataFrame,

                               particle: ReweighterParticle) -> None:

        """

        Adds a weight and uncertainty columns to the dataframe.


        Args:

            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.

            particle (ReweighterParticle): Particle object.

        """

        # Apply a weight value from the weight table to the ntuple, based on the binning

        binning_df = pd.DataFrame(index=ntuple_df.index)

        # Take absolute value of mcPDG for binning because we have charge already

        binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()

        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()

        plot_values = {}

        for reco_pdg, mc_pdg in particle.pdg_binning:

            ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'

            if ntuple_df.query(ntuple_cut).empty:

                continue

            plot_values[(reco_pdg, mc_pdg)] = {}

            for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:

                labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])

                          for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]

                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(

                    ntuple_cut)[f'{particle.get_varname(var)}'],

                    particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)

                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),

                               f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),

                                                              var].str[0]

                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),

                               f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),

                                                              var].str[1]

                binning_df.drop(var, axis=1, inplace=True)

                if self.evaluate_plots:

                    values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']

                    if len(values.unique()) < 2:

                        print(f'Skip {var} for plotting!')

                        continue

                    x_range = np.linspace(values.min(), values.max(), self.nbins)

                    plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]

        # merge the weight table with the ntuple on binning columns

        weight_cols = _weight_cols

        if particle.column_names:

            weight_cols = particle.column_names

        binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],

                                      on=binning_df.columns.tolist(), how='left')

        binning_df.index = ntuple_df.index

        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)

        particle.plot_values = plot_values

        for col in weight_cols:

            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]

            ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)


    def add_pid_particle(self,

                         prefix: str,

                         weights_dict: dict,

                         pdg_pid_variable_dict: dict,

                         variable_aliases: dict = None,

                         sys_seed: int = None,

                         syscorr: bool = True) -> None:

        """

        Adds weight variations according to the total uncertainty for easier error propagation.


        Args:

            prefix (str): Prefix for the new columns.

            weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.

            pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.

            variable_aliases (dict): Dictionary containing variable aliases.

            sys_seed (int): Seed for the systematic variations.

            syscorr (bool): When true assume systematics are 100% correlated defaults to

        true. Note this is overridden by provision of a None value rho_sys

        """

        # Empty prefix means no prefix

        if prefix is None:

            prefix = ''

        # Add underscore if not present

        if prefix and not prefix.endswith('_'):

            prefix += '_'

        if self.get_particle(prefix):

            raise ValueError(f"Particle with prefix '{prefix}' already exists!")

        if variable_aliases is None:

            variable_aliases = {}

        merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)

        pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))

                       for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}

        particle = ReweighterParticle(prefix,

                                      type='PID',

                                      merged_table=merged_weight_df,

                                      pdg_binning=pdg_binning,

                                      variable_aliases=variable_aliases,

                                      weight_name=self.weight_name,

                                      sys_seed=sys_seed,

                                      syscorr=syscorr)

        self.particles += [particle]


    def get_particle(self, prefix: str) -> ReweighterParticle:

        """

        Get a particle by its prefix.

        """

        cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]

        if len(cands) == 0:

            return None

        return cands[0]


    def convert_fei_table(self, table: pd.DataFrame, threshold: float):

        """

        Checks if the tables are provided in a legacy format and converts them to the standard format.

        """

        result = None

        str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}

        if 'cal' in table.columns:

            result = pd.DataFrame(index=table.index)

            result['data_MC_ratio'] = table['cal']

            result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))

            # Assume these are only efficiency tables

            result['mcPDG'] = result['PDG']

            result['threshold'] = table['sig_prob_threshold']

            result[_fei_mode_col] = table[_fei_mode_col]

            result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']

            result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']

            result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']

            result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']

        elif 'cal factor' in table.columns:

            result = pd.DataFrame(index=table.index)

            result['data_MC_ratio'] = table['cal factor']

            result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))

            result['mcPDG'] = result['PDG']

            result['threshold'] = table['sig prob cut']

            # Assign the total error to the stat uncertainty and set syst. one to 0

            result['data_MC_uncertainty_stat_dn'] = table['error']

            result['data_MC_uncertainty_stat_up'] = table['error']

            result['data_MC_uncertainty_sys_dn'] = 0

            result['data_MC_uncertainty_sys_up'] = 0

            result[_fei_mode_col] = table['mode']

        else:

            result = table

        result = result.query(f'threshold == {threshold}')

        if len(result) == 0:

            raise ValueError(f'No weights found for threshold {threshold}!')

        return result


    def add_fei_particle(self, prefix: str,

                         table: pd.DataFrame,

                         threshold: float,

                         cov: np.ndarray = None,

                         variable_aliases: dict = None,

                         ) -> None:

        """

        Adds weight variations according to the total uncertainty for easier error propagation.


        Args:

            prefix (str): Prefix for the new columns.

            table (pandas.DataFrame): Dataframe containing the efficiency weights.

            threshold (float): Threshold for the efficiency weights.

            cov (numpy.ndarray): Covariance matrix for the efficiency weights.

            variable_aliases (dict): Dictionary containing variable aliases.

        """

        # Empty prefix means no prefix

        if prefix is None:

            prefix = ''

        if prefix and not prefix.endswith('_'):

            prefix += '_'

        if self.get_particle(prefix):

            raise ValueError(f"Particle with prefix '{prefix}' already exists!")

        if variable_aliases is None:

            variable_aliases = {}

        if table is None or len(table) == 0:

            raise ValueError('No weights provided!')

        converted_table = self.convert_fei_table(table, threshold)

        pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))

                       for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}

        particle = ReweighterParticle(prefix,

                                      type='FEI',

                                      merged_table=converted_table,

                                      pdg_binning=pdg_binning,

                                      variable_aliases=variable_aliases,

                                      weight_name=self.weight_name,

                                      cov=cov)

        self.particles += [particle]


    def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):

        """

        Adds weight columns according to the FEI calibration tables

        """

        rest_str = 'rest'

        particle.merged_table[_fei_mode_col]

        # Apply a weight value from the weight table to the ntuple, based on the binning

        binning_df = pd.DataFrame(index=ntuple_df.index)

        # Take absolute value of mcPDG for binning because we have charge already

        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()

        # Copy the mode ID from the ntuple

        binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)

        # Default value in case if reco PDG is not a B-meson PDG

        binning_df[_fei_mode_col] = np.nan

        plot_values = {}

        for reco_pdg, mc_pdg in particle.pdg_binning:

            plot_values[(reco_pdg, mc_pdg)] = {}

            binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(

                f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]

            for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:

                binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode

            if self.evaluate_plots:

                values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']

                x_range = np.linspace(values.min(), values.max(), int(values.max())+1)

                plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]


        # merge the weight table with the ntuple on binning columns

        weight_cols = _weight_cols

        if particle.column_names:

            weight_cols = particle.column_names

        binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],

                                      on=['PDG', _fei_mode_col], how='left')

        binning_df.index = ntuple_df.index

        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)

        particle.plot_values = plot_values

        for col in weight_cols:

            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]


    def reweight(self,

                 df: pd.DataFrame,

                 generate_variations: bool = True):

        """

        Reweights the dataframe according to the weight tables.


        Args:

            df (pandas.DataFrame): Dataframe containing the analysis ntuple.

            generate_variations (bool): When true generate weight variations.

        """

        for particle in self.particles:

            if particle.type not in _correction_types:

                raise ValueError(f'Particle type {particle.type} not supported!')

            print(f'Required variables: {self.get_ntuple_variables(df, particle)}')

            if generate_variations:

                particle.generate_variations(n_variations=self.n_variations)

            if particle.type == 'PID':

                self.add_pid_weight_columns(df, particle)

            elif particle.type == 'FEI':

                self.add_fei_weight_columns(df, particle)

        return df


    def print_coverage(self):

        """

        Prints the coverage of each particle.

        """

        print('Coverage:')

        for particle in self.particles:

            print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')


    def plot_coverage(self):

        """

        Plots the coverage of each particle.

        """

        for particle in self.particles:

            particle.plot_coverage()


def add_weights_to_dataframe(prefix: str,

                             df: pd.DataFrame,

                             systematic: str,

                             custom_tables: dict = None,

                             custom_thresholds: dict = None,

                             **kw_args) -> pd.DataFrame:

    """

    Helper method that adds weights to a dataframe.


    Args:

        prefix (str): Prefix for the new columns.

        df (pandas.DataFrame): Dataframe containing the analysis ntuple.

        systematic (str): Type  of the systematic corrections, options: "custom_PID" and "custom_FEI".

        MC_production (str): Name of the MC production.

        custom_tables (dict): Dictionary containing the custom efficiency weights.

        custom_thresholds (dict): Dictionary containing the custom thresholds for the  custom efficiency weights.

        n_variations (int): Number of variations to generate.

        generate_variations (bool): When true generate weight variations.

        weight_name (str): Name of the weight column.

        show_plots (bool): When true show the coverage plots.

        variable_aliases (dict): Dictionary containing variable aliases.

        cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.

        fillna (int): Value to fill NaN values with.

        sys_seed (int): Seed for the systematic variations.

        syscorr (bool): When true assume systematics are 100% correlated defaults to true.

        **kw_args: Additional arguments for the Reweighter class.

    """

    generate_variations = kw_args.get('generate_variations', True)

    n_variations = kw_args.get('n_variations', 100)

    weight_name = kw_args.get('weight_name', "Weight")

    fillna = kw_args.get('fillna', 1.0)

    # Catch performance warnings from pandas

    with warnings.catch_warnings():

        warnings.simplefilter("ignore", category=PerformanceWarning)

        reweighter = Reweighter(n_variations=n_variations,

                                weight_name=weight_name,

                                fillna=fillna)

        variable_aliases = kw_args.get('variable_aliases')

        if systematic.lower() == 'custom_fei':

            cov_matrix = kw_args.get('cov_matrix')

            reweighter.add_fei_particle(prefix=prefix,

                                        table=custom_tables,

                                        threshold=custom_thresholds,

                                        variable_aliases=variable_aliases,

                                        cov=cov_matrix

                                        )

        elif systematic.lower() == 'custom_pid':

            sys_seed = kw_args.get('sys_seed')

            syscorr = kw_args.get('syscorr')

            if syscorr is None:

                syscorr = True

            reweighter.add_pid_particle(prefix=prefix,

                                        weights_dict=custom_tables,

                                        pdg_pid_variable_dict=custom_thresholds,

                                        variable_aliases=variable_aliases,

                                        sys_seed=sys_seed,

                                        syscorr=syscorr

                                        )

        else:

            raise ValueError(f'Systematic {systematic} is not supported!')


        result = reweighter.reweight(df, generate_variations=generate_variations)

        if kw_args.get('show_plots'):

            reweighter.print_coverage()

            reweighter.plot_coverage()

        return result

sysvar.ReweighterParticle
Definition: sysvar.py:36

sysvar.ReweighterParticle.weight_name
str weight_name
Weight column name that will be added to the ntuple.
Definition: sysvar.py:56

sysvar.ReweighterParticle.type
str type
Type of the particle (PID or FEI)
Definition: sysvar.py:44

sysvar.ReweighterParticle.merged_table
pd merged_table
Merged table of the weights.
Definition: sysvar.py:47

sysvar.ReweighterParticle.pdg_binning
dict pdg_binning
Kinematic binning of the weight table per particle.
Definition: sysvar.py:50

sysvar.ReweighterParticle.cov
np cov
Covariance matrix corresponds to the total uncertainty.
Definition: sysvar.py:65

sysvar.ReweighterParticle.get_varname
str get_varname(self, str varname)
Definition: sysvar.py:76

sysvar.ReweighterParticle.prefix
str prefix
Prefix of the particle in the ntuple.
Definition: sysvar.py:41

sysvar.ReweighterParticle.sys_seed
int sys_seed
Random seed for systematics.
Definition: sysvar.py:62

sysvar.ReweighterParticle.get_pdg_variables
list get_pdg_variables(self)
Definition: sysvar.py:94

sysvar.ReweighterParticle.type
type
Add the mcPDG code requirement for PID particle.
Definition: sysvar.py:100

sysvar.ReweighterParticle.get_binning_variables
list get_binning_variables(self)
Definition: sysvar.py:87

sysvar.ReweighterParticle.plot_values
dict plot_values
Values for the plots.
Definition: sysvar.py:74

sysvar.ReweighterParticle.get_covariance
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:128

sysvar.ReweighterParticle.syscorr
bool syscorr
When true assume systematics are 100% correlated.
Definition: sysvar.py:68

sysvar.ReweighterParticle.generate_variations
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:107

sysvar.ReweighterParticle.plot_coverage
def plot_coverage(self, fig=None, axs=None)
Definition: sysvar.py:170

sysvar.ReweighterParticle.__str__
str __str__(self)
Definition: sysvar.py:156

sysvar.ReweighterParticle.column_names
list column_names
Internal list of the names of the weight columns.
Definition: sysvar.py:59

sysvar.ReweighterParticle.variable_aliases
dict variable_aliases
Variable aliases of the weight table.
Definition: sysvar.py:53

sysvar.ReweighterParticle.column_names
column_names
Names of the varied weight columns.
Definition: sysvar.py:118

sysvar.Reweighter
Definition: sysvar.py:224

sysvar.Reweighter.n_variations
n_variations
Number of weight variations to generate.
Definition: sysvar.py:245

sysvar.Reweighter.merge_pid_weight_tables
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition: sysvar.py:308

sysvar.Reweighter.nbins
nbins
Number of bins for the plots.
Definition: sysvar.py:257

sysvar.Reweighter.add_pid_particle
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition: sysvar.py:404

sysvar.Reweighter.__init__
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition: sysvar.py:240

sysvar.Reweighter.add_pid_weight_columns
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:346

sysvar.Reweighter.weight_name
weight_name
Name of the weight column.
Definition: sysvar.py:251

sysvar.Reweighter.add_fei_weight_columns
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:525

sysvar.Reweighter.get_ntuple_variables
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:291

sysvar.Reweighter.fillna
fillna
Value to fill NaN values.
Definition: sysvar.py:259

sysvar.Reweighter.weights_generated
weights_generated
Flag to indicate if the weights have been generated.
Definition: sysvar.py:253

sysvar.Reweighter.get_fei_binning
dict get_fei_binning(self, weight_df)
Definition: sysvar.py:282

sysvar.Reweighter.reweight
def reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition: sysvar.py:565

sysvar.Reweighter.convert_fei_table
def convert_fei_table(self, pd.DataFrame table, float threshold)
Definition: sysvar.py:449

sysvar.Reweighter.print_coverage
def print_coverage(self)
Definition: sysvar.py:585

sysvar.Reweighter.get_binning
dict get_binning(self, weight_df)
Definition: sysvar.py:267

sysvar.Reweighter.get_bin_columns
list get_bin_columns(self, weight_df)
Definition: sysvar.py:261

sysvar.Reweighter.correlations
correlations
Correlations between the particles.
Definition: sysvar.py:249

sysvar.Reweighter.add_fei_particle
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition: sysvar.py:491

sysvar.Reweighter.get_particle
ReweighterParticle get_particle(self, str prefix)
Definition: sysvar.py:440

sysvar.Reweighter.particles
particles
List of particles.
Definition: sysvar.py:247

sysvar.Reweighter.evaluate_plots
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition: sysvar.py:255

sysvar.Reweighter.plot_coverage
def plot_coverage(self)
Definition: sysvar.py:593