Public Member Functions
None	__init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)

list	get_bin_columns (self, weight_df)

dict	get_binning (self, weight_df)

dict	get_fei_binning (self, weight_df)

None	get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

pd.DataFrame	merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)

None	add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

None	add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)

ReweighterParticle	get_particle (self, str prefix)

def	convert_fei_table (self, pd.DataFrame table, float threshold)

None	add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)

def	add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

def	reweight (self, pd.DataFrame df, bool generate_variations=True)

def	print_coverage (self)

def	plot_coverage (self)

Public Attributes
	n_variations
	Number of weight variations to generate.

	particles
	List of particles.

	correlations
	Correlations between the particles.

	weight_name
	Name of the weight column.

	weights_generated
	Flag to indicate if the weights have been generated.

	evaluate_plots
	Flag to indicate if the plots should be evaluated.

	nbins
	Number of bins for the plots.

	fillna
	Value to fill NaN values.

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value to fill NaN values.

Definition at line 224 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__	(		self,
		int	n_variations = `100`,
		str	weight_name = `"Weight"`,
		bool	evaluate_plots = `True`,
		int	nbins = `50`,
		float	fillna = `1.0`
	)

Initializes the Reweighter class.

Definition at line 235 of file sysvar.py.

                 fillna: float = 1.0) -> None:
        """
        Initializes the Reweighter class.
        """
        
        self.n_variations = n_variations
        
        self.particles = []
        
        self.correlations = []
        
        self.weight_name = weight_name
        
        self.weights_generated = False
        
        self.evaluate_plots = evaluate_plots
        
        self.nbins = nbins
        
        self.fillna = fillna

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle	(		self,
		str	prefix,
		pd.DataFrame	table,
		float	threshold,
		np.ndarray	cov = `None`,
		dict	variable_aliases = `None`
	)

Adds weight variations according to the total uncertainty for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 486 of file sysvar.py.

                         ) -> None:
        """
        Adds weight variations according to the total uncertainty for easier error propagation.
 
        Args:
            prefix (str): Prefix for the new columns.
            table (pandas.DataFrame): Dataframe containing the efficiency weights.
            threshold (float): Threshold for the efficiency weights.
            cov (numpy.ndarray): Covariance matrix for the efficiency weights.
            variable_aliases (dict): Dictionary containing variable aliases.
        """
        # Empty prefix means no prefix
        if prefix is None:
            prefix = ''
        if prefix and not prefix.endswith('_'):
            prefix += '_'
        if self.get_particle(prefix):
            raise ValueError(f"Particle with prefix '{prefix}' already exists!")
        if variable_aliases is None:
            variable_aliases = {}
        if table is None or len(table) == 0:
            raise ValueError('No weights provided!')
        converted_table = self.convert_fei_table(table, threshold)
        pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
                       for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
        particle = ReweighterParticle(prefix,
                                      type='FEI',
                                      merged_table=converted_table,
                                      pdg_binning=pdg_binning,
                                      variable_aliases=variable_aliases,
                                      weight_name=self.weight_name,
                                      cov=cov)
        self.particles += [particle]
 

◆ add_fei_weight_columns()

def add_fei_weight_columns	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle
	)

Adds weight columns according to the FEI calibration tables

Definition at line 525 of file sysvar.py.

    def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
        """
        Adds weight columns according to the FEI calibration tables.

        Every ntuple row is given a decay-mode label: first the label of the
        table's "rest" row for its reconstructed PDG code, then, where the
        numeric mode ID of the row matches one of the modes listed in
        ``particle.pdg_binning``, that mode's label. The weight columns are then
        looked up in ``particle.merged_table`` by (PDG, mode label) and written
        to ``ntuple_df`` in place. ``particle.coverage`` and
        ``particle.plot_values`` are updated as a side effect.

        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            particle (ReweighterParticle): Particle object holding the FEI weight table.
        """
        rest_str = 'rest'
        # Apply a weight value from the weight table to the ntuple, based on the binning
        binning_df = pd.DataFrame(index=ntuple_df.index)
        # Take absolute value of the reconstructed PDG for binning
        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
        # Copy the mode ID from the ntuple
        mode_varname = particle.get_varname(_fei_mode_col)
        binning_df['num_mode'] = ntuple_df[mode_varname].astype(int)
        # Default value in case the reco PDG is not covered by the table.
        # The column is created with object dtype because string labels are
        # written into it below; writing strings into a float column relies on
        # silent upcasting, which pandas has deprecated.
        binning_df[_fei_mode_col] = np.nan
        binning_df[_fei_mode_col] = binning_df[_fei_mode_col].astype(object)

        # The mode-ID histogram is built from the whole ntuple and therefore is
        # identical for every PDG pair: compute it once instead of per pair.
        mode_hist = None
        if self.evaluate_plots and particle.pdg_binning:
            values = ntuple_df[f'{mode_varname}']
            x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
            mode_hist = x_range, np.histogram(values, bins=x_range, density=True)[0]

        plot_values = {}
        for reco_pdg, mc_pdg in particle.pdg_binning:
            plot_values[(reco_pdg, mc_pdg)] = {}
            pdg_mask = binning_df['PDG'] == reco_pdg
            # Start from the label of the "rest" row for this PDG code ...
            binning_df.loc[pdg_mask, _fei_mode_col] = particle.merged_table.query(
                f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
            # ... and overwrite it for rows whose mode ID has a dedicated entry.
            # The numeric ID is whatever follows the first four characters of the label.
            for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
                binning_df.loc[pdg_mask & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
            if mode_hist is not None:
                plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = mode_hist

        # merge the weight table with the ntuple on binning columns
        weight_cols = _weight_cols
        if particle.column_names:
            weight_cols = particle.column_names
        binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
                                      on=['PDG', _fei_mode_col], how='left')
        # merge() resets the index, restore the ntuple index so the columns align on assignment
        binning_df.index = ntuple_df.index
        # Fraction of ntuple rows for which a weight was found
        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
        particle.plot_values = plot_values
        for col in weight_cols:
            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
 

◆ add_pid_particle()

None add_pid_particle	(		self,
		str	prefix,
		dict	weights_dict,
		dict	pdg_pid_variable_dict,
		dict	variable_aliases = `None`,
		int	sys_seed = `None`,
		bool	syscorr = `True`
	)

Adds weight variations according to the total uncertainty for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary containing the weight tables, keyed by (reconstructed PDG, MC PDG).
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume systematics are 100% correlated; defaults to
true. Note this is overridden by provision of a None value rho_sys

Definition at line 398 of file sysvar.py.

                         syscorr: bool = True) -> None:
        """
        Adds weight variations according to the total uncertainty for easier error propagation.
 
        Args:
            prefix (str): Prefix for the new columns.
            weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
            pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
            variable_aliases (dict): Dictionary containing variable aliases.
            sys_seed (int): Seed for the systematic variations.
            syscorr (bool): When true assume systematics are 100% correlated defaults to
        true. Note this is overridden by provision of a None value rho_sys
        """
        # Empty prefix means no prefix
        if prefix is None:
            prefix = ''
        # Add underscore if not present
        if prefix and not prefix.endswith('_'):
            prefix += '_'
        if self.get_particle(prefix):
            raise ValueError(f"Particle with prefix '{prefix}' already exists!")
        if variable_aliases is None:
            variable_aliases = {}
        merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
        pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
                       for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
        particle = ReweighterParticle(prefix,
                                      type='PID',
                                      merged_table=merged_weight_df,
                                      pdg_binning=pdg_binning,
                                      variable_aliases=variable_aliases,
                                      weight_name=self.weight_name,
                                      sys_seed=sys_seed,
                                      syscorr=syscorr)
        self.particles += [particle]
 

◆ add_pid_weight_columns()

None add_pid_weight_columns	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle
	)

Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 344 of file sysvar.py.

                               particle: ReweighterParticle) -> None:
        """
        Adds a weight and uncertainty columns to the dataframe.
 
        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            particle (ReweighterParticle): Particle object.
        """
        # Apply a weight value from the weight table to the ntuple, based on the binning
        binning_df = pd.DataFrame(index=ntuple_df.index)
        # Take absolute value of mcPDG for binning because we have charge already
        binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
        plot_values = {}
        for reco_pdg, mc_pdg in particle.pdg_binning:
            ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
            if ntuple_df.query(ntuple_cut).empty:
                continue
            plot_values[(reco_pdg, mc_pdg)] = {}
            for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
                labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
                          for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
                    ntuple_cut)[f'{particle.get_varname(var)}'],
                    particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                               f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                                                              var].str[0]
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                               f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                                                              var].str[1]
                binning_df.drop(var, axis=1, inplace=True)
                if self.evaluate_plots:
                    values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
                    if len(values.unique()) < 2:
                        print(f'Skip {var} for plotting!')
                        continue
                    x_range = np.linspace(values.min(), values.max(), self.nbins)
                    plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
        # merge the weight table with the ntuple on binning columns
        weight_cols = _weight_cols
        if particle.column_names:
            weight_cols = particle.column_names
        binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
                                      on=binning_df.columns.tolist(), how='left')
        binning_df.index = ntuple_df.index
        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
        particle.plot_values = plot_values
        for col in weight_cols:
            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
            ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
 

◆ convert_fei_table()

def convert_fei_table	(		self,
		pd.DataFrame	table,
		float	threshold
	)

Checks if the tables are provided in a legacy format and converts them to the standard format.

Definition at line 449 of file sysvar.py.

    def convert_fei_table(self, table: pd.DataFrame, threshold: float):
        """
        Brings an FEI calibration table into the standard format and selects one threshold.

        Two legacy layouts are recognised by the presence of a 'cal' or a
        'cal factor' column and are translated column by column. Any other table
        is used as it is.

        Args:
            table (pandas.DataFrame): FEI calibration table.
            threshold (float): Value the 'threshold' column of the kept rows must equal.

        Returns:
            pandas.DataFrame: Rows of the standard-format table matching the threshold.

        Raises:
            ValueError: If no row matches the requested threshold.
        """
        # Both charged B labels map to the same (absolute) PDG code
        pdg_by_label = {'B+': 521, 'B-': 521, 'B0': 511}
        table_columns = table.columns
        if 'cal' in table_columns:
            standard = pd.DataFrame(index=table.index)
            standard['data_MC_ratio'] = table['cal']
            standard['PDG'] = table['Btag'].apply(lambda label: pdg_by_label.get(label))
            # Assume these are only efficiency tables
            standard['mcPDG'] = standard['PDG']
            standard['threshold'] = table['sig_prob_threshold']
            standard[_fei_mode_col] = table[_fei_mode_col]
            # Symmetric uncertainties: the same error is used for down and up
            stat_error = table['cal_stat_error']
            standard['data_MC_uncertainty_stat_dn'] = stat_error
            standard['data_MC_uncertainty_stat_up'] = stat_error
            sys_error = table['cal_sys_error']
            standard['data_MC_uncertainty_sys_dn'] = sys_error
            standard['data_MC_uncertainty_sys_up'] = sys_error
        elif 'cal factor' in table_columns:
            standard = pd.DataFrame(index=table.index)
            standard['data_MC_ratio'] = table['cal factor']
            standard['PDG'] = table['Btype'].apply(lambda label: pdg_by_label.get(label))
            standard['mcPDG'] = standard['PDG']
            standard['threshold'] = table['sig prob cut']
            # Assign the total error to the stat uncertainty and set syst. one to 0
            total_error = table['error']
            standard['data_MC_uncertainty_stat_dn'] = total_error
            standard['data_MC_uncertainty_stat_up'] = total_error
            standard['data_MC_uncertainty_sys_dn'] = 0
            standard['data_MC_uncertainty_sys_up'] = 0
            standard[_fei_mode_col] = table['mode']
        else:
            standard = table
        selected = standard.query(f'threshold == {threshold}')
        if not len(selected):
            raise ValueError(f'No weights found for threshold {threshold}!')
        return selected
 

◆ get_bin_columns()

list get_bin_columns	(	self,
		weight_df
	)

Returns the kinematic bin columns of the dataframe.

Definition at line 261 of file sysvar.py.

    def get_bin_columns(self, weight_df) -> list:
        """
        Lists the columns of the dataframe that hold bin edges.

        A column counts as a bin-edge column when its name ends in '_min' or
        '_max'. The order of the dataframe columns is kept.
        """
        edge_suffixes = ('_min', '_max')
        return [name for name in weight_df.columns if name.endswith(edge_suffixes)]
 

◆ get_binning()

dict get_binning	(	self,
		weight_df
	)

Returns the kinematic binning of the dataframe.

Definition at line 267 of file sysvar.py.

    def get_binning(self, weight_df) -> dict:
        """
        Returns the kinematic binning of the dataframe.

        For every variable ``var`` that has a ``var_min`` and/or ``var_max``
        column, the distinct values found in those columns are returned as a
        sorted array of bin edges.

        Args:
            weight_df (pandas.DataFrame): Weight table with ``*_min`` / ``*_max`` columns.

        Returns:
            dict: Maps each variable name to a numpy array of its sorted, unique bin edges.
        """
        columns = self.get_bin_columns(weight_df)
        # Group the edge columns by their exact variable name. Matching on a
        # name prefix would mix up variables such as 'p' and 'pt', because
        # 'pt_min' also starts with 'p'.
        columns_by_var = {}
        for col in columns:
            columns_by_var.setdefault(col.rsplit('_', 1)[0], []).append(col)
        bin_dict = {}
        for var_name, var_columns in columns_by_var.items():
            edges = set()
            for col in var_columns:
                edges.update(weight_df[col].values)
            bin_dict[var_name] = np.array(sorted(edges))
        return bin_dict
 

◆ get_fei_binning()

dict get_fei_binning	(	self,
		weight_df
	)

Returns the irregular binning of the dataframe.

Definition at line 282 of file sysvar.py.

    def get_fei_binning(self, weight_df) -> dict:
        """
        Returns the irregular (per decay mode) binning of the dataframe.

        Only labels starting with 'mode' are kept. The result is a dictionary
        with the mode column name as its single key and the list of distinct
        labels, in ``value_counts`` order, as its value.
        """
        mode_labels = weight_df[_fei_mode_col]
        is_named_mode = mode_labels.str.startswith('mode')
        distinct_modes = weight_df.loc[is_named_mode, _fei_mode_col].value_counts().index
        return {_fei_mode_col: distinct_modes.to_list()}
 

◆ get_ntuple_variables()

None get_ntuple_variables	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle
	)

Checks if the variables are in the ntuple and returns them.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 289 of file sysvar.py.

                             particle: ReweighterParticle) -> None:
        """
        Checks if the variables are in the ntuple and returns them.
 
        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            particle (ReweighterParticle): Particle object containing the necessary variables.
        """
        ntuple_variables = particle.get_binning_variables()
        ntuple_variables += particle.get_pdg_variables()
        for var in ntuple_variables:
            if var not in ntuple_df.columns:
                raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
        return ntuple_variables
 

◆ get_particle()

ReweighterParticle get_particle	(		self,
		str	prefix
	)

Get a particle by its prefix.

Definition at line 440 of file sysvar.py.

    def get_particle(self, prefix: str) -> ReweighterParticle:
        """
        Looks up a registered particle by its prefix.

        Prefixes are compared with leading and trailing underscores removed.
        The first matching particle is returned, or None if there is no match.
        """
        for candidate in self.particles:
            if candidate.prefix.strip('_') == prefix.strip('_'):
                return candidate
        return None
 

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables	(		self,
		dict	weights_dict,
		dict	pdg_pid_variable_dict
	)

Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 306 of file sysvar.py.

                                pdg_pid_variable_dict: dict) -> pd.DataFrame:
        """
        Merges the efficiency and fake rate weight tables.
 
        Args:
            weights_dict (dict): Dictionary containing the weight tables.
            pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
        """
        weight_dfs = []
        for reco_pdg, mc_pdg in weights_dict:
            if reco_pdg not in pdg_pid_variable_dict:
                raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
            weight_df = weights_dict[(reco_pdg, mc_pdg)]
            weight_df['mcPDG'] = mc_pdg
            weight_df['PDG'] = reco_pdg
            # Check if these are legacy tables:
            if 'charge' in weight_df.columns:
                charge_dict = {'+': [0, 2], '-': [-2, 0]}
                weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
                weight_df = weight_df.drop(columns=['charge'])
                # If iso_score is a single value, drop the min and max columns
                if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
                    weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
            pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
            threshold = pdg_pid_variable_dict[reco_pdg][1]
            selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
            if len(selected_weights) == 0:
                available_variables = weight_df['variable'].unique()
                available_thresholds = weight_df['threshold'].unique()
                raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
                                 f' variable {pid_variable_name} and threshold {threshold}!\n'
                                 f' Available variables: {available_variables}\n'
                                 f' Available thresholds: {available_thresholds}')
            weight_dfs.append(selected_weights)
        return pd.concat(weight_dfs, ignore_index=True)
 

◆ plot_coverage()

def plot_coverage ( self )

Plots the coverage of each particle.

Definition at line 593 of file sysvar.py.

    def plot_coverage(self):
        """
        Calls ``plot_coverage()`` on every registered particle, in registration order.
        """
        for entry in self.particles:
            entry.plot_coverage()
 
 

◆ print_coverage()

def print_coverage ( self )

Prints the coverage of each particle.

Definition at line 585 of file sysvar.py.

    def print_coverage(self):
        """
        Writes a coverage summary to standard output.

        A 'Coverage:' header is followed by one line per registered particle
        with its type, its prefix without surrounding underscores and its
        coverage in percent, rounded to one decimal place.
        """
        print('Coverage:')
        for entry in self.particles:
            kind = entry.type
            label = entry.prefix.strip("_")
            percent = entry.coverage*100
            print(f'{kind} {label}: {percent :0.1f}%')
 

◆ reweight()

def reweight	(		self,
		pd.DataFrame	df,
		bool	generate_variations = `True`
	)

Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 563 of file sysvar.py.

                 generate_variations: bool = True):
        """
        Reweights the dataframe according to the weight tables.
 
        Args:
            df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            generate_variations (bool): When true generate weight variations.
        """
        for particle in self.particles:
            if particle.type not in _correction_types:
                raise ValueError(f'Particle type {particle.type} not supported!')
            print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
            if generate_variations:
                particle.generate_variations(n_variations=self.n_variations)
            if particle.type == 'PID':
                self.add_pid_weight_columns(df, particle)
            elif particle.type == 'FEI':
                self.add_fei_weight_columns(df, particle)
        return df
 

Member Data Documentation

◆ correlations

correlations

Correlations between the particles.

Definition at line 249 of file sysvar.py.

◆ evaluate_plots

evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 255 of file sysvar.py.

◆ fillna

fillna

Value to fill NaN values.

Definition at line 259 of file sysvar.py.

◆ n_variations

n_variations

Number of weight variations to generate.

Definition at line 245 of file sysvar.py.

◆ nbins

nbins

Number of bins for the plots.

Definition at line 257 of file sysvar.py.

◆ particles

particles

List of particles.

Definition at line 247 of file sysvar.py.

◆ weight_name

weight_name

Name of the weight column.

Definition at line 251 of file sysvar.py.

◆ weights_generated

weights_generated

Flag to indicate if the weights have been generated.

Definition at line 253 of file sysvar.py.

The documentation for this class was generated from the following file:

analysis/scripts/sysvar.py

Public Member Functions

Public Attributes