Public Member Functions
None	__init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)

list	get_bin_columns (self, weight_df)

dict	get_binning (self, weight_df)

dict	get_fei_binning (self, weight_df)

None	get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

pd.DataFrame	merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)

None	add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

None	add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)

ReweighterParticle	get_particle (self, str prefix)

	convert_fei_table (self, pd.DataFrame table, float threshold)

None	add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)

	add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)

	reweight (self, pd.DataFrame df, bool generate_variations=True)

	print_coverage (self)

	plot_coverage (self)

Public Attributes
	n_variations = n_variations
	Number of weight variations to generate.

list	particles = []
	List of particles.

list	correlations = []
	Correlations between the particles.

	weight_name = weight_name
	Name of the weight column.

bool	weights_generated = False
	Flag to indicate if the weights have been generated.

	evaluate_plots = evaluate_plots
	Flag to indicate if the plots should be evaluated.

	nbins = nbins
	Number of bins for the plots.

	fillna = fillna
	Value to fill NaN values.

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value to fill NaN values.

Definition at line 228 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__	(		self,
		int	n_variations = 100,
		str	weight_name = "Weight",
		bool	evaluate_plots = True,
		int	nbins = 50,
		float	fillna = 1.0 )

Initializes the Reweighter class.

Definition at line 239 of file sysvar.py.

                 fillna: float = 1.0) -> None:
        """
        Initializes the Reweighter class.
        """
        
        self.n_variations = n_variations
        
        self.particles = []
        
        self.correlations = []
        
        self.weight_name = weight_name
        
        self.weights_generated = False
        
        self.evaluate_plots = evaluate_plots
        
        self.nbins = nbins
        
        self.fillna = fillna

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle	(		self,
		str	prefix,
		pd.DataFrame	table,
		float	threshold,
		np.ndarray	cov = None,
		dict	variable_aliases = None )

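Adds an FEI particle to the reweighter, for which weight variations are generated according to the total uncertainty for easier error propagation. The calibration table is converted to the standard format for the given threshold, the binning is determined for each (PDG, mcPDG) pair, and the resulting particle is appended to the list of particles.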

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 490 of file sysvar.py.

                         ) -> None:
        """
        Adds weight variations according to the total uncertainty for easier error propagation.
 
        Args:
            prefix (str): Prefix for the new columns.
            table (pandas.DataFrame): Dataframe containing the efficiency weights.
            threshold (float): Threshold for the efficiency weights.
            cov (numpy.ndarray): Covariance matrix for the efficiency weights.
            variable_aliases (dict): Dictionary containing variable aliases.
        """
        # Empty prefix means no prefix
        if prefix is None:
            prefix = ''
        if prefix and not prefix.endswith('_'):
            prefix += '_'
        if self.get_particle(prefix):
            raise ValueError(f"Particle with prefix '{prefix}' already exists!")
        if variable_aliases is None:
            variable_aliases = {}
        if table is None or len(table) == 0:
            raise ValueError('No weights provided!')
        converted_table = self.convert_fei_table(table, threshold)
        pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
                       for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
        particle = ReweighterParticle(prefix,
                                      type='FEI',
                                      merged_table=converted_table,
                                      pdg_binning=pdg_binning,
                                      variable_aliases=variable_aliases,
                                      weight_name=self.weight_name,
                                      cov=cov)
        self.particles += [particle]
 

◆ add_fei_weight_columns()

add_fei_weight_columns	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle )

Adds weight columns according to the FEI calibration tables

Definition at line 529 of file sysvar.py.

    def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
        """
        Adds weight columns according to the FEI calibration tables

        Every ntuple row is assigned an FEI mode label (a specific mode or the
        "rest" category of its reconstructed PDG) and the weight columns of the
        matching row of the particle's merged table are copied into the ntuple.
        The particle's coverage and plot values are updated as a side effect.

        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple; modified in place.
            particle (ReweighterParticle): Particle object holding the FEI table and binning.
        """
        rest_str = 'rest'
        # NOTE(review): this bare lookup discards its result; its only visible effect is
        # to raise if the mode column is missing from the table -- confirm this is intended.
        particle.merged_table[_fei_mode_col]
        # Apply a weight value from the weight table to the ntuple, based on the binning
        binning_df = pd.DataFrame(index=ntuple_df.index)
        # Take the absolute value of the reconstructed PDG for binning
        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
        # Copy the mode ID from the ntuple
        binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
        # Default value in case if reco PDG is not a B-meson PDG
        binning_df[_fei_mode_col] = np.nan
        # Maps (reco_pdg, mc_pdg) -> {variable: (bin edges, normalised histogram)}
        plot_values = {}
        for reco_pdg, mc_pdg in particle.pdg_binning:
            plot_values[(reco_pdg, mc_pdg)] = {}
            # First label every row of this reco PDG with the table's "rest" entry
            # (matched case-insensitively); .values[0] fails if the table has no such row.
            binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
                f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
            # Then overwrite the label for rows whose numeric mode ID has a dedicated entry;
            # mode[4:] drops the first four characters of the label to obtain the numeric ID.
            for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
                binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
            if self.evaluate_plots:
                # The histogram uses the mode IDs of the whole ntuple (no PDG selection is applied here)
                values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
                x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
                plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]

        # merge the weight table with the ntuple on binning columns
        weight_cols = _weight_cols
        # Prefer the particle's own column names when it provides any
        if particle.column_names:
            weight_cols = particle.column_names
        binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
                                      on=['PDG', _fei_mode_col], how='left')
        # The merge result carries a fresh index, so restore the ntuple index before copying columns
        binning_df.index = ntuple_df.index
        # Coverage is the fraction of rows for which the first weight column is not NaN
        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
        particle.plot_values = plot_values
        for col in weight_cols:
            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
 

◆ add_pid_particle()

None add_pid_particle	(		self,
		str	prefix,
		dict	weights_dict,
		dict	pdg_pid_variable_dict,
		dict	variable_aliases = None,
		int	sys_seed = None,
		bool	syscorr = True )

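Adds a PID particle to the reweighter, for which weight variations are generated according to the total uncertainty for easier error propagation. The weight tables are merged, the kinematic binning is determined for each (PDG, mcPDG) pair, and the resulting particle is appended to the list of particles.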

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary mapping (reconstructed PDG, MC PDG) pairs to the dataframes containing the efficiency weights.
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume the systematics are 100% correlated; defaults to true. Note that this is overridden by provision of a None value rho_sys.

Definition at line 402 of file sysvar.py.

                         syscorr: bool = True) -> None:
        """
        Adds weight variations according to the total uncertainty for easier error propagation.
 
        Args:
            prefix (str): Prefix for the new columns.
            weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
            pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
            variable_aliases (dict): Dictionary containing variable aliases.
            sys_seed (int): Seed for the systematic variations.
            syscorr (bool): When true assume systematics are 100% correlated defaults to
        true. Note this is overridden by provision of a None value rho_sys
        """
        # Empty prefix means no prefix
        if prefix is None:
            prefix = ''
        # Add underscore if not present
        if prefix and not prefix.endswith('_'):
            prefix += '_'
        if self.get_particle(prefix):
            raise ValueError(f"Particle with prefix '{prefix}' already exists!")
        if variable_aliases is None:
            variable_aliases = {}
        merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
        pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
                       for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
        particle = ReweighterParticle(prefix,
                                      type='PID',
                                      merged_table=merged_weight_df,
                                      pdg_binning=pdg_binning,
                                      variable_aliases=variable_aliases,
                                      weight_name=self.weight_name,
                                      sys_seed=sys_seed,
                                      syscorr=syscorr)
        self.particles += [particle]
 

◆ add_pid_weight_columns()

None add_pid_weight_columns	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle )

Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 348 of file sysvar.py.

                               particle: ReweighterParticle) -> None:
        """
        Adds a weight and uncertainty columns to the dataframe.
 
        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            particle (ReweighterParticle): Particle object.
        """
        # Apply a weight value from the weight table to the ntuple, based on the binning
        binning_df = pd.DataFrame(index=ntuple_df.index)
        # Take absolute value of mcPDG for binning because we have charge already
        binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
        binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
        plot_values = {}
        for reco_pdg, mc_pdg in particle.pdg_binning:
            ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
            if ntuple_df.query(ntuple_cut).empty:
                continue
            plot_values[(reco_pdg, mc_pdg)] = {}
            for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
                labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
                          for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
                    ntuple_cut)[f'{particle.get_varname(var)}'],
                    particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                               f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                                                              var].str[0]
                binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                               f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
                                                              var].str[1]
                binning_df.drop(var, axis=1, inplace=True)
                if self.evaluate_plots:
                    values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
                    if len(values.unique()) < 2:
                        print(f'Skip {var} for plotting!')
                        continue
                    x_range = np.linspace(values.min(), values.max(), self.nbins)
                    plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
        # merge the weight table with the ntuple on binning columns
        weight_cols = _weight_cols
        if particle.column_names:
            weight_cols = particle.column_names
        binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
                                      on=binning_df.columns.tolist(), how='left')
        binning_df.index = ntuple_df.index
        particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
        particle.plot_values = plot_values
        for col in weight_cols:
            ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
            ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
 

◆ convert_fei_table()

convert_fei_table	(		self,
		pd.DataFrame	table,
		float	threshold )

Checks if the tables are provided in a legacy format and converts them to the standard format.

Definition at line 453 of file sysvar.py.

    def convert_fei_table(self, table: pd.DataFrame, threshold: float):
        """
        Brings an FEI calibration table into the standard column layout and selects one threshold.

        Two legacy layouts are recognised by the name of their calibration factor
        column ('cal' or 'cal factor'). Any other table is taken to be in the
        standard format already and is used as provided.

        Args:
            table (pandas.DataFrame): FEI calibration table in legacy or standard format.
            threshold (float): Value of the 'threshold' column whose rows are kept.

        Returns:
            pandas.DataFrame: Rows of the standard-format table that match the threshold.

        Raises:
            ValueError: If no row matches the requested threshold.
        """
        b_pdg_codes = {'B+': 521, 'B-': 521, 'B0': 511}
        column_names = table.columns
        if 'cal' in column_names:
            standard = pd.DataFrame(index=table.index)
            standard['data_MC_ratio'] = table['cal']
            standard['PDG'] = table['Btag'].apply(lambda label: b_pdg_codes.get(label))
            # These tables are assumed to contain efficiency corrections only
            standard['mcPDG'] = standard['PDG']
            standard['threshold'] = table['sig_prob_threshold']
            standard[_fei_mode_col] = table[_fei_mode_col]
            # Symmetric uncertainties: the same error is used for the down and up columns
            error_sources = (
                ('data_MC_uncertainty_stat_dn', 'cal_stat_error'),
                ('data_MC_uncertainty_stat_up', 'cal_stat_error'),
                ('data_MC_uncertainty_sys_dn', 'cal_sys_error'),
                ('data_MC_uncertainty_sys_up', 'cal_sys_error'),
            )
            for target, source in error_sources:
                standard[target] = table[source]
        elif 'cal factor' in column_names:
            standard = pd.DataFrame(index=table.index)
            standard['data_MC_ratio'] = table['cal factor']
            standard['PDG'] = table['Btype'].apply(lambda label: b_pdg_codes.get(label))
            standard['mcPDG'] = standard['PDG']
            standard['threshold'] = table['sig prob cut']
            # The total error goes to the statistical uncertainty, the systematic one is set to 0
            for stat_column in ('data_MC_uncertainty_stat_dn', 'data_MC_uncertainty_stat_up'):
                standard[stat_column] = table['error']
            for sys_column in ('data_MC_uncertainty_sys_dn', 'data_MC_uncertainty_sys_up'):
                standard[sys_column] = 0
            standard[_fei_mode_col] = table['mode']
        else:
            standard = table
        selected = standard.query(f'threshold == {threshold}')
        if not len(selected):
            raise ValueError(f'No weights found for threshold {threshold}!')
        return selected
 

◆ get_bin_columns()

list get_bin_columns	(		self,
			weight_df )

Returns the kinematic bin columns of the dataframe.

Definition at line 265 of file sysvar.py.

    def get_bin_columns(self, weight_df) -> list:
        """
        Lists the columns of the dataframe that hold kinematic bin edges.

        A column counts as a bin column when its name ends with '_min' or '_max'.
        The order of the dataframe columns is kept.
        """
        bin_suffixes = ('_min', '_max')
        return [name for name in weight_df.columns if name.endswith(bin_suffixes)]
 

◆ get_binning()

dict get_binning	(		self,
			weight_df )

Returns the kinematic binning of the dataframe.

Definition at line 271 of file sysvar.py.

    def get_binning(self, weight_df) -> dict:
        """
        Returns the kinematic binning of the dataframe.

        The bin columns ('<variable>_min' and '<variable>_max') are grouped by
        their variable name, and for every variable the sorted unique bin edges
        found in its columns are collected.

        Columns are assigned to a variable by exact name (the column name with its
        trailing '_min' or '_max' removed) and not by prefix, so that the edges of
        e.g. 'pt_min' do not leak into the binning of a variable called 'p'.

        Args:
            weight_df (pandas.DataFrame): Weight table containing the bin columns.

        Returns:
            dict: Maps each variable name to a numpy array of its sorted unique bin edges.
        """
        edges_by_variable = {}
        for col in self.get_bin_columns(weight_df):
            # Bin columns end with '_min' or '_max', so dropping the last
            # underscore-separated token leaves the variable name
            var_name = col.rsplit('_', 1)[0]
            edges_by_variable.setdefault(var_name, set()).update(weight_df[col].values)
        return {var_name: np.array(sorted(edges)) for var_name, edges in edges_by_variable.items()}
 

◆ get_fei_binning()

dict get_fei_binning	(		self,
			weight_df )

Returns the irregular binning of the dataframe.

Definition at line 286 of file sysvar.py.

    def get_fei_binning(self, weight_df) -> dict:
        """
        Collects the FEI modes that have their own entry in the weight table.

        Only entries of the mode column that start with 'mode' are kept. The
        result is a dictionary with the mode column name as its single key and
        the list of distinct mode labels (in value_counts order) as its value.
        """
        mode_labels = weight_df[_fei_mode_col]
        is_explicit_mode = mode_labels.str.startswith('mode')
        distinct_modes = weight_df.loc[is_explicit_mode, _fei_mode_col].value_counts().index.to_list()
        return {_fei_mode_col: distinct_modes}
 

◆ get_ntuple_variables()

None get_ntuple_variables	(		self,
		pd.DataFrame	ntuple_df,
		ReweighterParticle	particle )

Checks if the variables are in the ntuple and returns them.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 293 of file sysvar.py.

                             particle: ReweighterParticle) -> None:
        """
        Checks if the variables are in the ntuple and returns them.
 
        Args:
            ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            particle (ReweighterParticle): Particle object containing the necessary variables.
        """
        ntuple_variables = particle.get_binning_variables()
        ntuple_variables += particle.get_pdg_variables()
        for var in ntuple_variables:
            if var not in ntuple_df.columns:
                raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
        return ntuple_variables
 

◆ get_particle()

ReweighterParticle get_particle	(		self,
		str	prefix )

Get a particle by its prefix.

Definition at line 444 of file sysvar.py.

    def get_particle(self, prefix: str) -> ReweighterParticle:
        """
        Looks up a registered particle by its prefix.

        Leading and trailing underscores are ignored on both sides of the
        comparison. The first matching particle is returned, or None when no
        particle matches.
        """
        matches = [candidate for candidate in self.particles
                   if candidate.prefix.strip('_') == prefix.strip('_')]
        return matches[0] if matches else None
 

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables	(		self,
		dict	weights_dict,
		dict	pdg_pid_variable_dict )

Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 310 of file sysvar.py.

                                pdg_pid_variable_dict: dict) -> pd.DataFrame:
        """
        Merges the efficiency and fake rate weight tables.
 
        Args:
            weights_dict (dict): Dictionary containing the weight tables.
            pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
        """
        weight_dfs = []
        for reco_pdg, mc_pdg in weights_dict:
            if reco_pdg not in pdg_pid_variable_dict:
                raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
            weight_df = weights_dict[(reco_pdg, mc_pdg)]
            weight_df['mcPDG'] = mc_pdg
            weight_df['PDG'] = reco_pdg
            # Check if these are legacy tables:
            if 'charge' in weight_df.columns:
                charge_dict = {'+': [0, 2], '-': [-2, 0]}
                weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
                weight_df = weight_df.drop(columns=['charge'])
                # If iso_score is a single value, drop the min and max columns
                if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
                    weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
            pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
            threshold = pdg_pid_variable_dict[reco_pdg][1]
            selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
            if len(selected_weights) == 0:
                available_variables = weight_df['variable'].unique()
                available_thresholds = weight_df['threshold'].unique()
                raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
                                 f' variable {pid_variable_name} and threshold {threshold}!\n'
                                 f' Available variables: {available_variables}\n'
                                 f' Available thresholds: {available_thresholds}')
            weight_dfs.append(selected_weights)
        return pd.concat(weight_dfs, ignore_index=True)
 

◆ plot_coverage()

plot_coverage ( self )

Plots the coverage of each particle.

Definition at line 597 of file sysvar.py.

    def plot_coverage(self):
        """
        Calls plot_coverage() on every registered particle, in registration order.
        """
        for registered in self.particles:
            registered.plot_coverage()
 
 

◆ print_coverage()

print_coverage ( self )

Prints the coverage of each particle.

Definition at line 589 of file sysvar.py.

    def print_coverage(self):
        """
        Writes a coverage summary to standard output.

        A 'Coverage:' header is followed by one line per particle with its type,
        its prefix without surrounding underscores, and its coverage in percent
        with one decimal place.
        """
        print('Coverage:')
        for registered in self.particles:
            kind = registered.type
            label = registered.prefix.strip("_")
            percent = registered.coverage*100
            print(f'{kind} {label}: {percent :0.1f}%')
 

◆ reweight()

reweight	(		self,
		pd.DataFrame	df,
		bool	generate_variations = True )

Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 567 of file sysvar.py.

                 generate_variations: bool = True):
        """
        Reweights the dataframe according to the weight tables.
 
        Args:
            df (pandas.DataFrame): Dataframe containing the analysis ntuple.
            generate_variations (bool): When true generate weight variations.
        """
        for particle in self.particles:
            if particle.type not in _correction_types:
                raise ValueError(f'Particle type {particle.type} not supported!')
            print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
            if generate_variations:
                particle.generate_variations(n_variations=self.n_variations)
            if particle.type == 'PID':
                self.add_pid_weight_columns(df, particle)
            elif particle.type == 'FEI':
                self.add_fei_weight_columns(df, particle)
        return df
 

Member Data Documentation

◆ correlations

list correlations = []

Correlations between the particles.

Definition at line 253 of file sysvar.py.

◆ evaluate_plots

evaluate_plots = evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 259 of file sysvar.py.

◆ fillna

fillna = fillna

Value to fill NaN values.

Definition at line 263 of file sysvar.py.

◆ n_variations

n_variations = n_variations

Number of weight variations to generate.

Definition at line 249 of file sysvar.py.

◆ nbins

nbins = nbins

Number of bins for the plots.

Definition at line 261 of file sysvar.py.

◆ particles

list particles = []

List of particles.

Definition at line 251 of file sysvar.py.

◆ weight_name

weight_name = weight_name

Name of the weight column.

Definition at line 255 of file sysvar.py.

◆ weights_generated

bool weights_generated = False

Flag to indicate if the weights have been generated.

Definition at line 257 of file sysvar.py.

The documentation for this class was generated from the following file:

analysis/scripts/sysvar.py

Public Member Functions

Public Attributes