13from dataclasses
import dataclass
14import matplotlib.pyplot
as plt
17from pandas.errors
import PerformanceWarning
20A module that adds corrections to analysis dataframe.
21It adds weight variations according to the total uncertainty for easier error propagation.
24_weight_cols = [
'data_MC_ratio',
25 'data_MC_uncertainty_stat_dn',
26 'data_MC_uncertainty_stat_up',
27 'data_MC_uncertainty_sys_dn',
28 'data_MC_uncertainty_sys_up'
31_correction_types = [
'PID',
'FEI']
32_fei_mode_col =
'dec_mode'
38 Class that stores the information of a particle.
47 merged_table: pd.DataFrame
53 variable_aliases: dict
59 column_names: list = None
65 cov: np.ndarray =
None
71 coverage: float =
None
74 plot_values: dict =
None
78 Returns the variable name with the prefix
and use alias
if defined.
83 if name.startswith(self.
prefix):
85 return f
'{self.prefix}{name}'
89 Returns the list of variables that are used for the binning
91 variables = set(sum([list(d.keys()) for d
in self.
pdg_binning.values()], []))
92 return [f
'{self.get_varname(var)}' for var
in variables]
96 Returns the list of variables that are used for the PDG codes
101 pdg_vars += [
'mcPDG']
102 return [f
'{self.get_varname(var)}' for var
in pdg_vars]
106 rho_sys: np.ndarray =
None,
107 rho_stat: np.ndarray =
None) ->
None:
109 Generates variations of weights according to the uncertainties
112 "data_MC_uncertainty_stat_dn"]].max(axis=1)
114 "data_MC_uncertainty_sys_dn"]].max(axis=1)
120 weights = cov + means
127 rho_sys: np.ndarray =
None,
128 rho_stat: np.ndarray =
None) -> np.ndarray:
130 Returns the covariance matrix of the weights
133 zeros = np.zeros(len_means)
137 rho_sys = np.ones((len_means, len_means))
139 rho_sys = np.identity(len_means)
141 rho_stat = np.identity(len_means)
145 stat_cov = np.matmul(
149 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
151 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
153 errors = np.random.multivariate_normal(zeros, self.
cov, n_variations)
158 Converts the object to a string.
160 separator = '------------------'
161 title =
'ReweighterParticle'
162 prefix_str = f
'Type: {self.type} Prefix: {self.prefix}'
163 columns = _weight_cols
164 merged_table_str = f
'Merged table:\n{self.merged_table[columns].describe()}'
165 pdg_binning_str =
'PDG binning:\n'
167 pdg_binning_str += f
'{pdgs}: {self.pdg_binning[pdgs]}\n'
168 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
172 Plots the coverage of the ntuple.
176 vars = set(sum([list(d.keys())
for d
in self.
plot_values.values()], []))
178 fig, axs = plt.subplots(len(self.
plot_values), len(vars), figsize=(5*len(vars), 3*len(self.
plot_values)), dpi=120)
180 if len(axs.shape) < 1:
181 axs = axs.reshape(len(self.
plot_values), len(vars))
182 bin_plt = {
'linewidth': 3,
'linestyle':
'--',
'color':
'0.5'}
183 fig.suptitle(f
'{self.type} particle {self.prefix.strip("_")}')
184 for (reco_pdg, mc_pdg), ax_row
in zip(self.
plot_values, axs):
185 for var, ax
in zip(self.
plot_values[(reco_pdg, mc_pdg)], ax_row):
187 ymax = self.
plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
190 ax.vlines(self.
pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
195 values = np.array([int(val[4:])
for val
in self.
pdg_binning[(reco_pdg, mc_pdg)][var]])
197 np.ones(len(values))*ymax,
202 rest = np.setdiff1d(self.
plot_values[(reco_pdg, mc_pdg)][var][0], values)
204 np.ones(len(rest))*ymax,
207 label=
'Rest category',
210 widths = (self.
plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
211 centers = self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
217 ax.set_title(f
'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
226 Class that reweights the dataframe.
229 n_variations (int): Number of weight variations to generate.
230 weight_name (str): Name of the weight column.
231 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
232 nbins (int): Number of bins
for the plots.
236 n_variations: int = 100,
237 weight_name: str =
"Weight",
238 evaluate_plots: bool =
True,
240 fillna: float = 1.0) ->
None:
242 Initializes the Reweighter class.
263 Returns the kinematic bin columns of the dataframe.
265 return [col
for col
in weight_df.columns
if col.endswith(
'_min')
or col.endswith(
'_max')]
269 Returns the kinematic binning of the dataframe.
272 var_names = {'_'.join(col.split(
'_')[:-1])
for col
in columns}
274 for var_name
in var_names:
275 bin_dict[var_name] = []
277 if col.startswith(var_name):
278 bin_dict[var_name] += list(weight_df[col].values)
279 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
284 Returns the irregular binning of the dataframe.
286 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith(
'mode'),
287 _fei_mode_col].value_counts().index.to_list()}
290 ntuple_df: pd.DataFrame,
291 particle: ReweighterParticle) ->
None:
293 Checks if the variables are
in the ntuple
and returns them.
296 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
297 particle (ReweighterParticle): Particle object containing the necessary variables.
299 ntuple_variables = particle.get_binning_variables()
300 ntuple_variables += particle.get_pdg_variables()
301 for var
in ntuple_variables:
302 if var
not in ntuple_df.columns:
303 raise ValueError(f
'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
304 return ntuple_variables
308 pdg_pid_variable_dict: dict) -> pd.DataFrame:
310 Merges the efficiency and fake rate weight tables.
313 weights_dict (dict): Dictionary containing the weight tables.
314 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes
and variable names.
317 for reco_pdg, mc_pdg
in weights_dict:
318 if reco_pdg
not in pdg_pid_variable_dict:
319 raise ValueError(f
'Reconstructed PDG code {reco_pdg} not found in thresholds!')
320 weight_df = weights_dict[(reco_pdg, mc_pdg)]
321 weight_df[
'mcPDG'] = mc_pdg
322 weight_df[
'PDG'] = reco_pdg
324 if 'charge' in weight_df.columns:
325 charge_dict = {
'+': [0, 2],
'-': [-2, 0]}
326 weight_df[[
'charge_min',
'charge_max']] = [charge_dict[val]
for val
in weight_df[
'charge'].values]
327 weight_df = weight_df.drop(columns=[
'charge'])
329 if 'iso_score_min' in weight_df.columns
and len(weight_df[
'iso_score_min'].unique()) == 1:
330 weight_df = weight_df.drop(columns=[
'iso_score_min',
'iso_score_max'])
331 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
332 threshold = pdg_pid_variable_dict[reco_pdg][1]
333 selected_weights = weight_df.query(f
'variable == "{pid_variable_name}" and threshold == {threshold}')
334 if len(selected_weights) == 0:
335 available_variables = weight_df[
'variable'].unique()
336 available_thresholds = weight_df[
'threshold'].unique()
337 raise ValueError(f
'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
338 f
' variable {pid_variable_name} and threshold {threshold}!\n'
339 f
' Available variables: {available_variables}\n'
340 f
' Available thresholds: {available_thresholds}')
341 weight_dfs.append(selected_weights)
342 return pd.concat(weight_dfs, ignore_index=
True)
345 ntuple_df: pd.DataFrame,
346 particle: ReweighterParticle) ->
None:
348 Adds a weight and uncertainty columns to the dataframe.
351 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
352 particle (ReweighterParticle): Particle object.
355 binning_df = pd.DataFrame(index=ntuple_df.index)
357 binning_df[
'mcPDG'] = ntuple_df[f
'{particle.get_varname("mcPDG")}'].abs()
358 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
360 for reco_pdg, mc_pdg
in particle.pdg_binning:
361 ntuple_cut = f
'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
362 if ntuple_df.query(ntuple_cut).empty:
364 plot_values[(reco_pdg, mc_pdg)] = {}
365 for var
in particle.pdg_binning[(reco_pdg, mc_pdg)]:
366 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
367 for i
in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
368 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
369 ntuple_cut)[f
'{particle.get_varname(var)}'],
370 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
371 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
372 f
'{var}_min'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
374 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
375 f
'{var}_max'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
377 binning_df.drop(var, axis=1, inplace=
True)
379 values = ntuple_df.query(ntuple_cut)[f
'{particle.get_varname(var)}']
380 if len(values.unique()) < 2:
381 print(f
'Skip {var} for plotting!')
383 x_range = np.linspace(values.min(), values.max(), self.
nbins)
384 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
386 weight_cols = _weight_cols
387 if particle.column_names:
388 weight_cols = particle.column_names
389 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
390 on=binning_df.columns.tolist(), how=
'left')
391 binning_df.index = ntuple_df.index
392 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
393 particle.plot_values = plot_values
394 for col
in weight_cols:
395 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
396 ntuple_df[f
'{particle.get_varname(col)}'] = ntuple_df[f
'{particle.get_varname(col)}'].
fillna(self.
fillna)
401 pdg_pid_variable_dict: dict,
402 variable_aliases: dict =
None,
403 sys_seed: int =
None,
404 syscorr: bool =
True) ->
None:
406 Adds weight variations according to the total uncertainty for easier error propagation.
409 prefix (str): Prefix
for the new columns.
410 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
411 pdg_pid_variable_dict (dict): Dictionary containing the PID variables
and thresholds.
412 variable_aliases (dict): Dictionary containing variable aliases.
413 sys_seed (int): Seed
for the systematic variations.
414 syscorr (bool): When true assume systematics are 100% correlated defaults to
415 true. Note this
is overridden by provision of a
None value rho_sys
421 if prefix
and not prefix.endswith(
'_'):
424 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
425 if variable_aliases
is None:
426 variable_aliases = {}
428 pdg_binning = {(reco_pdg, mc_pdg): self.
get_binning(merged_weight_df.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
429 for reco_pdg, mc_pdg
in merged_weight_df[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
432 merged_table=merged_weight_df,
433 pdg_binning=pdg_binning,
434 variable_aliases=variable_aliases,
442 Get a particle by its prefix.
444 cands = [particle for particle
in self.
particles if particle.prefix.strip(
'_') == prefix.strip(
'_')]
451 Checks if the tables are provided
in a legacy format
and converts them to the standard format.
454 str_to_pdg = {
'B+': 521,
'B-': 521,
'B0': 511}
455 if 'cal' in table.columns:
456 result = pd.DataFrame(index=table.index)
457 result[
'data_MC_ratio'] = table[
'cal']
458 result[
'PDG'] = table[
'Btag'].apply(
lambda x: str_to_pdg.get(x))
460 result[
'mcPDG'] = result[
'PDG']
461 result[
'threshold'] = table[
'sig_prob_threshold']
462 result[_fei_mode_col] = table[_fei_mode_col]
463 result[
'data_MC_uncertainty_stat_dn'] = table[
'cal_stat_error']
464 result[
'data_MC_uncertainty_stat_up'] = table[
'cal_stat_error']
465 result[
'data_MC_uncertainty_sys_dn'] = table[
'cal_sys_error']
466 result[
'data_MC_uncertainty_sys_up'] = table[
'cal_sys_error']
467 elif 'cal factor' in table.columns:
468 result = pd.DataFrame(index=table.index)
469 result[
'data_MC_ratio'] = table[
'cal factor']
470 result[
'PDG'] = table[
'Btype'].apply(
lambda x: str_to_pdg.get(x))
471 result[
'mcPDG'] = result[
'PDG']
472 result[
'threshold'] = table[
'sig prob cut']
474 result[
'data_MC_uncertainty_stat_dn'] = table[
'error']
475 result[
'data_MC_uncertainty_stat_up'] = table[
'error']
476 result[
'data_MC_uncertainty_sys_dn'] = 0
477 result[
'data_MC_uncertainty_sys_up'] = 0
478 result[_fei_mode_col] = table[
'mode']
481 result = result.query(f
'threshold == {threshold}')
483 raise ValueError(f
'No weights found for threshold {threshold}!')
489 cov: np.ndarray =
None,
490 variable_aliases: dict =
None,
493 Adds weight variations according to the total uncertainty for easier error propagation.
496 prefix (str): Prefix
for the new columns.
497 table (pandas.DataFrame): Dataframe containing the efficiency weights.
498 threshold (float): Threshold
for the efficiency weights.
499 cov (numpy.ndarray): Covariance matrix
for the efficiency weights.
500 variable_aliases (dict): Dictionary containing variable aliases.
505 if prefix
and not prefix.endswith(
'_'):
508 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
509 if variable_aliases
is None:
510 variable_aliases = {}
511 if table
is None or len(table) == 0:
512 raise ValueError(
'No weights provided!')
514 pdg_binning = {(reco_pdg, mc_pdg): self.
get_fei_binning(converted_table.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
515 for reco_pdg, mc_pdg
in converted_table[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
518 merged_table=converted_table,
519 pdg_binning=pdg_binning,
520 variable_aliases=variable_aliases,
527 Adds weight columns according to the FEI calibration tables
530 particle.merged_table[_fei_mode_col]
532 binning_df = pd.DataFrame(index=ntuple_df.index)
534 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
536 binning_df[
'num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
538 binning_df[_fei_mode_col] = np.nan
540 for reco_pdg, mc_pdg
in particle.pdg_binning:
541 plot_values[(reco_pdg, mc_pdg)] = {}
542 binning_df.loc[binning_df[
'PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
543 f
'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
544 for mode
in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
545 binning_df.loc[(binning_df[
'PDG'] == reco_pdg) & (binning_df[
'num_mode'] == int(mode[4:])), _fei_mode_col] = mode
547 values = ntuple_df[f
'{particle.get_varname(_fei_mode_col)}']
548 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
549 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
552 weight_cols = _weight_cols
553 if particle.column_names:
554 weight_cols = particle.column_names
555 binning_df = binning_df.merge(particle.merged_table[weight_cols + [
'PDG', _fei_mode_col]],
556 on=[
'PDG', _fei_mode_col], how=
'left')
557 binning_df.index = ntuple_df.index
558 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
559 particle.plot_values = plot_values
560 for col
in weight_cols:
561 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
565 generate_variations: bool =
True):
567 Reweights the dataframe according to the weight tables.
570 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
571 generate_variations (bool): When true generate weight variations.
574 if particle.type
not in _correction_types:
575 raise ValueError(f
'Particle type {particle.type} not supported!')
576 print(f
'Required variables: {self.get_ntuple_variables(df, particle)}')
577 if generate_variations:
578 particle.generate_variations(n_variations=self.
n_variations)
579 if particle.type ==
'PID':
581 elif particle.type ==
'FEI':
587 Prints the coverage of each particle.
591 print(f
'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
595 Plots the coverage of each particle.
598 particle.plot_coverage()
601def add_weights_to_dataframe(prefix: str,
604 custom_tables: dict =
None,
605 custom_thresholds: dict =
None,
606 **kw_args) -> pd.DataFrame:
608 Helper method that adds weights to a dataframe.
611 prefix (str): Prefix for the new columns.
612 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
613 systematic (str): Type of the systematic corrections, options:
"custom_PID" and "custom_FEI".
614 MC_production (str): Name of the MC production.
615 custom_tables (dict): Dictionary containing the custom efficiency weights.
616 custom_thresholds (dict): Dictionary containing the custom thresholds
for the custom efficiency weights.
617 n_variations (int): Number of variations to generate.
618 generate_variations (bool): When true generate weight variations.
619 weight_name (str): Name of the weight column.
620 show_plots (bool): When true show the coverage plots.
621 variable_aliases (dict): Dictionary containing variable aliases.
622 cov_matrix (numpy.ndarray): Covariance matrix
for the custom efficiency weights.
623 fillna (int): Value to fill NaN values
with.
624 sys_seed (int): Seed
for the systematic variations.
625 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
626 **kw_args: Additional arguments
for the Reweighter
class.
628 generate_variations = kw_args.get('generate_variations',
True)
629 n_variations = kw_args.get(
'n_variations', 100)
630 weight_name = kw_args.get(
'weight_name',
"Weight")
631 fillna = kw_args.get(
'fillna', 1.0)
633 with warnings.catch_warnings():
634 warnings.simplefilter(
"ignore", category=PerformanceWarning)
635 reweighter =
Reweighter(n_variations=n_variations,
636 weight_name=weight_name,
638 variable_aliases = kw_args.get(
'variable_aliases')
639 if systematic.lower() ==
'custom_fei':
640 cov_matrix = kw_args.get(
'cov_matrix')
641 reweighter.add_fei_particle(prefix=prefix,
643 threshold=custom_thresholds,
644 variable_aliases=variable_aliases,
647 elif systematic.lower() ==
'custom_pid':
648 sys_seed = kw_args.get(
'sys_seed')
649 syscorr = kw_args.get(
'syscorr')
652 reweighter.add_pid_particle(prefix=prefix,
653 weights_dict=custom_tables,
654 pdg_pid_variable_dict=custom_thresholds,
655 variable_aliases=variable_aliases,
660 raise ValueError(f
'Systematic {systematic} is not supported!')
662 result = reweighter.reweight(df, generate_variations=generate_variations)
663 if kw_args.get(
'show_plots'):
664 reweighter.print_coverage()
665 reweighter.plot_coverage()
str weight_name
Weight column name that will be added to the ntuple.
str type
Type of the particle (PID or FEI)
pd merged_table
Merged table of the weights.
dict pdg_binning
Kinematic binning of the weight table per particle.
np cov
Covariance matrix corresponds to the total uncertainty.
str get_varname(self, str varname)
str prefix
Prefix of the particle in the ntuple.
int sys_seed
Random seed for systematics.
list get_pdg_variables(self)
type
Add the mcPDG code requirement for PID particle.
list get_binning_variables(self)
dict plot_values
Values for the plots.
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
bool syscorr
When true assume systematics are 100% correlated.
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
def plot_coverage(self, fig=None, axs=None)
list column_names
Internal list of the names of the weight columns.
dict variable_aliases
Variable aliases of the weight table.
column_names
Names of the varied weight columns.
n_variations
Number of weight variations to generate.
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
nbins
Number of bins for the plots.
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
weight_name
Name of the weight column.
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
fillna
Value to fill NaN values.
weights_generated
Flag to indicate if the weights have been generated.
dict get_fei_binning(self, weight_df)
def reweight(self, pd.DataFrame df, bool generate_variations=True)
def convert_fei_table(self, pd.DataFrame table, float threshold)
dict get_binning(self, weight_df)
list get_bin_columns(self, weight_df)
correlations
Correlations between the particles.
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
ReweighterParticle get_particle(self, str prefix)
particles
List of particles.
evaluate_plots
Flag to indicate if the plots should be evaluated.