13from dataclasses
import dataclass
14import matplotlib.pyplot
as plt
17from pandas.errors
import PerformanceWarning
20A module that adds corrections to analysis dataframe.
21It adds weight variations according to the total uncertainty for easier error propagation.
24_weight_cols = [
'data_MC_ratio',
25 'data_MC_uncertainty_stat_dn',
26 'data_MC_uncertainty_stat_up',
27 'data_MC_uncertainty_sys_dn',
28 'data_MC_uncertainty_sys_up'
31_correction_types = [
'PID',
'FEI']
32_fei_mode_col =
'dec_mode'
38 Class that stores the information of a particle.
48 merged_table: pd.DataFrame
56 variable_aliases: dict
63 column_names: list =
None
69 cov: np.ndarray =
None
75 coverage: float =
None
78 plot_values: dict =
None
82 Returns the variable name with the prefix and use alias if defined.
89 return f
'{self.prefix}{name}'
93 Returns the list of variables that are used for the binning
95 variables = set(sum([list(d.keys())
for d
in self.
pdg_binning.values()], []))
96 return [f
'{self.get_varname(var)}' for var
in variables]
100 Returns the list of variables that are used for the PDG codes
105 pdg_vars += [
'mcPDG']
106 return [f
'{self.get_varname(var)}' for var
in pdg_vars]
110 rho_sys: np.ndarray =
None,
111 rho_stat: np.ndarray =
None) ->
None:
113 Generates variations of weights according to the uncertainties
116 "data_MC_uncertainty_stat_dn"]].max(axis=1)
118 "data_MC_uncertainty_sys_dn"]].max(axis=1)
122 self.
column_names = [f
"{self.weight_name}_{i}" for i
in range(n_variations)]
124 weights = cov + means
131 rho_sys: np.ndarray =
None,
132 rho_stat: np.ndarray =
None) -> np.ndarray:
134 Returns the covariance matrix of the weights
137 zeros = np.zeros(len_means)
141 rho_sys = np.ones((len_means, len_means))
143 rho_sys = np.identity(len_means)
145 rho_stat = np.identity(len_means)
149 stat_cov = np.matmul(
153 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
155 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
157 errors = np.random.multivariate_normal(zeros, self.
cov, n_variations)
162 Converts the object to a string.
164 separator =
'------------------'
165 title =
'ReweighterParticle'
166 prefix_str = f
'Type: {self.type} Prefix: {self.prefix}'
167 columns = _weight_cols
168 merged_table_str = f
'Merged table:\n{self.merged_table[columns].describe()}'
169 pdg_binning_str =
'PDG binning:\n'
171 pdg_binning_str += f
'{pdgs}: {self.pdg_binning[pdgs]}\n'
172 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
176 Plots the coverage of the ntuple.
180 vars = set(sum([list(d.keys())
for d
in self.
plot_values.values()], []))
182 fig, axs = plt.subplots(len(self.
plot_values), len(vars), figsize=(5*len(vars), 3*len(self.
plot_values)), dpi=120)
184 if len(axs.shape) < 1:
185 axs = axs.reshape(len(self.
plot_values), len(vars))
186 bin_plt = {
'linewidth': 3,
'linestyle':
'--',
'color':
'0.5'}
187 fig.suptitle(f
'{self.type} particle {self.prefix.strip("_")}')
188 for (reco_pdg, mc_pdg), ax_row
in zip(self.
plot_values, axs):
189 for var, ax
in zip(self.
plot_values[(reco_pdg, mc_pdg)], ax_row):
191 ymax = self.
plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
193 if self.
type ==
'PID':
194 ax.vlines(self.
pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
198 elif self.
type ==
'FEI':
199 values = np.array([int(val[4:])
for val
in self.
pdg_binning[(reco_pdg, mc_pdg)][var]])
201 np.ones(len(values))*ymax,
206 rest = np.setdiff1d(self.
plot_values[(reco_pdg, mc_pdg)][var][0], values)
208 np.ones(len(rest))*ymax,
211 label=
'Rest category',
214 widths = (self.
plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
215 centers = self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
221 ax.set_title(f
'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
230 Class that reweights the dataframe.
233 n_variations (int): Number of weight variations to generate.
234 weight_name (str): Name of the weight column.
235 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
236 nbins (int): Number of bins for the plots.
240 n_variations: int = 100,
241 weight_name: str =
"Weight",
242 evaluate_plots: bool =
True,
244 fillna: float = 1.0) ->
None:
246 Initializes the Reweighter class.
267 Returns the kinematic bin columns of the dataframe.
269 return [col
for col
in weight_df.columns
if col.endswith(
'_min')
or col.endswith(
'_max')]
273 Returns the kinematic binning of the dataframe.
276 var_names = {
'_'.join(col.split(
'_')[:-1])
for col
in columns}
278 for var_name
in var_names:
279 bin_dict[var_name] = []
281 if col.startswith(var_name):
282 bin_dict[var_name] += list(weight_df[col].values)
283 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
288 Returns the irregular binning of the dataframe.
290 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith(
'mode'),
291 _fei_mode_col].value_counts().index.to_list()}
294 ntuple_df: pd.DataFrame,
295 particle: ReweighterParticle) ->
None:
297 Checks if the variables are in the ntuple and returns them.
300 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
301 particle (ReweighterParticle): Particle object containing the necessary variables.
303 ntuple_variables = particle.get_binning_variables()
304 ntuple_variables += particle.get_pdg_variables()
305 for var
in ntuple_variables:
306 if var
not in ntuple_df.columns:
307 raise ValueError(f
'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
308 return ntuple_variables
312 pdg_pid_variable_dict: dict) -> pd.DataFrame:
314 Merges the efficiency and fake rate weight tables.
317 weights_dict (dict): Dictionary containing the weight tables.
318 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
321 for reco_pdg, mc_pdg
in weights_dict:
322 if reco_pdg
not in pdg_pid_variable_dict:
323 raise ValueError(f
'Reconstructed PDG code {reco_pdg} not found in thresholds!')
324 weight_df = weights_dict[(reco_pdg, mc_pdg)]
325 weight_df[
'mcPDG'] = mc_pdg
326 weight_df[
'PDG'] = reco_pdg
328 if 'charge' in weight_df.columns:
329 charge_dict = {
'+': [0, 2],
'-': [-2, 0]}
330 weight_df[[
'charge_min',
'charge_max']] = [charge_dict[val]
for val
in weight_df[
'charge'].values]
331 weight_df = weight_df.drop(columns=[
'charge'])
333 if 'iso_score_min' in weight_df.columns
and len(weight_df[
'iso_score_min'].unique()) == 1:
334 weight_df = weight_df.drop(columns=[
'iso_score_min',
'iso_score_max'])
335 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
336 threshold = pdg_pid_variable_dict[reco_pdg][1]
337 selected_weights = weight_df.query(f
'variable == "{pid_variable_name}" and threshold == {threshold}')
338 if len(selected_weights) == 0:
339 available_variables = weight_df[
'variable'].unique()
340 available_thresholds = weight_df[
'threshold'].unique()
341 raise ValueError(f
'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
342 f
' variable {pid_variable_name} and threshold {threshold}!\n'
343 f
' Available variables: {available_variables}\n'
344 f
' Available thresholds: {available_thresholds}')
345 weight_dfs.append(selected_weights)
346 return pd.concat(weight_dfs, ignore_index=
True)
349 ntuple_df: pd.DataFrame,
350 particle: ReweighterParticle) ->
None:
352 Adds a weight and uncertainty columns to the dataframe.
355 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
356 particle (ReweighterParticle): Particle object.
359 binning_df = pd.DataFrame(index=ntuple_df.index)
361 binning_df[
'mcPDG'] = ntuple_df[f
'{particle.get_varname("mcPDG")}'].abs()
362 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
364 for reco_pdg, mc_pdg
in particle.pdg_binning:
365 ntuple_cut = f
'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
366 if ntuple_df.query(ntuple_cut).empty:
368 plot_values[(reco_pdg, mc_pdg)] = {}
369 for var
in particle.pdg_binning[(reco_pdg, mc_pdg)]:
370 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
371 for i
in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
372 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
373 ntuple_cut)[f
'{particle.get_varname(var)}'],
374 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
375 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
376 f
'{var}_min'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
378 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
379 f
'{var}_max'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
381 binning_df.drop(var, axis=1, inplace=
True)
383 values = ntuple_df.query(ntuple_cut)[f
'{particle.get_varname(var)}']
384 if len(values.unique()) < 2:
385 print(f
'Skip {var} for plotting!')
387 x_range = np.linspace(values.min(), values.max(), self.
nbins)
388 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
390 weight_cols = _weight_cols
391 if particle.column_names:
392 weight_cols = particle.column_names
393 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
394 on=binning_df.columns.tolist(), how=
'left')
395 binning_df.index = ntuple_df.index
396 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
397 particle.plot_values = plot_values
398 for col
in weight_cols:
399 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
400 ntuple_df[f
'{particle.get_varname(col)}'] = ntuple_df[f
'{particle.get_varname(col)}'].
fillna(self.
fillna)
405 pdg_pid_variable_dict: dict,
406 variable_aliases: dict =
None,
407 sys_seed: int =
None,
408 syscorr: bool =
True) ->
None:
410 Adds weight variations according to the total uncertainty for easier error propagation.
413 prefix (str): Prefix for the new columns.
414 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
415 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
416 variable_aliases (dict): Dictionary containing variable aliases.
417 sys_seed (int): Seed for the systematic variations.
418 syscorr (bool): When true assume systematics are 100% correlated defaults to
419 true. Note this is overridden by provision of a None value rho_sys
425 if prefix
and not prefix.endswith(
'_'):
428 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
429 if variable_aliases
is None:
430 variable_aliases = {}
432 pdg_binning = {(reco_pdg, mc_pdg): self.
get_binning(merged_weight_df.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
433 for reco_pdg, mc_pdg
in merged_weight_df[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
436 merged_table=merged_weight_df,
437 pdg_binning=pdg_binning,
438 variable_aliases=variable_aliases,
446 Get a particle by its prefix.
448 cands = [particle
for particle
in self.
particles if particle.prefix.strip(
'_') == prefix.strip(
'_')]
455 Checks if the tables are provided in a legacy format and converts them to the standard format.
458 str_to_pdg = {
'B+': 521,
'B-': 521,
'B0': 511}
459 if 'cal' in table.columns:
460 result = pd.DataFrame(index=table.index)
461 result[
'data_MC_ratio'] = table[
'cal']
462 result[
'PDG'] = table[
'Btag'].apply(
lambda x: str_to_pdg.get(x))
464 result[
'mcPDG'] = result[
'PDG']
465 result[
'threshold'] = table[
'sig_prob_threshold']
466 result[_fei_mode_col] = table[_fei_mode_col]
467 result[
'data_MC_uncertainty_stat_dn'] = table[
'cal_stat_error']
468 result[
'data_MC_uncertainty_stat_up'] = table[
'cal_stat_error']
469 result[
'data_MC_uncertainty_sys_dn'] = table[
'cal_sys_error']
470 result[
'data_MC_uncertainty_sys_up'] = table[
'cal_sys_error']
471 elif 'cal factor' in table.columns:
472 result = pd.DataFrame(index=table.index)
473 result[
'data_MC_ratio'] = table[
'cal factor']
474 result[
'PDG'] = table[
'Btype'].apply(
lambda x: str_to_pdg.get(x))
475 result[
'mcPDG'] = result[
'PDG']
476 result[
'threshold'] = table[
'sig prob cut']
478 result[
'data_MC_uncertainty_stat_dn'] = table[
'error']
479 result[
'data_MC_uncertainty_stat_up'] = table[
'error']
480 result[
'data_MC_uncertainty_sys_dn'] = 0
481 result[
'data_MC_uncertainty_sys_up'] = 0
482 result[_fei_mode_col] = table[
'mode']
485 result = result.query(f
'threshold == {threshold}')
487 raise ValueError(f
'No weights found for threshold {threshold}!')
493 cov: np.ndarray =
None,
494 variable_aliases: dict =
None,
497 Adds weight variations according to the total uncertainty for easier error propagation.
500 prefix (str): Prefix for the new columns.
501 table (pandas.DataFrame): Dataframe containing the efficiency weights.
502 threshold (float): Threshold for the efficiency weights.
503 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
504 variable_aliases (dict): Dictionary containing variable aliases.
509 if prefix
and not prefix.endswith(
'_'):
512 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
513 if variable_aliases
is None:
514 variable_aliases = {}
515 if table
is None or len(table) == 0:
516 raise ValueError(
'No weights provided!')
518 pdg_binning = {(reco_pdg, mc_pdg): self.
get_fei_binning(converted_table.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
519 for reco_pdg, mc_pdg
in converted_table[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
522 merged_table=converted_table,
523 pdg_binning=pdg_binning,
524 variable_aliases=variable_aliases,
531 Adds weight columns according to the FEI calibration tables
534 particle.merged_table[_fei_mode_col]
536 binning_df = pd.DataFrame(index=ntuple_df.index)
538 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
540 binning_df[
'num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
542 binning_df[_fei_mode_col] = np.nan
544 for reco_pdg, mc_pdg
in particle.pdg_binning:
545 plot_values[(reco_pdg, mc_pdg)] = {}
546 binning_df.loc[binning_df[
'PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
547 f
'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
548 for mode
in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
549 binning_df.loc[(binning_df[
'PDG'] == reco_pdg) & (binning_df[
'num_mode'] == int(mode[4:])), _fei_mode_col] = mode
551 values = ntuple_df[f
'{particle.get_varname(_fei_mode_col)}']
552 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
553 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
556 weight_cols = _weight_cols
557 if particle.column_names:
558 weight_cols = particle.column_names
559 binning_df = binning_df.merge(particle.merged_table[weight_cols + [
'PDG', _fei_mode_col]],
560 on=[
'PDG', _fei_mode_col], how=
'left')
561 binning_df.index = ntuple_df.index
562 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
563 particle.plot_values = plot_values
564 for col
in weight_cols:
565 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
569 generate_variations: bool =
True):
571 Reweights the dataframe according to the weight tables.
574 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
575 generate_variations (bool): When true generate weight variations.
578 if particle.type
not in _correction_types:
579 raise ValueError(f
'Particle type {particle.type} not supported!')
580 print(f
'Required variables: {self.get_ntuple_variables(df, particle)}')
581 if generate_variations:
582 particle.generate_variations(n_variations=self.
n_variations)
583 if particle.type ==
'PID':
585 elif particle.type ==
'FEI':
591 Prints the coverage of each particle.
595 print(f
'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
599 Plots the coverage of each particle.
602 particle.plot_coverage()
605def add_weights_to_dataframe(prefix: str,
608 custom_tables: dict =
None,
609 custom_thresholds: dict =
None,
610 **kw_args) -> pd.DataFrame:
612 Helper method that adds weights to a dataframe.
615 prefix (str): Prefix for the new columns.
616 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
617 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
618 MC_production (str): Name of the MC production.
619 custom_tables (dict): Dictionary containing the custom efficiency weights.
620 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
621 n_variations (int): Number of variations to generate.
622 generate_variations (bool): When true generate weight variations.
623 weight_name (str): Name of the weight column.
624 show_plots (bool): When true show the coverage plots.
625 variable_aliases (dict): Dictionary containing variable aliases.
626 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
627 fillna (int): Value to fill NaN values with.
628 sys_seed (int): Seed for the systematic variations.
629 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
630 **kw_args: Additional arguments for the Reweighter class.
632 generate_variations = kw_args.get(
'generate_variations',
True)
633 n_variations = kw_args.get(
'n_variations', 100)
634 weight_name = kw_args.get(
'weight_name',
"Weight")
635 fillna = kw_args.get(
'fillna', 1.0)
637 with warnings.catch_warnings():
638 warnings.simplefilter(
"ignore", category=PerformanceWarning)
639 reweighter =
Reweighter(n_variations=n_variations,
640 weight_name=weight_name,
642 variable_aliases = kw_args.get(
'variable_aliases')
643 if systematic.lower() ==
'custom_fei':
644 cov_matrix = kw_args.get(
'cov_matrix')
645 reweighter.add_fei_particle(prefix=prefix,
647 threshold=custom_thresholds,
648 variable_aliases=variable_aliases,
651 elif systematic.lower() ==
'custom_pid':
652 sys_seed = kw_args.get(
'sys_seed')
653 syscorr = kw_args.get(
'syscorr')
656 reweighter.add_pid_particle(prefix=prefix,
657 weights_dict=custom_tables,
658 pdg_pid_variable_dict=custom_thresholds,
659 variable_aliases=variable_aliases,
664 raise ValueError(f
'Systematic {systematic} is not supported!')
666 result = reweighter.reweight(df, generate_variations=generate_variations)
667 if kw_args.get(
'show_plots'):
668 reweighter.print_coverage()
669 reweighter.plot_coverage()
str type
Add the mcPDG code requirement for PID particle.
variable_aliases
Variable aliases of the weight table.
pd merged_table
Type of the particle (PID or FEI)
int sys_seed
Random seed for systematics.
np cov
Covariance matrix corresponds to the total uncertainty.
str get_varname(self, str varname)
weight_name
Weight column name that will be added to the ntuple.
plot_coverage(self, fig=None, axs=None)
list get_pdg_variables(self)
list get_binning_variables(self)
pdg_binning
Kinematic binning of the weight table per particle.
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
bool syscorr
When true assume systematics are 100% correlated.
prefix
Prefix of the particle in the ntuple.
dict plot_values
Values for the plots.
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
list column_names
Internal list of the names of the weight columns.
n_variations
Number of weight variations to generate.
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
nbins
Number of bins for the plots.
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
list particles
List of particles.
add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
weight_name
Name of the weight column.
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
fillna
Value to fill NaN values.
dict get_fei_binning(self, weight_df)
convert_fei_table(self, pd.DataFrame table, float threshold)
bool weights_generated
Flag to indicate if the weights have been generated.
dict get_binning(self, weight_df)
list get_bin_columns(self, weight_df)
reweight(self, pd.DataFrame df, bool generate_variations=True)
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
ReweighterParticle get_particle(self, str prefix)
list correlations
Correlations between the particles.
evaluate_plots
Flag to indicate if the plots should be evaluated.