Belle II Software light-2601-hyperion
sysvar.py
1#!/usr/bin/env python3
2
3
10
11import pandas as pd
12import numpy as np
13from dataclasses import dataclass
14import matplotlib.pyplot as plt
15import pdg
16import warnings
17import ast
18from pandas.errors import PerformanceWarning
19
20warnings.warn(
21 "This module will soon be deprecated and eventually removed in a future release. "
22 "Its functionality is being taken over by the standalone SysVar package: "
23 "https://gitlab.desy.de/belle2/software/sysvar",
24 FutureWarning,
25 stacklevel=2
26)
27
28"""
29A module that adds corrections to analysis dataframe.
30It adds weight variations according to the total uncertainty for easier error propagation.
31"""
32
33_weight_cols = ['data_MC_ratio',
34 'data_MC_uncertainty_stat_dn',
35 'data_MC_uncertainty_stat_up',
36 'data_MC_uncertainty_sys_dn',
37 'data_MC_uncertainty_sys_up'
38 ]
39
40_correction_types = ['PID', 'FEI']
41_fei_mode_col = 'dec_mode'
42
43
44@dataclass
46 """
47 Class that stores the information of a particle.
48 """
49
51 prefix: str
52
53
54 type: str
55
56
57 merged_table: pd.DataFrame
58
59
61 pdg_binning: dict
62
63
65 variable_aliases: dict
66
67
69 weight_name: str
70
71
72 column_names: list = None
73
74
75 sys_seed: int = None
76
77
78 cov: np.ndarray = None
79
80
81 syscorr: bool = True
82
83
84 coverage: float = None
85
86
87 plot_values: dict = None
88
89 def get_varname(self, varname: str) -> str:
90 """
91 Returns the variable name with the prefix and use alias if defined.
92 """
93 name = varname
94 if self.variable_aliases and varname in self.variable_aliases:
95 name = self.variable_aliases[varname]
96 if name.startswith(self.prefix):
97 return name
98 return f'{self.prefix}{name}'
99
100 def get_binning_variables(self) -> list:
101 """
102 Returns the list of variables that are used for the binning
103 """
104 variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))
105 return [f'{self.get_varname(var)}' for var in variables]
106
107 def get_pdg_variables(self) -> list:
108 """
109 Returns the list of variables that are used for the PDG codes
110 """
111 pdg_vars = ['PDG']
112
113 if self.type == "PID":
114 pdg_vars += ['mcPDG']
115 return [f'{self.get_varname(var)}' for var in pdg_vars]
116
118 n_variations: int,
119 rho_sys: np.ndarray = None,
120 rho_stat: np.ndarray = None) -> None:
121 """
122 Generates variations of weights according to the uncertainties
123 """
124 self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",
125 "data_MC_uncertainty_stat_dn"]].max(axis=1)
126 self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",
127 "data_MC_uncertainty_sys_dn"]].max(axis=1)
128 self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)
129 means = self.merged_table["data_MC_ratio"].values
130
131 self.column_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]
132 cov = self.get_covariance(n_variations, rho_sys, rho_stat)
133 weights = cov + means
134 self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]
135 self.merged_table[self.column_names] = weights.T
136 self.column_names.insert(0, self.weight_name)
137
139 n_variations: int,
140 rho_sys: np.ndarray = None,
141 rho_stat: np.ndarray = None) -> np.ndarray:
142 """
143 Returns the covariance matrix of the weights
144 """
145 len_means = len(self.merged_table["data_MC_ratio"])
146 zeros = np.zeros(len_means)
147 if self.cov is None:
148 if rho_sys is None:
149 if self.syscorr:
150 rho_sys = np.ones((len_means, len_means))
151 else:
152 rho_sys = np.identity(len_means)
153 if rho_stat is None:
154 rho_stat = np.identity(len_means)
155 sys_cov = np.matmul(
156 np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])
157 )
158 stat_cov = np.matmul(
159 np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])
160 )
161 np.random.seed(self.sys_seed)
162 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
163 np.random.seed(None)
164 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
165 return sys + stat
166 errors = np.random.multivariate_normal(zeros, self.cov, n_variations)
167 return errors
168
169 def __str__(self) -> str:
170 """
171 Converts the object to a string.
172 """
173 separator = '------------------'
174 title = 'ReweighterParticle'
175 prefix_str = f'Type: {self.type} Prefix: {self.prefix}'
176 columns = _weight_cols
177 merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'
178 pdg_binning_str = 'PDG binning:\n'
179 for pdgs in self.pdg_binning:
180 pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'
181 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
182
183 def plot_coverage(self, fig=None, axs=None):
184 """
185 Plots the coverage of the ntuple.
186 """
187 if self.plot_values is None:
188 return
189 vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))
190 if fig is None:
191 fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)
192 axs = np.array(axs)
193 if len(axs.shape) < 1:
194 axs = axs.reshape(len(self.plot_values), len(vars))
195 bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}
196 fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')
197 for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):
198 for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):
199 ymin = 0
200 ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
201 # Plot binning
202 if self.type == 'PID':
203 ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
204 label='Binning',
205 alpha=0.8,
206 **bin_plt)
207 elif self.type == 'FEI':
208 values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])
209 ax.bar(values+0.5,
210 np.ones(len(values))*ymax,
211 width=1,
212 alpha=0.5,
213 label='Binning',
214 **bin_plt)
215 rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)
216 ax.bar(rest+0.5,
217 np.ones(len(rest))*ymax,
218 width=1,
219 alpha=0.2,
220 label='Rest category',
221 **bin_plt)
222 # Plot values
223 widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
224 centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
225 ax.bar(centers,
226 self.plot_values[(reco_pdg, mc_pdg)][var][1],
227 width=widths,
228 label='Values',
229 alpha=0.8)
230 ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
231 ax.set_xlabel(var)
232 axs[-1][-1].legend()
233 fig.tight_layout()
234 return fig, axs
235
236
238 """
239 Class that reweights the dataframe.
240
241 Args:
242 n_variations (int): Number of weight variations to generate.
243 weight_name (str): Name of the weight column.
244 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
245 nbins (int): Number of bins for the plots.
246 """
247
248 def __init__(self,
249 n_variations: int = 100,
250 weight_name: str = "Weight",
251 evaluate_plots: bool = True,
252 nbins: int = 50,
253 fillna: float = 1.0) -> None:
254 """
255 Initializes the Reweighter class.
256 """
257
258 self.n_variations = n_variations
259
260 self.particles = []
261
262 self.correlations = []
263
264 self.weight_name = weight_name
265
266 self.weights_generated = False
267
268 self.evaluate_plots = evaluate_plots
269
270 self.nbins = nbins
271
272 self.fillna = fillna
273
274 def get_bin_columns(self, weight_df) -> list:
275 """
276 Returns the kinematic bin columns of the dataframe.
277 """
278 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
279
280 def get_binning(self, weight_df) -> dict:
281 """
282 Returns the kinematic binning of the dataframe.
283 """
284 columns = self.get_bin_columns(weight_df)
285 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
286 bin_dict = {}
287 for var_name in var_names:
288 bin_dict[var_name] = []
289 for col in columns:
290 if col.startswith(var_name):
291 bin_dict[var_name] += list(weight_df[col].values)
292 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
293 return bin_dict
294
295 def get_fei_binning(self, weight_df) -> dict:
296 """
297 Returns the irregular binning of the dataframe.
298 """
299 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
300 _fei_mode_col].value_counts().index.to_list()}
301
303 ntuple_df: pd.DataFrame,
304 particle: ReweighterParticle) -> None:
305 """
306 Checks if the variables are in the ntuple and returns them.
307
308 Args:
309 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
310 particle (ReweighterParticle): Particle object containing the necessary variables.
311 """
312 ntuple_variables = particle.get_binning_variables()
313 ntuple_variables += particle.get_pdg_variables()
314 for var in ntuple_variables:
315 if var not in ntuple_df.columns:
316 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
317 return ntuple_variables
318
320 weights_dict: dict,
321 pdg_pid_variable_dict: dict) -> pd.DataFrame:
322 """
323 Merges the efficiency and fake rate weight tables.
324
325 Args:
326 weights_dict (dict): Dictionary containing the weight tables.
327 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
328 """
329 weight_dfs = []
330 for reco_pdg, mc_pdg in weights_dict:
331 if reco_pdg not in pdg_pid_variable_dict:
332 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
333 weight_df = weights_dict[(reco_pdg, mc_pdg)]
334 weight_df['mcPDG'] = mc_pdg
335 weight_df['PDG'] = reco_pdg
336 # Check if these are legacy tables:
337 if 'charge' in weight_df.columns:
338 charge_dict = {'+': [0, 2], '-': [-2, 0]}
339 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
340 weight_df = weight_df.drop(columns=['charge'])
341 # If iso_score is a single value, drop the min and max columns
342 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
343 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
344 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
345 threshold = pdg_pid_variable_dict[reco_pdg][1]
346 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
347 if len(selected_weights) == 0:
348 available_variables = weight_df['variable'].unique()
349 available_thresholds = weight_df['threshold'].unique()
350 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
351 f' variable {pid_variable_name} and threshold {threshold}!\n'
352 f' Available variables: {available_variables}\n'
353 f' Available thresholds: {available_thresholds}')
354 weight_dfs.append(selected_weights)
355 return pd.concat(weight_dfs, ignore_index=True)
356
358 ntuple_df: pd.DataFrame,
359 particle: ReweighterParticle) -> None:
360 """
361 Adds a weight and uncertainty columns to the dataframe.
362
363 Args:
364 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
365 particle (ReweighterParticle): Particle object.
366 """
367 # Apply a weight value from the weight table to the ntuple, based on the binning
368 binning_df = pd.DataFrame(index=ntuple_df.index)
369 # Take absolute value of mcPDG for binning because we have charge already
370 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
371 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
372 plot_values = {}
373 for reco_pdg, mc_pdg in particle.pdg_binning:
374 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
375 if ntuple_df.query(ntuple_cut).empty:
376 continue
377 plot_values[(reco_pdg, mc_pdg)] = {}
378 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
379 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
380 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
381 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
382 ntuple_cut)[f'{particle.get_varname(var)}'],
383 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
384 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
385 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
386 var].str[0]
387 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
388 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
389 var].str[1]
390 binning_df.drop(var, axis=1, inplace=True)
391 if self.evaluate_plots:
392 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
393 if len(values.unique()) < 2:
394 print(f'Skip {var} for plotting!')
395 continue
396 x_range = np.linspace(values.min(), values.max(), self.nbins)
397 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
398 # merge the weight table with the ntuple on binning columns
399 weight_cols = _weight_cols
400 if particle.column_names:
401 weight_cols = particle.column_names
402 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
403 on=binning_df.columns.tolist(), how='left')
404 binning_df.index = ntuple_df.index
405 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
406 particle.plot_values = plot_values
407 for col in weight_cols:
408 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
409 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
410
412 prefix: str,
413 weights_dict: dict,
414 pdg_pid_variable_dict: dict,
415 variable_aliases: dict = None,
416 sys_seed: int = None,
417 syscorr: bool = True) -> None:
418 """
419 Adds weight variations according to the total uncertainty for easier error propagation.
420
421 Args:
422 prefix (str): Prefix for the new columns.
423 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
424 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
425 variable_aliases (dict): Dictionary containing variable aliases.
426 sys_seed (int): Seed for the systematic variations.
427 syscorr (bool): When true assume systematics are 100% correlated defaults to
428 true. Note this is overridden by provision of a None value rho_sys
429 """
430 # Empty prefix means no prefix
431 if prefix is None:
432 prefix = ''
433 # Add underscore if not present
434 if prefix and not prefix.endswith('_'):
435 prefix += '_'
436 if self.get_particle(prefix):
437 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
438 if variable_aliases is None:
439 variable_aliases = {}
440 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
441 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
442 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
443 particle = ReweighterParticle(prefix,
444 type='PID',
445 merged_table=merged_weight_df,
446 pdg_binning=pdg_binning,
447 variable_aliases=variable_aliases,
448 weight_name=self.weight_name,
449 sys_seed=sys_seed,
450 syscorr=syscorr)
451 self.particles += [particle]
452
453 def get_particle(self, prefix: str) -> ReweighterParticle:
454 """
455 Get a particle by its prefix.
456 """
457 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
458 if len(cands) == 0:
459 return None
460 return cands[0]
461
462 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
463 """
464 Checks if the tables are provided in a legacy format and converts them to the standard format.
465 """
466 result = None
467 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
468 if 'cal' in table.columns:
469 result = pd.DataFrame(index=table.index)
470 result['data_MC_ratio'] = table['cal']
471 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
472 # Assume these are only efficiency tables
473 result['mcPDG'] = result['PDG']
474 result['threshold'] = table['sig_prob_threshold']
475 result[_fei_mode_col] = table[_fei_mode_col]
476 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
477 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
478 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
479 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
480 elif 'cal factor' in table.columns:
481 result = pd.DataFrame(index=table.index)
482 result['data_MC_ratio'] = table['cal factor']
483 result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))
484 result['mcPDG'] = result['PDG']
485 result['threshold'] = table['sig prob cut']
486 # Assign the total error to the stat uncertainty and set syst. one to 0
487 result['data_MC_uncertainty_stat_dn'] = table['error']
488 result['data_MC_uncertainty_stat_up'] = table['error']
489 result['data_MC_uncertainty_sys_dn'] = 0
490 result['data_MC_uncertainty_sys_up'] = 0
491 result[_fei_mode_col] = table['mode']
492 elif 'dmID' in table.columns:
493 result = pd.DataFrame(index=table.index)
494 result['data_MC_ratio'] = table['central_value']
495 result['PDG'] = table['PDG'].apply(ast.literal_eval).str[0].abs()
496 result['mcPDG'] = result['PDG']
497 result['threshold'] = table['sigProb']
498 # Assign the total error to the stat uncertainty and set syst. one to 0
499 result['data_MC_uncertainty_stat_dn'] = table['total_error']
500 result['data_MC_uncertainty_stat_up'] = table['total_error']
501 result['data_MC_uncertainty_sys_dn'] = 0
502 result['data_MC_uncertainty_sys_up'] = 0
503 result[_fei_mode_col] = ('mode' + table['dmID'].astype(str)).replace('mode999', 'rest')
504 else:
505 result = table
506 result = result.query(f'threshold == {threshold}')
507 if len(result) == 0:
508 raise ValueError(f'No weights found for threshold {threshold}!')
509 return result
510
511 def add_fei_particle(self, prefix: str,
512 table: pd.DataFrame,
513 threshold: float,
514 cov: np.ndarray = None,
515 variable_aliases: dict = None,
516 ) -> None:
517 """
518 Adds weight variations according to the total uncertainty for easier error propagation.
519
520 Args:
521 prefix (str): Prefix for the new columns.
522 table (pandas.DataFrame): Dataframe containing the efficiency weights.
523 threshold (float): Threshold for the efficiency weights.
524 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
525 variable_aliases (dict): Dictionary containing variable aliases.
526 """
527 # Empty prefix means no prefix
528 if prefix is None:
529 prefix = ''
530 if prefix and not prefix.endswith('_'):
531 prefix += '_'
532 if self.get_particle(prefix):
533 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
534 if variable_aliases is None:
535 variable_aliases = {}
536 if table is None or len(table) == 0:
537 raise ValueError('No weights provided!')
538 converted_table = self.convert_fei_table(table, threshold)
539 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
540 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
541 particle = ReweighterParticle(prefix,
542 type='FEI',
543 merged_table=converted_table,
544 pdg_binning=pdg_binning,
545 variable_aliases=variable_aliases,
546 weight_name=self.weight_name,
547 cov=cov)
548 self.particles += [particle]
549
550 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
551 """
552 Adds weight columns according to the FEI calibration tables
553 """
554 rest_str = 'rest'
555 particle.merged_table[_fei_mode_col]
556 # Apply a weight value from the weight table to the ntuple, based on the binning
557 binning_df = pd.DataFrame(index=ntuple_df.index)
558 # Take absolute value of mcPDG for binning because we have charge already
559 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
560 # Copy the mode ID from the ntuple
561 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
562 # Default value in case if reco PDG is not a B-meson PDG
563 binning_df[_fei_mode_col] = np.nan
564 plot_values = {}
565 for reco_pdg, mc_pdg in particle.pdg_binning:
566 plot_values[(reco_pdg, mc_pdg)] = {}
567 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
568 f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
569 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
570 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
571 if self.evaluate_plots:
572 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
573 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
574 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
575
576 # merge the weight table with the ntuple on binning columns
577 weight_cols = _weight_cols
578 if particle.column_names:
579 weight_cols = particle.column_names
580 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
581 on=['PDG', _fei_mode_col], how='left')
582 binning_df.index = ntuple_df.index
583 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
584 particle.plot_values = plot_values
585 for col in weight_cols:
586 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
587
588 def reweight(self,
589 df: pd.DataFrame,
590 generate_variations: bool = True):
591 """
592 Reweights the dataframe according to the weight tables.
593
594 Args:
595 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
596 generate_variations (bool): When true generate weight variations.
597 """
598 for particle in self.particles:
599 if particle.type not in _correction_types:
600 raise ValueError(f'Particle type {particle.type} not supported!')
601 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
602 if generate_variations:
603 particle.generate_variations(n_variations=self.n_variations)
604 if particle.type == 'PID':
605 self.add_pid_weight_columns(df, particle)
606 elif particle.type == 'FEI':
607 self.add_fei_weight_columns(df, particle)
608 return df
609
610 def print_coverage(self):
611 """
612 Prints the coverage of each particle.
613 """
614 print('Coverage:')
615 for particle in self.particles:
616 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
617
618 def plot_coverage(self):
619 """
620 Plots the coverage of each particle.
621 """
622 for particle in self.particles:
623 particle.plot_coverage()
624
625
626def add_weights_to_dataframe(prefix: str,
627 df: pd.DataFrame,
628 systematic: str,
629 custom_tables: dict = None,
630 custom_thresholds: dict = None,
631 **kw_args) -> pd.DataFrame:
632 """
633 Helper method that adds weights to a dataframe.
634
635 Args:
636 prefix (str): Prefix for the new columns.
637 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
638 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
639 MC_production (str): Name of the MC production.
640 custom_tables (dict): Dictionary containing the custom efficiency weights.
641 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
642 n_variations (int): Number of variations to generate.
643 generate_variations (bool): When true generate weight variations.
644 weight_name (str): Name of the weight column.
645 show_plots (bool): When true show the coverage plots.
646 variable_aliases (dict): Dictionary containing variable aliases.
647 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
648 fillna (int): Value to fill NaN values with.
649 sys_seed (int): Seed for the systematic variations.
650 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
651 **kw_args: Additional arguments for the Reweighter class.
652 """
653 generate_variations = kw_args.get('generate_variations', True)
654 n_variations = kw_args.get('n_variations', 100)
655 weight_name = kw_args.get('weight_name', "Weight")
656 fillna = kw_args.get('fillna', 1.0)
657 # Catch performance warnings from pandas
658 with warnings.catch_warnings():
659 warnings.simplefilter("ignore", category=PerformanceWarning)
660 reweighter = Reweighter(n_variations=n_variations,
661 weight_name=weight_name,
662 fillna=fillna)
663 variable_aliases = kw_args.get('variable_aliases')
664 if systematic.lower() == 'custom_fei':
665 cov_matrix = kw_args.get('cov_matrix')
666 reweighter.add_fei_particle(prefix=prefix,
667 table=custom_tables,
668 threshold=custom_thresholds,
669 variable_aliases=variable_aliases,
670 cov=cov_matrix
671 )
672 elif systematic.lower() == 'custom_pid':
673 sys_seed = kw_args.get('sys_seed')
674 syscorr = kw_args.get('syscorr')
675 if syscorr is None:
676 syscorr = True
677 reweighter.add_pid_particle(prefix=prefix,
678 weights_dict=custom_tables,
679 pdg_pid_variable_dict=custom_thresholds,
680 variable_aliases=variable_aliases,
681 sys_seed=sys_seed,
682 syscorr=syscorr
683 )
684 else:
685 raise ValueError(f'Systematic {systematic} is not supported!')
686
687 result = reweighter.reweight(df, generate_variations=generate_variations)
688 if kw_args.get('show_plots'):
689 reweighter.print_coverage()
690 reweighter.plot_coverage()
691 return result
str type
Add the mcPDG code requirement for PID particle.
Definition sysvar.py:113
variable_aliases
Variable aliases of the weight table.
Definition sysvar.py:94
pd merged_table
Type of the particle (PID or FEI)
Definition sysvar.py:57
int sys_seed
Random seed for systematics.
Definition sysvar.py:75
np cov
Covariance matrix corresponds to the total uncertainty.
Definition sysvar.py:78
str get_varname(self, str varname)
Definition sysvar.py:89
weight_name
Weight column name that will be added to the ntuple.
Definition sysvar.py:136
plot_coverage(self, fig=None, axs=None)
Definition sysvar.py:183
list get_pdg_variables(self)
Definition sysvar.py:107
list get_binning_variables(self)
Definition sysvar.py:100
pdg_binning
Kinematic binning of the weight table per particle.
Definition sysvar.py:179
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition sysvar.py:141
bool syscorr
When true assume systematics are 100% correlated.
Definition sysvar.py:81
prefix
Prefix of the particle in the ntuple.
Definition sysvar.py:96
dict plot_values
Values for the plots.
Definition sysvar.py:87
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition sysvar.py:120
list column_names
Internal list of the names of the weight columns.
Definition sysvar.py:72
n_variations
Number of weight variations to generate.
Definition sysvar.py:258
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition sysvar.py:321
nbins
Number of bins for the plots.
Definition sysvar.py:270
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition sysvar.py:417
list particles
List of particles.
Definition sysvar.py:260
add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:550
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition sysvar.py:253
print_coverage(self)
Definition sysvar.py:610
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:359
weight_name
Name of the weight column.
Definition sysvar.py:264
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:304
fillna
Value to fill NaN values.
Definition sysvar.py:272
dict get_fei_binning(self, weight_df)
Definition sysvar.py:295
convert_fei_table(self, pd.DataFrame table, float threshold)
Definition sysvar.py:462
bool weights_generated
Flag to indicate if the weights have been generated.
Definition sysvar.py:266
dict get_binning(self, weight_df)
Definition sysvar.py:280
list get_bin_columns(self, weight_df)
Definition sysvar.py:274
reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition sysvar.py:590
plot_coverage(self)
Definition sysvar.py:618
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition sysvar.py:516
ReweighterParticle get_particle(self, str prefix)
Definition sysvar.py:453
list correlations
Correlations between the particles.
Definition sysvar.py:262
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition sysvar.py:268