Belle II Software light-2408-savannah
sysvar.py
1#!/usr/bin/env python3
2
3
10
11import pandas as pd
12import numpy as np
13from dataclasses import dataclass
14import matplotlib.pyplot as plt
15import pdg
16
17"""
18A module that adds corrections to analysis dataframe.
19It adds weight variations according to the total uncertainty for easier error propagation.
20"""
21
22_weight_cols = ['data_MC_ratio',
23 'data_MC_uncertainty_stat_dn',
24 'data_MC_uncertainty_stat_up',
25 'data_MC_uncertainty_sys_dn',
26 'data_MC_uncertainty_sys_up'
27 ]
28
29_correction_types = ['PID', 'FEI']
30_fei_mode_col = 'dec_mode'
31
32
33@dataclass
35 """
36 Class that stores the information of a particle.
37 """
38
39 prefix: str
40
41
42 type: str
43
44
45 merged_table: pd.DataFrame
46
47
48 pdg_binning: dict
49
50
51 variable_aliases: dict
52
53
54 weight_name: str
55
56
57 column_names: list = None
58
59
60 sys_seed: int = None
61
62
63 cov: np.ndarray = None
64
65
66 syscorr: bool = True
67
68
69 coverage: float = None
70
71
72 plot_values: dict = None
73
74 def get_varname(self, varname: str) -> str:
75 """
76 Returns the variable name with the prefix and use alias if defined.
77 """
78 name = varname
79 if self.variable_aliases and varname in self.variable_aliases:
80 name = self.variable_aliases[varname]
81 if name.startswith(self.prefix):
82 return name
83 return f'{self.prefix}{name}'
84
85 def get_binning_variables(self) -> list:
86 """
87 Returns the list of variables that are used for the binning
88 """
89 variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))
90 return [f'{self.get_varname(var)}' for var in variables]
91
92 def get_pdg_variables(self) -> list:
93 """
94 Returns the list of variables that are used for the PDG codes
95 """
96 pdg_vars = ['PDG']
97
98 if self.typetype == "PID":
99 pdg_vars += ['mcPDG']
100 return [f'{self.get_varname(var)}' for var in pdg_vars]
101
103 n_variations: int,
104 rho_sys: np.ndarray = None,
105 rho_stat: np.ndarray = None) -> None:
106 """
107 Generates variations of weights according to the uncertainties
108 """
109 self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",
110 "data_MC_uncertainty_stat_dn"]].max(axis=1)
111 self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",
112 "data_MC_uncertainty_sys_dn"]].max(axis=1)
113 self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)
114 means = self.merged_table["data_MC_ratio"].values
115
116 self.column_namescolumn_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]
117 cov = self.get_covariance(n_variations, rho_sys, rho_stat)
118 weights = cov + means
119 self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]
120 self.merged_table[self.column_namescolumn_names] = weights.T
121 self.column_namescolumn_names.insert(0, self.weight_name)
122
124 n_variations: int,
125 rho_sys: np.ndarray = None,
126 rho_stat: np.ndarray = None) -> np.ndarray:
127 """
128 Returns the covariance matrix of the weights
129 """
130 len_means = len(self.merged_table["data_MC_ratio"])
131 zeros = np.zeros(len_means)
132 if self.cov is None:
133 if rho_sys is None:
134 if self.syscorr:
135 rho_sys = np.ones((len_means, len_means))
136 else:
137 rho_sys = np.identity(len_means)
138 if rho_stat is None:
139 rho_stat = np.identity(len_means)
140 sys_cov = np.matmul(
141 np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])
142 )
143 stat_cov = np.matmul(
144 np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])
145 )
146 np.random.seed(self.sys_seed)
147 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
148 np.random.seed(None)
149 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
150 return sys + stat
151 errors = np.random.multivariate_normal(zeros, self.cov, n_variations)
152 return errors
153
154 def __str__(self) -> str:
155 """
156 Converts the object to a string.
157 """
158 separator = '------------------'
159 title = 'ReweighterParticle'
160 prefix_str = f'Type: {self.type} Prefix: {self.prefix}'
161 columns = _weight_cols
162 merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'
163 pdg_binning_str = 'PDG binning:\n'
164 for pdgs in self.pdg_binning:
165 pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'
166 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
167
168 def plot_coverage(self, fig=None, axs=None):
169 """
170 Plots the coverage of the ntuple.
171 """
172 if self.plot_values is None:
173 return
174 vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))
175 if fig is None:
176 fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)
177 axs = np.array(axs)
178 if len(axs.shape) < 1:
179 axs = axs.reshape(len(self.plot_values), len(vars))
180 bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}
181 fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')
182 for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):
183 for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):
184 ymin = 0
185 ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
186 # Plot binning
187 if self.typetype == 'PID':
188 ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
189 label='Binning',
190 alpha=0.8,
191 **bin_plt)
192 elif self.typetype == 'FEI':
193 values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])
194 ax.bar(values+0.5,
195 np.ones(len(values))*ymax,
196 width=1,
197 alpha=0.5,
198 label='Binning',
199 **bin_plt)
200 rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)
201 ax.bar(rest+0.5,
202 np.ones(len(rest))*ymax,
203 width=1,
204 alpha=0.2,
205 label='Rest category',
206 **bin_plt)
207 # Plot values
208 widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
209 centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
210 ax.bar(centers,
211 self.plot_values[(reco_pdg, mc_pdg)][var][1],
212 width=widths,
213 label='Values',
214 alpha=0.8)
215 ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
216 ax.set_xlabel(var)
217 axs[-1][-1].legend()
218 fig.tight_layout()
219 return fig, axs
220
221
223 """
224 Class that reweights the dataframe.
225
226 Args:
227 n_variations (int): Number of weight variations to generate.
228 weight_name (str): Name of the weight column.
229 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
230 nbins (int): Number of bins for the plots.
231 """
232
233 def __init__(self,
234 n_variations: int = 100,
235 weight_name: str = "Weight",
236 evaluate_plots: bool = True,
237 nbins: int = 50,
238 fillna: float = 1.0) -> None:
239 """
240 Initializes the Reweighter class.
241 """
242
243 self.n_variations = n_variations
244
245 self.particles = []
246
247 self.correlations = []
248
249 self.weight_name = weight_name
250
251 self.weights_generated = False
252
253 self.evaluate_plots = evaluate_plots
254
255 self.nbins = nbins
256
257 self.fillna = fillna
258
259 def get_bin_columns(self, weight_df) -> list:
260 """
261 Returns the kinematic bin columns of the dataframe.
262 """
263 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
264
265 def get_binning(self, weight_df) -> dict:
266 """
267 Returns the kinematic binning of the dataframe.
268 """
269 columns = self.get_bin_columns(weight_df)
270 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
271 bin_dict = {}
272 for var_name in var_names:
273 bin_dict[var_name] = []
274 for col in columns:
275 if col.startswith(var_name):
276 bin_dict[var_name] += list(weight_df[col].values)
277 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
278 return bin_dict
279
280 def get_fei_binning(self, weight_df) -> dict:
281 """
282 Returns the irregular binning of the dataframe.
283 """
284 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
285 _fei_mode_col].value_counts().index.to_list()}
286
288 ntuple_df: pd.DataFrame,
289 particle: ReweighterParticle) -> None:
290 """
291 Checks if the variables are in the ntuple and returns them.
292
293 Args:
294 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
295 particle (ReweighterParticle): Particle object containing the necessary variables.
296 """
297 ntuple_variables = particle.get_binning_variables()
298 ntuple_variables += particle.get_pdg_variables()
299 for var in ntuple_variables:
300 if var not in ntuple_df.columns:
301 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
302 return ntuple_variables
303
305 weights_dict: dict,
306 pdg_pid_variable_dict: dict) -> pd.DataFrame:
307 """
308 Merges the efficiency and fake rate weight tables.
309
310 Args:
311 weights_dict (dict): Dictionary containing the weight tables.
312 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
313 """
314 weight_dfs = []
315 for reco_pdg, mc_pdg in weights_dict:
316 if reco_pdg not in pdg_pid_variable_dict:
317 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
318 weight_df = weights_dict[(reco_pdg, mc_pdg)]
319 weight_df['mcPDG'] = mc_pdg
320 weight_df['PDG'] = reco_pdg
321 # Check if these are legacy tables:
322 if 'charge' in weight_df.columns:
323 charge_dict = {'+': [0, 2], '-': [-2, 0]}
324 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
325 weight_df = weight_df.drop(columns=['charge'])
326 # If iso_score is a single value, drop the min and max columns
327 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
328 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
329 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
330 threshold = pdg_pid_variable_dict[reco_pdg][1]
331 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
332 if len(selected_weights) == 0:
333 available_variables = weight_df['variable'].unique()
334 available_thresholds = weight_df['threshold'].unique()
335 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
336 f' variable {pid_variable_name} and threshold {threshold}!\n'
337 f' Available variables: {available_variables}\n'
338 f' Available thresholds: {available_thresholds}')
339 weight_dfs.append(selected_weights)
340 return pd.concat(weight_dfs, ignore_index=True)
341
343 ntuple_df: pd.DataFrame,
344 particle: ReweighterParticle) -> None:
345 """
346 Adds a weight and uncertainty columns to the dataframe.
347
348 Args:
349 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
350 particle (ReweighterParticle): Particle object.
351 """
352 # Apply a weight value from the weight table to the ntuple, based on the binning
353 binning_df = pd.DataFrame(index=ntuple_df.index)
354 # Take absolute value of mcPDG for binning because we have charge already
355 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
356 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
357 plot_values = {}
358 for reco_pdg, mc_pdg in particle.pdg_binning:
359 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
360 if ntuple_df.query(ntuple_cut).empty:
361 continue
362 plot_values[(reco_pdg, mc_pdg)] = {}
363 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
364 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
365 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
366 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
367 ntuple_cut)[f'{particle.get_varname(var)}'],
368 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
369 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
370 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
371 var].str[0]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
374 var].str[1]
375 binning_df.drop(var, axis=1, inplace=True)
376 if self.evaluate_plots:
377 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
378 if len(values.unique()) < 2:
379 print(f'Skip {var} for plotting!')
380 continue
381 x_range = np.linspace(values.min(), values.max(), self.nbins)
382 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
383 # merge the weight table with the ntuple on binning columns
384 weight_cols = _weight_cols
385 if particle.column_names:
386 weight_cols = particle.column_names
387 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
388 on=binning_df.columns.tolist(), how='left')
389 binning_df.index = ntuple_df.index
390 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
391 particle.plot_values = plot_values
392 for col in weight_cols:
393 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
394 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
395
397 prefix: str,
398 weights_dict: dict,
399 pdg_pid_variable_dict: dict,
400 variable_aliases: dict = None,
401 sys_seed: int = None,
402 syscorr: bool = True) -> None:
403 """
404 Adds weight variations according to the total uncertainty for easier error propagation.
405
406 Args:
407 prefix (str): Prefix for the new columns.
408 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
409 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
410 variable_aliases (dict): Dictionary containing variable aliases.
411 sys_seed (int): Seed for the systematic variations.
412 syscorr (bool): When true assume systematics are 100% correlated defaults to
413 true. Note this is overridden by provision of a None value rho_sys
414 """
415 # Empty prefix means no prefix
416 if prefix is None:
417 prefix = ''
418 # Add underscore if not present
419 if prefix and not prefix.endswith('_'):
420 prefix += '_'
421 if self.get_particle(prefix):
422 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
423 if variable_aliases is None:
424 variable_aliases = {}
425 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
426 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
427 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
428 particle = ReweighterParticle(prefix,
429 type='PID',
430 merged_table=merged_weight_df,
431 pdg_binning=pdg_binning,
432 variable_aliases=variable_aliases,
433 weight_name=self.weight_name,
434 sys_seed=sys_seed,
435 syscorr=syscorr)
436 self.particles += [particle]
437
438 def get_particle(self, prefix: str) -> ReweighterParticle:
439 """
440 Get a particle by its prefix.
441 """
442 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
443 if len(cands) == 0:
444 return None
445 return cands[0]
446
447 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
448 """
449 Checks if the tables are provided in a legacy format and converts them to the standard format.
450 """
451 result = None
452 if 'cal' in table.columns:
453 result = pd.DataFrame(index=table.index)
454 result['data_MC_ratio'] = table['cal']
455 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
456 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
457 # Assume these are only efficiency tables
458 result['mcPDG'] = result['PDG']
459 result['threshold'] = table['sig_prob_threshold']
460 result[_fei_mode_col] = table[_fei_mode_col]
461 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
462 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
463 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
464 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
465 else:
466 result = table
467 result = result.query(f'threshold == {threshold}')
468 if len(result) == 0:
469 raise ValueError(f'No weights found for threshold {threshold}!')
470 return result
471
472 def add_fei_particle(self, prefix: str,
473 table: pd.DataFrame,
474 threshold: float,
475 cov: np.ndarray = None,
476 variable_aliases: dict = None,
477 ) -> None:
478 """
479 Adds weight variations according to the total uncertainty for easier error propagation.
480
481 Args:
482 prefix (str): Prefix for the new columns.
483 table (pandas.DataFrame): Dataframe containing the efficiency weights.
484 threshold (float): Threshold for the efficiency weights.
485 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
486 variable_aliases (dict): Dictionary containing variable aliases.
487 """
488 # Empty prefix means no prefix
489 if prefix is None:
490 prefix = ''
491 if prefix and not prefix.endswith('_'):
492 prefix += '_'
493 if self.get_particle(prefix):
494 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
495 if variable_aliases is None:
496 variable_aliases = {}
497 if table is None or len(table) == 0:
498 raise ValueError('No weights provided!')
499 converted_table = self.convert_fei_table(table, threshold)
500 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
501 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
502 particle = ReweighterParticle(prefix,
503 type='FEI',
504 merged_table=converted_table,
505 pdg_binning=pdg_binning,
506 variable_aliases=variable_aliases,
507 weight_name=self.weight_name,
508 cov=cov)
509 self.particles += [particle]
510
511 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
512 """
513 Adds weight columns according to the FEI calibration tables
514 """
515 rest_str = 'Rest'
516 # Apply a weight value from the weight table to the ntuple, based on the binning
517 binning_df = pd.DataFrame(index=ntuple_df.index)
518 # Take absolute value of mcPDG for binning because we have charge already
519 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
520 # Copy the mode ID from the ntuple
521 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
522 # Default value in case if reco PDG is not a B-meson PDG
523 binning_df[_fei_mode_col] = np.nan
524 plot_values = {}
525 for reco_pdg, mc_pdg in particle.pdg_binning:
526 plot_values[(reco_pdg, mc_pdg)] = {}
527 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
528 f'PDG == {reco_pdg} and {_fei_mode_col} == "{rest_str}"')[_fei_mode_col].values[0]
529 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
530 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
531 if self.evaluate_plots:
532 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
533 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
534 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
535
536 # merge the weight table with the ntuple on binning columns
537 weight_cols = _weight_cols
538 if particle.column_names:
539 weight_cols = particle.column_names
540 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
541 on=['PDG', _fei_mode_col], how='left')
542 binning_df.index = ntuple_df.index
543 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
544 particle.plot_values = plot_values
545 for col in weight_cols:
546 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
547
548 def reweight(self,
549 df: pd.DataFrame,
550 generate_variations: bool = True):
551 """
552 Reweights the dataframe according to the weight tables.
553
554 Args:
555 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
556 generate_variations (bool): When true generate weight variations.
557 """
558 for particle in self.particles:
559 if particle.type not in _correction_types:
560 raise ValueError(f'Particle type {particle.type} not supported!')
561 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
562 if generate_variations:
563 particle.generate_variations(n_variations=self.n_variations)
564 if particle.type == 'PID':
565 self.add_pid_weight_columns(df, particle)
566 elif particle.type == 'FEI':
567 self.add_fei_weight_columns(df, particle)
568 return df
569
570 def print_coverage(self):
571 """
572 Prints the coverage of each particle.
573 """
574 print('Coverage:')
575 for particle in self.particles:
576 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
577
578 def plot_coverage(self):
579 """
580 Plots the coverage of each particle.
581 """
582 for particle in self.particles:
583 particle.plot_coverage()
584
585
586def add_weights_to_dataframe(prefix: str,
587 df: pd.DataFrame,
588 systematic: str,
589 custom_tables: dict = None,
590 custom_thresholds: dict = None,
591 **kw_args) -> pd.DataFrame:
592 """
593 Helper method that adds weights to a dataframe.
594
595 Args:
596 prefix (str): Prefix for the new columns.
597 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
598 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
599 MC_production (str): Name of the MC production.
600 custom_tables (dict): Dictionary containing the custom efficiency weights.
601 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
602 n_variations (int): Number of variations to generate.
603 generate_variations (bool): When true generate weight variations.
604 weight_name (str): Name of the weight column.
605 show_plots (bool): When true show the coverage plots.
606 variable_aliases (dict): Dictionary containing variable aliases.
607 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
608 fillna (int): Value to fill NaN values with.
609 sys_seed (int): Seed for the systematic variations.
610 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
611 **kw_args: Additional arguments for the Reweighter class.
612 """
613 generate_variations = kw_args.get('generate_variations', True)
614 n_variations = kw_args.get('n_variations', 100)
615 weight_name = kw_args.get('weight_name', "Weight")
616 fillna = kw_args.get('fillna', 1.0)
617 reweighter = Reweighter(n_variations=n_variations,
618 weight_name=weight_name,
619 fillna=fillna)
620 variable_aliases = kw_args.get('variable_aliases')
621 if systematic.lower() == 'custom_fei':
622 cov_matrix = kw_args.get('cov_matrix')
623 reweighter.add_fei_particle(prefix=prefix,
624 table=custom_tables,
625 threshold=custom_thresholds,
626 variable_aliases=variable_aliases,
627 cov=cov_matrix
628 )
629 elif systematic.lower() == 'custom_pid':
630 sys_seed = kw_args.get('sys_seed')
631 syscorr = kw_args.get('syscorr')
632 if syscorr is None:
633 syscorr = True
634 reweighter.add_pid_particle(prefix=prefix,
635 weights_dict=custom_tables,
636 pdg_pid_variable_dict=custom_thresholds,
637 variable_aliases=variable_aliases,
638 sys_seed=sys_seed,
639 syscorr=syscorr
640 )
641 else:
642 raise ValueError(f'Systematic {systematic} is not supported!')
643
644 result = reweighter.reweight(df, generate_variations=generate_variations)
645 if kw_args.get('show_plots'):
646 reweighter.print_coverage()
647 reweighter.plot_coverage()
648 return result
str weight_name
Weight column name that will be added to the ntuple.
Definition: sysvar.py:54
str type
Type of the particle (PID or FEI)
Definition: sysvar.py:42
pd merged_table
Merged table of the weights.
Definition: sysvar.py:45
dict pdg_binning
Kinematic binning of the weight table per particle.
Definition: sysvar.py:48
np cov
Covariance matrix corresponds to the total uncertainty.
Definition: sysvar.py:63
str get_varname(self, str varname)
Definition: sysvar.py:74
str prefix
Prefix of the particle in the ntuple.
Definition: sysvar.py:39
int sys_seed
Random seed for systematics.
Definition: sysvar.py:60
list get_pdg_variables(self)
Definition: sysvar.py:92
type
Add the mcPDG code requirement for PID particle.
Definition: sysvar.py:98
list get_binning_variables(self)
Definition: sysvar.py:85
dict plot_values
Values for the plots.
Definition: sysvar.py:72
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:126
bool syscorr
When true assume systematics are 100% correlated.
Definition: sysvar.py:66
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:105
def plot_coverage(self, fig=None, axs=None)
Definition: sysvar.py:168
list column_names
Internal list of the names of the weight columns.
Definition: sysvar.py:57
dict variable_aliases
Variable aliases of the weight table.
Definition: sysvar.py:51
column_names
Names of the varied weight columns.
Definition: sysvar.py:116
n_variations
Number of weight variations to generate.
Definition: sysvar.py:243
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition: sysvar.py:306
nbins
Number of bins for the plots.
Definition: sysvar.py:255
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition: sysvar.py:402
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition: sysvar.py:238
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:344
weight_name
Name of the weight column.
Definition: sysvar.py:249
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:511
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:289
fillna
Value to fill NaN values.
Definition: sysvar.py:257
weights_generated
Flag to indicate if the weights have been generated.
Definition: sysvar.py:251
dict get_fei_binning(self, weight_df)
Definition: sysvar.py:280
def reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition: sysvar.py:550
def convert_fei_table(self, pd.DataFrame table, float threshold)
Definition: sysvar.py:447
def print_coverage(self)
Definition: sysvar.py:570
dict get_binning(self, weight_df)
Definition: sysvar.py:265
list get_bin_columns(self, weight_df)
Definition: sysvar.py:259
correlations
Correlations between the particles.
Definition: sysvar.py:247
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition: sysvar.py:477
ReweighterParticle get_particle(self, str prefix)
Definition: sysvar.py:438
particles
List of particles.
Definition: sysvar.py:245
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition: sysvar.py:253
def plot_coverage(self)
Definition: sysvar.py:578