Belle II Software light-2406-ragdoll
sysvar.py
1#!/usr/bin/env python3
2
3
10
11import pandas as pd
12import numpy as np
13from dataclasses import dataclass
14import matplotlib.pyplot as plt
15import pdg
16
17"""
18A module that adds corrections to analysis dataframe.
19It adds weight variations according to the total uncertainty for easier error propagation.
20"""
21
22_weight_cols = ['data_MC_ratio',
23 'data_MC_uncertainty_stat_dn',
24 'data_MC_uncertainty_stat_up',
25 'data_MC_uncertainty_sys_dn',
26 'data_MC_uncertainty_sys_up'
27 ]
28
29_correction_types = ['PID', 'FEI']
30_fei_mode_col = 'dec_mode'
31
32
33@dataclass
35 """
36 Class that stores the information of a particle.
37 """
38
39 prefix: str
40
41
42 type: str
43
44
45 merged_table: pd.DataFrame
46
47
48 pdg_binning: dict
49
50
51 variable_aliases: dict
52
53
54 weight_name: str
55
56
57 column_names: list = None
58
59
60 sys_seed: int = None
61
62
63 cov: np.ndarray = None
64
65
66 syscorr: bool = True
67
68
69 coverage: float = None
70
71
72 plot_values: dict = None
73
74 def get_varname(self, varname: str) -> str:
75 """
76 Returns the variable name with the prefix and use alias if defined.
77 """
78 name = varname
79 if self.variable_aliases and varname in self.variable_aliases:
80 name = self.variable_aliases[varname]
81 if name.startswith(self.prefix):
82 return name
83 return f'{self.prefix}{name}'
84
85 def get_binning_variables(self) -> list:
86 """
87 Returns the list of variables that are used for the binning
88 """
89 variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))
90 return [f'{self.get_varname(var)}' for var in variables]
91
92 def get_pdg_variables(self) -> list:
93 """
94 Returns the list of variables that are used for the PDG codes
95 """
96 pdg_vars = ['PDG']
97
98 if self.typetype == "PID":
99 pdg_vars += ['mcPDG']
100 return [f'{self.get_varname(var)}' for var in pdg_vars]
101
103 n_variations: int,
104 rho_sys: np.ndarray = None,
105 rho_stat: np.ndarray = None) -> None:
106 """
107 Generates variations of weights according to the uncertainties
108 """
109 self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",
110 "data_MC_uncertainty_stat_dn"]].max(axis=1)
111 self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",
112 "data_MC_uncertainty_sys_dn"]].max(axis=1)
113 self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)
114 means = self.merged_table["data_MC_ratio"].values
115
116 self.column_namescolumn_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]
117 cov = self.get_covariance(n_variations, rho_sys, rho_stat)
118 weights = cov + means
119 self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]
120 self.merged_table[self.column_namescolumn_names] = weights.T
121 self.column_namescolumn_names.insert(0, self.weight_name)
122
124 n_variations: int,
125 rho_sys: np.ndarray = None,
126 rho_stat: np.ndarray = None) -> np.ndarray:
127 """
128 Returns the covariance matrix of the weights
129 """
130 len_means = len(self.merged_table["data_MC_ratio"])
131 zeros = np.zeros(len_means)
132 if self.cov is None:
133 if rho_sys is None:
134 if self.syscorr:
135 rho_sys = np.ones((len_means, len_means))
136 else:
137 rho_sys = np.identity(len_means)
138 if rho_stat is None:
139 rho_stat = np.identity(len_means)
140 sys_cov = np.matmul(
141 np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])
142 )
143 stat_cov = np.matmul(
144 np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])
145 )
146 np.random.seed(self.sys_seed)
147 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
148 np.random.seed(None)
149 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
150 return sys + stat
151 errors = np.random.multivariate_normal(zeros, self.cov, n_variations)
152 return errors
153
154 def __str__(self) -> str:
155 """
156 Converts the object to a string.
157 """
158 separator = '------------------'
159 title = 'ReweighterParticle'
160 prefix_str = f'Type: {self.type} Prefix: {self.prefix}'
161 columns = _weight_cols
162 merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'
163 pdg_binning_str = 'PDG binning:\n'
164 for pdgs in self.pdg_binning:
165 pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'
166 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
167
168 def plot_coverage(self, fig=None, axs=None):
169 """
170 Plots the coverage of the ntuple.
171 """
172 if self.plot_values is None:
173 return
174 vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))
175 if fig is None:
176 fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)
177 axs = np.array(axs)
178 if len(axs.shape) < 1:
179 axs = axs.reshape(len(self.plot_values), len(vars))
180 bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}
181 fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')
182 for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):
183 for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):
184 ymin = 0
185 ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
186 # Plot binning
187 if self.typetype == 'PID':
188 ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
189 label='Binning',
190 alpha=0.8,
191 **bin_plt)
192 elif self.typetype == 'FEI':
193 values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])
194 ax.bar(values+0.5,
195 np.ones(len(values))*ymax,
196 width=1,
197 alpha=0.5,
198 label='Binning',
199 **bin_plt)
200 rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)
201 ax.bar(rest+0.5,
202 np.ones(len(rest))*ymax,
203 width=1,
204 alpha=0.2,
205 label='Rest category',
206 **bin_plt)
207 # Plot values
208 widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
209 centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
210 ax.bar(centers,
211 self.plot_values[(reco_pdg, mc_pdg)][var][1],
212 width=widths,
213 label='Values',
214 alpha=0.8)
215 ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
216 ax.set_xlabel(var)
217 axs[-1][-1].legend()
218 fig.tight_layout()
219 return fig, axs
220
221
223 """
224 Class that reweights the dataframe.
225
226 Args:
227 n_variations (int): Number of weight variations to generate.
228 weight_name (str): Name of the weight column.
229 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
230 nbins (int): Number of bins for the plots.
231 """
232
233 def __init__(self,
234 n_variations: int = 100,
235 weight_name: str = "Weight",
236 evaluate_plots: bool = True,
237 nbins: int = 50,
238 fillna: float = 1.0) -> None:
239 """
240 Initializes the Reweighter class.
241 """
242
243 self.n_variations = n_variations
244
245 self.particles = []
246
247 self.correlations = []
248
249 self.weight_name = weight_name
250
251 self.weights_generated = False
252
253 self.evaluate_plots = evaluate_plots
254
255 self.nbins = nbins
256
257 self.fillna = fillna
258
259 def get_bin_columns(self, weight_df) -> list:
260 """
261 Returns the kinematic bin columns of the dataframe.
262 """
263 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
264
265 def get_binning(self, weight_df) -> dict:
266 """
267 Returns the kinematic binning of the dataframe.
268 """
269 columns = self.get_bin_columns(weight_df)
270 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
271 bin_dict = {}
272 for var_name in var_names:
273 bin_dict[var_name] = []
274 for col in columns:
275 if col.startswith(var_name):
276 bin_dict[var_name] += list(weight_df[col].values)
277 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
278 return bin_dict
279
280 def get_fei_binning(self, weight_df) -> dict:
281 """
282 Returns the irregular binning of the dataframe.
283 """
284 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
285 _fei_mode_col].value_counts().index.to_list()}
286
288 ntuple_df: pd.DataFrame,
289 particle: ReweighterParticle) -> None:
290 """
291 Checks if the variables are in the ntuple and returns them.
292
293 Args:
294 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
295 particle (ReweighterParticle): Particle object containing the necessary variables.
296 """
297 ntuple_variables = particle.get_binning_variables()
298 ntuple_variables += particle.get_pdg_variables()
299 for var in ntuple_variables:
300 if var not in ntuple_df.columns:
301 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
302 return ntuple_variables
303
305 weights_dict: dict,
306 pdg_pid_variable_dict: dict) -> pd.DataFrame:
307 """
308 Merges the efficiency and fake rate weight tables.
309
310 Args:
311 weights_dict (dict): Dictionary containing the weight tables.
312 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
313 """
314 weight_dfs = []
315 for reco_pdg, mc_pdg in weights_dict:
316 if reco_pdg not in pdg_pid_variable_dict:
317 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
318 weight_df = weights_dict[(reco_pdg, mc_pdg)]
319 weight_df['mcPDG'] = mc_pdg
320 weight_df['PDG'] = reco_pdg
321 # Check if these are legacy tables:
322 if 'charge' in weight_df.columns:
323 charge_dict = {'+': [0, 2], '-': [-2, 0]}
324 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
325 weight_df = weight_df.drop(columns=['charge'])
326 # If iso_score is a single value, drop the min and max columns
327 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
328 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
329 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
330 threshold = pdg_pid_variable_dict[reco_pdg][1]
331 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
332 if len(selected_weights) == 0:
333 available_variables = weight_df['variable'].unique()
334 available_thresholds = weight_df['threshold'].unique()
335 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
336 f' variable {pid_variable_name} and threshold {threshold}!\n'
337 f' Available variables: {available_variables}\n'
338 f' Available thresholds: {available_thresholds}')
339 weight_dfs.append(selected_weights)
340 return pd.concat(weight_dfs, ignore_index=True)
341
343 ntuple_df: pd.DataFrame,
344 particle: ReweighterParticle) -> None:
345 """
346 Adds a weight and uncertainty columns to the dataframe.
347
348 Args:
349 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
350 particle (ReweighterParticle): Particle object.
351 """
352 # Apply a weight value from the weight table to the ntuple, based on the binning
353 binning_df = pd.DataFrame(index=ntuple_df.index)
354 # Take absolute value of mcPDG for binning because we have charge already
355 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
356 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
357 plot_values = {}
358 for reco_pdg, mc_pdg in particle.pdg_binning:
359 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
360 if ntuple_df.query(ntuple_cut).empty:
361 continue
362 plot_values[(reco_pdg, mc_pdg)] = {}
363 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
364 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
365 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
366 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
367 ntuple_cut)[f'{particle.get_varname(var)}'],
368 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
369 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
370 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
371 var].str[0]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
374 var].str[1]
375 binning_df.drop(var, axis=1, inplace=True)
376 if self.evaluate_plots:
377 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
378 if len(values.unique()) < 2:
379 print(f'Skip {var} for plotting!')
380 continue
381 x_range = np.linspace(values.min(), values.max(), self.nbins)
382 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
383 # merge the weight table with the ntuple on binning columns
384 weight_cols = _weight_cols
385 if particle.column_names:
386 weight_cols = particle.column_names
387 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
388 on=binning_df.columns.tolist(), how='left')
389 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
390 particle.plot_values = plot_values
391 for col in weight_cols:
392 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
393 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
394
396 prefix: str,
397 weights_dict: dict,
398 pdg_pid_variable_dict: dict,
399 variable_aliases: dict = None,
400 sys_seed: int = None,
401 syscorr: bool = True) -> None:
402 """
403 Adds weight variations according to the total uncertainty for easier error propagation.
404
405 Args:
406 prefix (str): Prefix for the new columns.
407 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
408 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
409 variable_aliases (dict): Dictionary containing variable aliases.
410 sys_seed (int): Seed for the systematic variations.
411 syscorr (bool): When true assume systematics are 100% correlated defaults to
412 true. Note this is overridden by provision of a None value rho_sys
413 """
414 # Empty prefix means no prefix
415 if prefix is None:
416 prefix = ''
417 # Add underscore if not present
418 if prefix and not prefix.endswith('_'):
419 prefix += '_'
420 if self.get_particle(prefix):
421 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
422 if variable_aliases is None:
423 variable_aliases = {}
424 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
425 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
426 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
427 particle = ReweighterParticle(prefix,
428 type='PID',
429 merged_table=merged_weight_df,
430 pdg_binning=pdg_binning,
431 variable_aliases=variable_aliases,
432 weight_name=self.weight_name,
433 sys_seed=sys_seed,
434 syscorr=syscorr)
435 self.particles += [particle]
436
437 def get_particle(self, prefix: str) -> ReweighterParticle:
438 """
439 Get a particle by its prefix.
440 """
441 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
442 if len(cands) == 0:
443 return None
444 return cands[0]
445
446 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
447 """
448 Checks if the tables are provided in a legacy format and converts them to the standard format.
449 """
450 result = None
451 if 'cal' in table.columns:
452 result = pd.DataFrame(index=table.index)
453 result['data_MC_ratio'] = table['cal']
454 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
455 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
456 # Assume these are only efficiency tables
457 result['mcPDG'] = result['PDG']
458 result['threshold'] = table['sig_prob_threshold']
459 result[_fei_mode_col] = table[_fei_mode_col]
460 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
461 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
462 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
463 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
464 else:
465 result = table
466 result = result.query(f'threshold == {threshold}')
467 if len(result) == 0:
468 raise ValueError(f'No weights found for threshold {threshold}!')
469 return result
470
471 def add_fei_particle(self, prefix: str,
472 table: pd.DataFrame,
473 threshold: float,
474 cov: np.ndarray = None,
475 variable_aliases: dict = None,
476 ) -> None:
477 """
478 Adds weight variations according to the total uncertainty for easier error propagation.
479
480 Args:
481 prefix (str): Prefix for the new columns.
482 table (pandas.DataFrame): Dataframe containing the efficiency weights.
483 threshold (float): Threshold for the efficiency weights.
484 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
485 variable_aliases (dict): Dictionary containing variable aliases.
486 """
487 # Empty prefix means no prefix
488 if prefix is None:
489 prefix = ''
490 if prefix and not prefix.endswith('_'):
491 prefix += '_'
492 if self.get_particle(prefix):
493 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
494 if variable_aliases is None:
495 variable_aliases = {}
496 if table is None or len(table) == 0:
497 raise ValueError('No weights provided!')
498 converted_table = self.convert_fei_table(table, threshold)
499 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
500 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
501 particle = ReweighterParticle(prefix,
502 type='FEI',
503 merged_table=converted_table,
504 pdg_binning=pdg_binning,
505 variable_aliases=variable_aliases,
506 weight_name=self.weight_name,
507 cov=cov)
508 self.particles += [particle]
509
510 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
511 """
512 Adds weight columns according to the FEI calibration tables
513 """
514 rest_str = 'Rest'
515 # Apply a weight value from the weight table to the ntuple, based on the binning
516 binning_df = pd.DataFrame(index=ntuple_df.index)
517 # Take absolute value of mcPDG for binning because we have charge already
518 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
519 # Copy the mode ID from the ntuple
520 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
521 # Default value in case if reco PDG is not a B-meson PDG
522 binning_df[_fei_mode_col] = np.nan
523 plot_values = {}
524 for reco_pdg, mc_pdg in particle.pdg_binning:
525 plot_values[(reco_pdg, mc_pdg)] = {}
526 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
527 f'PDG == {reco_pdg} and {_fei_mode_col} == "{rest_str}"')[_fei_mode_col].values[0]
528 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
529 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
530 if self.evaluate_plots:
531 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
532 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
533 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
534
535 # merge the weight table with the ntuple on binning columns
536 weight_cols = _weight_cols
537 if particle.column_names:
538 weight_cols = particle.column_names
539 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
540 on=['PDG', _fei_mode_col], how='left')
541 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
542 particle.plot_values = plot_values
543 for col in weight_cols:
544 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
545
546 def reweight(self,
547 df: pd.DataFrame,
548 generate_variations: bool = True):
549 """
550 Reweights the dataframe according to the weight tables.
551
552 Args:
553 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
554 generate_variations (bool): When true generate weight variations.
555 """
556 for particle in self.particles:
557 if particle.type not in _correction_types:
558 raise ValueError(f'Particle type {particle.type} not supported!')
559 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
560 if generate_variations:
561 particle.generate_variations(n_variations=self.n_variations)
562 if particle.type == 'PID':
563 self.add_pid_weight_columns(df, particle)
564 elif particle.type == 'FEI':
565 self.add_fei_weight_columns(df, particle)
566 return df
567
568 def print_coverage(self):
569 """
570 Prints the coverage of each particle.
571 """
572 print('Coverage:')
573 for particle in self.particles:
574 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
575
576 def plot_coverage(self):
577 """
578 Plots the coverage of each particle.
579 """
580 for particle in self.particles:
581 particle.plot_coverage()
582
583
584def add_weights_to_dataframe(prefix: str,
585 df: pd.DataFrame,
586 systematic: str,
587 custom_tables: dict = None,
588 custom_thresholds: dict = None,
589 **kw_args) -> pd.DataFrame:
590 """
591 Helper method that adds weights to a dataframe.
592
593 Args:
594 prefix (str): Prefix for the new columns.
595 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
596 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
597 MC_production (str): Name of the MC production.
598 custom_tables (dict): Dictionary containing the custom efficiency weights.
599 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
600 n_variations (int): Number of variations to generate.
601 generate_variations (bool): When true generate weight variations.
602 weight_name (str): Name of the weight column.
603 show_plots (bool): When true show the coverage plots.
604 variable_aliases (dict): Dictionary containing variable aliases.
605 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
606 fillna (int): Value to fill NaN values with.
607 sys_seed (int): Seed for the systematic variations.
608 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
609 **kw_args: Additional arguments for the Reweighter class.
610 """
611 generate_variations = kw_args.get('generate_variations', True)
612 n_variations = kw_args.get('n_variations', 100)
613 weight_name = kw_args.get('weight_name', "Weight")
614 fillna = kw_args.get('fillna', 1.0)
615 reweighter = Reweighter(n_variations=n_variations,
616 weight_name=weight_name,
617 fillna=fillna)
618 variable_aliases = kw_args.get('variable_aliases')
619 if systematic.lower() == 'custom_fei':
620 cov_matrix = kw_args.get('cov_matrix')
621 reweighter.add_fei_particle(prefix=prefix,
622 table=custom_tables,
623 threshold=custom_thresholds,
624 variable_aliases=variable_aliases,
625 cov=cov_matrix
626 )
627 elif systematic.lower() == 'custom_pid':
628 sys_seed = kw_args.get('sys_seed')
629 syscorr = kw_args.get('syscorr')
630 if syscorr is None:
631 syscorr = True
632 reweighter.add_pid_particle(prefix=prefix,
633 weights_dict=custom_tables,
634 pdg_pid_variable_dict=custom_thresholds,
635 variable_aliases=variable_aliases,
636 sys_seed=sys_seed,
637 syscorr=syscorr
638 )
639 else:
640 raise ValueError(f'Systematic {systematic} is not supported!')
641
642 result = reweighter.reweight(df, generate_variations=generate_variations)
643 if kw_args.get('show_plots'):
644 reweighter.print_coverage()
645 reweighter.plot_coverage()
646 return result
str weight_name
Weight column name that will be added to the ntuple.
Definition: sysvar.py:54
str type
Type of the particle (PID or FEI)
Definition: sysvar.py:42
pd merged_table
Merged table of the weights.
Definition: sysvar.py:45
dict pdg_binning
Kinematic binning of the weight table per particle.
Definition: sysvar.py:48
np cov
Covariance matrix corresponds to the total uncertainty.
Definition: sysvar.py:63
str get_varname(self, str varname)
Definition: sysvar.py:74
str prefix
Prefix of the particle in the ntuple.
Definition: sysvar.py:39
int sys_seed
Random seed for systematics.
Definition: sysvar.py:60
list get_pdg_variables(self)
Definition: sysvar.py:92
type
Add the mcPDG code requirement for PID particle.
Definition: sysvar.py:98
list get_binning_variables(self)
Definition: sysvar.py:85
dict plot_values
Values for the plots.
Definition: sysvar.py:72
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:126
bool syscorr
When true assume systematics are 100% correlated.
Definition: sysvar.py:66
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:105
def plot_coverage(self, fig=None, axs=None)
Definition: sysvar.py:168
list column_names
Internal list of the names of the weight columns.
Definition: sysvar.py:57
dict variable_aliases
Variable aliases of the weight table.
Definition: sysvar.py:51
column_names
Names of the varied weight columns.
Definition: sysvar.py:116
n_variations
Number of weight variations to generate.
Definition: sysvar.py:243
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition: sysvar.py:306
nbins
Number of bins for the plots.
Definition: sysvar.py:255
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition: sysvar.py:401
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition: sysvar.py:238
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:344
weight_name
Name of the weight column.
Definition: sysvar.py:249
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:510
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:289
fillna
Value to fill NaN values.
Definition: sysvar.py:257
weights_generated
Flag to indicate if the weights have been generated.
Definition: sysvar.py:251
dict get_fei_binning(self, weight_df)
Definition: sysvar.py:280
def reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition: sysvar.py:548
def convert_fei_table(self, pd.DataFrame table, float threshold)
Definition: sysvar.py:446
def print_coverage(self)
Definition: sysvar.py:568
dict get_binning(self, weight_df)
Definition: sysvar.py:265
list get_bin_columns(self, weight_df)
Definition: sysvar.py:259
correlations
Correlations between the particles.
Definition: sysvar.py:247
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition: sysvar.py:476
ReweighterParticle get_particle(self, str prefix)
Definition: sysvar.py:437
particles
List of particles.
Definition: sysvar.py:245
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition: sysvar.py:253
def plot_coverage(self)
Definition: sysvar.py:576