Belle II Software development
sysvar.py
1#!/usr/bin/env python3
2
3
10
11import pandas as pd
12import numpy as np
13from dataclasses import dataclass
14import matplotlib.pyplot as plt
15import pdg
16import warnings
17from pandas.errors import PerformanceWarning
18
19"""
20A module that adds corrections to analysis dataframe.
21It adds weight variations according to the total uncertainty for easier error propagation.
22"""
23
24_weight_cols = ['data_MC_ratio',
25 'data_MC_uncertainty_stat_dn',
26 'data_MC_uncertainty_stat_up',
27 'data_MC_uncertainty_sys_dn',
28 'data_MC_uncertainty_sys_up'
29 ]
30
31_correction_types = ['PID', 'FEI']
32_fei_mode_col = 'dec_mode'
33
34
35@dataclass
37 """
38 Class that stores the information of a particle.
39 """
40
41 prefix: str
42
43
44 type: str
45
46
47 merged_table: pd.DataFrame
48
49
50 pdg_binning: dict
51
52
53 variable_aliases: dict
54
55
56 weight_name: str
57
58
59 column_names: list = None
60
61
62 sys_seed: int = None
63
64
65 cov: np.ndarray = None
66
67
68 syscorr: bool = True
69
70
71 coverage: float = None
72
73
74 plot_values: dict = None
75
76 def get_varname(self, varname: str) -> str:
77 """
78 Returns the variable name with the prefix and use alias if defined.
79 """
80 name = varname
81 if self.variable_aliases and varname in self.variable_aliases:
82 name = self.variable_aliases[varname]
83 if name.startswith(self.prefix):
84 return name
85 return f'{self.prefix}{name}'
86
87 def get_binning_variables(self) -> list:
88 """
89 Returns the list of variables that are used for the binning
90 """
91 variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))
92 return [f'{self.get_varname(var)}' for var in variables]
93
94 def get_pdg_variables(self) -> list:
95 """
96 Returns the list of variables that are used for the PDG codes
97 """
98 pdg_vars = ['PDG']
99
100 if self.typetype == "PID":
101 pdg_vars += ['mcPDG']
102 return [f'{self.get_varname(var)}' for var in pdg_vars]
103
105 n_variations: int,
106 rho_sys: np.ndarray = None,
107 rho_stat: np.ndarray = None) -> None:
108 """
109 Generates variations of weights according to the uncertainties
110 """
111 self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",
112 "data_MC_uncertainty_stat_dn"]].max(axis=1)
113 self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",
114 "data_MC_uncertainty_sys_dn"]].max(axis=1)
115 self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)
116 means = self.merged_table["data_MC_ratio"].values
117
118 self.column_namescolumn_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]
119 cov = self.get_covariance(n_variations, rho_sys, rho_stat)
120 weights = cov + means
121 self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]
122 self.merged_table[self.column_namescolumn_names] = weights.T
123 self.column_namescolumn_names.insert(0, self.weight_name)
124
126 n_variations: int,
127 rho_sys: np.ndarray = None,
128 rho_stat: np.ndarray = None) -> np.ndarray:
129 """
130 Returns the covariance matrix of the weights
131 """
132 len_means = len(self.merged_table["data_MC_ratio"])
133 zeros = np.zeros(len_means)
134 if self.cov is None:
135 if rho_sys is None:
136 if self.syscorr:
137 rho_sys = np.ones((len_means, len_means))
138 else:
139 rho_sys = np.identity(len_means)
140 if rho_stat is None:
141 rho_stat = np.identity(len_means)
142 sys_cov = np.matmul(
143 np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])
144 )
145 stat_cov = np.matmul(
146 np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])
147 )
148 np.random.seed(self.sys_seed)
149 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
150 np.random.seed(None)
151 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
152 return sys + stat
153 errors = np.random.multivariate_normal(zeros, self.cov, n_variations)
154 return errors
155
156 def __str__(self) -> str:
157 """
158 Converts the object to a string.
159 """
160 separator = '------------------'
161 title = 'ReweighterParticle'
162 prefix_str = f'Type: {self.type} Prefix: {self.prefix}'
163 columns = _weight_cols
164 merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'
165 pdg_binning_str = 'PDG binning:\n'
166 for pdgs in self.pdg_binning:
167 pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'
168 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
169
170 def plot_coverage(self, fig=None, axs=None):
171 """
172 Plots the coverage of the ntuple.
173 """
174 if self.plot_values is None:
175 return
176 vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))
177 if fig is None:
178 fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)
179 axs = np.array(axs)
180 if len(axs.shape) < 1:
181 axs = axs.reshape(len(self.plot_values), len(vars))
182 bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}
183 fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')
184 for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):
185 for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):
186 ymin = 0
187 ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
188 # Plot binning
189 if self.typetype == 'PID':
190 ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
191 label='Binning',
192 alpha=0.8,
193 **bin_plt)
194 elif self.typetype == 'FEI':
195 values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])
196 ax.bar(values+0.5,
197 np.ones(len(values))*ymax,
198 width=1,
199 alpha=0.5,
200 label='Binning',
201 **bin_plt)
202 rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)
203 ax.bar(rest+0.5,
204 np.ones(len(rest))*ymax,
205 width=1,
206 alpha=0.2,
207 label='Rest category',
208 **bin_plt)
209 # Plot values
210 widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
211 centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
212 ax.bar(centers,
213 self.plot_values[(reco_pdg, mc_pdg)][var][1],
214 width=widths,
215 label='Values',
216 alpha=0.8)
217 ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
218 ax.set_xlabel(var)
219 axs[-1][-1].legend()
220 fig.tight_layout()
221 return fig, axs
222
223
225 """
226 Class that reweights the dataframe.
227
228 Args:
229 n_variations (int): Number of weight variations to generate.
230 weight_name (str): Name of the weight column.
231 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
232 nbins (int): Number of bins for the plots.
233 """
234
235 def __init__(self,
236 n_variations: int = 100,
237 weight_name: str = "Weight",
238 evaluate_plots: bool = True,
239 nbins: int = 50,
240 fillna: float = 1.0) -> None:
241 """
242 Initializes the Reweighter class.
243 """
244
245 self.n_variations = n_variations
246
247 self.particles = []
248
249 self.correlations = []
250
251 self.weight_name = weight_name
252
253 self.weights_generated = False
254
255 self.evaluate_plots = evaluate_plots
256
257 self.nbins = nbins
258
259 self.fillna = fillna
260
261 def get_bin_columns(self, weight_df) -> list:
262 """
263 Returns the kinematic bin columns of the dataframe.
264 """
265 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
266
267 def get_binning(self, weight_df) -> dict:
268 """
269 Returns the kinematic binning of the dataframe.
270 """
271 columns = self.get_bin_columns(weight_df)
272 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
273 bin_dict = {}
274 for var_name in var_names:
275 bin_dict[var_name] = []
276 for col in columns:
277 if col.startswith(var_name):
278 bin_dict[var_name] += list(weight_df[col].values)
279 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
280 return bin_dict
281
282 def get_fei_binning(self, weight_df) -> dict:
283 """
284 Returns the irregular binning of the dataframe.
285 """
286 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
287 _fei_mode_col].value_counts().index.to_list()}
288
290 ntuple_df: pd.DataFrame,
291 particle: ReweighterParticle) -> None:
292 """
293 Checks if the variables are in the ntuple and returns them.
294
295 Args:
296 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
297 particle (ReweighterParticle): Particle object containing the necessary variables.
298 """
299 ntuple_variables = particle.get_binning_variables()
300 ntuple_variables += particle.get_pdg_variables()
301 for var in ntuple_variables:
302 if var not in ntuple_df.columns:
303 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
304 return ntuple_variables
305
307 weights_dict: dict,
308 pdg_pid_variable_dict: dict) -> pd.DataFrame:
309 """
310 Merges the efficiency and fake rate weight tables.
311
312 Args:
313 weights_dict (dict): Dictionary containing the weight tables.
314 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
315 """
316 weight_dfs = []
317 for reco_pdg, mc_pdg in weights_dict:
318 if reco_pdg not in pdg_pid_variable_dict:
319 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
320 weight_df = weights_dict[(reco_pdg, mc_pdg)]
321 weight_df['mcPDG'] = mc_pdg
322 weight_df['PDG'] = reco_pdg
323 # Check if these are legacy tables:
324 if 'charge' in weight_df.columns:
325 charge_dict = {'+': [0, 2], '-': [-2, 0]}
326 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
327 weight_df = weight_df.drop(columns=['charge'])
328 # If iso_score is a single value, drop the min and max columns
329 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
330 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
331 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
332 threshold = pdg_pid_variable_dict[reco_pdg][1]
333 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
334 if len(selected_weights) == 0:
335 available_variables = weight_df['variable'].unique()
336 available_thresholds = weight_df['threshold'].unique()
337 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
338 f' variable {pid_variable_name} and threshold {threshold}!\n'
339 f' Available variables: {available_variables}\n'
340 f' Available thresholds: {available_thresholds}')
341 weight_dfs.append(selected_weights)
342 return pd.concat(weight_dfs, ignore_index=True)
343
345 ntuple_df: pd.DataFrame,
346 particle: ReweighterParticle) -> None:
347 """
348 Adds a weight and uncertainty columns to the dataframe.
349
350 Args:
351 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
352 particle (ReweighterParticle): Particle object.
353 """
354 # Apply a weight value from the weight table to the ntuple, based on the binning
355 binning_df = pd.DataFrame(index=ntuple_df.index)
356 # Take absolute value of mcPDG for binning because we have charge already
357 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
358 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
359 plot_values = {}
360 for reco_pdg, mc_pdg in particle.pdg_binning:
361 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
362 if ntuple_df.query(ntuple_cut).empty:
363 continue
364 plot_values[(reco_pdg, mc_pdg)] = {}
365 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
366 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
367 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
368 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
369 ntuple_cut)[f'{particle.get_varname(var)}'],
370 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
371 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
372 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 var].str[0]
374 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
375 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
376 var].str[1]
377 binning_df.drop(var, axis=1, inplace=True)
378 if self.evaluate_plots:
379 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
380 if len(values.unique()) < 2:
381 print(f'Skip {var} for plotting!')
382 continue
383 x_range = np.linspace(values.min(), values.max(), self.nbins)
384 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
385 # merge the weight table with the ntuple on binning columns
386 weight_cols = _weight_cols
387 if particle.column_names:
388 weight_cols = particle.column_names
389 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
390 on=binning_df.columns.tolist(), how='left')
391 binning_df.index = ntuple_df.index
392 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
393 particle.plot_values = plot_values
394 for col in weight_cols:
395 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
396 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
397
399 prefix: str,
400 weights_dict: dict,
401 pdg_pid_variable_dict: dict,
402 variable_aliases: dict = None,
403 sys_seed: int = None,
404 syscorr: bool = True) -> None:
405 """
406 Adds weight variations according to the total uncertainty for easier error propagation.
407
408 Args:
409 prefix (str): Prefix for the new columns.
410 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
411 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
412 variable_aliases (dict): Dictionary containing variable aliases.
413 sys_seed (int): Seed for the systematic variations.
414 syscorr (bool): When true assume systematics are 100% correlated defaults to
415 true. Note this is overridden by provision of a None value rho_sys
416 """
417 # Empty prefix means no prefix
418 if prefix is None:
419 prefix = ''
420 # Add underscore if not present
421 if prefix and not prefix.endswith('_'):
422 prefix += '_'
423 if self.get_particle(prefix):
424 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
425 if variable_aliases is None:
426 variable_aliases = {}
427 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
428 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
429 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
430 particle = ReweighterParticle(prefix,
431 type='PID',
432 merged_table=merged_weight_df,
433 pdg_binning=pdg_binning,
434 variable_aliases=variable_aliases,
435 weight_name=self.weight_name,
436 sys_seed=sys_seed,
437 syscorr=syscorr)
438 self.particles += [particle]
439
440 def get_particle(self, prefix: str) -> ReweighterParticle:
441 """
442 Get a particle by its prefix.
443 """
444 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
445 if len(cands) == 0:
446 return None
447 return cands[0]
448
449 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
450 """
451 Checks if the tables are provided in a legacy format and converts them to the standard format.
452 """
453 result = None
454 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
455 if 'cal' in table.columns:
456 result = pd.DataFrame(index=table.index)
457 result['data_MC_ratio'] = table['cal']
458 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
459 # Assume these are only efficiency tables
460 result['mcPDG'] = result['PDG']
461 result['threshold'] = table['sig_prob_threshold']
462 result[_fei_mode_col] = table[_fei_mode_col]
463 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
464 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
465 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
466 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
467 elif 'cal factor' in table.columns:
468 result = pd.DataFrame(index=table.index)
469 result['data_MC_ratio'] = table['cal factor']
470 result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))
471 result['mcPDG'] = result['PDG']
472 result['threshold'] = table['sig prob cut']
473 # Assign the total error to the stat uncertainty and set syst. one to 0
474 result['data_MC_uncertainty_stat_dn'] = table['error']
475 result['data_MC_uncertainty_stat_up'] = table['error']
476 result['data_MC_uncertainty_sys_dn'] = 0
477 result['data_MC_uncertainty_sys_up'] = 0
478 result[_fei_mode_col] = table['mode']
479 else:
480 result = table
481 result = result.query(f'threshold == {threshold}')
482 if len(result) == 0:
483 raise ValueError(f'No weights found for threshold {threshold}!')
484 return result
485
486 def add_fei_particle(self, prefix: str,
487 table: pd.DataFrame,
488 threshold: float,
489 cov: np.ndarray = None,
490 variable_aliases: dict = None,
491 ) -> None:
492 """
493 Adds weight variations according to the total uncertainty for easier error propagation.
494
495 Args:
496 prefix (str): Prefix for the new columns.
497 table (pandas.DataFrame): Dataframe containing the efficiency weights.
498 threshold (float): Threshold for the efficiency weights.
499 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
500 variable_aliases (dict): Dictionary containing variable aliases.
501 """
502 # Empty prefix means no prefix
503 if prefix is None:
504 prefix = ''
505 if prefix and not prefix.endswith('_'):
506 prefix += '_'
507 if self.get_particle(prefix):
508 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
509 if variable_aliases is None:
510 variable_aliases = {}
511 if table is None or len(table) == 0:
512 raise ValueError('No weights provided!')
513 converted_table = self.convert_fei_table(table, threshold)
514 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
515 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
516 particle = ReweighterParticle(prefix,
517 type='FEI',
518 merged_table=converted_table,
519 pdg_binning=pdg_binning,
520 variable_aliases=variable_aliases,
521 weight_name=self.weight_name,
522 cov=cov)
523 self.particles += [particle]
524
525 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
526 """
527 Adds weight columns according to the FEI calibration tables
528 """
529 rest_str = 'rest'
530 particle.merged_table[_fei_mode_col]
531 # Apply a weight value from the weight table to the ntuple, based on the binning
532 binning_df = pd.DataFrame(index=ntuple_df.index)
533 # Take absolute value of mcPDG for binning because we have charge already
534 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
535 # Copy the mode ID from the ntuple
536 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
537 # Default value in case if reco PDG is not a B-meson PDG
538 binning_df[_fei_mode_col] = np.nan
539 plot_values = {}
540 for reco_pdg, mc_pdg in particle.pdg_binning:
541 plot_values[(reco_pdg, mc_pdg)] = {}
542 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
543 f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
544 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
545 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
546 if self.evaluate_plots:
547 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
548 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
549 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
550
551 # merge the weight table with the ntuple on binning columns
552 weight_cols = _weight_cols
553 if particle.column_names:
554 weight_cols = particle.column_names
555 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
556 on=['PDG', _fei_mode_col], how='left')
557 binning_df.index = ntuple_df.index
558 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
559 particle.plot_values = plot_values
560 for col in weight_cols:
561 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
562
563 def reweight(self,
564 df: pd.DataFrame,
565 generate_variations: bool = True):
566 """
567 Reweights the dataframe according to the weight tables.
568
569 Args:
570 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
571 generate_variations (bool): When true generate weight variations.
572 """
573 for particle in self.particles:
574 if particle.type not in _correction_types:
575 raise ValueError(f'Particle type {particle.type} not supported!')
576 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
577 if generate_variations:
578 particle.generate_variations(n_variations=self.n_variations)
579 if particle.type == 'PID':
580 self.add_pid_weight_columns(df, particle)
581 elif particle.type == 'FEI':
582 self.add_fei_weight_columns(df, particle)
583 return df
584
585 def print_coverage(self):
586 """
587 Prints the coverage of each particle.
588 """
589 print('Coverage:')
590 for particle in self.particles:
591 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
592
593 def plot_coverage(self):
594 """
595 Plots the coverage of each particle.
596 """
597 for particle in self.particles:
598 particle.plot_coverage()
599
600
601def add_weights_to_dataframe(prefix: str,
602 df: pd.DataFrame,
603 systematic: str,
604 custom_tables: dict = None,
605 custom_thresholds: dict = None,
606 **kw_args) -> pd.DataFrame:
607 """
608 Helper method that adds weights to a dataframe.
609
610 Args:
611 prefix (str): Prefix for the new columns.
612 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
613 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
614 MC_production (str): Name of the MC production.
615 custom_tables (dict): Dictionary containing the custom efficiency weights.
616 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
617 n_variations (int): Number of variations to generate.
618 generate_variations (bool): When true generate weight variations.
619 weight_name (str): Name of the weight column.
620 show_plots (bool): When true show the coverage plots.
621 variable_aliases (dict): Dictionary containing variable aliases.
622 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
623 fillna (int): Value to fill NaN values with.
624 sys_seed (int): Seed for the systematic variations.
625 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
626 **kw_args: Additional arguments for the Reweighter class.
627 """
628 generate_variations = kw_args.get('generate_variations', True)
629 n_variations = kw_args.get('n_variations', 100)
630 weight_name = kw_args.get('weight_name', "Weight")
631 fillna = kw_args.get('fillna', 1.0)
632 # Catch performance warnings from pandas
633 with warnings.catch_warnings():
634 warnings.simplefilter("ignore", category=PerformanceWarning)
635 reweighter = Reweighter(n_variations=n_variations,
636 weight_name=weight_name,
637 fillna=fillna)
638 variable_aliases = kw_args.get('variable_aliases')
639 if systematic.lower() == 'custom_fei':
640 cov_matrix = kw_args.get('cov_matrix')
641 reweighter.add_fei_particle(prefix=prefix,
642 table=custom_tables,
643 threshold=custom_thresholds,
644 variable_aliases=variable_aliases,
645 cov=cov_matrix
646 )
647 elif systematic.lower() == 'custom_pid':
648 sys_seed = kw_args.get('sys_seed')
649 syscorr = kw_args.get('syscorr')
650 if syscorr is None:
651 syscorr = True
652 reweighter.add_pid_particle(prefix=prefix,
653 weights_dict=custom_tables,
654 pdg_pid_variable_dict=custom_thresholds,
655 variable_aliases=variable_aliases,
656 sys_seed=sys_seed,
657 syscorr=syscorr
658 )
659 else:
660 raise ValueError(f'Systematic {systematic} is not supported!')
661
662 result = reweighter.reweight(df, generate_variations=generate_variations)
663 if kw_args.get('show_plots'):
664 reweighter.print_coverage()
665 reweighter.plot_coverage()
666 return result
str weight_name
Weight column name that will be added to the ntuple.
Definition: sysvar.py:56
str type
Type of the particle (PID or FEI)
Definition: sysvar.py:44
pd merged_table
Merged table of the weights.
Definition: sysvar.py:47
dict pdg_binning
Kinematic binning of the weight table per particle.
Definition: sysvar.py:50
np cov
Covariance matrix corresponds to the total uncertainty.
Definition: sysvar.py:65
str get_varname(self, str varname)
Definition: sysvar.py:76
str prefix
Prefix of the particle in the ntuple.
Definition: sysvar.py:41
int sys_seed
Random seed for systematics.
Definition: sysvar.py:62
list get_pdg_variables(self)
Definition: sysvar.py:94
type
Add the mcPDG code requirement for PID particle.
Definition: sysvar.py:100
list get_binning_variables(self)
Definition: sysvar.py:87
dict plot_values
Values for the plots.
Definition: sysvar.py:74
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:128
bool syscorr
When true assume systematics are 100% correlated.
Definition: sysvar.py:68
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition: sysvar.py:107
def plot_coverage(self, fig=None, axs=None)
Definition: sysvar.py:170
list column_names
Internal list of the names of the weight columns.
Definition: sysvar.py:59
dict variable_aliases
Variable aliases of the weight table.
Definition: sysvar.py:53
column_names
Names of the varied weight columns.
Definition: sysvar.py:118
n_variations
Number of weight variations to generate.
Definition: sysvar.py:245
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition: sysvar.py:308
nbins
Number of bins for the plots.
Definition: sysvar.py:257
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition: sysvar.py:404
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition: sysvar.py:240
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:346
weight_name
Name of the weight column.
Definition: sysvar.py:251
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:525
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition: sysvar.py:291
fillna
Value to fill NaN values.
Definition: sysvar.py:259
weights_generated
Flag to indicate if the weights have been generated.
Definition: sysvar.py:253
dict get_fei_binning(self, weight_df)
Definition: sysvar.py:282
def reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition: sysvar.py:565
def convert_fei_table(self, pd.DataFrame table, float threshold)
Definition: sysvar.py:449
def print_coverage(self)
Definition: sysvar.py:585
dict get_binning(self, weight_df)
Definition: sysvar.py:267
list get_bin_columns(self, weight_df)
Definition: sysvar.py:261
correlations
Correlations between the particles.
Definition: sysvar.py:249
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition: sysvar.py:491
ReweighterParticle get_particle(self, str prefix)
Definition: sysvar.py:440
particles
List of particles.
Definition: sysvar.py:247
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition: sysvar.py:255
def plot_coverage(self)
Definition: sysvar.py:593