Belle II Software development
sysvar.py
1#!/usr/bin/env python3
2
3
10
11import pandas as pd
12import numpy as np
13from dataclasses import dataclass
14import matplotlib.pyplot as plt
15import pdg
16import warnings
17from pandas.errors import PerformanceWarning
18
19"""
20A module that adds corrections to analysis dataframe.
21It adds weight variations according to the total uncertainty for easier error propagation.
22"""
23
24_weight_cols = ['data_MC_ratio',
25 'data_MC_uncertainty_stat_dn',
26 'data_MC_uncertainty_stat_up',
27 'data_MC_uncertainty_sys_dn',
28 'data_MC_uncertainty_sys_up'
29 ]
30
31_correction_types = ['PID', 'FEI']
32_fei_mode_col = 'dec_mode'
33
34
35@dataclass
37 """
38 Class that stores the information of a particle.
39 """
40
42 prefix: str
43
44
45 type: str
46
47
48 merged_table: pd.DataFrame
49
50
52 pdg_binning: dict
53
54
56 variable_aliases: dict
57
58
60 weight_name: str
61
62
63 column_names: list = None
64
65
66 sys_seed: int = None
67
68
69 cov: np.ndarray = None
70
71
72 syscorr: bool = True
73
74
75 coverage: float = None
76
77
78 plot_values: dict = None
79
80 def get_varname(self, varname: str) -> str:
81 """
82 Returns the variable name with the prefix and use alias if defined.
83 """
84 name = varname
85 if self.variable_aliases and varname in self.variable_aliases:
86 name = self.variable_aliases[varname]
87 if name.startswith(self.prefix):
88 return name
89 return f'{self.prefix}{name}'
90
91 def get_binning_variables(self) -> list:
92 """
93 Returns the list of variables that are used for the binning
94 """
95 variables = set(sum([list(d.keys()) for d in self.pdg_binning.values()], []))
96 return [f'{self.get_varname(var)}' for var in variables]
97
98 def get_pdg_variables(self) -> list:
99 """
100 Returns the list of variables that are used for the PDG codes
101 """
102 pdg_vars = ['PDG']
103
104 if self.type == "PID":
105 pdg_vars += ['mcPDG']
106 return [f'{self.get_varname(var)}' for var in pdg_vars]
107
109 n_variations: int,
110 rho_sys: np.ndarray = None,
111 rho_stat: np.ndarray = None) -> None:
112 """
113 Generates variations of weights according to the uncertainties
114 """
115 self.merged_table['stat_error'] = self.merged_table[["data_MC_uncertainty_stat_up",
116 "data_MC_uncertainty_stat_dn"]].max(axis=1)
117 self.merged_table['sys_error'] = self.merged_table[["data_MC_uncertainty_sys_up",
118 "data_MC_uncertainty_sys_dn"]].max(axis=1)
119 self.merged_table["error"] = np.sqrt(self.merged_table["stat_error"] ** 2 + self.merged_table["sys_error"] ** 2)
120 means = self.merged_table["data_MC_ratio"].values
121
122 self.column_names = [f"{self.weight_name}_{i}" for i in range(n_variations)]
123 cov = self.get_covariance(n_variations, rho_sys, rho_stat)
124 weights = cov + means
125 self.merged_table[self.weight_name] = self.merged_table["data_MC_ratio"]
126 self.merged_table[self.column_names] = weights.T
127 self.column_names.insert(0, self.weight_name)
128
130 n_variations: int,
131 rho_sys: np.ndarray = None,
132 rho_stat: np.ndarray = None) -> np.ndarray:
133 """
134 Returns the covariance matrix of the weights
135 """
136 len_means = len(self.merged_table["data_MC_ratio"])
137 zeros = np.zeros(len_means)
138 if self.cov is None:
139 if rho_sys is None:
140 if self.syscorr:
141 rho_sys = np.ones((len_means, len_means))
142 else:
143 rho_sys = np.identity(len_means)
144 if rho_stat is None:
145 rho_stat = np.identity(len_means)
146 sys_cov = np.matmul(
147 np.matmul(np.diag(self.merged_table['sys_error']), rho_sys), np.diag(self.merged_table['sys_error'])
148 )
149 stat_cov = np.matmul(
150 np.matmul(np.diag(self.merged_table['stat_error']), rho_stat), np.diag(self.merged_table['stat_error'])
151 )
152 np.random.seed(self.sys_seed)
153 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
154 np.random.seed(None)
155 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
156 return sys + stat
157 errors = np.random.multivariate_normal(zeros, self.cov, n_variations)
158 return errors
159
160 def __str__(self) -> str:
161 """
162 Converts the object to a string.
163 """
164 separator = '------------------'
165 title = 'ReweighterParticle'
166 prefix_str = f'Type: {self.type} Prefix: {self.prefix}'
167 columns = _weight_cols
168 merged_table_str = f'Merged table:\n{self.merged_table[columns].describe()}'
169 pdg_binning_str = 'PDG binning:\n'
170 for pdgs in self.pdg_binning:
171 pdg_binning_str += f'{pdgs}: {self.pdg_binning[pdgs]}\n'
172 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
173
174 def plot_coverage(self, fig=None, axs=None):
175 """
176 Plots the coverage of the ntuple.
177 """
178 if self.plot_values is None:
179 return
180 vars = set(sum([list(d.keys()) for d in self.plot_values.values()], []))
181 if fig is None:
182 fig, axs = plt.subplots(len(self.plot_values), len(vars), figsize=(5*len(vars), 3*len(self.plot_values)), dpi=120)
183 axs = np.array(axs)
184 if len(axs.shape) < 1:
185 axs = axs.reshape(len(self.plot_values), len(vars))
186 bin_plt = {'linewidth': 3, 'linestyle': '--', 'color': '0.5'}
187 fig.suptitle(f'{self.type} particle {self.prefix.strip("_")}')
188 for (reco_pdg, mc_pdg), ax_row in zip(self.plot_values, axs):
189 for var, ax in zip(self.plot_values[(reco_pdg, mc_pdg)], ax_row):
190 ymin = 0
191 ymax = self.plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
192 # Plot binning
193 if self.type == 'PID':
194 ax.vlines(self.pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
195 label='Binning',
196 alpha=0.8,
197 **bin_plt)
198 elif self.type == 'FEI':
199 values = np.array([int(val[4:]) for val in self.pdg_binning[(reco_pdg, mc_pdg)][var]])
200 ax.bar(values+0.5,
201 np.ones(len(values))*ymax,
202 width=1,
203 alpha=0.5,
204 label='Binning',
205 **bin_plt)
206 rest = np.setdiff1d(self.plot_values[(reco_pdg, mc_pdg)][var][0], values)
207 ax.bar(rest+0.5,
208 np.ones(len(rest))*ymax,
209 width=1,
210 alpha=0.2,
211 label='Rest category',
212 **bin_plt)
213 # Plot values
214 widths = (self.plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
215 centers = self.plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
216 ax.bar(centers,
217 self.plot_values[(reco_pdg, mc_pdg)][var][1],
218 width=widths,
219 label='Values',
220 alpha=0.8)
221 ax.set_title(f'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
222 ax.set_xlabel(var)
223 axs[-1][-1].legend()
224 fig.tight_layout()
225 return fig, axs
226
227
229 """
230 Class that reweights the dataframe.
231
232 Args:
233 n_variations (int): Number of weight variations to generate.
234 weight_name (str): Name of the weight column.
235 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
236 nbins (int): Number of bins for the plots.
237 """
238
239 def __init__(self,
240 n_variations: int = 100,
241 weight_name: str = "Weight",
242 evaluate_plots: bool = True,
243 nbins: int = 50,
244 fillna: float = 1.0) -> None:
245 """
246 Initializes the Reweighter class.
247 """
248
249 self.n_variations = n_variations
250
251 self.particles = []
252
253 self.correlations = []
254
255 self.weight_name = weight_name
256
257 self.weights_generated = False
258
259 self.evaluate_plots = evaluate_plots
260
261 self.nbins = nbins
262
263 self.fillna = fillna
264
265 def get_bin_columns(self, weight_df) -> list:
266 """
267 Returns the kinematic bin columns of the dataframe.
268 """
269 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
270
271 def get_binning(self, weight_df) -> dict:
272 """
273 Returns the kinematic binning of the dataframe.
274 """
275 columns = self.get_bin_columns(weight_df)
276 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
277 bin_dict = {}
278 for var_name in var_names:
279 bin_dict[var_name] = []
280 for col in columns:
281 if col.startswith(var_name):
282 bin_dict[var_name] += list(weight_df[col].values)
283 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
284 return bin_dict
285
286 def get_fei_binning(self, weight_df) -> dict:
287 """
288 Returns the irregular binning of the dataframe.
289 """
290 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
291 _fei_mode_col].value_counts().index.to_list()}
292
294 ntuple_df: pd.DataFrame,
295 particle: ReweighterParticle) -> None:
296 """
297 Checks if the variables are in the ntuple and returns them.
298
299 Args:
300 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
301 particle (ReweighterParticle): Particle object containing the necessary variables.
302 """
303 ntuple_variables = particle.get_binning_variables()
304 ntuple_variables += particle.get_pdg_variables()
305 for var in ntuple_variables:
306 if var not in ntuple_df.columns:
307 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
308 return ntuple_variables
309
311 weights_dict: dict,
312 pdg_pid_variable_dict: dict) -> pd.DataFrame:
313 """
314 Merges the efficiency and fake rate weight tables.
315
316 Args:
317 weights_dict (dict): Dictionary containing the weight tables.
318 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
319 """
320 weight_dfs = []
321 for reco_pdg, mc_pdg in weights_dict:
322 if reco_pdg not in pdg_pid_variable_dict:
323 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
324 weight_df = weights_dict[(reco_pdg, mc_pdg)]
325 weight_df['mcPDG'] = mc_pdg
326 weight_df['PDG'] = reco_pdg
327 # Check if these are legacy tables:
328 if 'charge' in weight_df.columns:
329 charge_dict = {'+': [0, 2], '-': [-2, 0]}
330 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
331 weight_df = weight_df.drop(columns=['charge'])
332 # If iso_score is a single value, drop the min and max columns
333 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
334 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
335 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
336 threshold = pdg_pid_variable_dict[reco_pdg][1]
337 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
338 if len(selected_weights) == 0:
339 available_variables = weight_df['variable'].unique()
340 available_thresholds = weight_df['threshold'].unique()
341 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
342 f' variable {pid_variable_name} and threshold {threshold}!\n'
343 f' Available variables: {available_variables}\n'
344 f' Available thresholds: {available_thresholds}')
345 weight_dfs.append(selected_weights)
346 return pd.concat(weight_dfs, ignore_index=True)
347
349 ntuple_df: pd.DataFrame,
350 particle: ReweighterParticle) -> None:
351 """
352 Adds a weight and uncertainty columns to the dataframe.
353
354 Args:
355 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
356 particle (ReweighterParticle): Particle object.
357 """
358 # Apply a weight value from the weight table to the ntuple, based on the binning
359 binning_df = pd.DataFrame(index=ntuple_df.index)
360 # Take absolute value of mcPDG for binning because we have charge already
361 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
362 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
363 plot_values = {}
364 for reco_pdg, mc_pdg in particle.pdg_binning:
365 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
366 if ntuple_df.query(ntuple_cut).empty:
367 continue
368 plot_values[(reco_pdg, mc_pdg)] = {}
369 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
370 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
371 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
373 ntuple_cut)[f'{particle.get_varname(var)}'],
374 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
375 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
376 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
377 var].str[0]
378 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
379 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
380 var].str[1]
381 binning_df.drop(var, axis=1, inplace=True)
382 if self.evaluate_plots:
383 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
384 if len(values.unique()) < 2:
385 print(f'Skip {var} for plotting!')
386 continue
387 x_range = np.linspace(values.min(), values.max(), self.nbins)
388 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
389 # merge the weight table with the ntuple on binning columns
390 weight_cols = _weight_cols
391 if particle.column_names:
392 weight_cols = particle.column_names
393 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
394 on=binning_df.columns.tolist(), how='left')
395 binning_df.index = ntuple_df.index
396 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
397 particle.plot_values = plot_values
398 for col in weight_cols:
399 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
400 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
401
403 prefix: str,
404 weights_dict: dict,
405 pdg_pid_variable_dict: dict,
406 variable_aliases: dict = None,
407 sys_seed: int = None,
408 syscorr: bool = True) -> None:
409 """
410 Adds weight variations according to the total uncertainty for easier error propagation.
411
412 Args:
413 prefix (str): Prefix for the new columns.
414 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
415 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
416 variable_aliases (dict): Dictionary containing variable aliases.
417 sys_seed (int): Seed for the systematic variations.
418 syscorr (bool): When true assume systematics are 100% correlated defaults to
419 true. Note this is overridden by provision of a None value rho_sys
420 """
421 # Empty prefix means no prefix
422 if prefix is None:
423 prefix = ''
424 # Add underscore if not present
425 if prefix and not prefix.endswith('_'):
426 prefix += '_'
427 if self.get_particle(prefix):
428 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
429 if variable_aliases is None:
430 variable_aliases = {}
431 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
432 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
433 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
434 particle = ReweighterParticle(prefix,
435 type='PID',
436 merged_table=merged_weight_df,
437 pdg_binning=pdg_binning,
438 variable_aliases=variable_aliases,
439 weight_name=self.weight_name,
440 sys_seed=sys_seed,
441 syscorr=syscorr)
442 self.particles += [particle]
443
444 def get_particle(self, prefix: str) -> ReweighterParticle:
445 """
446 Get a particle by its prefix.
447 """
448 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
449 if len(cands) == 0:
450 return None
451 return cands[0]
452
453 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
454 """
455 Checks if the tables are provided in a legacy format and converts them to the standard format.
456 """
457 result = None
458 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
459 if 'cal' in table.columns:
460 result = pd.DataFrame(index=table.index)
461 result['data_MC_ratio'] = table['cal']
462 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
463 # Assume these are only efficiency tables
464 result['mcPDG'] = result['PDG']
465 result['threshold'] = table['sig_prob_threshold']
466 result[_fei_mode_col] = table[_fei_mode_col]
467 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
468 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
469 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
470 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
471 elif 'cal factor' in table.columns:
472 result = pd.DataFrame(index=table.index)
473 result['data_MC_ratio'] = table['cal factor']
474 result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))
475 result['mcPDG'] = result['PDG']
476 result['threshold'] = table['sig prob cut']
477 # Assign the total error to the stat uncertainty and set syst. one to 0
478 result['data_MC_uncertainty_stat_dn'] = table['error']
479 result['data_MC_uncertainty_stat_up'] = table['error']
480 result['data_MC_uncertainty_sys_dn'] = 0
481 result['data_MC_uncertainty_sys_up'] = 0
482 result[_fei_mode_col] = table['mode']
483 else:
484 result = table
485 result = result.query(f'threshold == {threshold}')
486 if len(result) == 0:
487 raise ValueError(f'No weights found for threshold {threshold}!')
488 return result
489
490 def add_fei_particle(self, prefix: str,
491 table: pd.DataFrame,
492 threshold: float,
493 cov: np.ndarray = None,
494 variable_aliases: dict = None,
495 ) -> None:
496 """
497 Adds weight variations according to the total uncertainty for easier error propagation.
498
499 Args:
500 prefix (str): Prefix for the new columns.
501 table (pandas.DataFrame): Dataframe containing the efficiency weights.
502 threshold (float): Threshold for the efficiency weights.
503 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
504 variable_aliases (dict): Dictionary containing variable aliases.
505 """
506 # Empty prefix means no prefix
507 if prefix is None:
508 prefix = ''
509 if prefix and not prefix.endswith('_'):
510 prefix += '_'
511 if self.get_particle(prefix):
512 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
513 if variable_aliases is None:
514 variable_aliases = {}
515 if table is None or len(table) == 0:
516 raise ValueError('No weights provided!')
517 converted_table = self.convert_fei_table(table, threshold)
518 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
519 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
520 particle = ReweighterParticle(prefix,
521 type='FEI',
522 merged_table=converted_table,
523 pdg_binning=pdg_binning,
524 variable_aliases=variable_aliases,
525 weight_name=self.weight_name,
526 cov=cov)
527 self.particles += [particle]
528
529 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
530 """
531 Adds weight columns according to the FEI calibration tables
532 """
533 rest_str = 'rest'
534 particle.merged_table[_fei_mode_col]
535 # Apply a weight value from the weight table to the ntuple, based on the binning
536 binning_df = pd.DataFrame(index=ntuple_df.index)
537 # Take absolute value of mcPDG for binning because we have charge already
538 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
539 # Copy the mode ID from the ntuple
540 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
541 # Default value in case if reco PDG is not a B-meson PDG
542 binning_df[_fei_mode_col] = np.nan
543 plot_values = {}
544 for reco_pdg, mc_pdg in particle.pdg_binning:
545 plot_values[(reco_pdg, mc_pdg)] = {}
546 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
547 f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
548 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
549 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
550 if self.evaluate_plots:
551 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
552 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
553 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
554
555 # merge the weight table with the ntuple on binning columns
556 weight_cols = _weight_cols
557 if particle.column_names:
558 weight_cols = particle.column_names
559 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
560 on=['PDG', _fei_mode_col], how='left')
561 binning_df.index = ntuple_df.index
562 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
563 particle.plot_values = plot_values
564 for col in weight_cols:
565 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
566
567 def reweight(self,
568 df: pd.DataFrame,
569 generate_variations: bool = True):
570 """
571 Reweights the dataframe according to the weight tables.
572
573 Args:
574 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
575 generate_variations (bool): When true generate weight variations.
576 """
577 for particle in self.particles:
578 if particle.type not in _correction_types:
579 raise ValueError(f'Particle type {particle.type} not supported!')
580 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
581 if generate_variations:
582 particle.generate_variations(n_variations=self.n_variations)
583 if particle.type == 'PID':
584 self.add_pid_weight_columns(df, particle)
585 elif particle.type == 'FEI':
586 self.add_fei_weight_columns(df, particle)
587 return df
588
589 def print_coverage(self):
590 """
591 Prints the coverage of each particle.
592 """
593 print('Coverage:')
594 for particle in self.particles:
595 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
596
597 def plot_coverage(self):
598 """
599 Plots the coverage of each particle.
600 """
601 for particle in self.particles:
602 particle.plot_coverage()
603
604
605def add_weights_to_dataframe(prefix: str,
606 df: pd.DataFrame,
607 systematic: str,
608 custom_tables: dict = None,
609 custom_thresholds: dict = None,
610 **kw_args) -> pd.DataFrame:
611 """
612 Helper method that adds weights to a dataframe.
613
614 Args:
615 prefix (str): Prefix for the new columns.
616 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
617 systematic (str): Type of the systematic corrections, options: "custom_PID" and "custom_FEI".
618 MC_production (str): Name of the MC production.
619 custom_tables (dict): Dictionary containing the custom efficiency weights.
620 custom_thresholds (dict): Dictionary containing the custom thresholds for the custom efficiency weights.
621 n_variations (int): Number of variations to generate.
622 generate_variations (bool): When true generate weight variations.
623 weight_name (str): Name of the weight column.
624 show_plots (bool): When true show the coverage plots.
625 variable_aliases (dict): Dictionary containing variable aliases.
626 cov_matrix (numpy.ndarray): Covariance matrix for the custom efficiency weights.
627 fillna (int): Value to fill NaN values with.
628 sys_seed (int): Seed for the systematic variations.
629 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
630 **kw_args: Additional arguments for the Reweighter class.
631 """
632 generate_variations = kw_args.get('generate_variations', True)
633 n_variations = kw_args.get('n_variations', 100)
634 weight_name = kw_args.get('weight_name', "Weight")
635 fillna = kw_args.get('fillna', 1.0)
636 # Catch performance warnings from pandas
637 with warnings.catch_warnings():
638 warnings.simplefilter("ignore", category=PerformanceWarning)
639 reweighter = Reweighter(n_variations=n_variations,
640 weight_name=weight_name,
641 fillna=fillna)
642 variable_aliases = kw_args.get('variable_aliases')
643 if systematic.lower() == 'custom_fei':
644 cov_matrix = kw_args.get('cov_matrix')
645 reweighter.add_fei_particle(prefix=prefix,
646 table=custom_tables,
647 threshold=custom_thresholds,
648 variable_aliases=variable_aliases,
649 cov=cov_matrix
650 )
651 elif systematic.lower() == 'custom_pid':
652 sys_seed = kw_args.get('sys_seed')
653 syscorr = kw_args.get('syscorr')
654 if syscorr is None:
655 syscorr = True
656 reweighter.add_pid_particle(prefix=prefix,
657 weights_dict=custom_tables,
658 pdg_pid_variable_dict=custom_thresholds,
659 variable_aliases=variable_aliases,
660 sys_seed=sys_seed,
661 syscorr=syscorr
662 )
663 else:
664 raise ValueError(f'Systematic {systematic} is not supported!')
665
666 result = reweighter.reweight(df, generate_variations=generate_variations)
667 if kw_args.get('show_plots'):
668 reweighter.print_coverage()
669 reweighter.plot_coverage()
670 return result
str type
Add the mcPDG code requirement for PID particle.
Definition sysvar.py:104
variable_aliases
Variable aliases of the weight table.
Definition sysvar.py:85
pd merged_table
Type of the particle (PID or FEI)
Definition sysvar.py:48
int sys_seed
Random seed for systematics.
Definition sysvar.py:66
np cov
Covariance matrix corresponds to the total uncertainty.
Definition sysvar.py:69
str get_varname(self, str varname)
Definition sysvar.py:80
weight_name
Weight column name that will be added to the ntuple.
Definition sysvar.py:127
plot_coverage(self, fig=None, axs=None)
Definition sysvar.py:174
list get_pdg_variables(self)
Definition sysvar.py:98
list get_binning_variables(self)
Definition sysvar.py:91
pdg_binning
Kinematic binning of the weight table per particle.
Definition sysvar.py:170
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition sysvar.py:132
bool syscorr
When true assume systematics are 100% correlated.
Definition sysvar.py:72
prefix
Prefix of the particle in the ntuple.
Definition sysvar.py:87
dict plot_values
Values for the plots.
Definition sysvar.py:78
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
Definition sysvar.py:111
list column_names
Internal list of the names of the weight columns.
Definition sysvar.py:63
n_variations
Number of weight variations to generate.
Definition sysvar.py:249
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
Definition sysvar.py:312
nbins
Number of bins for the plots.
Definition sysvar.py:261
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
Definition sysvar.py:408
list particles
List of particles.
Definition sysvar.py:251
add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:529
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
Definition sysvar.py:244
print_coverage(self)
Definition sysvar.py:589
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:350
weight_name
Name of the weight column.
Definition sysvar.py:255
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
Definition sysvar.py:295
fillna
Value to fill NaN values.
Definition sysvar.py:263
dict get_fei_binning(self, weight_df)
Definition sysvar.py:286
convert_fei_table(self, pd.DataFrame table, float threshold)
Definition sysvar.py:453
bool weights_generated
Flag to indicate if the weights have been generated.
Definition sysvar.py:257
dict get_binning(self, weight_df)
Definition sysvar.py:271
list get_bin_columns(self, weight_df)
Definition sysvar.py:265
reweight(self, pd.DataFrame df, bool generate_variations=True)
Definition sysvar.py:569
plot_coverage(self)
Definition sysvar.py:597
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
Definition sysvar.py:495
ReweighterParticle get_particle(self, str prefix)
Definition sysvar.py:444
list correlations
Correlations between the particles.
Definition sysvar.py:253
evaluate_plots
Flag to indicate if the plots should be evaluated.
Definition sysvar.py:259