Belle II Software development
Reweighter Class Reference

Public Member Functions

None __init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
 
list get_bin_columns (self, weight_df)
 
dict get_binning (self, weight_df)
 
dict get_fei_binning (self, weight_df)
 
None get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
pd.DataFrame merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)
 
None add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
None add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
 
ReweighterParticle get_particle (self, str prefix)
 
def convert_fei_table (self, pd.DataFrame table, float threshold)
 
None add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
 
def add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
def reweight (self, pd.DataFrame df, bool generate_variations=True)
 
def print_coverage (self)
 
def plot_coverage (self)
 

Public Attributes

 n_variations
 Number of weight variations to generate.
 
 particles
 List of particles.
 
 correlations
 Correlations between the particles.
 
 weight_name
 Name of the weight column.
 
 weights_generated
 Flag to indicate if the weights have been generated.
 
 evaluate_plots
 Flag to indicate if the plots should be evaluated.
 
 nbins
 Number of bins for the plots.
 
 fillna
 Value to fill NaN values.
 

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value to fill NaN values in the weight columns.

Definition at line 224 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__ (   self,
int   n_variations = 100,
str   weight_name = "Weight",
bool   evaluate_plots = True,
int   nbins = 50,
float   fillna = 1.0 
)
Initializes the Reweighter class.

Definition at line 235 of file sysvar.py.

240 fillna: float = 1.0) -> None:
241 """
242 Initializes the Reweighter class.
243 """
244
245 self.n_variations = n_variations
246
247 self.particles = []
248
249 self.correlations = []
250
251 self.weight_name = weight_name
252
253 self.weights_generated = False
254
255 self.evaluate_plots = evaluate_plots
256
257 self.nbins = nbins
258
259 self.fillna = fillna
260

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle (   self,
str  prefix,
pd.DataFrame  table,
float  threshold,
np.ndarray   cov = None,
dict   variable_aliases = None 
)
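Registers an FEI-calibrated particle: converts the calibration table to the standard format for the given threshold and stores a particle of type 'FEI', so that weight variations according to the total uncertainty can be added for easier error propagation. Raises ValueError if a particle with the same prefix already exists or if no weights are provided.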

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 486 of file sysvar.py.

491 ) -> None:
492 """
493 Adds weight variations according to the total uncertainty for easier error propagation.
494
495 Args:
496 prefix (str): Prefix for the new columns.
497 table (pandas.DataFrame): Dataframe containing the efficiency weights.
498 threshold (float): Threshold for the efficiency weights.
499 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
500 variable_aliases (dict): Dictionary containing variable aliases.
501 """
502 # Empty prefix means no prefix
503 if prefix is None:
504 prefix = ''
505 if prefix and not prefix.endswith('_'):
506 prefix += '_'
507 if self.get_particle(prefix):
508 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
509 if variable_aliases is None:
510 variable_aliases = {}
511 if table is None or len(table) == 0:
512 raise ValueError('No weights provided!')
513 converted_table = self.convert_fei_table(table, threshold)
514 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
515 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
516 particle = ReweighterParticle(prefix,
517 type='FEI',
518 merged_table=converted_table,
519 pdg_binning=pdg_binning,
520 variable_aliases=variable_aliases,
521 weight_name=self.weight_name,
522 cov=cov)
523 self.particles += [particle]
524

◆ add_fei_weight_columns()

def add_fei_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight columns to the ntuple dataframe according to the FEI calibration tables.

Definition at line 525 of file sysvar.py.

525 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
526 """
527 Adds weight columns according to the FEI calibration tables
528 """
529 rest_str = 'rest'
530 particle.merged_table[_fei_mode_col]
531 # Apply a weight value from the weight table to the ntuple, based on the binning
532 binning_df = pd.DataFrame(index=ntuple_df.index)
533 # Take absolute value of PDG for binning because we have charge already
534 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
535 # Copy the mode ID from the ntuple
536 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
537 # Default value in case if reco PDG is not a B-meson PDG
538 binning_df[_fei_mode_col] = np.nan
539 plot_values = {}
540 for reco_pdg, mc_pdg in particle.pdg_binning:
541 plot_values[(reco_pdg, mc_pdg)] = {}
542 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
543 f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
544 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
545 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
546 if self.evaluate_plots:
547 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
548 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
549 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
550
551 # merge the weight table with the ntuple on binning columns
552 weight_cols = _weight_cols
553 if particle.column_names:
554 weight_cols = particle.column_names
555 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
556 on=['PDG', _fei_mode_col], how='left')
557 binning_df.index = ntuple_df.index
558 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
559 particle.plot_values = plot_values
560 for col in weight_cols:
561 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
562

◆ add_pid_particle()

None add_pid_particle (   self,
str  prefix,
dict  weights_dict,
dict  pdg_pid_variable_dict,
dict   variable_aliases = None,
int   sys_seed = None,
bool   syscorr = True 
)
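Registers a PID-corrected particle: merges the provided weight tables and stores a particle of type 'PID', so that weight variations according to the total uncertainty can be added for easier error propagation. Raises ValueError if a particle with the same prefix already exists.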

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary mapping (reconstructed PDG, MC PDG) pairs to the dataframes containing the efficiency weights.
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume systematics are 100% correlated; defaults to
        True. Note: this is overridden by provision of a None value rho_sys.

Definition at line 398 of file sysvar.py.

404 syscorr: bool = True) -> None:
405 """
406 Adds weight variations according to the total uncertainty for easier error propagation.
407
408 Args:
409 prefix (str): Prefix for the new columns.
410 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
411 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
412 variable_aliases (dict): Dictionary containing variable aliases.
413 sys_seed (int): Seed for the systematic variations.
414 syscorr (bool): When true assume systematics are 100% correlated defaults to
415 true. Note this is overridden by provision of a None value rho_sys
416 """
417 # Empty prefix means no prefix
418 if prefix is None:
419 prefix = ''
420 # Add underscore if not present
421 if prefix and not prefix.endswith('_'):
422 prefix += '_'
423 if self.get_particle(prefix):
424 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
425 if variable_aliases is None:
426 variable_aliases = {}
427 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
428 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
429 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
430 particle = ReweighterParticle(prefix,
431 type='PID',
432 merged_table=merged_weight_df,
433 pdg_binning=pdg_binning,
434 variable_aliases=variable_aliases,
435 weight_name=self.weight_name,
436 sys_seed=sys_seed,
437 syscorr=syscorr)
438 self.particles += [particle]
439

◆ add_pid_weight_columns()

None add_pid_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 344 of file sysvar.py.

346 particle: ReweighterParticle) -> None:
347 """
348 Adds a weight and uncertainty columns to the dataframe.
349
350 Args:
351 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
352 particle (ReweighterParticle): Particle object.
353 """
354 # Apply a weight value from the weight table to the ntuple, based on the binning
355 binning_df = pd.DataFrame(index=ntuple_df.index)
356 # Take absolute value of mcPDG for binning because we have charge already
357 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
358 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
359 plot_values = {}
360 for reco_pdg, mc_pdg in particle.pdg_binning:
361 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
362 if ntuple_df.query(ntuple_cut).empty:
363 continue
364 plot_values[(reco_pdg, mc_pdg)] = {}
365 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
366 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
367 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
368 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
369 ntuple_cut)[f'{particle.get_varname(var)}'],
370 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
371 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
372 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 var].str[0]
374 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
375 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
376 var].str[1]
377 binning_df.drop(var, axis=1, inplace=True)
378 if self.evaluate_plots:
379 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
380 if len(values.unique()) < 2:
381 print(f'Skip {var} for plotting!')
382 continue
383 x_range = np.linspace(values.min(), values.max(), self.nbins)
384 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
385 # merge the weight table with the ntuple on binning columns
386 weight_cols = _weight_cols
387 if particle.column_names:
388 weight_cols = particle.column_names
389 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
390 on=binning_df.columns.tolist(), how='left')
391 binning_df.index = ntuple_df.index
392 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
393 particle.plot_values = plot_values
394 for col in weight_cols:
395 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
396 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
397

◆ convert_fei_table()

def convert_fei_table (   self,
pd.DataFrame  table,
float  threshold 
)
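Checks if the tables are provided in a legacy format and converts them to the standard format, then keeps only the rows matching the given threshold. Raises ValueError if no weights are found for that threshold.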

Definition at line 449 of file sysvar.py.

449 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
450 """
451 Checks if the tables are provided in a legacy format and converts them to the standard format.
452 """
453 result = None
454 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
455 if 'cal' in table.columns:
456 result = pd.DataFrame(index=table.index)
457 result['data_MC_ratio'] = table['cal']
458 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
459 # Assume these are only efficiency tables
460 result['mcPDG'] = result['PDG']
461 result['threshold'] = table['sig_prob_threshold']
462 result[_fei_mode_col] = table[_fei_mode_col]
463 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
464 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
465 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
466 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
467 elif 'cal factor' in table.columns:
468 result = pd.DataFrame(index=table.index)
469 result['data_MC_ratio'] = table['cal factor']
470 result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))
471 result['mcPDG'] = result['PDG']
472 result['threshold'] = table['sig prob cut']
473 # Assign the total error to the stat uncertainty and set syst. one to 0
474 result['data_MC_uncertainty_stat_dn'] = table['error']
475 result['data_MC_uncertainty_stat_up'] = table['error']
476 result['data_MC_uncertainty_sys_dn'] = 0
477 result['data_MC_uncertainty_sys_up'] = 0
478 result[_fei_mode_col] = table['mode']
479 else:
480 result = table
481 result = result.query(f'threshold == {threshold}')
482 if len(result) == 0:
483 raise ValueError(f'No weights found for threshold {threshold}!')
484 return result
485

◆ get_bin_columns()

list get_bin_columns (   self,
  weight_df 
)
Returns the kinematic bin columns of the dataframe.

Definition at line 261 of file sysvar.py.

261 def get_bin_columns(self, weight_df) -> list:
262 """
263 Returns the kinematic bin columns of the dataframe.
264 """
265 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
266

◆ get_binning()

dict get_binning (   self,
  weight_df 
)
Returns the kinematic binning of the dataframe as a dictionary mapping each binned variable name to a sorted array of its unique bin edges.

Definition at line 267 of file sysvar.py.

267 def get_binning(self, weight_df) -> dict:
268 """
269 Returns the kinematic binning of the dataframe.
270 """
271 columns = self.get_bin_columns(weight_df)
272 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
273 bin_dict = {}
274 for var_name in var_names:
275 bin_dict[var_name] = []
276 for col in columns:
277 if col.startswith(var_name):
278 bin_dict[var_name] += list(weight_df[col].values)
279 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
280 return bin_dict
281

◆ get_fei_binning()

dict get_fei_binning (   self,
  weight_df 
)
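Returns the irregular binning of the dataframe: a dictionary mapping the FEI mode column to the list of distinct mode labels that start with 'mode'.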

Definition at line 282 of file sysvar.py.

282 def get_fei_binning(self, weight_df) -> dict:
283 """
284 Returns the irregular binning of the dataframe.
285 """
286 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
287 _fei_mode_col].value_counts().index.to_list()}
288

◆ get_ntuple_variables()

None get_ntuple_variables (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
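Checks that the variables required by the particle (its binning and PDG variables) are in the ntuple and returns them as a list; raises ValueError if any of them is missing. Note: the signature is annotated as returning None, but the implementation returns the list of variable names.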

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 289 of file sysvar.py.

291 particle: ReweighterParticle) -> None:
292 """
293 Checks if the variables are in the ntuple and returns them.
294
295 Args:
296 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
297 particle (ReweighterParticle): Particle object containing the necessary variables.
298 """
299 ntuple_variables = particle.get_binning_variables()
300 ntuple_variables += particle.get_pdg_variables()
301 for var in ntuple_variables:
302 if var not in ntuple_df.columns:
303 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
304 return ntuple_variables
305

◆ get_particle()

ReweighterParticle get_particle (   self,
str  prefix 
)
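Get a particle by its prefix (leading and trailing underscores are ignored in the comparison). Returns None if no particle with that prefix has been added.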

Definition at line 440 of file sysvar.py.

440 def get_particle(self, prefix: str) -> ReweighterParticle:
441 """
442 Get a particle by its prefix.
443 """
444 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
445 if len(cands) == 0:
446 return None
447 return cands[0]
448

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables (   self,
dict  weights_dict,
dict  pdg_pid_variable_dict 
)
Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 306 of file sysvar.py.

308 pdg_pid_variable_dict: dict) -> pd.DataFrame:
309 """
310 Merges the efficiency and fake rate weight tables.
311
312 Args:
313 weights_dict (dict): Dictionary containing the weight tables.
314 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
315 """
316 weight_dfs = []
317 for reco_pdg, mc_pdg in weights_dict:
318 if reco_pdg not in pdg_pid_variable_dict:
319 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
320 weight_df = weights_dict[(reco_pdg, mc_pdg)]
321 weight_df['mcPDG'] = mc_pdg
322 weight_df['PDG'] = reco_pdg
323 # Check if these are legacy tables:
324 if 'charge' in weight_df.columns:
325 charge_dict = {'+': [0, 2], '-': [-2, 0]}
326 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
327 weight_df = weight_df.drop(columns=['charge'])
328 # If iso_score is a single value, drop the min and max columns
329 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
330 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
331 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
332 threshold = pdg_pid_variable_dict[reco_pdg][1]
333 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
334 if len(selected_weights) == 0:
335 available_variables = weight_df['variable'].unique()
336 available_thresholds = weight_df['threshold'].unique()
337 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
338 f' variable {pid_variable_name} and threshold {threshold}!\n'
339 f' Available variables: {available_variables}\n'
340 f' Available thresholds: {available_thresholds}')
341 weight_dfs.append(selected_weights)
342 return pd.concat(weight_dfs, ignore_index=True)
343

◆ plot_coverage()

def plot_coverage (   self)
Plots the coverage of each particle.

Definition at line 593 of file sysvar.py.

593 def plot_coverage(self):
594 """
595 Plots the coverage of each particle.
596 """
597 for particle in self.particles:
598 particle.plot_coverage()
599
600

◆ print_coverage()

def print_coverage (   self)
Prints the coverage of each particle.

Definition at line 585 of file sysvar.py.

585 def print_coverage(self):
586 """
587 Prints the coverage of each particle.
588 """
589 print('Coverage:')
590 for particle in self.particles:
591 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
592

◆ reweight()

def reweight (   self,
pd.DataFrame  df,
bool   generate_variations = True 
)
Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 563 of file sysvar.py.

565 generate_variations: bool = True):
566 """
567 Reweights the dataframe according to the weight tables.
568
569 Args:
570 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
571 generate_variations (bool): When true generate weight variations.
572 """
573 for particle in self.particles:
574 if particle.type not in _correction_types:
575 raise ValueError(f'Particle type {particle.type} not supported!')
576 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
577 if generate_variations:
578 particle.generate_variations(n_variations=self.n_variations)
579 if particle.type == 'PID':
580 self.add_pid_weight_columns(df, particle)
581 elif particle.type == 'FEI':
582 self.add_fei_weight_columns(df, particle)
583 return df
584

Member Data Documentation

◆ correlations

correlations

Correlations between the particles.

Definition at line 249 of file sysvar.py.

◆ evaluate_plots

evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 255 of file sysvar.py.

◆ fillna

fillna

Value to fill NaN values.

Definition at line 259 of file sysvar.py.

◆ n_variations

n_variations

Number of weight variations to generate.

Definition at line 245 of file sysvar.py.

◆ nbins

nbins

Number of bins for the plots.

Definition at line 257 of file sysvar.py.

◆ particles

particles

List of particles.

Definition at line 247 of file sysvar.py.

◆ weight_name

weight_name

Name of the weight column.

Definition at line 251 of file sysvar.py.

◆ weights_generated

weights_generated

Flag to indicate if the weights have been generated.

Definition at line 253 of file sysvar.py.


The documentation for this class was generated from the following file: