Belle II Software prerelease-10-00-00a
Reweighter Class Reference

Public Member Functions

None __init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
 
list get_bin_columns (self, weight_df)
 
dict get_binning (self, weight_df)
 
dict get_fei_binning (self, weight_df)
 
None get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
pd.DataFrame merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)
 
None add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
None add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
 
ReweighterParticle get_particle (self, str prefix)
 
 convert_fei_table (self, pd.DataFrame table, float threshold)
 
None add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
 
 add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
 reweight (self, pd.DataFrame df, bool generate_variations=True)
 
 print_coverage (self)
 
 plot_coverage (self)
 

Public Attributes

 n_variations = n_variations
 Number of weight variations to generate.
 
list particles = []
 List of particles.
 
list correlations = []
 Correlations between the particles.
 
 weight_name = weight_name
 Name of the weight column.
 
bool weights_generated = False
 Flag to indicate if the weights have been generated.
 
 evaluate_plots = evaluate_plots
 Flag to indicate if the plots should be evaluated.
 
 nbins = nbins
 Number of bins for the plots.
 
 fillna = fillna
 Value to fill NaN values.
 

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value used to fill NaN values.

Definition at line 228 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__ ( self,
int n_variations = 100,
str weight_name = "Weight",
bool evaluate_plots = True,
int nbins = 50,
float fillna = 1.0 )
Initializes the Reweighter class.

Definition at line 239 of file sysvar.py.

244 fillna: float = 1.0) -> None:
245 """
246 Initializes the Reweighter class.
247 """
248
249 self.n_variations = n_variations
250
251 self.particles = []
252
253 self.correlations = []
254
255 self.weight_name = weight_name
256
257 self.weights_generated = False
258
259 self.evaluate_plots = evaluate_plots
260
261 self.nbins = nbins
262
263 self.fillna = fillna
264

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle ( self,
str prefix,
pd.DataFrame table,
float threshold,
np.ndarray cov = None,
dict variable_aliases = None )
Adds an FEI particle built from the given calibration table, so that weight variations according to the total uncertainty can be used for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 490 of file sysvar.py.

495 ) -> None:
496 """
497 Adds weight variations according to the total uncertainty for easier error propagation.
498
499 Args:
500 prefix (str): Prefix for the new columns.
501 table (pandas.DataFrame): Dataframe containing the efficiency weights.
502 threshold (float): Threshold for the efficiency weights.
503 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
504 variable_aliases (dict): Dictionary containing variable aliases.
505 """
506 # Empty prefix means no prefix
507 if prefix is None:
508 prefix = ''
509 if prefix and not prefix.endswith('_'):
510 prefix += '_'
511 if self.get_particle(prefix):
512 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
513 if variable_aliases is None:
514 variable_aliases = {}
515 if table is None or len(table) == 0:
516 raise ValueError('No weights provided!')
517 converted_table = self.convert_fei_table(table, threshold)
518 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
519 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
520 particle = ReweighterParticle(prefix,
521 type='FEI',
522 merged_table=converted_table,
523 pdg_binning=pdg_binning,
524 variable_aliases=variable_aliases,
525 weight_name=self.weight_name,
526 cov=cov)
527 self.particles += [particle]
528

◆ add_fei_weight_columns()

add_fei_weight_columns ( self,
pd.DataFrame ntuple_df,
ReweighterParticle particle )
Adds weight columns according to the FEI calibration tables

Definition at line 529 of file sysvar.py.

529 def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
530 """
531 Adds weight columns according to the FEI calibration tables
532 """
533 rest_str = 'rest'
534 particle.merged_table[_fei_mode_col]
535 # Apply a weight value from the weight table to the ntuple, based on the binning
536 binning_df = pd.DataFrame(index=ntuple_df.index)
537 # Take absolute value of PDG for binning because we have charge already
538 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
539 # Copy the mode ID from the ntuple
540 binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
541 # Default value in case if reco PDG is not a B-meson PDG
542 binning_df[_fei_mode_col] = np.nan
543 plot_values = {}
544 for reco_pdg, mc_pdg in particle.pdg_binning:
545 plot_values[(reco_pdg, mc_pdg)] = {}
546 binning_df.loc[binning_df['PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
547 f'PDG == {reco_pdg} and {_fei_mode_col}.str.lower() == "{rest_str}"')[_fei_mode_col].values[0]
548 for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
549 binning_df.loc[(binning_df['PDG'] == reco_pdg) & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
550 if self.evaluate_plots:
551 values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
552 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
553 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]
554
555 # merge the weight table with the ntuple on binning columns
556 weight_cols = _weight_cols
557 if particle.column_names:
558 weight_cols = particle.column_names
559 binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
560 on=['PDG', _fei_mode_col], how='left')
561 binning_df.index = ntuple_df.index
562 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
563 particle.plot_values = plot_values
564 for col in weight_cols:
565 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
566

◆ add_pid_particle()

None add_pid_particle ( self,
str prefix,
dict weights_dict,
dict pdg_pid_variable_dict,
dict variable_aliases = None,
int sys_seed = None,
bool syscorr = True )
Adds a PID particle built from the merged weight tables, so that weight variations according to the total uncertainty can be used for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary containing the weight tables, keyed by (reconstructed PDG, MC PDG) pairs.
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume systematics are 100% correlated. Defaults to
true. Note this is overridden by provision of a None value rho_sys.

Definition at line 402 of file sysvar.py.

408 syscorr: bool = True) -> None:
409 """
410 Adds weight variations according to the total uncertainty for easier error propagation.
411
412 Args:
413 prefix (str): Prefix for the new columns.
414 weights_dict (dict): Dictionary containing the weight tables, keyed by (reconstructed PDG, MC PDG) pairs.
415 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
416 variable_aliases (dict): Dictionary containing variable aliases.
417 sys_seed (int): Seed for the systematic variations.
418 syscorr (bool): When true assume systematics are 100% correlated defaults to
419 true. Note this is overridden by provision of a None value rho_sys
420 """
421 # Empty prefix means no prefix
422 if prefix is None:
423 prefix = ''
424 # Add underscore if not present
425 if prefix and not prefix.endswith('_'):
426 prefix += '_'
427 if self.get_particle(prefix):
428 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
429 if variable_aliases is None:
430 variable_aliases = {}
431 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
432 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
433 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
434 particle = ReweighterParticle(prefix,
435 type='PID',
436 merged_table=merged_weight_df,
437 pdg_binning=pdg_binning,
438 variable_aliases=variable_aliases,
439 weight_name=self.weight_name,
440 sys_seed=sys_seed,
441 syscorr=syscorr)
442 self.particles += [particle]
443

◆ add_pid_weight_columns()

None add_pid_weight_columns ( self,
pd.DataFrame ntuple_df,
ReweighterParticle particle )
Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 348 of file sysvar.py.

350 particle: ReweighterParticle) -> None:
351 """
352 Adds weight and uncertainty columns to the dataframe.
353
354 Args:
355 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
356 particle (ReweighterParticle): Particle object.
357 """
358 # Apply a weight value from the weight table to the ntuple, based on the binning
359 binning_df = pd.DataFrame(index=ntuple_df.index)
360 # Take absolute value of mcPDG for binning because we have charge already
361 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
362 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
363 plot_values = {}
364 for reco_pdg, mc_pdg in particle.pdg_binning:
365 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
366 if ntuple_df.query(ntuple_cut).empty:
367 continue
368 plot_values[(reco_pdg, mc_pdg)] = {}
369 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
370 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
371 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
373 ntuple_cut)[f'{particle.get_varname(var)}'],
374 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
375 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
376 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
377 var].str[0]
378 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
379 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
380 var].str[1]
381 binning_df.drop(var, axis=1, inplace=True)
382 if self.evaluate_plots:
383 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
384 if len(values.unique()) < 2:
385 print(f'Skip {var} for plotting!')
386 continue
387 x_range = np.linspace(values.min(), values.max(), self.nbins)
388 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
389 # merge the weight table with the ntuple on binning columns
390 weight_cols = _weight_cols
391 if particle.column_names:
392 weight_cols = particle.column_names
393 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
394 on=binning_df.columns.tolist(), how='left')
395 binning_df.index = ntuple_df.index
396 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
397 particle.plot_values = plot_values
398 for col in weight_cols:
399 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
400 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
401

◆ convert_fei_table()

convert_fei_table ( self,
pd.DataFrame table,
float threshold )
Checks if the tables are provided in a legacy format, converts them to the standard format, and keeps only the rows matching the given threshold. Raises a ValueError if no rows match.

Definition at line 453 of file sysvar.py.

453 def convert_fei_table(self, table: pd.DataFrame, threshold: float):
454 """
455 Checks if the tables are provided in a legacy format and converts them to the standard format.
456 """
457 result = None
458 str_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
459 if 'cal' in table.columns:
460 result = pd.DataFrame(index=table.index)
461 result['data_MC_ratio'] = table['cal']
462 result['PDG'] = table['Btag'].apply(lambda x: str_to_pdg.get(x))
463 # Assume these are only efficiency tables
464 result['mcPDG'] = result['PDG']
465 result['threshold'] = table['sig_prob_threshold']
466 result[_fei_mode_col] = table[_fei_mode_col]
467 result['data_MC_uncertainty_stat_dn'] = table['cal_stat_error']
468 result['data_MC_uncertainty_stat_up'] = table['cal_stat_error']
469 result['data_MC_uncertainty_sys_dn'] = table['cal_sys_error']
470 result['data_MC_uncertainty_sys_up'] = table['cal_sys_error']
471 elif 'cal factor' in table.columns:
472 result = pd.DataFrame(index=table.index)
473 result['data_MC_ratio'] = table['cal factor']
474 result['PDG'] = table['Btype'].apply(lambda x: str_to_pdg.get(x))
475 result['mcPDG'] = result['PDG']
476 result['threshold'] = table['sig prob cut']
477 # Assign the total error to the stat uncertainty and set syst. one to 0
478 result['data_MC_uncertainty_stat_dn'] = table['error']
479 result['data_MC_uncertainty_stat_up'] = table['error']
480 result['data_MC_uncertainty_sys_dn'] = 0
481 result['data_MC_uncertainty_sys_up'] = 0
482 result[_fei_mode_col] = table['mode']
483 else:
484 result = table
485 result = result.query(f'threshold == {threshold}')
486 if len(result) == 0:
487 raise ValueError(f'No weights found for threshold {threshold}!')
488 return result
489

◆ get_bin_columns()

list get_bin_columns ( self,
weight_df )
Returns the kinematic bin columns of the dataframe.

Definition at line 265 of file sysvar.py.

265 def get_bin_columns(self, weight_df) -> list:
266 """
267 Returns the kinematic bin columns of the dataframe.
268 """
269 return [col for col in weight_df.columns if col.endswith('_min') or col.endswith('_max')]
270

◆ get_binning()

dict get_binning ( self,
weight_df )
Returns the kinematic binning of the dataframe.

Definition at line 271 of file sysvar.py.

271 def get_binning(self, weight_df) -> dict:
272 """
273 Returns the kinematic binning of the dataframe.
274 """
275 columns = self.get_bin_columns(weight_df)
276 var_names = {'_'.join(col.split('_')[:-1]) for col in columns}
277 bin_dict = {}
278 for var_name in var_names:
279 bin_dict[var_name] = []
280 for col in columns:
281 if col.startswith(var_name):
282 bin_dict[var_name] += list(weight_df[col].values)
283 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
284 return bin_dict
285

◆ get_fei_binning()

dict get_fei_binning ( self,
weight_df )
Returns the irregular binning of the dataframe.

Definition at line 286 of file sysvar.py.

286 def get_fei_binning(self, weight_df) -> dict:
287 """
288 Returns the irregular binning of the dataframe.
289 """
290 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith('mode'),
291 _fei_mode_col].value_counts().index.to_list()}
292

◆ get_ntuple_variables()

None get_ntuple_variables ( self,
pd.DataFrame ntuple_df,
ReweighterParticle particle )
Checks that all variables required by the particle are in the ntuple and returns them. Raises a ValueError if a required variable is missing.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 293 of file sysvar.py.

295 particle: ReweighterParticle) -> None:
296 """
297 Checks if the variables are in the ntuple and returns them.
298
299 Args:
300 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
301 particle (ReweighterParticle): Particle object containing the necessary variables.
302 """
303 ntuple_variables = particle.get_binning_variables()
304 ntuple_variables += particle.get_pdg_variables()
305 for var in ntuple_variables:
306 if var not in ntuple_df.columns:
307 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
308 return ntuple_variables
309

◆ get_particle()

ReweighterParticle get_particle ( self,
str prefix )
Get a particle by its prefix.

Definition at line 444 of file sysvar.py.

444 def get_particle(self, prefix: str) -> ReweighterParticle:
445 """
446 Get a particle by its prefix.
447 """
448 cands = [particle for particle in self.particles if particle.prefix.strip('_') == prefix.strip('_')]
449 if len(cands) == 0:
450 return None
451 return cands[0]
452

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables ( self,
dict weights_dict,
dict pdg_pid_variable_dict )
Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 310 of file sysvar.py.

312 pdg_pid_variable_dict: dict) -> pd.DataFrame:
313 """
314 Merges the efficiency and fake rate weight tables.
315
316 Args:
317 weights_dict (dict): Dictionary containing the weight tables.
318 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
319 """
320 weight_dfs = []
321 for reco_pdg, mc_pdg in weights_dict:
322 if reco_pdg not in pdg_pid_variable_dict:
323 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
324 weight_df = weights_dict[(reco_pdg, mc_pdg)]
325 weight_df['mcPDG'] = mc_pdg
326 weight_df['PDG'] = reco_pdg
327 # Check if these are legacy tables:
328 if 'charge' in weight_df.columns:
329 charge_dict = {'+': [0, 2], '-': [-2, 0]}
330 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
331 weight_df = weight_df.drop(columns=['charge'])
332 # If iso_score is a single value, drop the min and max columns
333 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
334 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
335 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
336 threshold = pdg_pid_variable_dict[reco_pdg][1]
337 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
338 if len(selected_weights) == 0:
339 available_variables = weight_df['variable'].unique()
340 available_thresholds = weight_df['threshold'].unique()
341 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
342 f' variable {pid_variable_name} and threshold {threshold}!\n'
343 f' Available variables: {available_variables}\n'
344 f' Available thresholds: {available_thresholds}')
345 weight_dfs.append(selected_weights)
346 return pd.concat(weight_dfs, ignore_index=True)
347

◆ plot_coverage()

plot_coverage ( self)
Plots the coverage of each particle.

Definition at line 597 of file sysvar.py.

597 def plot_coverage(self):
598 """
599 Plots the coverage of each particle.
600 """
601 for particle in self.particles:
602 particle.plot_coverage()
603
604

◆ print_coverage()

print_coverage ( self)
Prints the coverage of each particle.

Definition at line 589 of file sysvar.py.

589 def print_coverage(self):
590 """
591 Prints the coverage of each particle.
592 """
593 print('Coverage:')
594 for particle in self.particles:
595 print(f'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
596

◆ reweight()

reweight ( self,
pd.DataFrame df,
bool generate_variations = True )
Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 567 of file sysvar.py.

569 generate_variations: bool = True):
570 """
571 Reweights the dataframe according to the weight tables.
572
573 Args:
574 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
575 generate_variations (bool): When true generate weight variations.
576 """
577 for particle in self.particles:
578 if particle.type not in _correction_types:
579 raise ValueError(f'Particle type {particle.type} not supported!')
580 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
581 if generate_variations:
582 particle.generate_variations(n_variations=self.n_variations)
583 if particle.type == 'PID':
584 self.add_pid_weight_columns(df, particle)
585 elif particle.type == 'FEI':
586 self.add_fei_weight_columns(df, particle)
587 return df
588

Member Data Documentation

◆ correlations

list correlations = []

Correlations between the particles.

Definition at line 253 of file sysvar.py.

◆ evaluate_plots

evaluate_plots = evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 259 of file sysvar.py.

◆ fillna

fillna = fillna

Value to fill NaN values.

Definition at line 263 of file sysvar.py.

◆ n_variations

n_variations = n_variations

Number of weight variations to generate.

Definition at line 249 of file sysvar.py.

◆ nbins

nbins = nbins

Number of bins for the plots.

Definition at line 261 of file sysvar.py.

◆ particles

list particles = []

List of particles.

Definition at line 251 of file sysvar.py.

◆ weight_name

weight_name = weight_name

Name of the weight column.

Definition at line 255 of file sysvar.py.

◆ weights_generated

bool weights_generated = False

Flag to indicate if the weights have been generated.

Definition at line 257 of file sysvar.py.


The documentation for this class was generated from the following file: