Belle II Software development
Reweighter Class Reference

Public Member Functions

None __init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
 
list get_bin_columns (self, weight_df)
 
dict get_binning (self, weight_df)
 
dict get_fei_binning (self, weight_df)
 
None get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
pd.DataFrame merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)
 
None add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
None add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
 
ReweighterParticle get_particle (self, str prefix)
 
def convert_fei_table (self, pd.DataFrame table, float threshold)
 
None add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
 
def add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
def reweight (self, pd.DataFrame df, bool generate_variations=True)
 
def print_coverage (self)
 
def plot_coverage (self)
 

Public Attributes

 n_variations
 Number of weight variations to generate.
 
 particles
 List of particles.
 
 correlations
 Correlations between the particles.
 
 weight_name
 Name of the weight column.
 
 weights_generated
 Flag to indicate if the weights have been generated.
 
 evaluate_plots
 Flag to indicate if the plots should be evaluated.
 
 nbins
 Number of bins for the plots.
 
 fillna
 Value to fill NaN values.
 

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value to fill NaN values.

Definition at line 222 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__ (   self,
int   n_variations = 100,
str   weight_name = "Weight",
bool   evaluate_plots = True,
int   nbins = 50,
float   fillna = 1.0 
)
Initializes the Reweighter class.

Definition at line 233 of file sysvar.py.

238 fillna: float = 1.0) -> None:
239 """
240 Initializes the Reweighter class.
241 """
242
243 self.n_variations = n_variations
244
245 self.particles = []
246
247 self.correlations = []
248
249 self.weight_name = weight_name
250
251 self.weights_generated = False
252
253 self.evaluate_plots = evaluate_plots
254
255 self.nbins = nbins
256
257 self.fillna = fillna
258

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle (   self,
str  prefix,
pd.DataFrame  table,
float  threshold,
np.ndarray   cov = None,
dict   variable_aliases = None 
)
Registers a new FEI-corrected particle, built from the given calibration table, whose weight variations are generated according to the total uncertainty for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 472 of file sysvar.py.

477 ) -> None:
478 """
479 Adds weight variations according to the total uncertainty for easier error propagation.
480
481 Args:
482 prefix (str): Prefix for the new columns.
483 table (pandas.DataFrame): Dataframe containing the efficiency weights.
484 threshold (float): Threshold for the efficiency weights.
485 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
486 variable_aliases (dict): Dictionary containing variable aliases.
487 """
488 # Empty prefix means no prefix
489 if prefix is None:
490 prefix = ''
491 if prefix and not prefix.endswith('_'):
492 prefix += '_'
493 if self.get_particle(prefix):
494 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
495 if variable_aliases is None:
496 variable_aliases = {}
497 if table is None or len(table) == 0:
498 raise ValueError('No weights provided!')
499 converted_table = self.convert_fei_table(table, threshold)
500 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
501 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
502 particle = ReweighterParticle(prefix,
503 type='FEI',
504 merged_table=converted_table,
505 pdg_binning=pdg_binning,
506 variable_aliases=variable_aliases,
507 weight_name=self.weight_name,
508 cov=cov)
509 self.particles += [particle]
510

◆ add_fei_weight_columns()

def add_fei_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight columns according to the FEI calibration tables

Definition at line 511 of file sysvar.py.

def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
    """
    Adds weight columns according to the FEI calibration tables.

    Every candidate gets a mode label per reconstructed PDG code: the label of
    the 'Rest' row of the calibration table by default, replaced by the specific
    mode label whenever the candidate's numeric mode ID has its own entry in the
    particle binning. The labelled candidates are then left-merged with the
    calibration table and the weight columns are copied into the ntuple.

    Args:
        ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple;
            the weight columns are added to it in place.
        particle (ReweighterParticle): Particle object; its ``coverage`` and
            ``plot_values`` attributes are updated.
    """
    rest_str = 'Rest'
    mode_varname = particle.get_varname(_fei_mode_col)
    # Apply a weight value from the weight table to the ntuple, based on the binning
    binning_df = pd.DataFrame(index=ntuple_df.index)
    # Take absolute value of the reconstructed PDG for binning because we have charge already
    binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
    # Copy the mode ID from the ntuple
    binning_df['num_mode'] = ntuple_df[mode_varname].astype(int)
    # Default value in case if reco PDG is not a B-meson PDG.
    # The column is created with object dtype because string labels are written
    # into it below: writing strings into a float64 column is deprecated in
    # pandas, and an all-NaN float column cannot be merged on a string key.
    binning_df[_fei_mode_col] = pd.Series(np.nan, index=binning_df.index, dtype=object)
    plot_values = {}
    for reco_pdg, mc_pdg in particle.pdg_binning:
        plot_values[(reco_pdg, mc_pdg)] = {}
        same_pdg = binning_df['PDG'] == reco_pdg
        # Start from the catch-all 'Rest' label for every candidate with this PDG ...
        binning_df.loc[same_pdg, _fei_mode_col] = particle.merged_table.query(
            f'PDG == {reco_pdg} and {_fei_mode_col} == "{rest_str}"')[_fei_mode_col].values[0]
        # ... and override it for the modes that have a dedicated table entry.
        # The numeric mode ID is whatever follows the first four characters of the label.
        for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
            binning_df.loc[same_pdg & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
        if self.evaluate_plots:
            values = ntuple_df[mode_varname]
            x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
            plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]

    # merge the weight table with the ntuple on binning columns
    weight_cols = particle.column_names if particle.column_names else _weight_cols
    binning_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
                                  on=['PDG', _fei_mode_col], how='left')
    # merge() resets the index, so restore the ntuple index before copying columns back
    binning_df.index = ntuple_df.index
    # Fraction of candidates that received a (non-NaN) weight
    particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
    particle.plot_values = plot_values
    for col in weight_cols:
        ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]

◆ add_pid_particle()

None add_pid_particle (   self,
str  prefix,
dict  weights_dict,
dict  pdg_pid_variable_dict,
dict   variable_aliases = None,
int   sys_seed = None,
bool   syscorr = True 
)
Registers a new PID-corrected particle, built from the given weight tables, whose weight variations are generated according to the total uncertainty for easier error propagation.

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary containing the weight tables, keyed by (reconstructed PDG, MC PDG) pairs.
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume systematics are 100% correlated. Defaults to
true. Note this is overridden by provision of a None value rho_sys

Definition at line 396 of file sysvar.py.

402 syscorr: bool = True) -> None:
403 """
404 Adds weight variations according to the total uncertainty for easier error propagation.
405
406 Args:
407 prefix (str): Prefix for the new columns.
408 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
409 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
410 variable_aliases (dict): Dictionary containing variable aliases.
411 sys_seed (int): Seed for the systematic variations.
412 syscorr (bool): When true assume systematics are 100% correlated defaults to
413 true. Note this is overridden by provision of a None value rho_sys
414 """
415 # Empty prefix means no prefix
416 if prefix is None:
417 prefix = ''
418 # Add underscore if not present
419 if prefix and not prefix.endswith('_'):
420 prefix += '_'
421 if self.get_particle(prefix):
422 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
423 if variable_aliases is None:
424 variable_aliases = {}
425 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
426 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
427 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
428 particle = ReweighterParticle(prefix,
429 type='PID',
430 merged_table=merged_weight_df,
431 pdg_binning=pdg_binning,
432 variable_aliases=variable_aliases,
433 weight_name=self.weight_name,
434 sys_seed=sys_seed,
435 syscorr=syscorr)
436 self.particles += [particle]
437

◆ add_pid_weight_columns()

None add_pid_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 342 of file sysvar.py.

344 particle: ReweighterParticle) -> None:
345 """
346 Adds a weight and uncertainty columns to the dataframe.
347
348 Args:
349 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
350 particle (ReweighterParticle): Particle object.
351 """
352 # Apply a weight value from the weight table to the ntuple, based on the binning
353 binning_df = pd.DataFrame(index=ntuple_df.index)
354 # Take absolute value of mcPDG for binning because we have charge already
355 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
356 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
357 plot_values = {}
358 for reco_pdg, mc_pdg in particle.pdg_binning:
359 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
360 if ntuple_df.query(ntuple_cut).empty:
361 continue
362 plot_values[(reco_pdg, mc_pdg)] = {}
363 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
364 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
365 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
366 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
367 ntuple_cut)[f'{particle.get_varname(var)}'],
368 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
369 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
370 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
371 var].str[0]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
374 var].str[1]
375 binning_df.drop(var, axis=1, inplace=True)
376 if self.evaluate_plots:
377 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
378 if len(values.unique()) < 2:
379 print(f'Skip {var} for plotting!')
380 continue
381 x_range = np.linspace(values.min(), values.max(), self.nbins)
382 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
383 # merge the weight table with the ntuple on binning columns
384 weight_cols = _weight_cols
385 if particle.column_names:
386 weight_cols = particle.column_names
387 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
388 on=binning_df.columns.tolist(), how='left')
389 binning_df.index = ntuple_df.index
390 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
391 particle.plot_values = plot_values
392 for col in weight_cols:
393 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
394 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
395

◆ convert_fei_table()

def convert_fei_table (   self,
pd.DataFrame  table,
float  threshold 
)
Checks if the tables are provided in a legacy format and converts them to the standard format.

Definition at line 447 of file sysvar.py.

def convert_fei_table(self, table: pd.DataFrame, threshold: float):
    """
    Brings a FEI calibration table into the standard format and selects one threshold.

    A table with a ``cal`` column is treated as legacy format and its columns are
    renamed to the standard names; any other table is used as it is.

    Args:
        table (pandas.DataFrame): Calibration table in legacy or standard format.
        threshold (float): Threshold value whose rows are selected.

    Returns:
        pandas.DataFrame: Rows of the standard-format table with the requested threshold.

    Raises:
        ValueError: If no row matches the requested threshold.
    """
    if 'cal' not in table.columns:
        # Already in the standard format
        converted = table
    else:
        converted = pd.DataFrame(index=table.index)
        converted['data_MC_ratio'] = table['cal']
        btag_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
        converted['PDG'] = table['Btag'].apply(lambda tag: btag_to_pdg.get(tag))
        # Assume these are only efficiency tables
        converted['mcPDG'] = converted['PDG']
        converted['threshold'] = table['sig_prob_threshold']
        converted[_fei_mode_col] = table[_fei_mode_col]
        # Legacy tables carry one symmetric error per source; use it for both directions
        for source in ('stat', 'sys'):
            for side in ('dn', 'up'):
                converted[f'data_MC_uncertainty_{source}_{side}'] = table[f'cal_{source}_error']
    selected = converted.query(f'threshold == {threshold}')
    if len(selected) == 0:
        raise ValueError(f'No weights found for threshold {threshold}!')
    return selected

◆ get_bin_columns()

list get_bin_columns (   self,
  weight_df 
)
Returns the kinematic bin columns of the dataframe.

Definition at line 259 of file sysvar.py.

def get_bin_columns(self, weight_df) -> list:
    """
    Collects the names of the bin-edge columns of a weight table.

    Args:
        weight_df (pandas.DataFrame): Weight table to inspect.

    Returns:
        list: Column names ending in ``_min`` or ``_max``, in table order.
    """
    edge_suffixes = ('_min', '_max')
    return [name for name in weight_df.columns if name.endswith(edge_suffixes)]

◆ get_binning()

dict get_binning (   self,
  weight_df 
)
Returns the kinematic binning of the dataframe.

Definition at line 265 of file sysvar.py.

def get_binning(self, weight_df) -> dict:
    """
    Returns the kinematic binning of the dataframe.

    The bin edges of each variable are the sorted unique values found in its
    ``<variable>_min`` and ``<variable>_max`` columns.

    Args:
        weight_df (pandas.DataFrame): Weight table with ``*_min`` / ``*_max`` columns.

    Returns:
        dict: Maps each binning variable name to a sorted numpy array of its
        bin edges. Keys follow the order of the columns in the table.
    """
    edges_by_var = {}
    for col in self.get_bin_columns(weight_df):
        # Strip only the trailing '_min' / '_max' to get the variable name, and
        # assign the column to exactly that variable. Matching by prefix would
        # mix the edges of variables that share a prefix (e.g. 'p' and 'pt').
        var_name = col.rsplit('_', 1)[0]
        edges_by_var.setdefault(var_name, set()).update(weight_df[col].values)
    return {var_name: np.array(sorted(edges)) for var_name, edges in edges_by_var.items()}

◆ get_fei_binning()

dict get_fei_binning (   self,
  weight_df 
)
Returns the irregular binning of the dataframe.

Definition at line 280 of file sysvar.py.

def get_fei_binning(self, weight_df) -> dict:
    """
    Returns the irregular (per-mode) binning of a FEI calibration table.

    Args:
        weight_df (pandas.DataFrame): Calibration table containing the FEI mode column.

    Returns:
        dict: Single entry mapping the FEI mode column name to the list of
        distinct labels that start with ``mode``, in ``value_counts`` order.
    """
    is_mode_label = weight_df[_fei_mode_col].str.startswith('mode')
    mode_labels = weight_df.loc[is_mode_label, _fei_mode_col]
    return {_fei_mode_col: mode_labels.value_counts().index.to_list()}

◆ get_ntuple_variables()

None get_ntuple_variables (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Checks if the variables are in the ntuple and returns them.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 287 of file sysvar.py.

289 particle: ReweighterParticle) -> None:
290 """
291 Checks if the variables are in the ntuple and returns them.
292
293 Args:
294 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
295 particle (ReweighterParticle): Particle object containing the necessary variables.
296 """
297 ntuple_variables = particle.get_binning_variables()
298 ntuple_variables += particle.get_pdg_variables()
299 for var in ntuple_variables:
300 if var not in ntuple_df.columns:
301 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
302 return ntuple_variables
303

◆ get_particle()

ReweighterParticle get_particle (   self,
str  prefix 
)
Get a particle by its prefix.

Definition at line 438 of file sysvar.py.

def get_particle(self, prefix: str) -> ReweighterParticle:
    """
    Looks up a registered particle by its prefix.

    Leading and trailing underscores are ignored on both sides of the comparison.

    Args:
        prefix (str): Prefix of the particle to look for.

    Returns:
        ReweighterParticle: The first registered particle with a matching prefix,
        or None if there is no such particle.
    """
    for candidate in self.particles:
        if candidate.prefix.strip('_') == prefix.strip('_'):
            return candidate
    return None

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables (   self,
dict  weights_dict,
dict  pdg_pid_variable_dict 
)
Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 304 of file sysvar.py.

306 pdg_pid_variable_dict: dict) -> pd.DataFrame:
307 """
308 Merges the efficiency and fake rate weight tables.
309
310 Args:
311 weights_dict (dict): Dictionary containing the weight tables.
312 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
313 """
314 weight_dfs = []
315 for reco_pdg, mc_pdg in weights_dict:
316 if reco_pdg not in pdg_pid_variable_dict:
317 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
318 weight_df = weights_dict[(reco_pdg, mc_pdg)]
319 weight_df['mcPDG'] = mc_pdg
320 weight_df['PDG'] = reco_pdg
321 # Check if these are legacy tables:
322 if 'charge' in weight_df.columns:
323 charge_dict = {'+': [0, 2], '-': [-2, 0]}
324 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
325 weight_df = weight_df.drop(columns=['charge'])
326 # If iso_score is a single value, drop the min and max columns
327 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
328 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
329 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
330 threshold = pdg_pid_variable_dict[reco_pdg][1]
331 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
332 if len(selected_weights) == 0:
333 available_variables = weight_df['variable'].unique()
334 available_thresholds = weight_df['threshold'].unique()
335 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
336 f' variable {pid_variable_name} and threshold {threshold}!\n'
337 f' Available variables: {available_variables}\n'
338 f' Available thresholds: {available_thresholds}')
339 weight_dfs.append(selected_weights)
340 return pd.concat(weight_dfs, ignore_index=True)
341

◆ plot_coverage()

def plot_coverage (   self)
Plots the coverage of each particle.

Definition at line 578 of file sysvar.py.

def plot_coverage(self):
    """
    Draws the coverage plots of all registered particles.

    Delegates to ``plot_coverage`` of every particle, in registration order.
    """
    for registered in self.particles:
        registered.plot_coverage()

◆ print_coverage()

def print_coverage (   self)
Prints the coverage of each particle.

Definition at line 570 of file sysvar.py.

def print_coverage(self):
    """
    Prints the coverage of all registered particles.

    Writes a header line followed by one line per particle, giving its type,
    its prefix without surrounding underscores, and its coverage in percent
    with one decimal place.
    """
    print('Coverage:')
    for entry in self.particles:
        label = entry.prefix.strip("_")
        percent = entry.coverage*100
        print(f'{entry.type} {label}: {percent :0.1f}%')

◆ reweight()

def reweight (   self,
pd.DataFrame  df,
bool   generate_variations = True 
)
Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 548 of file sysvar.py.

550 generate_variations: bool = True):
551 """
552 Reweights the dataframe according to the weight tables.
553
554 Args:
555 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
556 generate_variations (bool): When true generate weight variations.
557 """
558 for particle in self.particles:
559 if particle.type not in _correction_types:
560 raise ValueError(f'Particle type {particle.type} not supported!')
561 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
562 if generate_variations:
563 particle.generate_variations(n_variations=self.n_variations)
564 if particle.type == 'PID':
565 self.add_pid_weight_columns(df, particle)
566 elif particle.type == 'FEI':
567 self.add_fei_weight_columns(df, particle)
568 return df
569

Member Data Documentation

◆ correlations

correlations

Correlations between the particles.

Definition at line 247 of file sysvar.py.

◆ evaluate_plots

evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 253 of file sysvar.py.

◆ fillna

fillna

Value to fill NaN values.

Definition at line 257 of file sysvar.py.

◆ n_variations

n_variations

Number of weight variations to generate.

Definition at line 243 of file sysvar.py.

◆ nbins

nbins

Number of bins for the plots.

Definition at line 255 of file sysvar.py.

◆ particles

particles

List of particles.

Definition at line 245 of file sysvar.py.

◆ weight_name

weight_name

Name of the weight column.

Definition at line 249 of file sysvar.py.

◆ weights_generated

weights_generated

Flag to indicate if the weights have been generated.

Definition at line 251 of file sysvar.py.


The documentation for this class was generated from the following file: