Belle II Software light-2406-ragdoll
Reweighter Class Reference

Public Member Functions

None __init__ (self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
 
list get_bin_columns (self, weight_df)
 
dict get_binning (self, weight_df)
 
dict get_fei_binning (self, weight_df)
 
None get_ntuple_variables (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
pd.DataFrame merge_pid_weight_tables (self, dict weights_dict, dict pdg_pid_variable_dict)
 
None add_pid_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
None add_pid_particle (self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
 
ReweighterParticle get_particle (self, str prefix)
 
def convert_fei_table (self, pd.DataFrame table, float threshold)
 
None add_fei_particle (self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
 
def add_fei_weight_columns (self, pd.DataFrame ntuple_df, ReweighterParticle particle)
 
def reweight (self, pd.DataFrame df, bool generate_variations=True)
 
def print_coverage (self)
 
def plot_coverage (self)
 

Public Attributes

 n_variations
 Number of weight variations to generate.
 
 particles
 List of particles.
 
 correlations
 Correlations between the particles.
 
 weight_name
 Name of the weight column.
 
 weights_generated
 Flag to indicate if the weights have been generated.
 
 evaluate_plots
 Flag to indicate if the plots should be evaluated.
 
 nbins
 Number of bins for the plots.
 
 fillna
 Value to fill NaN values.
 

Detailed Description

Class that reweights the dataframe.

Args:
    n_variations (int): Number of weight variations to generate.
    weight_name (str): Name of the weight column.
    evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
    nbins (int): Number of bins for the plots.
    fillna (float): Value used to fill NaN weights.

Definition at line 222 of file sysvar.py.

Constructor & Destructor Documentation

◆ __init__()

None __init__ (   self,
int   n_variations = 100,
str   weight_name = "Weight",
bool   evaluate_plots = True,
int   nbins = 50,
float   fillna = 1.0 
)
Initializes the Reweighter class.

Definition at line 233 of file sysvar.py.

238 fillna: float = 1.0) -> None:
239 """
240 Initializes the Reweighter class.
241 """
242
243 self.n_variations = n_variations
244
245 self.particles = []
246
247 self.correlations = []
248
249 self.weight_name = weight_name
250
251 self.weights_generated = False
252
253 self.evaluate_plots = evaluate_plots
254
255 self.nbins = nbins
256
257 self.fillna = fillna
258

Member Function Documentation

◆ add_fei_particle()

None add_fei_particle (   self,
str  prefix,
pd.DataFrame  table,
float  threshold,
np.ndarray   cov = None,
dict   variable_aliases = None 
)
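Adds an FEI particle, built from the calibration table at the given threshold, whose weights and weight variations (according to the total uncertainty) are applied when reweighting, for easier error propagation.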

Args:
    prefix (str): Prefix for the new columns.
    table (pandas.DataFrame): Dataframe containing the efficiency weights.
    threshold (float): Threshold for the efficiency weights.
    cov (numpy.ndarray): Covariance matrix for the efficiency weights.
    variable_aliases (dict): Dictionary containing variable aliases.

Definition at line 471 of file sysvar.py.

476 ) -> None:
477 """
478 Adds weight variations according to the total uncertainty for easier error propagation.
479
480 Args:
481 prefix (str): Prefix for the new columns.
482 table (pandas.DataFrame): Dataframe containing the efficiency weights.
483 threshold (float): Threshold for the efficiency weights.
484 cov (numpy.ndarray): Covariance matrix for the efficiency weights.
485 variable_aliases (dict): Dictionary containing variable aliases.
486 """
487 # Empty prefix means no prefix
488 if prefix is None:
489 prefix = ''
490 if prefix and not prefix.endswith('_'):
491 prefix += '_'
492 if self.get_particle(prefix):
493 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
494 if variable_aliases is None:
495 variable_aliases = {}
496 if table is None or len(table) == 0:
497 raise ValueError('No weights provided!')
498 converted_table = self.convert_fei_table(table, threshold)
499 pdg_binning = {(reco_pdg, mc_pdg): self.get_fei_binning(converted_table.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
500 for reco_pdg, mc_pdg in converted_table[['PDG', 'mcPDG']].value_counts().index.to_list()}
501 particle = ReweighterParticle(prefix,
502 type='FEI',
503 merged_table=converted_table,
504 pdg_binning=pdg_binning,
505 variable_aliases=variable_aliases,
506 weight_name=self.weight_name,
507 cov=cov)
508 self.particles += [particle]
509

◆ add_fei_weight_columns()

def add_fei_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight columns according to the FEI calibration tables

Definition at line 510 of file sysvar.py.

def add_fei_weight_columns(self, ntuple_df: pd.DataFrame, particle: ReweighterParticle):
    """
    Adds weight columns according to the FEI calibration tables.

    Every candidate is labelled with a calibration mode (the explicit
    ``modeNN`` label when the table lists that mode for the reconstructed
    PDG, otherwise the ``Rest`` label), and the weight columns of the
    matching table row are written into ``ntuple_df`` in place.

    Args:
        ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple (modified in place).
        particle (ReweighterParticle): FEI particle holding the calibration table and binning.

    Side effects:
        Sets ``particle.coverage`` (fraction of candidates that received a weight)
        and ``particle.plot_values``.
    """
    rest_str = 'Rest'
    # Apply a weight value from the weight table to the ntuple, based on the binning
    binning_df = pd.DataFrame(index=ntuple_df.index)
    # Take absolute value of PDG for binning because we have charge already
    binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
    # Copy the mode ID from the ntuple
    binning_df['num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
    # Default value in case the reco PDG is not a B-meson PDG. The column is
    # created with object dtype because string mode labels are assigned below;
    # starting from a float column would force a dtype upcast on assignment.
    binning_df[_fei_mode_col] = pd.Series(np.nan, index=binning_df.index, dtype=object)
    plot_values = {}
    for reco_pdg, mc_pdg in particle.pdg_binning:
        plot_values[(reco_pdg, mc_pdg)] = {}
        same_pdg = binning_df['PDG'] == reco_pdg
        # Start every candidate of this PDG on the 'Rest' row of the table ...
        binning_df.loc[same_pdg, _fei_mode_col] = particle.merged_table.query(
            f'PDG == {reco_pdg} and {_fei_mode_col} == "{rest_str}"')[_fei_mode_col].values[0]
        # ... and override it for the modes that have a dedicated calibration.
        for mode in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
            # Labels look like 'modeNN'; the numeric part is compared to the ntuple mode ID
            binning_df.loc[same_pdg & (binning_df['num_mode'] == int(mode[4:])), _fei_mode_col] = mode
        if self.evaluate_plots:
            values = ntuple_df[f'{particle.get_varname(_fei_mode_col)}']
            x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
            plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=True)[0]

    # merge the weight table with the ntuple on binning columns
    weight_cols = _weight_cols
    if particle.column_names:
        weight_cols = particle.column_names
    merged_df = binning_df.merge(particle.merged_table[weight_cols + ['PDG', _fei_mode_col]],
                                 on=['PDG', _fei_mode_col], how='left')
    particle.coverage = 1 - merged_df[weight_cols[0]].isna().sum() / len(merged_df)
    particle.plot_values = plot_values
    # A column-on-column merge discards the ntuple index and returns a fresh
    # RangeIndex, so an index-aligned assignment would misplace the weights
    # whenever the ntuple index is not 0..n-1 (e.g. after a selection).
    # The left merge keeps the row order, so assign by position as long as
    # the row count is unchanged; otherwise keep the index-aligned assignment.
    rows_preserved = len(merged_df) == len(ntuple_df)
    for col in weight_cols:
        weights = merged_df[col]
        ntuple_df[f'{particle.get_varname(col)}'] = weights.to_numpy() if rows_preserved else weights

◆ add_pid_particle()

None add_pid_particle (   self,
str  prefix,
dict  weights_dict,
dict  pdg_pid_variable_dict,
dict   variable_aliases = None,
int   sys_seed = None,
bool   syscorr = True 
)
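Adds a PID particle, built from the merged weight tables, whose weights and weight variations (according to the total uncertainty) are applied when reweighting, for easier error propagation.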

Args:
    prefix (str): Prefix for the new columns.
    weights_dict (dict): Dictionary containing the efficiency and fake rate weight tables, keyed by (reconstructed PDG, MC PDG).
    pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
    variable_aliases (dict): Dictionary containing variable aliases.
    sys_seed (int): Seed for the systematic variations.
    syscorr (bool): When true, assume the systematics are 100% correlated. Defaults to
true. Note that this is overridden by provision of a None value rho_sys.

Definition at line 395 of file sysvar.py.

401 syscorr: bool = True) -> None:
402 """
403 Adds weight variations according to the total uncertainty for easier error propagation.
404
405 Args:
406 prefix (str): Prefix for the new columns.
407 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
408 pdg_pid_variable_dict (dict): Dictionary containing the PID variables and thresholds.
409 variable_aliases (dict): Dictionary containing variable aliases.
410 sys_seed (int): Seed for the systematic variations.
411 syscorr (bool): When true assume systematics are 100% correlated defaults to
412 true. Note this is overridden by provision of a None value rho_sys
413 """
414 # Empty prefix means no prefix
415 if prefix is None:
416 prefix = ''
417 # Add underscore if not present
418 if prefix and not prefix.endswith('_'):
419 prefix += '_'
420 if self.get_particle(prefix):
421 raise ValueError(f"Particle with prefix '{prefix}' already exists!")
422 if variable_aliases is None:
423 variable_aliases = {}
424 merged_weight_df = self.merge_pid_weight_tables(weights_dict, pdg_pid_variable_dict)
425 pdg_binning = {(reco_pdg, mc_pdg): self.get_binning(merged_weight_df.query(f'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
426 for reco_pdg, mc_pdg in merged_weight_df[['PDG', 'mcPDG']].value_counts().index.to_list()}
427 particle = ReweighterParticle(prefix,
428 type='PID',
429 merged_table=merged_weight_df,
430 pdg_binning=pdg_binning,
431 variable_aliases=variable_aliases,
432 weight_name=self.weight_name,
433 sys_seed=sys_seed,
434 syscorr=syscorr)
435 self.particles += [particle]
436

◆ add_pid_weight_columns()

None add_pid_weight_columns (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Adds weight and uncertainty columns to the dataframe.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object.

Definition at line 342 of file sysvar.py.

344 particle: ReweighterParticle) -> None:
345 """
346 Adds a weight and uncertainty columns to the dataframe.
347
348 Args:
349 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
350 particle (ReweighterParticle): Particle object.
351 """
352 # Apply a weight value from the weight table to the ntuple, based on the binning
353 binning_df = pd.DataFrame(index=ntuple_df.index)
354 # Take absolute value of mcPDG for binning because we have charge already
355 binning_df['mcPDG'] = ntuple_df[f'{particle.get_varname("mcPDG")}'].abs()
356 binning_df['PDG'] = ntuple_df[f'{particle.get_varname("PDG")}'].abs()
357 plot_values = {}
358 for reco_pdg, mc_pdg in particle.pdg_binning:
359 ntuple_cut = f'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
360 if ntuple_df.query(ntuple_cut).empty:
361 continue
362 plot_values[(reco_pdg, mc_pdg)] = {}
363 for var in particle.pdg_binning[(reco_pdg, mc_pdg)]:
364 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
365 for i in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
366 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
367 ntuple_cut)[f'{particle.get_varname(var)}'],
368 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
369 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
370 f'{var}_min'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
371 var].str[0]
372 binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
373 f'{var}_max'] = binning_df.loc[(binning_df['mcPDG'] == mc_pdg) & (binning_df['PDG'] == reco_pdg),
374 var].str[1]
375 binning_df.drop(var, axis=1, inplace=True)
376 if self.evaluate_plots:
377 values = ntuple_df.query(ntuple_cut)[f'{particle.get_varname(var)}']
378 if len(values.unique()) < 2:
379 print(f'Skip {var} for plotting!')
380 continue
381 x_range = np.linspace(values.min(), values.max(), self.nbins)
382 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=True)[0]
383 # merge the weight table with the ntuple on binning columns
384 weight_cols = _weight_cols
385 if particle.column_names:
386 weight_cols = particle.column_names
387 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
388 on=binning_df.columns.tolist(), how='left')
389 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
390 particle.plot_values = plot_values
391 for col in weight_cols:
392 ntuple_df[f'{particle.get_varname(col)}'] = binning_df[col]
393 ntuple_df[f'{particle.get_varname(col)}'] = ntuple_df[f'{particle.get_varname(col)}'].fillna(self.fillna)
394

◆ convert_fei_table()

def convert_fei_table (   self,
pd.DataFrame  table,
float  threshold 
)
Checks if the tables are provided in a legacy format and converts them to the standard format.

Definition at line 446 of file sysvar.py.

def convert_fei_table(self, table: pd.DataFrame, threshold: float):
    """
    Brings an FEI calibration table into the standard column layout and selects one threshold.

    A table that has a 'cal' column is treated as legacy format and its columns
    are copied into the standard names; any other table is used as it is.

    Args:
        table (pandas.DataFrame): FEI calibration table (legacy or standard format).
        threshold (float): Value the 'threshold' column has to equal.

    Returns:
        pandas.DataFrame: Rows of the standard-format table at the requested threshold.

    Raises:
        ValueError: If no row matches the requested threshold.
    """
    if 'cal' not in table.columns:
        standard = table
    else:
        btag_to_pdg = {'B+': 521, 'B-': 521, 'B0': 511}
        standard = pd.DataFrame(index=table.index)
        standard['data_MC_ratio'] = table['cal']
        standard['PDG'] = table['Btag'].apply(lambda label: btag_to_pdg.get(label))
        # Assume these are only efficiency tables
        standard['mcPDG'] = standard['PDG']
        standard['threshold'] = table['sig_prob_threshold']
        standard[_fei_mode_col] = table[_fei_mode_col]
        # Legacy tables carry one symmetric error per source, used for both directions
        legacy_errors = (
            ('data_MC_uncertainty_stat_dn', 'cal_stat_error'),
            ('data_MC_uncertainty_stat_up', 'cal_stat_error'),
            ('data_MC_uncertainty_sys_dn', 'cal_sys_error'),
            ('data_MC_uncertainty_sys_up', 'cal_sys_error'),
        )
        for target, source in legacy_errors:
            standard[target] = table[source]
    selected = standard.query(f'threshold == {threshold}')
    if len(selected) == 0:
        raise ValueError(f'No weights found for threshold {threshold}!')
    return selected

◆ get_bin_columns()

list get_bin_columns (   self,
  weight_df 
)
Returns the kinematic bin columns of the dataframe.

Definition at line 259 of file sysvar.py.

def get_bin_columns(self, weight_df) -> list:
    """
    Lists the columns of the dataframe that hold kinematic bin edges.

    Args:
        weight_df (pandas.DataFrame): Weight table.

    Returns:
        list: Column names ending in '_min' or '_max', in table order.
    """
    edge_suffixes = ('_min', '_max')
    return list(filter(lambda name: name.endswith(edge_suffixes), weight_df.columns))

◆ get_binning()

dict get_binning (   self,
  weight_df 
)
Returns the kinematic binning of the dataframe.

Definition at line 265 of file sysvar.py.

def get_binning(self, weight_df) -> dict:
    """
    Returns the kinematic binning of the dataframe.

    Args:
        weight_df (pandas.DataFrame): Weight table with '<variable>_min' and
            '<variable>_max' bin-edge columns.

    Returns:
        dict: Maps each binned variable name to a sorted numpy array of its
        unique bin edges.
    """
    columns = self.get_bin_columns(weight_df)
    # dict.fromkeys drops duplicates while keeping the table's column order
    var_names = dict.fromkeys('_'.join(col.split('_')[:-1]) for col in columns)
    bin_dict = {}
    for var_name in var_names:
        # Match the edge columns by their exact name: a prefix test would also
        # pull the edges of e.g. 'pt_min'/'pt_max' into the binning of 'p'.
        edge_columns = (f'{var_name}_min', f'{var_name}_max')
        edges = [edge for col in columns if col in edge_columns for edge in weight_df[col].values]
        bin_dict[var_name] = np.array(sorted(set(edges)))
    return bin_dict

◆ get_fei_binning()

dict get_fei_binning (   self,
  weight_df 
)
Returns the irregular binning of the dataframe.

Definition at line 280 of file sysvar.py.

def get_fei_binning(self, weight_df) -> dict:
    """
    Returns the irregular (per decay mode) binning of the dataframe.

    Args:
        weight_df (pandas.DataFrame): FEI calibration table.

    Returns:
        dict: Single entry keyed by the FEI mode column name, holding the
        distinct mode labels that start with 'mode'.
    """
    is_explicit_mode = weight_df[_fei_mode_col].str.startswith('mode')
    explicit_modes = weight_df.loc[is_explicit_mode, _fei_mode_col]
    return {_fei_mode_col: explicit_modes.value_counts().index.to_list()}

◆ get_ntuple_variables()

None get_ntuple_variables (   self,
pd.DataFrame  ntuple_df,
ReweighterParticle  particle 
)
Checks if the variables are in the ntuple and returns them.

Args:
    ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    particle (ReweighterParticle): Particle object containing the necessary variables.

Definition at line 287 of file sysvar.py.

289 particle: ReweighterParticle) -> None:
290 """
291 Checks if the variables are in the ntuple and returns them.
292
293 Args:
294 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
295 particle (ReweighterParticle): Particle object containing the necessary variables.
296 """
297 ntuple_variables = particle.get_binning_variables()
298 ntuple_variables += particle.get_pdg_variables()
299 for var in ntuple_variables:
300 if var not in ntuple_df.columns:
301 raise ValueError(f'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
302 return ntuple_variables
303

◆ get_particle()

ReweighterParticle get_particle (   self,
str  prefix 
)
Get a particle by its prefix.

Definition at line 437 of file sysvar.py.

def get_particle(self, prefix: str) -> ReweighterParticle:
    """
    Looks up a registered particle by prefix, ignoring leading and trailing underscores.

    Args:
        prefix (str): Prefix of the particle to look for.

    Returns:
        ReweighterParticle: First particle whose prefix matches, or None if there is none.
    """
    matches = (candidate for candidate in self.particles
               if candidate.prefix.strip('_') == prefix.strip('_'))
    return next(matches, None)

◆ merge_pid_weight_tables()

pd.DataFrame merge_pid_weight_tables (   self,
dict  weights_dict,
dict  pdg_pid_variable_dict 
)
Merges the efficiency and fake rate weight tables.

Args:
    weights_dict (dict): Dictionary containing the weight tables.
    pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.

Definition at line 304 of file sysvar.py.

306 pdg_pid_variable_dict: dict) -> pd.DataFrame:
307 """
308 Merges the efficiency and fake rate weight tables.
309
310 Args:
311 weights_dict (dict): Dictionary containing the weight tables.
312 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes and variable names.
313 """
314 weight_dfs = []
315 for reco_pdg, mc_pdg in weights_dict:
316 if reco_pdg not in pdg_pid_variable_dict:
317 raise ValueError(f'Reconstructed PDG code {reco_pdg} not found in thresholds!')
318 weight_df = weights_dict[(reco_pdg, mc_pdg)]
319 weight_df['mcPDG'] = mc_pdg
320 weight_df['PDG'] = reco_pdg
321 # Check if these are legacy tables:
322 if 'charge' in weight_df.columns:
323 charge_dict = {'+': [0, 2], '-': [-2, 0]}
324 weight_df[['charge_min', 'charge_max']] = [charge_dict[val] for val in weight_df['charge'].values]
325 weight_df = weight_df.drop(columns=['charge'])
326 # If iso_score is a single value, drop the min and max columns
327 if 'iso_score_min' in weight_df.columns and len(weight_df['iso_score_min'].unique()) == 1:
328 weight_df = weight_df.drop(columns=['iso_score_min', 'iso_score_max'])
329 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
330 threshold = pdg_pid_variable_dict[reco_pdg][1]
331 selected_weights = weight_df.query(f'variable == "{pid_variable_name}" and threshold == {threshold}')
332 if len(selected_weights) == 0:
333 available_variables = weight_df['variable'].unique()
334 available_thresholds = weight_df['threshold'].unique()
335 raise ValueError(f'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
336 f' variable {pid_variable_name} and threshold {threshold}!\n'
337 f' Available variables: {available_variables}\n'
338 f' Available thresholds: {available_thresholds}')
339 weight_dfs.append(selected_weights)
340 return pd.concat(weight_dfs, ignore_index=True)
341

◆ plot_coverage()

def plot_coverage (   self)
Plots the coverage of each particle.

Definition at line 576 of file sysvar.py.

def plot_coverage(self):
    """
    Asks every particle in self.particles to plot its coverage.
    """
    for entry in self.particles:
        entry.plot_coverage()

◆ print_coverage()

def print_coverage (   self)
Prints the coverage of each particle.

Definition at line 568 of file sysvar.py.

def print_coverage(self):
    """
    Writes the coverage of every particle in self.particles to standard output, as a percentage.
    """
    print('Coverage:')
    for entry in self.particles:
        kind = entry.type
        label = entry.prefix.strip("_")
        percent = entry.coverage*100
        print(f'{kind} {label}: {percent :0.1f}%')

◆ reweight()

def reweight (   self,
pd.DataFrame  df,
bool   generate_variations = True 
)
Reweights the dataframe according to the weight tables.

Args:
    df (pandas.DataFrame): Dataframe containing the analysis ntuple.
    generate_variations (bool): When true generate weight variations.

Definition at line 546 of file sysvar.py.

548 generate_variations: bool = True):
549 """
550 Reweights the dataframe according to the weight tables.
551
552 Args:
553 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
554 generate_variations (bool): When true generate weight variations.
555 """
556 for particle in self.particles:
557 if particle.type not in _correction_types:
558 raise ValueError(f'Particle type {particle.type} not supported!')
559 print(f'Required variables: {self.get_ntuple_variables(df, particle)}')
560 if generate_variations:
561 particle.generate_variations(n_variations=self.n_variations)
562 if particle.type == 'PID':
563 self.add_pid_weight_columns(df, particle)
564 elif particle.type == 'FEI':
565 self.add_fei_weight_columns(df, particle)
566 return df
567

Member Data Documentation

◆ correlations

correlations

Correlations between the particles.

Definition at line 247 of file sysvar.py.

◆ evaluate_plots

evaluate_plots

Flag to indicate if the plots should be evaluated.

Definition at line 253 of file sysvar.py.

◆ fillna

fillna

Value to fill NaN values.

Definition at line 257 of file sysvar.py.

◆ n_variations

n_variations

Number of weight variations to generate.

Definition at line 243 of file sysvar.py.

◆ nbins

nbins

Number of bins for the plots.

Definition at line 255 of file sysvar.py.

◆ particles

particles

List of particles.

Definition at line 245 of file sysvar.py.

◆ weight_name

weight_name

Name of the weight column.

Definition at line 249 of file sysvar.py.

◆ weights_generated

weights_generated

Flag to indicate if the weights have been generated.

Definition at line 251 of file sysvar.py.


The documentation for this class was generated from the following file: