13from dataclasses
import dataclass
14import matplotlib.pyplot
as plt
18A module that adds corrections to analysis dataframe.
19It adds weight variations according to the total uncertainty for easier error propagation.
22_weight_cols = [
'data_MC_ratio',
23 'data_MC_uncertainty_stat_dn',
24 'data_MC_uncertainty_stat_up',
25 'data_MC_uncertainty_sys_dn',
26 'data_MC_uncertainty_sys_up'
29_correction_types = [
'PID',
'FEI']
30_fei_mode_col =
'dec_mode'
36 Class that stores the information of a particle.
45 merged_table: pd.DataFrame
51 variable_aliases: dict
57 column_names: list = None
63 cov: np.ndarray =
None
69 coverage: float =
None
72 plot_values: dict =
None
76 Returns the variable name with the prefix
and use alias
if defined.
81 if name.startswith(self.
prefix):
83 return f
'{self.prefix}{name}'
87 Returns the list of variables that are used for the binning
89 variables = set(sum([list(d.keys()) for d
in self.
pdg_binning.values()], []))
90 return [f
'{self.get_varname(var)}' for var
in variables]
94 Returns the list of variables that are used for the PDG codes
100 return [f
'{self.get_varname(var)}' for var
in pdg_vars]
104 rho_sys: np.ndarray =
None,
105 rho_stat: np.ndarray =
None) ->
None:
107 Generates variations of weights according to the uncertainties
110 "data_MC_uncertainty_stat_dn"]].max(axis=1)
112 "data_MC_uncertainty_sys_dn"]].max(axis=1)
118 weights = cov + means
125 rho_sys: np.ndarray =
None,
126 rho_stat: np.ndarray =
None) -> np.ndarray:
128 Returns the covariance matrix of the weights
131 zeros = np.zeros(len_means)
135 rho_sys = np.ones((len_means, len_means))
137 rho_sys = np.identity(len_means)
139 rho_stat = np.identity(len_means)
143 stat_cov = np.matmul(
147 sys = np.random.multivariate_normal(zeros, sys_cov, n_variations)
149 stat = np.random.multivariate_normal(zeros, stat_cov, n_variations)
151 errors = np.random.multivariate_normal(zeros, self.
cov, n_variations)
156 Converts the object to a string.
158 separator = '------------------'
159 title =
'ReweighterParticle'
160 prefix_str = f
'Type: {self.type} Prefix: {self.prefix}'
161 columns = _weight_cols
162 merged_table_str = f
'Merged table:\n{self.merged_table[columns].describe()}'
163 pdg_binning_str =
'PDG binning:\n'
165 pdg_binning_str += f
'{pdgs}: {self.pdg_binning[pdgs]}\n'
166 return '\n'.join([separator, title, prefix_str, merged_table_str, pdg_binning_str]) + separator
170 Plots the coverage of the ntuple.
174 vars = set(sum([list(d.keys())
for d
in self.
plot_values.values()], []))
176 fig, axs = plt.subplots(len(self.
plot_values), len(vars), figsize=(5*len(vars), 3*len(self.
plot_values)), dpi=120)
178 if len(axs.shape) < 1:
179 axs = axs.reshape(len(self.
plot_values), len(vars))
180 bin_plt = {
'linewidth': 3,
'linestyle':
'--',
'color':
'0.5'}
181 fig.suptitle(f
'{self.type} particle {self.prefix.strip("_")}')
182 for (reco_pdg, mc_pdg), ax_row
in zip(self.
plot_values, axs):
183 for var, ax
in zip(self.
plot_values[(reco_pdg, mc_pdg)], ax_row):
185 ymax = self.
plot_values[(reco_pdg, mc_pdg)][var][1].max()*1.1
188 ax.vlines(self.
pdg_binning[(reco_pdg, mc_pdg)][var], ymin, ymax,
193 values = np.array([int(val[4:])
for val
in self.
pdg_binning[(reco_pdg, mc_pdg)][var]])
195 np.ones(len(values))*ymax,
200 rest = np.setdiff1d(self.
plot_values[(reco_pdg, mc_pdg)][var][0], values)
202 np.ones(len(rest))*ymax,
205 label=
'Rest category',
208 widths = (self.
plot_values[(reco_pdg, mc_pdg)][var][0][1:] - self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1])
209 centers = self.
plot_values[(reco_pdg, mc_pdg)][var][0][:-1] + widths/2
215 ax.set_title(f
'True {pdg.to_name(mc_pdg)} to reco {pdg.to_name(reco_pdg)} coverage')
224 Class that reweights the dataframe.
227 n_variations (int): Number of weight variations to generate.
228 weight_name (str): Name of the weight column.
229 evaluate_plots (bool): Flag to indicate if the plots should be evaluated.
230 nbins (int): Number of bins
for the plots.
234 n_variations: int = 100,
235 weight_name: str =
"Weight",
236 evaluate_plots: bool =
True,
238 fillna: float = 1.0) ->
None:
240 Initializes the Reweighter class.
261 Returns the kinematic bin columns of the dataframe.
263 return [col
for col
in weight_df.columns
if col.endswith(
'_min')
or col.endswith(
'_max')]
267 Returns the kinematic binning of the dataframe.
270 var_names = {'_'.join(col.split(
'_')[:-1])
for col
in columns}
272 for var_name
in var_names:
273 bin_dict[var_name] = []
275 if col.startswith(var_name):
276 bin_dict[var_name] += list(weight_df[col].values)
277 bin_dict[var_name] = np.array(sorted(set(bin_dict[var_name])))
282 Returns the irregular binning of the dataframe.
284 return {_fei_mode_col: weight_df.loc[weight_df[_fei_mode_col].str.startswith(
'mode'),
285 _fei_mode_col].value_counts().index.to_list()}
288 ntuple_df: pd.DataFrame,
289 particle: ReweighterParticle) ->
None:
291 Checks if the variables are
in the ntuple
and returns them.
294 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
295 particle (ReweighterParticle): Particle object containing the necessary variables.
297 ntuple_variables = particle.get_binning_variables()
298 ntuple_variables += particle.get_pdg_variables()
299 for var
in ntuple_variables:
300 if var
not in ntuple_df.columns:
301 raise ValueError(f
'Variable {var} is not in the ntuple! Required variables are {ntuple_variables}')
302 return ntuple_variables
306 pdg_pid_variable_dict: dict) -> pd.DataFrame:
308 Merges the efficiency and fake rate weight tables.
311 weights_dict (dict): Dictionary containing the weight tables.
312 pdg_pid_variable_dict (dict): Dictionary containing the PDG codes
and variable names.
315 for reco_pdg, mc_pdg
in weights_dict:
316 if reco_pdg
not in pdg_pid_variable_dict:
317 raise ValueError(f
'Reconstructed PDG code {reco_pdg} not found in thresholds!')
318 weight_df = weights_dict[(reco_pdg, mc_pdg)]
319 weight_df[
'mcPDG'] = mc_pdg
320 weight_df[
'PDG'] = reco_pdg
322 if 'charge' in weight_df.columns:
323 charge_dict = {
'+': [0, 2],
'-': [-2, 0]}
324 weight_df[[
'charge_min',
'charge_max']] = [charge_dict[val]
for val
in weight_df[
'charge'].values]
325 weight_df = weight_df.drop(columns=[
'charge'])
327 if 'iso_score_min' in weight_df.columns
and len(weight_df[
'iso_score_min'].unique()) == 1:
328 weight_df = weight_df.drop(columns=[
'iso_score_min',
'iso_score_max'])
329 pid_variable_name = pdg_pid_variable_dict[reco_pdg][0]
330 threshold = pdg_pid_variable_dict[reco_pdg][1]
331 selected_weights = weight_df.query(f
'variable == "{pid_variable_name}" and threshold == {threshold}')
332 if len(selected_weights) == 0:
333 available_variables = weight_df[
'variable'].unique()
334 available_thresholds = weight_df[
'threshold'].unique()
335 raise ValueError(f
'No weights found for PDG code {reco_pdg}, mcPDG {mc_pdg},'
336 f
' variable {pid_variable_name} and threshold {threshold}!\n'
337 f
' Available variables: {available_variables}\n'
338 f
' Available thresholds: {available_thresholds}')
339 weight_dfs.append(selected_weights)
340 return pd.concat(weight_dfs, ignore_index=
True)
343 ntuple_df: pd.DataFrame,
344 particle: ReweighterParticle) ->
None:
346 Adds a weight and uncertainty columns to the dataframe.
349 ntuple_df (pandas.DataFrame): Dataframe containing the analysis ntuple.
350 particle (ReweighterParticle): Particle object.
353 binning_df = pd.DataFrame(index=ntuple_df.index)
355 binning_df[
'mcPDG'] = ntuple_df[f
'{particle.get_varname("mcPDG")}'].abs()
356 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
358 for reco_pdg, mc_pdg
in particle.pdg_binning:
359 ntuple_cut = f
'abs({particle.get_varname("mcPDG")}) == {mc_pdg} and abs({particle.get_varname("PDG")}) == {reco_pdg}'
360 if ntuple_df.query(ntuple_cut).empty:
362 plot_values[(reco_pdg, mc_pdg)] = {}
363 for var
in particle.pdg_binning[(reco_pdg, mc_pdg)]:
364 labels = [(particle.pdg_binning[(reco_pdg, mc_pdg)][var][i-1], particle.pdg_binning[(reco_pdg, mc_pdg)][var][i])
365 for i
in range(1, len(particle.pdg_binning[(reco_pdg, mc_pdg)][var]))]
366 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg), var] = pd.cut(ntuple_df.query(
367 ntuple_cut)[f
'{particle.get_varname(var)}'],
368 particle.pdg_binning[(reco_pdg, mc_pdg)][var], labels=labels)
369 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
370 f
'{var}_min'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
372 binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
373 f
'{var}_max'] = binning_df.loc[(binning_df[
'mcPDG'] == mc_pdg) & (binning_df[
'PDG'] == reco_pdg),
375 binning_df.drop(var, axis=1, inplace=
True)
377 values = ntuple_df.query(ntuple_cut)[f
'{particle.get_varname(var)}']
378 if len(values.unique()) < 2:
379 print(f
'Skip {var} for plotting!')
381 x_range = np.linspace(values.min(), values.max(), self.
nbins)
382 plot_values[(reco_pdg, mc_pdg)][var] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
384 weight_cols = _weight_cols
385 if particle.column_names:
386 weight_cols = particle.column_names
387 binning_df = binning_df.merge(particle.merged_table[weight_cols + binning_df.columns.tolist()],
388 on=binning_df.columns.tolist(), how=
'left')
389 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
390 particle.plot_values = plot_values
391 for col
in weight_cols:
392 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
393 ntuple_df[f
'{particle.get_varname(col)}'] = ntuple_df[f
'{particle.get_varname(col)}'].
fillna(self.
fillna)
398 pdg_pid_variable_dict: dict,
399 variable_aliases: dict =
None,
400 sys_seed: int =
None,
401 syscorr: bool =
True) ->
None:
403 Adds weight variations according to the total uncertainty for easier error propagation.
406 prefix (str): Prefix
for the new columns.
407 weights_dict (pandas.DataFrame): Dataframe containing the efficiency weights.
408 pdg_pid_variable_dict (dict): Dictionary containing the PID variables
and thresholds.
409 variable_aliases (dict): Dictionary containing variable aliases.
410 sys_seed (int): Seed
for the systematic variations.
411 syscorr (bool): When true assume systematics are 100% correlated defaults to
412 true. Note this
is overridden by provision of a
None value rho_sys
418 if prefix
and not prefix.endswith(
'_'):
421 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
422 if variable_aliases
is None:
423 variable_aliases = {}
425 pdg_binning = {(reco_pdg, mc_pdg): self.
get_binning(merged_weight_df.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
426 for reco_pdg, mc_pdg
in merged_weight_df[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
429 merged_table=merged_weight_df,
430 pdg_binning=pdg_binning,
431 variable_aliases=variable_aliases,
439 Get a particle by its prefix.
441 cands = [particle for particle
in self.
particles if particle.prefix.strip(
'_') == prefix.strip(
'_')]
448 Checks if the tables are provided
in a legacy format
and converts them to the standard format.
451 if 'cal' in table.columns:
452 result = pd.DataFrame(index=table.index)
453 result[
'data_MC_ratio'] = table[
'cal']
454 str_to_pdg = {
'B+': 521,
'B-': 521,
'B0': 511}
455 result[
'PDG'] = table[
'Btag'].apply(
lambda x: str_to_pdg.get(x))
457 result[
'mcPDG'] = result[
'PDG']
458 result[
'threshold'] = table[
'sig_prob_threshold']
459 result[_fei_mode_col] = table[_fei_mode_col]
460 result[
'data_MC_uncertainty_stat_dn'] = table[
'cal_stat_error']
461 result[
'data_MC_uncertainty_stat_up'] = table[
'cal_stat_error']
462 result[
'data_MC_uncertainty_sys_dn'] = table[
'cal_sys_error']
463 result[
'data_MC_uncertainty_sys_up'] = table[
'cal_sys_error']
466 result = result.query(f
'threshold == {threshold}')
468 raise ValueError(f
'No weights found for threshold {threshold}!')
474 cov: np.ndarray =
None,
475 variable_aliases: dict =
None,
478 Adds weight variations according to the total uncertainty for easier error propagation.
481 prefix (str): Prefix
for the new columns.
482 table (pandas.DataFrame): Dataframe containing the efficiency weights.
483 threshold (float): Threshold
for the efficiency weights.
484 cov (numpy.ndarray): Covariance matrix
for the efficiency weights.
485 variable_aliases (dict): Dictionary containing variable aliases.
490 if prefix
and not prefix.endswith(
'_'):
493 raise ValueError(f
"Particle with prefix '{prefix}' already exists!")
494 if variable_aliases
is None:
495 variable_aliases = {}
496 if table
is None or len(table) == 0:
497 raise ValueError(
'No weights provided!')
499 pdg_binning = {(reco_pdg, mc_pdg): self.
get_fei_binning(converted_table.query(f
'PDG == {reco_pdg} and mcPDG == {mc_pdg}'))
500 for reco_pdg, mc_pdg
in converted_table[[
'PDG',
'mcPDG']].value_counts().index.to_list()}
503 merged_table=converted_table,
504 pdg_binning=pdg_binning,
505 variable_aliases=variable_aliases,
512 Adds weight columns according to the FEI calibration tables
516 binning_df = pd.DataFrame(index=ntuple_df.index)
518 binning_df[
'PDG'] = ntuple_df[f
'{particle.get_varname("PDG")}'].abs()
520 binning_df[
'num_mode'] = ntuple_df[particle.get_varname(_fei_mode_col)].astype(int)
522 binning_df[_fei_mode_col] = np.nan
524 for reco_pdg, mc_pdg
in particle.pdg_binning:
525 plot_values[(reco_pdg, mc_pdg)] = {}
526 binning_df.loc[binning_df[
'PDG'] == reco_pdg, _fei_mode_col] = particle.merged_table.query(
527 f
'PDG == {reco_pdg} and {_fei_mode_col} == "{rest_str}"')[_fei_mode_col].values[0]
528 for mode
in particle.pdg_binning[(reco_pdg, mc_pdg)][_fei_mode_col]:
529 binning_df.loc[(binning_df[
'PDG'] == reco_pdg) & (binning_df[
'num_mode'] == int(mode[4:])), _fei_mode_col] = mode
531 values = ntuple_df[f
'{particle.get_varname(_fei_mode_col)}']
532 x_range = np.linspace(values.min(), values.max(), int(values.max())+1)
533 plot_values[(reco_pdg, mc_pdg)][_fei_mode_col] = x_range, np.histogram(values, bins=x_range, density=
True)[0]
536 weight_cols = _weight_cols
537 if particle.column_names:
538 weight_cols = particle.column_names
539 binning_df = binning_df.merge(particle.merged_table[weight_cols + [
'PDG', _fei_mode_col]],
540 on=[
'PDG', _fei_mode_col], how=
'left')
541 particle.coverage = 1 - binning_df[weight_cols[0]].isna().sum() / len(binning_df)
542 particle.plot_values = plot_values
543 for col
in weight_cols:
544 ntuple_df[f
'{particle.get_varname(col)}'] = binning_df[col]
548 generate_variations: bool =
True):
550 Reweights the dataframe according to the weight tables.
553 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
554 generate_variations (bool): When true generate weight variations.
557 if particle.type
not in _correction_types:
558 raise ValueError(f
'Particle type {particle.type} not supported!')
559 print(f
'Required variables: {self.get_ntuple_variables(df, particle)}')
560 if generate_variations:
561 particle.generate_variations(n_variations=self.
n_variations)
562 if particle.type ==
'PID':
564 elif particle.type ==
'FEI':
570 Prints the coverage of each particle.
574 print(f
'{particle.type} {particle.prefix.strip("_")}: {particle.coverage*100 :0.1f}%')
578 Plots the coverage of each particle.
581 particle.plot_coverage()
584def add_weights_to_dataframe(prefix: str,
587 custom_tables: dict =
None,
588 custom_thresholds: dict =
None,
589 **kw_args) -> pd.DataFrame:
591 Helper method that adds weights to a dataframe.
594 prefix (str): Prefix for the new columns.
595 df (pandas.DataFrame): Dataframe containing the analysis ntuple.
596 systematic (str): Type of the systematic corrections, options:
"custom_PID" and "custom_FEI".
597 MC_production (str): Name of the MC production.
598 custom_tables (dict): Dictionary containing the custom efficiency weights.
599 custom_thresholds (dict): Dictionary containing the custom thresholds
for the custom efficiency weights.
600 n_variations (int): Number of variations to generate.
601 generate_variations (bool): When true generate weight variations.
602 weight_name (str): Name of the weight column.
603 show_plots (bool): When true show the coverage plots.
604 variable_aliases (dict): Dictionary containing variable aliases.
605 cov_matrix (numpy.ndarray): Covariance matrix
for the custom efficiency weights.
606 fillna (int): Value to fill NaN values
with.
607 sys_seed (int): Seed
for the systematic variations.
608 syscorr (bool): When true assume systematics are 100% correlated defaults to true.
609 **kw_args: Additional arguments
for the Reweighter
class.
611 generate_variations = kw_args.get('generate_variations',
True)
612 n_variations = kw_args.get(
'n_variations', 100)
613 weight_name = kw_args.get(
'weight_name',
"Weight")
614 fillna = kw_args.get(
'fillna', 1.0)
615 reweighter =
Reweighter(n_variations=n_variations,
616 weight_name=weight_name,
618 variable_aliases = kw_args.get(
'variable_aliases')
619 if systematic.lower() ==
'custom_fei':
620 cov_matrix = kw_args.get(
'cov_matrix')
621 reweighter.add_fei_particle(prefix=prefix,
623 threshold=custom_thresholds,
624 variable_aliases=variable_aliases,
627 elif systematic.lower() ==
'custom_pid':
628 sys_seed = kw_args.get(
'sys_seed')
629 syscorr = kw_args.get(
'syscorr')
632 reweighter.add_pid_particle(prefix=prefix,
633 weights_dict=custom_tables,
634 pdg_pid_variable_dict=custom_thresholds,
635 variable_aliases=variable_aliases,
640 raise ValueError(f
'Systematic {systematic} is not supported!')
642 result = reweighter.reweight(df, generate_variations=generate_variations)
643 if kw_args.get(
'show_plots'):
644 reweighter.print_coverage()
645 reweighter.plot_coverage()
str weight_name
Weight column name that will be added to the ntuple.
str type
Type of the particle (PID or FEI)
pd merged_table
Merged table of the weights.
dict pdg_binning
Kinematic binning of the weight table per particle.
np cov
Covariance matrix corresponds to the total uncertainty.
str get_varname(self, str varname)
str prefix
Prefix of the particle in the ntuple.
int sys_seed
Random seed for systematics.
list get_pdg_variables(self)
type
Add the mcPDG code requirement for PID particle.
list get_binning_variables(self)
dict plot_values
Values for the plots.
np.ndarray get_covariance(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
bool syscorr
When true assume systematics are 100% correlated.
None generate_variations(self, int n_variations, np.ndarray rho_sys=None, np.ndarray rho_stat=None)
def plot_coverage(self, fig=None, axs=None)
list column_names
Internal list of the names of the weight columns.
dict variable_aliases
Variable aliases of the weight table.
column_names
Names of the varied weight columns.
n_variations
Number of weight variations to generate.
pd.DataFrame merge_pid_weight_tables(self, dict weights_dict, dict pdg_pid_variable_dict)
nbins
Number of bins for the plots.
None add_pid_particle(self, str prefix, dict weights_dict, dict pdg_pid_variable_dict, dict variable_aliases=None, int sys_seed=None, bool syscorr=True)
None __init__(self, int n_variations=100, str weight_name="Weight", bool evaluate_plots=True, int nbins=50, float fillna=1.0)
None add_pid_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
weight_name
Name of the weight column.
def add_fei_weight_columns(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
None get_ntuple_variables(self, pd.DataFrame ntuple_df, ReweighterParticle particle)
fillna
Value to fill NaN values.
weights_generated
Flag to indicate if the weights have been generated.
dict get_fei_binning(self, weight_df)
def reweight(self, pd.DataFrame df, bool generate_variations=True)
def convert_fei_table(self, pd.DataFrame table, float threshold)
dict get_binning(self, weight_df)
list get_bin_columns(self, weight_df)
correlations
Correlations between the particles.
None add_fei_particle(self, str prefix, pd.DataFrame table, float threshold, np.ndarray cov=None, dict variable_aliases=None)
ReweighterParticle get_particle(self, str prefix)
particles
List of particles.
evaluate_plots
Flag to indicate if the plots should be evaluated.