18 def _make_const_lists():
 
   19     """Moving this code into a function to avoid a top-level ROOT import.""" 
   22     PARTICLES, PDG_CODES = [], []
 
   23     for i 
in range(len(ROOT.Belle2.Const.chargedStableSet)):
 
   24         particle = ROOT.Belle2.Const.chargedStableSet.at(i)
 
   25         name = (particle.__repr__()[7:-1]
 
   28                 .replace(
"euteron", 
""))
 
   29         PARTICLES.append(name)
 
   30         PDG_CODES.append(particle.getPDGCode())
 
   35     for det 
in ROOT.Belle2.Const.PIDDetectors.set():
 
   36         DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
 
   39     return PARTICLES, PDG_CODES, DETECTORS
 
   43 PARTICLES = [
"e", 
"mu", 
"pi", 
"K", 
"p", 
"d"]
 
   44 PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
 
   45 DETECTORS = [
"SVD", 
"CDC", 
"TOP", 
"ARICH", 
"ECL", 
"KLM"]
 
   47 P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
 
   48 THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))
 
   51 def _column(particle, detector):
 
   52     """Default column names for detector log-likelihoods. 
   55         particle (str): particle name 
   56         detector (str): detector name 
   59         str: Corresponding column name. 
   61     return f
"{detector}_{particle}" 
   64 def root_column(particle, detector):
 
   65     """Column names for detector log-likelihoods found in our ROOT datafiles. 
   68         particle (str): particle name 
   69         detector (str): detector name 
   72         str: Corresponding column name. 
   74     pdg = PDG_CODES[PARTICLES.index(particle)]
 
   75     return f
"pidLogLikelyhoodOf{pdg}From{detector}" 
   78 def read_root(root_filenames):
 
   79     """Reads one or several ROOT datafiles into a DataFrame. 
   82         root_filenames (list(str) or str): If only one filename, can be given as 
   83             a string. If more than one, should be given as a list or tuple. 
   86         pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s). 
   89     return uproot.concatenate(root_filenames, library=
'pd')
 
   92 def make_h5(df, tags, out_filename, pdg=None, column=root_column):
 
   93     """Make an HDF5 file in our 'slim' format from the given DataFrame. 
   96         df (pandas.DataFrame): The DataFrame containing the data. 
   97         tags (list(str) or str): The particle tags used as a prefix for desired 
   98             columns. e.g. for kaons in a D* decay, this is 'DST_D0_K'. One or 
  100         out_filename (str): Output filename for the h5 file that will be 
  102         pdg (int or None): The PDG code for the particles being 
  103             extracted. If None, uses the values found in the 'mcPDG' column of 
  104             the DataFrame. Defaults to None. 
  105         column: A function which, given the particle and 
  106             detector names, returns the column name for the corresponding 
  107             detector log-likelihood. Defaults to root_column, which assumes 
  108             column names are of the format 
  109             f'pidLogLikelyhoodOf{pdg}From{detector}'. 
  112     if isinstance(tags, str):
 
  116         return np.concatenate(arrs) 
if len(arrs) > 1 
else arrs[0]
 
  119         return _concat([df[f
"{tag}_{col}"].values 
for tag 
in tags])
 
  121     with h5py.File(out_filename, 
"w") 
as f:
 
  123             pdg_values = np.ones(len(df) * len(tags)) * pdg
 
  125             pdg_values = np.abs(_get_all(
"mcPDG"))
 
  127         f.create_dataset(
"pdg", data=pdg_values)
 
  128         f.create_dataset(
"p", data=_get_all(
"p"))
 
  129         f.create_dataset(
"theta", data=np.arccos(_get_all(
"cosTheta")))
 
  130         f.create_dataset(
"phi", data=_get_all(
"phi"))
 
  132         for det 
in DETECTORS:
 
  134                 data = _get_all(column(p, det))
 
  135                 data[np.isnan(data)] = 0
 
  136                 f.create_dataset(f
"{det}/{p}", data=data)
 
  139 def merge_h5s(filenames, out_filename, pdgs=None):
 
  140     """Merge several HDF5 files in our 'slim' format together. 
  143         filenames (list(str)): Filenames of HDF5 files to be merged. 
  144         out_filename (str): Output filename. 
  145         pdgs (list(int)): The PDG tags for the particle types, one per 
  146             filename, to overwrite the 'pdg' columns in those files when 
  147             merging. If None, simply uses the 'pdg' columns from the files. 
  150     fs = [h5py.File(fname, 
"r") 
for fname 
in filenames]
 
  151     m = h5py.File(out_filename, 
"w")
 
  153     keys = [
"p", 
"theta", 
"phi"]
 
  154     keys += [f
"{d}/{p}" for d 
in DETECTORS 
for p 
in PARTICLES]
 
  157         m.create_dataset(key, data=np.concatenate([f[key][()] 
for f 
in fs]))
 
  164                 [np.ones_like(f[
"pdg"][()]) * pdg 
for f, pdg 
in zip(fs, pdgs)]
 
  168         m.create_dataset(
"pdg", data=np.concatenate([f[
"pdg"][()] 
for f 
in fs]))
 
  184     """Split the data in an HDF5 'slim' format file in train, validation, and 
  185     test sets, stored in .npz files for ease of weight training. 
  188         filename (str): Filename of HDF5 input file. 
  189         output_dir (str): Name of output directory, in which the train, 
  190             validation, and test sets will be written. Will be created if it 
  191             does not already exist. 
  192         train_size (float): Fraction of the dataset to use for 
  193             training. Defaults to 0.8. 
  194         val_size (float): Fraction of the dataset to use for 
  195             validation. Defaults to 0.1. 
  196         test_size (float): Fraction of the dataset to use for testing. 
  198         shuffle (bool): Whether to shuffle the dataset before 
  199             splitting. Defaults to True. 
  200         random_state (int or None): Random state for the shuffling. 
  204     from sklearn.model_selection 
import train_test_split
 
  205     from os.path 
import join
 
  206     from os 
import makedirs
 
  208     assert train_size > 0, f
"train_size ({train_size}) must be positive" 
  209     assert val_size >= 0, f
"val_size ({val_size}) may not be negative" 
  210     assert test_size >= 0, f
"test_size ({test_size}) may not be negative" 
  211     assert val_size + test_size != 0, 
"val_size and test_size cannot both be zero" 
  217     if train_size + val_size + test_size != 1:
 
  218         total = train_size + val_size + test_size
 
  219         train_size = train_size / total
 
  220         val_size = val_size / total
 
  221         test_size = test_size / total
 
  224     with h5py.File(filename, 
"r") 
as f:
 
  226             [f[det][p][()] 
for p 
in PARTICLES 
for det 
in DETECTORS], axis=-1
 
  229         theta_data = f[
"theta"][()]
 
  230         labels = np.abs(f[
"pdg"][()])
 
  231         for i, p 
in enumerate(PDG_CODES):
 
  232             labels[labels == p] = i
 
  240     makedirs(output_dir, exist_ok=
True)
 
  241     kw = dict(shuffle=shuffle, random_state=random_state)
 
  244     (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
 
  245         X, y, p, t, train_size=train_size, **kw
 
  247     np.savez(join(output_dir, 
"train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)
 
  251         size = val_size / (1 - train_size)
 
  252         (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
 
  253             X, y, p, t, train_size=size, **kw
 
  256         np.savez(join(output_dir, 
"val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
 
  257         np.savez(join(output_dir, 
"test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)
 
  260         np.savez(join(output_dir, 
"val.npz"), X=X, y=y, p=p, theta=t)
 
  264     """Performs softmax calculation with corrections to help prevent overflow. 
  267         This is the calculation used to convert log-likelihoods to likelihood 
  268         ratios. Implementation following 
  269         https://stackoverflow.com/a/67112412/18837571 
  272         x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated over the last 
  276         :func:`numpy.array`: Softmaxed data. 
  278     maxes = np.amax(x, axis=-1, keepdims=
True)
 
  279     x_exp = np.exp(x - maxes)
 
  280     return x_exp / np.sum(x_exp, axis=-1, keepdims=
True)
 
  284     """Make a 'labels' column for the DataFrame. The 'labels' column contains 
  285     the particle type labels for each event: 0 for electron, 1 for muon, and so 
  289         df (pandas.DataFrame): DataFrame that 'labels' column will be added to. Must 
  290             not have NaN values in the 'pdg' column. 
  292     labels = np.abs(df[
"pdg"].values)
 
  293     if np.count_nonzero(~np.isfinite(labels)):
 
  295             'Warning: dataset contains NaN values in the "pdg" column. ' 
  296             'This means the "labels" column cannot be made, so most of the ' 
  297             "pidplots methods will fail." 
  299         labels = np.ones_like(labels) * np.nan
 
  301         for i, p 
in enumerate(PDG_CODES):
 
  302             labels[labels == p] = i
 
  303         labels[labels >= 6] = -1
 
  304     df[
"labels"] = labels
 
  307 def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
 
  308     """Make 'p_bin' and 'theta_bin' column in the given DataFrame. 
  311         df (pandas.DataFrame): The DataFrame to add bins columns to. 
  312         p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV. 
  313             Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV. 
  314         theta_bins (:func:`numpy.array`): The edges of the theta bins in radians. 
  315             Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150] 
  318     df[
"p_bin"] = np.digitize(df[
"p"].values, p_bins) - 1
 
  319     df[
"theta_bin"] = np.digitize(df[
"theta"].values, theta_bins) - 1
 
  322 def make_lrs(df, column=_column):
 
  323     """Makes likelihood ratio columns for each of the six particle types in the 
  327         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  328         column: A function which given the particle and 
  329             detector names returns the corresponding detector log-likelihood 
  330             column name. Defaults to _column, which gives column names of the 
  331             format f"{detector}_{particle}". 
  336             np.sum(df[[column(p, det) 
for det 
in DETECTORS]].values, -1)
 
  343     lrs = softmax(h_logls)
 
  344     for i, p 
in enumerate(PARTICLES):
 
  345         df[f
"lr_{p}"] = lrs[:, i]
 
  348 def make_binary_lrs(df, column=_column):
 
  349     """Makes binary likelihood ratio columns for each of the five non-pion 
  350     particle type hypotheses in the given DataFrame. 
  353         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  354         column: A function which given the particle and 
  355             detector names returns the corresponding detector log-likelihood 
  356             column name. Defaults to _column, which gives column names of the 
  357             format f"{detector}_{particle}". 
  365                 np.sum(df[[column(p, det) 
for det 
in DETECTORS]].values, -1)
 
  370         lrs = softmax(h_logls)
 
  371         df[f
"binary_lr_{h}"] = lrs[:, 0]
 
  375     """Makes a 'pid' column in the given DataFrame. The 'pid' column is the 
  376     predicted particle type. Requires likelihood ratio columns to exist. 
  379         df (pandas.DataFrame): DataFrame to which the 'pid' column will be added. 
  381     lrs = np.stack([df[f
"lr_{p}"].values 
for p 
in PARTICLES], axis=-1)
 
  382     pids = np.argmax(lrs, axis=-1)
 
  386 def compute_det_lrs(d, det, column=_column):
 
  387     """Computes single-detector likelihood ratios from the given DataFrame. 
  390         d (pandas.DataFrame): DataFrame containing the detector log-likelihoods. 
  391         det (str): The name of the detector for which the single-detector 
  392             likelihood ratios will be calculated. 
  393         column: A function which given the particle and 
  394             detector names returns the corresponding detector log-likelihood 
  395             column name. Defaults to _column, which gives column names of the 
  396             format f"{detector}_{particle}". 
  399         :func:`numpy.array`: The detector likelihood ratios. 
  401     h_logls = d[[column(p, det) 
for p 
in PARTICLES]].values
 
  402     lrs = softmax(h_logls)
 
  406 def make_pid_det(df, column=_column):
 
  407     """Makes single-detector PID columns for each of the detectors in the given DataFrame. 
  410         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  411         column: A function which given the particle and 
  412             detector names returns the corresponding detector log-likelihood 
  413             column name. Defaults to _column, which gives column names of the 
  414             format f"{detector}_{particle}". 
  416     for det 
in DETECTORS:
 
  417         mask = df[column(
"e", det)] == 0  
 
  418         lrs = compute_det_lrs(df, det, column=column)
 
  419         pids = np.argmax(lrs, axis=-1)
 
  421         df[f
"pid_{det}"] = pids
 
  424 def compute_abl_lrs(d, det, column=_column):
 
  425     """Computes ablation likelihood ratios from the given DataFrame. 
  428         d (pandas.DataFrame): DataFrame containing the detector log-likelihoods. 
  429         det (str): The name of the detector to be omitted for the ablation. 
  430         column: A function which given the particle and 
  431             detector names returns the corresponding detector log-likelihood 
  432             column name. Defaults to _column, which gives column names of the 
  433             format f"{detector}_{particle}". 
  436         :func:`numpy.array`: The ablation likelihood ratios. 
  440         others = [det2 
for det2 
in DETECTORS 
if det2 != det]
 
  441         return [column(p, det2) 
for det2 
in others]
 
  443     h_logls = np.stack([np.sum(d[_cols(p)].values, -1) 
for p 
in PARTICLES], -1)
 
  444     lrs = softmax(h_logls)
 
  448 def make_pid_abl(df, column=_column):
 
  449     """Makes ablation PID columns for each of the detectors in the given 
  453         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  454         column: A function which given the particle and 
  455             detector names returns the corresponding detector log-likelihood 
  456             column name. Defaults to _column, which gives column names of the 
  457             format f"{detector}_{particle}". 
  459     for det 
in DETECTORS:
 
  460         lrs = compute_abl_lrs(df, det, column=column)
 
  461         pids = np.argmax(lrs, axis=-1)
 
  462         df[f
"pid_no_{det}"] = pids
 
  465 def compute_contrib(d, corr=True):
 
  466     """Computes the detector contributions. 
  469         d (pandas.DataFrame): DataFrame containing the likelihood ratio data. 
  470         corr (bool): Whether to compute contribution to the likelihood 
  471             ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis 
  472             (False). Defaults to True. 
  475         dict[str, :func:`numpy.array`]: The contributions of each detector. 
  478     for det 
in DETECTORS:
 
  479         reg_lrs = d[[f
"lr_{p}" for p 
in PARTICLES]].values
 
  480         abl_lrs = compute_abl_lrs(d, det)
 
  481         idx = d[
"labels" if corr 
else "pid"].values.astype(int)
 
  482         reg_lr = reg_lrs[np.arange(len(idx)), idx]
 
  483         abl_lr = abl_lrs[np.arange(len(idx)), idx]
 
  484         ctrb = reg_lr - abl_lr
 
  489 def make_contrib(df, corr=True):
 
  490     """Makes columns for the detector contributions in the given DataFrame. 
  493         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  494         corr (bool): Whether to compute contribution to the likelihood 
  495             ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis 
  496             (False). Defaults to True. 
  498     ctrbs = compute_contrib(df, corr=corr)
 
  499     for det, ctrb 
in ctrbs.items():
 
  500         df[f
"contrib_{det}"] = ctrb
 
  506     theta_bins=THETA_BINS,
 
  510     """Makes all the additional columns for a given DataFrame. 
  513         df (pandas.DataFrame): DataFrame to which the columns will be added. 
  514         p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV. 
  515             Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV. 
  516         theta_bins (:func:`numpy.array`): The edges of the theta bins in radians. 
  517             Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150] 
  519         contrib_corr (bool): Whether to compute contribution to the 
  520             likelihood ratio of the _correct_ hypothesis (True) or the _chosen_ 
  521             hypothesis (False). Defaults to True. 
  522         column: A function which given the particle and 
  523             detector names returns the corresponding detector log-likelihood 
  524             column name. Defaults to _column, which gives column names of the 
  525             format f"{detector}_{particle}". 
  528     make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
 
  529     make_lrs(df, column=column)
 
  530     make_binary_lrs(df, column=column)
 
  532     make_pid_det(df, column=column)
 
  534     make_contrib(df, corr=contrib_corr)
 
  537 def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
 
  538     """Applies the given weights to the log-likelihood data in the DataFrame. 
  541         df (pandas.DataFrame): DataFrame to which the weights are applied. 
  542         weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`): The calibration weight 
  543             values. If a dict, keys should be a tuple of ints, and each value is 
  544             the six-by-six array of weights for the bin. If a single np.array, 
  545             should be a six-by-six array of weights to be applied globally. 
  546         p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV. 
  547             Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV. 
  548         theta_bins (:func:`numpy.array`): The edges of the theta bins in radians. 
  549             Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150] 
  551         column: A function which given the particle and 
  552             detector names returns the corresponding detector log-likelihood 
  553             column name. Defaults to _column, which gives column names of the 
  554             format f"{detector}_{particle}". 
  560     if isinstance(weights, dict):
 
  561         for p 
in range(len(p_bins) - 1):
 
  562             p_lo, p_hi = p_bins[p], p_bins[p + 1]
 
  563             p_mask = (df[
"p"] >= p_lo) & (df[
"p"] <= p_hi)
 
  565             for theta 
in range(len(theta_bins) - 1):
 
  566                 t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
 
  567                 t_mask = (df[
"theta"] >= t_lo) & (df[
"theta"] <= t_hi)
 
  569                 for i, h 
in enumerate(PARTICLES):
 
  570                     for j, d 
in enumerate(DETECTORS):
 
  571                         df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][
 
  577         for i, h 
in enumerate(PARTICLES):
 
  578             for j, d 
in enumerate(DETECTORS):
 
  579                 df[column(h, d)] *= weights[i, j]
 
  582 def cut_particles(df, allowed_particles, column=_column):
 
  583     """Cuts the log-likelihood data associated with given particle types. 
  586         df (pandas.DataFrame): DataFrame to which the cuts will be applied. 
  587         allowed_particles (list(str)): List of allowed particle types. Any 
  588             particle types not present will be cut, unless the list is empty (in 
  589             which case no cuts are applied). 
  590         column: A function which given the particle and 
  591             detector names returns the corresponding detector log-likelihood 
  592             column name. Defaults to _column, which gives column names of the 
  593             format f"{detector}_{particle}". 
  595     if len(allowed_particles) == 0:
 
  599         if not (p 
in allowed_particles):
 
  601                 df[column(p, d)] = -1e10
 
  604 def read_h5(filename):
 
  605     """Read an HDF5 file in our 'slim' format into a DataFrame. 
  608         filename (str): Input filename. 
  611         pandas.DataFrame: DataFrame containing data. 
  614     with h5py.File(filename, 
"r") 
as f:
 
  615         for key 
in [
"pdg", 
"p", 
"theta", 
"phi"]:
 
  617         for key 
in [f
"{d}/{p}" for d 
in DETECTORS 
for p 
in PARTICLES]:
 
  618             df_key = key.replace(
"/", 
"_")
 
  619             df[df_key] = f[key][()]
 
  620             df[df_key] = df[df_key].fillna(0)
 
  624 def read_npz(filename):
 
  625     """Read an npz file in our training format into a DataFrame. 
  628         filename (str): Input filename. 
  631         pandas.DataFrame: DataFrame containing data. 
  633     data = np.load(filename)
 
  635         data=data[
"X"], columns=[f
"{d}_{p}" for p 
in PARTICLES 
for d 
in DETECTORS],
 
  637     df[
"labels"] = data[
"y"]
 
  639     df[
"theta"] = data[
"theta"]
 
  641     df[
"pdg"] = df[
"labels"]
 
  642     for i, pdg 
in enumerate(PDG_CODES):
 
  643         df.loc[df[
"labels"] == i, 
"pdg"] = pdg
 
  648 def produce_analysis_df(
 
  652     drop_outside_bins=True,
 
  654     allowed_particles=[],
 
  656     theta_bins=THETA_BINS,
 
  659     """Prepares a DataFrame for PID analysis by applying weights, computing and 
  660     adding additional columns, cutting NaNs, and more. 
  663         df (pandas.DataFrame): DataFrame to prepare for analysis. 
  664         compute_cols (bool): Whether to compute and add additional 
  665             columns. Defaults to True. 
  666         drop_nans (bool): Whether to drop rows that contain NaNs. 
  668         drop_outside_bins (bool): Whether to drop rows for particles 
  669             outside of the momentum and theta bins. Defaults to True. 
  670         weights (:func:`numpy.array`): Calibration weights to be applied to the 
  671             detector log-likelihoods. Defaults to None. 
  672         allowed_particles (list(str)): If not empty, specifies the 
  673             allowed particle types. Any not allowed particle types will be 
  674             excluded from the PID calculations. If empty, all particle types are 
  675             considered.  Defaults to []. 
  676         p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV. 
  677             Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV. 
  678         theta_bins (:func:`numpy.array`): The edges of the theta bins in radians. 
  679             Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150] 
  681         column: A function which given the particle and 
  682             detector names returns the corresponding detector log-likelihood 
  683             column name. Defaults to _column, which gives column names of the 
  684             format f"{detector}_{particle}". 
  687         pandas.DataFrame: Return the prepared DataFrame. (Not all modifications in 
  688             this method are in-place.) 
  690     if column 
is not None:
 
  693                 df[f
"{d}_{p}"] = df[column(p, d)]
 
  695     apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
 
  696     cut_particles(df, allowed_particles)
 
  702             theta_bins=theta_bins,
 
  705         if drop_outside_bins:
 
  707                 np.logical_and.reduce(
 
  709                         df[
"p_bin"].values >= 0,
 
  710                         df[
"p_bin"].values < len(p_bins) - 1,
 
  711                         df[
"theta_bin"].values >= 0,
 
  712                         df[
"theta_bin"].values < len(theta_bins) - 1,
 
  719         df = df[df[
"labels"] >= 0]