import h5py
import numpy as np
import pandas as pd
import uproot


def _make_const_lists():
    """Moving this code into a function to avoid a top-level ROOT import."""
    from ROOT import Belle2

    PARTICLES, PDG_CODES = [], []
    for i in range(len(Belle2.Const.chargedStableSet)):
        particle = Belle2.Const.chargedStableSet.at(i)
        name = (particle.__repr__()[7:-1]
                .replace("euteron", ""))
        PARTICLES.append(name)
        PDG_CODES.append(particle.getPDGCode())

    DETECTORS = []
    for det in Belle2.Const.PIDDetectors.set():
        DETECTORS.append(Belle2.Const.parseDetectors(det))

    return PARTICLES, PDG_CODES, DETECTORS


# hardcoded to avoid a ROOT import at module load (see _make_const_lists)
PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))


def _column(particle, detector):
    """Default column names for detector log-likelihoods.

    Parameters:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    return f"{detector}_{particle}"


def root_column(particle, detector):
    """Column names for detector log-likelihoods found in our ROOT datafiles.

    Parameters:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    pdg = PDG_CODES[PARTICLES.index(particle)]
    return f"pidLogLikelyhoodOf{pdg}From{detector}"


def read_root(root_filenames):
    """Reads one or several ROOT datafiles into a DataFrame.

    Parameters:
        root_filenames (list(str) or str): If only one filename, can be given
            as a string. If more than one, should be given as a list or tuple.

    Returns:
        pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s).
    """
    return uproot.concatenate(root_filenames, library="pd")
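
# Example usage (a sketch; the filenames are hypothetical):
#   df = read_root(["ntuple_run1.root", "ntuple_run2.root"])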


def make_h5(df, tags, out_filename, pdg=None, column=root_column):
    """Make an HDF5 file in our 'slim' format from the given DataFrame.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        tags (list(str) or str): The particle tags used as a prefix for desired
            columns. e.g. for kaons in a D* decay, this is 'DST_D0_K'. One or
            more can be given.
        out_filename (str): Output filename for the h5 file that will be
            written.
        pdg (int or None): The PDG code for the particles being
            extracted. If None, uses the values found in the 'mcPDG' column of
            the DataFrame. Defaults to None.
        column: A function which, given the particle and
            detector names, returns the column name for the corresponding
            detector log-likelihood. Defaults to root_column, which assumes
            column names are of the format
            f'pidLogLikelyhoodOf{pdg}From{detector}'.
    """
    if isinstance(tags, str):
        tags = [tags]

    def _concat(arrs):
        return np.concatenate(arrs) if len(arrs) > 1 else arrs[0]

    def _get_all(col):
        return _concat([df[f"{tag}_{col}"].values for tag in tags])

    with h5py.File(out_filename, "w") as f:
        if pdg is not None:
            pdg_values = np.ones(len(df) * len(tags)) * pdg
        else:
            pdg_values = np.abs(_get_all("mcPDG"))

        f.create_dataset("pdg", data=pdg_values)
        f.create_dataset("p", data=_get_all("p"))
        f.create_dataset("theta", data=np.arccos(_get_all("cosTheta")))
        f.create_dataset("phi", data=_get_all("phi"))

        for det in DETECTORS:
            for p in PARTICLES:
                data = _get_all(column(p, det))
                data[np.isnan(data)] = 0
                f.create_dataset(f"{det}/{p}", data=data)


def merge_h5s(filenames, out_filename, pdgs=None):
    """Merge several HDF5 files in our 'slim' format together.

    Parameters:
        filenames (list(str)): Filenames of HDF5 files to be merged.
        out_filename (str): Output filename.
        pdgs (list(int)): The PDG tags for the particle types, one per
            filename, to overwrite the 'pdg' columns in those files when
            merging. If None, simply uses the 'pdg' columns from the files.
    """
    fs = [h5py.File(fname, "r") for fname in filenames]
    m = h5py.File(out_filename, "w")

    keys = ["p", "theta", "phi"]
    keys += [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]
    for key in keys:
        m.create_dataset(key, data=np.concatenate([f[key][()] for f in fs]))

    if pdgs is not None:
        # overwrite the 'pdg' data with the given per-file values
        m.create_dataset(
            "pdg",
            data=np.concatenate(
                [np.ones_like(f["pdg"][()]) * pdg for f, pdg in zip(fs, pdgs)]
            ),
        )
    else:
        m.create_dataset("pdg", data=np.concatenate([f["pdg"][()] for f in fs]))

    for f in fs:
        f.close()
    m.close()
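
# Example usage (a sketch; the filenames are hypothetical). Merge per-species
# slim files, overwriting the PDG labels per file:
#   merge_h5s(["slim_K.h5", "slim_pi.h5"], "slim_all.h5", pdgs=[321, 211])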
185 """Split the data in an HDF5 'slim' format file in train, validation, and
186 test sets, stored in .npz files
for ease of weight training.
189 filename (str): Filename of HDF5 input file.
190 output_dir (str): Name of output directory,
in which the train,
191 validation,
and test sets will be written. Will be created
if it
192 does
not already exist.
193 train_size (float): Fraction of the dataset to use
for
194 training. Defaults to 0.8.
195 val_size (float): Fraction of the dataset to use
for
196 validation. Defaults to 0.1.
197 test_size (float): Fraction of the dataset to use
for testing.
199 shuffle (bool): Whether to shuffle the dataset before
200 splitting. Defaults to
True.
201 random_state (int
or None): Random state
for the shuffling.
205 from sklearn.model_selection
import train_test_split
206 from os.path
import join
207 from os
import makedirs
209 assert train_size > 0, f
"train_size ({train_size}) must be positive"
210 assert val_size >= 0, f
"val_size ({val_size}) may not be negative"
211 assert test_size >= 0, f
"test_size ({test_size}) may not be negative"
212 assert val_size + test_size != 0,
"val_size and test_size cannot both be zero"
218 if train_size + val_size + test_size != 1:
219 total = train_size + val_size + test_size
220 train_size = train_size / total
221 val_size = val_size / total
222 test_size = test_size / total
225 with h5py.File(filename,
"r")
as f:
227 [f[det][p][()]
for p
in PARTICLES
for det
in DETECTORS], axis=-1
230 theta_data = f[
"theta"][()]
231 labels = np.abs(f[
"pdg"][()])
232 for i, p
in enumerate(PDG_CODES):
233 labels[labels == p] = i
241 makedirs(output_dir, exist_ok=
True)
242 kw = dict(shuffle=shuffle, random_state=random_state)
245 (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
246 X, y, p, t, train_size=train_size, **kw
248 np.savez(join(output_dir,
"train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)
252 size = val_size / (1 - train_size)
253 (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
254 X, y, p, t, train_size=size, **kw
257 np.savez(join(output_dir,
"val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
258 np.savez(join(output_dir,
"test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)
261 np.savez(join(output_dir,
"val.npz"), X=X, y=y, p=p, theta=t)
265 """Performs softmax calculation with corrections to help prevent overflow.
268 This is the calculation used to convert log-likelihoods to likelihood
269 ratios. Implementation following
270 https://stackoverflow.com/a/67112412/18837571
273 x (:func:`numpy.array`): Data to be softmaxed. Softmax
is calculated over the last
277 :func:`numpy.array`: Softmaxed data.
279 maxes = np.amax(x, axis=-1, keepdims=True)
280 x_exp = np.exp(x - maxes)
281 return x_exp / np.sum(x_exp, axis=-1, keepdims=
True)
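
# Subtracting the per-row max leaves the softmax output unchanged but keeps
# np.exp in range, e.g. softmax(np.array([[1000., 1001.]])) gives roughly
# [[0.269, 0.731]] instead of overflowing to nan.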
285 """Make a 'labels' column for the DataFrame. The 'labels' column contains
286 the particle type labels for each event: 0
for electron, 1
for muon,
and so
290 df (pandas.DataFrame): DataFrame that
'labels' column will be added to. Must
291 not have NaN values
in the
'pdg' column.
293 labels = np.abs(df["pdg"].values)
294 if np.count_nonzero(~np.isfinite(labels)):
296 'Warning: dataset contains NaN values in the "pdg" column. '
297 'This means the "labels" column cannot be made, so most of the '
298 "pidplots methods will fail."
300 labels = np.ones_like(labels) * np.nan
302 for i, p
in enumerate(PDG_CODES):
303 labels[labels == p] = i
304 labels[labels >= 6] = -1
305 df[
"labels"] = labels


def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
    """Make 'p_bin' and 'theta_bin' column in the given DataFrame.

    Parameters:
        df (pandas.DataFrame): The DataFrame to add bins columns to.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees (converted to radians).
    """
    df["p_bin"] = np.digitize(df["p"].values, p_bins) - 1
    df["theta_bin"] = np.digitize(df["theta"].values, theta_bins) - 1


def make_lrs(df, column=_column):
    """Makes likelihood ratio columns for each of the six particle types in the
    given DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    # sum the detector log-likelihoods for each hypothesis, then softmax
    h_logls = np.stack(
        [
            np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
            for p in PARTICLES
        ],
        -1,
    )
    lrs = softmax(h_logls)
    for i, p in enumerate(PARTICLES):
        df[f"lr_{p}"] = lrs[:, i]


def make_binary_lrs(df, column=_column):
    """Makes binary likelihood ratio columns for each of the five non-pion
    particle type hypotheses in the given DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for h in PARTICLES:
        if h == "pi":
            continue

        # softmax over just the hypothesis h and the pion hypothesis
        h_logls = np.stack(
            [
                np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
                for p in [h, "pi"]
            ],
            -1,
        )
        lrs = softmax(h_logls)
        df[f"binary_lr_{h}"] = lrs[:, 0]
376 """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
377 predicted particle type. Requires likelihood ratio columns to exist.
380 df (pandas.DataFrame): DataFrame to which the 'pid' column will be added.
382 lrs = np.stack([df[f"lr_{p}"].values
for p
in PARTICLES], axis=-1)
383 pids = np.argmax(lrs, axis=-1)


def compute_det_lrs(d, det, column=_column):
    """Computes single-detector likelihood ratios from the given DataFrame.

    Parameters:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector for which the single-detector
            likelihood ratios will be calculated.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The detector likelihood ratios.
    """
    h_logls = d[[column(p, det) for p in PARTICLES]].values
    lrs = softmax(h_logls)
    return lrs


def make_pid_det(df, column=_column):
    """Makes single-detector PID columns for each of the detectors in the given
    DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        # a log-likelihood of exactly zero means no detector information
        mask = df[column("e", det)] == 0
        lrs = compute_det_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        pids[mask] = -1
        df[f"pid_{det}"] = pids


def compute_abl_lrs(d, det, column=_column):
    """Computes ablation likelihood ratios from the given DataFrame.

    Parameters:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector to be omitted for the ablation.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The ablation likelihood ratios.
    """
    def _cols(p):
        others = [det2 for det2 in DETECTORS if det2 != det]
        return [column(p, det2) for det2 in others]

    h_logls = np.stack([np.sum(d[_cols(p)].values, -1) for p in PARTICLES], -1)
    lrs = softmax(h_logls)
    return lrs


def make_pid_abl(df, column=_column):
    """Makes ablation PID columns for each of the detectors in the given
    DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        lrs = compute_abl_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        df[f"pid_no_{det}"] = pids


def compute_contrib(d, corr=True):
    """Computes the detector contributions.

    Parameters:
        d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.

    Returns:
        dict[str, :func:`numpy.array`]: The contributions of each detector.
    """
    out = {}
    for det in DETECTORS:
        reg_lrs = d[[f"lr_{p}" for p in PARTICLES]].values
        abl_lrs = compute_abl_lrs(d, det)
        idx = d["labels" if corr else "pid"].values.astype(int)
        reg_lr = reg_lrs[np.arange(len(idx)), idx]
        abl_lr = abl_lrs[np.arange(len(idx)), idx]
        ctrb = reg_lr - abl_lr
        out[det] = ctrb
    return out


def make_contrib(df, corr=True):
    """Makes columns for the detector contributions in the given DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.
    """
    ctrbs = compute_contrib(df, corr=corr)
    for det, ctrb in ctrbs.items():
        df[f"contrib_{det}"] = ctrb


def make_columns(
    df,
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    contrib_corr=True,
    column=_column,
):
    """Makes all the additional columns for a given DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees (converted to radians).
        contrib_corr (bool): Whether to compute contribution to the
            likelihood ratio of the _correct_ hypothesis (True) or the _chosen_
            hypothesis (False). Defaults to True.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    make_labels(df)
    make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
    make_lrs(df, column=column)
    make_binary_lrs(df, column=column)
    make_pid(df)
    make_pid_det(df, column=column)
    make_pid_abl(df, column=column)
    make_contrib(df, corr=contrib_corr)


def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
    """Applies the given weights to the log-likelihood data in the DataFrame.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the weights are applied.
        weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`):
            The calibration weight values. If a dict, keys should be a tuple of
            ints, and each value is the six-by-six array of weights for the
            bin. If a single np.array, should be a six-by-six array of weights
            to be applied globally.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees (converted to radians).
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if weights is None:
        return

    if isinstance(weights, dict):
        for p in range(len(p_bins) - 1):
            p_lo, p_hi = p_bins[p], p_bins[p + 1]
            p_mask = (df["p"] >= p_lo) & (df["p"] <= p_hi)

            for theta in range(len(theta_bins) - 1):
                t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
                t_mask = (df["theta"] >= t_lo) & (df["theta"] <= t_hi)

                for i, h in enumerate(PARTICLES):
                    for j, d in enumerate(DETECTORS):
                        df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][
                            i, j
                        ]

    else:
        for i, h in enumerate(PARTICLES):
            for j, d in enumerate(DETECTORS):
                df[column(h, d)] *= weights[i, j]
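
# Example usage (a sketch): scale all CDC log-likelihoods by 0.9 with a global
# six-by-six weight matrix (rows: particles, columns: detectors):
#   w = np.ones((len(PARTICLES), len(DETECTORS)))
#   w[:, DETECTORS.index("CDC")] = 0.9
#   apply_weights(df, w)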


def cut_particles(df, allowed_particles, column=_column):
    """Cuts the log-likelihood data associated with given particle types.

    Parameters:
        df (pandas.DataFrame): DataFrame to which the cuts will be applied.
        allowed_particles (list(str)): List of allowed particle types. Any
            particle types not present will be cut, unless the list is empty
            (in which case no cuts are applied).
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if len(allowed_particles) == 0:
        return

    for p in PARTICLES:
        if p not in allowed_particles:
            for d in DETECTORS:
                # effectively remove this hypothesis from the softmax
                df[column(p, d)] = -1e10


def read_h5(filename):
    """Read an HDF5 file in our 'slim' format into a DataFrame.

    Parameters:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    df = pd.DataFrame()
    with h5py.File(filename, "r") as f:
        for key in ["pdg", "p", "theta", "phi"]:
            df[key] = f[key][()]
        for key in [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]:
            df_key = key.replace("/", "_")
            df[df_key] = f[key][()]
            df[df_key] = df[df_key].fillna(0)
    return df


def read_npz(filename):
    """Read an npz file in our training format into a DataFrame.

    Parameters:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    data = np.load(filename)
    df = pd.DataFrame(
        data=data["X"],
        columns=[f"{d}_{p}" for p in PARTICLES for d in DETECTORS],
    )
    df["labels"] = data["y"]
    df["p"] = data["p"]
    df["theta"] = data["theta"]

    df["pdg"] = df["labels"]
    for i, pdg in enumerate(PDG_CODES):
        df.loc[df["labels"] == i, "pdg"] = pdg

    return df


def produce_analysis_df(
    df,
    compute_cols=True,
    drop_nans=True,
    drop_outside_bins=True,
    weights=None,
    allowed_particles=[],
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    column=None,
):
    """Prepares a DataFrame for PID analysis by applying weights, computing and
    adding additional columns, cutting NaNs, and more.

    Parameters:
        df (pandas.DataFrame): DataFrame to prepare for analysis.
        compute_cols (bool): Whether to compute and add additional
            columns. Defaults to True.
        drop_nans (bool): Whether to drop rows that contain NaNs.
            Defaults to True.
        drop_outside_bins (bool): Whether to drop rows for particles
            outside of the momentum and theta bins. Defaults to True.
        weights (:func:`numpy.array`): Calibration weights to be applied to the
            detector log-likelihoods. Defaults to None.
        allowed_particles (list(str)): If not empty, specifies the
            allowed particle types. Any not allowed particle types will be
            excluded from the PID calculations. If empty, all particle types
            are considered. Defaults to [].
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees (converted to radians).
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to None, in which case the columns are
            assumed to already use the default f"{detector}_{particle}" names.

    Returns:
        pandas.DataFrame: Return the prepared DataFrame. (Not all modifications
        in this method are in-place.)
    """
    if column is not None:
        # copy the log-likelihood data into the default column names
        for p in PARTICLES:
            for d in DETECTORS:
                df[f"{d}_{p}"] = df[column(p, d)]

    apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
    cut_particles(df, allowed_particles)

    if compute_cols:
        make_columns(
            df,
            p_bins=p_bins,
            theta_bins=theta_bins,
        )
        if drop_outside_bins:
            df = df[
                np.logical_and.reduce(
                    [
                        df["p_bin"].values >= 0,
                        df["p_bin"].values < len(p_bins) - 1,
                        df["theta_bin"].values >= 0,
                        df["theta_bin"].values < len(theta_bins) - 1,
                    ]
                )
            ]

    if drop_nans:
        df = df.dropna()
        df = df[df["labels"] >= 0]

    return df