18 def _make_const_lists():
19 """Moving this code into a function to avoid a top-level ROOT import."""
22 PARTICLES, PDG_CODES = [], []
23 for i
in range(len(ROOT.Belle2.Const.chargedStableSet)):
24 particle = ROOT.Belle2.Const.chargedStableSet.at(i)
25 name = (particle.__repr__()[7:-1]
28 .replace(
"euteron",
""))
29 PARTICLES.append(name)
30 PDG_CODES.append(particle.getPDGCode())
35 for det
in ROOT.Belle2.Const.PIDDetectors.set():
36 DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
39 return PARTICLES, PDG_CODES, DETECTORS
43 PARTICLES = [
"e",
"mu",
"pi",
"K",
"p",
"d"]
44 PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
45 DETECTORS = [
"SVD",
"CDC",
"TOP",
"ARICH",
"ECL",
"KLM"]
47 P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
48 THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))
51 def _column(particle, detector):
52 """Default column names for detector log-likelihoods.
55 particle (str): particle name
56 detector (str): detector name
59 str: Corresponding column name.
61 return f
"{detector}_{particle}"
64 def root_column(particle, detector):
65 """Column names for detector log-likelihoods found in our ROOT datafiles.
68 particle (str): particle name
69 detector (str): detector name
72 str: Corresponding column name.
74 pdg = PDG_CODES[PARTICLES.index(particle)]
75 return f
"pidLogLikelyhoodOf{pdg}From{detector}"
78 def read_root(root_filenames):
79 """Reads one or several ROOT datafiles into a DataFrame.
82 root_filenames (list(str) or str): If only one filename, can be given as
83 a string. If more than one, should be given as a list or tuple.
86 pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s).
89 return uproot.concatenate(root_filenames, library=
'pd')
92 def make_h5(df, tags, out_filename, pdg=None, column=root_column):
93 """Make an HDF5 file in our 'slim' format from the given DataFrame.
96 df (pandas.DataFrame): The DataFrame containing the data.
97 tags (list(str) or str): The particle tags used as a prefix for desired
98 columns. e.g. for kaons in a D* decay, this is 'DST_D0_K'. One or
100 out_filename (str): Output filename for the h5 file that will be
102 pdg (int or None): The PDG code for the particles being
103 extracted. If None, uses the values found in the 'mcPDG' column of
104 the DataFrame. Defaults to None.
105 column: A function which, given the particle and
106 detector names, returns the column name for the corresponding
107 detector log-likelihood. Defaults to root_column, which assumes
108 column names are of the format
109 f'pidLogLikelyhoodOf{pdg}From{detector}'.
112 if isinstance(tags, str):
116 return np.concatenate(arrs)
if len(arrs) > 1
else arrs[0]
119 return _concat([df[f
"{tag}_{col}"].values
for tag
in tags])
121 with h5py.File(out_filename,
"w")
as f:
123 pdg_values = np.ones(len(df) * len(tags)) * pdg
125 pdg_values = np.abs(_get_all(
"mcPDG"))
127 f.create_dataset(
"pdg", data=pdg_values)
128 f.create_dataset(
"p", data=_get_all(
"p"))
129 f.create_dataset(
"theta", data=np.arccos(_get_all(
"cosTheta")))
130 f.create_dataset(
"phi", data=_get_all(
"phi"))
132 for det
in DETECTORS:
134 data = _get_all(column(p, det))
135 data[np.isnan(data)] = 0
136 f.create_dataset(f
"{det}/{p}", data=data)
139 def merge_h5s(filenames, out_filename, pdgs=None):
140 """Merge several HDF5 files in our 'slim' format together.
143 filenames (list(str)): Filenames of HDF5 files to be merged.
144 out_filename (str): Output filename.
145 pdgs (list(int)): The PDG tags for the particle types, one per
146 filename, to overwrite the 'pdg' columns in those files when
147 merging. If None, simply uses the 'pdg' columns from the files.
150 fs = [h5py.File(fname,
"r")
for fname
in filenames]
151 m = h5py.File(out_filename,
"w")
153 keys = [
"p",
"theta",
"phi"]
154 keys += [f
"{d}/{p}" for d
in DETECTORS
for p
in PARTICLES]
157 m.create_dataset(key, data=np.concatenate([f[key][()]
for f
in fs]))
164 [np.ones_like(f[
"pdg"][()]) * pdg
for f, pdg
in zip(fs, pdgs)]
168 m.create_dataset(
"pdg", data=np.concatenate([f[
"pdg"][()]
for f
in fs]))
184 """Split the data in an HDF5 'slim' format file in train, validation, and
185 test sets, stored in .npz files for ease of weight training.
188 filename (str): Filename of HDF5 input file.
189 output_dir (str): Name of output directory, in which the train,
190 validation, and test sets will be written. Will be created if it
191 does not already exist.
192 train_size (float): Fraction of the dataset to use for
193 training. Defaults to 0.8.
194 val_size (float): Fraction of the dataset to use for
195 validation. Defaults to 0.1.
196 test_size (float): Fraction of the dataset to use for testing.
198 shuffle (bool): Whether to shuffle the dataset before
199 splitting. Defaults to True.
200 random_state (int or None): Random state for the shuffling.
204 from sklearn.model_selection
import train_test_split
205 from os.path
import join
206 from os
import makedirs
208 assert train_size > 0, f
"train_size ({train_size}) must be positive"
209 assert val_size >= 0, f
"val_size ({val_size}) may not be negative"
210 assert test_size >= 0, f
"test_size ({test_size}) may not be negative"
211 assert val_size + test_size != 0,
"val_size and test_size cannot both be zero"
217 if train_size + val_size + test_size != 1:
218 total = train_size + val_size + test_size
219 train_size = train_size / total
220 val_size = val_size / total
221 test_size = test_size / total
224 with h5py.File(filename,
"r")
as f:
226 [f[det][p][()]
for p
in PARTICLES
for det
in DETECTORS], axis=-1
229 theta_data = f[
"theta"][()]
230 labels = np.abs(f[
"pdg"][()])
231 for i, p
in enumerate(PDG_CODES):
232 labels[labels == p] = i
240 makedirs(output_dir, exist_ok=
True)
241 kw = dict(shuffle=shuffle, random_state=random_state)
244 (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
245 X, y, p, t, train_size=train_size, **kw
247 np.savez(join(output_dir,
"train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)
251 size = val_size / (1 - train_size)
252 (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
253 X, y, p, t, train_size=size, **kw
256 np.savez(join(output_dir,
"val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
257 np.savez(join(output_dir,
"test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)
260 np.savez(join(output_dir,
"val.npz"), X=X, y=y, p=p, theta=t)
264 """Performs softmax calculation with corrections to help prevent overflow.
267 This is the calculation used to convert log-likelihoods to likelihood
268 ratios. Implementation following
269 https://stackoverflow.com/a/67112412/18837571
272 x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated over the last
276 :func:`numpy.array`: Softmaxed data.
278 maxes = np.amax(x, axis=-1, keepdims=
True)
279 x_exp = np.exp(x - maxes)
280 return x_exp / np.sum(x_exp, axis=-1, keepdims=
True)
284 """Make a 'labels' column for the DataFrame. The 'labels' column contains
285 the particle type labels for each event: 0 for electron, 1 for muon, and so
289 df (pandas.DataFrame): DataFrame that 'labels' column will be added to. Must
290 not have NaN values in the 'pdg' column.
292 labels = np.abs(df[
"pdg"].values)
293 if np.count_nonzero(~np.isfinite(labels)):
295 'Warning: dataset contains NaN values in the "pdg" column. '
296 'This means the "labels" column cannot be made, so most of the '
297 "pidplots methods will fail."
299 labels = np.ones_like(labels) * np.nan
301 for i, p
in enumerate(PDG_CODES):
302 labels[labels == p] = i
303 labels[labels >= 6] = -1
304 df[
"labels"] = labels
307 def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
308 """Make 'p_bin' and 'theta_bin' column in the given DataFrame.
311 df (pandas.DataFrame): The DataFrame to add bins columns to.
312 p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
313 Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
314 theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
315 Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
318 df[
"p_bin"] = np.digitize(df[
"p"].values, p_bins) - 1
319 df[
"theta_bin"] = np.digitize(df[
"theta"].values, theta_bins) - 1
322 def make_lrs(df, column=_column):
323 """Makes likelihood ratio columns for each of the six particle types in the
327 df (pandas.DataFrame): DataFrame to which the columns will be added.
328 column: A function which given the particle and
329 detector names returns the corresponding detector log-likelihood
330 column name. Defaults to _column, which gives column names of the
331 format f"{detector}_{particle}".
336 np.sum(df[[column(p, det)
for det
in DETECTORS]].values, -1)
343 lrs = softmax(h_logls)
344 for i, p
in enumerate(PARTICLES):
345 df[f
"lr_{p}"] = lrs[:, i]
348 def make_binary_lrs(df, column=_column):
349 """Makes binary likelihood ratio columns for each of the five non-pion
350 particle type hypotheses in the given DataFrame.
353 df (pandas.DataFrame): DataFrame to which the columns will be added.
354 column: A function which given the particle and
355 detector names returns the corresponding detector log-likelihood
356 column name. Defaults to _column, which gives column names of the
357 format f"{detector}_{particle}".
365 np.sum(df[[column(p, det)
for det
in DETECTORS]].values, -1)
370 lrs = softmax(h_logls)
371 df[f
"binary_lr_{h}"] = lrs[:, 0]
375 """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
376 predicted particle type. Requires likelihood ratio columns to exist.
379 df (pandas.DataFrame): DataFrame to which the 'pid' column will be added.
381 lrs = np.stack([df[f
"lr_{p}"].values
for p
in PARTICLES], axis=-1)
382 pids = np.argmax(lrs, axis=-1)
386 def compute_det_lrs(d, det, column=_column):
387 """Computes single-detector likelihood ratios from the given DataFrame.
390 d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
391 det (str): The name of the detector for which the single-detector
392 likelihood ratios will be calculated.
393 column: A function which given the particle and
394 detector names returns the corresponding detector log-likelihood
395 column name. Defaults to _column, which gives column names of the
396 format f"{detector}_{particle}".
399 :func:`numpy.array`: The detector likelihood ratios.
401 h_logls = d[[column(p, det)
for p
in PARTICLES]].values
402 lrs = softmax(h_logls)
406 def make_pid_det(df, column=_column):
407 """Makes single-detector PID columns for each of the detectors in the given DataFrame.
410 df (pandas.DataFrame): DataFrame to which the columns will be added.
411 column: A function which given the particle and
412 detector names returns the corresponding detector log-likelihood
413 column name. Defaults to _column, which gives column names of the
414 format f"{detector}_{particle}".
416 for det
in DETECTORS:
417 mask = df[column(
"e", det)] == 0
418 lrs = compute_det_lrs(df, det, column=column)
419 pids = np.argmax(lrs, axis=-1)
421 df[f
"pid_{det}"] = pids
424 def compute_abl_lrs(d, det, column=_column):
425 """Computes ablation likelihood ratios from the given DataFrame.
428 d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
429 det (str): The name of the detector to be omitted for the ablation.
430 column: A function which given the particle and
431 detector names returns the corresponding detector log-likelihood
432 column name. Defaults to _column, which gives column names of the
433 format f"{detector}_{particle}".
436 :func:`numpy.array`: The ablation likelihood ratios.
440 others = [det2
for det2
in DETECTORS
if det2 != det]
441 return [column(p, det2)
for det2
in others]
443 h_logls = np.stack([np.sum(d[_cols(p)].values, -1)
for p
in PARTICLES], -1)
444 lrs = softmax(h_logls)
448 def make_pid_abl(df, column=_column):
449 """Makes ablation PID columns for each of the detectors in the given
453 df (pandas.DataFrame): DataFrame to which the columns will be added.
454 column: A function which given the particle and
455 detector names returns the corresponding detector log-likelihood
456 column name. Defaults to _column, which gives column names of the
457 format f"{detector}_{particle}".
459 for det
in DETECTORS:
460 lrs = compute_abl_lrs(df, det, column=column)
461 pids = np.argmax(lrs, axis=-1)
462 df[f
"pid_no_{det}"] = pids
465 def compute_contrib(d, corr=True):
466 """Computes the detector contributions.
469 d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
470 corr (bool): Whether to compute contribution to the likelihood
471 ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
472 (False). Defaults to True.
475 dict[str, :func:`numpy.array`]: The contributions of each detector.
478 for det
in DETECTORS:
479 reg_lrs = d[[f
"lr_{p}" for p
in PARTICLES]].values
480 abl_lrs = compute_abl_lrs(d, det)
481 idx = d[
"labels" if corr
else "pid"].values.astype(int)
482 reg_lr = reg_lrs[np.arange(len(idx)), idx]
483 abl_lr = abl_lrs[np.arange(len(idx)), idx]
484 ctrb = reg_lr - abl_lr
489 def make_contrib(df, corr=True):
490 """Makes columns for the detector contributions in the given DataFrame.
493 df (pandas.DataFrame): DataFrame to which the columns will be added.
494 corr (bool): Whether to compute contribution to the likelihood
495 ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
496 (False). Defaults to True.
498 ctrbs = compute_contrib(df, corr=corr)
499 for det, ctrb
in ctrbs.items():
500 df[f
"contrib_{det}"] = ctrb
506 theta_bins=THETA_BINS,
510 """Makes all the additional columns for a given DataFrame.
513 df (pandas.DataFrame): DataFrame to which the columns will be added.
514 p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
515 Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
516 theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
517 Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
519 contrib_corr (bool): Whether to compute contribution to the
520 likelihood ratio of the _correct_ hypothesis (True) or the _chosen_
521 hypothesis (False). Defaults to True.
522 column: A function which given the particle and
523 detector names returns the corresponding detector log-likelihood
524 column name. Defaults to _column, which gives column names of the
525 format f"{detector}_{particle}".
528 make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
529 make_lrs(df, column=column)
530 make_binary_lrs(df, column=column)
532 make_pid_det(df, column=column)
534 make_contrib(df, corr=contrib_corr)
537 def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
538 """Applies the given weights to the log-likelihood data in the DataFrame.
541 df (pandas.DataFrame): DataFrame to which the weights are applied.
542 weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`): The calibration weight
543 values. If a dict, keys should be a tuple of ints, and each value is
544 the six-by-six array of weights for the bin. If a single np.array,
545 should be a six-by-six array of weights to be applied globally.
546 p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
547 Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
548 theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
549 Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
551 column: A function which given the particle and
552 detector names returns the corresponding detector log-likelihood
553 column name. Defaults to _column, which gives column names of the
554 format f"{detector}_{particle}".
560 if isinstance(weights, dict):
561 for p
in range(len(p_bins) - 1):
562 p_lo, p_hi = p_bins[p], p_bins[p + 1]
563 p_mask = (df[
"p"] >= p_lo) & (df[
"p"] <= p_hi)
565 for theta
in range(len(theta_bins) - 1):
566 t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
567 t_mask = (df[
"theta"] >= t_lo) & (df[
"theta"] <= t_hi)
569 for i, h
in enumerate(PARTICLES):
570 for j, d
in enumerate(DETECTORS):
571 df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][
577 for i, h
in enumerate(PARTICLES):
578 for j, d
in enumerate(DETECTORS):
579 df[column(h, d)] *= weights[i, j]
582 def cut_particles(df, allowed_particles, column=_column):
583 """Cuts the log-likelihood data associated with given particle types.
586 df (pandas.DataFrame): DataFrame to which the cuts will be applied.
587 allowed_particles (list(str)): List of allowed particle types. Any
588 particle types not present will be cut, unless the list is empty (in
589 which case no cuts are applied).
590 column: A function which given the particle and
591 detector names returns the corresponding detector log-likelihood
592 column name. Defaults to _column, which gives column names of the
593 format f"{detector}_{particle}".
595 if len(allowed_particles) == 0:
599 if not (p
in allowed_particles):
601 df[column(p, d)] = -1e10
604 def read_h5(filename):
605 """Read an HDF5 file in our 'slim' format into a DataFrame.
608 filename (str): Input filename.
611 pandas.DataFrame: DataFrame containing data.
614 with h5py.File(filename,
"r")
as f:
615 for key
in [
"pdg",
"p",
"theta",
"phi"]:
617 for key
in [f
"{d}/{p}" for d
in DETECTORS
for p
in PARTICLES]:
618 df_key = key.replace(
"/",
"_")
619 df[df_key] = f[key][()]
620 df[df_key] = df[df_key].fillna(0)
624 def read_npz(filename):
625 """Read an npz file in our training format into a DataFrame.
628 filename (str): Input filename.
631 pandas.DataFrame: DataFrame containing data.
633 data = np.load(filename)
635 data=data[
"X"], columns=[f
"{d}_{p}" for p
in PARTICLES
for d
in DETECTORS],
637 df[
"labels"] = data[
"y"]
639 df[
"theta"] = data[
"theta"]
641 df[
"pdg"] = df[
"labels"]
642 for i, pdg
in enumerate(PDG_CODES):
643 df.loc[df[
"labels"] == i,
"pdg"] = pdg
648 def produce_analysis_df(
652 drop_outside_bins=True,
654 allowed_particles=[],
656 theta_bins=THETA_BINS,
659 """Prepares a DataFrame for PID analysis by applying weights, computing and
660 adding additional columns, cutting NaNs, and more.
663 df (pandas.DataFrame): DataFrame to prepare for analysis.
664 compute_cols (bool): Whether to compute and add additional
665 columns. Defaults to True.
666 drop_nans (bool): Whether to drop rows that contain NaNs.
668 drop_outside_bins (bool): Whether to drop rows for particles
669 outside of the momentum and theta bins. Defaults to True.
670 weights (:func:`numpy.array`): Calibration weights to be applied to the
671 detector log-likelihoods. Defaults to None.
672 allowed_particles (list(str)): If not empty, specifies the
673 allowed particle types. Any not allowed particle types will be
674 excluded from the PID calculations. If empty, all particle types are
675 considered. Defaults to [].
676 p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
677 Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
678 theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
679 Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
681 column: A function which given the particle and
682 detector names returns the corresponding detector log-likelihood
683 column name. Defaults to _column, which gives column names of the
684 format f"{detector}_{particle}".
687 pandas.DataFrame: Return the prepared DataFrame. (Not all modifications in
688 this method are in-place.)
690 if column
is not None:
693 df[f
"{d}_{p}"] = df[column(p, d)]
695 apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
696 cut_particles(df, allowed_particles)
702 theta_bins=theta_bins,
705 if drop_outside_bins:
707 np.logical_and.reduce(
709 df[
"p_bin"].values >= 0,
710 df[
"p_bin"].values < len(p_bins) - 1,
711 df[
"theta_bin"].values >= 0,
712 df[
"theta_bin"].values < len(theta_bins) - 1,
719 df = df[df[
"labels"] >= 0]