pidDataUtils.py
#!/usr/bin/env python3


import numpy as np
import pandas as pd
import h5py
import uproot


def _make_const_lists():
    """Moving this code into a function to avoid a top-level ROOT import."""
    from ROOT import Belle2  # noqa: make Belle2 namespace available
    import ROOT.Belle2

    PARTICLES, PDG_CODES = [], []
    for i in range(len(ROOT.Belle2.Const.chargedStableSet)):
        particle = ROOT.Belle2.Const.chargedStableSet.at(i)
        name = (particle.__repr__()[7:-1]
                .replace("-", "")
                .replace("+", "")
                .replace("euteron", ""))
        PARTICLES.append(name)
        PDG_CODES.append(particle.getPDGCode())
    # PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
    # PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]

    DETECTORS = []
    for det in ROOT.Belle2.Const.PIDDetectors.set():
        DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
    # DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

    return PARTICLES, PDG_CODES, DETECTORS


# PARTICLES, PDG_CODES, DETECTORS = _make_const_lists()
PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))


def _column(particle, detector):
    """Default column names for detector log-likelihoods.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    return f"{detector}_{particle}"


def root_column(particle, detector):
    """Column names for detector log-likelihoods found in our ROOT datafiles.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    pdg = PDG_CODES[PARTICLES.index(particle)]
    return f"pidLogLikelyhoodOf{pdg}From{detector}"
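
# A quick sketch of the two naming conventions (kept as comments so nothing
# runs on import):
#     _column("K", "TOP")      # -> "TOP_K"
#     root_column("K", "TOP")  # -> "pidLogLikelyhoodOf321FromTOP"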


def read_root(root_filenames):
    """Reads one or several ROOT datafiles into a DataFrame.

    Args:
        root_filenames (list(str) or str): If only one filename, can be given as
            a string. If more than one, should be given as a list or tuple.

    Returns:
        pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s).
    """

    return uproot.concatenate(root_filenames, library='pd')
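
# Usage sketch (hypothetical filenames; uproot also accepts a tree name
# appended after a colon):
#     df = read_root("ntuple.root")
#     df = read_root(["ntuple_1.root", "ntuple_2.root"])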


def make_h5(df, tags, out_filename, pdg=None, column=root_column):
    """Make an HDF5 file in our 'slim' format from the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        tags (list(str) or str): The particle tags used as a prefix for desired
            columns. e.g. for kaons in a D* decay, this is 'DST_D0_K'. One or
            more can be given.
        out_filename (str): Output filename for the h5 file that will be
            written.
        pdg (int or None): The PDG code for the particles being
            extracted. If None, uses the values found in the 'mcPDG' column of
            the DataFrame. Defaults to None.
        column: A function which, given the particle and
            detector names, returns the column name for the corresponding
            detector log-likelihood. Defaults to root_column, which assumes
            column names are of the format
            f'pidLogLikelyhoodOf{pdg}From{detector}'.
    """

    if isinstance(tags, str):
        tags = [tags]

    def _concat(arrs):
        return np.concatenate(arrs) if len(arrs) > 1 else arrs[0]

    def _get_all(col):
        return _concat([df[f"{tag}_{col}"].values for tag in tags])

    with h5py.File(out_filename, "w") as f:
        if pdg is not None:
            pdg_values = np.ones(len(df) * len(tags)) * pdg
        else:
            pdg_values = np.abs(_get_all("mcPDG"))

        f.create_dataset("pdg", data=pdg_values)
        f.create_dataset("p", data=_get_all("p"))
        f.create_dataset("theta", data=np.arccos(_get_all("cosTheta")))
        f.create_dataset("phi", data=_get_all("phi"))

        for det in DETECTORS:
            for p in PARTICLES:
                data = _get_all(column(p, det))
                data[np.isnan(data)] = 0
                f.create_dataset(f"{det}/{p}", data=data)
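
# Usage sketch (hypothetical filenames; the 'DST_D0_K' tag follows the
# docstring's kaons-in-a-D*-decay example):
#     df = read_root("dstar_ntuple.root")
#     make_h5(df, "DST_D0_K", "kaons.h5", pdg=321)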


def merge_h5s(filenames, out_filename, pdgs=None):
    """Merge several HDF5 files in our 'slim' format together.

    Args:
        filenames (list(str)): Filenames of HDF5 files to be merged.
        out_filename (str): Output filename.
        pdgs (list(int)): The PDG tags for the particle types, one per
            filename, to overwrite the 'pdg' columns in those files when
            merging. If None, simply uses the 'pdg' columns from the files.
            Defaults to None.
    """
    fs = [h5py.File(fname, "r") for fname in filenames]
    m = h5py.File(out_filename, "w")

    keys = ["p", "theta", "phi"]
    keys += [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]

    for key in keys:
        m.create_dataset(key, data=np.concatenate([f[key][()] for f in fs]))

    if pdgs is not None:
        # replace 'pdg' data with kinematic tags
        m.create_dataset(
            "pdg",
            data=np.concatenate(
                [np.ones_like(f["pdg"][()]) * pdg for f, pdg in zip(fs, pdgs)]
            ),
        )
    else:
        m.create_dataset("pdg", data=np.concatenate([f["pdg"][()] for f in fs]))

    for f in fs:
        f.close()
    m.close()
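
# Usage sketch (hypothetical filenames): merge two single-species files and
# overwrite their 'pdg' columns with fixed codes:
#     merge_h5s(["kaons.h5", "pions.h5"], "merged.h5", pdgs=[321, 211])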


def split_h5(
    filename,
    output_dir,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    shuffle=True,
    random_state=None,
):
    """Split the data in an HDF5 'slim' format file into train, validation, and
    test sets, stored in .npz files for ease of weight training.

    Args:
        filename (str): Filename of HDF5 input file.
        output_dir (str): Name of output directory, in which the train,
            validation, and test sets will be written. Will be created if it
            does not already exist.
        train_size (float): Fraction of the dataset to use for
            training. Defaults to 0.8.
        val_size (float): Fraction of the dataset to use for
            validation. Defaults to 0.1.
        test_size (float): Fraction of the dataset to use for testing.
            Defaults to 0.1.
        shuffle (bool): Whether to shuffle the dataset before
            splitting. Defaults to True.
        random_state (int or None): Random state for the shuffling.
            Defaults to None.
    """

    from sklearn.model_selection import train_test_split
    from os.path import join
    from os import makedirs

    assert train_size > 0, f"train_size ({train_size}) must be positive"
    assert val_size >= 0, f"val_size ({val_size}) may not be negative"
    assert test_size >= 0, f"test_size ({test_size}) may not be negative"
    assert val_size + test_size != 0, "val_size and test_size cannot both be zero"

    if val_size == 0:
        val_size = test_size
        test_size = 0

    if train_size + val_size + test_size != 1:
        total = train_size + val_size + test_size
        train_size = train_size / total
        val_size = val_size / total
        test_size = test_size / total

    # read data
    with h5py.File(filename, "r") as f:
        data = np.stack(
            [f[det][p][()] for p in PARTICLES for det in DETECTORS], axis=-1
        )
        p_data = f["p"][()]
        theta_data = f["theta"][()]
        labels = np.abs(f["pdg"][()])
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        mask = labels < 6

    X = data[mask]
    y = labels[mask]
    p = p_data[mask]
    t = theta_data[mask]

    makedirs(output_dir, exist_ok=True)
    kw = dict(shuffle=shuffle, random_state=random_state)

    # split once
    (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
        X, y, p, t, train_size=train_size, **kw
    )
    np.savez(join(output_dir, "train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)

    # split again if desired
    if test_size != 0:
        size = val_size / (1 - train_size)
        (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
            X, y, p, t, train_size=size, **kw
        )

        np.savez(join(output_dir, "val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
        np.savez(join(output_dir, "test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)

    else:
        np.savez(join(output_dir, "val.npz"), X=X, y=y, p=p, theta=t)
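
# Usage sketch (hypothetical paths): an 80/10/10 split written to train.npz,
# val.npz, and test.npz under 'slim_dstar/':
#     split_h5("merged.h5", "slim_dstar", random_state=42)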


def softmax(x):
    """Performs softmax calculation with corrections to help prevent overflow.

    Note:
        This is the calculation used to convert log-likelihoods to likelihood
        ratios. Implementation following
        https://stackoverflow.com/a/67112412/18837571

    Args:
        x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated over the last
            dimension.

    Returns:
        :func:`numpy.array`: Softmaxed data.
    """
    maxes = np.amax(x, axis=-1, keepdims=True)
    x_exp = np.exp(x - maxes)
    return x_exp / np.sum(x_exp, axis=-1, keepdims=True)
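
# Sanity-check sketch: equal log-likelihoods give equal ratios, and shifting
# every input by a constant leaves the result unchanged (the subtracted
# maximum cancels):
#     softmax(np.zeros(6))        # -> six entries of 1/6
#     softmax(np.zeros(6) + 999)  # -> identical result, no overflow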


def make_labels(df):
    """Make a 'labels' column for the DataFrame. The 'labels' column contains
    the particle type labels for each event: 0 for electron, 1 for muon, and so
    on.

    Args:
        df (pandas.DataFrame): DataFrame that 'labels' column will be added to. Must
            not have NaN values in the 'pdg' column.
    """
    labels = np.abs(df["pdg"].values)
    if np.count_nonzero(~np.isfinite(labels)):
        print(
            'Warning: dataset contains NaN values in the "pdg" column. '
            'This means the "labels" column cannot be made, so most of the '
            "pidplots methods will fail."
        )
        labels = np.ones_like(labels) * np.nan
    else:
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        labels[labels >= 6] = -1
    df["labels"] = labels


def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
    """Make 'p_bin' and 'theta_bin' columns in the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to add bins columns to.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
    """
    df["p_bin"] = np.digitize(df["p"].values, p_bins) - 1
    df["theta_bin"] = np.digitize(df["theta"].values, theta_bins) - 1
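
# Note the np.digitize convention here: with the default edges, p = 0.7 GeV
# lands in p_bin 0, anything below 0.5 GeV gets -1, and anything above
# 4.5 GeV gets 7; produce_analysis_df can drop such out-of-range rows via
# drop_outside_bins.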


def make_lrs(df, column=_column):
    """Makes likelihood ratio columns for each of the six particle types in the
    given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    # hypothesis log-likelihoods
    h_logls = np.stack(
        [
            np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
            for p in PARTICLES
        ],
        -1,
    )

    # compute likelihood ratios
    lrs = softmax(h_logls)
    for i, p in enumerate(PARTICLES):
        df[f"lr_{p}"] = lrs[:, i]


def make_binary_lrs(df, column=_column):
    """Makes binary likelihood ratio columns for each of the five non-pion
    particle type hypotheses in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for h in PARTICLES:
        if h == "pi":
            continue

        h_logls = np.stack(
            [
                np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
                for p in [h, "pi"]
            ],
            -1,
        )
        lrs = softmax(h_logls)
        df[f"binary_lr_{h}"] = lrs[:, 0]


def make_pid(df):
    """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
    predicted particle type. Requires likelihood ratio columns to exist.

    Args:
        df (pandas.DataFrame): DataFrame to which the 'pid' column will be added.
    """
    lrs = np.stack([df[f"lr_{p}"].values for p in PARTICLES], axis=-1)
    pids = np.argmax(lrs, axis=-1)
    df["pid"] = pids
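
# The 'pid' value indexes into PARTICLES: e.g. a row with pid == 3 was
# identified as a kaon, since PARTICLES[3] == "K".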


def compute_det_lrs(d, det, column=_column):
    """Computes single-detector likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector for which the single-detector
            likelihood ratios will be calculated.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The detector likelihood ratios.
    """
    h_logls = d[[column(p, det) for p in PARTICLES]].values
    lrs = softmax(h_logls)
    return lrs


def make_pid_det(df, column=_column):
    """Makes single-detector PID columns for each of the detectors in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        mask = df[column("e", det)] == 0  # TODO: make more robust
        lrs = compute_det_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        pids[mask] = -1
        df[f"pid_{det}"] = pids


def compute_abl_lrs(d, det, column=_column):
    """Computes ablation likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector to be omitted for the ablation.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The ablation likelihood ratios.
    """

    def _cols(p):
        others = [det2 for det2 in DETECTORS if det2 != det]
        return [column(p, det2) for det2 in others]

    h_logls = np.stack([np.sum(d[_cols(p)].values, -1) for p in PARTICLES], -1)
    lrs = softmax(h_logls)
    return lrs


def make_pid_abl(df, column=_column):
    """Makes ablation PID columns for each of the detectors in the given
    DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        lrs = compute_abl_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        df[f"pid_no_{det}"] = pids


def compute_contrib(d, corr=True):
    """Computes the detector contributions.

    Args:
        d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.

    Returns:
        dict[str, :func:`numpy.array`]: The contributions of each detector.
    """
    out = dict()
    for det in DETECTORS:
        reg_lrs = d[[f"lr_{p}" for p in PARTICLES]].values
        abl_lrs = compute_abl_lrs(d, det)
        idx = d["labels" if corr else "pid"].values.astype(int)
        reg_lr = reg_lrs[np.arange(len(idx)), idx]
        abl_lr = abl_lrs[np.arange(len(idx)), idx]
        ctrb = reg_lr - abl_lr
        out[det] = ctrb
    return out


def make_contrib(df, corr=True):
    """Makes columns for the detector contributions in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.
    """
    ctrbs = compute_contrib(df, corr=corr)
    for det, ctrb in ctrbs.items():
        df[f"contrib_{det}"] = ctrb


def make_columns(
    df,
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    contrib_corr=True,
    column=_column,
):
    """Makes all the additional columns for a given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        contrib_corr (bool): Whether to compute contribution to the
            likelihood ratio of the _correct_ hypothesis (True) or the _chosen_
            hypothesis (False). Defaults to True.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    make_labels(df)
    make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
    make_lrs(df, column=column)
    make_binary_lrs(df, column=column)
    make_pid(df)
    make_pid_det(df, column=column)
    make_pid_abl(df, column=column)  # pass the column mapping through, as above
    make_contrib(df, corr=contrib_corr)


def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
    """Applies the given weights to the log-likelihood data in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the weights are applied.
        weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`): The calibration weight
            values. If a dict, keys should be a tuple of ints, and each value is
            the six-by-six array of weights for the bin. If a single np.array,
            should be a six-by-six array of weights to be applied globally.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if weights is None:
        return

    # per-bin weights
    if isinstance(weights, dict):
        for p in range(len(p_bins) - 1):
            p_lo, p_hi = p_bins[p], p_bins[p + 1]
            p_mask = (df["p"] >= p_lo) & (df["p"] <= p_hi)

            for theta in range(len(theta_bins) - 1):
                t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
                t_mask = (df["theta"] >= t_lo) & (df["theta"] <= t_hi)

                for i, h in enumerate(PARTICLES):
                    for j, d in enumerate(DETECTORS):
                        df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][
                            i, j
                        ]

    # global weights
    else:
        for i, h in enumerate(PARTICLES):
            for j, d in enumerate(DETECTORS):
                df[column(h, d)] *= weights[i, j]
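
# Usage sketch (a hypothetical global calibration): scale every TOP
# log-likelihood by 0.9 with a six-by-six matrix, one row per particle and
# one column per detector:
#     w = np.ones((len(PARTICLES), len(DETECTORS)))
#     w[:, DETECTORS.index("TOP")] = 0.9
#     apply_weights(df, w)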


def cut_particles(df, allowed_particles, column=_column):
    """Cuts the log-likelihood data associated with given particle types.

    Args:
        df (pandas.DataFrame): DataFrame to which the cuts will be applied.
        allowed_particles (list(str)): List of allowed particle types. Any
            particle types not present will be cut, unless the list is empty (in
            which case no cuts are applied).
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if len(allowed_particles) == 0:
        return

    for p in PARTICLES:
        if p not in allowed_particles:
            for d in DETECTORS:
                df[column(p, d)] = -1e10


def read_h5(filename):
    """Read an HDF5 file in our 'slim' format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    df = pd.DataFrame()
    with h5py.File(filename, "r") as f:
        for key in ["pdg", "p", "theta", "phi"]:
            df[key] = f[key][()]
        for key in [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]:
            df_key = key.replace("/", "_")
            df[df_key] = f[key][()]
            df[df_key] = df[df_key].fillna(0)
    return df


def read_npz(filename):
    """Read an npz file in our training format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    data = np.load(filename)
    df = pd.DataFrame(
        data=data["X"], columns=[f"{d}_{p}" for p in PARTICLES for d in DETECTORS],
    )
    df["labels"] = data["y"]
    df["p"] = data["p"]
    df["theta"] = data["theta"]

    df["pdg"] = df["labels"]
    for i, pdg in enumerate(PDG_CODES):
        df.loc[df["labels"] == i, "pdg"] = pdg

    return df
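
# Usage sketch (hypothetical path): load one of the splits written by
# split_h5 back into a DataFrame:
#     df_train = read_npz("slim_dstar/train.npz")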


def produce_analysis_df(
    df,
    compute_cols=True,
    drop_nans=True,
    drop_outside_bins=True,
    weights=None,
    allowed_particles=[],
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    column=None,
):
    """Prepares a DataFrame for PID analysis by applying weights, computing and
    adding additional columns, cutting NaNs, and more.

    Args:
        df (pandas.DataFrame): DataFrame to prepare for analysis.
        compute_cols (bool): Whether to compute and add additional
            columns. Defaults to True.
        drop_nans (bool): Whether to drop rows that contain NaNs.
            Defaults to True.
        drop_outside_bins (bool): Whether to drop rows for particles
            outside of the momentum and theta bins. Defaults to True.
        weights (:func:`numpy.array`): Calibration weights to be applied to the
            detector log-likelihoods. Defaults to None.
        allowed_particles (list(str)): If not empty, specifies the
            allowed particle types. Any disallowed particle types will be
            excluded from the PID calculations. If empty, all particle types are
            considered. Defaults to [].
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. If given, the log-likelihood data is first copied into
            columns of the default f"{detector}_{particle}" format. Defaults to
            None.

    Returns:
        pandas.DataFrame: Returns the prepared DataFrame. (Not all modifications in
            this method are in-place.)
    """
    if column is not None:
        for p in PARTICLES:
            for d in DETECTORS:
                df[f"{d}_{p}"] = df[column(p, d)]

    apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
    cut_particles(df, allowed_particles)

    if compute_cols:
        make_columns(
            df,
            p_bins=p_bins,
            theta_bins=theta_bins,
            contrib_corr=True,
        )
        if drop_outside_bins:
            df = df.loc[
                np.logical_and.reduce(
                    [
                        df["p_bin"].values >= 0,
                        df["p_bin"].values < len(p_bins) - 1,
                        df["theta_bin"].values >= 0,
                        df["theta_bin"].values < len(theta_bins) - 1,
                    ]
                )
            ]

    if drop_nans:
        df = df.dropna()
        df = df[df["labels"] >= 0]
    return df
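
# End-to-end usage sketch (hypothetical path):
#     df = read_h5("merged.h5")
#     df = produce_analysis_df(df)
#     # df now carries labels, p_bin/theta_bin, lr_*, binary_lr_*, pid,
#     # pid_<det>, pid_no_<det>, and contrib_<det> columns for PID analysis.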