pidDataUtils.py
#!/usr/bin/env python3


import numpy as np
import pandas as pd
import h5py
import uproot


def _make_const_lists():
    """Moving this code into a function to avoid a top-level ROOT import."""
    import ROOT.Belle2

    PARTICLES, PDG_CODES = [], []
    for i in range(len(ROOT.Belle2.Const.chargedStableSet)):
        particle = ROOT.Belle2.Const.chargedStableSet.at(i)
        name = (particle.__repr__()[7:-1]
                .replace("-", "")
                .replace("+", "")
                .replace("euteron", ""))
        PARTICLES.append(name)
        PDG_CODES.append(particle.getPDGCode())
    # PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
    # PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]

    DETECTORS = []
    for det in ROOT.Belle2.Const.PIDDetectors.set():
        DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
    # DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

    return PARTICLES, PDG_CODES, DETECTORS


# PARTICLES, PDG_CODES, DETECTORS = _make_const_lists()
PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))


def _column(particle, detector):
    """Default column names for detector log-likelihoods.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    return f"{detector}_{particle}"


def root_column(particle, detector):
    """Column names for detector log-likelihoods found in our ROOT datafiles.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    pdg = PDG_CODES[PARTICLES.index(particle)]
    # note: 'Likelyhood' (sic) matches the spelling used in the ROOT branches
    return f"pidLogLikelyhoodOf{pdg}From{detector}"


def read_root(root_filenames):
    """Reads one or several ROOT datafiles into a DataFrame.

    Args:
        root_filenames (list(str) or str): If only one filename, can be given
            as a string. If more than one, should be given as a list or tuple.

    Returns:
        pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s).
    """

    return uproot.concatenate(root_filenames, library='pd')


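# A minimal usage sketch (hypothetical filenames; uproot also accepts the
# "file.root:treename" syntax to select a specific tree):
#
#     df = read_root(["ntuple_0.root", "ntuple_1.root"])

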
def make_h5(df, tags, out_filename, pdg=None, column=root_column):
    """Make an HDF5 file in our 'slim' format from the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        tags (list(str) or str): The particle tags used as a prefix for desired
            columns, e.g. for kaons in a D* decay this is 'DST_D0_K'. One or
            more can be given.
        out_filename (str): Output filename for the h5 file that will be
            written.
        pdg (int or None): The PDG code for the particles being
            extracted. If None, uses the values found in the 'mcPDG' column of
            the DataFrame. Defaults to None.
        column: A function which, given the particle and
            detector names, returns the column name for the corresponding
            detector log-likelihood. Defaults to root_column, which assumes
            column names are of the format
            f'pidLogLikelyhoodOf{pdg}From{detector}'.
    """

    if isinstance(tags, str):
        tags = [tags]

    def _concat(arrs):
        return np.concatenate(arrs) if len(arrs) > 1 else arrs[0]

    def _get_all(col):
        return _concat([df[f"{tag}_{col}"].values for tag in tags])

    with h5py.File(out_filename, "w") as f:
        if pdg is not None:
            pdg_values = np.ones(len(df) * len(tags)) * pdg
        else:
            pdg_values = np.abs(_get_all("mcPDG"))

        f.create_dataset("pdg", data=pdg_values)
        f.create_dataset("p", data=_get_all("p"))
        f.create_dataset("theta", data=np.arccos(_get_all("cosTheta")))
        f.create_dataset("phi", data=_get_all("phi"))

        for det in DETECTORS:
            for p in PARTICLES:
                data = _get_all(column(p, det))
                data[np.isnan(data)] = 0
                f.create_dataset(f"{det}/{p}", data=data)


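# A usage sketch (hypothetical tags and filenames): extract the kaon and pion
# candidates of a D* ntuple into per-species slim files with fixed PDG labels.
#
#     df = read_root("dstar_ntuple.root")
#     make_h5(df, "DST_D0_K", "kaons.h5", pdg=321)
#     make_h5(df, "DST_D0_pi", "pions.h5", pdg=211)

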
def merge_h5s(filenames, out_filename, pdgs=None):
    """Merge several HDF5 files in our 'slim' format together.

    Args:
        filenames (list(str)): Filenames of HDF5 files to be merged.
        out_filename (str): Output filename.
        pdgs (list(int)): The PDG tags for the particle types, one per
            filename, to overwrite the 'pdg' columns in those files when
            merging. If None, simply uses the 'pdg' columns from the files.
            Defaults to None.
    """
    fs = [h5py.File(fname, "r") for fname in filenames]
    m = h5py.File(out_filename, "w")

    keys = ["p", "theta", "phi"]
    keys += [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]

    for key in keys:
        m.create_dataset(key, data=np.concatenate([f[key][()] for f in fs]))

    if pdgs is not None:
        # replace 'pdg' data with the user-provided per-file codes
        m.create_dataset(
            "pdg",
            data=np.concatenate(
                [np.ones_like(f["pdg"][()]) * pdg for f, pdg in zip(fs, pdgs)]
            ),
        )
    else:
        m.create_dataset("pdg", data=np.concatenate([f["pdg"][()] for f in fs]))

    for f in fs:
        f.close()
    m.close()


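# A usage sketch (hypothetical filenames): merge the per-species files made
# above, overwriting each file's 'pdg' dataset with the given code.
#
#     merge_h5s(["kaons.h5", "pions.h5"], "merged.h5", pdgs=[321, 211])

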
def split_h5(
    filename,
    output_dir,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    shuffle=True,
    random_state=None,
):
    """Split the data in an HDF5 'slim' format file into train, validation,
    and test sets, stored in .npz files for ease of weight training.

    Args:
        filename (str): Filename of HDF5 input file.
        output_dir (str): Name of output directory, in which the train,
            validation, and test sets will be written. Will be created if it
            does not already exist.
        train_size (float): Fraction of the dataset to use for
            training. Defaults to 0.8.
        val_size (float): Fraction of the dataset to use for
            validation. Defaults to 0.1.
        test_size (float): Fraction of the dataset to use for testing.
            Defaults to 0.1.
        shuffle (bool): Whether to shuffle the dataset before
            splitting. Defaults to True.
        random_state (int or None): Random state for the shuffling.
            Defaults to None.
    """

    from sklearn.model_selection import train_test_split
    from os.path import join
    from os import makedirs

    assert train_size > 0, f"train_size ({train_size}) must be positive"
    assert val_size >= 0, f"val_size ({val_size}) may not be negative"
    assert test_size >= 0, f"test_size ({test_size}) may not be negative"
    assert val_size + test_size != 0, "val_size and test_size cannot both be zero"

    # if no validation set is requested, treat the single holdout set as
    # the validation set
    if val_size == 0:
        val_size = test_size
        test_size = 0

    # normalize the fractions so they sum to one
    if train_size + val_size + test_size != 1:
        total = train_size + val_size + test_size
        train_size = train_size / total
        val_size = val_size / total
        test_size = test_size / total

    # read data
    with h5py.File(filename, "r") as f:
        data = np.stack(
            [f[det][p][()] for p in PARTICLES for det in DETECTORS], axis=-1
        )
        p_data = f["p"][()]
        theta_data = f["theta"][()]
        labels = np.abs(f["pdg"][()])
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        mask = labels < 6

    X = data[mask]
    y = labels[mask]
    p = p_data[mask]
    t = theta_data[mask]

    makedirs(output_dir, exist_ok=True)
    kw = dict(shuffle=shuffle, random_state=random_state)

    # split once into train and holdout
    (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
        X, y, p, t, train_size=train_size, **kw
    )
    np.savez(join(output_dir, "train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)

    # split the holdout again if a test set is desired
    if test_size != 0:
        size = val_size / (1 - train_size)
        (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
            X, y, p, t, train_size=size, **kw
        )

        np.savez(join(output_dir, "val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
        np.savez(join(output_dir, "test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)

    else:
        np.savez(join(output_dir, "val.npz"), X=X, y=y, p=p, theta=t)


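# A usage sketch (hypothetical paths): an 80/10/10 split of the merged file,
# written to slim_dstar/ as train.npz, val.npz, and test.npz.
#
#     split_h5("merged.h5", "slim_dstar", random_state=42)

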
def softmax(x):
    """Performs softmax calculation with corrections to help prevent overflow.

    Note:
        This is the calculation used to convert log-likelihoods to likelihood
        ratios. Implementation following
        https://stackoverflow.com/a/67112412/18837571

    Args:
        x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated
            over the last dimension.

    Returns:
        :func:`numpy.array`: Softmaxed data.
    """
    maxes = np.amax(x, axis=-1, keepdims=True)
    x_exp = np.exp(x - maxes)
    return x_exp / np.sum(x_exp, axis=-1, keepdims=True)


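# Subtracting the row-wise max leaves the result unchanged (softmax is
# invariant under a common shift), so only log-likelihood differences matter.
# A quick sanity check with illustrative values:
#
#     logls = np.array([[-10.0, -12.0, -9.0]])
#     np.allclose(softmax(logls), softmax(logls + 100.0))  # True

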
def make_labels(df):
    """Make a 'labels' column for the DataFrame. The 'labels' column contains
    the particle type labels for each event: 0 for electron, 1 for muon, and
    so on.

    Args:
        df (pandas.DataFrame): DataFrame that the 'labels' column will be
            added to. Should not have NaN values in the 'pdg' column; if it
            does, a warning is printed and all labels are set to NaN.
    """
    labels = np.abs(df["pdg"].values)
    if np.count_nonzero(~np.isfinite(labels)):
        print(
            'Warning: dataset contains NaN values in the "pdg" column. '
            'This means the "labels" column cannot be made, so most of the '
            "pidplots methods will fail."
        )
        labels = np.ones_like(labels) * np.nan
    else:
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        labels[labels >= 6] = -1  # any other particle type gets label -1
    df["labels"] = labels


def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
    """Make 'p_bin' and 'theta_bin' columns in the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to add bins columns to.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
    """
    df["p_bin"] = np.digitize(df["p"].values, p_bins) - 1
    df["theta_bin"] = np.digitize(df["theta"].values, theta_bins) - 1


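# np.digitize returns 0 for underflow, so the "- 1" shift maps in-range values
# to bins 0..N-2 and out-of-range values to -1 or N-1 (illustrative values):
#
#     np.digitize([0.3, 0.7, 5.0], P_BINS) - 1  # -> array([-1,  0,  7])

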
def make_lrs(df, column=_column):
    """Makes likelihood ratio columns for each of the six particle types in the
    given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    # hypothesis log-likelihoods
    h_logls = np.stack(
        [
            np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
            for p in PARTICLES
        ],
        -1,
    )

    # compute likelihood ratios
    lrs = softmax(h_logls)
    for i, p in enumerate(PARTICLES):
        df[f"lr_{p}"] = lrs[:, i]


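# Because softmax normalizes each row, the six lr_* columns of any row are
# non-negative and sum to one:
#
#     make_lrs(df)
#     np.allclose(df[[f"lr_{p}" for p in PARTICLES]].sum(axis=1), 1.0)  # True

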
def make_binary_lrs(df, column=_column):
    """Makes binary likelihood ratio columns for each of the five non-pion
    particle type hypotheses in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for h in PARTICLES:
        if h == "pi":
            continue

        h_logls = np.stack(
            [
                np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
                for p in [h, "pi"]
            ],
            -1,
        )
        lrs = softmax(h_logls)
        df[f"binary_lr_{h}"] = lrs[:, 0]


def make_pid(df):
    """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
    predicted particle type. Requires likelihood ratio columns to exist.

    Args:
        df (pandas.DataFrame): DataFrame to which the 'pid' column will be added.
    """
    lrs = np.stack([df[f"lr_{p}"].values for p in PARTICLES], axis=-1)
    pids = np.argmax(lrs, axis=-1)
    df["pid"] = pids


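# The 'pid' column holds the index of the winning hypothesis (0 = e, 1 = mu,
# and so on), so after make_labels and make_lrs have been called, a simple
# accuracy check against the truth labels is:
#
#     make_pid(df)
#     accuracy = (df["pid"] == df["labels"]).mean()

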
def compute_det_lrs(d, det, column=_column):
    """Computes single-detector likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector for which the single-detector
            likelihood ratios will be calculated.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The detector likelihood ratios.
    """
    h_logls = d[[column(p, det) for p in PARTICLES]].values
    lrs = softmax(h_logls)
    return lrs


def make_pid_det(df, column=_column):
    """Makes single-detector PID columns for each of the detectors in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        # a log-likelihood of exactly zero is taken to mean "no information
        # from this detector", and those rows get PID -1
        mask = df[column("e", det)] == 0  # TODO: make more robust
        lrs = compute_det_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        pids[mask] = -1
        df[f"pid_{det}"] = pids


def compute_abl_lrs(d, det, column=_column):
    """Computes ablation likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector to be omitted for the ablation.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The ablation likelihood ratios.
    """

    def _cols(p):
        others = [det2 for det2 in DETECTORS if det2 != det]
        return [column(p, det2) for det2 in others]

    h_logls = np.stack([np.sum(d[_cols(p)].values, -1) for p in PARTICLES], -1)
    lrs = softmax(h_logls)
    return lrs


def make_pid_abl(df, column=_column):
    """Makes ablation PID columns for each of the detectors in the given
    DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        lrs = compute_abl_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        df[f"pid_no_{det}"] = pids


def compute_contrib(d, corr=True):
    """Computes the detector contributions.

    Args:
        d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.

    Returns:
        dict[str, :func:`numpy.array`]: The contributions of each detector.
    """
    out = dict()
    for det in DETECTORS:
        reg_lrs = d[[f"lr_{p}" for p in PARTICLES]].values
        abl_lrs = compute_abl_lrs(d, det)
        idx = d["labels" if corr else "pid"].values.astype(int)
        reg_lr = reg_lrs[np.arange(len(idx)), idx]
        abl_lr = abl_lrs[np.arange(len(idx)), idx]
        ctrb = reg_lr - abl_lr
        out[det] = ctrb
    return out


def make_contrib(df, corr=True):
    """Makes columns for the detector contributions in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.
    """
    ctrbs = compute_contrib(df, corr=corr)
    for det, ctrb in ctrbs.items():
        df[f"contrib_{det}"] = ctrb


def make_columns(
    df,
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    contrib_corr=True,
    column=_column,
):
    """Makes all the additional columns for a given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        contrib_corr (bool): Whether to compute contribution to the
            likelihood ratio of the _correct_ hypothesis (True) or the _chosen_
            hypothesis (False). Defaults to True.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    make_labels(df)
    make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
    make_lrs(df, column=column)
    make_binary_lrs(df, column=column)
    make_pid(df)
    make_pid_det(df, column=column)
    make_pid_abl(df, column=column)
    make_contrib(df, corr=contrib_corr)


def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
    """Applies the given weights to the log-likelihood data in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the weights are applied.
        weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`):
            The calibration weight values. If a dict, keys should be tuples of
            ints (momentum bin, theta bin), and each value is the six-by-six
            array of weights for that bin. If a single np.array, should be a
            six-by-six array of weights to be applied globally.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if weights is None:
        return

    # per-bin weights
    if isinstance(weights, dict):
        for p in range(len(p_bins) - 1):
            p_lo, p_hi = p_bins[p], p_bins[p + 1]
            p_mask = (df["p"] >= p_lo) & (df["p"] <= p_hi)

            for theta in range(len(theta_bins) - 1):
                t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
                t_mask = (df["theta"] >= t_lo) & (df["theta"] <= t_hi)

                for i, h in enumerate(PARTICLES):
                    for j, d in enumerate(DETECTORS):
                        w = weights[p, theta][i, j]
                        df.loc[(p_mask & t_mask), column(h, d)] *= w

    # global weights
    else:
        for i, h in enumerate(PARTICLES):
            for j, d in enumerate(DETECTORS):
                df[column(h, d)] *= weights[i, j]


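# A usage sketch (illustrative weights): globally scale down all ECL
# log-likelihoods by 20%. Rows index hypotheses, columns index detectors.
#
#     w = np.ones((len(PARTICLES), len(DETECTORS)))
#     w[:, DETECTORS.index("ECL")] = 0.8
#     apply_weights(df, w)

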
def cut_particles(df, allowed_particles, column=_column):
    """Cuts the log-likelihood data associated with given particle types.

    Args:
        df (pandas.DataFrame): DataFrame to which the cuts will be applied.
        allowed_particles (list(str)): List of allowed particle types. Any
            particle types not present will be cut, unless the list is empty
            (in which case no cuts are applied).
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if len(allowed_particles) == 0:
        return

    for p in PARTICLES:
        if p not in allowed_particles:
            for d in DETECTORS:
                df[column(p, d)] = -1e10


def read_h5(filename):
    """Read an HDF5 file in our 'slim' format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    df = pd.DataFrame()
    with h5py.File(filename, "r") as f:
        for key in ["pdg", "p", "theta", "phi"]:
            df[key] = f[key][()]
        for key in [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]:
            df_key = key.replace("/", "_")
            df[df_key] = f[key][()]
            df[df_key] = df[df_key].fillna(0)
    return df


def read_npz(filename):
    """Read an npz file in our training format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    data = np.load(filename)
    df = pd.DataFrame(
        data=data["X"],
        columns=[f"{d}_{p}" for p in PARTICLES for d in DETECTORS],
    )
    df["labels"] = data["y"]
    df["p"] = data["p"]
    df["theta"] = data["theta"]

    df["pdg"] = df["labels"]
    for i, pdg in enumerate(PDG_CODES):
        df.loc[df["labels"] == i, "pdg"] = pdg

    return df


def produce_analysis_df(
    df,
    compute_cols=True,
    drop_nans=True,
    drop_outside_bins=True,
    weights=None,
    allowed_particles=[],
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    column=None,
):
    """Prepares a DataFrame for PID analysis by applying weights, computing and
    adding additional columns, cutting NaNs, and more.

    Args:
        df (pandas.DataFrame): DataFrame to prepare for analysis.
        compute_cols (bool): Whether to compute and add additional
            columns. Defaults to True.
        drop_nans (bool): Whether to drop rows that contain NaNs.
            Defaults to True.
        drop_outside_bins (bool): Whether to drop rows for particles
            outside of the momentum and theta bins. Defaults to True.
        weights (dict or :func:`numpy.array`): Calibration weights to be
            applied to the detector log-likelihoods, as in apply_weights.
            Defaults to None.
        allowed_particles (list(str)): If not empty, specifies the
            allowed particle types. Any not-allowed particle types will be
            excluded from the PID calculations. If empty, all particle types
            are considered. Defaults to [].
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which given the particle and
            detector names returns the corresponding detector log-likelihood
            column name. If given, the data is first copied into columns of the
            default format f"{detector}_{particle}". Defaults to None, which
            assumes the default format is already in use.

    Returns:
        pandas.DataFrame: Return the prepared DataFrame. (Not all modifications
        in this method are in-place.)
    """
    if column is not None:
        for p in PARTICLES:
            for d in DETECTORS:
                df[f"{d}_{p}"] = df[column(p, d)]

    apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
    cut_particles(df, allowed_particles)

    if compute_cols:
        make_columns(
            df,
            p_bins=p_bins,
            theta_bins=theta_bins,
            contrib_corr=True,
        )
        if drop_outside_bins:
            df = df.loc[
                np.logical_and.reduce(
                    [
                        df["p_bin"].values >= 0,
                        df["p_bin"].values < len(p_bins) - 1,
                        df["theta_bin"].values >= 0,
                        df["theta_bin"].values < len(theta_bins) - 1,
                    ]
                )
            ]

    if drop_nans:
        df = df.dropna()
        df = df[df["labels"] >= 0]
    return df
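

# An end-to-end sketch (hypothetical paths; assumes the .npz files were
# produced by split_h5 above): load a test set, apply uniform calibration
# weights, compute all analysis columns, and check the overall PID accuracy.
#
#     df = read_npz("slim_dstar/test.npz")
#     w = np.ones((len(PARTICLES), len(DETECTORS)))
#     df = produce_analysis_df(df, weights=w)
#     print((df["pid"] == df["labels"]).mean())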