Belle II Software  release-08-01-10
pidDataUtils.py
#!/usr/bin/env python3

import numpy as np
import pandas as pd
import h5py
import uproot


def _make_const_lists():
    """Moving this code into a function to avoid a top-level ROOT import."""
    import ROOT.Belle2

    PARTICLES, PDG_CODES = [], []
    for i in range(len(ROOT.Belle2.Const.chargedStableSet)):
        particle = ROOT.Belle2.Const.chargedStableSet.at(i)
        name = (particle.__repr__()[7:-1]
                .replace("-", "")
                .replace("+", "")
                .replace("euteron", ""))
        PARTICLES.append(name)
        PDG_CODES.append(particle.getPDGCode())
    # PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
    # PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]

    DETECTORS = []
    for det in ROOT.Belle2.Const.PIDDetectors.set():
        DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
    # DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

    return PARTICLES, PDG_CODES, DETECTORS


# PARTICLES, PDG_CODES, DETECTORS = _make_const_lists()
PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))


def _column(particle, detector):
    """Default column names for detector log-likelihoods.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    return f"{detector}_{particle}"


def root_column(particle, detector):
    """Column names for detector log-likelihoods found in our ROOT datafiles.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    pdg = PDG_CODES[PARTICLES.index(particle)]
    # 'Likelyhood' is spelled this way to match the branch names in the ROOT
    # datafiles
    return f"pidLogLikelyhoodOf{pdg}From{detector}"


def read_root(root_filenames):
    """Reads one or several ROOT datafiles into a DataFrame.

    Args:
        root_filenames (list(str) or str): If only one filename, can be given as
            a string. If more than one, should be given as a list or tuple.

    Returns:
        pandas.DataFrame: DataFrame containing the data of the ROOT datafile(s).
    """

    return uproot.concatenate(root_filenames, library='pd')
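
# Illustrative usage sketch (the filenames here are hypothetical):
#
#     df = read_root(["ntuple_1.root", "ntuple_2.root"])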


def make_h5(df, tags, out_filename, pdg=None, column=root_column):
    """Make an HDF5 file in our 'slim' format from the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        tags (list(str) or str): The particle tags used as a prefix for desired
            columns. E.g. for kaons in a D* decay, this is 'DST_D0_K'. One or
            more can be given.
        out_filename (str): Output filename for the h5 file that will be
            written.
        pdg (int or None): The PDG code for the particles being
            extracted. If None, uses the values found in the 'mcPDG' column of
            the DataFrame. Defaults to None.
        column: A function which, given the particle and
            detector names, returns the column name for the corresponding
            detector log-likelihood. Defaults to root_column, which assumes
            column names are of the format
            f'pidLogLikelyhoodOf{pdg}From{detector}'.
    """

    if isinstance(tags, str):
        tags = [tags]

    def _concat(arrs):
        return np.concatenate(arrs) if len(arrs) > 1 else arrs[0]

    def _get_all(col):
        return _concat([df[f"{tag}_{col}"].values for tag in tags])

    with h5py.File(out_filename, "w") as f:
        if pdg is not None:
            pdg_values = np.ones(len(df) * len(tags)) * pdg
        else:
            pdg_values = np.abs(_get_all("mcPDG"))

        f.create_dataset("pdg", data=pdg_values)
        f.create_dataset("p", data=_get_all("p"))
        f.create_dataset("theta", data=np.arccos(_get_all("cosTheta")))
        f.create_dataset("phi", data=_get_all("phi"))

        for det in DETECTORS:
            for p in PARTICLES:
                data = _get_all(column(p, det))
                data[np.isnan(data)] = 0
                f.create_dataset(f"{det}/{p}", data=data)
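
# Illustrative usage sketch (hypothetical filenames; the tag follows the
# 'DST_D0_K' convention described in the docstring):
#
#     df = read_root("dst_ntuple.root")
#     make_h5(df, "DST_D0_K", "kaons.h5", pdg=321)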


def merge_h5s(filenames, out_filename, pdgs=None):
    """Merge several HDF5 files in our 'slim' format together.

    Args:
        filenames (list(str)): Filenames of HDF5 files to be merged.
        out_filename (str): Output filename.
        pdgs (list(int)): The PDG tags for the particle types, one per
            filename, to overwrite the 'pdg' columns in those files when
            merging. If None, simply uses the 'pdg' columns from the files.
            Defaults to None.
    """
    fs = [h5py.File(fname, "r") for fname in filenames]
    m = h5py.File(out_filename, "w")

    keys = ["p", "theta", "phi"]
    keys += [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]

    for key in keys:
        m.create_dataset(key, data=np.concatenate([f[key][()] for f in fs]))

    if pdgs is not None:
        # replace the 'pdg' data with the given per-file PDG tags
        m.create_dataset(
            "pdg",
            data=np.concatenate(
                [np.ones_like(f["pdg"][()]) * pdg for f, pdg in zip(fs, pdgs)]
            ),
        )
    else:
        m.create_dataset("pdg", data=np.concatenate([f["pdg"][()] for f in fs]))

    for f in fs:
        f.close()
    m.close()
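
# Illustrative usage sketch (assumes 'kaons.h5' and 'pions.h5' were written
# by make_h5):
#
#     merge_h5s(["kaons.h5", "pions.h5"], "slim.h5", pdgs=[321, 211])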


def split_h5(
    filename,
    output_dir,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    shuffle=True,
    random_state=None,
):
    """Split the data in an HDF5 'slim' format file into train, validation,
    and test sets, stored in .npz files for ease of weight training.

    Args:
        filename (str): Filename of HDF5 input file.
        output_dir (str): Name of output directory, in which the train,
            validation, and test sets will be written. Will be created if it
            does not already exist.
        train_size (float): Fraction of the dataset to use for
            training. Defaults to 0.8.
        val_size (float): Fraction of the dataset to use for
            validation. Defaults to 0.1.
        test_size (float): Fraction of the dataset to use for testing.
            Defaults to 0.1.
        shuffle (bool): Whether to shuffle the dataset before
            splitting. Defaults to True.
        random_state (int or None): Random state for the shuffling.
            Defaults to None.
    """

    from sklearn.model_selection import train_test_split
    from os.path import join
    from os import makedirs

    assert train_size > 0, f"train_size ({train_size}) must be positive"
    assert val_size >= 0, f"val_size ({val_size}) may not be negative"
    assert test_size >= 0, f"test_size ({test_size}) may not be negative"
    assert val_size + test_size != 0, "val_size and test_size cannot both be zero"

    if val_size == 0:
        val_size = test_size
        test_size = 0

    if train_size + val_size + test_size != 1:
        total = train_size + val_size + test_size
        train_size = train_size / total
        val_size = val_size / total
        test_size = test_size / total

    # read data
    with h5py.File(filename, "r") as f:
        data = np.stack(
            [f[det][p][()] for p in PARTICLES for det in DETECTORS], axis=-1
        )
        p_data = f["p"][()]
        theta_data = f["theta"][()]
        labels = np.abs(f["pdg"][()])
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        mask = labels < 6

    X = data[mask]
    y = labels[mask]
    p = p_data[mask]
    t = theta_data[mask]

    makedirs(output_dir, exist_ok=True)
    kw = dict(shuffle=shuffle, random_state=random_state)

    # split once
    (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
        X, y, p, t, train_size=train_size, **kw
    )
    np.savez(join(output_dir, "train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)

    # split again if desired
    if test_size != 0:
        size = val_size / (1 - train_size)
        (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
            X, y, p, t, train_size=size, **kw
        )

        np.savez(join(output_dir, "val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
        np.savez(join(output_dir, "test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)

    else:
        # no separate test set requested; the remainder becomes the
        # validation set
        np.savez(join(output_dir, "val.npz"), X=X, y=y, p=p, theta=t)
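
# Illustrative usage sketch (hypothetical paths): the default 80/10/10 split,
# written to train.npz, val.npz, and test.npz under 'slim_split/':
#
#     split_h5("slim.h5", "slim_split", random_state=0)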


def softmax(x):
    """Performs softmax calculation with corrections to help prevent overflow.

    Note:
        This is the calculation used to convert log-likelihoods to likelihood
        ratios. Implementation following
        https://stackoverflow.com/a/67112412/18837571

    Args:
        x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated over the last
            dimension.

    Returns:
        :func:`numpy.array`: Softmaxed data.
    """
    maxes = np.amax(x, axis=-1, keepdims=True)
    x_exp = np.exp(x - maxes)
    return x_exp / np.sum(x_exp, axis=-1, keepdims=True)
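
# Illustrative sketch: subtracting the row-wise maximum keeps np.exp from
# overflowing on large log-likelihoods:
#
#     softmax(np.array([1000.0, 1001.0]))  # array([0.26894142, 0.73105858])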


def make_labels(df):
    """Make a 'labels' column for the DataFrame. The 'labels' column contains
    the particle type labels for each event: 0 for electron, 1 for muon, and so
    on.

    Args:
        df (pandas.DataFrame): DataFrame that 'labels' column will be added to. Must
            not have NaN values in the 'pdg' column.
    """
    labels = np.abs(df["pdg"].values)
    if np.count_nonzero(~np.isfinite(labels)):
        print(
            'Warning: dataset contains NaN values in the "pdg" column. '
            'This means the "labels" column cannot be made, so most of the '
            "pidplots methods will fail."
        )
        labels = np.ones_like(labels) * np.nan
    else:
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        labels[labels >= 6] = -1
    df["labels"] = labels


def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
    """Make 'p_bin' and 'theta_bin' columns in the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to add bins columns to.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
    """
    df["p_bin"] = np.digitize(df["p"].values, p_bins) - 1
    df["theta_bin"] = np.digitize(df["theta"].values, theta_bins) - 1
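
# Illustrative usage sketch (assumes a slim-format file read via read_h5,
# defined later in this module):
#
#     df = read_h5("slim.h5")
#     make_labels(df)
#     make_bins(df)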


def make_lrs(df, column=_column):
    """Makes likelihood ratio columns for each of the six particle types in the
    given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    # hypothesis log-likelihoods
    h_logls = np.stack(
        [
            np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
            for p in PARTICLES
        ],
        -1,
    )

    # compute likelihood ratios
    lrs = softmax(h_logls)
    for i, p in enumerate(PARTICLES):
        df[f"lr_{p}"] = lrs[:, i]
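
# Illustrative usage sketch (assumes the default f"{detector}_{particle}"
# columns, as produced by read_h5):
#
#     make_lrs(df)
#     df[[f"lr_{p}" for p in PARTICLES]].head()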


def make_binary_lrs(df, column=_column):
    """Makes binary likelihood ratio columns for each of the five non-pion
    particle type hypotheses in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for h in PARTICLES:
        if h == "pi":
            continue

        h_logls = np.stack(
            [
                np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
                for p in [h, "pi"]
            ],
            -1,
        )
        lrs = softmax(h_logls)
        df[f"binary_lr_{h}"] = lrs[:, 0]


def make_pid(df):
    """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
    predicted particle type. Requires likelihood ratio columns to exist.

    Args:
        df (pandas.DataFrame): DataFrame to which the 'pid' column will be added.
    """
    lrs = np.stack([df[f"lr_{p}"].values for p in PARTICLES], axis=-1)
    pids = np.argmax(lrs, axis=-1)
    df["pid"] = pids


def compute_det_lrs(d, det, column=_column):
    """Computes single-detector likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector for which the single-detector
            likelihood ratios will be calculated.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The detector likelihood ratios.
    """
    h_logls = d[[column(p, det) for p in PARTICLES]].values
    lrs = softmax(h_logls)
    return lrs


def make_pid_det(df, column=_column):
    """Makes single-detector PID columns for each of the detectors in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        mask = df[column("e", det)] == 0  # TODO: make more robust
        lrs = compute_det_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        pids[mask] = -1
        df[f"pid_{det}"] = pids


def compute_abl_lrs(d, det, column=_column):
    """Computes ablation likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector log-likelihoods.
        det (str): The name of the detector to be omitted for the ablation.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The ablation likelihood ratios.
    """

    def _cols(p):
        others = [det2 for det2 in DETECTORS if det2 != det]
        return [column(p, det2) for det2 in others]

    h_logls = np.stack([np.sum(d[_cols(p)].values, -1) for p in PARTICLES], -1)
    lrs = softmax(h_logls)
    return lrs


def make_pid_abl(df, column=_column):
    """Makes ablation PID columns for each of the detectors in the given
    DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    for det in DETECTORS:
        lrs = compute_abl_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        df[f"pid_no_{det}"] = pids


def compute_contrib(d, corr=True):
    """Computes the detector contributions.

    Args:
        d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.

    Returns:
        dict[str, :func:`numpy.array`]: The contributions of each detector.
    """
    out = dict()
    for det in DETECTORS:
        reg_lrs = d[[f"lr_{p}" for p in PARTICLES]].values
        abl_lrs = compute_abl_lrs(d, det)
        idx = d["labels" if corr else "pid"].values.astype(int)
        reg_lr = reg_lrs[np.arange(len(idx)), idx]
        abl_lr = abl_lrs[np.arange(len(idx)), idx]
        ctrb = reg_lr - abl_lr
        out[det] = ctrb
    return out


def make_contrib(df, corr=True):
    """Makes columns for the detector contributions in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.
    """
    ctrbs = compute_contrib(df, corr=corr)
    for det, ctrb in ctrbs.items():
        df[f"contrib_{det}"] = ctrb


def make_columns(
    df,
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    contrib_corr=True,
    column=_column,
):
    """Makes all the additional columns for a given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        contrib_corr (bool): Whether to compute contribution to the
            likelihood ratio of the _correct_ hypothesis (True) or the _chosen_
            hypothesis (False). Defaults to True.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    make_labels(df)
    make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
    make_lrs(df, column=column)
    make_binary_lrs(df, column=column)
    make_pid(df)
    make_pid_det(df, column=column)
    # note: the ablation and contribution columns are always computed from the
    # default-format columns, regardless of the 'column' argument
    make_pid_abl(df)
    make_contrib(df, corr=contrib_corr)
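
# Illustrative usage sketch: add every derived column in one call:
#
#     df = read_h5("slim.h5")
#     make_columns(df)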


def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
    """Applies the given weights to the log-likelihood data in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the weights are applied.
        weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`): The
            calibration weight values. If a dict, keys should be a tuple of
            ints, and each value is the six-by-six array of weights for the
            bin. If a single np.array, should be a six-by-six array of weights
            to be applied globally.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if weights is None:
        return

    # per-bin weights
    if isinstance(weights, dict):
        for p in range(len(p_bins) - 1):
            p_lo, p_hi = p_bins[p], p_bins[p + 1]
            p_mask = (df["p"] >= p_lo) & (df["p"] <= p_hi)

            for theta in range(len(theta_bins) - 1):
                t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
                t_mask = (df["theta"] >= t_lo) & (df["theta"] <= t_hi)

                for i, h in enumerate(PARTICLES):
                    for j, d in enumerate(DETECTORS):
                        df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][i, j]

    # global weights
    else:
        for i, h in enumerate(PARTICLES):
            for j, d in enumerate(DETECTORS):
                df[column(h, d)] *= weights[i, j]
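
# Illustrative usage sketch: apply a single global six-by-six (particle-by-
# detector) weight matrix; all-ones weights, shown here, are a no-op:
#
#     apply_weights(df, np.ones((6, 6)))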


def cut_particles(df, allowed_particles, column=_column):
    """Cuts the log-likelihood data associated with given particle types.

    Args:
        df (pandas.DataFrame): DataFrame to which the cuts will be applied.
        allowed_particles (list(str)): List of allowed particle types. Any
            particle types not present will be cut, unless the list is empty (in
            which case no cuts are applied).
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. Defaults to _column, which gives column names of the
            format f"{detector}_{particle}".
    """
    if len(allowed_particles) == 0:
        return

    for p in PARTICLES:
        if p not in allowed_particles:
            for d in DETECTORS:
                df[column(p, d)] = -1e10


def read_h5(filename):
    """Read an HDF5 file in our 'slim' format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    df = pd.DataFrame()
    with h5py.File(filename, "r") as f:
        for key in ["pdg", "p", "theta", "phi"]:
            df[key] = f[key][()]
        for key in [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]:
            df_key = key.replace("/", "_")
            df[df_key] = f[key][()]
            df[df_key] = df[df_key].fillna(0)
    return df


def read_npz(filename):
    """Read an npz file in our training format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    data = np.load(filename)
    df = pd.DataFrame(
        data=data["X"], columns=[f"{d}_{p}" for p in PARTICLES for d in DETECTORS],
    )
    df["labels"] = data["y"]
    df["p"] = data["p"]
    df["theta"] = data["theta"]

    df["pdg"] = df["labels"]
    for i, pdg in enumerate(PDG_CODES):
        df.loc[df["labels"] == i, "pdg"] = pdg

    return df
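
# Illustrative usage sketch (assumes the .npz files written by split_h5):
#
#     train_df = read_npz("slim_split/train.npz")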


def produce_analysis_df(
    df,
    compute_cols=True,
    drop_nans=True,
    drop_outside_bins=True,
    weights=None,
    allowed_particles=[],
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    column=None,
):
    """Prepares a DataFrame for PID analysis by applying weights, computing and
    adding additional columns, cutting NaNs, and more.

    Args:
        df (pandas.DataFrame): DataFrame to prepare for analysis.
        compute_cols (bool): Whether to compute and add additional
            columns. Defaults to True.
        drop_nans (bool): Whether to drop rows that contain NaNs.
            Defaults to True.
        drop_outside_bins (bool): Whether to drop rows for particles
            outside of the momentum and theta bins. Defaults to True.
        weights (:func:`numpy.array`): Calibration weights to be applied to the
            detector log-likelihoods. Defaults to None.
        allowed_particles (list(str)): If not empty, specifies the
            allowed particle types. Any disallowed particle types will be
            excluded from the PID calculations. If empty, all particle types
            are considered. Defaults to [].
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in radians.
            Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115, 133, 150]
            degrees.
        column: A function which, given the particle and
            detector names, returns the corresponding detector log-likelihood
            column name. If given, the log-likelihood data is first copied into
            columns of the default format f"{detector}_{particle}". Defaults to
            None.

    Returns:
        pandas.DataFrame: Returns the prepared DataFrame. (Not all
        modifications in this method are in-place.)
    """
    if column is not None:
        for p in PARTICLES:
            for d in DETECTORS:
                df[f"{d}_{p}"] = df[column(p, d)]

    apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
    cut_particles(df, allowed_particles)

    if compute_cols:
        make_columns(
            df,
            p_bins=p_bins,
            theta_bins=theta_bins,
            contrib_corr=True,
        )
        if drop_outside_bins:
            df = df.loc[
                np.logical_and.reduce(
                    [
                        df["p_bin"].values >= 0,
                        df["p_bin"].values < len(p_bins) - 1,
                        df["theta_bin"].values >= 0,
                        df["theta_bin"].values < len(theta_bins) - 1,
                    ]
                )
            ]

    if drop_nans:
        df = df.dropna()
        df = df[df["labels"] >= 0]
    return df
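
# Illustrative end-to-end sketch (hypothetical filename): read a slim-format
# file, then weight, label, bin, and compute PID columns in one call:
#
#     df = read_h5("slim.h5")
#     df = produce_analysis_df(df)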