Belle II Software  release-05-01-25
utils.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 """
4 This module contains various utility functions for the prompt calibration CAF scripts to use.
5 """
6 from basf2 import B2INFO
7 from collections import defaultdict, OrderedDict
8 from itertools import groupby
9 import ROOT
10 from caf.utils import ExpRun, IoV
11 from random import choice
12 
13 
def filter_by_max_files_per_run(files_to_iov, max_files_per_run=1, min_events_per_file=0):
    """Create a new files_to_iov dictionary with at most ``max_files_per_run`` files per run.

    Files are visited in the iteration order of the input dictionary and are added
    until the maximum number of files for their run is reached. After this no more
    files are added for that run.

    It makes the assumption that the IoV is a single run, and that the exp_low and run_low
    of the IoV object can be used to create the ExpRun for comparison of whether to add a
    new input file.

    Parameters:
        files_to_iov (dict): The standard dictionary you might use as input to a Calibration.
            It is of the form

            >>> files_to_iov = {"file_path.root": IoV(1,1,1,1),}

        max_files_per_run (int): The maximum number of files that we will add to the output
            dictionary for each run in the input dictionary. A value of zero (or less)
            selects no files at all.

        min_events_per_file (int): The minimum number of events that is allowed to be in any
            included file's tree. A falsy value (the default 0) disables the check, in which
            case the files are never opened.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down. The IoV of
        every kept file is rebuilt from its (exp_low, run_low) pair as a single-run IoV.
    """
    B2INFO(f"Beginning filtering process to only choose {max_files_per_run} file(s) per run.")
    if min_events_per_file:
        B2INFO(f"We also require that each file must have at least {min_events_per_file} events in the tree.")

    # Our dictionary for appending files to and checking the number per run
    run_to_files = defaultdict(list)
    for input_file, file_iov in files_to_iov.items():
        run = ExpRun(exp=file_iov.exp_low, run=file_iov.run_low)
        # Compare the current count directly against the limit. Testing the list for
        # emptiness first would always admit one file, even for max_files_per_run <= 0.
        # .get() avoids creating empty entries in the defaultdict for skipped runs.
        if len(run_to_files.get(run, ())) >= max_files_per_run:
            continue
        # Only open the file to count events when a minimum was actually requested
        if min_events_per_file and events_in_basf2_file(input_file) < min_events_per_file:
            continue
        B2INFO(f"Choosing input file for {run}: {input_file}")
        run_to_files[run].append(input_file)

    # run_to_files was useful for looking up number of files per run. But we want to invert
    # this back to a files_to_iov object, just with fewer files.
    # An OrderedDict is used so that the selection order is kept explicitly.
    new_files_to_iov = OrderedDict()
    for run, run_files in run_to_files.items():
        for file_path in run_files:
            # We made the assumption that the IoVs are single runs
            new_files_to_iov[file_path] = IoV(*run, *run)
    return new_files_to_iov
59 
60 
def group_files_by_iov(files_to_iov):
    """
    Inverts the files_to_iov dictionary to give back a dictionary of IoV -> File list

    All files that share an IoV end up in the same list, regardless of whether they are
    adjacent in the input dictionary. IoV keys appear in the order in which each IoV is
    first seen, and files keep their input order within each list.

    Parameters:
        files_to_iov (dict): {"/path/to/file1.root": IoV(1,1,1,1), "/path/to/file2.root": IoV(1,1,1,1)}

    Returns:
        dict: {IoV(1,1,1,1): ["/path/to/file1.root", "/path/to/file2.root"]}
    """
    iov_to_files = OrderedDict()
    # Accumulate per key instead of using itertools.groupby: groupby only merges
    # *consecutive* items, so an unsorted input would produce repeated keys whose
    # later groups overwrite (and silently drop) the earlier ones.
    for file_path, iov in files_to_iov.items():
        iov_to_files.setdefault(iov, []).append(file_path)
    return iov_to_files
76 
77 
def filter_by_max_events_per_run(files_to_iov, max_events_per_run, random_select=False):
    """
    Build a reduced files_to_iov dictionary by taking files for each IoV until the
    accumulated number of events reaches ``max_events_per_run``.

    Parameters:
        files_to_iov (dict): {"/path/to/file.root": IoV(1,1,1,1)} type dictionary. Same style as
            used by the CAF for lookup values.
        max_events_per_run (int): Event threshold per IoV. Files keep being added while the
            running total is below this value.
        random_select (bool): If True the next file is drawn at random from the files not yet
            tried, otherwise the files are taken in their listed order.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down.
    """
    # Files grouped under their IoV, processed in sorted IoV order
    grouped = group_files_by_iov(files_to_iov)
    kept_per_iov = OrderedDict()

    for iov, files in sorted(grouped.items()):
        run = ExpRun(iov.exp_low, iov.run_low)
        # Work on a copy so the grouped lists are left untouched
        candidates = files[:]
        accepted = []
        n_events = 0
        while n_events < max_events_per_run and candidates:
            if not random_select:
                file_path = candidates.pop(0)
            else:
                file_path = choice(candidates)
                candidates.remove(file_path)

            events = events_in_basf2_file(file_path)
            if events:
                n_events += events
                accepted.append(file_path)
                B2INFO(f"Choosing input file for {run}: {file_path} and total events so far {n_events}")
            else:
                # Files without events contribute nothing and are left out
                B2INFO(f"No events in {file_path}, skipping...")

        # An IoV for which nothing was accepted gets no entry at all
        if not accepted:
            B2INFO(f"No files chosen for {run}")
            continue
        kept_per_iov[iov] = accepted

    # Flatten back into the file -> IoV layout
    return OrderedDict(
        (path, iov)
        for iov, paths in kept_per_iov.items()
        for path in paths
    )
130 
131 
def filter_by_select_max_events_from_files(input_file_list, select_max_events_from_files):
    """
    This function creates a new list by appending random files until
    the maximum number of events are reached per data set.

    The input list is not modified; the random draws are made from an internal copy.

    Parameters:
        input_file_list (list): ["/path/to/file1.root", "/path/to/file2.root"]
        select_max_events_from_files (int): The threshold we want to reach but stop adding files
            if we reach it.

    Returns:
        list: The sorted list of randomly selected files, or an empty list if the available
        files do not contain enough events to reach the threshold.
    """
    # Draw from a copy so that the caller's list is left intact
    remaining_files = list(input_file_list)
    selected_files = []
    total = 0

    while total < select_max_events_from_files and remaining_files:
        file_path = choice(remaining_files)
        remaining_files.remove(file_path)

        events = events_in_basf2_file(file_path)
        # Empty files are skipped
        if not events:
            B2INFO(f"No events in {file_path}, skipping...")
            continue

        total += events
        selected_files.append(file_path)
        B2INFO(f"Choosing random input file: {file_path} and total events so far {total}")

    # Return an empty list if the requested number of events was not found
    if total < select_max_events_from_files:
        B2INFO(f"total events {total} are less than requested {select_max_events_from_files}")
        selected_files = []

    return sorted(selected_files)
171 
172 
def events_in_basf2_file(file_path):
    """Does a quick open and return of the number of entries in a basf2 file's tree object.

    Parameters:
        file_path (str): File path to ROOT file

    Returns:
        int: Number of entries in tree.

    Raises:
        OSError: If the file could not be opened.
    """
    f = ROOT.TFile.Open(file_path, "READ")
    # A failed open gives back a null or zombie file object; report that clearly
    # instead of failing later on the attribute access.
    if not f or f.IsZombie():
        raise OSError(f"Could not open ROOT file: {file_path}")
    try:
        # The object named "tree" in the file is the one whose entries are counted
        return f.tree.GetEntries()
    finally:
        # Always release the file handle, also when the tree lookup fails
        f.Close()