Belle II Software  release-08-01-10
utils.py
1 #!/usr/bin/env python3
2 
3 
10 """
11 This module contains various utility functions for the prompt calibration CAF scripts to use.
12 """
13 from basf2 import B2INFO
14 from collections import defaultdict, OrderedDict
15 from itertools import groupby
16 import ROOT
17 from caf.utils import ExpRun, IoV
18 from random import choice, shuffle
19 
20 
def filter_by_max_files_per_run(files_to_iov, max_files_per_run=1, min_events_per_file=0, random_select=False):
    """Build a reduced files_to_iov dictionary holding a limited number of files for each run.

    Files are visited in the iteration order of the input dictionary (or in a shuffled order if
    ``random_select`` is set) and accepted for their run until that run holds ``max_files_per_run`` files.

    Every IoV is assumed to cover a single run: only ``exp_low`` and ``run_low`` of the input IoV are read,
    both to identify the run and to rebuild the IoV stored in the output.

    Note that a run which has no accepted file yet always accepts the next suitable file, so at least one
    file per run can be chosen even when ``max_files_per_run`` is zero or negative.

    Parameters:
        files_to_iov (dict): The standard dictionary you might pass as input to a Calibration. It is of the form

            >>> files_to_iov = {"file_path.root": IoV(1,1,1,1),}

        max_files_per_run (int): The maximum number of files that will be added to the output dictionary for
            each run found in the input dictionary.

        min_events_per_file (int): The minimum number of events a file's tree must contain for the file to be
            accepted. A value of zero disables the check, in which case no file is opened.
        random_select (bool): If true the files are visited in a random order, otherwise in input order.

    Returns:
        OrderedDict: The same style of dict as the input files_to_iov, but filtered down.
    """
    B2INFO(f"Beginning filtering process to only choose {max_files_per_run} file(s) per run.")
    if min_events_per_file:
        B2INFO(f"We also require that each file must have at least {min_events_per_file} events in the tree.")

    if random_select:
        # Randomise the visiting order, then rebuild a mapping of the same type as the input
        shuffled_items = list(files_to_iov.items())
        shuffle(shuffled_items)
        files_to_iov = type(files_to_iov)(shuffled_items)

    # Accepted files per run. A run only gets an entry once its first file has been accepted, so the
    # order of the runs follows the order in which files were accepted.
    run_to_files = defaultdict(list)
    for input_file, file_iov in files_to_iov.items():
        run = ExpRun(exp=file_iov.exp_low, run=file_iov.run_low)
        # Look up without creating an entry, otherwise a rejected file would register its run early
        n_chosen = len(run_to_files.get(run, ()))
        if n_chosen and n_chosen >= max_files_per_run:
            continue
        # Only open the file to count events when a minimum was actually requested
        if min_events_per_file and events_in_basf2_file(input_file) < min_events_per_file:
            continue
        B2INFO(f"Choosing input file for {run}: {input_file}")
        run_to_files[run].append(input_file)

    # Flatten back into the files_to_iov layout. An OrderedDict is used so that the ordering is kept
    # explicitly. The IoVs are rebuilt as single runs, following the assumption stated above.
    return OrderedDict(
        (file_path, IoV(*run, *run))
        for run, run_files in run_to_files.items()
        for file_path in run_files
    )
73 
74 
def group_files_by_iov(files_to_iov):
    """
    Inverts the files_to_iov dictionary to give back a dictionary of IoV -> File list

    All files that share an IoV end up in the same list, wherever they appear in the input dictionary.
    The IoVs are ordered by their first appearance in the input, and the files of each IoV keep
    their input order.

    Parameters:
        files_to_iov (dict): {"/path/to/file1.root": IoV(1,1,1,1), "/path/to/file2.root": IoV(1,1,1,1)}

    Returns:
        OrderedDict: {IoV(1,1,1,1): ["/path/to/file1.root", "/path/to/file2.root"]}
    """
    iov_to_files = OrderedDict()
    # Accumulate per key rather than using itertools.groupby: groupby only merges *consecutive* equal
    # keys, so files of the same IoV that are not adjacent would overwrite each other and be lost.
    for file_path, iov in files_to_iov.items():
        iov_to_files.setdefault(iov, []).append(file_path)
    return iov_to_files
90 
91 
def filter_by_max_events_per_run(files_to_iov, max_events_per_run, random_select=False, max_events_per_file=0):
    """
    Build a reduced files_to_iov dictionary by picking files for each IoV until the summed number of
    events reaches ``max_events_per_run``.

    The IoVs are processed in sorted order. For each one, files are taken either from the front of its
    file list or at random, and their event counts are added up. Files without events are skipped.
    When ``max_events_per_file`` is positive, a file counts for at most that many events towards the total.

    Parameters:
        files_to_iov (dict): {"/path/to/file.root": IoV(1,1,1,1)} type dictionary. Same style as used by the CAF
            for lookup values.
        max_events_per_run (int): The threshold we want to reach. No more files are added once it is reached.
        random_select (bool): If true the files are picked at random, otherwise they are taken in order.
        max_events_per_file (int): Upper limit on the number of events a single file contributes to the
            running total. Zero or a negative value means no limit.

    Returns:
        OrderedDict: The same style of dict as the input files_to_iov, but filtered down.
    """
    new_files_to_iov = OrderedDict()

    # Files are grouped against their IoV first, then each IoV is reduced independently
    for iov, files in sorted(group_files_by_iov(files_to_iov).items()):
        run = ExpRun(iov.exp_low, iov.run_low)
        candidates = list(files)  # private copy, entries are removed as they are tried
        chosen_files = []
        total = 0

        while total < max_events_per_run and candidates:
            if random_select:
                file_path = choice(candidates)
                candidates.remove(file_path)
            else:
                file_path = candidates.pop(0)

            events = events_in_basf2_file(file_path)
            # Empty files are skipped
            if not events:
                B2INFO(f"No events in {file_path}, skipping...")
                continue

            # Cap the contribution of this file if a per-file limit was requested
            if max_events_per_file > 0:
                total += min(events, max_events_per_file)
            else:
                total += events
            chosen_files.append(file_path)
            B2INFO(f"Choosing input file for {run}: {file_path} and total events so far {total}")

        # An IoV for which nothing was chosen does not appear in the output at all
        if not chosen_files:
            B2INFO(f"No files chosen for {run}")
            continue

        # Go back to the files_to_iov layout
        for path in chosen_files:
            new_files_to_iov[path] = iov

    return new_files_to_iov
146 
147 
def filter_by_select_max_events_from_files(input_file_list, select_max_events_from_files):
    """
    This function creates a new list by picking random files until
    the requested number of events is reached for the data set.

    The input list is not modified; the selection works on a private copy.

    Parameters:
        input_file_list (list): ["/path/to/file1.root", "/path/to/file2.root"]
        select_max_events_from_files (int): The threshold we want to reach. No more files are added once it is
            reached.

    Returns:
        list: The sorted list of randomly chosen files, or an empty list if the files together do not
        contain enough events.
    """
    # Copy first: entries are removed while selecting and the caller's list must stay intact
    remaining_files = list(input_file_list)
    total = 0
    selected_files = []

    while total < select_max_events_from_files and remaining_files:
        file_path = choice(remaining_files)
        remaining_files.remove(file_path)

        events = events_in_basf2_file(file_path)
        # Empty files are skipped
        if not events:
            B2INFO(f"No events in {file_path}, skipping...")
            continue

        total += events
        selected_files.append(file_path)
        B2INFO(f"Choosing random input file: {file_path} and total events so far {total}")

    # Return an empty list if the requested number of events was not found
    if total < select_max_events_from_files:
        B2INFO(f"total events {total} are less than requested {select_max_events_from_files}")
        return []

    return sorted(selected_files)
187 
188 
def events_in_basf2_file(file_path):
    """Does a quick open and return of the number of entries in a basf2 file's tree object.

    The file is always closed again, even if reading the tree fails.

    Parameters:
        file_path (str): File path to ROOT file

    Returns:
        int: Number of entries in tree.

    Raises:
        OSError: If the file could not be opened.
    """
    f = ROOT.TFile.Open(file_path, "READ")
    # Depending on the ROOT version a failed open may hand back a null file instead of raising,
    # so check explicitly rather than failing later with an obscure error.
    if not f:
        raise OSError(f"Could not open ROOT file: {file_path}")
    try:
        if f.IsZombie():
            raise OSError(f"Could not open ROOT file: {file_path}")
        # 'tree' is the object whose entries are counted as the events of the file
        return f.tree.GetEntries()
    finally:
        f.Close()