Belle II Software  release-06-00-14
utils.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 """
12 This module contains various utility functions for the prompt calibration CAF scripts to use.
13 """
14 from basf2 import B2INFO
15 from collections import defaultdict, OrderedDict
16 from itertools import groupby
17 import ROOT
18 from caf.utils import ExpRun, IoV
19 from random import choice, shuffle
20 
21 
def filter_by_max_files_per_run(files_to_iov, max_files_per_run=1, min_events_per_file=0, random_select=False):
    """Build a reduced files_to_iov dictionary holding a limited number of files per run.

    Files are visited in dictionary order (or in shuffled order when ``random_select``
    is set) and accepted until the run they belong to holds ``max_files_per_run`` files.
    Further files of that run are ignored.

    Every IoV is assumed to cover a single run: only ``exp_low`` and ``run_low`` of the
    IoV are read to build the ExpRun that identifies the run, and the IoVs in the
    returned dictionary are rebuilt from that ExpRun.

    Note that a run without any accepted file always takes its first passing file,
    whatever the value of ``max_files_per_run``.

    Parameters:
        files_to_iov (dict): The standard dictionary you might pass as input to a Calibration.
            It is of the form

            >>> files_to_iov = {"file_path.root": IoV(1,1,1,1),}

        max_files_per_run (int): The maximum number of files that will be added to the output
            dictionary for each run in the input dictionary.
        min_events_per_file (int): The minimum number of events an included file's tree must
            contain. A false value (the default 0) disables the check, in which case the
            files are not opened at all.
        random_select (bool): True selects files in random order, False takes the first ones.

    Returns:
        dict: An OrderedDict in the same style as the input files_to_iov, but filtered down.
    """
    B2INFO(f"Beginning filtering process to only choose {max_files_per_run} file(s) per run.")
    if min_events_per_file:
        B2INFO(f"We also require that each file must have at least {min_events_per_file} events in the tree.")

    # A random selection is obtained by shuffling the items and rebuilding a mapping of the same type
    if random_select:
        shuffled_items = list(files_to_iov.items())
        shuffle(shuffled_items)
        files_to_iov = type(files_to_iov)(shuffled_items)

    # Accepted files per run; a run only gets an entry once a file has been accepted for it
    accepted = defaultdict(list)
    for path, iov in files_to_iov.items():
        exp_run = ExpRun(exp=iov.exp_low, run=iov.run_low)
        # .get() is used on purpose so that looking up a run does not create an empty entry
        already_chosen = accepted.get(exp_run)
        if already_chosen and len(already_chosen) >= max_files_per_run:
            continue
        # The file is only opened when a minimum event count was actually requested
        if min_events_per_file and events_in_basf2_file(path) < min_events_per_file:
            continue
        B2INFO(f"Choosing input file for {exp_run}: {path}")
        accepted[exp_run].append(path)

    # Invert back to the files_to_iov layout. Each IoV is rebuilt as a single-run IoV
    # from the ExpRun, following the single-run assumption stated above.
    return OrderedDict(
        (path, IoV(*exp_run, *exp_run))
        for exp_run, paths in accepted.items()
        for path in paths
    )
74 
75 
def group_files_by_iov(files_to_iov):
    """
    Inverts the files_to_iov dictionary to give back a dictionary of IoV -> File list

    Files sharing an IoV are collected into one list even when they are not adjacent in
    the input dictionary. IoVs appear in the output in the order they are first seen, and
    the files of each IoV keep their input order.

    Parameters:
        files_to_iov (dict): {"/path/to/file1.root": IoV(1,1,1,1), "/path/to/file2.root": IoV(1,1,1,1)}

    Returns:
        dict: {IoV(1,1,1,1): ["/path/to/file1.root", "/path/to/file2.root"]}
    """
    iov_to_files = OrderedDict()
    for file_path, iov in files_to_iov.items():
        # Accumulate instead of assigning: a run-length grouping would overwrite the files
        # of an IoV that shows up again later in the input.
        iov_to_files.setdefault(iov, []).append(file_path)
    return iov_to_files
91 
92 
def filter_by_max_events_per_run(files_to_iov, max_events_per_run, random_select=False):
    """
    Build a reduced files_to_iov dictionary by adding files for each IoV until the
    requested number of events is reached.

    The IoVs are processed in sorted order. For every IoV, files are taken one at a time
    (first-come or at random) and their event counts are summed. Selection stops as soon
    as the sum is no longer below ``max_events_per_run`` or no files are left. Files
    without events are skipped, and IoVs for which no file was chosen are left out.

    Parameters:
        files_to_iov (dict): {"/path/to/file.root": IoV(1,1,1,1)} type dictionary. Same style as used by the CAF
            for lookup values.
        max_events_per_run (int): The threshold we want to reach; no more files are added once it is reached.
        random_select (bool): True picks files at random, False takes them in their listed order.

    Returns:
        dict: An OrderedDict in the same style as the input files_to_iov, but filtered down.
    """
    # Files grouped under their IoV
    grouped = group_files_by_iov(files_to_iov)
    # Filled directly in the files_to_iov layout, IoV by IoV
    reduced_files_to_iov = OrderedDict()

    for iov, candidates in sorted(grouped.items()):
        exp_run = ExpRun(iov.exp_low, iov.run_low)
        # Copy, because the pool is consumed while files are being picked
        pool = list(candidates)
        accepted = []
        n_events = 0
        while pool and n_events < max_events_per_run:
            if random_select:
                picked = choice(pool)
                pool.remove(picked)
            else:
                picked = pool.pop(0)

            n_in_file = events_in_basf2_file(picked)
            if n_in_file:
                n_events += n_in_file
                accepted.append(picked)
                B2INFO(f"Choosing input file for {exp_run}: {picked} and total events so far {n_events}")
            else:
                # Empty files do not count towards the total and are not kept
                B2INFO(f"No events in {picked}, skipping...")

        if not accepted:
            # Nothing usable for this run, so it gets no entries at all
            B2INFO(f"No files chosen for {exp_run}")
            continue
        reduced_files_to_iov.update((path, iov) for path in accepted)

    return reduced_files_to_iov
145 
146 
def filter_by_select_max_events_from_files(input_file_list, select_max_events_from_files):
    """
    This function creates a new list by appending random files until
    the maximum number of events is reached per data set.

    The input list is left untouched: the selection is made on an internal copy.

    Parameters:
        input_file_list (list): ["/path/to/file1.root", "/path/to/file2.root"]
        select_max_events_from_files (int): The threshold we want to reach; no more files are added once it is reached.

    Returns:
        list: The sorted list of randomly chosen files, or an empty list if the files
        together do not hold the requested number of events.
    """
    # Work on a copy so that picking files does not remove them from the caller's list
    candidates = list(input_file_list)

    total = 0
    selected_files = []
    while total < select_max_events_from_files and candidates:
        file_path = choice(candidates)
        candidates.remove(file_path)

        events = events_in_basf2_file(file_path)
        # Empty files are skipped
        if not events:
            B2INFO(f"No events in {file_path}, skipping...")
            continue

        total += events
        selected_files.append(file_path)
        B2INFO(f"Choosing random input file: {file_path} and total events so far {total}")

    # Return an empty list if the requested number of events was not found
    if total < select_max_events_from_files:
        B2INFO(f"total events {total} are less than requested {select_max_events_from_files}")
        return []

    return sorted(selected_files)
186 
187 
def events_in_basf2_file(file_path):
    """Does a quick open and return of the number of entries in a basf2 file's tree object.

    The file is closed again before returning, also when reading the tree fails.

    Parameters:
        file_path (str): File path to ROOT file

    Returns:
        int: Number of entries in tree.
    """
    f = ROOT.TFile.Open(file_path, "READ")
    try:
        return f.tree.GetEntries()
    finally:
        # Always release the file, even if the 'tree' lookup or GetEntries() raised.
        # NOTE(review): relies on PyROOT null objects (failed Open) being falsy, so that
        # closing is skipped and the original error is not masked -- confirm.
        if f:
            f.Close()