Belle II Software  release-05-02-19
utils.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 """
4 This module contains various utility functions for the prompt calibration CAF scripts to use.
5 """
6 from basf2 import B2INFO
7 from collections import defaultdict, OrderedDict
8 from itertools import groupby
9 import ROOT
10 from caf.utils import ExpRun, IoV
11 from random import choice, shuffle
12 
13 
def filter_by_max_files_per_run(files_to_iov, max_files_per_run=1, min_events_per_file=0, random_select=False):
    """
    Build a reduced files_to_iov dictionary holding a limited number of files for each run.

    Files are visited in the iteration order of the input dictionary (or in a shuffled order
    when ``random_select`` is set) and are accepted until their run has collected
    ``max_files_per_run`` files. Later files of that run are ignored.

    Every IoV is assumed to describe a single run: only ``exp_low`` and ``run_low`` of the IoV
    are read to build the ExpRun used for counting, and the IoVs in the returned dictionary are
    rebuilt from that ExpRun.

    Parameters:
        files_to_iov (dict): The standard dictionary you might give as input to a Calibration.
            It is of the form

            >>> files_to_iov = {"file_path.root": IoV(1,1,1,1),}

        max_files_per_run (int): The maximum number of files added to the output dictionary for
            each run of the input dictionary. The first accepted file of a run is always added,
            whatever this value is.
        min_events_per_file (int): The minimum number of entries a file's tree must have for the
            file to be included. A false value (the default 0) disables the check, in which case
            the files are not opened at all.
        random_select (bool): If True the files are considered in random order, otherwise in the
            order of the input dictionary.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down.
    """
    B2INFO(f"Beginning filtering process to only choose {max_files_per_run} file(s) per run.")
    if min_events_per_file:
        B2INFO(f"We also require that each file must have at least {min_events_per_file} events in the tree.")

    if random_select:
        # Randomise the visiting order, then rebuild a mapping of the same type as the input
        shuffled_items = list(files_to_iov.items())
        shuffle(shuffled_items)
        files_to_iov = type(files_to_iov)(shuffled_items)

    # Accepted files per run, also used to count how many each run already has
    run_to_files = defaultdict(list)
    for input_file, file_iov in files_to_iov.items():
        run = ExpRun(exp=file_iov.exp_low, run=file_iov.run_low)
        accepted_so_far = run_to_files.get(run)
        # Entries only exist once a file was appended, so a missing run means "nothing accepted yet"
        if accepted_so_far and len(accepted_so_far) >= max_files_per_run:
            continue
        # The file is only opened when an event threshold was actually requested
        if min_events_per_file and events_in_basf2_file(input_file) < min_events_per_file:
            continue
        B2INFO(f"Choosing input file for {run}: {input_file}")
        run_to_files[run].append(input_file)

    # Flatten the per-run lists back into a files_to_iov mapping. An OrderedDict is used so the
    # ordering is kept independently of the behaviour of plain dictionaries.
    # We made the assumption that the IoVs are single runs, so the ExpRun is used as both ends.
    return OrderedDict(
        (file_path, IoV(*run, *run))
        for run, run_files in run_to_files.items()
        for file_path in run_files
    )
66 
67 
def group_files_by_iov(files_to_iov):
    """
    Inverts the files_to_iov dictionary to give back a dictionary of IoV -> File list

    All files sharing an IoV end up in the same list, whether or not they are adjacent in the
    input dictionary. IoVs appear in the output in the order in which they are first met, and the
    files of each IoV keep their input order.

    Parameters:
        files_to_iov (dict): {"/path/to/file1.root": IoV(1,1,1,1), "/path/to/file2.root": IoV(1,1,1,1)}

    Returns:
        dict: {IoV(1,1,1,1): ["/path/to/file1.root", "/path/to/file2.root"]}
    """
    iov_to_files = OrderedDict()
    # Accumulate per key instead of using itertools.groupby: groupby only merges *consecutive*
    # equal keys, so files of one IoV that are not adjacent would overwrite each other.
    for file_path, iov in files_to_iov.items():
        iov_to_files.setdefault(iov, []).append(file_path)
    return iov_to_files
83 
84 
def filter_by_max_events_per_run(files_to_iov, max_events_per_run, random_select=False):
    """
    Build a reduced files_to_iov dictionary by taking files of each IoV until an event threshold is met.

    The files are first grouped by IoV. The IoVs are processed in sorted order, and for each one
    files are added until the running total of events reaches ``max_events_per_run`` or no files
    are left. Files reporting no events are skipped.

    Parameters:
        files_to_iov (dict): {"/path/to/file.root": IoV(1,1,1,1)} type dictionary. Same style as used by the CAF
            for lookup values.
        max_events_per_run (int): The threshold we want to reach; no more files are added once it is reached.
        random_select (bool): If True the next file is picked at random from the files left for the
            IoV, otherwise the files are taken in order.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down.
    """
    # Invert the dictionary so that files are grouped against the same IoV
    iov_to_files = group_files_by_iov(files_to_iov)
    # The reduced output is filled IoV by IoV
    new_files_to_iov = OrderedDict()

    for iov, files in sorted(iov_to_files.items()):
        # Only used for the log messages
        run = ExpRun(iov.exp_low, iov.run_low)
        # Private copy, since candidates are consumed while selecting
        candidates = list(files)
        chosen_files = []
        total = 0
        while total < max_events_per_run and candidates:
            if random_select:
                file_path = choice(candidates)
                candidates.remove(file_path)
            else:
                file_path = candidates.pop(0)

            events = events_in_basf2_file(file_path)
            if events:
                total += events
                chosen_files.append(file_path)
                B2INFO(f"Choosing input file for {run}: {file_path} and total events so far {total}")
            else:
                # Empty files are skipped
                B2INFO(f"No events in {file_path}, skipping...")

        # Don't bother making entries for a run without any chosen file
        if not chosen_files:
            B2INFO(f"No files chosen for {run}")
            continue

        # Go back to the files_to_iov layout
        for path in chosen_files:
            new_files_to_iov[path] = iov

    return new_files_to_iov
137 
138 
def filter_by_select_max_events_from_files(input_file_list, select_max_events_from_files):
    """
    This function creates a new list by appending random files until
    the maximum number of events are reached per data set.

    The input list is not modified; the selection works on a copy of it.

    Parameters:
        input_file_list (list): ["/path/to/file1.root", "/path/to/file2.root"]
        select_max_events_from_files (int): The threshold we want to reach; no more files are added once it is reached.

    Returns:
        list: The sorted list of the randomly selected files, or an empty list if the available
        files do not contain enough events to reach the threshold.
    """
    # Work on a copy so that removing the picked files does not empty the caller's list
    remaining_files = list(input_file_list)
    selected_files = []
    total = 0
    while total < select_max_events_from_files and remaining_files:
        file_path = choice(remaining_files)
        remaining_files.remove(file_path)

        events = events_in_basf2_file(file_path)
        # Empty files are skipped
        if not events:
            B2INFO(f"No events in {file_path}, skipping...")
            continue

        total += events
        selected_files.append(file_path)
        B2INFO(f"Choosing random input file: {file_path} and total events so far {total}")

    # Return an empty list if the requested number of events was not found
    if total < select_max_events_from_files:
        B2INFO(f"total events {total} are less than requested {select_max_events_from_files}")
        return []

    return sorted(selected_files)
178 
179 
def events_in_basf2_file(file_path):
    """Does a quick open and return of the number of entries in a basf2 file's tree object.

    The file is closed again before returning, also when reading the tree fails.

    Parameters:
        file_path (str): File path to ROOT file

    Returns:
        int: Number of entries in tree.

    Raises:
        OSError: If the file could not be opened.
    """
    f = ROOT.TFile.Open(file_path, "READ")
    # TFile.Open gives back a null pointer (falsy in PyROOT) when opening fails; fail with a
    # clear message instead of an obscure attribute/null-pointer error further down.
    if not f or f.IsZombie():
        raise OSError(f"Could not open ROOT file: {file_path}")
    try:
        # The tree object named 'tree' is looked up as an attribute of the file
        return f.tree.GetEntries()
    finally:
        # Always release the file handle, even if the tree is missing or unreadable
        f.Close()