Belle II Software development
utils.py
1#!/usr/bin/env python3
2
3
10"""
11This module contains various utility functions for the prompt calibration CAF scripts to use.
12"""
13from basf2 import B2INFO
14from collections import defaultdict, OrderedDict
15from itertools import groupby
16import ROOT
17from caf.utils import ExpRun, IoV
18from random import choice, shuffle
19
20
def filter_by_max_files_per_run(files_to_iov, max_files_per_run=1, min_events_per_file=0, random_select=False):
    """Build a reduced files_to_iov dictionary holding a limited number of files per run.

    Files are visited in dictionary order (or in shuffled order when ``random_select`` is set)
    and accepted for their run until that run holds ``max_files_per_run`` files. After this no
    more files are added for that run.

    It makes the assumption that the IoV is a single run, and that the exp_low and run_low of the
    IoV object can be used to create the ExpRun for comparison of whether to add a new input file.

    Note that a run which has no accepted file yet always takes its first qualifying file,
    whatever the value of ``max_files_per_run``.

    Parameters:
        files_to_iov (dict): The standard dictionary you might use as input to a Calibration.
            It is of the form

            >>> files_to_iov = {"file_path.root": IoV(1,1,1,1),}

        max_files_per_run (int): The maximum number of files that we will add to the output
            dictionary for each run in the input dictionary.

        min_events_per_file (int): The minimum number of events that is allowed to be in any
            included file's tree. When this is zero (the default) the event count of the files
            is not looked up at all.

        random_select (bool): True will select random files, False will take the first files.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down. It is an
        OrderedDict whose entries are grouped run by run, each with a single-run IoV.
    """
    B2INFO(f"Beginning filtering process to only choose {max_files_per_run} file(s) per run.")
    if min_events_per_file:
        B2INFO(f"We also require that each file must have at least {min_events_per_file} events in the tree.")

    # For a random selection the items are shuffled and put back into the same mapping type
    if random_select:
        shuffled_items = list(files_to_iov.items())
        shuffle(shuffled_items)
        files_to_iov = type(files_to_iov)(shuffled_items)

    # Files accepted so far, keyed by the run they belong to
    run_to_files = defaultdict(list)
    for input_file, file_iov in files_to_iov.items():
        run = ExpRun(exp=file_iov.exp_low, run=file_iov.run_low)
        # .get() is used so that merely looking a run up does not create an empty entry for it
        accepted = run_to_files.get(run)
        if accepted and len(accepted) >= max_files_per_run:
            continue
        # The file is only opened when a minimum number of events was actually requested
        if min_events_per_file and events_in_basf2_file(input_file) < min_events_per_file:
            continue
        B2INFO(f"Choosing input file for {run}: {input_file}")
        run_to_files[run].append(input_file)

    # Invert back to a files_to_iov style mapping, just with fewer files.
    # We made the assumption that the IoVs are single runs, so each IoV is rebuilt from its run.
    return OrderedDict(
        (file_path, IoV(*run, *run))
        for run, run_files in run_to_files.items()
        for file_path in run_files
    )
72
73
def group_files_by_iov(files_to_iov):
    """
    Inverts the files_to_iov dictionary to give back a dictionary of IoV -> File list

    Every file is appended to the list of its IoV, so files that share an IoV end up in the
    same list even when they are not next to each other in the input dictionary. The IoVs
    appear in the order in which they are first seen, and the files of each IoV keep their
    input order.

    Parameters:
        files_to_iov (dict): {"/path/to/file1.root": IoV(1,1,1,1), "/path/to/file2.root": IoV(1,1,1,1)}

    Returns:
        dict: {IoV(1,1,1,1): ["/path/to/file1.root", "/path/to/file2.root"]}
    """
    iov_to_files = OrderedDict()
    for file_path, iov in files_to_iov.items():
        # Accumulate per IoV rather than grouping consecutive entries: grouping only merges
        # adjacent items, which would lose files whenever equal IoVs are interleaved.
        iov_to_files.setdefault(iov, []).append(file_path)
    return iov_to_files
89
90
def filter_by_max_events_per_run(files_to_iov, max_events_per_run, random_select=False, max_events_per_file=0):
    """
    Build a reduced files_to_iov dictionary by adding files, IoV by IoV, until the
    requested number of events per run has been reached.

    The IoVs are processed in sorted order. For each one, files are taken (in order, or at
    random) and their event counts summed until the sum is no longer below
    ``max_events_per_run`` or no files are left. Files without events are skipped.

    Parameters:
        files_to_iov (dict): {"/path/to/file.root": IoV(1,1,1,1)} type dictionary. Same style as used by the CAF
            for lookup values.
        max_events_per_run (int): The threshold we want to reach but stop adding files if we reach it.
        random_select (bool): True will pick the files at random, False will take them in order.
        max_events_per_file (int): If greater than zero, a single file adds at most this many events
            to the running total of its run. Zero or a negative value means no cap. This only
            affects the counting; a chosen file is always listed as a whole.

    Returns:
        dict: The same style of dict as the input files_to_iov, but filtered down.
    """
    new_files_to_iov = OrderedDict()

    # Files are first grouped against their IoV, then each IoV is handled on its own
    for iov, iov_files in sorted(group_files_by_iov(files_to_iov).items()):
        run = ExpRun(iov.exp_low, iov.run_low)
        # Work on a copy as the candidates get consumed while choosing
        candidates = list(iov_files)
        chosen = []
        total = 0
        while total < max_events_per_run and candidates:
            if random_select:
                file_path = choice(candidates)
                candidates.remove(file_path)
            else:
                file_path = candidates.pop(0)

            n_events = events_in_basf2_file(file_path)
            # Empty files are skipped
            if not n_events:
                B2INFO(f"No events in {file_path}, skipping...")
                continue

            # Apply the per-file cap to the counted events, if one was requested
            if max_events_per_file > 0 and n_events > max_events_per_file:
                total += max_events_per_file
            else:
                total += n_events
            chosen.append(file_path)
            B2INFO(f"Choosing input file for {run}: {file_path} and total events so far {total}")

        # Don't bother adding anything for a run without chosen files
        if not chosen:
            B2INFO(f"No files chosen for {run}")
            continue
        new_files_to_iov.update((path, iov) for path in chosen)

    return new_files_to_iov
145
146
def filter_by_select_max_events_from_files(input_file_list, select_max_events_from_files):
    """
    This function creates a new list by appending random files until
    the maximum number of events are reached per data set.

    The input list is not modified; the selection is done on a private copy.

    Parameters:
        input_file_list (list): ["/path/to/file1.root", "/path/to/file2.root"]
        select_max_events_from_files (int): The threshold we want to reach but stop adding files if we reach it.

    Returns:
        list: The sorted list of randomly chosen files, or an empty list if the available
        files do not contain enough events to reach the threshold.
    """
    # Copy first: candidates are removed while drawing and the caller's list must stay intact
    remaining_files = list(input_file_list)

    total = 0
    selected_file = []
    while total < select_max_events_from_files and remaining_files:
        file_path = choice(remaining_files)
        remaining_files.remove(file_path)

        events = events_in_basf2_file(file_path)
        # Empty files are skipped
        if not events:
            B2INFO(f"No events in {file_path}, skipping...")
            continue

        total += events
        selected_file.append(file_path)
        B2INFO(f"Choosing random input file: {file_path} and total events so far {total}")

    # Return an empty list if the requested number of events was not found
    if total < select_max_events_from_files:
        B2INFO(f"total events {total} are less than requested {select_max_events_from_files}")
        return []

    return sorted(selected_file)
186
187
def events_in_basf2_file(file_path):
    """Does a quick open and return of the number of entries in a basf2 file's tree object.

    The file is closed again before returning, also when reading the tree fails.

    Parameters:
        file_path (str): File path to ROOT file

    Returns:
        int: Number of entries in tree.

    Raises:
        OSError: If no usable file handle was obtained for ``file_path``.
    """
    root_file = ROOT.TFile.Open(file_path, "READ")
    # NOTE(review): depending on the ROOT version a failed open may already raise inside
    # TFile.Open or hand back a null (falsy) handle - the check below covers the latter case
    # so that the failure names the offending file. Confirm against the ROOT version in use.
    if not root_file:
        raise OSError(f"Could not open ROOT file: {file_path}")
    try:
        return root_file.tree.GetEntries()
    finally:
        # Always release the handle, even if the tree is missing or unreadable
        root_file.Close()
201