Belle II Software light-2411-aldebaran
b2file-merge.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8#include <framework/dataobjects/FileMetaData.h>
9#include <framework/io/RootIOUtilities.h>
10#include <framework/io/RootFileInfo.h>
11#include <framework/logging/Logger.h>
12#include <framework/pcore/Mergeable.h>
13#include <framework/core/FileCatalog.h>
14#include <framework/utilities/KeyValuePrinter.h>
15
16#include <boost/program_options.hpp>
17#include <boost/algorithm/string.hpp>
18
19#include <TFile.h>
20#include <TTree.h>
21#include <TBranchElement.h>
22
23#include <filesystem>
24#include <iostream>
25#include <iomanip>
26#include <string>
27#include <set>
28#include <regex>
29
30using namespace Belle2;
31namespace po = boost::program_options;
32namespace fs = std::filesystem;
33
// Event identifier used to track the lowest/highest event seen across all
// input files. main() fills it as (experiment, run, event), so the default
// tuple ordering compares experiment first, then run, then event number.
36using EventInfo = std::tuple<int, int, unsigned int>;
37
namespace {
  /**
   * Remove the "Legacy_IP_Information" entry from a comma separated list of
   * globaltag names.
   *
   * The entry is removed together with one adjacent separator so that no
   * dangling comma is left behind: the preceding comma if there is one,
   * otherwise (entry at the very beginning of the list) the following comma.
   *
   * @param globaltags comma separated list of globaltag names
   * @return the list without the legacy globaltag; unchanged if it is not present
   */
  std::string removeLegacyGt(const std::string& globaltags)
  {
    // Compile the expression only once: this helper is called several times
    // per input file and constructing a std::regex is comparatively expensive.
    // The first alternative handles the tag being the first entry of the list,
    // the second one handles every other position (or the tag standing alone).
    static const std::regex legacy_gt("^Legacy_IP_Information,|,?Legacy_IP_Information");
    return std::regex_replace(globaltags, legacy_gt, "");
  }
}
47
48int main(int argc, char* argv[])
49{
50 // Parse options
51 std::string outputfilename;
52 std::vector<std::string> inputfilenames;
53 po::options_description options("Options");
54 options.add_options()
55 ("help,h", "print all available options")
56 ("output,o", po::value<std::string>(&outputfilename), "output file name")
57 ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
58 ("force,f", "overwrite existing file")
59 ("no-catalog", "don't register output file in file catalog, This is now the default")
60 ("add-to-catalog", "register the output file in the file catalog")
61 ("quiet,q", "if given don't print infos, just warnings and errors");
62 po::positional_options_description positional;
63 positional.add("output", 1);
64 positional.add("file", -1);
65 po::variables_map variables;
66 po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
67 po::notify(variables);
68 if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
69 std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
70 std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
71 << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
72 std::cout << options << std::endl;
73 std::cout << (R"DOC(
74This program is intended to merge files created by separate basf2 jobs. It's
75similar to hadd but does correctly update the metadata in the file and merges
76the objects in the persistent tree correctly.
77
78The following restrictions apply:
79 - The files have to be created with the same release and steering file
80 - The persistent tree is only allowed to contain FileMetaData and objects
81 inheriting from Mergeable and the same list of objects needs to be present
82 in all files.
83 - The event tree needs to contain the same DataStore entries in all files.
84)DOC");
85 return 1;
86 }
87
88 // Remove the {module:} from log messages
89 auto logConfig = LogSystem::Instance().getLogConfig();
92 }
93 if(variables.count("quiet")>0){
94 logConfig->setLogLevel(LogConfig::c_Warning);
95 }
96
97 B2INFO("Merging files into " << std::quoted(outputfilename));
98 // check output file
99 if (fs::exists(outputfilename) && variables.count("force")==0) {
100 B2ERROR("Output file exists, use -f to force overwriting it");
101 return 1;
102 }
103 // First we check all input files for consistency ...
104
105 // the final metadata we will write out
106 FileMetaData* outputMetaData{nullptr};
107 // set of all parent LFNs encountered in any file
108 std::set<std::string> allParents;
109 // map of all mergeable objects found in the persistent tree. The size_t is
110 // for counting to make sure we see all objects in all files
111 std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
112 // set of all random seeds to print warning on duplicates
113 std::set<std::string> allSeeds;
114 // set of all users
115 std::set<std::string> allUsers;
116 // EventInfo for the high/low event numbers of the final FileMetaData
117 std::optional<EventInfo> lowEvt, highEvt;
118 // map of sets of all branch names in the event trees to compare against to make sure
119 // that they're the same in all files
120 std::map<std::string, std::set<std::string>> allEventBranches;
121 // set of all ntuple trees names to compare against to make sure
122 // that they're the same in all files (if they exist)
123 std::set<std::string> allEventTrees;
124 // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
125 std::string outputRelease;
126
127 // so let's loop over all files and create FileMetaData and merge persistent
128 // objects if they inherit from Mergeable, bail if there's something else in
129 // there. The idea is that merging the persistent stuff is fast so we catch
130 // errors more quickly when we do this as a first step and events later on.
131 for (const auto& input : inputfilenames) {
132 try {
133 RootIOUtilities::RootFileInfo fileInfo(input);
134 // Ok, load the FileMetaData from the tree
135 const auto &fileMetaData = fileInfo.getFileMetaData();
136 auto description = fileMetaData.getDataDescription();
137 auto isNtuple = description.find("isNtupleMetaData");
138 // File looks usable, start checking metadata ...
139 B2INFO("adding file " << std::quoted(input));
140 if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
141 auto trees = fileInfo.getTreeNames();
142 if(allEventTrees.empty()) {
143 std::swap(allEventTrees,trees);
144 }else{
145 if(trees!=allEventTrees){
146 B2ERROR("Trees in " << std::quoted(input) << " differ from "
147 << std::quoted(inputfilenames.front()));
148 continue;
149 }
150 }
151 for(const auto& tree : allEventTrees) {
152 auto branches = ((tree=="tree") &&
153 ((isNtuple==description.end()) || (isNtuple->second != "True"))
154 ) ? fileInfo.getBranchNames() : fileInfo.getNtupleBranchNames(tree);
155 if(branches.empty()) {
156 throw std::runtime_error("Could not find any branches in " + tree);
157 }
158 if(allEventBranches[tree].empty()) {
159 std::swap(allEventBranches[tree],branches);
160 }else{
161 if(branches!=allEventBranches[tree]){
162 B2ERROR("Branches in " << std::quoted(input + ":" + tree) << " differ from "
163 << std::quoted(inputfilenames.front() + ":" + tree));
164 }
165 }
166 }
167 // File looks good so far, now fix the persistent stuff, i.e. merge all
168 // objects in persistent tree
169 for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
170 auto* br = dynamic_cast<TBranchElement*>(brObj);
171 // FileMetaData is handled separately
172 if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
173 continue;
174 // Make sure the branch is mergeable
175 if(!br) continue;
176 if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
177 B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergable");
178 continue;
179 }
180 // Ok, it's an object we now how to handle so get it from the tree
181 Mergeable* object{nullptr};
182 br->SetAddress(&object);
183 if(br->GetEntry(0)<=0) {
184 B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
185 << std::quoted(input));
186 continue;
187 }
188 // and either insert it into the map of mergeables or merge with the existing one
189 auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
190 if(!it.second) {
191 try {
192 it.first->second.first->merge(object);
193 }catch(std::exception &e){
194 B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
195 }
196 it.first->second.second++;
197 // ok, merged, get rid of it.
198 delete object;
199 }else{
200 B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
201 }
202 }
203
204 std::string release = fileMetaData.getRelease();
205 if(release == "") {
206 B2ERROR("Cannot determine release used to create " << std::quoted(input));
207 continue;
208 }else if(boost::algorithm::ends_with(fileMetaData.getRelease(), "-modified")){
209 B2WARNING("File " << std::quoted(input) << " created with modified software "
210 << fileMetaData.getRelease()
211 << ": cannot verify that files are compatible");
212 release = release.substr(0, release.size() - std::string("-modified").size());
213 }
214
215 // so, event tree looks good too. Now we merge the FileMetaData
216 if (!outputMetaData) {
217 // first input file, just take the event metadata
218 outputMetaData = new FileMetaData(fileMetaData);
219 outputRelease = release;
220 } else {
221 // check meta data for consistency, we could move this into FileMetaData...
222 if(release != outputRelease) {
223 B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
224 fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
225 }
226 if(fileMetaData.getSteering() != outputMetaData->getSteering()){
227 // printing both steering files is not useful for anyone so just throw an error
228 B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
229 }
230 if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
231 // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
232 // This means there is a chance we want to merge files with and without the globaltag added if they cross the
233 // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
234 // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
235 if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
236 outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
237 } else {
238 B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
239 fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
240 }
241 }
242 if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
243 KeyValuePrinter cur(true);
244 for (const auto& descrPair : outputMetaData->getDataDescription())
245 cur.put(descrPair.first, descrPair.second);
246 KeyValuePrinter prev(true);
247 for (const auto& descrPair : fileMetaData.getDataDescription())
248 prev.put(descrPair.first, descrPair.second);
249
250 B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
251 }
252 if(fileMetaData.isMC() != outputMetaData->isMC()){
253 B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
254 }
255 // update event numbers ...
256 outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
257 outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
258 outputMetaData->setNFullEvents(outputMetaData->getNFullEvents() + fileMetaData.getNFullEvents());
259 }
260 if(fileMetaData.getNEvents() < 1) {
261 B2WARNING("File " << std::quoted(input) << " is empty.");
262 } else {
263 // make sure we have the correct low/high event numbers
264 EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
265 EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
266 if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
267 if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
268 }
269 // check if we have seen this random seed already in one of the previous files
270 auto it = allSeeds.insert(fileMetaData.getRandomSeed());
271 if(!it.second) {
272 B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
273 }
274 allUsers.insert(fileMetaData.getUser());
275 // remember all parent files we encounter
276 for (int i = 0; i < fileMetaData.getNParents(); ++i) {
277 allParents.insert(fileMetaData.getParent(i));
278 }
279 }catch(std::exception &e) {
280 B2ERROR("input file " << std::quoted(input) << ": " << e.what());
281 }
282 }
283
284 //Check if the same mergeables were found in all files
285 for(const auto &val: persistentMergeables){
286 if(val.second.second != inputfilenames.size()){
287 B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
288 << inputfilenames.size() << " files");
289 }
290 }
291
292 // Check for user names
293 if(allUsers.size()>1) {
294 B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
295 }
296
297 // Stop processing in case of error
298 if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
299
300 if(!outputMetaData){
301 // technically it's rather impossible to arrive here: if there were no
302 // input files we exit with a usage message and if any of the files could
303 // not be processed then the error count should be >0. Nevertheless
304 // let's do this check to be on the very safe side and to make clang
305 // analyzer happy.
306 B2FATAL("For some reason no files could be processed");
307 return 1;
308 }
309 if(!lowEvt) {
310 B2WARNING("All Files were empty");
311 lowEvt = EventInfo{-1, -1, 0};
312 highEvt = EventInfo{-1, -1, 0};
313 }
314
315 // Final changes to metadata
316 outputMetaData->setLfn("");
317 outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
318 outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
319 outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
320 // If more then one file set an empty random seed
321 if(inputfilenames.size()>1){
322 outputMetaData->setRandomSeed("");
323 }
324 RootIOUtilities::setCreationData(*outputMetaData);
325
326 // OK we have a valid FileMetaData and merged all persistent objects, now do
327 // the conversion of the event trees and create the output file.
328 TFile output(outputfilename.c_str(), "RECREATE");
329 if (output.IsZombie()) {
330 B2ERROR("Could not create output file " << std::quoted(outputfilename));
331 return 1;
332 }
333
334 for (const auto& treeName : allEventTrees) {
335 TTree* outputEventTree{nullptr};
336 for (const auto& input : inputfilenames) {
337 B2INFO("processing events from " << std::quoted(input + ":" + treeName));
338 TFile tfile(input.c_str());
339 auto* tree = dynamic_cast<TTree*>(tfile.Get(treeName.c_str()));
340 if(!outputEventTree){
341 output.cd();
342 outputEventTree = tree->CloneTree(0);
343 }else{
344 outputEventTree->CopyAddresses(tree);
345 }
346 // Now let's copy all entries without unpacking (fast), layout the
347 // baskets in an optimal order for sequential reading (SortBasketByEntry)
348 // and rebuild the index in case some parts of the index are missing
349 outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
350 // and reset the branch addresses to not be connected anymore
351 outputEventTree->CopyAddresses(tree, true);
352 // finally clean up and close file.
353 delete tree;
354 tfile.Close();
355 }
356 assert(outputEventTree);
357 // make sure we have an index ...
358 if(!outputEventTree->GetTreeIndex()) {
359 B2INFO("No Index found: building new index");
360 RootIOUtilities::buildIndex(outputEventTree);
361 }
362 // and finally write the tree
363 output.cd();
364 outputEventTree->Write();
365 // check if the number of full events in the metadata is zero:
366 // if so calculate number of full events now:
367 if (outputMetaData->getNFullEvents() == 0) {
368 outputMetaData->setNFullEvents(outputEventTree->GetEntries("EventMetaData.m_errorFlag == 0"));
369 }
370 }
371
372 B2INFO("Done processing events");
373
374 // we need to set the LFN to the absolute path name
375 outputMetaData->setLfn(fs::absolute(outputfilename).string());
376 // and maybe register it in the file catalog
377 if(variables.count("add-to-catalog")>0) {
378 FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
379 }
380 B2INFO("Writing FileMetaData");
381 // Create persistent tree
382 output.cd();
383 TTree outputMetaDataTree("persistent", "persistent");
384 outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
385 for(auto &it: persistentMergeables){
386 outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
387 }
388 outputMetaDataTree.Fill();
389 outputMetaDataTree.Write();
390
391 // now clean up the mess ...
392 for(const auto& val: persistentMergeables){
393 delete val.second.first;
394 }
395 persistentMergeables.clear();
396 delete outputMetaData;
397 output.Close();
398}
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
Definition: FileCatalog.cc:23
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Definition: FileCatalog.cc:90
Metadata information about a file.
Definition: FileMetaData.h:29
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition: LogConfig.h:30
@ c_Info
Info: for informational messages, e.g.
Definition: LogConfig.h:27
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition: LogConfig.h:31
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition: LogConfig.h:29
@ c_Level
Log level of the message.
Definition: LogConfig.h:36
@ c_Message
Log message text.
Definition: LogConfig.h:37
void setLogInfo(ELogLevel logLevel, unsigned int logInfo)
Configure the printed log information for the given level.
Definition: LogConfig.h:127
LogConfig * getLogConfig()
Returns global log system configuration.
Definition: LogSystem.h:78
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition: LogSystem.cc:31
Abstract base class for objects that can be merged.
Definition: Mergeable.h:31
Helper class to factorize some necessary tasks when working with Belle2 output files.
Definition: RootFileInfo.h:27
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes either EventMetaData branch exists or is a ntuple tree).
Abstract base class for different kinds of events.
Definition: ClusterUtils.h:24