Belle II Software development
b2file-merge.cc
/**************************************************************************
 * basf2 (Belle II Analysis Software Framework)                           *
 * Author: The Belle II Collaboration                                     *
 *                                                                        *
 * See git log for contributors and copyright holders.                    *
 * This file is licensed under LGPL-3.0, see LICENSE.md.                  *
 **************************************************************************/
8#include <framework/dataobjects/FileMetaData.h>
9#include <framework/io/RootIOUtilities.h>
10#include <framework/io/RootFileInfo.h>
11#include <framework/logging/Logger.h>
12#include <framework/pcore/Mergeable.h>
13#include <framework/core/FileCatalog.h>
14#include <framework/utilities/KeyValuePrinter.h>
15#include <framework/core/MetadataService.h>
16
17#include <boost/program_options.hpp>
18#include <boost/algorithm/string.hpp>
19
20#include <TFile.h>
21#include <TTree.h>
22#include <TBranchElement.h>
23#include <TDirectory.h>
24#include <TH1.h>
25#include <TKey.h>
26#include <TClass.h>
27
28#include <filesystem>
29#include <iostream>
30#include <iomanip>
31#include <memory>
32#include <string>
33#include <set>
34#include <regex>
35
36using namespace Belle2;
37namespace po = boost::program_options;
38namespace fs = std::filesystem;
39
/**
 * Event identifier stored as (experiment, run, event number).
 *
 * std::tuple compares element by element in that order, which is what the
 * low/high event bookkeeping in main() relies on when it uses < and >.
 */
using EventInfo = std::tuple<int, int, unsigned int>;
43
44namespace {
/**
 * Remove the legacy IP information globaltag from a comma separated list of
 * globaltag names.
 *
 * The list is processed tag by tag so that only entries which are exactly
 * "Legacy_IP_Information" are dropped. This keeps the separators consistent
 * no matter where in the list the legacy tag appears (no dangling leading
 * comma when it comes first) and leaves longer tag names that merely contain
 * the legacy name untouched.
 *
 * @param globaltags comma separated list of globaltag names
 * @return the same list, in the same order, without the legacy globaltag
 */
std::string removeLegacyGt(const std::string& globaltags)
{
  static const std::string legacyTag{"Legacy_IP_Information"};
  std::string cleaned;
  cleaned.reserve(globaltags.size());
  bool firstKept = true;
  std::string::size_type begin = 0;
  while (true) {
    const std::string::size_type comma = globaltags.find(',', begin);
    const bool isLast = (comma == std::string::npos);
    const std::string tag = isLast ? globaltags.substr(begin) : globaltags.substr(begin, comma - begin);
    if (tag != legacyTag) {
      if (!firstKept) cleaned += ',';
      cleaned += tag;
      firstKept = false;
    }
    if (isLast) break;
    begin = comma + 1;
  }
  return cleaned;
}
52
59 void collectHistograms(TDirectory& dir, const std::string& prefix,
60 std::map<std::string, std::pair<TH1*, size_t>>& histograms)
61 {
62 for (TObject* keyObj : *dir.GetListOfKeys()) {
63 auto* key = dynamic_cast<TKey*>(keyObj);
64 if (!key) continue;
65 const std::string name{key->GetName()};
66 const std::string path = prefix.empty() ? name : (prefix + "/" + name);
67 TClass* cls = TClass::GetClass(key->GetClassName());
68 if (!cls) continue;
69 if (cls->InheritsFrom(TDirectory::Class())) {
70 auto* subdir = dynamic_cast<TDirectory*>(key->ReadObj());
71 if (subdir) collectHistograms(*subdir, path, histograms);
72 } else if (cls->InheritsFrom(TH1::Class())) {
73 auto* hist = dynamic_cast<TH1*>(key->ReadObj());
74 if (!hist) continue;
75 auto it = histograms.find(path);
76 if (it == histograms.end()) {
77 hist->SetDirectory(nullptr); // detach from input file so we own it
78 histograms.emplace(path, std::make_pair(hist, size_t{1}));
79 } else {
80 it->second.first->Add(hist);
81 it->second.second++;
82 delete hist;
83 }
84 }
85 }
86 }
87
92 void writeHistograms(TFile& output,
93 const std::map<std::string, std::pair<TH1*, size_t>>& histograms)
94 {
95 for (const auto& [path, histCount] : histograms) {
96 output.cd();
97 const auto slash = path.rfind('/');
98 if (slash != std::string::npos) {
99 // Create intermediate directories level by level
100 TDirectory* dir = &output;
101 size_t start = 0;
102 const std::string dirpath = path.substr(0, slash);
103 while (start < dirpath.size()) {
104 const size_t end = dirpath.find('/', start);
105 const std::string part = (end == std::string::npos)
106 ? dirpath.substr(start) : dirpath.substr(start, end - start);
107 TDirectory* sub = dir->GetDirectory(part.c_str());
108 if (!sub) {
109 dir->mkdir(part.c_str());
110 sub = dir->GetDirectory(part.c_str());
111 }
112 if (!sub) break;
113 dir = sub;
114 if (end == std::string::npos) break;
115 start = end + 1;
116 }
117 dir->cd();
118 }
119 histCount.first->Write();
120 }
121 output.cd();
122 }
123}
124
125int main(int argc, char* argv[])
126{
127 // Parse options
128 std::string outputfilename;
129 std::vector<std::string> inputfilenames;
130 std::string jsonfilename;
131 po::options_description options("Options");
132 options.add_options()
133 ("help,h", "print all available options")
134 ("output,o", po::value<std::string>(&outputfilename), "output file name")
135 ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
136 ("force,f", "overwrite existing file")
137 ("no-catalog", "don't register output file in file catalog, This is now the default")
138 ("add-to-catalog", "register the output file in the file catalog")
139 ("job-information", po::value<std::string>(&jsonfilename), "create json file with metadata of output file and execution status")
140 ("quiet,q", "if given don't print infos, just warnings and errors");
141 po::positional_options_description positional;
142 positional.add("output", 1);
143 positional.add("file", -1);
144 po::variables_map variables;
145 po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
146 po::notify(variables);
147 if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
148 std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
149 std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
150 << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
151 std::cout << options << std::endl;
152 std::cout << (R"DOC(
153This program is intended to merge files created by separate basf2 jobs. It's
154similar to hadd but does correctly update the metadata in the file and merges
155the objects in the persistent tree correctly.
156
157The following restrictions apply:
158 - The files have to be created with the same release and steering file
159 - The persistent tree is only allowed to contain FileMetaData and objects
160 inheriting from Mergeable and the same list of objects needs to be present
161 in all files.
162 - The event tree needs to contain the same DataStore entries in all files.
163)DOC");
164 return 1;
165 }
166
167 //Initialize metadata service
169 if (!jsonfilename.empty()) {
171 }
172
173 // Remove the {module:} from log messages
174 auto logConfig = LogSystem::Instance().getLogConfig();
176 logConfig->setLogInfo(l, LogConfig::c_Level | LogConfig::c_Message);
177 }
178 if(variables.count("quiet")>0){
179 logConfig->setLogLevel(LogConfig::c_Warning);
180 }
181
182 B2INFO("Merging files into " << std::quoted(outputfilename));
183 // check output file
184 if (fs::exists(outputfilename) && variables.count("force")==0) {
185 B2ERROR("Output file exists, use -f to force overwriting it");
186 return 1;
187 }
188 // First we check all input files for consistency ...
189
190 // the final metadata we will write out
191 FileMetaData* outputMetaData{nullptr};
192 // set of all parent LFNs encountered in any file
193 std::set<std::string> allParents;
194 // map of all mergeable objects found in the persistent tree. The size_t is
195 // for counting to make sure we see all objects in all files
196 std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
197 // set of all random seeds to print warning on duplicates
198 std::set<std::string> allSeeds;
199 // set of all users
200 std::set<std::string> allUsers;
201 // EventInfo for the high/low event numbers of the final FileMetaData
202 std::optional<EventInfo> lowEvt, highEvt;
203 // map of sets of all branch names in the event trees to compare against to make sure
204 // that they're the same in all files
205 std::map<std::string, std::set<std::string>> allEventBranches;
206 // set of all ntuple trees names to compare against to make sure
207 // that they're the same in all files (if they exist)
208 std::set<std::string> allEventTrees;
209 // map from histogram path to (accumulated TH1*, count of files where it was found)
210 std::map<std::string, std::pair<TH1*, size_t>> mergedHistograms;
211 // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
212 std::string outputRelease;
213
214 // so let's loop over all files and create FileMetaData and merge persistent
215 // objects if they inherit from Mergeable, bail if there's something else in
216 // there. The idea is that merging the persistent stuff is fast so we catch
217 // errors more quickly when we do this as a first step and events later on.
218 for (const auto& input : inputfilenames) {
219 try {
220 RootIOUtilities::RootFileInfo fileInfo(input);
221 // Ok, load the FileMetaData from the tree
222 const auto &fileMetaData = fileInfo.getFileMetaData();
223 auto description = fileMetaData.getDataDescription();
224 auto isNtuple = description.find("isNtupleMetaData");
225 // File looks usable, start checking metadata ...
226 B2INFO("adding file " << std::quoted(input));
227 if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
228 auto trees = fileInfo.getTreeNames();
229 if(allEventTrees.empty()) {
230 std::swap(allEventTrees,trees);
231 }else{
232 if(trees!=allEventTrees){
233 B2ERROR("Trees in " << std::quoted(input) << " differ from "
234 << std::quoted(inputfilenames.front()));
235 continue;
236 }
237 }
238 for(const auto& tree : allEventTrees) {
239 auto branches = ((tree=="tree") &&
240 ((isNtuple==description.end()) || (isNtuple->second != "True"))
241 ) ? fileInfo.getBranchNames() : fileInfo.getNtupleBranchNames(tree);
242 if(branches.empty()) {
243 throw std::runtime_error("Could not find any branches in " + tree);
244 }
245 if(allEventBranches[tree].empty()) {
246 std::swap(allEventBranches[tree],branches);
247 }else{
248 if(branches!=allEventBranches[tree]){
249 B2ERROR("Branches in " << std::quoted(input + ":" + tree) << " differ from "
250 << std::quoted(inputfilenames.front() + ":" + tree));
251 }
252 }
253 }
254 // Collect and accumulate any histograms stored directly in this file
255 collectHistograms(fileInfo.getFile(), "", mergedHistograms);
256
257 // File looks good so far, now fix the persistent stuff, i.e. merge all
258 // objects in persistent tree
259 for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
260 auto* br = dynamic_cast<TBranchElement*>(brObj);
261 // FileMetaData is handled separately
262 if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
263 continue;
264 // Make sure the branch is mergeable
265 if(!br) continue;
266 if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
267 B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergeable");
268 continue;
269 }
270 // Ok, it's an object we now how to handle so get it from the tree
271 Mergeable* object{nullptr};
272 br->SetAddress(&object);
273 if(br->GetEntry(0)<=0) {
274 B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
275 << std::quoted(input));
276 continue;
277 }
278 // and either insert it into the map of mergeables or merge with the existing one
279 auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
280 if(!it.second) {
281 try {
282 it.first->second.first->merge(object);
283 }catch(std::exception &e){
284 B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
285 }
286 it.first->second.second++;
287 // ok, merged, get rid of it.
288 delete object;
289 }else{
290 B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
291 }
292 }
293
294 std::string release = fileMetaData.getRelease();
295 if(release == "") {
296 B2ERROR("Cannot determine release used to create " << std::quoted(input));
297 continue;
298 } else if (fileMetaData.getRelease().ends_with("-modified")) {
299 B2WARNING("File " << std::quoted(input) << " created with modified software "
300 << fileMetaData.getRelease()
301 << ": cannot verify that files are compatible");
302 release = release.substr(0, release.size() - std::string("-modified").size());
303 }
304
305 // so, event tree looks good too. Now we merge the FileMetaData
306 if (!outputMetaData) {
307 // first input file, just take the event metadata
308 outputMetaData = new FileMetaData(fileMetaData);
309 outputRelease = release;
310 } else {
311 // check meta data for consistency, we could move this into FileMetaData...
312 if(release != outputRelease) {
313 B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
314 fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
315 }
316 if(fileMetaData.getSteering() != outputMetaData->getSteering()){
317 // printing both steering files is not useful for anyone so just throw an error
318 B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
319 }
320 if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
321 // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
322 // This means there is a chance we want to merge files with and without the globaltag added if they cross the
323 // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
324 // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
325 if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
326 outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
327 } else {
328 B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
329 fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
330 }
331 }
332 if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
333 KeyValuePrinter cur(true);
334 for (const auto& descrPair : outputMetaData->getDataDescription())
335 cur.put(descrPair.first, descrPair.second);
336 KeyValuePrinter prev(true);
337 for (const auto& descrPair : fileMetaData.getDataDescription())
338 prev.put(descrPair.first, descrPair.second);
339
340 B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
341 }
342 if(fileMetaData.isMC() != outputMetaData->isMC()){
343 B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
344 }
345 // update event numbers ...
346 outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
347 outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
348 outputMetaData->setNFullEvents(outputMetaData->getNFullEvents() + fileMetaData.getNFullEvents());
349 }
350 if(fileMetaData.getNEvents() < 1) {
351 B2WARNING("File " << std::quoted(input) << " is empty.");
352 } else {
353 // make sure we have the correct low/high event numbers
354 EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
355 EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
356 if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
357 if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
358 }
359 // check if we have seen this random seed already in one of the previous files
360 auto it = allSeeds.insert(fileMetaData.getRandomSeed());
361 if(!it.second) {
362 B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
363 }
364 allUsers.insert(fileMetaData.getUser());
365 // remember all parent files we encounter
366 for (int i = 0; i < fileMetaData.getNParents(); ++i) {
367 allParents.insert(fileMetaData.getParent(i));
368 }
369 }catch(std::exception &e) {
370 B2ERROR("input file " << std::quoted(input) << ": " << e.what());
371 }
372 }
373
374 //Check if the same mergeables were found in all files
375 for(const auto &val: persistentMergeables){
376 if(val.second.second != inputfilenames.size()){
377 B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
378 << inputfilenames.size() << " files");
379 }
380 }
381
382 // Check for user names
383 if(allUsers.size()>1) {
384 B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
385 }
386
387 // Stop processing in case of error
388 if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
389
390 if(!outputMetaData){
391 // technically it's rather impossible to arrive here: if there were no
392 // input files we exit with a usage message and if any of the files could
393 // not be processed then the error count should be >0. Nevertheless
394 // let's do this check to be on the very safe side and to make clang
395 // analyzer happy.
396 B2FATAL("For some reason no files could be processed");
397 return 1;
398 }
399 if(!lowEvt) {
400 B2WARNING("All Files were empty");
401 lowEvt = EventInfo{-1, -1, 0};
402 highEvt = EventInfo{-1, -1, 0};
403 }
404
405 // Final changes to metadata
406 outputMetaData->setLfn("");
407 outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
408 outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
409 outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
410 // If more then one file set an empty random seed
411 if(inputfilenames.size()>1){
412 outputMetaData->setRandomSeed("");
413 }
414 RootIOUtilities::setCreationData(*outputMetaData);
415 // Set (again) the release, since it's overwritten by the previous line
416 outputMetaData->setRelease(outputRelease);
417
418 // OK we have a valid FileMetaData and merged all persistent objects, now do
419 // the conversion of the event trees and create the output file.
420 auto output = std::unique_ptr<TFile>{TFile::Open(outputfilename.c_str(), "RECREATE")};
421 if (output == nullptr or output->IsZombie()) {
422 B2ERROR("Could not create output file " << std::quoted(outputfilename));
423 return 1;
424 }
425
426 for (const auto& treeName : allEventTrees) {
427 TTree* outputEventTree{nullptr};
428 for (const auto& input : inputfilenames) {
429 B2INFO("processing events from " << std::quoted(input + ":" + treeName));
430 auto tfile = std::unique_ptr<TFile>{TFile::Open(input.c_str(), "READ")};
431 // At this point, we already checked that the input files are valid and exist
432 // so it's safe to access tfile directly
433 auto* tree = dynamic_cast<TTree*>(tfile->Get(treeName.c_str()));
434 if (!outputEventTree){
435 output->cd();
436 outputEventTree = tree->CloneTree(0);
437 } else {
438 outputEventTree->CopyAddresses(tree);
439 }
440 // Now let's copy all entries without unpacking (fast), layout the
441 // baskets in an optimal order for sequential reading (SortBasketByEntry)
442 // and rebuild the index in case some parts of the index are missing
443 outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
444 // and reset the branch addresses to not be connected anymore
445 outputEventTree->CopyAddresses(tree, true);
446 // finally clean up and close file.
447 delete tree;
448 tfile->Close();
449 }
450 assert(outputEventTree);
451 // make sure we have an index ...
452 if(!outputEventTree->GetTreeIndex()) {
453 B2INFO("No Index found: building new index");
454 RootIOUtilities::buildIndex(outputEventTree);
455 }
456 // and finally write the tree
457 output->cd();
458 outputEventTree->Write();
459 // check if the number of full events in the metadata is zero:
460 // if so calculate number of full events now:
461 if (outputMetaData->getNFullEvents() == 0) {
462 outputMetaData->setNFullEvents(outputEventTree->GetEntries("EventMetaData.m_errorFlag == 0"));
463 }
464 }
465
466 B2INFO("Done processing events");
467
468 // Write merged histograms to output and check consistency
469 if (!mergedHistograms.empty()) {
470 B2INFO("Writing histograms");
471 writeHistograms(*output, mergedHistograms);
472 for (const auto& [name, histCount] : mergedHistograms) {
473 if (histCount.second != inputfilenames.size()) {
474 B2ERROR("Histogram " << std::quoted(name) << " only present in "
475 << histCount.second << " out of " << inputfilenames.size() << " files");
476 }
477 }
478 for (auto& [name, histCount] : mergedHistograms) {
479 delete histCount.first;
480 }
481 mergedHistograms.clear();
482 }
483
484 // we need to set the LFN to the absolute path name
485 outputMetaData->setLfn(fs::absolute(outputfilename).string());
486 // and maybe register it in the file catalog
487 if(variables.count("add-to-catalog")>0) {
488 FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
489 }
490 B2INFO("Writing FileMetaData");
491 // Create persistent tree
492 output->cd();
493 TTree outputMetaDataTree("persistent", "persistent");
494 outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
495 for(auto &it: persistentMergeables){
496 outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
497 }
498 outputMetaDataTree.Fill();
499 outputMetaDataTree.Write();
500
501 // now clean up the mess ...
502 for(const auto& val: persistentMergeables){
503 delete val.second.first;
504 }
505 persistentMergeables.clear();
506 auto outputMetaDataCopy = *outputMetaData;
507 delete outputMetaData;
508 output->Close();
509
510 // and now add it to the metadata service
511 MetadataService::Instance().addRootOutputFile(outputfilename, &outputMetaDataCopy, "b2file-merge");
512
513 // report completion in job metadata
514 MetadataService::Instance().addBasf2Status("finished successfully");
516}
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Metadata information about a file.
void setLfn(const std::string &lfn)
Setter for LFN.
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition LogConfig.h:30
@ c_Info
Info: for informational messages, e.g.
Definition LogConfig.h:27
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition LogConfig.h:31
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition LogConfig.h:29
@ c_Level
Log level of the message.
Definition LogConfig.h:36
@ c_Message
Log message text.
Definition LogConfig.h:37
LogConfig * getLogConfig()
Returns global log system configuration.
Definition LogSystem.h:78
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition LogSystem.cc:28
Abstract base class for objects that can be merged.
Definition Mergeable.h:31
void addRootOutputFile(const std::string &fileName, const FileMetaData *metaData=nullptr, const char *type="RootOutput")
Add the metadata of a root output file.
void addBasf2Status(const std::string &message="")
Add metadata of basf2 status.
void setJsonFileName(const std::string &fileName)
Set the name of the json metadata file.
static MetadataService & Instance()
Static method to get a reference to the MetadataService instance.
void finishBasf2(bool success=true)
Add metadata for basf2 completion.
Helper class to factorize some necessary tasks when working with Belle2 output files.
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes either EventMetaData branch exists or is a ntuple tree).
Abstract base class for different kinds of events.