9#include <framework/dataobjects/FileMetaData.h>
10#include <framework/core/FileCatalog.h>
11#include <framework/core/MetadataService.h>
12#include <framework/datastore/DataStore.h>
13#include <framework/io/RootFileInfo.h>
14#include <framework/io/RootIOUtilities.h>
15#include <framework/logging/Logger.h>
16#include <framework/pcore/Mergeable.h>
17#include <framework/utilities/KeyValuePrinter.h>
19#include <boost/program_options.hpp>
20#include <boost/algorithm/string.hpp>
24#include <TBranchElement.h>
25#include <TDirectory.h>
39namespace po = boost::program_options;
40namespace fs = std::filesystem;
44using EventInfo = std::tuple<int, int, unsigned int>;
/**
 * Remove the "Legacy_IP_Information" entry from a comma-separated globaltag list.
 *
 * Every occurrence of the entry is removed together with the comma in front of
 * it. If the entry was the first element of the list, the comma that followed
 * it is removed as well so the result does not start with a stray separator.
 *
 * @param globaltags comma-separated list of globaltag names
 * @return the list without the legacy entry
 */
std::string removeLegacyGt(const std::string& globaltags)
{
  // Compiling a std::regex is expensive and the pattern never changes, so
  // build it once instead of on every call.
  static const std::regex legacy_gt(",?Legacy_IP_Information");
  std::string cleaned = std::regex_replace(globaltags, legacy_gt, "");
  // The pattern only swallows a preceding comma. When the legacy entry led the
  // list, the following comma is left over at the front; drop it unless the
  // input itself already began with a comma.
  if (!cleaned.empty() && cleaned.front() == ',' && globaltags.front() != ',') {
    cleaned.erase(0, 1);
  }
  return cleaned;
}
// Recursively walk the keys of a ROOT directory and accumulate every object
// whose class inherits from TH1 into the histograms map.
//
// dir        directory whose key list is scanned
// prefix     path of dir relative to the starting directory; empty at the top
// histograms map from slash-separated object path to a pair of
//            (accumulated histogram, counter). The counter is set to 1 when a
//            path is seen for the first time.
//
// NOTE(review): the embedded source line numbers have gaps, so some statements
// of this function are not visible in this listing. Null checks on the casts
// and on cls, the branch structure around Add, and the counter increment for
// already known paths are presumably among them - confirm against the full file.
61 void collectHistograms(TDirectory& dir,
const std::string& prefix,
62 std::map<std::string, std::pair<TH1*, size_t>>& histograms)
// Visit every entry of the directory key list.
64 for (TObject* keyObj : *dir.GetListOfKeys()) {
65 auto* key =
dynamic_cast<TKey*
>(keyObj);
// Build the map key: the object name, prefixed by the parent path and a slash
// when we are inside a subdirectory.
67 const std::string name{key->GetName()};
68 const std::string path = prefix.empty() ? name : (prefix +
"/" + name);
// Decide how to treat the entry from the class name stored in the key.
69 TClass* cls = TClass::GetClass(key->GetClassName());
71 if (cls->InheritsFrom(TDirectory::Class())) {
// Subdirectory: read it and recurse, passing the extended path as prefix.
72 auto* subdir =
dynamic_cast<TDirectory*
>(key->ReadObj());
73 if (subdir) collectHistograms(*subdir, path, histograms);
74 }
else if (cls->InheritsFrom(TH1::Class())) {
// Histogram: read it from the key.
75 auto* hist =
dynamic_cast<TH1*
>(key->ReadObj());
77 auto it = histograms.find(path);
78 if (it == histograms.end()) {
// First occurrence of this path: detach the histogram from its ROOT directory
// (presumably so it outlives the input file - TODO confirm) and store it with
// a counter of one.
79 hist->SetDirectory(
nullptr);
80 histograms.emplace(path, std::make_pair(hist,
size_t{1}));
// Path already known: add this histogram to the stored one.
82 it->second.first->Add(hist);
// Write every histogram of the map into the output file, placing each one in
// the directory given by the part of its path before the last slash.
//
// output     file that receives the histograms
// histograms map from slash-separated object path to (histogram, counter);
//            only the histogram pointer is used in the visible statements
//
// NOTE(review): the embedded source line numbers have gaps, so some statements
// are not visible in this listing. The declaration of start, the condition
// guarding mkdir, the advance of dir and start inside the loop, and the
// directory change before Write are presumably among them - confirm against
// the full file.
94 void writeHistograms(TFile& output,
95 const std::map<std::string, std::pair<TH1*, size_t>>& histograms)
97 for (
const auto& [path, histCount] : histograms) {
// Position of the last slash separates the directory part from the object name.
99 const auto slash = path.rfind(
'/');
100 if (slash != std::string::npos) {
// Start at the file itself and descend one path component at a time.
102 TDirectory* dir = &output;
104 const std::string dirpath = path.substr(0, slash);
105 while (start < dirpath.size()) {
// Extract the next component: up to the next slash, or the rest of the string.
106 const size_t end = dirpath.find(
'/', start);
107 const std::string part = (end == std::string::npos)
108 ? dirpath.substr(start) : dirpath.substr(start, end - start);
// Look the component up below the current directory; mkdir followed by a second
// lookup creates it (presumably only when the first lookup returned null).
109 TDirectory* sub = dir->GetDirectory(part.c_str());
111 dir->mkdir(part.c_str());
112 sub = dir->GetDirectory(part.c_str());
// No further slash means this was the last component.
116 if (end == std::string::npos)
break;
// Write the accumulated histogram.
121 histCount.first->Write();
// Entry point of the merging tool: combines several input ROOT files into a
// single output file. From the statements visible in this listing the program
//  1. declares and parses the command line options,
//  2. prints the usage text when help is requested or when the output name or
//     the input list is missing,
//  3. reports an error when the output file exists and force was not given,
//  4. loops over all inputs, comparing tree names, branch names, release,
//     steering, globaltag, data description and real/MC type with the previous
//     files, while accumulating event counts, the lowest and highest event,
//     random seeds, users, parents, persistent Mergeable objects and histograms,
//  5. fills the merged FileMetaData from the accumulated values,
//  6. recreates the output file and copies the entries of every event tree of
//     every input into it, building an index when none exists,
//  7. writes the merged histograms, then a tree holding the FileMetaData and the
//     merged Mergeable objects, and finally deletes the temporary objects.
//
// argc, argv: the command line as passed by the operating system.
//
// NOTE(review): the embedded source line numbers have gaps and the end of the
// function is not part of this listing, so several statements (early returns,
// else branches, closing braces, the handling of jsonfilename) are not visible.
127int main(
int argc,
char* argv[])
// Targets that receive the values of the output, file and job-information options.
130 std::string outputfilename;
131 std::vector<std::string> inputfilenames;
132 std::string jsonfilename;
133 po::options_description options(
"Options");
134 options.add_options()
135 (
"help,h",
"print all available options")
136 (
"output,o", po::value<std::string>(&outputfilename),
"output file name")
137 (
"file", po::value<std::vector<std::string>>(&inputfilenames),
"filename to merge")
138 (
"force,f",
"overwrite existing file")
139 (
"no-catalog",
"don't register output file in file catalog, This is now the default")
140 (
"add-to-catalog",
"register the output file in the file catalog")
141 (
"job-information", po::value<std::string>(&jsonfilename),
"create json file with metadata of output file and execution status")
142 (
"quiet,q",
"if given don't print infos, just warnings and errors")
143 (
"reoptimize,O",
"reoptimize basket size when merging TTrees, equivalent to `hadd -O` (much slower!)");
// The first positional argument is the output file, all remaining ones are inputs.
144 po::positional_options_description positional;
145 positional.add(
"output", 1);
146 positional.add(
"file", -1);
// Parse the command line and store the results in the bound variables.
147 po::variables_map variables;
148 po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
149 po::notify(variables);
// Show the usage text when help is requested or a mandatory argument is missing.
150 if (variables.count(
"help") || variables.count(
"output") == 0 || inputfilenames.empty()) {
151 std::cout <<
"Usage: " << argv[0] <<
" [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
152 std::cout <<
" " << argv[0] <<
" [<options>] [--file INPUTFILE...] "
153 <<
"-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
154 std::cout << options << std::endl;
156This program is intended to merge files created by separate basf2 jobs. It's
157similar to hadd but does correctly update the metadata in the file and merges
158the objects in the persistent tree correctly.
160The following restrictions apply:
161 - The files have to be created with the same release and steering file
162 - The persistent tree is only allowed to contain FileMetaData and objects
163 inheriting from Mergeable and the same list of objects needs to be present
165 - The event tree needs to contain the same DataStore entries in all files.
172 if (!jsonfilename.empty()) {
181 if(variables.count(
"quiet")>0){
185 B2INFO(
"Merging files into " << std::quoted(outputfilename));
187 if (fs::exists(outputfilename) && variables.count(
"force")==0) {
188 B2ERROR(
"Output file exists, use -f to force overwriting it");
196 const bool reoptimize = variables.count(
"reoptimize") > 0;
197 const std::string copyOptions = reoptimize
198 ?
"SortBasketsByEntry BuildIndexOnError"
199 :
"fast SortBasketsByEntry BuildIndexOnError";
200 B2INFO(
"Will use the merging options: " << std::quoted(copyOptions));
207 std::set<std::string> allParents;
210 std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
212 std::set<std::string> allSeeds;
214 std::set<std::string> allUsers;
216 std::optional<EventInfo> lowEvt, highEvt;
219 std::map<std::string, std::set<std::string>> allEventBranches;
222 std::set<std::string> allEventTrees;
224 std::map<std::string, std::pair<TH1*, size_t>> mergedHistograms;
226 std::string outputRelease;
232 for (
const auto& input : inputfilenames) {
236 const auto &fileMetaData = fileInfo.getFileMetaData();
237 auto description = fileMetaData.getDataDescription();
238 auto isNtuple = description.find(
"isNtupleMetaData");
240 B2INFO(
"adding file " << std::quoted(input));
242 auto trees = fileInfo.getTreeNames();
243 if(allEventTrees.empty()) {
244 std::swap(allEventTrees,trees);
246 if(trees!=allEventTrees){
247 B2ERROR(
"Trees in " << std::quoted(input) <<
" differ from "
248 << std::quoted(inputfilenames.front()));
252 for(
const auto& tree : allEventTrees) {
253 auto branches = ((tree==
"tree") &&
254 ((isNtuple==description.end()) || (isNtuple->second !=
"True"))
255 ) ? fileInfo.getBranchNames() : fileInfo.getNtupleBranchNames(tree);
256 if(branches.empty()) {
257 throw std::runtime_error(
"Could not find any branches in " + tree);
259 if(allEventBranches[tree].empty()) {
260 std::swap(allEventBranches[tree],branches);
262 if(branches!=allEventBranches[tree]){
263 B2ERROR(
"Branches in " << std::quoted(input +
":" + tree) <<
" differ from "
264 << std::quoted(inputfilenames.front() +
":" + tree));
269 collectHistograms(fileInfo.getFile(),
"", mergedHistograms);
273 for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
274 auto* br =
dynamic_cast<TBranchElement*
>(brObj);
276 if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) ==
"FileMetaData")
280 if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
286 br->SetAddress(&
object);
287 if(br->GetEntry(0)<=0) {
289 << std::quoted(input));
293 auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(
object, 1)));
296 it.first->second.first->merge(
object);
297 }
catch(std::exception &e){
298 B2FATAL(
"Cannot merge " << std::quoted(br->GetName()) <<
" in " << std::quoted(input) <<
": " << e.what());
300 it.first->second.second++;
308 std::string release = fileMetaData.getRelease();
310 B2ERROR(
"Cannot determine release used to create " << std::quoted(input));
312 }
else if (fileMetaData.getRelease().ends_with(
"-modified")) {
313 B2WARNING(
"File " << std::quoted(input) <<
" created with modified software "
314 << fileMetaData.getRelease()
315 <<
": cannot verify that files are compatible");
316 release = release.substr(0, release.size() - std::string(
"-modified").size());
320 if (!outputMetaData) {
323 outputRelease = release;
326 if(release != outputRelease) {
327 B2ERROR(
"Release in " << std::quoted(input) <<
" differs from previous files: " <<
328 fileMetaData.getRelease() <<
" != " << outputMetaData->getRelease());
330 if(fileMetaData.getSteering() != outputMetaData->getSteering()){
332 B2ERROR(
"Steering file for " << std::quoted(input) <<
" differs from previous files.");
334 if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
339 if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
340 outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
342 B2ERROR(
"Database globalTag in " << std::quoted(input) <<
" differs from previous files: " <<
343 fileMetaData.getDatabaseGlobalTag() <<
" != " << outputMetaData->getDatabaseGlobalTag());
346 if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
348 for (
const auto& descrPair : outputMetaData->getDataDescription())
349 cur.put(descrPair.first, descrPair.second);
351 for (
const auto& descrPair : fileMetaData.getDataDescription())
352 prev.put(descrPair.first, descrPair.second);
354 B2ERROR(
"dataDescription in " << std::quoted(input) <<
" differs from previous files:\n" << cur.string() <<
" vs.\n" << prev.string());
356 if(fileMetaData.isMC() != outputMetaData->isMC()){
357 B2ERROR(
"Type (real/MC) for " << std::quoted(input) <<
" differs from previous files.");
360 outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
361 outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
362 outputMetaData->setNFullEvents(outputMetaData->getNFullEvents() + fileMetaData.getNFullEvents());
364 if(fileMetaData.getNEvents() < 1) {
365 B2WARNING(
"File " << std::quoted(input) <<
" is empty.");
368 EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
369 EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
370 if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
371 if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
374 auto it = allSeeds.insert(fileMetaData.getRandomSeed());
376 B2WARNING(
"Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) <<
" present in more then one file");
378 allUsers.insert(fileMetaData.getUser());
380 for (
int i = 0; i < fileMetaData.getNParents(); ++i) {
381 allParents.insert(fileMetaData.getParent(i));
383 }
catch(std::exception &e) {
384 B2ERROR(
"input file " << std::quoted(input) <<
": " << e.what());
389 for(
const auto &val: persistentMergeables){
390 if(val.second.second != inputfilenames.size()){
391 B2ERROR(
"Mergeable " << std::quoted(val.first) <<
" only present in " << val.second.second <<
" out of "
392 << inputfilenames.size() <<
" files");
397 if(allUsers.size()>1) {
398 B2WARNING(
"Multiple different users created input files: " << boost::algorithm::join(allUsers,
", "));
410 B2FATAL(
"For some reason no files could be processed");
414 B2WARNING(
"All Files were empty");
415 lowEvt = EventInfo{-1, -1, 0};
416 highEvt = EventInfo{-1, -1, 0};
420 outputMetaData->
setLfn(
"");
421 outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
422 outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
423 outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
425 if(inputfilenames.size()>1){
426 outputMetaData->setRandomSeed(
"");
430 outputMetaData->setRelease(outputRelease);
434 auto output = std::unique_ptr<TFile>{TFile::Open(outputfilename.c_str(),
"RECREATE")};
435 if (output ==
nullptr or output->IsZombie()) {
436 B2ERROR(
"Could not create output file " << std::quoted(outputfilename));
440 for (
const auto& treeName : allEventTrees) {
441 TTree* outputEventTree{
nullptr};
442 for (
const auto& input : inputfilenames) {
443 B2INFO(
"processing events from " << std::quoted(input +
":" + treeName));
444 auto tfile = std::unique_ptr<TFile>{TFile::Open(input.c_str(),
"READ")};
447 auto* tree =
dynamic_cast<TTree*
>(tfile->Get(treeName.c_str()));
448 if (!outputEventTree){
450 outputEventTree = tree->CloneTree(0);
452 outputEventTree->CopyAddresses(tree);
455 outputEventTree->CopyEntries(tree, -1, copyOptions.c_str());
457 outputEventTree->CopyAddresses(tree,
true);
462 assert(outputEventTree);
465 if(!outputEventTree->GetTreeIndex()) {
466 B2INFO(
"No Index found: building new index");
472 outputEventTree->Write();
475 if (outputMetaData->getNFullEvents() == 0) {
476 outputMetaData->setNFullEvents(outputEventTree->GetEntries(
"EventMetaData.m_errorFlag == 0"));
480 B2INFO(
"Done processing events");
483 if (!mergedHistograms.empty()) {
484 B2INFO(
"Writing histograms");
485 writeHistograms(*output, mergedHistograms);
486 for (
const auto& [name, histCount] : mergedHistograms) {
487 if (histCount.second != inputfilenames.size()) {
488 B2ERROR(
"Histogram " << std::quoted(name) <<
" only present in "
489 << histCount.second <<
" out of " << inputfilenames.size() <<
" files");
492 for (
auto& [name, histCount] : mergedHistograms) {
493 delete histCount.first;
495 mergedHistograms.clear();
499 outputMetaData->setLfn(fs::absolute(outputfilename).
string());
501 if(variables.count(
"add-to-catalog")>0) {
504 B2INFO(
"Writing FileMetaData");
508 outputMetaDataTree.Branch(
"FileMetaData", &outputMetaData);
509 for(
auto &it: persistentMergeables){
510 outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
512 outputMetaDataTree.Fill();
513 outputMetaDataTree.Write();
516 for(
const auto& val: persistentMergeables){
517 delete val.second.first;
519 persistentMergeables.clear();
520 auto outputMetaDataCopy = *outputMetaData;
521 delete outputMetaData;
@ c_Persistent
Object is available during entire execution time.
@ c_Event
Different object in each event, all objects/arrays are invalidated after event() function has been ca...
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
@ c_Info
Info: for informational messages, e.g.
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
@ c_Warning
Warning: for potential problems that the user should pay attention to.
@ c_Level
Log level of the message.
@ c_Message
Log message text.
LogConfig * getLogConfig()
Returns global log system configuration.
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Abstract base class for objects that can be merged.
Helper class to factorize some necessary tasks when working with Belle2 output files.
const std::string c_treeNames[]
Names of trees.
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes either EventMetaData branch exists or is a ntuple tree).
Abstract base class for different kinds of events.