Belle II Software development
b2file-merge.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8#include <framework/dataobjects/FileMetaData.h>
9#include <framework/io/RootIOUtilities.h>
10#include <framework/io/RootFileInfo.h>
11#include <framework/logging/Logger.h>
12#include <framework/pcore/Mergeable.h>
13#include <framework/core/FileCatalog.h>
14#include <framework/utilities/KeyValuePrinter.h>
15#include <framework/core/MetadataService.h>
16
17#include <boost/program_options.hpp>
18#include <boost/algorithm/string.hpp>
19
20#include <TFile.h>
21#include <TTree.h>
22#include <TBranchElement.h>
23
24#include <filesystem>
25#include <iostream>
26#include <iomanip>
27#include <memory>
28#include <string>
29#include <set>
30#include <regex>
31
32using namespace Belle2;
33namespace po = boost::program_options;
34namespace fs = std::filesystem;
35
/** Event identifier used to track the lowest/highest event of the merged
 *  file; main() fills it in the order (experiment, run, event). */
38using EventInfo = std::tuple<int, int, unsigned int>;
39
namespace {
  /** Remove the legacy globaltag from a comma separated list of globaltags.
   *
   * Every list element that is exactly "Legacy_IP_Information" is dropped,
   * all other elements (including empty ones) are kept in their original
   * order and re-joined with ','.
   *
   * In contrast to a plain textual replacement of ",?Legacy_IP_Information"
   * this does not leave a dangling separator behind when the legacy tag is
   * the first element of the list, and it never touches tags that merely
   * contain the legacy name as a substring.
   *
   * @param globaltags comma separated list of globaltag names
   * @return the same list without the legacy globaltag
   */
  std::string removeLegacyGt(const std::string& globaltags)
  {
    static const std::string legacyTag{"Legacy_IP_Information"};

    std::string cleaned;
    cleaned.reserve(globaltags.size());
    bool firstKept = true;
    std::string::size_type begin = 0;
    while (true) {
      const auto end = globaltags.find(',', begin);
      const auto length = (end == std::string::npos) ? std::string::npos : end - begin;
      // compare the whole element, not just a substring of it
      if (globaltags.compare(begin, length, legacyTag) != 0) {
        if (!firstKept) cleaned += ',';
        cleaned.append(globaltags, begin, length);
        firstKept = false;
      }
      if (end == std::string::npos) break;
      begin = end + 1;
    }
    return cleaned;
  }
}
49
/**
 * Entry point of the merge tool: combines several input ROOT files into one
 * output file.
 *
 * Steps visible in this function:
 *  1. parse the command line (output name, input names, flags) and print the
 *     usage text if help is requested or the output/input names are missing,
 *  2. loop over all inputs, compare tree and branch names, merge the
 *     Mergeable objects of the persistent tree and accumulate the FileMetaData,
 *  3. stop if any error message was logged so far,
 *  4. copy the entries of every event tree into the output file,
 *  5. write a new "persistent" tree holding the merged metadata and merged
 *     objects and report the output file to the MetadataService.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 1 after printing the usage text or when an error occurred
 */
50int main(int argc, char* argv[])
51{
52 // Parse options
53 std::string outputfilename;
54 std::vector<std::string> inputfilenames;
55 std::string jsonfilename;
56 po::options_description options("Options");
57 options.add_options()
58 ("help,h", "print all available options")
59 ("output,o", po::value<std::string>(&outputfilename), "output file name")
60 ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
61 ("force,f", "overwrite existing file")
62 ("no-catalog", "don't register output file in file catalog, This is now the default")
63 ("add-to-catalog", "register the output file in the file catalog")
64 ("job-information", po::value<std::string>(&jsonfilename), "create json file with metadata of output file and execution status")
65 ("quiet,q", "if given don't print infos, just warnings and errors");
 // the first positional argument is the output file, all remaining ones are input files
66 po::positional_options_description positional;
67 positional.add("output", 1);
68 positional.add("file", -1);
69 po::variables_map variables;
70 po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
71 po::notify(variables);
 // without an output name and at least one input there is nothing to do: show the usage text
72 if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
73 std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
74 std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
75 << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
76 std::cout << options << std::endl;
77 std::cout << (R"DOC(
78This program is intended to merge files created by separate basf2 jobs. It's
79similar to hadd but does correctly update the metadata in the file and merges
80the objects in the persistent tree correctly.
81
82The following restrictions apply:
83 - The files have to be created with the same release and steering file
84 - The persistent tree is only allowed to contain FileMetaData and objects
85 inheriting from Mergeable and the same list of objects needs to be present
86 in all files.
87 - The event tree needs to contain the same DataStore entries in all files.
88)DOC");
89 return 1;
90 }
91
92 //Initialize metadata service
 // NOTE(review): the statements inside this if are not visible in this listing;
 // presumably the json file name is handed to the MetadataService here - confirm
 // against the original source.
94 if (!jsonfilename.empty()) {
96 }
97
98 // Remove the {module:} from log messages
99 auto logConfig = LogSystem::Instance().getLogConfig();
 // NOTE(review): the statement opening the block closed by the next brace is
 // missing from this listing.
102 }
 // --quiet raises the log level so only warnings and errors are shown
103 if(variables.count("quiet")>0){
104 logConfig->setLogLevel(LogConfig::c_Warning);
105 }
106
107 B2INFO("Merging files into " << std::quoted(outputfilename));
108 // check output file
109 if (fs::exists(outputfilename) && variables.count("force")==0) {
110 B2ERROR("Output file exists, use -f to force overwriting it");
111 return 1;
112 }
113 // First we check all input files for consistency ...
114
115 // the final metadata we will write out
116 FileMetaData* outputMetaData{nullptr};
117 // set of all parent LFNs encountered in any file
118 std::set<std::string> allParents;
119 // map of all mergeable objects found in the persistent tree. The size_t is
120 // for counting to make sure we see all objects in all files
121 std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
122 // set of all random seeds to print warning on duplicates
123 std::set<std::string> allSeeds;
124 // set of all users
125 std::set<std::string> allUsers;
126 // EventInfo for the high/low event numbers of the final FileMetaData
127 std::optional<EventInfo> lowEvt, highEvt;
128 // map of sets of all branch names in the event trees to compare against to make sure
129 // that they're the same in all files
130 std::map<std::string, std::set<std::string>> allEventBranches;
131 // set of all ntuple trees names to compare against to make sure
132 // that they're the same in all files (if they exist)
133 std::set<std::string> allEventTrees;
134 // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
135 std::string outputRelease;
136
137 // so let's loop over all files and create FileMetaData and merge persistent
138 // objects if they inherit from Mergeable, bail if there's something else in
139 // there. The idea is that merging the persistent stuff is fast so we catch
140 // errors more quickly when we do this as a first step and events later on.
141 for (const auto& input : inputfilenames) {
142 try {
143 RootIOUtilities::RootFileInfo fileInfo(input);
144 // Ok, load the FileMetaData from the tree
145 const auto &fileMetaData = fileInfo.getFileMetaData();
146 auto description = fileMetaData.getDataDescription();
147 auto isNtuple = description.find("isNtupleMetaData");
148 // File looks usable, start checking metadata ...
149 B2INFO("adding file " << std::quoted(input));
150 if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
151 auto trees = fileInfo.getTreeNames();
 // the first file with trees defines the reference set, all later files have to match it
152 if(allEventTrees.empty()) {
153 std::swap(allEventTrees,trees);
154 }else{
155 if(trees!=allEventTrees){
156 B2ERROR("Trees in " << std::quoted(input) << " differ from "
157 << std::quoted(inputfilenames.front()));
158 continue;
159 }
160 }
161 for(const auto& tree : allEventTrees) {
 // the tree called "tree" is read with getBranchNames() unless the data description
 // flags the file as ntuple metadata; everything else uses getNtupleBranchNames()
162 auto branches = ((tree=="tree") &&
163 ((isNtuple==description.end()) || (isNtuple->second != "True"))
164 ) ? fileInfo.getBranchNames() : fileInfo.getNtupleBranchNames(tree);
165 if(branches.empty()) {
166 throw std::runtime_error("Could not find any branches in " + tree);
167 }
 // same scheme as for the trees: first occurrence is the reference, later ones are compared
168 if(allEventBranches[tree].empty()) {
169 std::swap(allEventBranches[tree],branches);
170 }else{
171 if(branches!=allEventBranches[tree]){
172 B2ERROR("Branches in " << std::quoted(input + ":" + tree) << " differ from "
173 << std::quoted(inputfilenames.front() + ":" + tree));
174 }
175 }
176 }
177 // File looks good so far, now fix the persistent stuff, i.e. merge all
178 // objects in persistent tree
179 for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
180 auto* br = dynamic_cast<TBranchElement*>(brObj);
181 // FileMetaData is handled separately
182 if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
183 continue;
184 // Make sure the branch is mergeable
185 if(!br) continue;
186 if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
187 B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergeable");
188 continue;
189 }
190 // Ok, it's an object we know how to handle so get it from the tree
191 Mergeable* object{nullptr};
192 br->SetAddress(&object);
193 if(br->GetEntry(0)<=0) {
194 B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
195 << std::quoted(input));
196 continue;
197 }
198 // and either insert it into the map of mergeables or merge with the existing one
 // (insert() keeps an already existing entry; it.second is false in that case)
199 auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
200 if(!it.second) {
201 try {
202 it.first->second.first->merge(object);
203 }catch(std::exception &e){
204 B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
205 }
206 it.first->second.second++;
207 // ok, merged, get rid of it.
208 delete object;
209 }else{
210 B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
211 }
212 }
213
 // determine the release of this file; a trailing "-modified" is stripped for the comparison
214 std::string release = fileMetaData.getRelease();
215 if(release == "") {
216 B2ERROR("Cannot determine release used to create " << std::quoted(input));
217 continue;
218 }else if(boost::algorithm::ends_with(fileMetaData.getRelease(), "-modified")){
219 B2WARNING("File " << std::quoted(input) << " created with modified software "
220 << fileMetaData.getRelease()
221 << ": cannot verify that files are compatible");
222 release = release.substr(0, release.size() - std::string("-modified").size());
223 }
224
225 // so, event tree looks good too. Now we merge the FileMetaData
226 if (!outputMetaData) {
227 // first input file, just take the event metadata
228 outputMetaData = new FileMetaData(fileMetaData);
229 outputRelease = release;
230 } else {
231 // check meta data for consistency, we could move this into FileMetaData...
232 if(release != outputRelease) {
233 B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
234 fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
235 }
236 if(fileMetaData.getSteering() != outputMetaData->getSteering()){
237 // printing both steering files is not useful for anyone so just throw an error
238 B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
239 }
240 if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
241 // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
242 // This means there is a chance we want to merge files with and without the globaltag added if they cross the
243 // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
244 // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
245 if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
246 outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
247 } else {
248 B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
249 fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
250 }
251 }
252 if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
 // NOTE(review): "cur" is filled from the accumulated output metadata and "prev" from
 // the file being added - the names look swapped relative to their content; confirm.
253 KeyValuePrinter cur(true);
254 for (const auto& descrPair : outputMetaData->getDataDescription())
255 cur.put(descrPair.first, descrPair.second);
256 KeyValuePrinter prev(true);
257 for (const auto& descrPair : fileMetaData.getDataDescription())
258 prev.put(descrPair.first, descrPair.second);
259
260 B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
261 }
262 if(fileMetaData.isMC() != outputMetaData->isMC()){
263 B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
264 }
265 // update event numbers ...
266 outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
267 outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
268 outputMetaData->setNFullEvents(outputMetaData->getNFullEvents() + fileMetaData.getNFullEvents());
269 }
 // empty files do not contribute to the low/high event range
270 if(fileMetaData.getNEvents() < 1) {
271 B2WARNING("File " << std::quoted(input) << " is empty.");
272 } else {
273 // make sure we have the correct low/high event numbers
274 EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
275 EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
276 if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
277 if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
278 }
279 // check if we have seen this random seed already in one of the previous files
280 auto it = allSeeds.insert(fileMetaData.getRandomSeed());
281 if(!it.second) {
282 B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
283 }
284 allUsers.insert(fileMetaData.getUser());
285 // remember all parent files we encounter
286 for (int i = 0; i < fileMetaData.getNParents(); ++i) {
287 allParents.insert(fileMetaData.getParent(i));
288 }
 // any exception while handling one input is logged as an error and the loop
 // continues with the next file
289 }catch(std::exception &e) {
290 B2ERROR("input file " << std::quoted(input) << ": " << e.what());
291 }
292 }
293
294 //Check if the same mergeables were found in all files
295 for(const auto &val: persistentMergeables){
296 if(val.second.second != inputfilenames.size()){
297 B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
298 << inputfilenames.size() << " files");
299 }
300 }
301
302 // Check for user names
303 if(allUsers.size()>1) {
304 B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
305 }
306
307 // Stop processing in case of error
308 if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
309
310 if(!outputMetaData){
311 // technically it's rather impossible to arrive here: if there were no
312 // input files we exit with a usage message and if any of the files could
313 // not be processed then the error count should be >0. Nevertheless
314 // let's do this check to be on the very safe side and to make clang
315 // analyzer happy.
316 B2FATAL("For some reason no files could be processed");
317 return 1;
318 }
 // lowEvt is only set by non-empty files, so an unset value means every input was empty
319 if(!lowEvt) {
320 B2WARNING("All Files were empty");
321 lowEvt = EventInfo{-1, -1, 0};
322 highEvt = EventInfo{-1, -1, 0};
323 }
324
325 // Final changes to metadata
326 outputMetaData->setLfn("");
327 outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
328 outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
329 outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
330 // If more than one file set an empty random seed
331 if(inputfilenames.size()>1){
332 outputMetaData->setRandomSeed("");
333 }
334 RootIOUtilities::setCreationData(*outputMetaData);
335 // Set (again) the release, since it's overwritten by the previous line
336 outputMetaData->setRelease(outputRelease);
337
338 // OK we have a valid FileMetaData and merged all persistent objects, now do
339 // the conversion of the event trees and create the output file.
340 auto output = std::unique_ptr<TFile>{TFile::Open(outputfilename.c_str(), "RECREATE")};
341 if (output == nullptr or output->IsZombie()) {
342 B2ERROR("Could not create output file " << std::quoted(outputfilename));
343 return 1;
344 }
345
 // one output tree per event tree name; each is filled from all inputs in command line order
346 for (const auto& treeName : allEventTrees) {
347 TTree* outputEventTree{nullptr};
348 for (const auto& input : inputfilenames) {
349 B2INFO("processing events from " << std::quoted(input + ":" + treeName));
350 auto tfile = std::unique_ptr<TFile>{TFile::Open(input.c_str(), "READ")};
351 // At this point, we already checked that the input files are valid and exist
352 // so it's safe to access tfile directly
353 auto* tree = dynamic_cast<TTree*>(tfile->Get(treeName.c_str()));
 // first input: make the output file the current directory and clone the tree
 // layout via CloneTree(0); later inputs only connect their branch addresses
354 if (!outputEventTree){
355 output->cd();
356 outputEventTree = tree->CloneTree(0);
357 } else {
358 outputEventTree->CopyAddresses(tree);
359 }
360 // Now let's copy all entries without unpacking (fast), layout the
361 // baskets in an optimal order for sequential reading (SortBasketByEntry)
362 // and rebuild the index in case some parts of the index are missing
363 outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
364 // and reset the branch addresses to not be connected anymore
365 outputEventTree->CopyAddresses(tree, true);
366 // finally clean up and close file.
367 delete tree;
368 tfile->Close();
369 }
370 assert(outputEventTree);
371 // make sure we have an index ...
372 if(!outputEventTree->GetTreeIndex()) {
373 B2INFO("No Index found: building new index");
374 RootIOUtilities::buildIndex(outputEventTree);
375 }
376 // and finally write the tree
377 output->cd();
378 outputEventTree->Write();
379 // check if the number of full events in the metadata is zero:
380 // if so calculate number of full events now:
381 if (outputMetaData->getNFullEvents() == 0) {
382 outputMetaData->setNFullEvents(outputEventTree->GetEntries("EventMetaData.m_errorFlag == 0"));
383 }
384 }
385
386 B2INFO("Done processing events");
387
388 // we need to set the LFN to the absolute path name
389 outputMetaData->setLfn(fs::absolute(outputfilename).string());
390 // and maybe register it in the file catalog
391 if(variables.count("add-to-catalog")>0) {
392 FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
393 }
394 B2INFO("Writing FileMetaData");
395 // Create persistent tree
396 output->cd();
397 TTree outputMetaDataTree("persistent", "persistent");
398 outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
399 for(auto &it: persistentMergeables){
400 outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
401 }
 // the persistent tree gets exactly one entry holding the metadata and all merged objects
402 outputMetaDataTree.Fill();
403 outputMetaDataTree.Write();
404
405 // now clean up the mess ...
406 for(const auto& val: persistentMergeables){
407 delete val.second.first;
408 }
409 persistentMergeables.clear();
 // keep a copy: the original object is deleted here but the metadata is still
 // needed for the MetadataService call below
410 auto outputMetaDataCopy = *outputMetaData;
411 delete outputMetaData;
412 output->Close();
413
414 // and now add it to the metadata service
415 MetadataService::Instance().addRootOutputFile(outputfilename, &outputMetaDataCopy, "b2file-merge");
416
417 // report completion in job metadata
418 MetadataService::Instance().addBasf2Status("finished successfully");
420}
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
Definition: FileCatalog.cc:23
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Definition: FileCatalog.cc:90
Metadata information about a file.
Definition: FileMetaData.h:29
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition: LogConfig.h:30
@ c_Info
Info: for informational messages, e.g.
Definition: LogConfig.h:27
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition: LogConfig.h:31
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition: LogConfig.h:29
@ c_Level
Log level of the message.
Definition: LogConfig.h:36
@ c_Message
Log message text.
Definition: LogConfig.h:37
void setLogInfo(ELogLevel logLevel, unsigned int logInfo)
Configure the printed log information for the given level.
Definition: LogConfig.h:127
LogConfig * getLogConfig()
Returns global log system configuration.
Definition: LogSystem.h:78
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition: LogSystem.cc:28
Abstract base class for objects that can be merged.
Definition: Mergeable.h:31
void addRootOutputFile(const std::string &fileName, const FileMetaData *metaData=nullptr, const char *type="RootOutput")
Add the metadata of a root output file.
void addBasf2Status(const std::string &message="")
Add metadata of basf2 status.
void setJsonFileName(const std::string &fileName)
Set the name of the json metadata file.
static MetadataService & Instance()
Static method to get a reference to the MetadataService instance.
void finishBasf2(bool success=true)
Add metadata for basf2 completion.
Helper class to factorize some necessary tasks when working with Belle2 output files.
Definition: RootFileInfo.h:27
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes either EventMetaData branch exists or is a ntuple tree).
Abstract base class for different kinds of events.