Belle II Software release-09-00-01
b2file-merge.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8#include <framework/dataobjects/FileMetaData.h>
9#include <framework/io/RootIOUtilities.h>
10#include <framework/io/RootFileInfo.h>
11#include <framework/logging/Logger.h>
12#include <framework/pcore/Mergeable.h>
13#include <framework/core/FileCatalog.h>
14#include <framework/utilities/KeyValuePrinter.h>
15
16#include <boost/program_options.hpp>
17#include <boost/algorithm/string.hpp>
18
19#include <TFile.h>
20#include <TTree.h>
21#include <TBranchElement.h>
22
23#include <filesystem>
24#include <iostream>
25#include <iomanip>
26#include <memory>
27#include <string>
28#include <set>
29#include <regex>
30
31using namespace Belle2;
32namespace po = boost::program_options;
33namespace fs = std::filesystem;
34
/** Event identifier stored as a tuple. Below it is filled with (experiment,
 * run, event) taken from the FileMetaData low/high getters and compared with
 * the tuple's built-in operator< / operator> to find the overall lowest and
 * highest event of all input files. */
37using EventInfo = std::tuple<int, int, unsigned int>;
38
namespace {
  /** Remove the legacy globaltag from a comma separated list of globaltags.
   *
   * Only entries which are exactly equal to "Legacy_IP_Information" are
   * dropped; all other entries (including empty ones) are kept in their
   * original order and re-joined with a single comma. Comparing whole entries
   * instead of substrings makes sure that
   *  - no dangling comma is left behind if the legacy tag is the first entry,
   *  - tags which merely contain the legacy name as a substring are untouched.
   *
   * @param globaltags comma separated list of globaltag names
   * @return the same list without any "Legacy_IP_Information" entry
   */
  std::string removeLegacyGt(const std::string& globaltags)
  {
    static const std::string legacyTag{"Legacy_IP_Information"};

    std::string cleaned;
    cleaned.reserve(globaltags.size());
    // becomes true once the first entry was kept, from then on each kept
    // entry needs to be preceded by a comma
    bool needSeparator = false;
    std::string::size_type tokenBegin = 0;
    while (true) {
      const auto tokenEnd = globaltags.find(',', tokenBegin);
      const auto tokenLength = (tokenEnd == std::string::npos ? globaltags.size() : tokenEnd) - tokenBegin;
      // compare in place to avoid creating a temporary string per entry
      if (globaltags.compare(tokenBegin, tokenLength, legacyTag) != 0) {
        if (needSeparator) cleaned += ',';
        cleaned.append(globaltags, tokenBegin, tokenLength);
        needSeparator = true;
      }
      if (tokenEnd == std::string::npos) break;
      tokenBegin = tokenEnd + 1;
    }
    return cleaned;
  }
}
48
/** Merge several basf2 output files into one output file.
 *
 * Steps as visible in the code below:
 *  1. Parse the command line (first positional argument is the output file,
 *     all further ones are input files). Print the usage and return 1 if help
 *     is requested, no output file is given or there are no input files.
 *  2. Refuse to overwrite an existing output file unless "force" is given.
 *  3. First loop over all input files: compare the event tree branch names,
 *     merge all Mergeable objects of the persistent tree and check/accumulate
 *     the FileMetaData (release, steering, globaltag, data description, MC
 *     flag, event counts, low/high event, seeds, users, parents).
 *  4. Return 1 if any error message was counted by the LogSystem so far.
 *  5. Second loop over all input files: copy the entries of the "tree" event
 *     tree into the output file, build an index if none exists and write it.
 *  6. Write the "persistent" tree containing the merged FileMetaData and the
 *     merged Mergeable objects; register the file in the file catalog only if
 *     "add-to-catalog" is given.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 1 on usage output or on any of the failures described above; there
 *         is no explicit return statement on the success path.
 */
49int main(int argc, char* argv[])
50{
51 // Parse options
52 std::string outputfilename;
53 std::vector<std::string> inputfilenames;
54 po::options_description options("Options");
 // NOTE(review): "no-catalog" is declared here but never queried anywhere in
 // the visible code; only "add-to-catalog" is checked further down.
55 options.add_options()
56 ("help,h", "print all available options")
57 ("output,o", po::value<std::string>(&outputfilename), "output file name")
58 ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
59 ("force,f", "overwrite existing file")
60 ("no-catalog", "don't register output file in file catalog, This is now the default")
61 ("add-to-catalog", "register the output file in the file catalog")
62 ("quiet,q", "if given don't print infos, just warnings and errors");
 // first positional argument is the output file, all remaining ones are inputs
63 po::positional_options_description positional;
64 positional.add("output", 1);
65 positional.add("file", -1);
66 po::variables_map variables;
 // NOTE(review): parsing is not wrapped in a try block here, so anything
 // thrown by the parser would leave main() uncaught -- confirm this is intended
67 po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
68 po::notify(variables);
69 if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
70 std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
71 std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
72 << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
73 std::cout << options << std::endl;
74 std::cout << (R"DOC(
75This program is intended to merge files created by separate basf2 jobs. It's
76similar to hadd but does correctly update the metadata in the file and merges
77the objects in the persistent tree correctly.
78
79The following restrictions apply:
80 - The files have to be created with the same release and steering file
81 - The persistent tree is only allowed to contain FileMetaData and objects
82 inheriting from Mergeable and the same list of objects needs to be present
83 in all files.
84 - The event tree needs to contain the same DataStore entries in all files.
85)DOC");
 // the usage path returns 1 even if help was requested explicitly
86 return 1;
87 }
88
89 // Remove the {module:} from log messages
90 auto logConfig = LogSystem::Instance().getLogConfig();
 // NOTE(review): the closing brace below has no visible opening statement; the
 // embedded listing numbers jump from 90 to 93, so the statement it belongs to
 // seems to be missing from this copy of the file -- verify against the
 // original source.
93 }
94 if(variables.count("quiet")>0){
95 logConfig->setLogLevel(LogConfig::c_Warning);
96 }
97
98 B2INFO("Merging files into " << std::quoted(outputfilename));
99 // check output file
100 if (fs::exists(outputfilename) && variables.count("force")==0) {
101 B2ERROR("Output file exists, use -f to force overwriting it");
102 return 1;
103 }
104 // First we check all input files for consistency ...
105
106 // the final metadata we will write out
107 FileMetaData* outputMetaData{nullptr};
108 // set of all parent LFNs encountered in any file
109 std::set<std::string> allParents;
110 // map of all mergeable objects found in the persistent tree. The size_t is
111 // for counting to make sure we see all objects in all files
112 std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
113 // set of all random seeds to print warning on duplicates
114 std::set<std::string> allSeeds;
115 // set of all users
116 std::set<std::string> allUsers;
117 // EventInfo for the high/low event numbers of the final FileMetaData
 // NOTE(review): std::optional, std::map, std::tuple and std::vector are used
 // in this function but <optional>, <map>, <tuple> and <vector> are not among
 // the visible includes; presumably they arrive transitively -- confirm.
118 std::optional<EventInfo> lowEvt, highEvt;
119 // set of all branch names in the event tree to compare against to make sure
120 // that they're the same in all files
121 std::set<std::string> allEventBranches;
122 // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
123 std::string outputRelease;
124
125 // so let's loop over all files and create FileMetaData and merge persistent
126 // objects if they inherit from Mergeable, bail if there's something else in
127 // there. The idea is that merging the persistent stuff is fast so we catch
128 // errors more quickly when we do this as a first step and events later on.
129 for (const auto& input : inputfilenames) {
130 try {
131 RootIOUtilities::RootFileInfo fileInfo(input);
132 // Ok, load the FileMetaData from the tree
133 const auto &fileMetaData = fileInfo.getFileMetaData();
134 // File looks usable, start checking metadata ...
135 B2INFO("adding file " << std::quoted(input));
136 if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
137
138 auto branches = fileInfo.getBranchNames();
139 if(branches.empty()) {
140 throw std::runtime_error("Could not find any branches in event tree");
141 }
 // the first file with branches defines the reference list, all later files
 // are compared against it
142 if(allEventBranches.empty()) {
143 std::swap(allEventBranches,branches);
144 }else{
145 if(branches!=allEventBranches){
146 B2ERROR("Branches in " << std::quoted(input) << " differ from "
147 << std::quoted(inputfilenames.front()));
148 }
149 }
150
151 // File looks good so far, now fix the persistent stuff, i.e. merge all
152 // objects in persistent tree
153 for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
154 auto* br = dynamic_cast<TBranchElement*>(brObj);
155 // FileMetaData is handled separately
156 if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
157 continue;
158 // Make sure the branch is mergeable
 // branches which are not a TBranchElement are skipped without any message
159 if(!br) continue;
160 if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
161 B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergable");
162 continue;
163 }
164 // Ok, it's an object we know how to handle so get it from the tree
165 Mergeable* object{nullptr};
166 br->SetAddress(&object);
167 if(br->GetEntry(0)<=0) {
168 B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
169 << std::quoted(input));
170 continue;
171 }
172 // and either insert it into the map of mergeables or merge with the existing one
173 auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
174 if(!it.second) {
175 try {
176 it.first->second.first->merge(object);
177 }catch(std::exception &e){
178 B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
179 }
180 it.first->second.second++;
181 // ok, merged, get rid of it.
182 delete object;
183 }else{
 // first occurrence: the map keeps the pointer, it is deleted in the cleanup
 // at the very end of main()
184 B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
185 }
186 }
187
188 std::string release = fileMetaData.getRelease();
189 if(release == "") {
190 B2ERROR("Cannot determine release used to create " << std::quoted(input));
 // skip the remaining metadata checks for this file
191 continue;
192 }else if(boost::algorithm::ends_with(fileMetaData.getRelease(), "-modified")){
193 B2WARNING("File " << std::quoted(input) << " created with modified software "
194 << fileMetaData.getRelease()
195 << ": cannot verify that files are compatible");
 // strip the "-modified" suffix so that only the base release is compared
196 release = release.substr(0, release.size() - std::string("-modified").size());
197 }
198
199 // so, event tree looks good too. Now we merge the FileMetaData
200 if (!outputMetaData) {
201 // first input file, just take the event metadata
202 outputMetaData = new FileMetaData(fileMetaData);
203 outputRelease = release;
204 } else {
205 // check meta data for consistency, we could move this into FileMetaData...
206 if(release != outputRelease) {
207 B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
208 fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
209 }
210 if(fileMetaData.getSteering() != outputMetaData->getSteering()){
211 // printing both steering files is not useful for anyone so just report an error
212 B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
213 }
214 if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
215 // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
216 // This means there is a chance we want to merge files with and without the globaltag added if they cross the
217 // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
218 // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
219 if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
220 outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
221 } else {
222 B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
223 fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
224 }
225 }
226 if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
 // NOTE(review): "cur" is filled from outputMetaData (i.e. the previous files)
 // and "prev" from the file currently being checked, so the variable names
 // look swapped with respect to their content.
227 KeyValuePrinter cur(true);
228 for (const auto& descrPair : outputMetaData->getDataDescription())
229 cur.put(descrPair.first, descrPair.second);
230 KeyValuePrinter prev(true);
231 for (const auto& descrPair : fileMetaData.getDataDescription())
232 prev.put(descrPair.first, descrPair.second);
233
234 B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
235 }
236 if(fileMetaData.isMC() != outputMetaData->isMC()){
237 B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
238 }
239 // update event numbers ...
240 outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
241 outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
242 outputMetaData->setNFullEvents(outputMetaData->getNFullEvents() + fileMetaData.getNFullEvents());
243 }
244 if(fileMetaData.getNEvents() < 1) {
245 B2WARNING("File " << std::quoted(input) << " is empty.");
246 } else {
247 // make sure we have the correct low/high event numbers
248 EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
249 EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
250 if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
251 if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
252 }
253 // check if we have seen this random seed already in one of the previous files
254 auto it = allSeeds.insert(fileMetaData.getRandomSeed());
255 if(!it.second) {
256 B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
257 }
258 allUsers.insert(fileMetaData.getUser());
259 // remember all parent files we encounter
260 for (int i = 0; i < fileMetaData.getNParents(); ++i) {
261 allParents.insert(fileMetaData.getParent(i));
262 }
263 }catch(std::exception &e) {
 // any exception while inspecting a file is turned into an error message, the
 // loop then continues with the next input file
264 B2ERROR("input file " << std::quoted(input) << ": " << e.what());
265 }
266 }
267
268 //Check if the same mergeables were found in all files
269 for(const auto &val: persistentMergeables){
270 if(val.second.second != inputfilenames.size()){
271 B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
272 << inputfilenames.size() << " files");
273 }
274 }
275
276 // Check for user names
277 if(allUsers.size()>1) {
278 B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
279 }
280
281 // Stop processing in case of error
282 if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
283
284 if(!outputMetaData){
285 // technically it's rather impossible to arrive here: if there were no
286 // input files we exit with a usage message and if any of the files could
287 // not be processed then the error count should be >0. Nevertheless
288 // let's do this check to be on the very safe side and to make clang
289 // analyzer happy.
290 B2FATAL("For some reason no files could be processed");
291 return 1;
292 }
293 if(!lowEvt) {
294 B2WARNING("All Files were empty");
 // lowEvt and highEvt are only ever set together in the loop above, so setting
 // both here covers the case that no file contained events
295 lowEvt = EventInfo{-1, -1, 0};
296 highEvt = EventInfo{-1, -1, 0};
297 }
298
299 // Final changes to metadata
300 outputMetaData->setLfn("");
301 outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
302 outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
303 outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
304 // If more than one file set an empty random seed
305 if(inputfilenames.size()>1){
306 outputMetaData->setRandomSeed("");
307 }
308 RootIOUtilities::setCreationData(*outputMetaData);
309
310 // OK we have a valid FileMetaData and merged all persistent objects, now do
311 // the conversion of the event trees and create the output file.
312 auto output = std::unique_ptr<TFile>{TFile::Open(outputfilename.c_str(), "RECREATE")};
313 if (output == nullptr or output->IsZombie()) {
314 B2ERROR("Could not create output file " << std::quoted(outputfilename));
315 return 1;
316 }
317
318 TTree* outputEventTree{nullptr};
319 for (const auto& input : inputfilenames) {
320 B2INFO("processing events from " << std::quoted(input));
321 auto tfile = std::unique_ptr<TFile>{TFile::Open(input.c_str(), "READ")};
322 // At this point, we already checked that the input files are valid and exist
323 // so it's safe to access tfile directly
 // NOTE(review): neither tfile nor the result of the dynamic_cast below is
 // checked for nullptr; this relies entirely on the checks of the first loop
324 auto* tree = dynamic_cast<TTree*>(tfile->Get("tree"));
325 if (!outputEventTree){
 // first file: create the output tree as an empty clone inside the output file
326 output->cd();
327 outputEventTree = tree->CloneTree(0);
328 } else {
329 outputEventTree->CopyAddresses(tree);
330 }
331 // Now let's copy all entries without unpacking (fast), layout the
332 // baskets in an optimal order for sequential reading (SortBasketsByEntry)
333 // and rebuild the index in case some parts of the index are missing
334 outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
335 // and reset the branch addresses to not be connected anymore
336 outputEventTree->CopyAddresses(tree, true);
337 // finally clean up and close file.
338 delete tree;
339 tfile->Close();
340 }
 // NOTE(review): assert() is used here but <cassert> is not among the visible
 // includes; presumably it is pulled in transitively -- confirm.
341 assert(outputEventTree);
342 // make sure we have an index ...
343 if(!outputEventTree->GetTreeIndex()) {
344 B2INFO("No Index found: building new index");
345 RootIOUtilities::buildIndex(outputEventTree);
346 }
347 // and finally write the tree
348 output->cd();
349 outputEventTree->Write();
350 B2INFO("Done processing events");
351
352 // check if the number of full events in the metadata is zero:
353 // if so calculate number of full events now:
354 if (outputMetaData->getNFullEvents() == 0) {
355 outputMetaData->setNFullEvents(outputEventTree->GetEntries("EventMetaData.m_errorFlag == 0"));
356 }
357
358 // we need to set the LFN to the absolute path name
359 outputMetaData->setLfn(fs::absolute(outputfilename).string());
360 // and maybe register it in the file catalog
361 if(variables.count("add-to-catalog")>0) {
362 FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
363 }
364 B2INFO("Writing FileMetaData");
365 // Create persistent tree
366 output->cd();
367 TTree outputMetaDataTree("persistent", "persistent");
368 outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
 // one branch per merged object, named like the branch it was read from
369 for(auto &it: persistentMergeables){
370 outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
371 }
372 outputMetaDataTree.Fill();
373 outputMetaDataTree.Write();
374
375 // now clean up the mess ...
376 for(const auto& val: persistentMergeables){
377 delete val.second.first;
378 }
379 persistentMergeables.clear();
380 delete outputMetaData;
381 output->Close();
382}
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
Definition: FileCatalog.cc:23
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Definition: FileCatalog.cc:90
Metadata information about a file.
Definition: FileMetaData.h:29
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition: LogConfig.h:30
@ c_Info
Info: for informational messages, e.g.
Definition: LogConfig.h:27
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition: LogConfig.h:31
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition: LogConfig.h:29
@ c_Level
Log level of the message.
Definition: LogConfig.h:36
@ c_Message
Log message text.
Definition: LogConfig.h:37
void setLogInfo(ELogLevel logLevel, unsigned int logInfo)
Configure the printed log information for the given level.
Definition: LogConfig.h:127
LogConfig * getLogConfig()
Returns global log system configuration.
Definition: LogSystem.h:78
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition: LogSystem.cc:31
Abstract base class for objects that can be merged.
Definition: Mergeable.h:31
Helper class to factorize some necessary tasks when working with Belle2 output files.
Definition: RootFileInfo.h:27
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes EventMetaData branch exists there).
Abstract base class for different kinds of events.