Belle II Software  light-2212-foldex
b2file-merge.cc
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 #include <framework/dataobjects/FileMetaData.h>
9 #include <framework/io/RootIOUtilities.h>
10 #include <framework/io/RootFileInfo.h>
11 #include <framework/logging/Logger.h>
12 #include <framework/pcore/Mergeable.h>
13 #include <framework/core/FileCatalog.h>
14 #include <framework/utilities/KeyValuePrinter.h>
15 
16 #include <boost/program_options.hpp>
17 #include <boost/filesystem.hpp>
18 #include <boost/algorithm/string.hpp>
19 
20 #include <TFile.h>
21 #include <TTree.h>
22 #include <TBranchElement.h>
23 
24 #include <iostream>
25 #include <iomanip>
26 #include <string>
27 #include <set>
28 #include <regex>
29 
30 using namespace Belle2;
31 namespace po = boost::program_options;
32 namespace fs = boost::filesystem;
33 
/** Event identifier stored as an (experiment, run, event) tuple. main() fills it from the
 * low/high experiment, run and event numbers of each file's FileMetaData and relies on the
 * lexicographic ordering of std::tuple to find the overall lowest and highest event. */
36 using EventInfo = std::tuple<int, int, unsigned int>;
37 
namespace {
  /** Remove the "Legacy_IP_Information" entry from a comma separated list of globaltags.
   *
   * The entry is removed together with exactly one adjacent separator so that the
   * remaining list stays well formed:
   *  - "a,Legacy_IP_Information"    -> "a"
   *  - "a,Legacy_IP_Information,b"  -> "a,b"
   *  - "Legacy_IP_Information,b"    -> "b"   (previously yielded ",b")
   *  - "Legacy_IP_Information"      -> ""
   *
   * @param globaltags comma separated list of globaltag names
   * @return copy of the list without the legacy globaltag
   */
  std::string removeLegacyGt(const std::string& globaltags)
  {
    // The first alternative handles the tag at the very beginning of the list, where the
    // separator to drop is the comma *after* it. Everything else is covered by the second
    // alternative which drops the comma in front of the tag (if any).
    // The regex is compiled only once instead of on every call.
    static const std::regex legacy_gt("^Legacy_IP_Information,|,?Legacy_IP_Information");
    return std::regex_replace(globaltags, legacy_gt, "");
  }
}
47 
/**
 * Entry point of the merge tool.
 *
 * Parses the command line, then makes a first pass over all input files to check them for
 * consistency while merging their FileMetaData and the Mergeable objects of the persistent
 * tree. Only if no error was logged during that pass are the event trees copied into the
 * output file, followed by the merged persistent tree.
 *
 * Returns 1 when the usage text is printed, when the output file exists and --force was not
 * given, when any error was logged during the consistency pass, when no metadata could be
 * collected, or when the output file cannot be created.
 */
48 int main(int argc, char* argv[])
49 {
50  // Parse options
51  std::string outputfilename;
52  std::vector<std::string> inputfilenames;
53  po::options_description options("Options");
54  options.add_options()
55  ("help,h", "print all available options")
56  ("output,o", po::value<std::string>(&outputfilename), "output file name")
57  ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
58  ("force,f", "overwrite existing file")
59  ("no-catalog", "don't register output file in file catalog, This is now the default")
60  ("add-to-catalog", "register the output file in the file catalog")
61  ("quiet,q", "if given don't print infos, just warnings and errors");
  // The first positional argument is taken as the output file, all remaining ones as input files.
62  po::positional_options_description positional;
63  positional.add("output", 1);
64  positional.add("file", -1);
65  po::variables_map variables;
66  po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
67  po::notify(variables);
  // Print the usage text (and exit with 1) if help was requested or output/input names are missing.
68  if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
69  std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
70  std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
71  << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
72  std::cout << options << std::endl;
73  std::cout << (R"DOC(
74 This program is intended to merge files created by separate basf2 jobs. It's
75 similar to hadd but does correctly update the metadata in the file and merges
76 the objects in the persistent tree correctly.
77 
78 The following restrictions apply:
79  - The files have to be created with the same release and steering file
80  - The persistent tree is only allowed to contain FileMetaData and objects
81  inheriting from Mergeable and the same list of objects needs to be present
82  in all files.
83  - The event tree needs to contain the same DataStore entries in all files.
84 )DOC");
85  return 1;
86  }
87 
88  // Remove the {module:} from log messages
89  auto logConfig = LogSystem::Instance().getLogConfig();
  // NOTE(review): this listing jumps from line 89 to line 92, so the statement(s) that the
  // closing brace below belongs to are not visible here. Presumably a loop over the log
  // levels calling logConfig->setLogInfo() -- confirm against the original file.
92  }
  // With --quiet only warnings and more severe messages are shown.
93  if(variables.count("quiet")>0){
94  logConfig->setLogLevel(LogConfig::c_Warning);
95  }
96 
97  B2INFO("Merging files into " << std::quoted(outputfilename));
98  // check output file
99  if (fs::exists(outputfilename) && variables.count("force")==0) {
100  B2ERROR("Output file exists, use -f to force overwriting it");
101  return 1;
102  }
103  // First we check all input files for consistency ...
104 
105  // the final metadata we will write out
106  FileMetaData* outputMetaData{nullptr};
107  // set of all parent LFNs encountered in any file
108  std::set<std::string> allParents;
109  // map of all mergeable objects found in the persistent tree. The size_t is
110  // for counting to make sure we see all objects in all files
111  std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
112  // set of all random seeds to print warning on duplicates
113  std::set<std::string> allSeeds;
114  // set of all users
115  std::set<std::string> allUsers;
116  // EventInfo for the high/low event numbers of the final FileMetaData
  // NOTE(review): <optional> (as well as <map>, <tuple>, <vector>) is not among the includes
  // visible at the top of this file; presumably pulled in transitively -- confirm.
117  std::optional<EventInfo> lowEvt, highEvt;
118  // set of all branch names in the event tree to compare against to make sure
119  // that they're the same in all files
120  std::set<std::string> allEventBranches;
121  // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
122  std::string outputRelease;
123 
124  // so let's loop over all files and create FileMetaData and merge persistent
125  // objects if they inherit from Mergeable, bail if there's something else in
126  // there. The idea is that merging the persistent stuff is fast so we catch
127  // errors more quickly when we do this as a first step and events later on.
128  for (const auto& input : inputfilenames) {
  // Any std::exception thrown while inspecting this file is turned into a B2ERROR by the
  // catch at the end of the loop body, so the remaining files are still checked.
129  try {
130  RootIOUtilities::RootFileInfo fileInfo(input);
131  // Ok, load the FileMetaData from the tree
132  const auto &fileMetaData = fileInfo.getFileMetaData();
133  // File looks usable, start checking metadata ...
134  B2INFO("adding file " << std::quoted(input));
135  if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
136 
  // The branch names of the first file become the reference every later file is compared to.
137  auto branches = fileInfo.getBranchNames();
138  if(branches.empty()) {
139  throw std::runtime_error("Could not find any branches in event tree");
140  }
141  if(allEventBranches.empty()) {
142  std::swap(allEventBranches,branches);
143  }else{
144  if(branches!=allEventBranches){
145  B2ERROR("Branches in " << std::quoted(input) << " differ from "
146  << std::quoted(inputfilenames.front()));
147  }
148  }
149 
150  // File looks good so far, now fix the persistent stuff, i.e. merge all
151  // objects in persistent tree
152  for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
153  auto* br = dynamic_cast<TBranchElement*>(brObj);
154  // FileMetaData is handled separately
155  if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
156  continue;
157  // Make sure the branch is mergeable
  // Entries that are not a TBranchElement are skipped without any message.
158  if(!br) continue;
159  if(!br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
  // NOTE(review): the message below spells "Mergable"; it is runtime text and left as is.
160  B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergable");
161  continue;
162  }
163  // Ok, it's an object we know how to handle so get it from the tree
164  Mergeable* object{nullptr};
165  br->SetAddress(&object);
166  if(br->GetEntry(0)<=0) {
167  B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
168  << std::quoted(input));
169  continue;
170  }
171  // and either insert it into the map of mergeables or merge with the existing one
  // A newly inserted object starts with a file count of 1 and stays owned by the map
  // until the cleanup at the end of main().
172  auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
173  if(!it.second) {
174  try {
175  it.first->second.first->merge(object);
176  }catch(std::exception &e){
177  B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
178  }
179  it.first->second.second++;
180  // ok, merged, get rid of it.
181  delete object;
182  }else{
183  B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
184  }
185  }
186 
  // Determine the release used for comparison: an empty release is an error and the rest of
  // the checks for this file are skipped; a trailing "-modified" is stripped after a warning.
187  std::string release = fileMetaData.getRelease();
188  if(release == "") {
189  B2ERROR("Cannot determine release used to create " << std::quoted(input));
190  continue;
191  }else if(boost::algorithm::ends_with(fileMetaData.getRelease(), "-modified")){
192  B2WARNING("File " << std::quoted(input) << " created with modified software "
193  << fileMetaData.getRelease()
194  << ": cannot verify that files are compatible");
195  release = release.substr(0, release.size() - std::string("-modified").size());
196  }
197 
198  // so, event tree looks good too. Now we merge the FileMetaData
199  if (!outputMetaData) {
200  // first input file, just take the event metadata
201  outputMetaData = new FileMetaData(fileMetaData);
202  outputRelease = release;
203  } else {
204  // check meta data for consistency, we could move this into FileMetaData...
205  if(release != outputRelease) {
206  B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
207  fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
208  }
209  if(fileMetaData.getSteering() != outputMetaData->getSteering()){
210  // printing both steering files is not useful for anyone so just throw an error
211  B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
212  }
213  if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
214  // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
215  // This means there is a chance we want to merge files with and without the globaltag added if they cross the
216  // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
217  // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
218  if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
219  outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
220  } else {
221  B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
222  fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
223  }
224  }
225  if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
  // NOTE(review): "cur" is filled from the accumulated output metadata (i.e. the previous
  // files) and "prev" from the file currently being added, so the names look swapped;
  // the message therefore prints the previous description first, then the current one.
226  KeyValuePrinter cur(true);
227  for (const auto& descrPair : outputMetaData->getDataDescription())
228  cur.put(descrPair.first, descrPair.second);
229  KeyValuePrinter prev(true);
230  for (const auto& descrPair : fileMetaData.getDataDescription())
231  prev.put(descrPair.first, descrPair.second);
232 
233  B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
234  }
235  if(fileMetaData.isMC() != outputMetaData->isMC()){
236  B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
237  }
238  // update event numbers ...
239  outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
240  outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
241  }
  // Empty files only trigger a warning and do not contribute to the low/high event range.
242  if(fileMetaData.getNEvents() < 1) {
243  B2WARNING("File " << std::quoted(input) << " is empty.");
244  } else {
245  // make sure we have the correct low/high event numbers
246  EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
247  EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
248  if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
249  if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
250  }
251  // check if we have seen this random seed already in one of the previous files
252  auto it = allSeeds.insert(fileMetaData.getRandomSeed());
253  if(!it.second) {
  // NOTE(review): the message below says "more then one file"; runtime text, left as is.
254  B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
255  }
256  allUsers.insert(fileMetaData.getUser());
257  // remember all parent files we encounter
258  for (int i = 0; i < fileMetaData.getNParents(); ++i) {
259  allParents.insert(fileMetaData.getParent(i));
260  }
261  }catch(std::exception &e) {
262  B2ERROR("input file " << std::quoted(input) << ": " << e.what());
263  }
264  }
265 
266  //Check if the same mergeables were found in all files
267  for(const auto &val: persistentMergeables){
268  if(val.second.second != inputfilenames.size()){
269  B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
270  << inputfilenames.size() << " files");
271  }
272  }
273 
274  // Check for user names
275  if(allUsers.size()>1) {
276  B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
277  }
278 
279  // Stop processing in case of error
  // This is evaluated before the output file is opened, so a failed consistency pass
  // does not touch an existing output file.
280  if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
281 
282  if(!outputMetaData){
283  // technically it's rather impossible to arrive here: if there were no
284  // input files we exit with a usage message and if any of the files could
285  // not be processed then the error count should be >0. Nevertheless
286  // let's do this check to be on the very safe side and to make clang
287  // analyzer happy.
288  B2FATAL("For some reason no files could be processed");
289  return 1;
290  }
  // If no file contained events, fall back to (-1, -1, 0) for both ends of the range.
291  if(!lowEvt) {
292  B2WARNING("All Files were empty");
293  lowEvt = EventInfo{-1, -1, 0};
294  highEvt = EventInfo{-1, -1, 0};
295  }
296 
297  // Final changes to metadata
298  outputMetaData->setLfn("");
299  outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
300  outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
301  outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
302  // If more than one file, set an empty random seed
303  if(inputfilenames.size()>1){
304  outputMetaData->setRandomSeed("");
305  }
306  RootIOUtilities::setCreationData(*outputMetaData);
307 
308  // OK we have a valid FileMetaData and merged all persistent objects, now do
309  // the conversion of the event trees and create the output file.
310  TFile output(outputfilename.c_str(), "RECREATE");
311  if (output.IsZombie()) {
312  B2ERROR("Could not create output file " << std::quoted(outputfilename));
313  return 1;
314  }
315 
  // The output event tree is created as an empty clone of the first input tree and then
  // filled with the entries of every input tree in turn.
316  TTree* outputEventTree{nullptr};
317  for (const auto& input : inputfilenames) {
318  B2INFO("processing events from " << std::quoted(input));
319  TFile tfile(input.c_str());
  // NOTE(review): "tree" is dereferenced below without a nullptr check. Presumably safe
  // because every input file was already opened and inspected in the first pass -- confirm.
320  auto* tree = dynamic_cast<TTree*>(tfile.Get("tree"));
321  if(!outputEventTree){
322  output.cd();
323  outputEventTree = tree->CloneTree(0);
324  }else{
325  outputEventTree->CopyAddresses(tree);
326  }
327  // Now let's copy all entries without unpacking (fast), layout the
328  // baskets in an optimal order for sequential reading (SortBasketsByEntry)
329  // and rebuild the index in case some parts of the index are missing
330  outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
331  // and reset the branch addresses to not be connected anymore
332  outputEventTree->CopyAddresses(tree, true);
333  // finally clean up and close file.
334  delete tree;
335  tfile.Close();
336  }
337  // make sure we have an index ...
338  if(!outputEventTree->GetTreeIndex()) {
339  B2INFO("No Index found: building new index");
340  RootIOUtilities::buildIndex(outputEventTree);
341  }
342  // and finally write the tree
343  output.cd();
344  outputEventTree->Write();
345  B2INFO("Done processing events");
346 
347  // we need to set the LFN to the absolute path name
348  outputMetaData->setLfn(fs::absolute(outputfilename, fs::initial_path()).string());
349  // and maybe register it in the file catalog
350  if(variables.count("add-to-catalog")>0) {
351  FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
352  }
353  B2INFO("Writing FileMetaData");
354  // Create persistent tree
  // One branch for the merged FileMetaData plus one per merged Mergeable, filled with a single entry.
355  output.cd();
356  TTree outputMetaDataTree("persistent", "persistent");
357  outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
358  for(auto &it: persistentMergeables){
359  outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
360  }
361  outputMetaDataTree.Fill();
362  outputMetaDataTree.Write();
363 
364  // now clean up the mess ...
365  for(const auto& val: persistentMergeables){
366  delete val.second.first;
367  }
368  persistentMergeables.clear();
369  delete outputMetaData;
370  output.Close();
371 }
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
Definition: FileCatalog.cc:23
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Definition: FileCatalog.cc:90
Metadata information about a file.
Definition: FileMetaData.h:29
create human-readable or JSON output for key value pairs.
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition: LogConfig.h:30
@ c_Info
Info: for informational messages, e.g.
Definition: LogConfig.h:27
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition: LogConfig.h:31
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition: LogConfig.h:29
@ c_Level
Log level of the message.
Definition: LogConfig.h:36
@ c_Message
Log message text.
Definition: LogConfig.h:37
void setLogInfo(ELogLevel logLevel, unsigned int logInfo)
Configure the printed log information for the given level.
Definition: LogConfig.h:127
LogConfig * getLogConfig()
Returns global log system configuration.
Definition: LogSystem.h:78
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition: LogSystem.cc:31
Abstract base class for objects that can be merged.
Definition: Mergeable.h:31
Helper class to factorize some necessary tasks when working with Belle2 output files.
Definition: RootFileInfo.h:27
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes EventMetaData branch exists there).
Abstract base class for different kinds of events.
Definition: ClusterUtils.h:23
int main(int argc, char **argv)
Run all tests.
Definition: test_main.cc:75