Belle II Software  release-05-01-25
b2file-merge.cc
1 #include <framework/dataobjects/FileMetaData.h>
2 #include <framework/io/RootIOUtilities.h>
3 #include <framework/io/RootFileInfo.h>
4 #include <framework/logging/Logger.h>
5 #include <framework/pcore/Mergeable.h>
6 #include <framework/core/FileCatalog.h>
7 #include <framework/utilities/KeyValuePrinter.h>
8 
9 #include <boost/program_options.hpp>
10 #include <boost/filesystem.hpp>
11 #include <boost/algorithm/string.hpp>
12 
13 #include <TFile.h>
14 #include <TTree.h>
15 #include <TBranchElement.h>
16 
17 #include <iostream>
18 #include <iomanip>
19 #include <string>
20 #include <set>
21 #include <regex>
22 
23 using namespace Belle2;
24 namespace po = boost::program_options;
25 namespace fs = boost::filesystem;
26 
// Event identifier used to track the lowest/highest event over all input
// files: filled below as (experiment, run, event) from the FileMetaData
// low/high getters and compared lexicographically via std::tuple's operators.
29 using EventInfo = std::tuple<int, int, unsigned int>;
30 
namespace {
  /**
   * Remove the legacy globaltag from a comma separated list of globaltags.
   *
   * The list is split at each ',' and every entry which is exactly
   * "Legacy_IP_Information" is dropped, all other entries (including empty
   * ones) are kept in their original order and joined with ',' again. In
   * contrast to a plain substring replacement this
   *  - does not leave a dangling leading comma when the legacy tag is the
   *    first entry of the list, and
   *  - does not cut pieces out of longer tag names which merely start with or
   *    contain the legacy tag name.
   *
   * @param globaltags comma separated list of globaltag names
   * @return the same list without any "Legacy_IP_Information" entries
   */
  std::string removeLegacyGt(const std::string& globaltags)
  {
    static const std::string legacyTag{"Legacy_IP_Information"};
    std::string cleaned;
    cleaned.reserve(globaltags.size());
    bool firstKept{true};
    std::size_t begin{0};
    while (true) {
      const std::size_t end = globaltags.find(',', begin);
      // length of the current entry; npos means "everything up to the end"
      const std::size_t length = (end == std::string::npos) ? std::string::npos : end - begin;
      if (globaltags.compare(begin, length, legacyTag) != 0) {
        if (!firstKept) cleaned += ',';
        cleaned.append(globaltags, begin, length);
        firstKept = false;
      }
      if (end == std::string::npos) break;
      begin = end + 1;
    }
    return cleaned;
  }
}
40 
41 int main(int argc, char* argv[])
42 {
43  // Parse options
44  std::string outputfilename;
45  std::vector<std::string> inputfilenames;
46  po::options_description options("Options");
47  options.add_options()
48  ("help,h", "print all available options")
49  ("output,o", po::value<std::string>(&outputfilename), "output file name")
50  ("file", po::value<std::vector<std::string>>(&inputfilenames), "filename to merge")
51  ("force,f", "overwrite existing file")
52  ("no-catalog", "don't register output file in file catalog, This is now the default")
53  ("add-to-catalog", "register the output file in the file catalog")
54  ("quiet,q", "if given don't print infos, just warnings and errors");
55  po::positional_options_description positional;
56  positional.add("output", 1);
57  positional.add("file", -1);
58  po::variables_map variables;
59  po::store(po::command_line_parser(argc, argv).options(options).positional(positional).run(), variables);
60  po::notify(variables);
61  if (variables.count("help") || variables.count("output") == 0 || inputfilenames.empty()) {
62  std::cout << "Usage: " << argv[0] << " [<options>] OUTPUTFILE INPUTFILE [INPUTFILE...]" << std::endl;
63  std::cout << " " << argv[0] << " [<options>] [--file INPUTFILE...] "
64  << "-o OUTPUTFILE [--file INPUTFILE...]" << std::endl << std::endl;
65  std::cout << options << std::endl;
66  std::cout << (R"DOC(
67 This program is intended to merge files created by separate basf2 jobs. It's
68 similar to hadd but does correctly update the metadata in the file and merges
69 the objects in the persistent tree correctly.
70 
71 The following restrictions apply:
72  - The files have to be created with the same release and steering file
73  - The persistent tree is only allowed to contain FileMetaData and objects
74  inheriting from Mergeable and the same list of objects needs to be present
75  in all files.
76  - The event tree needs to contain the same DataStore entries in all files.
77 )DOC");
78  return 1;
79  }
80 
81  // Remove the {module:} from log messages
82  auto logConfig = LogSystem::Instance().getLogConfig();
85  }
86  if(variables.count("quiet")>0){
87  logConfig->setLogLevel(LogConfig::c_Warning);
88  }
89 
90  B2INFO("Merging files into " << std::quoted(outputfilename));
91  // check output file
92  if (fs::exists(outputfilename) && variables.count("force")==0) {
93  B2ERROR("Output file exists, use -f to force overwriting it");
94  return 1;
95  }
96  // First we check all input files for consistency ...
97 
98  // the final metadata we will write out
99  FileMetaData* outputMetaData{nullptr};
100  // set of all parent LFNs encountered in any file
101  std::set<std::string> allParents;
102  // map of all mergeable objects found in the persistent tree. The size_t is
103  // for counting to make sure we see all objects in all files
104  std::map<std::string, std::pair<Mergeable*, size_t>> persistentMergeables;
105  // set of all random seeds to print warning on duplicates
106  std::set<std::string> allSeeds;
107  // set of all users
108  std::set<std::string> allUsers;
109  // EventInfo for the high/low event numbers of the final FileMetaData
110  std::optional<EventInfo> lowEvt, highEvt;
111  // set of all branch names in the event tree to compare against to make sure
112  // that they're the same in all files
113  std::set<std::string> allEventBranches;
114  // Release version to compare against. Same as FileMetaData::getRelease() but with the optional -modified removed
115  std::string outputRelease;
116 
117  // so let's loop over all files and create FileMetaData and merge persistent
118  // objects if they inherit from Mergeable, bail if there's something else in
119  // there. The idea is that merging the persistent stuff is fast so we catch
120  // errors more quickly when we do this as a first step and events later on.
121  for (const auto& input : inputfilenames) {
122  try {
123  RootIOUtilities::RootFileInfo fileInfo(input);
124  // Ok, load the FileMetaData from the tree
125  const auto &fileMetaData = fileInfo.getFileMetaData();
126  // File looks usable, start checking metadata ...
127  B2INFO("adding file " << std::quoted(input));
128  if(LogSystem::Instance().isLevelEnabled(LogConfig::c_Info)) fileMetaData.Print("all");
129 
130  auto branches = fileInfo.getBranchNames();
131  if(branches.empty()) {
132  throw std::runtime_error("Could not find any branches in event tree");
133  }
134  if(allEventBranches.empty()) {
135  std::swap(allEventBranches,branches);
136  }else{
137  if(branches!=allEventBranches){
138  B2ERROR("Branches in " << std::quoted(input) << " differ from "
139  << std::quoted(inputfilenames.front()));
140  }
141  }
142 
143  // File looks good so far, now fix the persistent stuff, i.e. merge all
144  // objects in persistent tree
145  for(TObject* brObj: *fileInfo.getPersistentTree().GetListOfBranches()){
146  auto* br = dynamic_cast<TBranchElement*>(brObj);
147  // FileMetaData is handled separately
148  if(br && br->GetTargetClass() == FileMetaData::Class() && std::string(br->GetName()) == "FileMetaData")
149  continue;
150  // Make sure the branch is mergeable
151  if(!br || !br->GetTargetClass()->InheritsFrom(Mergeable::Class())){
152  B2ERROR("Branch " << std::quoted(br->GetName()) << " in persistent tree not inheriting from Mergable");
153  continue;
154  }
155  // Ok, it's an object we now how to handle so get it from the tree
156  Mergeable* object{nullptr};
157  br->SetAddress(&object);
158  if(br->GetEntry(0)<=0) {
159  B2ERROR("Could not read branch " << std::quoted(br->GetName()) << " of entry 0 from persistent tree in "
160  << std::quoted(input));
161  continue;
162  }
163  // and either insert it into the map of mergeables or merge with the existing one
164  auto it = persistentMergeables.insert(std::make_pair(br->GetName(), std::make_pair(object, 1)));
165  if(!it.second) {
166  try {
167  it.first->second.first->merge(object);
168  }catch(std::exception &e){
169  B2FATAL("Cannot merge " << std::quoted(br->GetName()) << " in " << std::quoted(input) << ": " << e.what());
170  }
171  it.first->second.second++;
172  // ok, merged, get rid of it.
173  delete object;
174  }else{
175  B2INFO("Found mergeable object " << std::quoted(br->GetName()) << " in persistent tree");
176  }
177  }
178 
179  std::string release = fileMetaData.getRelease();
180  if(release == "") {
181  B2ERROR("Cannot determine release used to create " << std::quoted(input));
182  continue;
183  }else if(boost::algorithm::ends_with(fileMetaData.getRelease(), "-modified")){
184  B2WARNING("File " << std::quoted(input) << " created with modified software "
185  << fileMetaData.getRelease()
186  << ": cannot verify that files are compatible");
187  release = release.substr(0, release.size() - std::string("-modified").size());
188  }
189 
190  // so, event tree looks good too. Now we merge the FileMetaData
191  if (!outputMetaData) {
192  // first input file, just take the event metadata
193  outputMetaData = new FileMetaData(fileMetaData);
194  outputRelease = release;
195  } else {
196  // check meta data for consistency, we could move this into FileMetaData...
197  if(release != outputRelease) {
198  B2ERROR("Release in " << std::quoted(input) << " differs from previous files: " <<
199  fileMetaData.getRelease() << " != " << outputMetaData->getRelease());
200  }
201  if(fileMetaData.getSteering() != outputMetaData->getSteering()){
202  // printing both steering files is not useful for anyone so just throw an error
203  B2ERROR("Steering file for " << std::quoted(input) << " differs from previous files.");
204  }
205  if(fileMetaData.getDatabaseGlobalTag() != outputMetaData->getDatabaseGlobalTag()){
206  // Related to BII-6093: we were adding the legacy gt only dependent on input file age, not creation release.
207  // This means there is a chance we want to merge files with and without the globaltag added if they cross the
208  // boundary. It doesn't hurt to keep the gt but we know we could process some of the files without it so as a remedy we
209  // check if the only difference is the legacy gt and if so we remove it from the output metadata ...
210  if(removeLegacyGt(fileMetaData.getDatabaseGlobalTag()) == removeLegacyGt(outputMetaData->getDatabaseGlobalTag())) {
211  outputMetaData->setDatabaseGlobalTag(removeLegacyGt(outputMetaData->getDatabaseGlobalTag()));
212  } else {
213  B2ERROR("Database globalTag in " << std::quoted(input) << " differs from previous files: " <<
214  fileMetaData.getDatabaseGlobalTag() << " != " << outputMetaData->getDatabaseGlobalTag());
215  }
216  }
217  if(fileMetaData.getDataDescription() != outputMetaData->getDataDescription()){
218  KeyValuePrinter cur(true);
219  for (const auto& descrPair : outputMetaData->getDataDescription())
220  cur.put(descrPair.first, descrPair.second);
221  KeyValuePrinter prev(true);
222  for (const auto& descrPair : fileMetaData.getDataDescription())
223  prev.put(descrPair.first, descrPair.second);
224 
225  B2ERROR("dataDescription in " << std::quoted(input) << " differs from previous files:\n" << cur.string() << " vs.\n" << prev.string());
226  }
227  if(fileMetaData.isMC() != outputMetaData->isMC()){
228  B2ERROR("Type (real/MC) for " << std::quoted(input) << " differs from previous files.");
229  }
230  // update event numbers ...
231  outputMetaData->setMcEvents(outputMetaData->getMcEvents() + fileMetaData.getMcEvents());
232  outputMetaData->setNEvents(outputMetaData->getNEvents() + fileMetaData.getNEvents());
233  }
234  if(fileMetaData.getNEvents() < 1) {
235  B2WARNING("File " << std::quoted(input) << " is empty.");
236  } else {
237  // make sure we have the correct low/high event numbers
238  EventInfo curLowEvt = EventInfo{fileMetaData.getExperimentLow(), fileMetaData.getRunLow(), fileMetaData.getEventLow()};
239  EventInfo curHighEvt = EventInfo{fileMetaData.getExperimentHigh(), fileMetaData.getRunHigh(), fileMetaData.getEventHigh()};
240  if(!lowEvt or curLowEvt < *lowEvt) lowEvt = curLowEvt;
241  if(!highEvt or curHighEvt > *highEvt) highEvt = curHighEvt;
242  }
243  // check if we have seen this random seed already in one of the previous files
244  auto it = allSeeds.insert(fileMetaData.getRandomSeed());
245  if(!it.second) {
246  B2WARNING("Duplicate Random Seed: " << std::quoted(fileMetaData.getRandomSeed()) << " present in more then one file");
247  }
248  allUsers.insert(fileMetaData.getUser());
249  // remember all parent files we encounter
250  for (int i = 0; i < fileMetaData.getNParents(); ++i) {
251  allParents.insert(fileMetaData.getParent(i));
252  }
253  }catch(std::exception &e) {
254  B2ERROR("input file " << std::quoted(input) << ": " << e.what());
255  }
256  }
257 
258  //Check if the same mergeables were found in all files
259  for(const auto &val: persistentMergeables){
260  if(val.second.second != inputfilenames.size()){
261  B2ERROR("Mergeable " << std::quoted(val.first) << " only present in " << val.second.second << " out of "
262  << inputfilenames.size() << " files");
263  }
264  }
265 
266  // Check for user names
267  if(allUsers.size()>1) {
268  B2WARNING("Multiple different users created input files: " << boost::algorithm::join(allUsers, ", "));
269  }
270 
271  // Stop processing in case of error
272  if (LogSystem::Instance().getMessageCounter(LogConfig::c_Error) > 0) return 1;
273 
274  if(!outputMetaData){
275  // technically it's rather impossible to arrive here: if there were no
276  // input files we exit with a usage message and if any of the files could
277  // not be processed then the error count should be >0. Nevertheless
278  // let's do this check to be on the very safe side and to make clang
279  // analyzer happy.
280  B2FATAL("For some reason no files could be processed");
281  return 1;
282  }
283  if(!lowEvt) {
284  B2WARNING("All Files were empty");
285  lowEvt = EventInfo{-1, -1, 0};
286  highEvt = EventInfo{-1, -1, 0};
287  }
288 
289  // Final changes to metadata
290  outputMetaData->setLfn("");
291  outputMetaData->setParents(std::vector<std::string>(allParents.begin(), allParents.end()));
292  outputMetaData->setLow(std::get<0>(*lowEvt), std::get<1>(*lowEvt), std::get<2>(*lowEvt));
293  outputMetaData->setHigh(std::get<0>(*highEvt), std::get<1>(*highEvt), std::get<2>(*highEvt));
294  // If more then one file set an empty random seed
295  if(inputfilenames.size()>1){
296  outputMetaData->setRandomSeed("");
297  }
298  RootIOUtilities::setCreationData(*outputMetaData);
299 
300  // OK we have a valid FileMetaData and merged all persistent objects, now do
301  // the conversion of the event trees and create the output file.
302  TFile output(outputfilename.c_str(), "RECREATE");
303  if (output.IsZombie()) {
304  B2ERROR("Could not create output file " << std::quoted(outputfilename));
305  return 1;
306  }
307 
308  TTree* outputEventTree{nullptr};
309  for (const auto& input : inputfilenames) {
310  B2INFO("processing events from " << std::quoted(input));
311  TFile tfile(input.c_str());
312  auto* tree = dynamic_cast<TTree*>(tfile.Get("tree"));
313  if(!outputEventTree){
314  output.cd();
315  outputEventTree = tree->CloneTree(0);
316  }else{
317  outputEventTree->CopyAddresses(tree);
318  }
319  // Now let's copy all entries without unpacking (fast), layout the
320  // baskets in an optimal order for sequential reading (SortBasketByEntry)
321  // and rebuild the index in case some parts of the index are missing
322  outputEventTree->CopyEntries(tree, -1, "fast SortBasketsByEntry BuildIndexOnError");
323  // and reset the branch addresses to not be connected anymore
324  outputEventTree->CopyAddresses(tree, true);
325  // finally clean up and close file.
326  delete tree;
327  tfile.Close();
328  }
329  // make sure we have an index ...
330  if(!outputEventTree->GetTreeIndex()) {
331  B2INFO("No Index found: building new index");
332  RootIOUtilities::buildIndex(outputEventTree);
333  }
334  // and finally write the tree
335  output.cd();
336  outputEventTree->Write();
337  B2INFO("Done processing events");
338 
339  // we need to set the LFN to the absolute path name
340  outputMetaData->setLfn(fs::absolute(outputfilename, fs::initial_path()).string());
341  // and maybe register it in the file catalog
342  if(variables.count("add-to-catalog")>0) {
343  FileCatalog::Instance().registerFile(outputfilename, *outputMetaData);
344  }
345  B2INFO("Writing FileMetaData");
346  // Create persistent tree
347  output.cd();
348  TTree outputMetaDataTree("persistent", "persistent");
349  outputMetaDataTree.Branch("FileMetaData", &outputMetaData);
350  for(auto &it: persistentMergeables){
351  outputMetaDataTree.Branch(it.first.c_str(), &it.second.first);
352  }
353  outputMetaDataTree.Fill();
354  outputMetaDataTree.Write();
355 
356  // now clean up the mess ...
357  for(const auto& val: persistentMergeables){
358  delete val.second.first;
359  }
360  persistentMergeables.clear();
361  delete outputMetaData;
362  output.Close();
363 }
Belle2::KeyValuePrinter
create human-readable or JSON output for key value pairs.
Definition: KeyValuePrinter.h:56
prepareAsicCrosstalkSimDB.e
e
aux.
Definition: prepareAsicCrosstalkSimDB.py:53
Belle2::LogConfig::c_Fatal
@ c_Fatal
Fatal: for situations where the program execution cannot be continued.
Definition: LogConfig.h:41
Belle2::FileCatalog::Instance
static FileCatalog & Instance()
Static method to get a reference to the FileCatalog instance.
Definition: FileCatalog.cc:25
Belle2::LogConfig::c_Info
@ c_Info
Info: for informational messages, e.g.
Definition: LogConfig.h:37
Belle2::FileMetaData
Metadata information about a file.
Definition: FileMetaData.h:39
Belle2::RootIOUtilities::buildIndex
void buildIndex(TTree *tree)
Build TTreeIndex on tree (assumes EventMetaData branch exists there).
Definition: RootIOUtilities.cc:145
Belle2::RootIOUtilities::RootFileInfo
Helper class to factorize some necessary tasks when working with Belle2 output files.
Definition: RootFileInfo.h:28
Belle2::LogSystem::getLogConfig
LogConfig * getLogConfig()
Returns global log system configuration.
Definition: LogSystem.h:88
main
int main(int argc, char **argv)
Run all tests.
Definition: test_main.cc:77
Belle2
Abstract base class for different kinds of events.
Definition: MillepedeAlgorithm.h:19
Belle2::LogConfig::c_Level
@ c_Level
Log level of the message.
Definition: LogConfig.h:46
Belle2::RootIOUtilities::setCreationData
void setCreationData(FileMetaData &metadata)
Fill the creation info of a file meta data: site, user, data.
Definition: RootIOUtilities.cc:177
Belle2::LogConfig::c_Error
@ c_Error
Error: for things that went wrong and have to be fixed.
Definition: LogConfig.h:40
Belle2::LogSystem::Instance
static LogSystem & Instance()
Static method to get a reference to the LogSystem instance.
Definition: LogSystem.cc:33
Belle2::FileCatalog::registerFile
virtual bool registerFile(const std::string &fileName, FileMetaData &metaData, const std::string &oldLFN="")
Register a file in the (local) file catalog.
Definition: FileCatalog.cc:92
Belle2::LogConfig::setLogInfo
void setLogInfo(ELogLevel logLevel, unsigned int logInfo)
Configure the printed log information for the given level.
Definition: LogConfig.h:123
Belle2::LogConfig::c_Message
@ c_Message
Log message text.
Definition: LogConfig.h:47
Belle2::LogConfig::c_Warning
@ c_Warning
Warning: for potential problems that the user should pay attention to.
Definition: LogConfig.h:39
Belle2::Mergeable
Abstract base class for objects that can be merged.
Definition: Mergeable.h:33