Belle II Software  release-05-01-25
core.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # @cond SUPPRESS_DOXYGEN
5 
6 # Thomas Keck 2017
7 
8 """
9  The Full Event Interpretation Algorithm
10 
11  Some basic facts:
12  - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
13  - It can be used for hadronic and semileptonic tagging.
14  - The algorithm has to be trained on MC, and can afterwards be applied on data.
15  - The training requires O(100) million MC events
 - The weight files are stored in the Belle II Conditions database
17 
18  Read this file if you want to understand the technical details of the FEI.
19 
20  The FEI follows a hierarchical approach.
21  There are 7 stages:
22  (Stage -1: Write out information about the provided data sample)
23  Stage 0: Final State Particles (FSP)
24  Stage 1: pi0, J/Psi, Lambda0
25  Stage 2: K_S0, Sigma+
26  Stage 3: D and Lambda_c mesons
27  Stage 4: D* mesons
28  Stage 5: B mesons
29  Stage 6: Finish
30 
 Most stages consist of:
32  - Create Particle Candidates
33  - Apply Cuts
34  - Do vertex Fitting
35  - Apply a multivariate classification method
36  - Apply more Cuts
37 
38  The FEI will reconstruct these 7 stages during the training phase,
39  since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
40  to train all the necessary multivariate classifiers.
41 """
42 
43 # FEI defines own command line options, therefore we disable
44 # the ROOT command line options, which otherwise interfere sometimes.
45 from ROOT import PyConfig
46 PyConfig.IgnoreCommandLineOptions = True # noqa
47 
48 # FEI uses multi-threading for parallel execution of tasks therefore
49 # the ROOT gui-thread is disabled, which otherwise interferes sometimes
50 PyConfig.StartGuiThread = False # noqa
51 import ROOT
52 from ROOT import Belle2
53 
54 # Import basf2
55 import basf2
56 from basf2 import B2INFO, B2WARNING
57 import pybasf2
58 import modularAnalysis as ma
59 import b2bii
60 
61 import basf2_mva
62 
63 # Should come after basf2 import
64 import pdg
65 
66 from fei import config
67 
68 # Standard python modules
69 import collections
70 import argparse
71 import os
72 import shutil
73 import typing
74 import pickle
75 import re
76 import functools
77 import subprocess
78 import multiprocessing
79 import pickle
80 
# Simple immutable record describing the output of the FEI (returned by get_path):
#   path   - the pybasf2.Path to be processed by the user
#   stage  - the reconstruction stage this path corresponds to (presumably the
#            cache level during training -- confirm against get_path)
#   plists - names of the particle lists produced by this path
FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
83 
84 
class TrainingDataInformation(object):
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers we can use to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingDataInformation object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: ROOT file the MC particle counts are written to / read from
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the MC counts file was already produced by a previous run.
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        One histogram per unique absolute pdg-code is written into self.filename.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct.
        @return dict mapping abs(pdgcode) -> {'sum', 'std', 'avg', 'max', 'min'};
                the special key 0 maps to {'sum': total number of events}
        @raise RuntimeError if the file contains no histograms
        """
        root_file = ROOT.TFile(self.filename)
        mc_counts = {}
        hist = None

        for key in root_file.GetListOfKeys():
            variable = Belle2.invertMakeROOTCompatible(key.GetName())
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            # Histogram stores "events with N particles" -> weighted sum gives total particle count
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        if hist is None:
            # Previously this produced a confusing NameError on 'hist'
            root_file.Close()
            raise RuntimeError(f"No MC counts found in {self.filename}, cannot determine training fractions.")

        # Every histogram was filled exactly once per event, so the entry count
        # of the last histogram equals the total number of processed events.
        mc_counts[0] = {'sum': hist.GetEntries()}
        # Extract all values before closing; after Close() the histograms are invalid
        root_file.Close()
        return mc_counts
153 
154 
class FSPLoader(object):
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything required before loading FSPs,
    the user has to add this himself (because it depends on the MC campaign and if you want
    to use Belle 1 or Belle 2).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        if b2bii.isB2BII():
            # Converted Belle data: charged FSPs are filled from tracks,
            # neutrals and V0s are copied from the converted mdst lists.
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            converted_lists = [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                               ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                               ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]
            for target_list, source_list in converted_lists:
                ma.copyParticles(target_list, source_list, writeOut=True, path=path)
        else:
            # Belle II data: everything is filled directly; V0s are built from their daughters.
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            # Histogram the MC multiplicity of each FSP species for the monitoring pdfs
            monitored_names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            monitored_pdgs = {abs(pdg.from_name(name)) for name in monitored_names}
            histogram_specs = [(f'NumberOfMCParticlesInEvent({code})', 100, -0.5, 99.5) for code in monitored_pdgs]
            ma.variablesToHistogram('', variables=histogram_specs, filename='Monitor_FSPLoader.root', path=path)
        return path
202 
203 
class TrainingData(object):
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles (as produced by
               TrainingDataInformation.get_mc_counts: abs(pdgcode) -> statistics dict)
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config
        #: containing number of MC Particles
        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles.
        For every channel a VariablesToNtuple module writes one ROOT file
        ('<channel.label>.root'), with candidates sampled down so that signal and
        background each stay below Teacher.MaximumNumberOfMVASamples.
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            # Expected number of signal candidates, scaled by a rough reconstruction
            # efficiency estimate per particle species.
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D-Mesons we usually have an efficiency of 10^-3 including branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B-Mesons we usually have an efficiency of 10^-4 including branching fraction
            # NOTE(review): for pdgcode > 500 BOTH divisions apply (total factor 10^7),
            # since the previous branch is not an elif -- looks intentional, confirm.
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                # Output file of the VariablesToNtuple module below
                filename = f'{channel.label}.root'

                # Background estimate: every event contributes up to bestCandidateCut candidates
                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                # Maps class label (0 = background, 1 = signal) to an inverse sampling rate;
                # empty dict means no sampling.
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1

                # Variables written out in addition to the training features:
                # the target, and optionally the sPlot discriminating variable.
                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
                    hist_filename = f'Monitor_TrainingData_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename), path=path)

                # The actual training-data writer (named "teacher" for historical reasons)
                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', 'variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        return path
278 
279 
class PreReconstruction(object):
    """
    Steers the reconstruction phase before the mva method was applied
    It Includes:
    - The ParticleCombination (for each particle and channel we create candidates using
      the daughter candidates from the previous stages)
    - MC Matching
    - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
      but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
      this will speed up the whole FEI by a factor 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def _add_kfit_vertex_fit(self, path: pybasf2.Path, channel) -> None:
        """
        Append a KFit vertex-fit module for the candidates of the given channel.
        Extracted helper: the identical module setup was previously duplicated
        for the B2BII V0 case and the generic multi-daughter case.
        @param path pybasf2.Path the fitter module is appended to
        @param channel channel configuration whose particle list is fitted
        """
        pvfit = basf2.register_module('ParticleVertexFitter')
        pvfit.set_name('ParticleVertexFitter_' + channel.name)
        pvfit.param('listName', channel.name)
        pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
        pvfit.param('vertexFitter', 'KFit')
        pvfit.param('fitType', 'vertex')
        pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
        path.add_module(pvfit)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    # A single-daughter "channel" is just a cut-and-copy of the daughter
                    # list; the decayModeID is attached as extra info by hand.
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    ma.variablesToExtraInfo(channel.name, {f'constant({channel.decayModeID})': 'decayModeID'}, path=path)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)

                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = f'Monitor_PreReconstruction_BeforeRanking_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)

                # Best-candidate ranking: keep only the bestCandidateCut best candidates
                # according to the configured variable.
                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = f'Monitor_PreReconstruction_AfterRanking_{channel.label}.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)
                # If we are not in monitor mode we do the mc matching now,
                # otherwise we did it above already!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    self._add_kfit_vertex_fit(path, channel)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    self._add_kfit_vertex_fit(path, channel)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = f'Monitor_PreReconstruction_AfterVertex_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)

        return path
394 
395 
class PostReconstruction(object):
    """
    Steers the reconstruction phase after the mva method was applied
    It Includes:
    - The application of the mva method itself.
    - Copying all channel lists in a common one for each particle defined in particles
    - Tag unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        (i.e. no channel is missing its weight file).
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the MVA per channel, merges the channel
        lists per particle, applies the post-cut and best-candidate ranking.
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                # Apply the trained classifier: stores its output as extraInfo(SignalProbability)
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    # During training the weight file is read from the local working directory
                    expert.param('identifier', channel.label + '.xml')
                else:
                    # During application the weight file is fetched from the conditions database
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                path.add_module(expert)

                # Flag one candidate per true signal to avoid double counting
                # overlapping channels (extraInfo(uniqueSignal))
                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = f'Monitor_PostReconstruction_AfterMVA_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)

            # Post-cut: discard candidates below the configured signal-probability threshold
            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            # Merge all channel lists into one list per particle
            ma.copyLists(particle.identifier, [c.name for c in particle.channels], writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = f'Monitor_PostReconstruction_BeforePostCut_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = f'Monitor_PostReconstruction_BeforeRanking_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

            # Keep only the bestCandidateCut most signal-like candidates per event
            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = f'Monitor_PostReconstruction_AfterRanking_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

                # NOTE(review): indentation of the original paste is mangled; the
                # final ntuple is placed inside the monitoring block, consistent
                # with its 'Monitor_' filename -- confirm against upstream.
                if 'B' in particle.identifier:
                    variables = ['extraInfo(SignalProbability)', 'Mbc', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                                 'cosThetaBetweenParticleAndNominalB', 'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']
                else:
                    variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                                 'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                filename = f'Monitor_Final_{particle.identifier}.root'
                ma.variablesToNtuple(particle.identifier, variables, treename='variables',
                                     filename=config.removeJPsiSlash(filename), path=path)

        return path
530 
531 
class Teacher(object):
    """
    Performs all necessary trainings for all training data files which are
    available but where there is no weight file available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using multi-threading of python.
    Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
      * basf2_mva_teacher, the training will be done directly on the machine
      * externClustTeacher, the training will be submitted to the batch system of KEKCC
    """

    #: Maximum number of events per class used in a single training;
    #: larger samples are thinned out via the sampling in TrainingData.
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of events per class required for a meaningful training;
    #: channels below this threshold get a fake (always 0) weight file.
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
            <?xml version="1.0" encoding="utf-8"?>
            <method>Trivial</method>
            <weightfile>{channel}.xml</weightfile>
            <treename>tree</treename>
            <target_variable>isSignal</target_variable>
            <weight_variable>__weight__</weight_variable>
            <signal_class>1</signal_class>
            <max_events>0</max_events>
            <number_feature_variables>1</number_feature_variables>
            <variable0>M</variable0>
            <number_spectator_variables>0</number_spectator_variables>
            <number_data_files>1</number_data_files>
            <datafile0>train.root</datafile0>
            <Trivial_version>1</Trivial_version>
            <Trivial_output>0</Trivial_output>
            <signal_fraction>0.066082567</signal_fraction>
            """
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake-weight file or not
        @param filename the filename of the weight file
        @return True if the file is fake (or unreadable), False otherwise
        """
        try:
            # The fake weight file written by create_fake_weightfile carries the
            # Trivial method tag on its third line.
            with open(filename, 'r') as weightfile:
                return '<method>Trivial</method>' in weightfile.readlines()[2]
        except BaseException:
            # Unreadable / too short files are treated as fake
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        @return tuple (weight file on disk, identifier in database)
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        @return list of (disk, dbase) tuples for every channel which was processed
        """
        job_list = []
        for particle in self.particles:
            for channel in particle.channels:
                filename = f'{channel.label}.root'
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile) and os.path.isfile(filename):
                    f = ROOT.TFile(filename)
                    try:
                        if not f:
                            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                                      f"Ignoring channel {channel.label}.")
                            self.create_fake_weightfile(channel.label)
                            self.upload(channel.label)
                            continue
                        keys = [m for m in f.GetListOfKeys()]
                        if not keys:
                            B2WARNING("Training of MVC failed. ROOT file does not contain a tree. "
                                      f"Ignoring channel {channel.label}.")
                            self.create_fake_weightfile(channel.label)
                            self.upload(channel.label)
                            continue
                        tree = keys[0].ReadObj()
                        nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                        nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                    finally:
                        # Close the file on every path so we do not leak one open
                        # file handle per channel (runs also before each 'continue').
                        f.Close()
                    if nSig < Teacher.MinimumNumberOfMVASamples:
                        B2WARNING("Training of MVC failed. "
                                  f"Tree contains too few signal events {nSig}. Ignoring channel {channel.label}.")
                        self.create_fake_weightfile(channel.label)
                        self.upload(channel.label)
                        continue
                    if nBg < Teacher.MinimumNumberOfMVASamples:
                        B2WARNING("Training of MVC failed. "
                                  f"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel.label}.")
                        self.create_fake_weightfile(channel.label)
                        self.upload(channel.label)
                        continue
                    variable_str = "' '".join(channel.mvaConfig.variables)

                    # Shell command invoking the external teacher; the command is
                    # constructed from local configuration only (no untrusted input).
                    command = (f"{self.config.externTeacher}"
                               f" --method '{channel.mvaConfig.method}'"
                               f" --target_variable '{channel.mvaConfig.target}'"
                               f" --treename variables --datafile '{channel.label}.root'"
                               f" --signal_class 1 --variables '{variable_str}'"
                               f" --identifier '{channel.label}.xml'"
                               f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                    B2INFO(f"Used following command to invoke teacher: \n {command}")
                    job_list.append((channel.label, command))

        # Run all trainings in parallel; maxtasksperchild=1 gives every training
        # a fresh worker process, so its memory is released afterwards.
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()

        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles
674 
675 
def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
    using the Summary.pickle file outputted by the FEIv3. This file must be passed via the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object with configuration.legacy set
    """
    # NOTE: pickle.load must only be used on trusted files (the user's own FEIv3 output)
    with open(configuration.legacy, 'rb') as legacy_file:
        summary = pickle.load(legacy_file)
    # Maps channel label -> hashed list name used by the old training
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            # FEIv3 kept the slash in J/psi list names; use .get() so a missing
            # channel yields None instead of a KeyError (previously the
            # 'old_weightfile is None' branch below was unreachable).
            old_list = channel2lists.get(channel.label.replace('Jpsi', 'J/psi'))
            old_weightfile = None if old_list is None else configuration.prefix + '_' + old_list
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    # No usable legacy training: register a fake weight file instead
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    # Download under the old name and re-upload under the new one
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)
700 
701 
def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    # Particle names handled at each reconstruction stage, in hierarchical order;
    # the trailing empty stage marks the end of the training.
    stage_names = [
        ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0'],
        ['pi0', 'J/psi', 'Lambda0'],
        ['K_S0', 'Sigma+'],
        ['D+', 'D0', 'D_s+', 'Lambda_c+'],
        ['D*+', 'D*0', 'D_s*+'],
        ['B0', 'B+'],
        [],
    ]
    stages = [[particle for particle in particles if particle.name in names] for names in stage_names]

    # Every provided particle must belong to exactly one of the stages above.
    known_names = {name for names in stage_names for name in names}
    for particle in particles:
        if particle.name not in known_names:
            raise RuntimeError(f"Unknown particle {particle.name}: Not implemented in FEI")

    return stages
724 
725 
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data,
    this function must be either called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuple with weight file on disk and identifier in database for all trained classifiers
    """
    # Delegate everything to the Teacher, which scans for training data files itself.
    return Teacher(particles, configuration).do_all_trainings()
737 
738 
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    # Clone the configuration with the updated cache level, so a resumed run
    # knows which reconstruction steps were already performed.
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files (rotate up to 10 generations, newest is _0)
    for i in range(8, -1, -1):
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    # Use a context manager so the file handle is flushed and closed deterministically
    # (previously the handle from open(...) was never closed).
    with open('Summary.pickle', 'wb') as summary_file:
        pickle.dump((particles, configuration), summary_file)
757 
758 
def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
    i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see documentation for the Belle2 Condition Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True),
    to write out ROOT Histograms of many quantities for each stage,
    using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return FeiState with the constructed basf2 path, the next stage number, and the ParticleList names used so far
    """
    print(r"""
    ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
    |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
    | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache parameter of the configuration object is used during training to keep track,
    # which reconstruction steps are already performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            # A Summary.pickle from a previous run overrides BOTH arguments:
            # the stored particles and configuration (with its cache stage) win.
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and go to 7 one stage at a time
    # For application we can run stage 0 to 7 at once
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes), to the new naming scheme.
    # Afterwards the algorithm runs as usual
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only every e.g. 10th candidate
    # That's why we have to write out the TrainingDataInformation before doing anything during the training phase.
    # During application we only need this if we run in monitor mode, and want to write out a summary in the end,
    # the summary contains efficiency, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        # Preparatory counting run (the "Stage -1" of the hierarchy described in the
        # module docstring): only count events/MCParticles, then stop at stage 0.
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        # Return early: nothing else can be built until the counts exist.
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method was applied):
    #   - Particle combination
    #   - Do vertex fitting
    #   - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method was applied):
    #   - Apply the mva method
    #   - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function which is called
    # either by the user after each step, or by the distributed.py script
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going, as soon as the user will call process on the produced path he will get an error message that the
    # weight files are missing.
    #
    # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
    # intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        # Skip stages that a previous run (tracked via the cache level) already built.
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            # Stop here: later stages depend on classifiers that are not trained yet;
            # do_trainings / distributed.py will train them before the next pass.
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics,
    # these statistics contain the runtime for each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but of only small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above the FEI keeps track of the stages which are already reconstructed during the training
    # so we write out the Summary.pickle here, and increase the stage by one.
    # NOTE: 'stage' here is the for-loop variable leaking out of the loop above, i.e. the
    # last stage that was processed (or where the loop broke to create training data).
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)
912 
913 # @endcond
b2bii.isB2BII
def isB2BII()
Definition: b2bii.py:6
pdg.from_name
def from_name(name)
Definition: pdg.py:50
Belle2::invertMakeROOTCompatible
std::string invertMakeROOTCompatible(std::string str)
Invert makeROOTCompatible operation.
Definition: MakeROOTCompatible.cc:89
dft.TfData.TfDataBasf2Stub.__init__
def __init__(self, batch_size, feature_number, event_number, train_fraction)
Definition: TfData.py:115
Belle2::Variable::Manager
Global list of available variables.
Definition: Manager.h:108