Belle II Software release-08-01-10
core.py
#!/usr/bin/env python3

# @cond SUPPRESS_DOXYGEN

"""
 The Full Event Interpretation Algorithm

 Some basic facts:
 - The algorithm automatically reconstructs B mesons and calculates a signal probability for each candidate.
 - It can be used for hadronic and semileptonic tagging.
 - The algorithm has to be trained on MC and can afterwards be applied on data.
 - The training requires O(100) million MC events.
 - The weight files are stored in the Belle II Conditions Database.

 Read this file if you want to understand the technical details of the FEI.

 The FEI follows a hierarchical approach.
 There are 7 stages:
 (Stage -1: Write out information about the provided data sample)
 Stage 0: Final State Particles (FSP)
 Stage 1: pi0, J/Psi, Lambda0
 Stage 2: K_S0, Sigma+
 Stage 3: D and Lambda_c mesons
 Stage 4: D* mesons
 Stage 5: B mesons
 Stage 6: Finish

 Most stages consist of:
 - Creating particle candidates
 - Applying cuts
 - Doing vertex fitting
 - Applying a multivariate classification method
 - Applying more cuts

 The FEI reconstructs these 7 stages during the training phase.
 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
 to train all the necessary multivariate classifiers.
"""

# Import basf2
import basf2
from basf2 import B2INFO, B2WARNING
import pybasf2
import modularAnalysis as ma
import b2bii

# Should come after basf2 import
import pdg
from fei import config
import basf2_mva

# Standard python modules
import collections
import os
import shutil
import typing
import pickle
import re
import functools
import subprocess
import multiprocessing

# Simple object containing the output of fei
FeiState = collections.namedtuple('FeiState', 'path, stage, plists')


class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Essentially we write out the number of MC particles in the whole dataset.
    These numbers are used to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly, we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingDataInformation object
        @param particles list of config.Particle objects
        """
        self.particles = particles
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # Key 0 stores the total number of events,
        # taken from the number of entries of the (last) histogram.
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        root_file.Close()
        return mc_counts
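
# For orientation, the dictionary returned by get_mc_counts has the following shape
# (the numbers here are made up):
#   {
#       511: {'sum': 1.2e8, 'std': 0.8, 'avg': 1.1, 'max': 4.0, 'min': 0.0},  # one entry per absolute pdg code
#       ...,
#       0: {'sum': 1.0e8},  # key 0 holds the total number of events
#   }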


class FSPLoader:
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything else required before loading FSPs;
    the user has to add this themselves (because it depends on the MC campaign and on whether
    you want to use Belle or Belle II data).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        self.particles = particles
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        if b2bii.isB2BII():
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                          ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                          ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
                ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        else:
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            pdgs = {abs(pdg.from_name(name)) for name in names}
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, path=path)
        return path


class TrainingData:
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After we have created the training data for a stage, we have to train the classifiers (see Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles
        """
        self.particles = particles
        self.config = config
        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D mesons we usually have an efficiency of 10^-3 including branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B mesons we usually have an efficiency of 10^-4 including branching fraction
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                filename = 'training_input.root'

                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and the training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1

                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
                    hist_filename = 'Monitor_TrainingData.root'
                    ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename),
                                            directory=config.removeJPsiSlash(f'{channel.label}'), path=path)

                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', f'{channel.label} variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        return path
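
# A worked example of the sampling arithmetic above (made-up numbers):
# with mc_counts[0]['sum'] = 1e8 events and a bestCandidateCut of 10 candidates per event,
# nBackground = 1e9 exceeds MaximumNumberOfMVASamples = 1e7, so
# inverseSamplingRates[0] = int(1e9 / 1e7) + 1 = 101,
# i.e. only every 101st background candidate is written to the training ntuple.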


class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method is applied.
    It includes:
     - The ParticleCombination (for each particle and channel we create candidates using
       the daughter candidates from the previous stages)
     - MC Matching
     - Vertex Fitting (this is the slowest part of the whole FEI; KFit is used by default,
       but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
       which will speed up the whole FEI by a factor of 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        self.particles = particles
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warning that decay mode ID won't be overwritten if it already exists
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
                # If we are not in monitor mode we do the mc matching now,
                # otherwise we already did it above!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

        return path


class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method was applied.
    It includes:
     - The application of the mva method itself.
     - Copying all channel lists into a common one for each particle defined in particles
     - Tagging unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        self.particles = particles
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weight file is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the mva method and merges the channel lists of each particle
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    expert.param('identifier', channel.label + '.xml')
                else:
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # suppress warning that signal probability won't be overwritten if it already exists
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                # suppress warning that unique signal extra info won't be overwritten if it already exists
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
                    f'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
        return path


class Teacher:
    """
    Performs all necessary trainings for all training data files which are
    available but for which no weight file exists yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using multi-threading in Python.
    Each training is done by a subprocess call; the training command (passed by config.externTeacher) can be either
     * basf2_mva_teacher, the training will be done directly on the machine
     * externClustTeacher, the training will be submitted to the batch system of KEKCC
    """

    #: Maximum number of samples per class used in a single MVA training
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of samples per class required for an MVA training
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        self.particles = particles
        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
<?xml version="1.0" encoding="utf-8"?>
<method>Trivial</method>
<weightfile>{channel}.xml</weightfile>
<treename>tree</treename>
<target_variable>isSignal</target_variable>
<weight_variable>__weight__</weight_variable>
<signal_class>1</signal_class>
<max_events>0</max_events>
<number_feature_variables>1</number_feature_variables>
<variable0>M</variable0>
<number_spectator_variables>0</number_spectator_variables>
<number_data_files>1</number_data_files>
<datafile0>train.root</datafile0>
<Trivial_version>1</Trivial_version>
<Trivial_output>0</Trivial_output>
<signal_fraction>0.066082567</signal_fraction>
"""
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake weight file or not
        @param filename the filename of the weight file
        """
        try:
            return '<method>Trivial</method>' in open(filename).readlines()[2]
        except BaseException:
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        # FEI uses multi-threading for parallel execution of tasks, therefore
        # the ROOT gui-thread is disabled, which otherwise sometimes interferes
        ROOT.PyConfig.StartGuiThread = False
        job_list = []
        filename = 'training_input.root'
        if not os.path.isfile(filename):
            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                      "No weight files will be provided.")
        else:
            f = ROOT.TFile.Open(filename, 'read')
            if f.IsZombie():
                B2WARNING("Training of MVC failed. ROOT file corrupt. "
                          "No weight files will be provided.")
            elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
                B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
                          "No weight files will be provided.")
            else:
                for particle in self.particles:
                    for channel in particle.channels:
                        weightfile = channel.label + '.xml'
                        if not basf2_mva.available(weightfile):
                            keys = [m for m in f.GetListOfKeys() if f"{channel.label}" in m.GetName()]
                            if not keys:
                                continue
                            tree = keys[0].ReadObj()
                            nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                            nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                            if nSig < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            if nBg < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few background events {nBg}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            variable_str = "' '".join(channel.mvaConfig.variables)

                            command = (f"{self.config.externTeacher}"
                                       f" --method '{channel.mvaConfig.method}'"
                                       f" --target_variable '{channel.mvaConfig.target}'"
                                       f" --treename '{channel.label} variables' --datafile 'training_input.root'"
                                       f" --signal_class 1 --variables '{variable_str}'"
                                       f" --identifier '{channel.label}.xml'"
                                       f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                            B2INFO(f"Used following command to invoke teacher: \n {command}")
                            job_list.append((channel.label, command))
            f.Close()
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()
        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles


def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files; the hashes can be converted to the new naming scheme
    using the Summary.pickle file written by FEIv3. This file must be passed via the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    summary = pickle.load(open(configuration.legacy, 'rb'))
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)

def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    stages = [
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+', 'B_s0']],
        []
    ]

    for p in particles:
        if p.name not in [q.name for stage in stages for q in stage]:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages
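
# A small sketch of how the stages can be inspected (assuming the default channels
# from default_channels.py are used):
#
#   import fei
#   from fei.core import get_stages_from_particles
#
#   stages = get_stages_from_particles(fei.get_default_channels())
#   for i, stage in enumerate(stages):
#       print(i, [p.name for p in stage])  # the last stage prints an empty list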


def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data.
    This function must either be called by the user after each stage of the FEI during training,
    or (more likely) it is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuples with the weight file on disk and the identifier in the database for all trained classifiers
    """
    teacher = Teacher(particles, configuration)
    return teacher.do_all_trainings()
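
# A sketch of the per-stage training workflow this function belongs to; the prefix
# is a placeholder, and the surrounding path/input setup is up to the user:
#
#   import basf2
#   import fei
#
#   particles = fei.get_default_channels()
#   configuration = fei.config.FeiConfiguration(prefix='MyFEITraining', training=True)
#   # Rerun basf2 with the following until get_path reports the final stage:
#   feistate = fei.get_path(particles, configuration)
#   path.add_path(feistate.path)  # path already contains RootInput etc.
#   basf2.process(path)
#   fei.do_trainings(particles, configuration)  # train the classifiers of the finished stage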


def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate exactly which configuration was used to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files
    for i in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    pickle.dump((particles, configuration), open('Summary.pickle', 'wb'))
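
# The Summary.pickle can be read back to inspect a training, mirroring what
# get_path does below:
#
#   import pickle
#   particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
#   print(configuration.prefix, configuration.cache)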


def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are used interchangeably)
    and for application/inference (both terms are used interchangeably).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
    i.e. we start with FSP, pi0, K_S0, D, D*, and finally B mesons. You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see the documentation of the Belle II Conditions Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True)
    to write out ROOT histograms of many quantities for each stage;
    from these histograms you can automatically create pdf files using the printReporting.py or latexReporting.py scripts.

    LEGACY
    This function can also use old FEI trainings (version 3): just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    print(r"""
    ____ _  _ _    _       ____ _  _ ____ _  _ ___    _ _  _ ___ ____ ____ ___  ____ ____ ___ ____ ___ _ ____ _  _
    |___ |  | |    |       |___ |  | |___ |\ |  |     | |\ |  |  |___ |__/ |__] |__/ |___  |  |__|  |  | |  | |\ |
    |    |__| |___ |___    |___  \/  |___ | \|  |     | | \|  |  |___ |  \ |    |  \ |___  |  |  |  |  | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache parameter of the configuration object is used during the training to keep track of
    # which reconstruction steps have already been performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and go up to 7, one stage at a time.
    # For application we can run stages 0 to 7 at once.
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes) to the new naming scheme.
    # Afterwards the algorithm runs as usual.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only e.g. every 10th candidate.
    # That's why we have to write out the TrainingDataInformation before doing anything else during the training phase.
    # During application we only need this if we run in monitor mode and want to write out a summary in the end;
    # the summary contains the efficiency, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles.
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything else
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method is applied):
    #  - Particle combination
    #  - Vertex fitting
    #  - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method is applied):
    #  - Apply the mva method
    #  - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function, which is called
    # either by the user after each step, or by the distributed.py script.
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going; as soon as the user calls process on the produced path they will get an error message that the
    # weight files are missing.
    #
    # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
    # intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics;
    # these statistics contain the runtime of each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but has only a small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above, the FEI keeps track of the stages which are already reconstructed during the training,
    # so we write out the Summary.pickle here and increase the stage by one.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage, and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)

# @endcond