Belle II Software  light-2403-persian
core.py
1 #!/usr/bin/env python3
2 
3 
10 
11 # @cond SUPPRESS_DOXYGEN
12 
13 """
14  The Full Event Interpretation Algorithm
15 
16  Some basic facts:
17  - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18  - It can be used for hadronic and semileptonic tagging.
19  - The algorithm has to be trained on MC, and can afterwards be applied on data.
20  - The training requires O(100) million MC events
21  - The weight files are stored in the Belle II Condition database
22 
23  Read this file if you want to understand the technical details of the FEI.
24 
25  The FEI follows a hierarchical approach.
26  There are 7 stages:
27  (Stage -1: Write out information about the provided data sample)
28  Stage 0: Final State Particles (FSP)
29  Stage 1: pi0, J/Psi, Lambda0
30  Stage 2: K_S0, Sigma+
31  Stage 3: D and Lambda_c mesons
32  Stage 4: D* mesons
33  Stage 5: B mesons
34  Stage 6: Finish
35 
36  Most stages consist of:
37  - Create Particle Candidates
38  - Apply Cuts
39  - Do vertex Fitting
40  - Apply a multivariate classification method
41  - Apply more Cuts
42 
43  The FEI will reconstruct these 7 stages during the training phase,
44  since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45  to train all the necessary multivariate classifiers.
46 """
47 
48 # Import basf2
49 import basf2
50 from basf2 import B2INFO, B2WARNING
51 import pybasf2
52 import modularAnalysis as ma
53 import b2bii
54 
55 # Should come after basf2 import
56 import pdg
57 from fei import config
58 import basf2_mva
59 
60 # Standard python modules
61 import collections
62 import os
63 import shutil
64 import typing
65 import pickle
66 import re
67 import functools
68 import subprocess
69 import multiprocessing
70 
# Lightweight immutable record bundling the outcome of the FEI path construction:
# the basf2 path to process, the stage that was reached, and the produced particle lists.
FeiState = collections.namedtuple('FeiState', ['path', 'stage', 'plists'])
73 
74 
class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers can be used to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: filename of the ROOT file into which the MC particle counts are written
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available,
        i.e. the counting step has already produced its output file.
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        One histogram per absolute PDG code is filled and stored in self.filename.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        # One histogram per PDG code, counting the per-event multiplicity in [0, 99]
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct.
        @return dict mapping the absolute PDG code to a dict with the keys
                'sum', 'std', 'avg', 'max' and 'min'; the special key 0 carries
                the total number of events under 'sum'.
        """
        # Unique absolute pdg-codes of all particles
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            # Undo the ROOT-compatible name mangling to recover the original variable name
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            # Extract the PDG code from 'NumberOfMCParticlesInEvent(<pdg>)'
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            # Total number of MC particles = sum over bins of (multiplicity * number of events)
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # NOTE(review): 'hist' here is the histogram of the last loop iteration; its
        # GetEntries() is used as the total number of events. This raises NameError if
        # the file contains no histograms — confirm that case cannot occur.
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        root_file.Close()
        return mc_counts
144 
145 
class FSPLoader:
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything required before loading FSPs,
    the user has to add this himself (because it depends on the MC campaign and if you want
    to use Belle or Belle II).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        # For converted Belle MC (b2bii) some lists are copied from the existing
        # mdst lists; otherwise all FSP lists are filled from scratch.
        if b2bii.isB2BII():
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                          ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                          ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
                ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        else:
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
            # V0-like particles are reconstructed from their decay products
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        # In monitor mode, record the per-event MC multiplicity of each FSP type
        if self.config.monitor:
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            pdgs = {abs(pdg.from_name(name)) for name in names}
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, path=path)
        return path
193 
194 
class TrainingData:
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config
        #: containing number of MC Particles (as returned by TrainingDataInformation.get_mc_counts)
        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D-Mesons we usually have an efficiency of 10^-3 including branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B-Mesons we usually have an efficiency of 10^-4 including branching fraction
            # (the two divisions stack: B mesons get both factors)
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                filename = 'training_input.root'

                # Estimated number of background candidates after the best-candidate cut
                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1

                # The target variable (and the sPlot variable, if any) are written out
                # alongside the feature variables
                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    # NOTE(review): identity comparison ('is not') only excludes the very object
                    # appended as spectator; an equal-but-distinct string would pass — confirm intended.
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
                    hist_filename = 'Monitor_TrainingData.root'
                    ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename),
                                            directory=config.removeJPsiSlash(f'{channel.label}'), path=path)

                # Write out features + spectators of the (sampled) candidates as an ntuple
                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', f'{channel.label} variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        return path
269 
270 
class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method was applied
    It Includes:
        - The ParticleCombination (for each particle and channel we create candidates using
          the daughter candidates from the previous stages)
        - MC Matching
        - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
          but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
          this will speed up the whole FEI by a factor 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    # A single-daughter "channel" is just a filtered copy of an existing list
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warning that decay mode ID won't be overwritten if it already exists
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    # Combine the daughter lists into candidates for this channel
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

                # Best-candidate ranking: keep only the bestCandidateCut best candidates
                # according to bestCandidateVariable
                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
                # If we are not in monitor mode we do the mc matching now,
                # otherwise we did it above already!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    # For converted Belle MC the V0s already have a good vertex,
                    # only a vertex fit with confidence-level cut is applied
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    # pi0 candidates get a mass fit instead of a vertex fit
                    if particle.name in ['pi0']:
                        pvfit.param('fitType', 'mass')
                    else:
                        pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

        return path
394 
395 
class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method was applied
    It Includes:
        - The application of the mva method itself.
        - Copying all channel lists in a common one for each particle defined in particles
        - Tag unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available,
        i.e. a weight file exists for every channel.
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                # Apply the trained classifier; the response is stored as
                # extra info 'SignalProbability' on every candidate
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    # During training the weight file is read from the local directory
                    expert.param('identifier', channel.label + '.xml')
                else:
                    # During application it is read from the condition database
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # suppress warning that signal probability won't be overwritten if it already exists
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                # Tag one candidate per true signal to avoid double counting
                # of channels with overlap
                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                # suppress warning that unique signal extra info won't be overwritten if it already exists
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

            # Post cut on the signal probability; an empty cutstring keeps everything
            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            # Merge all channel lists into one list per particle
            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            # Keep only the bestCandidateCut candidates with the highest signal probability
            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                # B mesons additionally get the beam-constrained mass (and, except for B_s0,
                # the angle to the nominal B) written out
                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
                    f'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
        return path
544 
545 
class Teacher:
    """
    Performs all necessary trainings for all training data files which are
    available but where there is no weight file available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using multi-threading of python.
    Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
      * basf2_mva_teacher, the training will be done directly on the machine
      * externClustTeacher, the training will be submitted to the batch system of KEKCC
    """

    #: Maximum number of samples used in a training; larger samples are downsampled
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of signal/background samples required; below this a fake weight file is created
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
            <?xml version="1.0" encoding="utf-8"?>
            <method>Trivial</method>
            <weightfile>{channel}.xml</weightfile>
            <treename>tree</treename>
            <target_variable>isSignal</target_variable>
            <weight_variable>__weight__</weight_variable>
            <signal_class>1</signal_class>
            <max_events>0</max_events>
            <number_feature_variables>1</number_feature_variables>
            <variable0>M</variable0>
            <number_spectator_variables>0</number_spectator_variables>
            <number_data_files>1</number_data_files>
            <datafile0>train.root</datafile0>
            <Trivial_version>1</Trivial_version>
            <Trivial_output>0</Trivial_output>
            <signal_fraction>0.066082567</signal_fraction>
            """
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake-weight file or not
        @param filename the filename of the weight file
        @return True if the file is fake (or cannot be read), False otherwise
        """
        try:
            # A fake weight file has '<method>Trivial</method>' on its third line
            # (the first line of the f-string template is empty).
            # Use a context manager so the file handle is closed deterministically.
            with open(filename) as weightfile:
                return '<method>Trivial</method>' in weightfile.readlines()[2]
        except BaseException:
            # Missing, unreadable or too-short files are treated as fake
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        @return tuple (weight file on disk, identifier in the database)
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        @return list of tuples (weight file on disk, identifier in the database)
                for all classifiers uploaded by this call
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        # FEI uses multi-threading for parallel execution of tasks therefore
        # the ROOT gui-thread is disabled, which otherwise interferes sometimes
        ROOT.PyConfig.StartGuiThread = False
        job_list = []
        filename = 'training_input.root'
        if not os.path.isfile(filename):
            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                      "No weight files will be provided.")
        else:
            f = ROOT.TFile.Open(filename, 'read')
            if f.IsZombie():
                B2WARNING("Training of MVC failed. ROOT file corrupt. "
                          "No weight files will be provided.")
            elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
                B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
                          "No weight files will be provided.")
            else:
                for particle in self.particles:
                    for channel in particle.channels:
                        weightfile = channel.label + '.xml'
                        # Only train channels which don't have a weight file yet
                        if not basf2_mva.available(weightfile):
                            keys = [m for m in f.GetListOfKeys() if f"{channel.label}" in m.GetName()]
                            if not keys:
                                continue
                            tree = keys[0].ReadObj()
                            nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                            nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                            # Channels with too little statistics get a fake (trivial) classifier
                            # so the rest of the FEI can proceed
                            if nSig < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            if nBg < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            variable_str = "' '".join(channel.mvaConfig.variables)

                            command = (f"{self.config.externTeacher}"
                                       f" --method '{channel.mvaConfig.method}'"
                                       f" --target_variable '{channel.mvaConfig.target}'"
                                       f" --treename '{channel.label} variables' --datafile 'training_input.root'"
                                       f" --signal_class 1 --variables '{variable_str}'"
                                       f" --identifier '{channel.label}.xml'"
                                       f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                            B2INFO(f"Used following command to invoke teacher: \n {command}")
                            job_list.append((channel.label, command))
            f.Close()
        # Run all trainings in parallel; each training is an external subprocess call
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()
        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                # Training failed silently: provide a fake classifier so later stages still run
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles
692 
693 
def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
    using the Summary.pickle file outputted by the FEIv3. This file must be passed by the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    # Close the pickle file deterministically instead of leaking the handle
    with open(configuration.legacy, 'rb') as legacy_file:
        summary = pickle.load(legacy_file)
    # Map channel label -> legacy weight-file hash (third entry of each summary tuple)
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            # The legacy summary used 'J/psi' in the labels, the new scheme uses 'Jpsi'
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    # No legacy training exists for this channel: provide a fake weight file
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    # Download the legacy weight file and re-upload it under the new name
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)
718 
719 
def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    @return list of 7 lists (stages 0-6), each containing the config.Particle objects of that stage
    @raise RuntimeError if a particle does not belong to any stage
    """
    stages = [
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+', 'B_s0']],
        []
    ]

    # Build the set of known names once, instead of re-flattening all stages
    # for every particle (was O(n^2))
    known_names = {q.name for stage in stages for q in stage}
    for p in particles:
        if p.name not in known_names:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages
742 
743 
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data,
    this function must be either called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuple with weight file on disk and identifier in database for all trained classifiers
    """
    # Delegate the actual work to a Teacher instance built for this configuration
    return Teacher(particles, configuration).do_all_trainings()
755 
756 
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    # Store the current cache level in a new FeiConfiguration; all other fields are copied
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files, shifting older backups to higher indices
    # (highest index first so nothing is overwritten before it is copied)
    for i in reversed(range(9)):
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    # Close the file handle deterministically instead of leaking it
    with open('Summary.pickle', 'wb') as summary_file:
        pickle.dump((particles, configuration), summary_file)
775 
776 
def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
    i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see documentation for the Belle2 Condition Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True),
    to write out ROOT Histograms of many quantities for each stage,
    using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return FeiState containing the constructed path, the next stage to process, and the ParticleList names used so far
    """
    print(r"""
 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|

 Author: Thomas Keck 2014 - 2017
 Please cite my PhD thesis
 """)

    # The cache parameter of the configuration object is used during training to keep track,
    # which reconstruction steps are already performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            # A Summary.pickle from a previous invocation takes precedence over the
            # caller-supplied arguments; both particles and configuration are replaced.
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and go to 7 one stage at a time
    # For application we can run stage 0 to 7 at once
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes), to the new naming scheme.
    # Afterwards the algorithm runs as usual
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only every e.g. 10th candidate
    # That's why we have to write out the TrainingDataInformation before doing anything during the training phase.
    # During application we only need this if we run in monitor mode, and want to write out a summary in the end,
    # the summary contains efficiency, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        # NOTE(review): configuration.training is already known to be True inside this
        # branch, so this inner check is redundant; kept as-is.
        if configuration.training:
            save_summary(particles, configuration, 0)
        # Early return: this path only counts events/McParticles; the caller must
        # process it and invoke get_path again to reach stage 0.
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method was applied):
    #   - Particle combination
    #   - Do vertex fitting
    #   - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method was applied):
    #   - Apply the mva method
    #   - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function which is called
    # either by the user after each step, or by the distributed.py script
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going, as soon as the user will call process on the produced path he will get an error message that the
    # weight files are missing.
    #
    # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
    # intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            # Missing weight files during training: write out training data for this
            # stage and stop here — later stages depend on this stage's classifiers.
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics,
    # these statistics contain the runtime for each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but of only small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above the FEI keeps track of the stages which are already reconstructed during the training
    # so we write out the Summary.pickle here, and increase the stage by one.
    # Note: 'stage' retains the value of the last executed loop iteration — either the
    # stage at which training-data creation broke out of the loop, or the final stage.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)
930 
931 # @endcond
def isB2BII()
Definition: b2bii.py:14
def from_name(name)
Definition: pdg.py:63