#!/usr/bin/env python3



# @cond SUPPRESS_DOXYGEN

"""
 The Full Event Interpretation Algorithm

 Some basic facts:
 - The algorithm automatically reconstructs B mesons and calculates a signal probability for each candidate.
 - It can be used for hadronic and semileptonic tagging.
 - The algorithm has to be trained on MC and can afterwards be applied to data.
 - The training requires O(100) million MC events.
 - The weight files are stored in the Belle II Conditions Database.

 Read this file if you want to understand the technical details of the FEI.

 The FEI follows a hierarchical approach.
 There are 7 stages:
 (Stage -1: Write out information about the provided data sample)
 Stage 0: Final State Particles (FSP)
 Stage 1: pi0, J/Psi, Lambda0
 Stage 2: K_S0, Sigma+
 Stage 3: D and Lambda_c mesons
 Stage 4: D* mesons
 Stage 5: B mesons
 Stage 6: Finish

 Most stages consist of:
 - creating particle candidates,
 - applying cuts,
 - doing vertex fitting,
 - applying a multivariate classification method,
 - applying more cuts.

 The FEI reconstructs these 7 stages during the training phase.
 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
 to train all the necessary multivariate classifiers.
"""
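
# A minimal usage sketch (illustrative only, not part of this module): a typical
# steering file obtains the FEI path from get_path() below and appends it to its
# own analysis path. The prefix is a placeholder, not a real weight-file prefix.
#
#   import fei
#   particles = fei.get_default_channels()
#   configuration = fei.config.FeiConfiguration(prefix='FEI_example', training=False)
#   feistate = fei.get_path(particles, configuration)
#   main_path.add_path(feistate.path)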

# Import basf2
import basf2
from basf2 import B2INFO, B2WARNING
import pybasf2
import modularAnalysis as ma
import b2bii

# Should come after basf2 import
import pdg
from fei import config
import basf2_mva

# Standard python modules
import collections
import os
import shutil
import typing
import pickle
import re
import functools
import subprocess
import multiprocessing

# Simple object containing the output of fei
FeiState = collections.namedtuple('FeiState', 'path, stage, plists')


class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers are used to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable number of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly, this information is used for the generation of the monitoring PDFs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingDataInformation object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: filename of the ROOT file containing the MC particle counts
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct
        """
        import ROOT  # noqa
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # The special key 0 stores the total number of events
        # (taken from the entries of the last histogram read above).
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        root_file.Close()
        return mc_counts
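
    # Illustrative shape of the returned dictionary (values are made up):
    #   {511: {'sum': 1.2e6, 'std': 0.7, 'avg': 1.1, 'max': 4.0, 'min': 0.0},
    #    0:   {'sum': 1.0e6}}
    # i.e. one entry per absolute PDG code, plus the total event count under key 0.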


class FSPLoader:
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything else required before loading FSPs;
    the user has to add this themselves (because it depends on the MC campaign and on
    whether Belle or Belle II data is used).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        if b2bii.isB2BII():
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                          ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                          ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
                ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        else:
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, loadPhotonBeamBackgroundMVA=False, path=path)
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            pdgs = {abs(pdg.from_name(name)) for name in names}
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, path=path)
        return path


class TrainingData:
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After the training data for a stage has been created, the classifiers have to be trained (see the Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config
        #: containing the number of MC Particles
        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D mesons we usually have an efficiency of 10^-3 including the branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B mesons we usually have an efficiency of 10^-4 including the branching fraction
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                filename = 'training_input.root'

                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and the training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
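
                # Worked example (illustrative numbers): with 1e9 processed events and a
                # bestCandidateCut of 10, nBackground is ~1e10; since this exceeds
                # MaximumNumberOfMVASamples (1e7), the inverse sampling rate becomes
                # int(1e10 / 1e7) + 1 = 1001, i.e. only every 1001st background
                # candidate is written out as training data.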

                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
                    hist_filename = 'Monitor_TrainingData.root'
                    ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename),
                                            directory=config.removeJPsiSlash(f'{channel.label}'), path=path)

                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', f'{channel.label} variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        return path


class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method is applied.
    It includes:
    - the particle combination (for each particle and channel we create candidates using
      the daughter candidates from the previous stages),
    - MC matching,
    - vertex fitting (this is the slowest part of the whole FEI; KFit is used by default,
      but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
      which will speed up the whole FEI by a factor of 2-3).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warning that decay mode ID won't be overwritten if it already exists
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
                # If we are not in monitor mode we do the MC matching now,
                # otherwise it was already done above!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

        return path


class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method was applied.
    It includes:
    - the application of the mva method itself,
    - copying all channel lists into a common one for each particle defined in particles,
    - tagging unique signal candidates, to avoid double counting of channels with overlap.
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the mva method on all channels and merges the channel lists of each particle
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    expert.param('identifier', channel.label + '.xml')
                else:
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # suppress warning that signal probability won't be overwritten if it already exists
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                # suppress warning that unique signal extra info won't be overwritten if it already exists
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
                    f'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
        return path


class Teacher:
    """
    Performs all necessary trainings for all training data files which are
    available but for which no weight file is available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using Python's multiprocessing module.
    Each training is done by a subprocess call; the training command (passed via config.externTeacher) can be either
     * basf2_mva_teacher, in which case the training runs directly on the local machine, or
     * externClustTeacher, in which case the training is submitted to the batch system of KEKCC.
    """

    #: Maximum number of samples used in an MVA training
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of samples required for an MVA training
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the Trivial method; it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
            <?xml version="1.0" encoding="utf-8"?>
            <method>Trivial</method>
            <weightfile>{channel}.xml</weightfile>
            <treename>tree</treename>
            <target_variable>isSignal</target_variable>
            <weight_variable>__weight__</weight_variable>
            <signal_class>1</signal_class>
            <max_events>0</max_events>
            <number_feature_variables>1</number_feature_variables>
            <variable0>M</variable0>
            <number_spectator_variables>0</number_spectator_variables>
            <number_data_files>1</number_data_files>
            <datafile0>train.root</datafile0>
            <Trivial_version>1</Trivial_version>
            <Trivial_output>0</Trivial_output>
            <signal_fraction>0.066082567</signal_fraction>
            """
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake weight file or not
        @param filename the filename of the weight file
        """
        try:
            return '<method>Trivial</method>' in open(filename).readlines()[2]
        except BaseException:
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        """
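        # Illustrative naming convention (example values, not fixed by this code):
        # for prefix 'FEI_example' and channel label 'D0_0', the file 'D0_0.xml'
        # on disk is uploaded under the database identifier 'FEI_example_D0_0'.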
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        """
        import ROOT  # noqa
        # FEI uses multiprocessing for parallel execution of tasks, therefore
        # the ROOT gui-thread is disabled, which otherwise sometimes interferes
        ROOT.PyConfig.StartGuiThread = False
        job_list = []
        filename = 'training_input.root'
        if not os.path.isfile(filename):
            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                      "No weight files will be provided.")
        else:
            f = ROOT.TFile.Open(filename, 'read')
            if f.IsZombie():
                B2WARNING("Training of MVC failed. ROOT file corrupt. "
                          "No weight files will be provided.")
            elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
                B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
                          "No weight files will be provided.")
            else:
                for particle in self.particles:
                    for channel in particle.channels:
                        weightfile = channel.label + '.xml'
                        if not basf2_mva.available(weightfile):
                            keys = [m for m in f.GetListOfKeys() if f"{channel.label}" in m.GetName()]
                            if not keys:
                                continue
                            tree = keys[0].ReadObj()
                            nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                            nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                            if nSig < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            if nBg < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few background events {nBg}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            variable_str = "' '".join(channel.mvaConfig.variables)

                            command = (f"{self.config.externTeacher}"
                                       f" --method '{channel.mvaConfig.method}'"
                                       f" --target_variable '{channel.mvaConfig.target}'"
                                       f" --treename '{channel.label} variables' --datafile 'training_input.root'"
                                       f" --signal_class 1 --variables '{variable_str}'"
                                       f" --identifier '{channel.label}.xml'"
                                       f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
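                            # Roughly what a rendered command looks like (illustrative
                            # channel label, method and variables; real values come from
                            # the channel's mvaConfig):
                            #   basf2_mva_teacher --method 'FastBDT' --target_variable 'isSignal'
                            #       --treename 'D0_0 variables' --datafile 'training_input.root'
                            #       --signal_class 1 --variables 'M' 'chiProb'
                            #       --identifier 'D0_0.xml' ... > 'D0_0'.log 2>&1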
                            B2INFO(f"Used following command to invoke teacher: \n {command}")
                            job_list.append((channel.label, command))
                f.Close()
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()
        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles


def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files; the hashes can be converted to the new naming scheme
    using the Summary.pickle file written by FEIv3. This file must be passed via configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    summary = pickle.load(open(configuration.legacy, 'rb'))
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)


def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    stages = [
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+', 'B_s0']],
        []
    ]

    for p in particles:
        if p.name not in [q.name for stage in stages for q in stage]:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages


def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of the mva classifiers for all available training data.
    This function must either be called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuples of weight file on disk and identifier in the database for all trained classifiers
    """
    teacher = Teacher(particles, configuration)
    return teacher.do_all_trainings()


def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files
    for i in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    pickle.dump((particles, configuration), open('Summary.pickle', 'wb'))


def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are used interchangeably),
    and for application/inference (both terms are used interchangeably).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times; each time the FEI reconstructs one more stage in the
    hierarchical structure, i.e. we start with FSPs, then pi0, K_S0, D, D*, and end with B mesons.
    You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see the documentation of the Belle II Conditions Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True)
    to write out ROOT histograms of many quantities for each stage;
    with these histograms the printReporting.py and latexReporting.py scripts can automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3): just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
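    # Hedged sketch of the training loop implied by the docstring above
    # (one basf2 run per stage; do_trainings fits the classifiers in between):
    #   feistate = get_path(particles, configuration)   # configuration.training == True
    #   basf2.process(feistate.path)
    #   do_trainings(particles, configuration)
    # repeated until feistate.stage reaches the final stage.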
    print(r"""
    ____ _  _ _    _    ____ _  _ ____ _  _ ___    _ _  _ ___ ____ ____ ___  ____ ____ ___ ____ ___ _ ____ _  _
    |___ |  | |    |    |___ |  | |___ |\ |  |     | |\ |  |  |___ |__/ |__] |__/ |___  |  |__|  |  | |  | |\ |
    |    |__| |___ |___ |___  \/  |___ | \|  |     | | \|  |  |___ |  \ |    |  \ |___  |  |  |  |  | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache parameter of the configuration object is used during training to keep track of
    # which reconstruction steps have already been performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and go up to 7 one stage at a time.
    # For application we can run stages 0 to 7 at once.
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes) to the new naming scheme.
    # Afterwards the algorithm runs as usual.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only e.g. every 10th candidate.
    # That's why we have to write out the TrainingDataInformation before doing anything else during the training phase.
    # During application we only need this if we run in monitor mode and want to write out a summary in the end;
    # the summary contains efficiencies, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles.
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method is applied):
    #   - Particle combination
    #   - Vertex fitting
    #   - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method is applied):
    #   - Apply the mva method
    #   - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function, which is called
    # either by the user after each step or by the distributed.py script.
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going; as soon as the user calls process on the produced path, they will get an error message stating that the
    # weight files are missing.
    #
    # Finally, we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of
    # the intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics;
    # these statistics contain the runtime for each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but has only a small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above, the FEI keeps track of the stages which are already reconstructed during the training,
    # so we write out the Summary.pickle here and increase the stage by one.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage, and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)

# @endcond