Belle II Software development
core.py
#!/usr/bin/env python3

# @cond SUPPRESS_DOXYGEN

13"""
14 The Full Event Interpretation Algorithm
15
16 Some basic facts:
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
22
23 Read this file if you want to understand the technical details of the FEI.
24
25 The FEI follows a hierarchical approach.
26 There are 7 stages:
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
30 Stage 2: K_S0, Sigma+
31 Stage 3: D and Lambda_c mesons
32 Stage 4: D* mesons
33 Stage 5: B mesons
34 Stage 6: Finish
35
36 Most stages consists of:
37 - Create Particle Candidates
38 - Apply Cuts
39 - Do vertex Fitting
40 - Apply a multivariate classification method
41 - Apply more Cuts
42
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
46"""
47
48# Import basf2
49import basf2
50from basf2 import B2INFO, B2WARNING
51import pybasf2
52import modularAnalysis as ma
53import b2bii
54
55# Should come after basf2 import
56import pdg
57from fei import config
58import basf2_mva
59
60# Standard python modules
61import collections
62import os
63import shutil
64import typing
65import pickle
66import re
67import functools
68import subprocess
69import multiprocessing
70
71# Simple object containing the output of fei
72FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
73
74
class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers can be used to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly, we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingDataInformation object
        @param particles list of config.Particle objects
        """

        self.particles = particles

        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        module.param('ignoreCommandLineOverride', True)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # The number of entries is identical for every histogram and equals the number of processed events
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        root_file.Close()
        return mc_counts


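# For orientation, a sketch of the mapping returned by get_mc_counts (values invented
# for illustration): each absolute pdg code maps to statistics of
# NumberOfMCParticlesInEvent, and the special key 0 holds the total number of events.
#
#   {
#       521: {'sum': 1.2e8, 'std': 1.1, 'avg': 1.2, 'max': 4.0, 'min': 0.0},
#       ...,
#       0: {'sum': 1.0e8},
#   }
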
class FSPLoader:
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything else required before loading FSPs;
    the user has to add this themselves (because it depends on the MC campaign and on whether
    you run over Belle or Belle II data).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """

        self.particles = particles

        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        if b2bii.isB2BII():
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                          ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                          ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
                ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        else:
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            pdgs = {abs(pdg.from_name(name)) for name in names}
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, ignoreCommandLineOverride=True, path=path)
        return path


class TrainingData:
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles
        """

        self.particles = particles

        self.config = config

        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D mesons we usually have an efficiency of 10^-3 including branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B mesons we usually have an efficiency of 10^-4 including branching fraction
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                filename = 'training_input.root'

                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1

                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x != channel.mvaConfig.target]
                    hist_filename = 'Monitor_TrainingData.root'
                    ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename),
                                            ignoreCommandLineOverride=True,
                                            directory=config.removeJPsiSlash(f'{channel.label}'), path=path)

                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', f'{channel.label} variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                teacher.param('ignoreCommandLineOverride', True)
                path.add_module(teacher)
        return path


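# A worked example of the inverse-sampling arithmetic in TrainingData.reconstruct
# (numbers invented for illustration): with mc_counts[0]['sum'] = 1e8 events and
# bestCandidateCut = 10, nBackground = 1e9 exceeds Teacher.MaximumNumberOfMVASamples = 1e7,
# so inverseSamplingRates[0] = int(1e9 / 1e7) + 1 = 101, i.e. only every 101st
# background candidate is written out for the training.
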
class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method is applied.
    It includes:
        - The ParticleCombination (for each particle and channel we create candidates using
          the daughter candidates from the previous stages)
        - MC Matching
        - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
          but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/;
          this will speed up the whole FEI by a factor 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """

        self.particles = particles

        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warning that decay mode ID won't be overwritten if it already exists
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(
                        channel.name,
                        variables=config.variables2binnings(hist_variables),
                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                        filename=filename,
                        ignoreCommandLineOverride=True,
                        directory=f'{channel.label}',
                        path=path)

                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(
                        channel.name,
                        variables=config.variables2binnings(hist_variables),
                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                        filename=filename,
                        ignoreCommandLineOverride=True,
                        directory=f'{channel.label}',
                        path=path)
                # If we are not in monitor mode we do the mc matching now,
                # otherwise we did it above already!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    if particle.name in ['pi0']:
                        pvfit.param('fitType', 'mass')
                    else:
                        pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(
                        channel.name,
                        variables=config.variables2binnings(hist_variables),
                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                        filename=filename,
                        ignoreCommandLineOverride=True,
                        directory=f'{channel.label}',
                        path=path)

        return path


class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method is applied.
    It includes:
        - The application of the mva method itself
        - Copying all channel lists into a common one for each particle defined in particles
        - Tagging unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """

        self.particles = particles

        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the mva methods on the particle candidates,
        merges the channel lists and applies the post cuts
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    expert.param('identifier', channel.label + '.xml')
                else:
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # suppress warning that signal probability won't be overwritten if it already exists
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                # suppress warning that unique signal extra info won't be overwritten if it already exists
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(
                        channel.name,
                        variables=config.variables2binnings(hist_variables),
                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                        filename=filename,
                        ignoreCommandLineOverride=True,
                        directory=f'{channel.label}',
                        path=path)

            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    ignoreCommandLineOverride=True,
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    ignoreCommandLineOverride=True,
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    ignoreCommandLineOverride=True,
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(
                    particle.identifier,
                    variables,
                    treename=config.removeJPsiSlash(f'{particle.identifier} variables'),
                    filename=config.removeJPsiSlash(filename),
                    ignoreCommandLineOverride=True,
                    path=path)
        return path


class Teacher:
    """
    Performs all necessary trainings for all training data files which are
    available but for which no weight file is available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using Python's multiprocessing module.
    Each training is done by a subprocess call; the training command (passed by config.externTeacher) can be either
        * basf2_mva_teacher, the training will be done directly on the machine
        * externClustTeacher, the training will be submitted to the batch system of KEKCC
    """

    #: Maximum number of samples used in a single training
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of samples required for a training
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """

        self.particles = particles

        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
            <?xml version="1.0" encoding="utf-8"?>
            <method>Trivial</method>
            <weightfile>{channel}.xml</weightfile>
            <treename>tree</treename>
            <target_variable>isSignal</target_variable>
            <weight_variable>__weight__</weight_variable>
            <signal_class>1</signal_class>
            <max_events>0</max_events>
            <number_feature_variables>1</number_feature_variables>
            <variable0>M</variable0>
            <number_spectator_variables>0</number_spectator_variables>
            <number_data_files>1</number_data_files>
            <datafile0>train.root</datafile0>
            <Trivial_version>1</Trivial_version>
            <Trivial_output>0</Trivial_output>
            <signal_fraction>0.066082567</signal_fraction>
            """
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake weight file or not
        @param filename the filename of the weight file
        """
        try:
            return '<method>Trivial</method>' in open(filename).readlines()[2]
        except BaseException:
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        # FEI uses multi-threading for parallel execution of tasks, therefore
        # the ROOT gui-thread is disabled, which otherwise interferes sometimes
        ROOT.PyConfig.StartGuiThread = False
        job_list = []
        filename = 'training_input.root'
        if not os.path.isfile(filename):
            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                      "No weight files will be provided.")
        else:
            f = ROOT.TFile.Open(filename, 'read')
            if f.IsZombie():
                B2WARNING("Training of MVC failed. ROOT file corrupt. "
                          "No weight files will be provided.")
            elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
                B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
                          "No weight files will be provided.")
            else:
                for particle in self.particles:
                    for channel in particle.channels:
                        weightfile = channel.label + '.xml'
                        if not basf2_mva.available(weightfile):
                            keys = [m for m in f.GetListOfKeys() if f"{channel.label}" in m.GetName()]
                            if not keys:
                                continue
                            tree = keys[0].ReadObj()
                            nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                            nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                            if nSig < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            if nBg < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few background events {nBg}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            variable_str = "' '".join(channel.mvaConfig.variables)

                            command = (f"{self.config.externTeacher}"
                                       f" --method '{channel.mvaConfig.method}'"
                                       f" --target_variable '{channel.mvaConfig.target}'"
                                       f" --treename '{channel.label} variables' --datafile 'training_input.root'"
                                       f" --signal_class 1 --variables '{variable_str}'"
                                       f" --identifier '{channel.label}.xml'"
                                       f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                            B2INFO(f"Used following command to invoke teacher: \n {command}")
                            job_list.append((channel.label, command))
                f.Close()
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()
        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles


def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files; the hashes can be converted to the new naming scheme
    using the Summary.pickle file written out by FEIv3. This file must be passed via the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    summary = pickle.load(open(configuration.legacy, 'rb'))
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)


def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    stages = [
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+', 'B_s0']],
        []
    ]

    for p in particles:
        if p.name not in [q.name for stage in stages for q in stage]:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages


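# For example, with the default FEI channels the B mesons end up in stages[5]
# (`fei.get_default_channels` is assumed here purely for illustration):
#
#   stages = get_stages_from_particles(fei.get_default_channels())
#   assert len(stages) == 7   # the last stage is empty
#   print([p.name for p in stages[5]])   # e.g. ['B0', 'B+']
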
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data.
    This function must either be called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuples with weight file on disk and identifier in database for all trained classifiers
    """
    teacher = Teacher(particles, configuration)
    return teacher.do_all_trainings()


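# A simplified sketch of the resulting stage-wise training loop (in practice
# distributed.py handles this, persists intermediate output between stages, and
# merges job outputs; 'mc_sample.root' and the prefix are placeholders):
#
#   for _ in range(7):
#       path = basf2.create_path()
#       ma.inputMdst(filename='mc_sample.root', path=path)
#       configuration = fei.config.FeiConfiguration(prefix='FEIv4_example', training=True)
#       feistate = fei.get_path(particles, configuration)
#       path.add_path(feistate.path)
#       basf2.process(path)
#       do_trainings(particles, configuration)
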
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate exactly which configuration was used to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files
    for i in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    pickle.dump((particles, configuration), open('Summary.pickle', 'wb'))


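# Since Summary.pickle stores the plain (particles, configuration) tuple, a training
# can be inspected offline, e.g.:
#
#   import pickle
#   particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
#   print(configuration.cache)                  # the stage the training has reached
#   print([p.identifier for p in particles])    # all reconstructed particle lists
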
def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (the terms are used interchangeably),
    and for application/inference (likewise used interchangeably).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times; each time the FEI reconstructs one more stage
    in the hierarchical structure, i.e. we start with the FSPs, then pi0, K_S0, D, D*, and finally the B mesons.
    You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see the documentation for the Belle II Conditions Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True)
    to write out ROOT histograms of many quantities for each stage;
    from these histograms the printReporting.py and latexReporting.py scripts can automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3): just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    print(r"""
    ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
    |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
    | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache parameter of the configuration object is used during training to keep track
    # of which reconstruction steps are already performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and advance one stage at a time.
    # For application we can run all stages at once.
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes) to the new naming scheme.
    # Afterwards the algorithm runs as usual.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only every e.g. 10th candidate.
    # That's why we have to write out the TrainingDataInformation before doing anything else during the training phase.
    # During application we only need this if we run in monitor mode and want to write out a summary in the end;
    # the summary contains the efficiency, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles.
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything else
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method is applied):
    #   - Particle combination
    #   - Vertex fitting
    #   - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method is applied):
    #   - Apply the mva method
    #   - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function, which is called
    # either by the user after each step or by the distributed.py script.
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going; as soon as the user calls process on the produced path, they will get an error message that the
    # weight files are missing.
    #
    # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
    # intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics;
    # these statistics contain the runtime for each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but has only a small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above, the FEI keeps track of the stages which are already reconstructed during the training,
    # so we write out the Summary.pickle here and increase the stage by one.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)

# @endcond