14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle 2 Condition database
23 Read this file if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D and Lambda_c mesons
36 Most stages consist of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase.
44 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50 from basf2
import B2INFO, B2WARNING
52 import modularAnalysis
as ma
57 from fei
import config
69 import multiprocessing
# Internal state of the FEI, handed back to the caller by get_path():
# 'path' is the basf2 path to process, 'stage' the reconstruction stage that
# was reached, and 'plists' the particle lists which were filled along the way.
FeiState = collections.namedtuple('FeiState', ['path', 'stage', 'plists'])
class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers we can use to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: ROOT file the MC-particle counts are written to / read from
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        """
        # unique absolute pdg-codes of all provided particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        # one histogram per pdg code, counting the MC particles per event (0..99)
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct
        """
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            # histogram names are NumberOfMCParticlesInEvent(<pdg>); extract <pdg>
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            # total number of MC particles = sum over bins of (count-per-event * #events)
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # Every histogram has one entry per event, so the entry count of the
        # last histogram read gives the total number of events.
        # NOTE(review): assumes the file contains at least one histogram.
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        return mc_counts
147 Steers the loading of FSP particles.
148 This does NOT include RootInput, Geometry or anything required before loading FSPs,
149 the user has to add this themselves (because it depends on the MC campaign and if you want
150 to use Belle 1 or Belle 2).
153 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
155 Create a new FSPLoader object
156 @param particles list of config.Particle objects
157 @param config config.FeiConfiguration object
160 self.particles = particles
164 def reconstruct(self) -> pybasf2.Path:
166 Returns pybasf2.Path which loads the FSP Particles
168 path = basf2.create_path()
171 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
172 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
173 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
174 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
175 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
176 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
178 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
179 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
180 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, loadPhotonBeamBackgroundMVA=
False, path=path)
181 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
182 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
183 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
185 if self.config.monitor:
186 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
187 filename =
'Monitor_FSPLoader.root'
189 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
190 ma.variablesToHistogram(
'', variables=variables, filename=filename, path=path)
196 Steers the creation of the training data.
197 The training data is used to train a multivariate classifier for each channel.
198 The training of the FEI at its core is just generating this training data for each channel.
199 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
202 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
203 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
205 Create a new TrainingData object
206 @param particles list of config.Particle objects
207 @param config config.FeiConfiguration object
208 @param mc_counts containing number of MC Particles
211 self.particles = particles
215 self.mc_counts = mc_counts
217 def reconstruct(self) -> pybasf2.Path:
219 Returns pybasf2.Path which creates the training data for the given particles
221 path = basf2.create_path()
223 for particle
in self.particles:
225 nSignal = self.mc_counts[pdgcode][
'sum']
233 for channel
in particle.channels:
234 filename =
'training_input.root'
236 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
237 inverseSamplingRates = {}
240 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
241 inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
242 if nSignal > Teacher.MaximumNumberOfMVASamples:
243 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
245 spectators = [channel.mvaConfig.target]
246 if channel.mvaConfig.sPlotVariable
is not None:
247 spectators.append(channel.mvaConfig.sPlotVariable)
249 if self.config.monitor:
250 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
251 hist_variables_2d = [(x, channel.mvaConfig.target)
252 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
253 hist_filename =
'Monitor_TrainingData.root'
254 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
255 variables_2d=config.variables2binnings_2d(hist_variables_2d),
256 filename=config.removeJPsiSlash(hist_filename),
257 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
259 teacher = basf2.register_module(
'VariablesToNtuple')
260 teacher.set_name(
'VariablesToNtuple_' + channel.name)
261 teacher.param(
'fileName', filename)
262 teacher.param(
'treeName', f
'{channel.label} variables')
263 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
264 teacher.param(
'particleList', channel.name)
265 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
266 path.add_module(teacher)
class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method was applied
    It includes:
    - The ParticleCombination (for each particle and channel we create candidates using
      the daughter candidates from the previous stages)
    - MC matching
    - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
      but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
      this will speed up the whole FEI by a factor 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    # A channel with a single daughter is just a filtered copy of that
                    # list; the decay mode is attached as extra info.
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut,
                                      writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warnings, the extra info is overwritten deliberately
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)

                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

                # Best candidate selection before the mva is applied
                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
                # Not in monitor mode: the training still needs the mc truth target
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                # NOTE(review): the first condition of this vertex-fit chain was lost
                # in this view of the file; it is presumed to force a fit for copied
                # B2BII V0 candidates -- confirm against the repository.
                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
        return path
class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method was applied
    It includes:
    - The application of the mva method itself.
    - Copying all channel lists in a common one for each particle defined in particles
    - Tag unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # during training the weight file is identified by the channel label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the mva methods and merges the channel lists
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    # during training the weight file is taken from disk
                    expert.param('identifier', channel.label + '.xml')
                else:
                    # during application it is taken from the condition database
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # silence errors for channels whose training failed
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
                    f'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
        return path
544 Performs all necessary trainings for all training data files which are
545 available but where there is no weight file available yet.
546 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
547 The trainings are run in parallel using Python's multiprocessing module.
548 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
549 * basf2_mva_teacher, the training will be done directly on the machine
550 * externClustTeacher, the training will be submitted to the batch system of KEKCC
554 MaximumNumberOfMVASamples = int(1e7)
557 MinimumNumberOfMVASamples = int(5e2)
559 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
561 Create a new Teacher object
562 @param particles list of config.Particle objects
563 @param config config.FeiConfiguration object
566 self.particles = particles
571 def create_fake_weightfile(channel: str):
573 Create a fake weight file using the trivial method, it will always return 0.0
574 @param channel for which we create a fake weight file
577 <?xml version="1.0" encoding="utf-8"?>
578 <method>Trivial</method>
579 <weightfile>{channel}.xml</weightfile>
580 <treename>tree</treename>
581 <target_variable>isSignal</target_variable>
582 <weight_variable>__weight__</weight_variable>
583 <signal_class>1</signal_class>
584 <max_events>0</max_events>
585 <number_feature_variables>1</number_feature_variables>
586 <variable0>M</variable0>
587 <number_spectator_variables>0</number_spectator_variables>
588 <number_data_files>1</number_data_files>
589 <datafile0>train.root</datafile0>
590 <Trivial_version>1</Trivial_version>
591 <Trivial_output>0</Trivial_output>
592 <signal_fraction>0.066082567</signal_fraction>
594 with open(channel +
".xml",
"w")
as f:
598 def check_if_weightfile_is_fake(filename: str):
600 Checks if the provided filename is a fake-weight file or not
601 @param filename the filename of the weight file
604 return '<method>Trivial</method>' in open(filename).readlines()[2]
605 except BaseException:
609 def upload(self, channel: str):
611 Upload the weight file into the condition database
612 @param channel whose weight file is uploaded
614 disk = channel +
'.xml'
615 dbase = self.config.prefix +
'_' + channel
616 basf2_mva.upload(disk, dbase)
619 def do_all_trainings(self):
621 Do all trainings for which we find training data
626 ROOT.PyConfig.StartGuiThread =
False
628 filename =
'training_input.root'
629 if not os.path.isfile(filename):
630 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
631 "No weight files will be provided.")
633 f = ROOT.TFile.Open(filename,
'read')
635 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
636 "No weight files will be provided.")
637 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
638 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
639 "No weight files will be provided.")
641 for particle
in self.particles:
642 for channel
in particle.channels:
643 weightfile = channel.label +
'.xml'
644 if not basf2_mva.available(weightfile):
645 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label}" in m.GetName()]
648 tree = keys[0].ReadObj()
649 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1.0')
650 nBg = tree.GetEntries(channel.mvaConfig.target +
' != 1.0')
651 if nSig < Teacher.MinimumNumberOfMVASamples:
652 B2WARNING(
"Training of MVC failed. "
653 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
654 self.create_fake_weightfile(channel.label)
655 self.upload(channel.label)
657 if nBg < Teacher.MinimumNumberOfMVASamples:
658 B2WARNING(
"Training of MVC failed. "
659 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
660 self.create_fake_weightfile(channel.label)
661 self.upload(channel.label)
663 variable_str =
"' '".join(channel.mvaConfig.variables)
665 command = (f
"{self.config.externTeacher}"
666 f
" --method '{channel.mvaConfig.method}'"
667 f
" --target_variable '{channel.mvaConfig.target}'"
668 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
669 f
" --signal_class 1 --variables '{variable_str}'"
670 f
" --identifier '{channel.label}.xml'"
671 f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
672 B2INFO(f
"Used following command to invoke teacher: \n {command}")
673 job_list.append((channel.label, command))
675 p = multiprocessing.Pool(
None, maxtasksperchild=1)
676 func = functools.partial(subprocess.call, shell=
True)
677 p.map(func, [c
for _, c
in job_list])
681 for name, _
in job_list:
682 if not basf2_mva.available(name +
'.xml'):
683 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
684 self.create_fake_weightfile(name)
685 weightfiles.append(self.upload(name))
def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
    using the Summary.pickle file outputted by the FEIv3. This file must be passed by the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    # Use a context manager so the summary file handle is closed deterministically
    # (the original passed an open() handle directly to pickle.load and leaked it).
    with open(configuration.legacy, 'rb') as legacy_file:
        summary = pickle.load(legacy_file)
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                # If the legacy training does not provide this channel, fall back
                # to a fake weight file so the FEI keeps working with the rest.
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)
def get_stages_from_particles(particles: 'typing.Sequence[config.Particle]'):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    @return list of 7 lists, one per stage, containing the matching particles (in input order)
    @raise RuntimeError if a particle does not belong to any stage
    """
    # Particle names belonging to each stage; any name not listed here is unsupported.
    stage_names = [
        ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0'],
        ['pi0', 'J/psi', 'Lambda0'],
        ['K_S0', 'Sigma+'],
        ['D+', 'D0', 'D_s+', 'Lambda_c+'],
        ['D*+', 'D*0', 'D_s*+'],
        ['B0', 'B+', 'B_s0'],
        [],  # final, empty stage: everything is done
    ]
    stages = [[particle for particle in particles if particle.name in names] for names in stage_names]

    # Validate with a precomputed set: O(1) membership tests instead of rebuilding
    # a flat name list per particle, and no shadowing of the loop variable.
    known_names = {name for names in stage_names for name in names}
    for particle in particles:
        if particle.name not in known_names:
            raise RuntimeError(f"Unknown particle {particle.name}: Not implemented in FEI")

    return stages
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data,
    this function must be either called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuple with weight file on disk and identifier in database for all trained classifiers
    """
    return Teacher(particles, configuration).do_all_trainings()
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    # Store the current cache level inside the configuration that is written out.
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Rotate the existing backups (highest index = oldest); iterate from the
    # newest index backwards so no backup is overwritten before it was copied.
    for i in range(8, -1, -1):
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    # Use a context manager so the pickle file is flushed and closed deterministically
    # (the original passed an open() handle directly to pickle.dump and leaked it).
    with open('Summary.pickle', 'wb') as summary_file:
        pickle.dump((particles, configuration), summary_file)
def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
    i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see documentation for the Belle2 Condition Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True),
    to write out ROOT Histograms of many quantities for each stage,
    using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return FeiState with the created path, the reached stage and the used particle lists

    NOTE(review): this body was reconstructed from a partially garbled source;
    the cache-handling branches and the stage loop conditions should be
    confirmed against the repository.
    """
    print(r"""
    ____ _  _ _    _       ____ _  _ ____ _  _ ___    _ _  _ ___ ____ ____ ___  ____ ____ ___ ____ ___ _ ____ _  _
    |___ |  | |    |       |___ |  | |___ |\ |  |     | |\ |  |  |___ |__/ |__] |__/ |___  |  |__|  |  | |  | |\ |
    |    |__| |___ |___    |___  \/  |___ | \|  |     | | \|  |  |___ |  \ |    |  \ |___  |  |  |  |  | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache level tracks which reconstruction stages are already in place.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            # Use a context manager so the file handle is closed deterministically
            # (the original passed an open() handle directly to pickle.load).
            with open('Summary.pickle', 'rb') as summary_file:
                particles, configuration = pickle.load(summary_file)
            cache = configuration.cache
        else:
            # A fresh training starts at -1 (count MC particles first),
            # a fresh application starts directly at stage 0.
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the FEI path
    path = basf2.create_path()

    # There are in total 7 stages.
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training,
    # convert the old weight files (with hashes) to the new naming scheme.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole
    # processed data sample, to steer the amount of written-out training data.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # Load the final state particles (the user is responsible for adding
    # RootInput, Geometry, ... before this path).
    if cache <= 0:
        loader = FSPLoader(particles, configuration)
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Reconstruct the stages one after another; during training we stop as soon
    # as a stage requires classifiers that are not trained yet.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # In monitor mode also store the module statistics, which contain the
    # runtime of every module that was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # Keep track of the stage reached by this path.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    return FeiState(path, stage + 1, plists=used_lists)