14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability
for each candidate.
18 - It can be used
for hadronic
and semileptonic tagging.
19 - The algorithm has to be trained on MC,
and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored
in the Belle II Condition database
23 Read this file
if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D
and Lambda_c mesons
36 Most stages consists of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50from basf2 import B2INFO, B2WARNING
52import modularAnalysis as ma
55# Should come after basf2 import
60# Standard python modules
71# Simple object containing the output of fei
72FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
75class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out
as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won
't fit into memory). Secondly we can use this information for the generation of the monitoring pdfs,
82 where we calculate reconstruction efficiencies.
85 def __init__(self, particles: typing.Sequence[config.Particle]):
87 Create a new TrainingData object
88 @param particles list of config.Particle objects
91 self.particles = particles
93 self.filename = 'mcParticlesCount.root'
95 def available(self) -> bool:
97 Check if the relevant information
is already available
99 return os.path.isfile(self.filename)
101 def reconstruct(self) -> pybasf2.Path:
103 Returns pybasf2.Path which counts the number of MCParticles in each event.
104 @param particles list of config.Particle objects
107 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
109 path = basf2.create_path()
110 module = basf2.register_module(
'VariablesToHistogram')
111 module.set_name(
"VariablesToHistogram_MCCount")
112 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
113 module.param(
'fileName', self.filename)
114 path.add_module(module)
117 def get_mc_counts(self):
119 Read out the number of MC particles from the file created by reconstruct
124 root_file = ROOT.TFile.Open(self.filename,
'read')
127 for key
in root_file.GetListOfKeys():
128 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
129 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
132 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
133 for bin
in range(hist.GetNbinsX()))
134 mc_counts[pdg][
'std'] = hist.GetStdDev()
135 mc_counts[pdg][
'avg'] = hist.GetMean()
136 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
137 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
140 mc_counts[0][
'sum'] = hist.GetEntries()
147 Steers the loading of FSP particles.
148 This does NOT include RootInput, Geometry or anything required before loading FSPs,
149 the user has to add this himself (because it depends on the MC campaign
and if you want
150 to use Belle
or Belle II).
153 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
155 Create a new FSPLoader object
156 @param particles list of config.Particle objects
157 @param config config.FeiConfiguration object
160 self.particles = particles
164 def reconstruct(self) -> pybasf2.Path:
166 Returns pybasf2.Path which loads the FSP Particles
168 path = basf2.create_path()
171 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
172 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
173 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
174 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
175 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
176 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
178 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
179 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
180 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
181 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
182 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
183 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
185 if self.config.monitor:
186 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
187 filename =
'Monitor_FSPLoader.root'
189 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
190 ma.variablesToHistogram(
'', variables=variables, filename=filename, path=path)
196 Steers the creation of the training data.
197 The training data is used to train a multivariate classifier
for each channel.
198 The training of the FEI at its core
is just generating this training data
for each channel.
199 After we created the training data
for a stage, we have to train the classifiers (see Teacher
class further down).
202 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
203 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
205 Create a new TrainingData object
206 @param particles list of config.Particle objects
207 @param config config.FeiConfiguration object
208 @param mc_counts containing number of MC Particles
211 self.particles = particles
215 self.mc_counts = mc_counts
217 def reconstruct(self) -> pybasf2.Path:
219 Returns pybasf2.Path which creates the training data for the given particles
221 path = basf2.create_path()
223 for particle
in self.particles:
225 nSignal = self.mc_counts[pdgcode][
'sum']
233 for channel
in particle.channels:
234 filename =
'training_input.root'
236 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
237 inverseSamplingRates = {}
240 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
241 inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
242 if nSignal > Teacher.MaximumNumberOfMVASamples:
243 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
245 spectators = [channel.mvaConfig.target]
246 if channel.mvaConfig.sPlotVariable
is not None:
247 spectators.append(channel.mvaConfig.sPlotVariable)
249 if self.config.monitor:
250 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
251 hist_variables_2d = [(x, channel.mvaConfig.target)
252 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
253 hist_filename =
'Monitor_TrainingData.root'
254 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
255 variables_2d=config.variables2binnings_2d(hist_variables_2d),
256 filename=config.removeJPsiSlash(hist_filename),
257 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
259 teacher = basf2.register_module(
'VariablesToNtuple')
260 teacher.set_name(
'VariablesToNtuple_' + channel.name)
261 teacher.param(
'fileName', filename)
262 teacher.param(
'treeName', f
'{channel.label} variables')
263 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
264 teacher.param(
'particleList', channel.name)
265 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
266 path.add_module(teacher)
270class PreReconstruction:
272 Steers the reconstruction phase before the mva method was applied
274 - The ParticleCombination (for each particle
and channel we create candidates using
275 the daughter candidates
from the previous stages)
277 - Vertex Fitting (this
is the slowest part of the whole FEI, KFit
is used by default,
278 but you can use fastFit
as a drop-
in replacement https://github.com/thomaskeck/FastFit/,
279 this will speed up the whole FEI by a factor 2-3)
282 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
284 Create a new PreReconstruction object
285 @param particles list of config.Particle objects
286 @param config config.FeiConfiguration object
289 self.particles = particles
293 def reconstruct(self) -> pybasf2.Path:
295 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting
if necessary
297 path = basf2.create_path()
299 for particle
in self.particles:
300 for channel
in particle.channels:
302 if len(channel.daughters) == 1:
303 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
304 v2EI = basf2.register_module(
'VariablesToExtraInfo')
305 v2EI.set_name(
'VariablesToExtraInfo_' + channel.name)
306 v2EI.param(
'particleList', channel.name)
307 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
309 v2EI.set_log_level(basf2.logging.log_level.ERROR)
310 path.add_module(v2EI)
312 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
313 writeOut=
True, path=path)
314 if self.config.monitor:
315 ma.matchMCTruth(channel.name, path=path)
316 bc_variable = channel.preCutConfig.bestCandidateVariable
317 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
318 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
319 (bc_variable,
'mcErrors'),
320 (bc_variable,
'mcParticleStatus')]
321 filename =
'Monitor_PreReconstruction_BeforeRanking.root'
322 ma.variablesToHistogram(channel.name,
323 variables=config.variables2binnings(hist_variables),
324 variables_2d=config.variables2binnings_2d(hist_variables_2d),
325 filename=filename, directory=f
'{channel.label}', path=path)
327 if channel.preCutConfig.bestCandidateMode ==
'lowest':
328 ma.rankByLowest(channel.name,
329 channel.preCutConfig.bestCandidateVariable,
330 channel.preCutConfig.bestCandidateCut,
333 elif channel.preCutConfig.bestCandidateMode ==
'highest':
334 ma.rankByHighest(channel.name,
335 channel.preCutConfig.bestCandidateVariable,
336 channel.preCutConfig.bestCandidateCut,
340 raise RuntimeError(
"Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
342 if self.config.monitor:
343 filename =
'Monitor_PreReconstruction_AfterRanking.root'
344 hist_variables += [
'extraInfo(preCut_rank)']
345 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
346 (
'extraInfo(preCut_rank)',
'mcErrors'),
347 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
348 ma.variablesToHistogram(channel.name,
349 variables=config.variables2binnings(hist_variables),
350 variables_2d=config.variables2binnings_2d(hist_variables_2d),
351 filename=filename, directory=f
'{channel.label}', path=path)
354 elif self.config.training:
355 ma.matchMCTruth(channel.name, path=path)
358 pvfit = basf2.register_module(
'ParticleVertexFitter')
359 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
360 pvfit.param(
'listName', channel.name)
361 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
362 pvfit.param(
'vertexFitter',
'KFit')
363 pvfit.param(
'fitType',
'vertex')
364 pvfit.set_log_level(basf2.logging.log_level.ERROR)
365 path.add_module(pvfit)
366 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
367 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
368 elif len(channel.daughters) > 1:
369 pvfit = basf2.register_module(
'ParticleVertexFitter')
370 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
371 pvfit.param(
'listName', channel.name)
372 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
373 pvfit.param(
'vertexFitter',
'KFit')
374 if particle.name
in [
'pi0']:
375 pvfit.param(
'fitType',
'mass')
377 pvfit.param(
'fitType',
'vertex')
378 pvfit.set_log_level(basf2.logging.log_level.ERROR)
379 path.add_module(pvfit)
381 if self.config.monitor:
382 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
383 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
384 (
'chiProb',
'mcErrors'),
385 (
'chiProb',
'mcParticleStatus')]
386 filename =
'Monitor_PreReconstruction_AfterVertex.root'
387 ma.variablesToHistogram(channel.name,
388 variables=config.variables2binnings(hist_variables),
389 variables_2d=config.variables2binnings_2d(hist_variables_2d),
390 filename=filename, directory=f
'{channel.label}', path=path)
395class PostReconstruction:
397 Steers the reconstruction phase after the mva method was applied
399 - The application of the mva method itself.
400 - Copying all channel lists in a common one
for each particle defined
in particles
401 - Tag unique signal candidates, to avoid double counting of channels
with overlap
404 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
406 Create a new PostReconstruction object
407 @param particles list of config.Particle objects
408 @param config config.FeiConfiguration object
411 self.particles = particles
415 def get_missing_channels(self) -> typing.Sequence[str]:
417 Returns all channels for which the weightfile
is missing
420 for particle
in self.particles:
421 for channel
in particle.channels:
423 weightfile = channel.label +
'.xml'
424 if not basf2_mva.available(weightfile):
425 missing += [channel.label]
428 def available(self) -> bool:
430 Check if the relevant information
is already available
432 return len(self.get_missing_channels()) == 0
434 def reconstruct(self) -> pybasf2.Path:
436 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting
if necessary
438 path = basf2.create_path()
440 for particle
in self.particles:
441 for channel
in particle.channels:
442 expert = basf2.register_module(
'MVAExpert')
443 expert.set_name(
'MVAExpert_' + channel.name)
444 if self.config.training:
445 expert.param(
'identifier', channel.label +
'.xml')
447 expert.param(
'identifier', self.config.prefix +
'_' + channel.label)
448 expert.param(
'extraInfoName',
'SignalProbability')
449 expert.param(
'listNames', [channel.name])
451 expert.set_log_level(basf2.logging.log_level.ERROR)
452 path.add_module(expert)
454 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
455 uniqueSignal.param(
'particleList', channel.name)
456 uniqueSignal.param(
'target', channel.mvaConfig.target)
457 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
458 uniqueSignal.set_name(
'TagUniqueSignal_' + channel.name)
460 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
461 path.add_module(uniqueSignal)
463 if self.config.monitor:
464 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
465 channel.mvaConfig.target,
'extraInfo(decayModeID)']
466 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
467 (
'extraInfo(SignalProbability)',
'mcErrors'),
468 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
469 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
470 (
'extraInfo(decayModeID)',
'mcErrors'),
471 (
'extraInfo(decayModeID)',
'extraInfo(uniqueSignal)'),
472 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
473 filename =
'Monitor_PostReconstruction_AfterMVA.root'
474 ma.variablesToHistogram(channel.name,
475 variables=config.variables2binnings(hist_variables),
476 variables_2d=config.variables2binnings_2d(hist_variables_2d),
477 filename=filename, directory=f
'{channel.label}', path=path)
480 if particle.postCutConfig.value > 0.0:
481 cutstring = str(particle.postCutConfig.value) +
' < extraInfo(SignalProbability)'
483 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
484 variable=
'particleSource', writeOut=
True, path=path)
486 if self.config.monitor:
487 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
488 particle.mvaConfig.target,
'extraInfo(decayModeID)']
489 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
490 (
'extraInfo(decayModeID)',
'mcErrors'),
491 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
492 filename =
'Monitor_PostReconstruction_BeforePostCut.root'
493 ma.variablesToHistogram(
495 variables=config.variables2binnings(hist_variables),
496 variables_2d=config.variables2binnings_2d(hist_variables_2d),
497 filename=config.removeJPsiSlash(filename),
498 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
501 ma.applyCuts(particle.identifier, cutstring, path=path)
503 if self.config.monitor:
504 filename =
'Monitor_PostReconstruction_BeforeRanking.root'
505 ma.variablesToHistogram(
507 variables=config.variables2binnings(hist_variables),
508 variables_2d=config.variables2binnings_2d(hist_variables_2d),
509 filename=config.removeJPsiSlash(filename),
510 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
513 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
514 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
516 if self.config.monitor:
517 hist_variables += [
'extraInfo(postCut_rank)']
518 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
519 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
520 (
'mcErrors',
'extraInfo(postCut_rank)'),
521 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
522 filename =
'Monitor_PostReconstruction_AfterRanking.root'
523 ma.variablesToHistogram(
525 variables=config.variables2binnings(hist_variables),
526 variables_2d=config.variables2binnings_2d(hist_variables_2d),
527 filename=config.removeJPsiSlash(filename),
528 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
531 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
532 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
534 if 'B_s0' == particle.name:
536 elif 'B' in particle.name:
537 variables += [
'Mbc',
'cosThetaBetweenParticleAndNominalB']
539 filename =
'Monitor_Final.root'
540 ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
541 f
'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
547 Performs all necessary trainings for all training data files which are
548 available but where there
is no weight file available yet.
549 This
class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
550 The trainings are run
in parallel using multi-threading of python.
551 Each training
is done by a subprocess call, the training command (passed by config.externTeacher) can be either
552 * basf2_mva_teacher, the training will be done directly on the machine
553 * externClustTeacher, the training will be submitted to the batch system of KEKCC
557 MaximumNumberOfMVASamples = int(1e7)
560 MinimumNumberOfMVASamples = int(5e2)
562 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
564 Create a new Teacher object
565 @param particles list of config.Particle objects
566 @param config config.FeiConfiguration object
569 self.particles = particles
574 def create_fake_weightfile(channel: str):
576 Create a fake weight file using the trivial method, it will always return 0.0
577 @param channel
for which we create a fake weight file
580 <?xml version="1.0" encoding=
"utf-8"?>
581 <method>Trivial</method>
582 <weightfile>{channel}.xml</weightfile>
583 <treename>tree</treename>
584 <target_variable>isSignal</target_variable>
585 <weight_variable>__weight__</weight_variable>
586 <signal_class>1</signal_class>
587 <max_events>0</max_events>
588 <number_feature_variables>1</number_feature_variables>
589 <variable0>M</variable0>
590 <number_spectator_variables>0</number_spectator_variables>
591 <number_data_files>1</number_data_files>
592 <datafile0>train.root</datafile0>
593 <Trivial_version>1</Trivial_version>
594 <Trivial_output>0</Trivial_output>
595 <signal_fraction>0.066082567</signal_fraction>
597 with open(channel +
".xml",
"w")
as f:
601 def check_if_weightfile_is_fake(filename: str):
603 Checks if the provided filename
is a fake-weight file
or not
604 @param filename the filename of the weight file
607 return '<method>Trivial</method>' in open(filename).readlines()[2]
608 except BaseException:
612 def upload(self, channel: str):
614 Upload the weight file into the condition database
615 @param channel whose weight file
is uploaded
617 disk = channel + '.xml'
618 dbase = self.config.prefix +
'_' + channel
619 basf2_mva.upload(disk, dbase)
622 def do_all_trainings(self):
624 Do all trainings for which we find training data
630 ROOT.PyConfig.StartGuiThread =
False
632 filename =
'training_input.root'
633 if not os.path.isfile(filename):
634 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
635 "No weight files will be provided.")
637 f = ROOT.TFile.Open(filename,
'read')
639 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
640 "No weight files will be provided.")
641 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
642 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
643 "No weight files will be provided.")
645 for particle
in self.particles:
646 for channel
in particle.channels:
647 weightfile = channel.label +
'.xml'
648 if not basf2_mva.available(weightfile):
649 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label}" in m.GetName()]
652 tree = keys[0].ReadObj()
653 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1.0')
654 nBg = tree.GetEntries(channel.mvaConfig.target +
' != 1.0')
655 if nSig < Teacher.MinimumNumberOfMVASamples:
656 B2WARNING(
"Training of MVC failed. "
657 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
658 self.create_fake_weightfile(channel.label)
659 self.upload(channel.label)
661 if nBg < Teacher.MinimumNumberOfMVASamples:
662 B2WARNING(
"Training of MVC failed. "
663 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
664 self.create_fake_weightfile(channel.label)
665 self.upload(channel.label)
667 variable_str =
"' '".join(channel.mvaConfig.variables)
669 command = (f
"{self.config.externTeacher}"
670 f
" --method '{channel.mvaConfig.method}'"
671 f
" --target_variable '{channel.mvaConfig.target}'"
672 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
673 f
" --signal_class 1 --variables '{variable_str}'"
674 f
" --identifier '{channel.label}.xml'"
675 f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
676 B2INFO(f
"Used following command to invoke teacher: \n {command}")
677 job_list.append((channel.label, command))
679 p = multiprocessing.Pool(
None, maxtasksperchild=1)
680 func = functools.partial(subprocess.call, shell=
True)
681 p.map(func, [c
for _, c
in job_list])
685 for name, _
in job_list:
686 if not basf2_mva.available(name +
'.xml'):
687 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
688 self.create_fake_weightfile(name)
689 weightfiles.append(self.upload(name))
693def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
695 Convert an old FEI training into the new format.
696 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
697 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
698 @param particles list of config.Particle objects
699 @param config config.FeiConfiguration object
701 summary = pickle.load(open(configuration.legacy, 'rb'))
702 channel2lists = {k: v[2]
for k, v
in summary[
'channel2lists'].items()}
704 teacher = Teacher(particles, configuration)
706 for particle
in particles:
707 for channel
in particle.channels:
708 new_weightfile = configuration.prefix +
'_' + channel.label
709 old_weightfile = configuration.prefix +
'_' + channel2lists[channel.label.replace(
'Jpsi',
'J/psi')]
710 if not basf2_mva.available(new_weightfile):
711 if old_weightfile
is None or not basf2_mva.available(old_weightfile):
712 Teacher.create_fake_weightfile(channel.label)
713 teacher.upload(channel.label)
715 basf2_mva.download(old_weightfile, channel.label +
'.xml')
716 teacher.upload(channel.label)
719def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
721 Returns the hierarchical structure of the FEI.
722 Each stage depends on the particles in the previous stage.
723 The final stage
is empty (meaning everything
is done,
and the training
is finished at this point).
724 @param particles list of config.Particle objects
727 [p for p
in particles
if p.name
in [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'p+',
'K_L0']],
728 [p
for p
in particles
if p.name
in [
'pi0',
'J/psi',
'Lambda0']],
729 [p
for p
in particles
if p.name
in [
'K_S0',
'Sigma+']],
730 [p
for p
in particles
if p.name
in [
'D+',
'D0',
'D_s+',
'Lambda_c+']],
731 [p
for p
in particles
if p.name
in [
'D*+',
'D*0',
'D_s*+']],
732 [p
for p
in particles
if p.name
in [
'B0',
'B+',
'B_s0']],
737 if p.name
not in [p.name
for stage
in stages
for p
in stage]:
738 raise RuntimeError(f
"Unknown particle {p.name}: Not implemented in FEI")
743def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
745 Performs the training of mva classifiers for all available training data,
746 this function must be either called by the user after each stage of the FEI during training,
747 or (more likely)
is called by the distributed.py script after merging the outputs of all jobs,
748 @param particles list of config.Particle objects
749 @param config config.FeiConfiguration object
750 @return list of tuple
with weight file on disk
and identifier
in database
for all trained classifiers
752 teacher = Teacher(particles, configuration)
753 return teacher.do_all_trainings()
756def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
758 Creates the Summary.pickle, which is used to keep track of the stage during the training,
759 and can be used later to investigate which configuration was used exactly to create the training.
760 @param particles list of config.Particle objects
761 @param config config.FeiConfiguration object
762 @param cache current cache level
764 configuration = config.FeiConfiguration(configuration.prefix, cache,
765 configuration.monitor, configuration.legacy, configuration.externTeacher,
766 configuration.training)
768 for i
in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
769 if os.path.isfile(f
'Summary.pickle.backup_{i}'):
770 shutil.copyfile(f
'Summary.pickle.backup_{i}', f
'Summary.pickle.backup_{i + 1}')
771 if os.path.isfile(
'Summary.pickle'):
772 shutil.copyfile(
'Summary.pickle',
'Summary.pickle.backup_0')
773 pickle.dump((particles, configuration), open(
'Summary.pickle',
'wb'))
776def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
778 The most important function of the FEI.
779 This creates the FEI path for training/fitting (both terms are equal),
and application/inference (both terms are equal).
780 The whole FEI
is defined by the particles which are reconstructed (see default_channels.py)
781 and the configuration (see config.py).
784 For training this function
is called multiple times, each time the FEI reconstructs one more stage
in the hierarchical structure
785 i.e. we start
with FSP, pi0, KS_0, D, D*,
and with B mesons. You have to set configuration.training to
True for training mode.
786 All weight files created during the training will be stored
in your local database.
787 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
788 to the central database first (see documentation
for the Belle2 Condition Database).
791 For application you call this function once,
and it returns the whole path which will reconstruct B mesons
792 with an associated signal probability. You have to set configuration.training to
False for application mode.
795 You can always turn on the monitoring (configuration.monitor =
True),
796 to write out ROOT Histograms of many quantities
for each stage,
797 using these histograms you can use the printReporting.py
or latexReporting.py scripts to automatically create pdf files.
800 This function can also use old FEI trainings (version 3), just
pass the Summary.pickle file of the old training,
801 and the weight files will be automatically converted to the new naming scheme.
803 @param particles list of config.Particle objects
804 @param config config.FeiConfiguration object
807 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
808 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
809 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
811 Author: Thomas Keck 2014 - 2017
812 Please cite my PhD thesis
821 if configuration.cache
is None:
822 if os.path.isfile(
'Summary.pickle'):
823 print(
"Cache: Replaced particles and configuration with the ones from Summary.pickle!")
824 particles, configuration = pickle.load(open(
'Summary.pickle',
'rb'))
825 cache = configuration.cache
827 if configuration.training:
832 cache = configuration.cache
835 path = basf2.create_path()
840 stages = get_stages_from_particles(particles)
845 if configuration.legacy
is not None:
846 convert_legacy_training(particles, configuration)
854 training_data_information = TrainingDataInformation(particles)
855 if cache < 0
and configuration.training:
856 print(
"Stage 0: Run over all files to count the number of events and McParticles")
857 path.add_path(training_data_information.reconstruct())
858 if configuration.training:
859 save_summary(particles, configuration, 0)
860 return FeiState(path, 0, [])
861 elif not configuration.training
and configuration.monitor:
862 path.add_path(training_data_information.reconstruct())
868 loader = FSPLoader(particles, configuration)
870 print(
"Stage 0: Load FSP particles")
871 path.add_path(loader.reconstruct())
894 for stage, stage_particles
in enumerate(stages):
895 pre_reconstruction = PreReconstruction(stage_particles, configuration)
897 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
898 path.add_path(pre_reconstruction.reconstruct())
900 post_reconstruction = PostReconstruction(stage_particles, configuration)
901 if configuration.training
and not post_reconstruction.available():
902 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
903 mc_counts = training_data_information.get_mc_counts()
904 training_data = TrainingData(stage_particles, configuration, mc_counts)
905 path.add_path(training_data.reconstruct())
906 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
908 if cache <= stage + 1:
909 path.add_path(post_reconstruction.reconstruct())
910 used_lists += [particle.identifier
for particle
in stage_particles]
914 if configuration.monitor:
915 output = basf2.register_module(
'RootOutput')
916 output.param(
'outputFileName',
'Monitor_ModuleStatistics.root')
917 output.param(
'branchNames', [
'EventMetaData'])
918 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
919 output.param(
'ignoreCommandLineOverride',
True)
920 path.add_module(output)
924 if configuration.training
or configuration.monitor:
925 save_summary(particles, configuration, stage + 1)
928 return FeiState(path, stage + 1, plists=used_lists)