14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
23 Read this file if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D and Lambda_c mesons
36 Most stages consists of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50 from basf2
import B2INFO, B2WARNING
52 import modularAnalysis
as ma
57 from fei
import config
69 import multiprocessing
72 FeiState = collections.namedtuple(
'FeiState',
'path, stage, plists')
75 class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory).
82 Secondly we can use this information for the generation of the monitoring pdfs,
83 where we calculate reconstruction efficiencies.
86 def __init__(self, particles: typing.Sequence[config.Particle]):
88 Create a new TrainingData object
89 @param particles list of config.Particle objects
92 self.particles = particles
94 self.filename =
'mcParticlesCount.root'
96 def available(self) -> bool:
98 Check if the relevant information is already available
100 return os.path.isfile(self.filename)
102 def reconstruct(self) -> pybasf2.Path:
104 Returns pybasf2.Path which counts the number of MCParticles in each event.
105 @param particles list of config.Particle objects
108 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
110 path = basf2.create_path()
111 module = basf2.register_module(
'VariablesToHistogram')
112 module.set_name(
"VariablesToHistogram_MCCount")
113 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
114 module.param(
'fileName', self.filename)
115 path.add_module(module)
118 def get_mc_counts(self):
120 Read out the number of MC particles from the file created by reconstruct
125 root_file = ROOT.TFile.Open(self.filename,
'read')
128 for key
in root_file.GetListOfKeys():
129 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
130 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
133 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
134 for bin
in range(hist.GetNbinsX()))
135 mc_counts[pdg][
'std'] = hist.GetStdDev()
136 mc_counts[pdg][
'avg'] = hist.GetMean()
137 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
138 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
141 mc_counts[0][
'sum'] = hist.GetEntries()
148 Steers the loading of FSP particles.
149 This does NOT include RootInput, Geometry or anything required before loading FSPs,
150 the user has to add this himself (because it depends on the MC campaign and if you want
151 to use Belle or Belle II).
154 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
156 Create a new FSPLoader object
157 @param particles list of config.Particle objects
158 @param config config.FeiConfiguration object
161 self.particles = particles
165 def reconstruct(self) -> pybasf2.Path:
167 Returns pybasf2.Path which loads the FSP Particles
169 path = basf2.create_path()
172 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
173 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
174 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
175 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
176 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
177 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
179 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
180 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
181 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
182 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
183 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
184 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
186 if self.config.monitor:
187 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
188 filename =
'Monitor_FSPLoader.root'
190 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
191 ma.variablesToHistogram(
'', variables=variables, filename=filename, path=path)
197 Steers the creation of the training data.
198 The training data is used to train a multivariate classifier for each channel.
199 The training of the FEI at its core is just generating this training data for each channel.
200 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
203 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
204 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
206 Create a new TrainingData object
207 @param particles list of config.Particle objects
208 @param config config.FeiConfiguration object
209 @param mc_counts containing number of MC Particles
212 self.particles = particles
216 self.mc_counts = mc_counts
218 def reconstruct(self) -> pybasf2.Path:
220 Returns pybasf2.Path which creates the training data for the given particles
222 path = basf2.create_path()
224 for particle
in self.particles:
226 nSignal = self.mc_counts[pdgcode][
'sum']
234 for channel
in particle.channels:
235 filename =
'training_input.root'
237 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
238 inverseSamplingRates = {}
241 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
242 inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
243 if nSignal > Teacher.MaximumNumberOfMVASamples:
244 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
246 spectators = [channel.mvaConfig.target]
247 if channel.mvaConfig.sPlotVariable
is not None:
248 spectators.append(channel.mvaConfig.sPlotVariable)
250 if self.config.monitor:
251 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
252 hist_variables_2d = [(x, channel.mvaConfig.target)
253 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
254 hist_filename =
'Monitor_TrainingData.root'
255 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
256 variables_2d=config.variables2binnings_2d(hist_variables_2d),
257 filename=config.removeJPsiSlash(hist_filename),
258 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
260 teacher = basf2.register_module(
'VariablesToNtuple')
261 teacher.set_name(
'VariablesToNtuple_' + channel.name)
262 teacher.param(
'fileName', filename)
263 teacher.param(
'treeName', f
'{channel.label} variables')
264 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
265 teacher.param(
'particleList', channel.name)
266 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
267 path.add_module(teacher)
271 class PreReconstruction:
273 Steers the reconstruction phase before the mva method was applied
275 - The ParticleCombination (for each particle and channel we create candidates using
276 the daughter candidates from the previous stages)
278 - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
279 but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
280 this will speed up the whole FEI by a factor 2-3)
283 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
285 Create a new PreReconstruction object
286 @param particles list of config.Particle objects
287 @param config config.FeiConfiguration object
290 self.particles = particles
294 def reconstruct(self) -> pybasf2.Path:
296 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
298 path = basf2.create_path()
300 for particle
in self.particles:
301 for channel
in particle.channels:
303 if len(channel.daughters) == 1:
304 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
305 v2EI = basf2.register_module(
'VariablesToExtraInfo')
306 v2EI.set_name(
'VariablesToExtraInfo_' + channel.name)
307 v2EI.param(
'particleList', channel.name)
308 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
310 v2EI.set_log_level(basf2.logging.log_level.ERROR)
311 path.add_module(v2EI)
313 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
314 writeOut=
True, path=path)
315 if self.config.monitor:
316 ma.matchMCTruth(channel.name, path=path)
317 bc_variable = channel.preCutConfig.bestCandidateVariable
318 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
319 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
320 (bc_variable,
'mcErrors'),
321 (bc_variable,
'mcParticleStatus')]
322 filename =
'Monitor_PreReconstruction_BeforeRanking.root'
323 ma.variablesToHistogram(channel.name,
324 variables=config.variables2binnings(hist_variables),
325 variables_2d=config.variables2binnings_2d(hist_variables_2d),
326 filename=filename, directory=f
'{channel.label}', path=path)
328 if channel.preCutConfig.bestCandidateMode ==
'lowest':
329 ma.rankByLowest(channel.name,
330 channel.preCutConfig.bestCandidateVariable,
331 channel.preCutConfig.bestCandidateCut,
334 elif channel.preCutConfig.bestCandidateMode ==
'highest':
335 ma.rankByHighest(channel.name,
336 channel.preCutConfig.bestCandidateVariable,
337 channel.preCutConfig.bestCandidateCut,
341 raise RuntimeError(
"Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
343 if self.config.monitor:
344 filename =
'Monitor_PreReconstruction_AfterRanking.root'
345 hist_variables += [
'extraInfo(preCut_rank)']
346 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
347 (
'extraInfo(preCut_rank)',
'mcErrors'),
348 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
349 ma.variablesToHistogram(channel.name,
350 variables=config.variables2binnings(hist_variables),
351 variables_2d=config.variables2binnings_2d(hist_variables_2d),
352 filename=filename, directory=f
'{channel.label}', path=path)
355 elif self.config.training:
356 ma.matchMCTruth(channel.name, path=path)
359 pvfit = basf2.register_module(
'ParticleVertexFitter')
360 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
361 pvfit.param(
'listName', channel.name)
362 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
363 pvfit.param(
'vertexFitter',
'KFit')
364 pvfit.param(
'fitType',
'vertex')
365 pvfit.set_log_level(basf2.logging.log_level.ERROR)
366 path.add_module(pvfit)
367 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
368 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
369 elif len(channel.daughters) > 1:
370 pvfit = basf2.register_module(
'ParticleVertexFitter')
371 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
372 pvfit.param(
'listName', channel.name)
373 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
374 pvfit.param(
'vertexFitter',
'KFit')
375 if particle.name
in [
'pi0']:
376 pvfit.param(
'fitType',
'mass')
378 pvfit.param(
'fitType',
'vertex')
379 pvfit.set_log_level(basf2.logging.log_level.ERROR)
380 path.add_module(pvfit)
382 if self.config.monitor:
383 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
384 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
385 (
'chiProb',
'mcErrors'),
386 (
'chiProb',
'mcParticleStatus')]
387 filename =
'Monitor_PreReconstruction_AfterVertex.root'
388 ma.variablesToHistogram(channel.name,
389 variables=config.variables2binnings(hist_variables),
390 variables_2d=config.variables2binnings_2d(hist_variables_2d),
391 filename=filename, directory=f
'{channel.label}', path=path)
396 class PostReconstruction:
398 Steers the reconstruction phase after the mva method was applied
400 - The application of the mva method itself.
401 - Copying all channel lists in a common one for each particle defined in particles
402 - Tag unique signal candidates, to avoid double counting of channels with overlap
405 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
407 Create a new PostReconstruction object
408 @param particles list of config.Particle objects
409 @param config config.FeiConfiguration object
412 self.particles = particles
416 def get_missing_channels(self) -> typing.Sequence[str]:
418 Returns all channels for which the weightfile is missing
421 for particle
in self.particles:
422 for channel
in particle.channels:
424 weightfile = channel.label +
'.xml'
425 if not basf2_mva.available(weightfile):
426 missing += [channel.label]
429 def available(self) -> bool:
431 Check if the relevant information is already available
433 return len(self.get_missing_channels()) == 0
435 def reconstruct(self) -> pybasf2.Path:
437 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
439 path = basf2.create_path()
441 for particle
in self.particles:
442 for channel
in particle.channels:
443 expert = basf2.register_module(
'MVAExpert')
444 expert.set_name(
'MVAExpert_' + channel.name)
445 if self.config.training:
446 expert.param(
'identifier', channel.label +
'.xml')
448 expert.param(
'identifier', self.config.prefix +
'_' + channel.label)
449 expert.param(
'extraInfoName',
'SignalProbability')
450 expert.param(
'listNames', [channel.name])
452 expert.set_log_level(basf2.logging.log_level.ERROR)
453 path.add_module(expert)
455 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
456 uniqueSignal.param(
'particleList', channel.name)
457 uniqueSignal.param(
'target', channel.mvaConfig.target)
458 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
459 uniqueSignal.set_name(
'TagUniqueSignal_' + channel.name)
461 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
462 path.add_module(uniqueSignal)
464 if self.config.monitor:
465 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
466 channel.mvaConfig.target,
'extraInfo(decayModeID)']
467 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
468 (
'extraInfo(SignalProbability)',
'mcErrors'),
469 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
470 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
471 (
'extraInfo(decayModeID)',
'mcErrors'),
472 (
'extraInfo(decayModeID)',
'extraInfo(uniqueSignal)'),
473 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
474 filename =
'Monitor_PostReconstruction_AfterMVA.root'
475 ma.variablesToHistogram(channel.name,
476 variables=config.variables2binnings(hist_variables),
477 variables_2d=config.variables2binnings_2d(hist_variables_2d),
478 filename=filename, directory=f
'{channel.label}', path=path)
481 if particle.postCutConfig.value > 0.0:
482 cutstring = str(particle.postCutConfig.value) +
' < extraInfo(SignalProbability)'
484 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
485 variable=
'particleSource', writeOut=
True, path=path)
487 if self.config.monitor:
488 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
489 particle.mvaConfig.target,
'extraInfo(decayModeID)']
490 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
491 (
'extraInfo(decayModeID)',
'mcErrors'),
492 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
493 filename =
'Monitor_PostReconstruction_BeforePostCut.root'
494 ma.variablesToHistogram(
496 variables=config.variables2binnings(hist_variables),
497 variables_2d=config.variables2binnings_2d(hist_variables_2d),
498 filename=config.removeJPsiSlash(filename),
499 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
502 ma.applyCuts(particle.identifier, cutstring, path=path)
504 if self.config.monitor:
505 filename =
'Monitor_PostReconstruction_BeforeRanking.root'
506 ma.variablesToHistogram(
508 variables=config.variables2binnings(hist_variables),
509 variables_2d=config.variables2binnings_2d(hist_variables_2d),
510 filename=config.removeJPsiSlash(filename),
511 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
514 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
515 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
517 if self.config.monitor:
518 hist_variables += [
'extraInfo(postCut_rank)']
519 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
520 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
521 (
'mcErrors',
'extraInfo(postCut_rank)'),
522 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
523 filename =
'Monitor_PostReconstruction_AfterRanking.root'
524 ma.variablesToHistogram(
526 variables=config.variables2binnings(hist_variables),
527 variables_2d=config.variables2binnings_2d(hist_variables_2d),
528 filename=config.removeJPsiSlash(filename),
529 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
532 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
533 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
535 if 'B_s0' == particle.name:
537 elif 'B' in particle.name:
538 variables += [
'Mbc',
'cosThetaBetweenParticleAndNominalB']
540 filename =
'Monitor_Final.root'
541 ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
542 f
'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
548 Performs all necessary trainings for all training data files which are
549 available but where there is no weight file available yet.
550 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
551 The trainings are run in parallel using multi-threading of python.
552 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
553 * basf2_mva_teacher, the training will be done directly on the machine
554 * externClustTeacher, the training will be submitted to the batch system of KEKCC
558 MaximumNumberOfMVASamples = int(1e7)
561 MinimumNumberOfMVASamples = int(5e2)
563 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
565 Create a new Teacher object
566 @param particles list of config.Particle objects
567 @param config config.FeiConfiguration object
570 self.particles = particles
575 def create_fake_weightfile(channel: str):
577 Create a fake weight file using the trivial method, it will always return 0.0
578 @param channel for which we create a fake weight file
581 <?xml version="1.0" encoding="utf-8"?>
582 <method>Trivial</method>
583 <weightfile>{channel}.xml</weightfile>
584 <treename>tree</treename>
585 <target_variable>isSignal</target_variable>
586 <weight_variable>__weight__</weight_variable>
587 <signal_class>1</signal_class>
588 <max_events>0</max_events>
589 <number_feature_variables>1</number_feature_variables>
590 <variable0>M</variable0>
591 <number_spectator_variables>0</number_spectator_variables>
592 <number_data_files>1</number_data_files>
593 <datafile0>train.root</datafile0>
594 <Trivial_version>1</Trivial_version>
595 <Trivial_output>0</Trivial_output>
596 <signal_fraction>0.066082567</signal_fraction>
598 with open(channel +
".xml",
"w")
as f:
602 def check_if_weightfile_is_fake(filename: str):
604 Checks if the provided filename is a fake-weight file or not
605 @param filename the filename of the weight file
608 return '<method>Trivial</method>' in open(filename).readlines()[2]
609 except BaseException:
613 def upload(self, channel: str):
615 Upload the weight file into the condition database
616 @param channel whose weight file is uploaded
618 disk = channel +
'.xml'
619 dbase = self.config.prefix +
'_' + channel
620 basf2_mva.upload(disk, dbase)
623 def do_all_trainings(self):
625 Do all trainings for which we find training data
631 ROOT.PyConfig.StartGuiThread =
False
633 filename =
'training_input.root'
634 if not os.path.isfile(filename):
635 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
636 "No weight files will be provided.")
638 f = ROOT.TFile.Open(filename,
'read')
640 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
641 "No weight files will be provided.")
642 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
643 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
644 "No weight files will be provided.")
646 for particle
in self.particles:
647 for channel
in particle.channels:
648 weightfile = channel.label +
'.xml'
649 if not basf2_mva.available(weightfile):
650 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label}" in m.GetName()]
653 tree = keys[0].ReadObj()
654 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1.0')
655 nBg = tree.GetEntries(channel.mvaConfig.target +
' != 1.0')
656 if nSig < Teacher.MinimumNumberOfMVASamples:
657 B2WARNING(
"Training of MVC failed. "
658 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
659 self.create_fake_weightfile(channel.label)
660 self.upload(channel.label)
662 if nBg < Teacher.MinimumNumberOfMVASamples:
663 B2WARNING(
"Training of MVC failed. "
664 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
665 self.create_fake_weightfile(channel.label)
666 self.upload(channel.label)
668 variable_str =
"' '".join(channel.mvaConfig.variables)
670 command = (f
"{self.config.externTeacher}"
671 f
" --method '{channel.mvaConfig.method}'"
672 f
" --target_variable '{channel.mvaConfig.target}'"
673 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
674 f
" --signal_class 1 --variables '{variable_str}'"
675 f
" --identifier '{channel.label}.xml'"
676 f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
677 B2INFO(f
"Used following command to invoke teacher: \n {command}")
678 job_list.append((channel.label, command))
680 p = multiprocessing.Pool(
None, maxtasksperchild=1)
681 func = functools.partial(subprocess.call, shell=
True)
682 p.map(func, [c
for _, c
in job_list])
686 for name, _
in job_list:
687 if not basf2_mva.available(name +
'.xml'):
688 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
689 self.create_fake_weightfile(name)
690 weightfiles.append(self.upload(name))
694 def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
696 Convert an old FEI training into the new format.
697 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
698 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
699 @param particles list of config.Particle objects
700 @param config config.FeiConfiguration object
702 summary = pickle.load(open(configuration.legacy,
'rb'))
703 channel2lists = {k: v[2]
for k, v
in summary[
'channel2lists'].items()}
705 teacher = Teacher(particles, configuration)
707 for particle
in particles:
708 for channel
in particle.channels:
709 new_weightfile = configuration.prefix +
'_' + channel.label
710 old_weightfile = configuration.prefix +
'_' + channel2lists[channel.label.replace(
'Jpsi',
'J/psi')]
711 if not basf2_mva.available(new_weightfile):
712 if old_weightfile
is None or not basf2_mva.available(old_weightfile):
713 Teacher.create_fake_weightfile(channel.label)
714 teacher.upload(channel.label)
716 basf2_mva.download(old_weightfile, channel.label +
'.xml')
717 teacher.upload(channel.label)
720 def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
722 Returns the hierarchical structure of the FEI.
723 Each stage depends on the particles in the previous stage.
724 The final stage is empty (meaning everything is done, and the training is finished at this point).
725 @param particles list of config.Particle objects
728 [p
for p
in particles
if p.name
in [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'p+',
'K_L0']],
729 [p
for p
in particles
if p.name
in [
'pi0',
'J/psi',
'Lambda0']],
730 [p
for p
in particles
if p.name
in [
'K_S0',
'Sigma+']],
731 [p
for p
in particles
if p.name
in [
'D+',
'D0',
'D_s+',
'Lambda_c+']],
732 [p
for p
in particles
if p.name
in [
'D*+',
'D*0',
'D_s*+']],
733 [p
for p
in particles
if p.name
in [
'B0',
'B+',
'B_s0']],
738 if p.name
not in [p.name
for stage
in stages
for p
in stage]:
739 raise RuntimeError(f
"Unknown particle {p.name}: Not implemented in FEI")
744 def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
746 Performs the training of mva classifiers for all available training data,
747 this function must be either called by the user after each stage of the FEI during training,
748 or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
749 @param particles list of config.Particle objects
750 @param config config.FeiConfiguration object
751 @return list of tuple with weight file on disk and identifier in database for all trained classifiers
753 teacher = Teacher(particles, configuration)
754 return teacher.do_all_trainings()
757 def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
759 Creates the Summary.pickle, which is used to keep track of the stage during the training,
760 and can be used later to investigate which configuration was used exactly to create the training.
761 @param particles list of config.Particle objects
762 @param config config.FeiConfiguration object
763 @param cache current cache level
765 configuration = config.FeiConfiguration(configuration.prefix, cache,
766 configuration.monitor, configuration.legacy, configuration.externTeacher,
767 configuration.training)
769 for i
in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
770 if os.path.isfile(f
'Summary.pickle.backup_{i}'):
771 shutil.copyfile(f
'Summary.pickle.backup_{i}', f
'Summary.pickle.backup_{i + 1}')
772 if os.path.isfile(
'Summary.pickle'):
773 shutil.copyfile(
'Summary.pickle',
'Summary.pickle.backup_0')
774 pickle.dump((particles, configuration), open(
'Summary.pickle',
'wb'))
777 def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
779 The most important function of the FEI.
780 This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
781 The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
782 and the configuration (see config.py).
785 For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
786 i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
787 All weight files created during the training will be stored in your local database.
788 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
789 to the central database first (see documentation for the Belle2 Condition Database).
792 For application you call this function once, and it returns the whole path which will reconstruct B mesons
793 with an associated signal probability. You have to set configuration.training to False for application mode.
796 You can always turn on the monitoring (configuration.monitor = True),
797 to write out ROOT Histograms of many quantities for each stage,
798 using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.
801 This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
802 and the weight files will be automatically converted to the new naming scheme.
804 @param particles list of config.Particle objects
805 @param config config.FeiConfiguration object
808 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
809 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
810 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
812 Author: Thomas Keck 2014 - 2017
813 Please cite my PhD thesis
822 if configuration.cache
is None:
823 if os.path.isfile(
'Summary.pickle'):
824 print(
"Cache: Replaced particles and configuration with the ones from Summary.pickle!")
825 particles, configuration = pickle.load(open(
'Summary.pickle',
'rb'))
826 cache = configuration.cache
828 if configuration.training:
833 cache = configuration.cache
836 path = basf2.create_path()
841 stages = get_stages_from_particles(particles)
846 if configuration.legacy
is not None:
847 convert_legacy_training(particles, configuration)
855 training_data_information = TrainingDataInformation(particles)
856 if cache < 0
and configuration.training:
857 print(
"Stage 0: Run over all files to count the number of events and McParticles")
858 path.add_path(training_data_information.reconstruct())
859 if configuration.training:
860 save_summary(particles, configuration, 0)
861 return FeiState(path, 0, [])
862 elif not configuration.training
and configuration.monitor:
863 path.add_path(training_data_information.reconstruct())
869 loader = FSPLoader(particles, configuration)
871 print(
"Stage 0: Load FSP particles")
872 path.add_path(loader.reconstruct())
895 for stage, stage_particles
in enumerate(stages):
896 pre_reconstruction = PreReconstruction(stage_particles, configuration)
898 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
899 path.add_path(pre_reconstruction.reconstruct())
901 post_reconstruction = PostReconstruction(stage_particles, configuration)
902 if configuration.training
and not post_reconstruction.available():
903 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
904 mc_counts = training_data_information.get_mc_counts()
905 training_data = TrainingData(stage_particles, configuration, mc_counts)
906 path.add_path(training_data.reconstruct())
907 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
909 if cache <= stage + 1:
910 path.add_path(post_reconstruction.reconstruct())
911 used_lists += [particle.identifier
for particle
in stage_particles]
915 if configuration.monitor:
916 output = basf2.register_module(
'RootOutput')
917 output.param(
'outputFileName',
'Monitor_ModuleStatistics.root')
918 output.param(
'branchNames', [
'EventMetaData'])
919 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
920 output.param(
'ignoreCommandLineOverride',
True)
921 path.add_module(output)
925 if configuration.training
or configuration.monitor:
926 save_summary(particles, configuration, stage + 1)
929 return FeiState(path, stage + 1, plists=used_lists)