14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
23 Read this file if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D and Lambda_c mesons
36 Most stages consists of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50 from basf2
import B2INFO, B2WARNING
52 import modularAnalysis
as ma
57 from fei
import config
69 import multiprocessing
72 FeiState = collections.namedtuple(
'FeiState',
'path, stage, plists')
75 class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory).
82 Secondly we can use this information for the generation of the monitoring pdfs,
83 where we calculate reconstruction efficiencies.
86 def __init__(self, particles: typing.Sequence[config.Particle]):
88 Create a new TrainingData object
89 @param particles list of config.Particle objects
92 self.particles = particles
94 self.filename =
'mcParticlesCount.root'
96 def available(self) -> bool:
98 Check if the relevant information is already available
100 return os.path.isfile(self.filename)
102 def reconstruct(self) -> pybasf2.Path:
104 Returns pybasf2.Path which counts the number of MCParticles in each event.
105 @param particles list of config.Particle objects
108 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
110 path = basf2.create_path()
111 module = basf2.register_module(
'VariablesToHistogram')
112 module.set_name(
"VariablesToHistogram_MCCount")
113 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
114 module.param(
'fileName', self.filename)
115 path.add_module(module)
118 def get_mc_counts(self):
120 Read out the number of MC particles from the file created by reconstruct
125 root_file = ROOT.TFile.Open(self.filename,
'read')
128 for key
in root_file.GetListOfKeys():
129 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
130 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
133 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
134 for bin
in range(hist.GetNbinsX()))
135 mc_counts[pdg][
'std'] = hist.GetStdDev()
136 mc_counts[pdg][
'avg'] = hist.GetMean()
137 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
138 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
141 mc_counts[0][
'sum'] = hist.GetEntries()
148 Steers the loading of FSP particles.
149 This does NOT include RootInput, Geometry or anything required before loading FSPs,
150 the user has to add this himself (because it depends on the MC campaign and if you want
151 to use Belle or Belle II).
154 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
156 Create a new FSPLoader object
157 @param particles list of config.Particle objects
158 @param config config.FeiConfiguration object
161 self.particles = particles
165 def reconstruct(self) -> pybasf2.Path:
167 Returns pybasf2.Path which loads the FSP Particles
169 path = basf2.create_path()
172 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
173 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
174 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
175 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
176 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
177 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
179 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
180 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
181 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
182 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
183 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
184 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
186 if self.config.monitor:
187 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
188 filename =
'Monitor_FSPLoader.root'
190 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
191 ma.variablesToHistogram(
'', variables=variables, filename=filename, path=path)
197 Steers the creation of the training data.
198 The training data is used to train a multivariate classifier for each channel.
199 The training of the FEI at its core is just generating this training data for each channel.
200 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
203 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
204 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
206 Create a new TrainingData object
207 @param particles list of config.Particle objects
208 @param config config.FeiConfiguration object
209 @param mc_counts containing number of MC Particles
212 self.particles = particles
216 self.mc_counts = mc_counts
218 def reconstruct(self) -> pybasf2.Path:
220 Returns pybasf2.Path which creates the training data for the given particles
222 path = basf2.create_path()
224 for particle
in self.particles:
226 nSignal = self.mc_counts[pdgcode][
'sum']
234 for channel
in particle.channels:
235 filename =
'training_input.root'
237 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
238 inverseSamplingRates = {}
241 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
242 inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
243 if nSignal > Teacher.MaximumNumberOfMVASamples:
244 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
246 spectators = [channel.mvaConfig.target]
247 if channel.mvaConfig.sPlotVariable
is not None:
248 spectators.append(channel.mvaConfig.sPlotVariable)
250 if self.config.monitor:
251 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
252 hist_variables_2d = [(x, channel.mvaConfig.target)
253 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
254 hist_filename =
'Monitor_TrainingData.root'
255 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
256 variables_2d=config.variables2binnings_2d(hist_variables_2d),
257 filename=config.removeJPsiSlash(hist_filename),
258 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
260 teacher = basf2.register_module(
'VariablesToNtuple')
261 teacher.set_name(
'VariablesToNtuple_' + channel.name)
262 teacher.param(
'fileName', filename)
263 teacher.param(
'treeName', f
'{channel.label} variables')
264 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
265 teacher.param(
'particleList', channel.name)
266 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
267 path.add_module(teacher)
271 class PreReconstruction:
273 Steers the reconstruction phase before the mva method was applied
275 - The ParticleCombination (for each particle and channel we create candidates using
276 the daughter candidates from the previous stages)
278 - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
279 but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
280 this will speed up the whole FEI by a factor 2-3)
283 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
285 Create a new PreReconstruction object
286 @param particles list of config.Particle objects
287 @param config config.FeiConfiguration object
290 self.particles = particles
294 def reconstruct(self) -> pybasf2.Path:
296 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
298 path = basf2.create_path()
300 for particle
in self.particles:
301 for channel
in particle.channels:
303 if len(channel.daughters) == 1:
304 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
305 v2EI = basf2.register_module(
'VariablesToExtraInfo')
306 v2EI.set_name(
'VariablesToExtraInfo_' + channel.name)
307 v2EI.param(
'particleList', channel.name)
308 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
310 v2EI.set_log_level(basf2.logging.log_level.ERROR)
311 path.add_module(v2EI)
313 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
314 writeOut=
True, path=path)
315 if self.config.monitor:
316 ma.matchMCTruth(channel.name, path=path)
317 bc_variable = channel.preCutConfig.bestCandidateVariable
318 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
319 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
320 (bc_variable,
'mcErrors'),
321 (bc_variable,
'mcParticleStatus')]
322 filename =
'Monitor_PreReconstruction_BeforeRanking.root'
323 ma.variablesToHistogram(channel.name,
324 variables=config.variables2binnings(hist_variables),
325 variables_2d=config.variables2binnings_2d(hist_variables_2d),
326 filename=filename, directory=f
'{channel.label}', path=path)
328 if channel.preCutConfig.bestCandidateMode ==
'lowest':
329 ma.rankByLowest(channel.name,
330 channel.preCutConfig.bestCandidateVariable,
331 channel.preCutConfig.bestCandidateCut,
334 elif channel.preCutConfig.bestCandidateMode ==
'highest':
335 ma.rankByHighest(channel.name,
336 channel.preCutConfig.bestCandidateVariable,
337 channel.preCutConfig.bestCandidateCut,
341 raise RuntimeError(
"Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
343 if self.config.monitor:
344 filename =
'Monitor_PreReconstruction_AfterRanking.root'
345 hist_variables += [
'extraInfo(preCut_rank)']
346 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
347 (
'extraInfo(preCut_rank)',
'mcErrors'),
348 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
349 ma.variablesToHistogram(channel.name,
350 variables=config.variables2binnings(hist_variables),
351 variables_2d=config.variables2binnings_2d(hist_variables_2d),
352 filename=filename, directory=f
'{channel.label}', path=path)
355 elif self.config.training:
356 ma.matchMCTruth(channel.name, path=path)
359 pvfit = basf2.register_module(
'ParticleVertexFitter')
360 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
361 pvfit.param(
'listName', channel.name)
362 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
363 pvfit.param(
'vertexFitter',
'KFit')
364 pvfit.param(
'fitType',
'vertex')
365 pvfit.set_log_level(basf2.logging.log_level.ERROR)
366 path.add_module(pvfit)
367 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
368 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
369 elif len(channel.daughters) > 1:
370 pvfit = basf2.register_module(
'ParticleVertexFitter')
371 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
372 pvfit.param(
'listName', channel.name)
373 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
374 pvfit.param(
'vertexFitter',
'KFit')
375 pvfit.param(
'fitType',
'vertex')
376 pvfit.set_log_level(basf2.logging.log_level.ERROR)
377 path.add_module(pvfit)
379 if self.config.monitor:
380 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
381 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
382 (
'chiProb',
'mcErrors'),
383 (
'chiProb',
'mcParticleStatus')]
384 filename =
'Monitor_PreReconstruction_AfterVertex.root'
385 ma.variablesToHistogram(channel.name,
386 variables=config.variables2binnings(hist_variables),
387 variables_2d=config.variables2binnings_2d(hist_variables_2d),
388 filename=filename, directory=f
'{channel.label}', path=path)
393 class PostReconstruction:
395 Steers the reconstruction phase after the mva method was applied
397 - The application of the mva method itself.
398 - Copying all channel lists in a common one for each particle defined in particles
399 - Tag unique signal candidates, to avoid double counting of channels with overlap
402 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
404 Create a new PostReconstruction object
405 @param particles list of config.Particle objects
406 @param config config.FeiConfiguration object
409 self.particles = particles
413 def get_missing_channels(self) -> typing.Sequence[str]:
415 Returns all channels for which the weightfile is missing
418 for particle
in self.particles:
419 for channel
in particle.channels:
421 weightfile = channel.label +
'.xml'
422 if not basf2_mva.available(weightfile):
423 missing += [channel.label]
426 def available(self) -> bool:
428 Check if the relevant information is already available
430 return len(self.get_missing_channels()) == 0
432 def reconstruct(self) -> pybasf2.Path:
434 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
436 path = basf2.create_path()
438 for particle
in self.particles:
439 for channel
in particle.channels:
440 expert = basf2.register_module(
'MVAExpert')
441 expert.set_name(
'MVAExpert_' + channel.name)
442 if self.config.training:
443 expert.param(
'identifier', channel.label +
'.xml')
445 expert.param(
'identifier', self.config.prefix +
'_' + channel.label)
446 expert.param(
'extraInfoName',
'SignalProbability')
447 expert.param(
'listNames', [channel.name])
449 expert.set_log_level(basf2.logging.log_level.ERROR)
450 path.add_module(expert)
452 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
453 uniqueSignal.param(
'particleList', channel.name)
454 uniqueSignal.param(
'target', channel.mvaConfig.target)
455 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
456 uniqueSignal.set_name(
'TagUniqueSignal_' + channel.name)
458 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
459 path.add_module(uniqueSignal)
461 if self.config.monitor:
462 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
463 channel.mvaConfig.target,
'extraInfo(decayModeID)']
464 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
465 (
'extraInfo(SignalProbability)',
'mcErrors'),
466 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
467 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
468 (
'extraInfo(decayModeID)',
'mcErrors'),
469 (
'extraInfo(decayModeID)',
'extraInfo(uniqueSignal)'),
470 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
471 filename =
'Monitor_PostReconstruction_AfterMVA.root'
472 ma.variablesToHistogram(channel.name,
473 variables=config.variables2binnings(hist_variables),
474 variables_2d=config.variables2binnings_2d(hist_variables_2d),
475 filename=filename, directory=f
'{channel.label}', path=path)
478 if particle.postCutConfig.value > 0.0:
479 cutstring = str(particle.postCutConfig.value) +
' < extraInfo(SignalProbability)'
481 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
482 variable=
'particleSource', writeOut=
True, path=path)
484 if self.config.monitor:
485 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
486 particle.mvaConfig.target,
'extraInfo(decayModeID)']
487 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
488 (
'extraInfo(decayModeID)',
'mcErrors'),
489 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
490 filename =
'Monitor_PostReconstruction_BeforePostCut.root'
491 ma.variablesToHistogram(
493 variables=config.variables2binnings(hist_variables),
494 variables_2d=config.variables2binnings_2d(hist_variables_2d),
495 filename=config.removeJPsiSlash(filename),
496 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
499 ma.applyCuts(particle.identifier, cutstring, path=path)
501 if self.config.monitor:
502 filename =
'Monitor_PostReconstruction_BeforeRanking.root'
503 ma.variablesToHistogram(
505 variables=config.variables2binnings(hist_variables),
506 variables_2d=config.variables2binnings_2d(hist_variables_2d),
507 filename=config.removeJPsiSlash(filename),
508 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
511 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
512 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
514 if self.config.monitor:
515 hist_variables += [
'extraInfo(postCut_rank)']
516 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
517 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
518 (
'mcErrors',
'extraInfo(postCut_rank)'),
519 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
520 filename =
'Monitor_PostReconstruction_AfterRanking.root'
521 ma.variablesToHistogram(
523 variables=config.variables2binnings(hist_variables),
524 variables_2d=config.variables2binnings_2d(hist_variables_2d),
525 filename=config.removeJPsiSlash(filename),
526 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
529 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
530 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
532 if 'B_s0' == particle.name:
534 elif 'B' in particle.name:
535 variables += [
'Mbc',
'cosThetaBetweenParticleAndNominalB']
537 filename =
'Monitor_Final.root'
538 ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
539 f
'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
545 Performs all necessary trainings for all training data files which are
546 available but where there is no weight file available yet.
547 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
548 The trainings are run in parallel using multi-threading of python.
549 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
550 * basf2_mva_teacher, the training will be done directly on the machine
551 * externClustTeacher, the training will be submitted to the batch system of KEKCC
555 MaximumNumberOfMVASamples = int(1e7)
558 MinimumNumberOfMVASamples = int(5e2)
560 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
562 Create a new Teacher object
563 @param particles list of config.Particle objects
564 @param config config.FeiConfiguration object
567 self.particles = particles
572 def create_fake_weightfile(channel: str):
574 Create a fake weight file using the trivial method, it will always return 0.0
575 @param channel for which we create a fake weight file
578 <?xml version="1.0" encoding="utf-8"?>
579 <method>Trivial</method>
580 <weightfile>{channel}.xml</weightfile>
581 <treename>tree</treename>
582 <target_variable>isSignal</target_variable>
583 <weight_variable>__weight__</weight_variable>
584 <signal_class>1</signal_class>
585 <max_events>0</max_events>
586 <number_feature_variables>1</number_feature_variables>
587 <variable0>M</variable0>
588 <number_spectator_variables>0</number_spectator_variables>
589 <number_data_files>1</number_data_files>
590 <datafile0>train.root</datafile0>
591 <Trivial_version>1</Trivial_version>
592 <Trivial_output>0</Trivial_output>
593 <signal_fraction>0.066082567</signal_fraction>
595 with open(channel +
".xml",
"w")
as f:
599 def check_if_weightfile_is_fake(filename: str):
601 Checks if the provided filename is a fake-weight file or not
602 @param filename the filename of the weight file
605 return '<method>Trivial</method>' in open(filename).readlines()[2]
606 except BaseException:
610 def upload(self, channel: str):
612 Upload the weight file into the condition database
613 @param channel whose weight file is uploaded
615 disk = channel +
'.xml'
616 dbase = self.config.prefix +
'_' + channel
617 basf2_mva.upload(disk, dbase)
620 def do_all_trainings(self):
622 Do all trainings for which we find training data
628 ROOT.PyConfig.StartGuiThread =
False
630 filename =
'training_input.root'
631 if not os.path.isfile(filename):
632 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
633 "No weight files will be provided.")
635 f = ROOT.TFile.Open(filename,
'read')
637 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
638 "No weight files will be provided.")
639 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
640 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
641 "No weight files will be provided.")
643 for particle
in self.particles:
644 for channel
in particle.channels:
645 weightfile = channel.label +
'.xml'
646 if not basf2_mva.available(weightfile):
647 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label}" in m.GetName()]
650 tree = keys[0].ReadObj()
651 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1.0')
652 nBg = tree.GetEntries(channel.mvaConfig.target +
' != 1.0')
653 if nSig < Teacher.MinimumNumberOfMVASamples:
654 B2WARNING(
"Training of MVC failed. "
655 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
656 self.create_fake_weightfile(channel.label)
657 self.upload(channel.label)
659 if nBg < Teacher.MinimumNumberOfMVASamples:
660 B2WARNING(
"Training of MVC failed. "
661 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
662 self.create_fake_weightfile(channel.label)
663 self.upload(channel.label)
665 variable_str =
"' '".join(channel.mvaConfig.variables)
667 command = (f
"{self.config.externTeacher}"
668 f
" --method '{channel.mvaConfig.method}'"
669 f
" --target_variable '{channel.mvaConfig.target}'"
670 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
671 f
" --signal_class 1 --variables '{variable_str}'"
672 f
" --identifier '{channel.label}.xml'"
673 f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
674 B2INFO(f
"Used following command to invoke teacher: \n {command}")
675 job_list.append((channel.label, command))
677 p = multiprocessing.Pool(
None, maxtasksperchild=1)
678 func = functools.partial(subprocess.call, shell=
True)
679 p.map(func, [c
for _, c
in job_list])
683 for name, _
in job_list:
684 if not basf2_mva.available(name +
'.xml'):
685 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
686 self.create_fake_weightfile(name)
687 weightfiles.append(self.upload(name))
691 def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
693 Convert an old FEI training into the new format.
694 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
695 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
696 @param particles list of config.Particle objects
697 @param config config.FeiConfiguration object
699 summary = pickle.load(open(configuration.legacy,
'rb'))
700 channel2lists = {k: v[2]
for k, v
in summary[
'channel2lists'].items()}
702 teacher = Teacher(particles, configuration)
704 for particle
in particles:
705 for channel
in particle.channels:
706 new_weightfile = configuration.prefix +
'_' + channel.label
707 old_weightfile = configuration.prefix +
'_' + channel2lists[channel.label.replace(
'Jpsi',
'J/psi')]
708 if not basf2_mva.available(new_weightfile):
709 if old_weightfile
is None or not basf2_mva.available(old_weightfile):
710 Teacher.create_fake_weightfile(channel.label)
711 teacher.upload(channel.label)
713 basf2_mva.download(old_weightfile, channel.label +
'.xml')
714 teacher.upload(channel.label)
717 def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
719 Returns the hierarchical structure of the FEI.
720 Each stage depends on the particles in the previous stage.
721 The final stage is empty (meaning everything is done, and the training is finished at this point).
722 @param particles list of config.Particle objects
725 [p
for p
in particles
if p.name
in [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'p+',
'K_L0']],
726 [p
for p
in particles
if p.name
in [
'pi0',
'J/psi',
'Lambda0']],
727 [p
for p
in particles
if p.name
in [
'K_S0',
'Sigma+']],
728 [p
for p
in particles
if p.name
in [
'D+',
'D0',
'D_s+',
'Lambda_c+']],
729 [p
for p
in particles
if p.name
in [
'D*+',
'D*0',
'D_s*+']],
730 [p
for p
in particles
if p.name
in [
'B0',
'B+',
'B_s0']],
735 if p.name
not in [p.name
for stage
in stages
for p
in stage]:
736 raise RuntimeError(f
"Unknown particle {p.name}: Not implemented in FEI")
741 def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
743 Performs the training of mva classifiers for all available training data,
744 this function must be either called by the user after each stage of the FEI during training,
745 or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
746 @param particles list of config.Particle objects
747 @param config config.FeiConfiguration object
748 @return list of tuple with weight file on disk and identifier in database for all trained classifiers
750 teacher = Teacher(particles, configuration)
751 return teacher.do_all_trainings()
754 def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
756 Creates the Summary.pickle, which is used to keep track of the stage during the training,
757 and can be used later to investigate which configuration was used exactly to create the training.
758 @param particles list of config.Particle objects
759 @param config config.FeiConfiguration object
760 @param cache current cache level
762 configuration = config.FeiConfiguration(configuration.prefix, cache,
763 configuration.monitor, configuration.legacy, configuration.externTeacher,
764 configuration.training)
766 for i
in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
767 if os.path.isfile(f
'Summary.pickle.backup_{i}'):
768 shutil.copyfile(f
'Summary.pickle.backup_{i}', f
'Summary.pickle.backup_{i + 1}')
769 if os.path.isfile(
'Summary.pickle'):
770 shutil.copyfile(
'Summary.pickle',
'Summary.pickle.backup_0')
771 pickle.dump((particles, configuration), open(
'Summary.pickle',
'wb'))
774 def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
776 The most important function of the FEI.
777 This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
778 The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
779 and the configuration (see config.py).
782 For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
783 i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
784 All weight files created during the training will be stored in your local database.
785 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
786 to the central database first (see documentation for the Belle2 Condition Database).
789 For application you call this function once, and it returns the whole path which will reconstruct B mesons
790 with an associated signal probability. You have to set configuration.training to False for application mode.
793 You can always turn on the monitoring (configuration.monitor = True),
794 to write out ROOT Histograms of many quantities for each stage,
795 using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.
798 This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
799 and the weight files will be automatically converted to the new naming scheme.
801 @param particles list of config.Particle objects
802 @param config config.FeiConfiguration object
805 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
806 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
807 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
809 Author: Thomas Keck 2014 - 2017
810 Please cite my PhD thesis
819 if configuration.cache
is None:
820 if os.path.isfile(
'Summary.pickle'):
821 print(
"Cache: Replaced particles and configuration with the ones from Summary.pickle!")
822 particles, configuration = pickle.load(open(
'Summary.pickle',
'rb'))
823 cache = configuration.cache
825 if configuration.training:
830 cache = configuration.cache
833 path = basf2.create_path()
838 stages = get_stages_from_particles(particles)
843 if configuration.legacy
is not None:
844 convert_legacy_training(particles, configuration)
852 training_data_information = TrainingDataInformation(particles)
853 if cache < 0
and configuration.training:
854 print(
"Stage 0: Run over all files to count the number of events and McParticles")
855 path.add_path(training_data_information.reconstruct())
856 if configuration.training:
857 save_summary(particles, configuration, 0)
858 return FeiState(path, 0, [])
859 elif not configuration.training
and configuration.monitor:
860 path.add_path(training_data_information.reconstruct())
866 loader = FSPLoader(particles, configuration)
868 print(
"Stage 0: Load FSP particles")
869 path.add_path(loader.reconstruct())
892 for stage, stage_particles
in enumerate(stages):
893 pre_reconstruction = PreReconstruction(stage_particles, configuration)
895 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
896 path.add_path(pre_reconstruction.reconstruct())
898 post_reconstruction = PostReconstruction(stage_particles, configuration)
899 if configuration.training
and not post_reconstruction.available():
900 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
901 mc_counts = training_data_information.get_mc_counts()
902 training_data = TrainingData(stage_particles, configuration, mc_counts)
903 path.add_path(training_data.reconstruct())
904 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
906 if cache <= stage + 1:
907 path.add_path(post_reconstruction.reconstruct())
908 used_lists += [particle.identifier
for particle
in stage_particles]
912 if configuration.monitor:
913 output = basf2.register_module(
'RootOutput')
914 output.param(
'outputFileName',
'Monitor_ModuleStatistics.root')
915 output.param(
'branchNames', [
'EventMetaData'])
916 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
917 output.param(
'ignoreCommandLineOverride',
True)
918 path.add_module(output)
922 if configuration.training
or configuration.monitor:
923 save_summary(particles, configuration, stage + 1)
926 return FeiState(path, stage + 1, plists=used_lists)