14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability
for each candidate.
18 - It can be used
for hadronic
and semileptonic tagging.
19 - The algorithm has to be trained on MC,
and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored
in the Belle II Condition database
23 Read this file
if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D
and Lambda_c mesons
36 Most stages consist of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase.
44 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50from basf2 import B2INFO, B2WARNING
52import modularAnalysis as ma
55# Should come after basf2 import
60# Standard python modules
71# Simple object containing the output of fei
72FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
75class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 We can use these numbers to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory). Secondly, we can use this information for the generation of the monitoring pdfs,
82 where we calculate reconstruction efficiencies.
85 def __init__(self, particles: typing.Sequence[config.Particle]):
87 Create a new TrainingDataInformation object
88 @param particles list of config.Particle objects
91 self.particles = particles
93 self.filename = 'mcParticlesCount.root'
95 def available(self) -> bool:
97 Check if the relevant information
is already available
99 return os.path.isfile(self.filename)
101 def reconstruct(self) -> pybasf2.Path:
103 Returns pybasf2.Path which counts the number of MCParticles in each event.
104 The particles are taken from self.particles, this method takes no arguments.
107 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
109 path = basf2.create_path()
110 module = basf2.register_module(
'VariablesToHistogram')
111 module.set_name(
"VariablesToHistogram_MCCount")
112 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
113 module.param(
'fileName', self.filename)
114 module.param(
'ignoreCommandLineOverride',
True)
115 path.add_module(module)
118 def get_mc_counts(self):
120 Read out the number of MC particles from the file created by reconstruct
125 root_file = ROOT.TFile.Open(self.filename,
'read')
128 for key
in root_file.GetListOfKeys():
129 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
130 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
133 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
134 for bin
in range(hist.GetNbinsX()))
135 mc_counts[pdg][
'std'] = hist.GetStdDev()
136 mc_counts[pdg][
'avg'] = hist.GetMean()
137 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
138 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
141 mc_counts[0][
'sum'] = hist.GetEntries()
148 Steers the loading of FSP particles.
149 This does NOT include RootInput, Geometry or anything required before loading FSPs,
150 the user has to add these themselves (because it depends on the MC campaign and on whether you want
151 to use Belle or Belle II).
154 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
156 Create a new FSPLoader object
157 @param particles list of config.Particle objects
158 @param config config.FeiConfiguration object
161 self.particles = particles
165 def reconstruct(self) -> pybasf2.Path:
167 Returns pybasf2.Path which loads the FSP Particles
169 path = basf2.create_path()
172 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
173 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
174 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
175 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
176 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
177 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
179 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
180 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
181 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
182 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
183 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
184 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
186 if self.config.monitor:
187 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
188 filename =
'Monitor_FSPLoader.root'
190 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
191 ma.variablesToHistogram(
'', variables=variables, filename=filename, ignoreCommandLineOverride=
True, path=path)
197 Steers the creation of the training data.
198 The training data is used to train a multivariate classifier
for each channel.
199 The training of the FEI at its core
is just generating this training data
for each channel.
200 After we created the training data
for a stage, we have to train the classifiers (see Teacher
class further down).
203 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
204 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
206 Create a new TrainingData object
207 @param particles list of config.Particle objects
208 @param config config.FeiConfiguration object
209 @param mc_counts containing number of MC Particles
212 self.particles = particles
216 self.mc_counts = mc_counts
218 def reconstruct(self) -> pybasf2.Path:
220 Returns pybasf2.Path which creates the training data for the given particles
222 path = basf2.create_path()
224 for particle
in self.particles:
226 nSignal = self.mc_counts[pdgcode][
'sum']
234 for channel
in particle.channels:
235 filename =
'training_input.root'
237 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
238 inverseSamplingRates = {}
241 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
242 inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
243 if nSignal > Teacher.MaximumNumberOfMVASamples:
244 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
246 spectators = [channel.mvaConfig.target]
247 if channel.mvaConfig.sPlotVariable
is not None:
248 spectators.append(channel.mvaConfig.sPlotVariable)
250 if self.config.monitor:
251 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
252 hist_variables_2d = [(x, channel.mvaConfig.target)
253 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
254 hist_filename =
'Monitor_TrainingData.root'
255 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
256 variables_2d=config.variables2binnings_2d(hist_variables_2d),
257 filename=config.removeJPsiSlash(hist_filename),
258 ignoreCommandLineOverride=
True,
259 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
261 teacher = basf2.register_module(
'VariablesToNtuple')
262 teacher.set_name(
'VariablesToNtuple_' + channel.name)
263 teacher.param(
'fileName', filename)
264 teacher.param(
'treeName', f
'{channel.label} variables')
265 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
266 teacher.param(
'particleList', channel.name)
267 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
268 teacher.param(
'ignoreCommandLineOverride',
True)
269 path.add_module(teacher)
273class PreReconstruction:
275 Steers the reconstruction phase before the mva method was applied
277 - The ParticleCombination (for each particle
and channel we create candidates using
278 the daughter candidates
from the previous stages)
280 - Vertex Fitting (this
is the slowest part of the whole FEI, KFit
is used by default,
281 but you can use fastFit
as a drop-
in replacement https://github.com/thomaskeck/FastFit/,
282 this will speed up the whole FEI by a factor 2-3)
285 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
287 Create a new PreReconstruction object
288 @param particles list of config.Particle objects
289 @param config config.FeiConfiguration object
292 self.particles = particles
296 def reconstruct(self) -> pybasf2.Path:
298 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting
if necessary
300 path = basf2.create_path()
302 for particle
in self.particles:
303 for channel
in particle.channels:
305 if len(channel.daughters) == 1:
306 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
307 v2EI = basf2.register_module(
'VariablesToExtraInfo')
308 v2EI.set_name(
'VariablesToExtraInfo_' + channel.name)
309 v2EI.param(
'particleList', channel.name)
310 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
312 v2EI.set_log_level(basf2.logging.log_level.ERROR)
313 path.add_module(v2EI)
315 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
316 writeOut=
True, path=path)
317 if self.config.monitor:
318 ma.matchMCTruth(channel.name, path=path)
319 bc_variable = channel.preCutConfig.bestCandidateVariable
320 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
321 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
322 (bc_variable,
'mcErrors'),
323 (bc_variable,
'mcParticleStatus')]
324 filename =
'Monitor_PreReconstruction_BeforeRanking.root'
325 ma.variablesToHistogram(
327 variables=config.variables2binnings(hist_variables),
328 variables_2d=config.variables2binnings_2d(hist_variables_2d),
330 ignoreCommandLineOverride=
True,
331 directory=f
'{channel.label}',
334 if channel.preCutConfig.bestCandidateMode ==
'lowest':
335 ma.rankByLowest(channel.name,
336 channel.preCutConfig.bestCandidateVariable,
337 channel.preCutConfig.bestCandidateCut,
340 elif channel.preCutConfig.bestCandidateMode ==
'highest':
341 ma.rankByHighest(channel.name,
342 channel.preCutConfig.bestCandidateVariable,
343 channel.preCutConfig.bestCandidateCut,
347 raise RuntimeError(
"Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
349 if self.config.monitor:
350 filename =
'Monitor_PreReconstruction_AfterRanking.root'
351 hist_variables += [
'extraInfo(preCut_rank)']
352 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
353 (
'extraInfo(preCut_rank)',
'mcErrors'),
354 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
355 ma.variablesToHistogram(
357 variables=config.variables2binnings(hist_variables),
358 variables_2d=config.variables2binnings_2d(hist_variables_2d),
360 ignoreCommandLineOverride=
True,
361 directory=f
'{channel.label}',
365 elif self.config.training:
366 ma.matchMCTruth(channel.name, path=path)
369 pvfit = basf2.register_module(
'ParticleVertexFitter')
370 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
371 pvfit.param(
'listName', channel.name)
372 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
373 pvfit.param(
'vertexFitter',
'KFit')
374 pvfit.param(
'fitType',
'vertex')
375 pvfit.set_log_level(basf2.logging.log_level.ERROR)
376 path.add_module(pvfit)
377 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
378 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
379 elif len(channel.daughters) > 1:
380 pvfit = basf2.register_module(
'ParticleVertexFitter')
381 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
382 pvfit.param(
'listName', channel.name)
383 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
384 pvfit.param(
'vertexFitter',
'KFit')
385 if particle.name
in [
'pi0']:
386 pvfit.param(
'fitType',
'mass')
388 pvfit.param(
'fitType',
'vertex')
389 pvfit.set_log_level(basf2.logging.log_level.ERROR)
390 path.add_module(pvfit)
392 if self.config.monitor:
393 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus', channel.mvaConfig.target]
394 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
395 (
'chiProb',
'mcErrors'),
396 (
'chiProb',
'mcParticleStatus')]
397 filename =
'Monitor_PreReconstruction_AfterVertex.root'
398 ma.variablesToHistogram(
400 variables=config.variables2binnings(hist_variables),
401 variables_2d=config.variables2binnings_2d(hist_variables_2d),
403 ignoreCommandLineOverride=
True,
404 directory=f
'{channel.label}',
410class PostReconstruction:
412 Steers the reconstruction phase after the mva method was applied
414 - The application of the mva method itself.
415 - Copying all channel lists in a common one
for each particle defined
in particles
416 - Tag unique signal candidates, to avoid double counting of channels
with overlap
419 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
421 Create a new PostReconstruction object
422 @param particles list of config.Particle objects
423 @param config config.FeiConfiguration object
426 self.particles = particles
430 def get_missing_channels(self) -> typing.Sequence[str]:
432 Returns all channels for which the weightfile
is missing
435 for particle
in self.particles:
436 for channel
in particle.channels:
438 weightfile = channel.label +
'.xml'
439 if not basf2_mva.available(weightfile):
440 missing += [channel.label]
443 def available(self) -> bool:
445 Check if the relevant information
is already available
447 return len(self.get_missing_channels()) == 0
449 def reconstruct(self) -> pybasf2.Path:
451 Returns pybasf2.Path which applies the mva method to each channel, merges the channel lists of each particle and applies the post-cut and the ranking
453 path = basf2.create_path()
455 for particle
in self.particles:
456 for channel
in particle.channels:
457 expert = basf2.register_module(
'MVAExpert')
458 expert.set_name(
'MVAExpert_' + channel.name)
459 if self.config.training:
460 expert.param(
'identifier', channel.label +
'.xml')
462 expert.param(
'identifier', self.config.prefix +
'_' + channel.label)
463 expert.param(
'extraInfoName',
'SignalProbability')
464 expert.param(
'listNames', [channel.name])
466 expert.set_log_level(basf2.logging.log_level.ERROR)
467 path.add_module(expert)
469 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
470 uniqueSignal.param(
'particleList', channel.name)
471 uniqueSignal.param(
'target', channel.mvaConfig.target)
472 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
473 uniqueSignal.set_name(
'TagUniqueSignal_' + channel.name)
475 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
476 path.add_module(uniqueSignal)
478 if self.config.monitor:
479 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
480 channel.mvaConfig.target,
'extraInfo(decayModeID)']
481 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
482 (
'extraInfo(SignalProbability)',
'mcErrors'),
483 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
484 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
485 (
'extraInfo(decayModeID)',
'mcErrors'),
486 (
'extraInfo(decayModeID)',
'extraInfo(uniqueSignal)'),
487 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
488 filename =
'Monitor_PostReconstruction_AfterMVA.root'
489 ma.variablesToHistogram(
491 variables=config.variables2binnings(hist_variables),
492 variables_2d=config.variables2binnings_2d(hist_variables_2d),
494 ignoreCommandLineOverride=
True,
495 directory=f
'{channel.label}',
499 if particle.postCutConfig.value > 0.0:
500 cutstring = str(particle.postCutConfig.value) +
' < extraInfo(SignalProbability)'
502 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
503 variable=
'particleSource', writeOut=
True, path=path)
505 if self.config.monitor:
506 hist_variables = [
'mcErrors',
'mcParticleStatus',
'extraInfo(uniqueSignal)',
'extraInfo(SignalProbability)',
507 particle.mvaConfig.target,
'extraInfo(decayModeID)']
508 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
509 (
'extraInfo(decayModeID)',
'mcErrors'),
510 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
511 filename =
'Monitor_PostReconstruction_BeforePostCut.root'
512 ma.variablesToHistogram(
514 variables=config.variables2binnings(hist_variables),
515 variables_2d=config.variables2binnings_2d(hist_variables_2d),
516 filename=config.removeJPsiSlash(filename),
517 ignoreCommandLineOverride=
True,
518 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
521 ma.applyCuts(particle.identifier, cutstring, path=path)
523 if self.config.monitor:
524 filename =
'Monitor_PostReconstruction_BeforeRanking.root'
525 ma.variablesToHistogram(
527 variables=config.variables2binnings(hist_variables),
528 variables_2d=config.variables2binnings_2d(hist_variables_2d),
529 filename=config.removeJPsiSlash(filename),
530 ignoreCommandLineOverride=
True,
531 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
534 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
535 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
537 if self.config.monitor:
538 hist_variables += [
'extraInfo(postCut_rank)']
539 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
540 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
541 (
'mcErrors',
'extraInfo(postCut_rank)'),
542 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
543 filename =
'Monitor_PostReconstruction_AfterRanking.root'
544 ma.variablesToHistogram(
546 variables=config.variables2binnings(hist_variables),
547 variables_2d=config.variables2binnings_2d(hist_variables_2d),
548 filename=config.removeJPsiSlash(filename),
549 ignoreCommandLineOverride=
True,
550 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
553 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
554 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
556 if 'B_s0' == particle.name:
558 elif 'B' in particle.name:
559 variables += [
'Mbc',
'cosThetaBetweenParticleAndNominalB']
561 filename =
'Monitor_Final.root'
562 ma.variablesToNtuple(
565 treename=config.removeJPsiSlash(f
'{particle.identifier} variables'),
566 filename=config.removeJPsiSlash(filename),
567 ignoreCommandLineOverride=
True,
574 Performs all necessary trainings for all training data files which are
575 available but where there
is no weight file available yet.
576 This
class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
577 The trainings are run in parallel using a multiprocessing pool of python.
578 Each training
is done by a subprocess call, the training command (passed by config.externTeacher) can be either
579 * basf2_mva_teacher, the training will be done directly on the machine
580 * externClustTeacher, the training will be submitted to the batch system of KEKCC
584 MaximumNumberOfMVASamples = int(1e7)
587 MinimumNumberOfMVASamples = int(5e2)
589 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
591 Create a new Teacher object
592 @param particles list of config.Particle objects
593 @param config config.FeiConfiguration object
596 self.particles = particles
601 def create_fake_weightfile(channel: str):
603 Create a fake weight file using the trivial method, it will always return 0.0
604 @param channel
for which we create a fake weight file
607 <?xml version="1.0" encoding=
"utf-8"?>
608 <method>Trivial</method>
609 <weightfile>{channel}.xml</weightfile>
610 <treename>tree</treename>
611 <target_variable>isSignal</target_variable>
612 <weight_variable>__weight__</weight_variable>
613 <signal_class>1</signal_class>
614 <max_events>0</max_events>
615 <number_feature_variables>1</number_feature_variables>
616 <variable0>M</variable0>
617 <number_spectator_variables>0</number_spectator_variables>
618 <number_data_files>1</number_data_files>
619 <datafile0>train.root</datafile0>
620 <Trivial_version>1</Trivial_version>
621 <Trivial_output>0</Trivial_output>
622 <signal_fraction>0.066082567</signal_fraction>
624 with open(channel +
".xml",
"w")
as f:
628 def check_if_weightfile_is_fake(filename: str):
630 Checks if the provided filename
is a fake-weight file
or not
631 @param filename the filename of the weight file
634 return '<method>Trivial</method>' in open(filename).readlines()[2]
635 except BaseException:
639 def upload(self, channel: str):
641 Upload the weight file into the condition database
642 @param channel whose weight file
is uploaded
644 disk = channel + '.xml'
645 dbase = self.config.prefix +
'_' + channel
646 basf2_mva.upload(disk, dbase)
649 def do_all_trainings(self):
651 Do all trainings for which we find training data
657 ROOT.PyConfig.StartGuiThread =
False
659 filename =
'training_input.root'
660 if not os.path.isfile(filename):
661 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
662 "No weight files will be provided.")
664 f = ROOT.TFile.Open(filename,
'read')
666 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
667 "No weight files will be provided.")
668 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
669 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
670 "No weight files will be provided.")
672 for particle
in self.particles:
673 for channel
in particle.channels:
674 weightfile = channel.label +
'.xml'
675 if not basf2_mva.available(weightfile):
676 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label}" in m.GetName()]
679 tree = keys[0].ReadObj()
680 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1.0')
681 nBg = tree.GetEntries(channel.mvaConfig.target +
' != 1.0')
682 if nSig < Teacher.MinimumNumberOfMVASamples:
683 B2WARNING(
"Training of MVC failed. "
684 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
685 self.create_fake_weightfile(channel.label)
686 self.upload(channel.label)
688 if nBg < Teacher.MinimumNumberOfMVASamples:
689 B2WARNING(
"Training of MVC failed. "
690 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
691 self.create_fake_weightfile(channel.label)
692 self.upload(channel.label)
694 variable_str =
"' '".join(channel.mvaConfig.variables)
696 command = (f
"{self.config.externTeacher}"
697 f
" --method '{channel.mvaConfig.method}'"
698 f
" --target_variable '{channel.mvaConfig.target}'"
699 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
700 f
" --signal_class 1 --variables '{variable_str}'"
701 f
" --identifier '{channel.label}.xml'"
702 f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
703 B2INFO(f
"Used following command to invoke teacher: \n {command}")
704 job_list.append((channel.label, command))
706 p = multiprocessing.Pool(
None, maxtasksperchild=1)
707 func = functools.partial(subprocess.call, shell=
True)
708 p.map(func, [c
for _, c
in job_list])
712 for name, _
in job_list:
713 if not basf2_mva.available(name +
'.xml'):
714 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
715 self.create_fake_weightfile(name)
716 weightfiles.append(self.upload(name))
def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
    using the Summary.pickle file outputted by the FEIv3. This file must be passed by the parameter configuration.legacy.
    For every channel without a weight file under the new name, the old weight file is downloaded and uploaded
    again under the new name; if the old weight file is not available a fake weight file is uploaded instead.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    # NOTE(review): unpickling executes arbitrary code, configuration.legacy has to point to a trusted file.
    # The context manager makes sure the file is closed again after reading.
    with open(configuration.legacy, 'rb') as legacy_file:
        summary = pickle.load(legacy_file)
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            # presumably the legacy summary is keyed by labels containing 'J/psi' instead of 'Jpsi' -- TODO confirm
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if basf2_mva.available(new_weightfile):
                continue
            # old_weightfile is the result of a string concatenation and therefore never None,
            # so only its availability has to be checked.
            if basf2_mva.available(old_weightfile):
                basf2_mva.download(old_weightfile, channel.label + '.xml')
            else:
                Teacher.create_fake_weightfile(channel.label)
            teacher.upload(channel.label)
def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    @return list of stages, each stage is the list of the given particles which belong to this stage
    @raise RuntimeError if a particle has a name which is not assigned to any stage
    """
    # Particle names handled by each stage, in the order in which the stages are reconstructed.
    stage_names = [
        ('e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0'),
        ('pi0', 'J/psi', 'Lambda0'),
        ('K_S0', 'Sigma+'),
        ('D+', 'D0', 'D_s+', 'Lambda_c+'),
        ('D*+', 'D*0', 'D_s*+'),
        ('B0', 'B+', 'B_s0'),
    ]
    stages = [[p for p in particles if p.name in names] for names in stage_names]
    # The empty final stage marks the point where everything is done.
    stages.append([])

    # Collect the known names once, instead of rebuilding the list for every particle.
    known_names = {name for names in stage_names for name in names}
    for p in particles:
        if p.name not in known_names:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data.
    This function must be either called by the user after each stage of the FEI during training,
    or (more likely) is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuple with weight file on disk and identifier in database for all trained classifiers
    """
    # The Teacher does the actual work, this function only forwards its result.
    return Teacher(particles, configuration).do_all_trainings()
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    Existing Summary.pickle files are kept as Summary.pickle.backup_0 up to Summary.pickle.backup_9.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    # The stored configuration is a copy of the given one which carries the new cache level.
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Shift the existing backups starting with the highest index,
    # so that no backup is overwritten before it was copied itself.
    for i in range(8, -1, -1):
        backup = f'Summary.pickle.backup_{i}'
        if os.path.isfile(backup):
            shutil.copyfile(backup, f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    # The context manager guarantees that the summary is flushed and the file is closed.
    with open('Summary.pickle', 'wb') as summary_file:
        pickle.dump((particles, configuration), summary_file)
803def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
805 The most important function of the FEI.
806 This creates the FEI path for training/fitting (both terms are equal),
and application/inference (both terms are equal).
807 The whole FEI
is defined by the particles which are reconstructed (see default_channels.py)
808 and the configuration (see config.py).
811 For training this function
is called multiple times, each time the FEI reconstructs one more stage
in the hierarchical structure
812 i.e. we start with FSP, pi0, K_S0, D, D*, and end with B mesons. You have to set configuration.training to True for training mode.
813 All weight files created during the training will be stored
in your local database.
814 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
815 to the central database first (see documentation
for the Belle2 Condition Database).
818 For application you call this function once,
and it returns the whole path which will reconstruct B mesons
819 with an associated signal probability. You have to set configuration.training to
False for application mode.
822 You can always turn on the monitoring (configuration.monitor =
True),
823 to write out ROOT Histograms of many quantities
for each stage,
824 using these histograms you can use the printReporting.py
or latexReporting.py scripts to automatically create pdf files.
827 This function can also use old FEI trainings (version 3), just
pass the Summary.pickle file of the old training,
828 and the weight files will be automatically converted to the new naming scheme.
830 @param particles list of config.Particle objects
831 @param configuration config.FeiConfiguration object
834 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
835 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
836 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
838 Author: Thomas Keck 2014 - 2017
839 Please cite my PhD thesis
848 if configuration.cache
is None:
849 if os.path.isfile(
'Summary.pickle'):
850 print(
"Cache: Replaced particles and configuration with the ones from Summary.pickle!")
851 particles, configuration = pickle.load(open(
'Summary.pickle',
'rb'))
852 cache = configuration.cache
854 if configuration.training:
859 cache = configuration.cache
862 path = basf2.create_path()
867 stages = get_stages_from_particles(particles)
872 if configuration.legacy
is not None:
873 convert_legacy_training(particles, configuration)
881 training_data_information = TrainingDataInformation(particles)
882 if cache < 0
and configuration.training:
883 print(
"Stage 0: Run over all files to count the number of events and McParticles")
884 path.add_path(training_data_information.reconstruct())
885 if configuration.training:
886 save_summary(particles, configuration, 0)
887 return FeiState(path, 0, [])
888 elif not configuration.training
and configuration.monitor:
889 path.add_path(training_data_information.reconstruct())
895 loader = FSPLoader(particles, configuration)
897 print(
"Stage 0: Load FSP particles")
898 path.add_path(loader.reconstruct())
921 for stage, stage_particles
in enumerate(stages):
922 pre_reconstruction = PreReconstruction(stage_particles, configuration)
924 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
925 path.add_path(pre_reconstruction.reconstruct())
927 post_reconstruction = PostReconstruction(stage_particles, configuration)
928 if configuration.training
and not post_reconstruction.available():
929 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
930 mc_counts = training_data_information.get_mc_counts()
931 training_data = TrainingData(stage_particles, configuration, mc_counts)
932 path.add_path(training_data.reconstruct())
933 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
935 if cache <= stage + 1:
936 path.add_path(post_reconstruction.reconstruct())
937 used_lists += [particle.identifier
for particle
in stage_particles]
941 if configuration.monitor:
942 output = basf2.register_module(
'RootOutput')
943 output.param(
'outputFileName',
'Monitor_ModuleStatistics.root')
944 output.param(
'branchNames', [
'EventMetaData'])
945 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
946 output.param(
'ignoreCommandLineOverride',
True)
947 path.add_module(output)
951 if configuration.training
or configuration.monitor:
952 save_summary(particles, configuration, stage + 1)
955 return FeiState(path, stage + 1, plists=used_lists)