9 The Full Event Interpretation Algorithm
12 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
13 - It can be used for hadronic and semileptonic tagging.
14 - The algorithm has to be trained on MC, and can afterwards be applied on data.
15 - The training requires O(100) million MC events
16 - The weight files are stored in the Belle 2 Condition database
18 Read this file if you want to understand the technical details of the FEI.
20 The FEI follows a hierarchical approach.
22 (Stage -1: Write out information about the provided data sample)
23 Stage 0: Final State Particles (FSP)
24 Stage 1: pi0, J/Psi, Lambda0
26 Stage 3: D and Lambda_c mesons
31 Most stages consist of:
32 - Create Particle Candidates
35 - Apply a multivariate classification method
38 The FEI will reconstruct these 7 stages during the training phase.
39 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
40 to train all the necessary multivariate classifiers.
45 from ROOT
import PyConfig
46 PyConfig.IgnoreCommandLineOptions =
True
50 PyConfig.StartGuiThread =
False
52 from ROOT
import Belle2
56 from basf2
import B2INFO, B2WARNING
58 import modularAnalysis
as ma
66 from fei
import config
78 import multiprocessing
# Immutable return type of get_path(): the assembled basf2 path, the index of
# the last FEI stage it covers, and the names of the particle lists it fills.
FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
class TrainingDataInformation(object):
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers we can use to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: ROOT file in which the MC-particle counts are written / looked up
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available, i.e. the
        counting step has run before and left its ROOT file on disk.
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        The counts are written as one histogram per absolute PDG code into self.filename.
        """
        # One histogram per distinct absolute PDG code of the configured particles.
        pdgs = set([abs(pdg.from_name(particle.name)) for particle in self.particles])
        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        # 100 integer-centered bins covering 0..99 particles per event.
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        # NOTE(review): a trailing 'return path' appears to have been lost in
        # extraction (the annotated return type requires one) -- confirm upstream.

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct.
        Presumably returns a mapping abs(pdg) -> {'sum','std','avg','max','min'},
        with key 0 holding the total number of events -- TODO confirm.
        """
        # NOTE(review): several statements are missing from this view (extraction
        # gaps): the initialisation of 'mc_counts' and the per-key definitions of
        # 'variable' (the histogram name) and 'hist' (the histogram object), as
        # well as the final 'return mc_counts' -- confirm against upstream.
        root_file = ROOT.TFile(self.filename)
        for key in root_file.GetListOfKeys():
            # Extract the PDG code from the histogram name
            # 'NumberOfMCParticlesInEvent(<pdg>)'.
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            # Sum over bins of (bin center * bin content) = total particle count.
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
        # Key 0 stores the total number of events (every histogram has one entry
        # per event, so the entry count of the last histogram equals the event count).
        mc_counts[0]['sum'] = hist.GetEntries()
class FSPLoader(object):
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything required before loading FSPs,
    the user has to add this himself (because it depends on the MC campaign and if you want
    to use Belle 1 or Belle 2).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        # NOTE(review): 'self.config = config' appears to be missing here
        # (extraction gap); reconstruct() below reads self.config -- confirm.

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()
        # NOTE(review): the guard (presumably distinguishing converted Belle 1
        # mdst input from native Belle II input) around the next two sections is
        # missing from this view (extraction gap) -- confirm.
        # Branch 1: charged FSP lists plus copies of already-converted mdst lists.
        ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                              ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
        for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                      ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                      ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
            ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        # Branch 2: fill all FSP lists directly and build the V0 lists from tracks.
        ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                              ('mu+:FSP', ''), ('gamma:FSP', ''),
                              ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
        ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
        ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
        ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            # Histogram the per-event MC multiplicity of every FSP species.
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            # NOTE(review): the definition of 'pdgs' is missing from this view;
            # presumably pdgs = {abs(pdg.from_name(n)) for n in names} -- confirm.
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, path=path)
        # NOTE(review): trailing 'return path' missing from this view -- confirm.
class TrainingData(object):
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    The training of the FEI at its core is just generating this training data for each channel.
    After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing number of MC Particles
        """
        #: list of config.Particle objects
        self.particles = particles
        # NOTE(review): 'self.config = config' appears to be missing here
        # (extraction gap); reconstruct() below reads self.config -- confirm.
        #: mapping abs(pdg) -> statistics, as produced by TrainingDataInformation
        self.mc_counts = mc_counts

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()
        for particle in self.particles:
            # NOTE(review): the definition of 'pdgcode' (presumably
            # abs(pdg.from_name(particle.name))) is missing from this view -- confirm.
            nSignal = self.mc_counts[pdgcode]['sum']
            for channel in particle.channels:
                filename = f'{channel.label}.root'
                # Rough upper bound on background candidates: number of events
                # times best candidates kept per event.
                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # Down-sample any class that would exceed the MVA sample budget.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
                # The target (and optional sPlot variable) are written out as
                # spectators next to the training features.
                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)
                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators
                                         if x is not channel.mvaConfig.target]
                    hist_filename = f'Monitor_TrainingData_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename), path=path)
                # The actual training-data writer: one ntuple per channel, sampled
                # per target class according to inverseSamplingRates.
                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', 'variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        # NOTE(review): trailing 'return path' missing from this view -- confirm.
class PreReconstruction(object):
    """
    Steers the reconstruction phase before the mva method was applied
    - The ParticleCombination (for each particle and channel we create candidates using
      the daughter candidates from the previous stages)
    - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
      but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
      this will speed up the whole FEI by a factor 2-3)
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        # NOTE(review): 'self.config = config' appears to be missing here
        # (extraction gap); reconstruct() below reads self.config -- confirm.

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()
        for particle in self.particles:
            for channel in particle.channels:
                if len(channel.daughters) == 1:
                    # Single-daughter "channels" are plain copies with the user cut applied.
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    # Tag the copied candidates with this channel's decay mode ID.
                    ma.variablesToExtraInfo(channel.name, {f'constant({channel.decayModeID})': 'decayModeID'}, path=path)
                # NOTE(review): an 'else:' header is missing here (extraction gap);
                # the combinatorial reconstruction below presumably is its body.
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = f'Monitor_PreReconstruction_BeforeRanking_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)

                # Best-candidate selection before the (expensive) vertex fit.
                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    # NOTE(review): the remaining arguments (presumably the
                                    # rank name 'preCut_rank' and path=path) are missing
                                    # from this view -- confirm.
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     # NOTE(review): trailing arguments missing, as above.
                # NOTE(review): an 'else:' header is missing before this raise -- confirm.
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = f'Monitor_PreReconstruction_AfterRanking_{channel.label}.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)
                elif self.config.training:
                    # During training MC matching is still needed to define the target.
                    ma.matchMCTruth(channel.name, path=path)

                # NOTE(review): the condition guarding this first vertex-fit block is
                # missing from this view (extraction gap) -- confirm against upstream.
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    # KFit is noisy; only report real errors.
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    # Vertex fits with more than one pi0 are not supported.
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = f'Monitor_PreReconstruction_AfterVertex_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)
        # NOTE(review): trailing 'return path' missing from this view -- confirm.
class PostReconstruction(object):
    """
    Steers the reconstruction phase after the mva method was applied
    - The application of the mva method itself.
    - Copying all channel lists in a common one for each particle defined in particles
    - Tag unique signal candidates, to avoid double counting of channels with overlap
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        # NOTE(review): 'self.config = config' appears to be missing here
        # (extraction gap); the methods below read self.config -- confirm.

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weightfile is missing
        """
        # NOTE(review): the initialisation 'missing = []' and the trailing
        # 'return missing' are absent from this view (extraction gaps) -- confirm.
        for particle in self.particles:
            for channel in particle.channels:
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]

    def available(self) -> bool:
        """
        Check if the relevant information is already available,
        i.e. every channel has a trained weight file.
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the MVA, merges the channel lists per
        particle and performs the post-cut and best-candidate ranking.
        """
        path = basf2.create_path()
        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    # During training the weight file lives on disk next to the job.
                    expert.param('identifier', channel.label + '.xml')
                # NOTE(review): an 'else:' header is missing here (extraction gap);
                # in application mode the identifier presumably comes from the database.
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                path.add_module(expert)

                # Flag at most one true-signal candidate per event and channel, so
                # overlapping channels are not double counted.
                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = f'Monitor_PostReconstruction_AfterMVA_{channel.label}.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, path=path)

            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'
            # NOTE(review): presumably an 'else:' assigning an empty cutstring is
            # missing here (extraction gap); applyCuts below uses it unconditionally.

            # Merge all channel lists of this particle into one common list.
            ma.copyLists(particle.identifier, [c.name for c in particle.channels], writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = f'Monitor_PostReconstruction_BeforePostCut_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

            # Absolute cut on the signal probability.
            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = f'Monitor_PostReconstruction_BeforeRanking_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

            # Keep only the best candidates per event, by signal probability.
            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = f'Monitor_PostReconstruction_AfterRanking_{particle.identifier}.root'
                ma.variablesToHistogram(particle.identifier,
                                        variables=config.variables2binnings(hist_variables),
                                        variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                        filename=config.removeJPsiSlash(filename), path=path)

                # B mesons additionally get Mbc and cosThetaBetweenParticleAndNominalB.
                if 'B' in particle.identifier:
                    variables = ['extraInfo(SignalProbability)', 'Mbc', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                                 'cosThetaBetweenParticleAndNominalB', 'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']
                # NOTE(review): an 'else:' header is missing here (extraction gap);
                # the second variables list presumably is its body.
                    variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                                 'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']
                filename = f'Monitor_Final_{particle.identifier}.root'
                ma.variablesToNtuple(particle.identifier, variables, treename='variables',
                                     filename=config.removeJPsiSlash(filename), path=path)
        # NOTE(review): trailing 'return path' missing from this view -- confirm.
class Teacher(object):
    """
    Performs all necessary trainings for all training data files which are
    available but where there is no weight file available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using multi-threading of python.
    Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
      * basf2_mva_teacher, the training will be done directly on the machine
      * externClustTeacher, the training will be submitted to the batch system of KEKCC
    """

    #: Upper bound on MVA samples per class; above this, candidates are down-sampled.
    MaximumNumberOfMVASamples = int(1e7)
    #: Lower bound on MVA samples per class; below this, a training is abandoned.
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        # NOTE(review): 'self.config = config' appears to be missing here
        # (extraction gap); upload() and do_all_trainings() read self.config -- confirm.

    # NOTE(review): presumably decorated with @staticmethod (no 'self' parameter);
    # the decorator line is missing from this view -- confirm.
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        # NOTE(review): the assignment wrapping this XML into a string (presumably
        # an f-string so {channel} is substituted) and the final write of that
        # string inside the 'with' block are missing (extraction gaps) -- confirm.
        <?xml version="1.0" encoding="utf-8"?>
        <method>Trivial</method>
        <weightfile>{channel}.xml</weightfile>
        <treename>tree</treename>
        <target_variable>isSignal</target_variable>
        <weight_variable>__weight__</weight_variable>
        <signal_class>1</signal_class>
        <max_events>0</max_events>
        <number_feature_variables>1</number_feature_variables>
        <variable0>M</variable0>
        <number_spectator_variables>0</number_spectator_variables>
        <number_data_files>1</number_data_files>
        <datafile0>train.root</datafile0>
        <Trivial_version>1</Trivial_version>
        <Trivial_output>0</Trivial_output>
        <signal_fraction>0.066082567</signal_fraction>
        with open(channel + ".xml", "w") as f:

    # NOTE(review): presumably @staticmethod as well (no 'self' parameter).
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename is a fake-weight file or not
        @param filename the filename of the weight file
        """
        # NOTE(review): the 'try:' header and the bodies of the except branch and
        # the fallback return are missing from this view (extraction gaps) -- confirm.
            # The third line of a fake file contains the Trivial method marker.
            return '<method>Trivial</method>' in open(filename, 'r').readlines()[2]
        except BaseException:

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        # NOTE(review): do_all_trainings() appends self.upload(name) to a list,
        # so a 'return (disk, dbase)'-style statement is presumably missing here
        # (extraction gap) -- confirm.

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        """
        # NOTE(review): several statements are missing from this view (extraction
        # gaps): the initialisation of 'job_list', the guard conditions preceding
        # the two B2WARNING fallbacks below, the 'continue' statements after each
        # fake-weightfile fallback, the pool shutdown, and the initialisation and
        # return of 'weightfiles' -- confirm against upstream before editing.
        for particle in self.particles:
            for channel in particle.channels:
                filename = f'{channel.label}.root'
                weightfile = channel.label + '.xml'
                # Train only channels that have data but no weight file yet.
                if not basf2_mva.available(weightfile) and os.path.isfile(filename):
                    f = ROOT.TFile(filename)
                    B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                              f"Ignoring channel {channel.label}.")
                    self.create_fake_weightfile(channel.label)
                    self.upload(channel.label)
                    keys = [m for m in f.GetListOfKeys()]
                    B2WARNING("Training of MVC failed. ROOT file does not contain a tree. "
                              f"Ignoring channel {channel.label}.")
                    self.create_fake_weightfile(channel.label)
                    self.upload(channel.label)
                    tree = keys[0].ReadObj()
                    # Count signal and background entries to decide whether a
                    # training makes sense at all.
                    nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                    nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                    if nSig < Teacher.MinimumNumberOfMVASamples:
                        B2WARNING("Training of MVC failed. "
                                  f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                        self.create_fake_weightfile(channel.label)
                        self.upload(channel.label)
                    if nBg < Teacher.MinimumNumberOfMVASamples:
                        B2WARNING("Training of MVC failed. "
                                  f"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
                        self.create_fake_weightfile(channel.label)
                        self.upload(channel.label)
                    variable_str = "' '".join(channel.mvaConfig.variables)
                    # Build the external teacher command; stdout/stderr go to a log file.
                    command = (f"{self.config.externTeacher}"
                               f" --method '{channel.mvaConfig.method}'"
                               f" --target_variable '{channel.mvaConfig.target}'"
                               f" --treename variables --datafile '{channel.label}.root'"
                               f" --signal_class 1 --variables '{variable_str}'"
                               f" --identifier '{channel.label}.xml'"
                               f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                    B2INFO(f"Used following command to invoke teacher: \n {command}")
                    job_list.append((channel.label, command))
        # Run all teacher commands in parallel; maxtasksperchild=1 gives every
        # training a fresh worker process.
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        # Any job that still has no weight file failed: fall back to a fake file.
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
    using the Summary.pickle file outputted by the FEIv3. This file must be passed via the parameter
    configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    # NOTE(review): the file handle is never closed; consider 'with open(...)'.
    summary = pickle.load(open(configuration.legacy, 'rb'))
    # Map channel label -> old (hash-based) particle-list name.
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            # New labels spell 'Jpsi' without the slash; the legacy mapping used 'J/psi'.
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    # No usable legacy training: register a trivial placeholder.
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                # NOTE(review): an 'else:' header appears to be missing here
                # (extraction gap); the two lines below presumably belong to it
                # (re-upload the legacy training under the new name) -- confirm.
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)
def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    # NOTE(review): the enclosing 'stages = [' ... ']' around the per-stage list
    # comprehensions, the 'for p in particles:' header of the final sanity-check
    # loop, and the trailing 'return stages' are missing from this view
    # (extraction gaps) -- confirm against upstream.
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+']],
    # Sanity check: every configured particle must belong to exactly one stage.
        if p.name not in [p.name for stage in stages for p in stage]:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")
def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data.

    During a training this has to be invoked after every FEI stage -- either
    directly by the user, or (more commonly) by the distributed.py script once
    the outputs of all jobs have been merged.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuple with weight file on disk and identifier in database for all trained classifiers
    """
    # Delegate everything to a throwaway Teacher instance.
    return Teacher(particles, configuration).do_all_trainings()
def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    # Rebuild the configuration with the new cache level; all other fields are kept.
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Keep a rolling history of previous summaries: shift each existing backup
    # one slot further back (backup_8 -> backup_9, ..., backup_0 -> backup_1).
    for i in range(8, -1, -1):
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    # Fix: close the output file deterministically -- the original passed an
    # anonymous open(..., 'wb') handle to pickle.dump and never closed it.
    with open('Summary.pickle', 'wb') as summary_file:
        pickle.dump((particles, configuration), summary_file)
def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
    i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see documentation for the Belle2 Condition Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True),
    to write out ROOT Histograms of many quantities for each stage,
    using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    # NOTE(review): the construct enclosing this banner (presumably a print of a
    # raw triple-quoted string) is missing from this view (extraction gap) -- confirm.
 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
 Author: Thomas Keck 2014 - 2017
 Please cite my PhD thesis

    # Resume from the last run: Summary.pickle stores the particles, the
    # configuration and the cache level (stage) reached so far.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        # NOTE(review): an 'else:' branch choosing the initial cache level (and
        # the branches following 'if configuration.training:') are missing from
        # this view (extraction gaps) -- confirm against upstream.
        if configuration.training:
        cache = configuration.cache

    # Now we start building the training or application path.
    path = basf2.create_path()

    # The hierarchical stages of the FEI reconstruction.
    stages = get_stages_from_particles(particles)

    # Convert an FEIv3 training on the fly if a legacy summary was supplied.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        # Stage -1: only count MC particles; the real reconstruction starts on
        # the next invocation.
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        # Monitoring needs the MC counts to compute efficiencies.
        path.add_path(training_data_information.reconstruct())

    loader = FSPLoader(particles, configuration)
    # NOTE(review): a cache guard (e.g. 'if cache < 1:') around the FSP loading
    # appears to be missing from this view (extraction gap) -- confirm.
    print("Stage 0: Load FSP particles")
    path.add_path(loader.reconstruct())

    # NOTE(review): the initialisation 'used_lists = []' is missing from this view.
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        # NOTE(review): a cache guard around the pre-reconstruction is missing here.
        print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
        path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            # No classifiers yet for this stage: write out training data instead.
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            # NOTE(review): a 'break' ending the loop after this stage during
            # training is presumably missing from this view -- confirm.
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # With monitoring enabled, also persist module statistics and event meta data.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # Record the stage reached so the next invocation can resume from there.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Return the assembled path, the stage it covers and the filled lists.
    return FeiState(path, stage + 1, plists=used_lists)