14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
23 Read this file if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D and Lambda_c mesons
36 Most stages consists of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50from basf2
import B2INFO, B2WARNING, B2ERROR
52import modularAnalysis
as ma
72FeiState = collections.namedtuple(
'FeiState',
'path, stage, plists, fsplists, excludelists')
75class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory).
82 Secondly we can use this information for the generation of the monitoring pdfs,
83 where we calculate reconstruction efficiencies.
86 def __init__(self, particles: typing.Sequence[config.Particle], outputPath: str =
''):
88 Create a new TrainingData object
89 @param particles list of config.Particle objects
90 @param outputPath path to the output directory
93 self.particles = particles
95 self.filename = os.path.join(outputPath,
'mcParticlesCount.root')
97 def available(self) -> bool:
99 Check if the relevant information is already available
101 return os.path.isfile(self.filename)
103 def reconstruct(self) -> pybasf2.Path:
105 Returns pybasf2.Path which counts the number of MCParticles in each event.
106 @param particles list of config.Particle objects
109 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
111 path = basf2.create_path()
112 module = basf2.register_module(
'VariablesToHistogram')
113 module.set_name(
"VariablesToHistogram_MCCount")
114 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
115 module.param(
'fileName', self.filename)
116 module.param(
'ignoreCommandLineOverride',
True)
117 path.add_module(module)
120 def get_mc_counts(self):
122 Read out the number of MC particles from the file created by reconstruct
127 root_file = ROOT.TFile.Open(self.filename,
'read')
130 for key
in root_file.GetListOfKeys():
131 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
132 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
135 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
136 for bin
in range(hist.GetNbinsX()))
137 mc_counts[pdg][
'std'] = hist.GetStdDev()
138 mc_counts[pdg][
'avg'] = hist.GetMean()
139 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
140 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
143 mc_counts[0][
'sum'] = hist.GetEntries()
150 Steers the loading of FSP particles.
151 This does NOT include RootInput, Geometry or anything required before loading FSPs,
152 the user has to add this himself (because it depends on the MC campaign and if you want
153 to use Belle or Belle II).
156 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
158 Create a new FSPLoader object
159 @param particles list of config.Particle objects
160 @param config config.FeiConfiguration object
163 self.particles = particles
167 def get_fsp_lists(self) -> typing.List[str]:
169 Returns a list of FSP particle lists which are used in the FEI.
170 This is used to create the RootOutput module.
172 fsps = [
'K+:FSP',
'pi+:FSP',
'e+:FSP',
'mu+:FSP',
'p+:FSP',
'gamma:FSP',
'K_S0:V0',
'Lambda0:V0',
'K_L0:FSP',
'gamma:V0']
177 def reconstruct(self) -> pybasf2.Path:
179 Returns pybasf2.Path which loads the FSP Particles
181 path = basf2.create_path()
184 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
185 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
186 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
187 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
188 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
189 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
191 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
192 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
193 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
194 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
195 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
196 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
198 if self.config.monitor:
199 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
200 filename = os.path.join(self.config.monitoring_path,
'Monitor_FSPLoader.root')
202 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
203 ma.variablesToHistogram(
'', variables=variables, filename=filename, ignoreCommandLineOverride=
True, path=path)
209 Steers the creation of the training data.
210 The training data is used to train a multivariate classifier for each channel.
211 The training of the FEI at its core is just generating this training data for each channel.
212 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
215 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
216 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
218 Create a new TrainingData object
219 @param particles list of config.Particle objects
220 @param config config.FeiConfiguration object
221 @param mc_counts containing number of MC Particles
224 self.particles = particles
228 self.mc_counts = mc_counts
230 def reconstruct(self) -> pybasf2.Path:
232 Returns pybasf2.Path which creates the training data for the given particles
235 path = basf2.create_path()
237 for particle
in self.particles:
239 nSignal = self.mc_counts[pdgcode][
'sum']
240 print(f
"FEI-core: TrainingData: nSignal for {particle.name}: {nSignal}")
249 for channel
in particle.channels:
250 weightfile = f
'{channel.label}.xml'
251 if basf2_mva.available(weightfile):
252 B2INFO(f
"FEI-core: Skipping preparing Training Data for {weightfile}, already available")
254 filename =
'training_input.root'
257 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
258 inverseSamplingRates = {}
261 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
262 inverseSamplingRates[0] = max(
263 1, int((int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1) * channel.preCutConfig.bkgSamplingFactor))
264 elif channel.preCutConfig.bkgSamplingFactor > 1:
265 inverseSamplingRates[0] = int(channel.preCutConfig.bkgSamplingFactor)
267 if nSignal > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noSignalSampling:
268 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
270 spectators = [channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
271 if channel.mvaConfig.sPlotVariable
is not None:
272 spectators.append(channel.mvaConfig.sPlotVariable)
274 if self.config.monitor:
275 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
276 hist_variables_2d = [(x, channel.mvaConfig.target)
277 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
278 hist_filename = os.path.join(self.config.monitoring_path,
'Monitor_TrainingData.root')
279 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
280 variables_2d=config.variables2binnings_2d(hist_variables_2d),
281 filename=hist_filename,
282 ignoreCommandLineOverride=
True,
283 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
285 teacher = basf2.register_module(
'VariablesToNtuple')
286 teacher.set_name(f
'VariablesToNtuple_{channel.name}')
287 teacher.param(
'fileName', filename)
288 teacher.param(
'treeName', ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(f
'{channel.label} variables'))
289 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
290 teacher.param(
'particleList', channel.name)
291 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
292 teacher.param(
'ignoreCommandLineOverride',
True)
293 path.add_module(teacher)
297class PreReconstruction:
299 Steers the reconstruction phase before the mva method was applied
301 - The ParticleCombination (for each particle and channel we create candidates using
302 the daughter candidates from the previous stages)
304 - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
305 but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
306 this will speed up the whole FEI by a factor 2-3)
309 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
311 Create a new PreReconstruction object
312 @param particles list of config.Particle objects
313 @param config config.FeiConfiguration object
316 self.particles = particles
320 def reconstruct(self) -> pybasf2.Path:
322 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
324 path = basf2.create_path()
326 for particle
in self.particles:
327 for channel
in particle.channels:
330 channel.daughters[0].split(
':')[0]) ==
pdg.from_name(particle.name)):
331 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
332 v2EI = basf2.register_module(
'VariablesToExtraInfo')
333 v2EI.set_name(f
'VariablesToExtraInfo_{channel.name}')
334 v2EI.param(
'particleList', channel.name)
335 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
337 v2EI.set_log_level(basf2.logging.log_level.ERROR)
338 path.add_module(v2EI)
340 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
341 writeOut=
True, path=path)
342 if self.config.monitor:
343 ma.matchMCTruth(channel.name, path=path)
344 bc_variable = channel.preCutConfig.bestCandidateVariable
345 if self.config.monitor ==
'simple':
346 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
347 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
349 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus',
350 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
351 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
352 (bc_variable,
'mcErrors'),
353 (bc_variable,
'mcParticleStatus')]
354 for specVar
in channel.mvaConfig.spectators:
355 hist_variables_2d.append((bc_variable, specVar))
356 hist_variables_2d.append((channel.mvaConfig.target, specVar))
357 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_BeforeRanking.root')
358 ma.variablesToHistogram(
360 variables=config.variables2binnings(hist_variables),
361 variables_2d=config.variables2binnings_2d(hist_variables_2d),
363 ignoreCommandLineOverride=
True,
364 directory=f
'{channel.label}',
367 if channel.preCutConfig.bestCandidateMode ==
'lowest':
368 ma.rankByLowest(channel.name,
369 channel.preCutConfig.bestCandidateVariable,
370 channel.preCutConfig.bestCandidateCut,
373 elif channel.preCutConfig.bestCandidateMode ==
'highest':
374 ma.rankByHighest(channel.name,
375 channel.preCutConfig.bestCandidateVariable,
376 channel.preCutConfig.bestCandidateCut,
380 raise RuntimeError(f
'Unknown bestCandidateMode {repr(channel.preCutConfig.bestCandidateMode)}')
382 if 'gamma' in channel.decayString
and channel.pi0veto:
383 ma.buildRestOfEvent(channel.name, path=path)
384 Ddaughter_roe_path = basf2.Path()
385 deadEndPath = basf2.Path()
386 ma.signalSideParticleFilter(channel.name,
'', Ddaughter_roe_path, deadEndPath)
387 ma.fillParticleList(
'gamma:roe',
'isInRestOfEvent == 1', path=Ddaughter_roe_path)
389 matches = list(re.finditer(
'gamma', channel.decayString))
391 for igamma
in range(len(matches)):
392 start, end = matches[igamma-1].span()
393 tempString = f
'{channel.decayString[:start]}^gamma{channel.decayString[end:]}'
394 ma.fillSignalSideParticleList(f
'gamma:sig_{igamma}', tempString, path=Ddaughter_roe_path)
395 ma.reconstructDecay(f
'pi0:veto_{igamma} -> gamma:sig_{igamma} gamma:roe',
'', path=Ddaughter_roe_path)
396 pi0lists.append(f
'pi0:veto_{igamma}')
397 ma.copyLists(
'pi0:veto', pi0lists, writeOut=
False, path=Ddaughter_roe_path)
398 ma.rankByLowest(
'pi0:veto',
'abs(dM)', 1, path=Ddaughter_roe_path)
399 ma.matchMCTruth(
'pi0:veto', path=Ddaughter_roe_path)
400 ma.variableToSignalSideExtraInfo(
403 'InvM':
'pi0vetoMass',
404 'formula((daughter(0,E)-daughter(1,E))/(daughter(0,E)+daughter(1,E)))':
'pi0vetoEnergyAsymmetry',
406 path=Ddaughter_roe_path
408 path.for_each(
'RestOfEvent',
'RestOfEvents', Ddaughter_roe_path)
410 if self.config.monitor:
411 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_AfterRanking.root')
412 if self.config.monitor !=
'simple':
413 hist_variables += [
'extraInfo(preCut_rank)']
414 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
415 (
'extraInfo(preCut_rank)',
'mcErrors'),
416 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
417 for specVar
in channel.mvaConfig.spectators:
418 hist_variables_2d.append((
'extraInfo(preCut_rank)', specVar))
419 ma.variablesToHistogram(
421 variables=config.variables2binnings(hist_variables),
422 variables_2d=config.variables2binnings_2d(hist_variables_2d),
424 ignoreCommandLineOverride=
True,
425 directory=f
'{channel.label}',
429 elif self.config.training:
430 ma.matchMCTruth(channel.name, path=path)
433 pvfit = basf2.register_module(
'ParticleVertexFitter')
434 pvfit.set_name(f
'ParticleVertexFitter_{channel.name}')
435 pvfit.param(
'listName', channel.name)
436 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
437 pvfit.param(
'vertexFitter',
'KFit')
438 pvfit.param(
'fitType',
'vertex')
439 pvfit.set_log_level(basf2.logging.log_level.ERROR)
440 path.add_module(pvfit)
441 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
442 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
443 elif len(channel.daughters) > 1:
444 pvfit = basf2.register_module(
'ParticleVertexFitter')
445 pvfit.set_name(f
'ParticleVertexFitter_{channel.name}')
446 pvfit.param(
'listName', channel.name)
447 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
448 pvfit.param(
'vertexFitter',
'KFit')
449 if particle.name
in [
'pi0']:
450 pvfit.param(
'fitType',
'mass')
452 pvfit.param(
'fitType',
'vertex')
453 pvfit.set_log_level(basf2.logging.log_level.ERROR)
454 path.add_module(pvfit)
456 if self.config.monitor:
457 if self.config.monitor ==
'simple':
458 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
459 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
461 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus',
462 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
463 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
464 (
'chiProb',
'mcErrors'),
465 (
'chiProb',
'mcParticleStatus')]
466 for specVar
in channel.mvaConfig.spectators:
467 hist_variables_2d.append((
'chiProb', specVar))
468 hist_variables_2d.append((channel.mvaConfig.target, specVar))
469 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_AfterVertex.root')
470 ma.variablesToHistogram(
472 variables=config.variables2binnings(hist_variables),
473 variables_2d=config.variables2binnings_2d(hist_variables_2d),
475 ignoreCommandLineOverride=
True,
476 directory=f
'{channel.label}',
482class PostReconstruction:
484 Steers the reconstruction phase after the mva method was applied
486 - The application of the mva method itself.
487 - Copying all channel lists in a common one for each particle defined in particles
488 - Tag unique signal candidates, to avoid double counting of channels with overlap
491 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
493 Create a new PostReconstruction object
494 @param particles list of config.Particle objects
495 @param config config.FeiConfiguration object
498 self.particles = particles
502 def get_missing_channels(self) -> typing.Sequence[str]:
504 Returns all channels for which the weightfile is missing
507 for particle
in self.particles:
508 for channel
in particle.channels:
510 weightfile = f
'{channel.label}.xml'
511 if not basf2_mva.available(weightfile):
512 missing += [channel.label]
515 def available(self) -> bool:
517 Check if the relevant information is already available
519 return len(self.get_missing_channels()) == 0
521 def reconstruct(self) -> pybasf2.Path:
523 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
526 path = basf2.create_path()
528 for particle
in self.particles:
529 for channel
in particle.channels:
530 expert = basf2.register_module(
'MVAExpert')
531 expert.set_name(f
'MVAExpert_{channel.name}')
532 if self.config.training:
533 expert.param(
'identifier', f
'{channel.label}.xml')
535 expert.param(
'identifier', f
'{self.config.prefix}_{channel.label}')
536 expert.param(
'extraInfoName',
'SignalProbability')
537 expert.param(
'listNames', [channel.name])
539 expert.set_log_level(basf2.logging.log_level.ERROR)
540 path.add_module(expert)
542 if self.config.monitor:
543 if self.config.monitor ==
'simple':
544 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
545 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
547 hist_variables = [
'mcErrors',
549 'extraInfo(SignalProbability)',
550 channel.mvaConfig.target,
551 'extraInfo(decayModeID)'] + list(channel.mvaConfig.spectators.keys())
552 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
553 (
'extraInfo(SignalProbability)',
'mcErrors'),
554 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
555 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
556 (
'extraInfo(decayModeID)',
'mcErrors'),
557 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
558 for specVar
in channel.mvaConfig.spectators:
559 hist_variables_2d.append((
'extraInfo(SignalProbability)', specVar))
560 hist_variables_2d.append((
'extraInfo(decayModeID)', specVar))
561 hist_variables_2d.append((channel.mvaConfig.target, specVar))
562 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_AfterMVA.root')
563 ma.variablesToHistogram(
565 variables=config.variables2binnings(hist_variables),
566 variables_2d=config.variables2binnings_2d(hist_variables_2d),
568 ignoreCommandLineOverride=
True,
569 directory=f
'{channel.label}',
573 if particle.postCutConfig.value > 0.0:
574 cutstring = f
'{particle.postCutConfig.value} < extraInfo(SignalProbability)'
576 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
577 variable=
'particleSource', writeOut=
True, path=path)
579 if self.config.monitor:
580 if self.config.monitor ==
'simple':
581 hist_variables = [particle.mvaConfig.target,
'extraInfo(decayModeID)']
582 hist_variables_2d = [(particle.mvaConfig.target,
'extraInfo(decayModeID)')]
584 hist_variables = [
'mcErrors',
586 'extraInfo(SignalProbability)',
587 particle.mvaConfig.target,
588 'extraInfo(decayModeID)'] + list(particle.mvaConfig.spectators.keys())
589 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
590 (
'extraInfo(decayModeID)',
'mcErrors'),
591 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
592 for specVar
in particle.mvaConfig.spectators:
593 hist_variables_2d.append((
'extraInfo(SignalProbability)', specVar))
594 hist_variables_2d.append((
'extraInfo(decayModeID)', specVar))
595 hist_variables_2d.append((particle.mvaConfig.target, specVar))
596 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_BeforePostCut.root')
597 ma.variablesToHistogram(
599 variables=config.variables2binnings(hist_variables),
600 variables_2d=config.variables2binnings_2d(hist_variables_2d),
602 ignoreCommandLineOverride=
True,
603 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
606 ma.applyCuts(particle.identifier, cutstring, path=path)
608 if self.config.monitor:
609 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_BeforeRanking.root')
610 ma.variablesToHistogram(
612 variables=config.variables2binnings(hist_variables),
613 variables_2d=config.variables2binnings_2d(hist_variables_2d),
615 ignoreCommandLineOverride=
True,
616 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
619 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
620 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
622 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
623 uniqueSignal.param(
'particleList', particle.identifier)
624 uniqueSignal.param(
'target', particle.mvaConfig.target)
625 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
626 uniqueSignal.set_name(f
'TagUniqueSignal_{particle.identifier}')
628 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
629 path.add_module(uniqueSignal)
631 if self.config.monitor:
632 if self.config.monitor !=
'simple':
633 hist_variables += [
'extraInfo(postCut_rank)']
634 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
635 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
636 (
'mcErrors',
'extraInfo(postCut_rank)'),
637 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
638 for specVar
in particle.mvaConfig.spectators:
639 hist_variables_2d.append((
'extraInfo(postCut_rank)', specVar))
640 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_AfterRanking.root')
641 ma.variablesToHistogram(
643 variables=config.variables2binnings(hist_variables),
644 variables_2d=config.variables2binnings_2d(hist_variables_2d),
646 ignoreCommandLineOverride=
True,
647 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
650 filename = os.path.join(self.config.monitoring_path,
'Monitor_Final.root')
651 if self.config.monitor ==
'simple':
652 hist_variables = [
'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
653 hist_variables_2d = [(
'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)')]
654 ma.variablesToHistogram(
656 variables=config.variables2binnings(hist_variables),
657 variables_2d=config.variables2binnings_2d(hist_variables_2d),
659 ignoreCommandLineOverride=
True,
660 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
663 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
664 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)'] + list(particle.mvaConfig.spectators.keys())
666 ma.variablesToNtuple(
669 treename=ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(
670 config.removeJPsiSlash(f
'{particle.identifier} variables')),
672 ignoreCommandLineOverride=
True,
679 Performs all necessary trainings for all training data files which are
680 available but where there is no weight file available yet.
681 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
682 The trainings are run in parallel using multi-threading of python.
683 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
684 * basf2_mva_teacher, the training will be done directly on the machine
685 * externClustTeacher, the training will be submitted to the batch system of KEKCC
689 MaximumNumberOfMVASamples = int(1e7)
692 MinimumNumberOfMVASamples = int(5e2)
694 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
696 Create a new Teacher object
697 @param particles list of config.Particle objects
698 @param config config.FeiConfiguration object
701 self.particles = particles
706 def create_fake_weightfile(channel: str):
708 Create a fake weight file using the trivial method, it will always return 0.0
709 @param channel for which we create a fake weight file
712 <?xml version="1.0" encoding="utf-8"?>
713 <method>Trivial</method>
714 <weightfile>{channel}.xml</weightfile>
715 <treename>tree</treename>
716 <target_variable>isSignal</target_variable>
717 <weight_variable>__weight__</weight_variable>
718 <signal_class>1</signal_class>
719 <max_events>0</max_events>
720 <number_feature_variables>1</number_feature_variables>
721 <variable0>M</variable0>
722 <number_spectator_variables>0</number_spectator_variables>
723 <number_data_files>1</number_data_files>
724 <datafile0>train.root</datafile0>
725 <Trivial_version>1</Trivial_version>
726 <Trivial_output>0</Trivial_output>
727 <signal_fraction>0.066082567</signal_fraction>
729 with open(f
'{channel}.xml',
"w")
as f:
733 def check_if_weightfile_is_fake(filename: str):
735 Checks if the provided filename is a fake-weight file or not
736 @param filename the filename of the weight file
739 return '<method>Trivial</method>' in open(filename).readlines()[2]
740 except BaseException:
744 def upload(self, channel: str):
746 Upload the weight file into the condition database
747 @param channel whose weight file is uploaded
749 disk = f
'{channel}.xml'
750 dbase = f
'{self.config.prefix}_{channel}'
751 basf2_mva.upload(disk, dbase)
752 print(f
"FEI-core: Uploading {dbase} to localdb")
755 def do_all_trainings(self):
757 Do all trainings for which we find training data
763 ROOT.PyConfig.StartGuiThread =
False
766 all_stage_particles = get_stages_from_particles(self.particles)
767 if self.config.cache
is None:
768 stagesToTrain = range(1, len(all_stage_particles)+1)
770 stagesToTrain = [self.config.cache]
772 filename =
'training_input.root'
773 if os.path.isfile(filename):
774 f = ROOT.TFile.Open(filename,
'read')
776 B2WARNING(f
'Training of MVC failed: {filename}. ROOT file corrupt. No weight files will be provided.')
777 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
779 f
'Training of MVC failed: {filename}. ROOT file has no trees. No weight files will be provided.')
781 for istage
in stagesToTrain:
782 for particle
in all_stage_particles[istage-1]:
783 for channel
in particle.channels:
784 weightfile = f
'{channel.label}.xml'
785 if basf2_mva.available(weightfile):
786 B2INFO(f
"FEI-core: Skipping {weightfile}, already available")
789 treeName = ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(f
'{channel.label} variables')
790 keys = [m
for m
in f.GetListOfKeys()
if treeName
in m.GetName()]
792 B2WARNING(
"Training of MVC failed. "
793 f
"Couldn't find tree for channel {channel}. Ignoring channel.")
796 B2WARNING(f
"Found more than one tree for channel {channel}. Taking first tree from: {keys}")
797 tree = keys[0].ReadObj()
798 total_entries = tree.GetEntries()
799 nSig = tree.GetEntries(f
'{channel.mvaConfig.target}==1.0')
800 nBg = tree.GetEntries(f
'{channel.mvaConfig.target}==0.0')
802 f
'FEI-core: Number of events for channel: {channel.label}, '
803 f
'Total: {total_entries}, Signal: {nSig}, Background: {nBg}')
804 if nSig < Teacher.MinimumNumberOfMVASamples:
805 B2WARNING(
"Training of MVC failed. "
806 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
807 self.create_fake_weightfile(channel.label)
808 self.upload(channel.label)
810 if nBg < Teacher.MinimumNumberOfMVASamples:
811 B2WARNING(
"Training of MVC failed. "
812 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
813 self.create_fake_weightfile(channel.label)
814 self.upload(channel.label)
816 variable_str =
"' '".join(channel.mvaConfig.variables)
818 spectators = list(channel.mvaConfig.spectators.keys())
819 if channel.mvaConfig.sPlotVariable
is not None:
820 spectators.append(channel.mvaConfig.sPlotVariable)
821 spectators_str =
"' '".join(spectators)
823 treeName = ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(f
'{channel.label} variables')
824 command = (f
"{self.config.externTeacher}"
825 f
" --method '{channel.mvaConfig.method}'"
826 f
" --target_variable '{channel.mvaConfig.target}'"
827 f
" --treename '{treeName}'"
828 f
" --datafile 'training_input.root'"
830 f
" --variables '{variable_str}'"
831 f
" --identifier '{weightfile}'")
832 if len(spectators) > 0:
833 command += f
" --spectators '{spectators_str}'"
834 command += f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1"
835 B2INFO(f
"Used following command to invoke teacher: \n {command}")
836 job_list.append((channel.label, command))
839 if len(job_list) > 0:
840 p = multiprocessing.Pool(
None, maxtasksperchild=1)
841 func = functools.partial(subprocess.call, shell=
True)
842 p.map(func, [c
for _, c
in job_list])
846 for name, _
in job_list:
847 if not basf2_mva.available(f
'{name}.xml'):
848 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile", f
'{name}.log')
849 self.create_fake_weightfile(name)
850 weightfiles.append(self.upload(name))
854def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
856 Convert an old FEI training into the new format.
857 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
858 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
859 @param particles list of config.Particle objects
860 @param config config.FeiConfiguration object
862 summary = pickle.load(open(configuration.legacy,
'rb'))
863 channel2lists = {k: v[2]
for k, v
in summary[
'channel2lists'].items()}
865 teacher = Teacher(particles, configuration)
867 for particle
in particles:
868 for channel
in particle.channels:
869 new_weightfile = f
'{configuration.prefix}_{channel.label}'
870 old_weightfile = f
'{configuration.prefix}_{channel2lists[channel.label.replace("Jpsi", "J/psi")]}'
871 if not basf2_mva.available(new_weightfile):
872 if old_weightfile
is None or not basf2_mva.available(old_weightfile):
873 Teacher.create_fake_weightfile(channel.label)
874 teacher.upload(channel.label)
876 basf2_mva.download(old_weightfile, f
'{channel.label}.xml')
877 teacher.upload(channel.label)
880def get_stages_from_particles(particles: typing.Sequence[typing.Union[config.Particle, str]]):
882 Returns the hierarchical structure of the FEI.
883 Each stage depends on the particles in the previous stage.
884 The final stage is empty (meaning everything is done, and the training is finished at this point).
885 @param particles list of config.Particle or string objects
888 return p.split(
":")[0]
if isinstance(p, str)
else p.name
891 return (p.split(
":")[1]
if isinstance(p, str)
else p.label).lower()
894 [p
for p
in particles
if get_pname(p)
in [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'p+',
'K_L0']],
895 [p
for p
in particles
if get_pname(p)
in [
'pi0',
'J/psi',
'Lambda0']],
896 [p
for p
in particles
if get_pname(p)
in [
'K_S0',
'Sigma+']],
897 [p
for p
in particles
if get_pname(p)
in [
'D+',
'D0',
'D_s+',
'Lambda_c+']
and 'tag' not in get_plabel(p)],
898 [p
for p
in particles
if get_pname(p)
in [
'D*+',
'D*0',
'D_s*+']
and 'tag' not in get_plabel(p)],
899 [p
for p
in particles
if get_pname(p)
in [
'B0',
'B+',
'B_s0']
or 'tag' in get_plabel(p)],
905 if pname
not in [pname
for stage
in stages
for p
in stage]:
906 raise RuntimeError(f
"Unknown particle {pname}: Not implemented in FEI")
911def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
913 Performs the training of mva classifiers for all available training data,
914 this function must be either called by the user after each stage of the FEI during training,
915 or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
916 @param particles list of config.Particle objects
917 @param config config.FeiConfiguration object
918 @return list of tuple with weight file on disk and identifier in database for all trained classifiers
920 teacher = Teacher(particles, configuration)
921 return teacher.do_all_trainings()
924def save_summary(particles: typing.Sequence[config.Particle],
925 configuration: config.FeiConfiguration,
927 roundMode: int =
None,
928 pickleName: str =
'Summary.pickle'):
930 Creates the Summary.pickle, which is used to keep track of the stage during the training,
931 and can be used later to investigate which configuration was used exactly to create the training.
932 @param particles list of config.Particle objects
933 @param config config.FeiConfiguration object
934 @param cache current cache level
935 @param roundMode mode of current round of training
936 @param pickleName name of the pickle file
938 if roundMode
is None:
939 roundMode = configuration.roundMode
940 configuration = configuration._replace(cache=cache, roundMode=roundMode)
942 for i
in range(8, -1, -1):
943 if os.path.isfile(f
'{pickleName}.backup_{i}'):
944 shutil.copyfile(f
'{pickleName}.backup_{i}', f
'{pickleName}.backup_{i+1}')
945 if os.path.isfile(pickleName):
946 shutil.copyfile(pickleName, f
'{pickleName}.backup_0')
947 pickle.dump((particles, configuration), open(pickleName,
'wb'))
950def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
952 The most important function of the FEI.
953 This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
954 The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
955 and the configuration (see config.py).
958 For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
959 i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
960 All weight files created during the training will be stored in your local database.
961 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
962 to the central database first (see documentation for the Belle2 Condition Database).
965 For application you call this function once, and it returns the whole path which will reconstruct B mesons
966 with an associated signal probability. You have to set configuration.training to False for application mode.
969 You can always turn on the monitoring (configuration.monitor != False),
970 to write out ROOT Histograms of many quantities for each stage,
971 using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.
974 This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
975 and the weight files will be automatically converted to the new naming scheme.
977 @param particles list of config.Particle objects
978 @param config config.FeiConfiguration object
981 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
982 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
983 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
985 Author: Thomas Keck 2014 - 2017
986 Please cite my PhD thesis
997 if configuration.training
and (configuration.monitor
and (configuration.monitoring_path !=
'')):
998 B2ERROR(
"FEI-core: Custom Monitoring path is not allowed during training!")
1000 if configuration.cache
is None:
1001 pickleName =
'Summary.pickle'
1002 if configuration.monitor:
1003 pickleName = os.path.join(configuration.monitoring_path, pickleName)
1005 if os.path.isfile(pickleName):
1006 particles_bkp, config_bkp = pickle.load(open(pickleName,
'rb'))
1008 for fd
in configuration._fields:
1009 if fd ==
'cache' or fd ==
'roundMode':
1011 if getattr(configuration, fd) != getattr(config_bkp, fd):
1013 f
"FEI-core: Configuration changed: {fd} from {getattr(config_bkp, fd)} to {getattr(configuration, fd)}")
1015 configuration = config_bkp
1016 cache = configuration.cache
1017 print(
"Cache: Replaced particles from steering and configuration from Summary.pickle: ", cache, configuration.roundMode)
1019 if configuration.training:
1024 cache = configuration.cache
1027 path = basf2.create_path()
1032 stages = get_stages_from_particles(particles)
1037 if configuration.legacy
is not None:
1038 convert_legacy_training(particles, configuration)
1046 training_data_information = TrainingDataInformation(particles, outputPath=configuration.monitoring_path)
1047 if cache < 0
and configuration.training:
1048 print(
"Stage 0: Run over all files to count the number of events and McParticles")
1049 path.add_path(training_data_information.reconstruct())
1050 if configuration.training:
1051 save_summary(particles, configuration, 0)
1052 return FeiState(path, 0, [], [], [])
1053 elif not configuration.training
and configuration.monitor:
1054 path.add_path(training_data_information.reconstruct())
1060 loader = FSPLoader(particles, configuration)
1062 print(
"Stage 0: Load FSP particles")
1063 path.add_path(loader.reconstruct())
1086 for stage, stage_particles
in enumerate(stages):
1087 if len(stage_particles) == 0:
1088 print(f
"Stage {stage}: No particles to reconstruct in this stage, skipping!")
1091 pre_reconstruction = PreReconstruction(stage_particles, configuration)
1092 post_reconstruction = PostReconstruction(stage_particles, configuration)
1095 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
1096 path.add_path(pre_reconstruction.reconstruct())
1097 if configuration.training
and not (post_reconstruction.available()
and configuration.roundMode == 0):
1098 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
1099 mc_counts = training_data_information.get_mc_counts()
1100 training_data = TrainingData(stage_particles, configuration, mc_counts)
1101 path.add_path(training_data.reconstruct())
1102 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
1105 used_lists += [particle.identifier
for particle
in stage_particles]
1106 if (stage >= cache - 1)
and not ((configuration.roundMode == 1)
and configuration.training):
1107 if (configuration.roundMode == 3)
and configuration.training:
1108 print(f
"Stage {stage}: BDTs already applied for particles, no postReco needed: ", [p.name
for p
in stage_particles])
1110 print(f
"Stage {stage}: Apply BDT for particles: ", [p.name
for p
in stage_particles])
1111 if configuration.training
and not post_reconstruction.available():
1112 raise RuntimeError(
"FEI-core: training of current stage was not successful, please retrain!")
1113 path.add_path(post_reconstruction.reconstruct())
1114 if (((configuration.roundMode == 2)
or (configuration.roundMode == 3))
and configuration.training):
1116 fsps_of_all_stages = [fsp
for sublist
in get_stages_from_particles(loader.get_fsp_lists())
for fsp
in sublist]
1119 if configuration.training
and (configuration.roundMode == 3):
1120 dontRemove = used_lists + fsps_of_all_stages
1122 cleanup = basf2.register_module(
'RemoveParticlesNotInLists')
1123 print(
"FEI-REtrain: pruning basf2_input.root of higher stages")
1124 cleanup.param(
'particleLists', dontRemove)
1125 path.add_module(cleanup)
1129 excludedParticlesNonConjugated = [p.identifier
for p
in particles
if p.identifier
not in dontRemove]
1130 excludedParticles = [
1131 str(name)
for name
in list(
1132 ROOT.Belle2.ParticleListName.addAntiParticleLists(excludedParticlesNonConjugated))]
1133 root_file = ROOT.TFile.Open(
'basf2_input.root',
"READ")
1134 tree = root_file.Get(
'tree')
1135 for branch
in tree.GetListOfBranches():
1136 branchName = branch.GetName()
1137 if any(exParticle
in branchName
for exParticle
in excludedParticles):
1138 excludelists.append(branchName)
1139 print(
"Exclude lists from output: ", excludelists)
1143 if configuration.monitor:
1144 print(
"Add ModuleStatistics")
1145 output = basf2.register_module(
'RootOutput')
1146 output.param(
'outputFileName', os.path.join(configuration.monitoring_path,
'Monitor_ModuleStatistics.root'))
1147 output.param(
'branchNames', [
'EventMetaData'])
1148 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
1149 output.param(
'ignoreCommandLineOverride',
True)
1150 path.add_module(output)
1154 if configuration.training
or configuration.monitor:
1155 print(
"Save Summary.pickle")
1156 save_summary(particles, configuration, stage+1, pickleName=os.path.join(configuration.monitoring_path,
'Summary.pickle'))
1159 return FeiState(path, stage+1, plists=used_lists, fsplists=fsps_of_all_stages, excludelists=excludelists)