14 The Full Event Interpretation Algorithm
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
23 Read this file if you want to understand the technical details of the FEI.
25 The FEI follows a hierarchical approach.
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
31 Stage 3: D and Lambda_c mesons
36 Most stages consists of:
37 - Create Particle Candidates
40 - Apply a multivariate classification method
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
50from basf2
import B2INFO, B2WARNING, B2ERROR
52import modularAnalysis
as ma
72FeiState = collections.namedtuple(
'FeiState',
'path, stage, plists')
75class TrainingDataInformation:
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory).
82 Secondly we can use this information for the generation of the monitoring pdfs,
83 where we calculate reconstruction efficiencies.
86 def __init__(self, particles: typing.Sequence[config.Particle], outputPath: str =
''):
88 Create a new TrainingData object
89 @param particles list of config.Particle objects
90 @param outputPath path to the output directory
93 self.particles = particles
95 self.filename = os.path.join(outputPath,
'mcParticlesCount.root')
97 def available(self) -> bool:
99 Check if the relevant information is already available
101 return os.path.isfile(self.filename)
103 def reconstruct(self) -> pybasf2.Path:
105 Returns pybasf2.Path which counts the number of MCParticles in each event.
106 @param particles list of config.Particle objects
109 pdgs = {abs(
pdg.from_name(particle.name))
for particle
in self.particles}
111 path = basf2.create_path()
112 module = basf2.register_module(
'VariablesToHistogram')
113 module.set_name(
"VariablesToHistogram_MCCount")
114 module.param(
'variables', [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs])
115 module.param(
'fileName', self.filename)
116 module.param(
'ignoreCommandLineOverride',
True)
117 path.add_module(module)
120 def get_mc_counts(self):
122 Read out the number of MC particles from the file created by reconstruct
127 root_file = ROOT.TFile.Open(self.filename,
'read')
130 for key
in root_file.GetListOfKeys():
131 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
132 pdg = abs(int(variable[len(
'NumberOfMCParticlesInEvent('):-len(
")")]))
135 mc_counts[pdg][
'sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
136 for bin
in range(hist.GetNbinsX()))
137 mc_counts[pdg][
'std'] = hist.GetStdDev()
138 mc_counts[pdg][
'avg'] = hist.GetMean()
139 mc_counts[pdg][
'max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
140 mc_counts[pdg][
'min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
143 mc_counts[0][
'sum'] = hist.GetEntries()
150 Steers the loading of FSP particles.
151 This does NOT include RootInput, Geometry or anything required before loading FSPs,
152 the user has to add this himself (because it depends on the MC campaign and if you want
153 to use Belle or Belle II).
156 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
158 Create a new FSPLoader object
159 @param particles list of config.Particle objects
160 @param config config.FeiConfiguration object
163 self.particles = particles
167 def reconstruct(self) -> pybasf2.Path:
169 Returns pybasf2.Path which loads the FSP Particles
171 path = basf2.create_path()
174 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
175 (
'mu+:FSP',
''), (
'p+:FSP',
'')], writeOut=
True, path=path)
176 for outputList, inputList
in [(
'gamma:FSP',
'gamma:mdst'), (
'K_S0:V0',
'K_S0:mdst'),
177 (
'Lambda0:V0',
'Lambda0:mdst'), (
'K_L0:FSP',
'K_L0:mdst'),
178 (
'pi0:FSP',
'pi0:mdst'), (
'gamma:V0',
'gamma:v0mdst')]:
179 ma.copyParticles(outputList, inputList, writeOut=
True, path=path)
181 ma.fillParticleLists([(
'K+:FSP',
''), (
'pi+:FSP',
''), (
'e+:FSP',
''),
182 (
'mu+:FSP',
''), (
'gamma:FSP',
''),
183 (
'p+:FSP',
''), (
'K_L0:FSP',
'')], writeOut=
True, path=path)
184 ma.fillParticleList(
'K_S0:V0 -> pi+ pi-',
'', writeOut=
True, path=path)
185 ma.fillParticleList(
'Lambda0:V0 -> p+ pi-',
'', writeOut=
True, path=path)
186 ma.fillConvertedPhotonsList(
'gamma:V0 -> e+ e-',
'', writeOut=
True, path=path)
188 if self.config.monitor:
189 names = [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'K_S0',
'p+',
'K_L0',
'Lambda0',
'pi0']
190 filename = os.path.join(self.config.monitoring_path,
'Monitor_FSPLoader.root')
192 variables = [(f
'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5)
for pdg
in pdgs]
193 ma.variablesToHistogram(
'', variables=variables, filename=filename, ignoreCommandLineOverride=
True, path=path)
199 Steers the creation of the training data.
200 The training data is used to train a multivariate classifier for each channel.
201 The training of the FEI at its core is just generating this training data for each channel.
202 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
205 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
206 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
208 Create a new TrainingData object
209 @param particles list of config.Particle objects
210 @param config config.FeiConfiguration object
211 @param mc_counts containing number of MC Particles
214 self.particles = particles
218 self.mc_counts = mc_counts
220 def reconstruct(self) -> pybasf2.Path:
222 Returns pybasf2.Path which creates the training data for the given particles
224 path = basf2.create_path()
226 for particle
in self.particles:
228 nSignal = self.mc_counts[pdgcode][
'sum']
236 for channel
in particle.channels:
237 filename =
'training_input.root'
240 nBackground = self.mc_counts[0][
'sum'] * channel.preCutConfig.bestCandidateCut
241 inverseSamplingRates = {}
244 if nBackground > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noBackgroundSampling:
245 inverseSamplingRates[0] = max(
246 1, int((int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1) * channel.preCutConfig.bkgSamplingFactor))
247 elif channel.preCutConfig.bkgSamplingFactor > 1:
248 inverseSamplingRates[0] = int(channel.preCutConfig.bkgSamplingFactor)
250 if nSignal > Teacher.MaximumNumberOfMVASamples
and not channel.preCutConfig.noSignalSampling:
251 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
253 spectators = [channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
254 if channel.mvaConfig.sPlotVariable
is not None:
255 spectators.append(channel.mvaConfig.sPlotVariable)
257 if self.config.monitor:
258 hist_variables = [
'mcErrors',
'mcParticleStatus'] + channel.mvaConfig.variables + spectators
259 hist_variables_2d = [(x, channel.mvaConfig.target)
260 for x
in channel.mvaConfig.variables + spectators
if x
is not channel.mvaConfig.target]
261 hist_filename = os.path.join(self.config.monitoring_path,
'Monitor_TrainingData.root')
262 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
263 variables_2d=config.variables2binnings_2d(hist_variables_2d),
264 filename=hist_filename,
265 ignoreCommandLineOverride=
True,
266 directory=config.removeJPsiSlash(f
'{channel.label}'), path=path)
268 teacher = basf2.register_module(
'VariablesToNtuple')
269 teacher.set_name(
'VariablesToNtuple_' + channel.name)
270 teacher.param(
'fileName', filename)
271 teacher.param(
'treeName', f
'{channel.label} variables')
272 teacher.param(
'variables', channel.mvaConfig.variables + spectators)
273 teacher.param(
'particleList', channel.name)
274 teacher.param(
'sampling', (channel.mvaConfig.target, inverseSamplingRates))
275 teacher.param(
'ignoreCommandLineOverride',
True)
276 path.add_module(teacher)
280class PreReconstruction:
282 Steers the reconstruction phase before the mva method was applied
284 - The ParticleCombination (for each particle and channel we create candidates using
285 the daughter candidates from the previous stages)
287 - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
288 but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
289 this will speed up the whole FEI by a factor 2-3)
292 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
294 Create a new PreReconstruction object
295 @param particles list of config.Particle objects
296 @param config config.FeiConfiguration object
299 self.particles = particles
303 def reconstruct(self) -> pybasf2.Path:
305 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
307 path = basf2.create_path()
309 for particle
in self.particles:
310 for channel
in particle.channels:
313 channel.daughters[0].split(
':')[0]) ==
pdg.from_name(particle.name)):
314 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=
True, path=path)
315 v2EI = basf2.register_module(
'VariablesToExtraInfo')
316 v2EI.set_name(
'VariablesToExtraInfo_' + channel.name)
317 v2EI.param(
'particleList', channel.name)
318 v2EI.param(
'variables', {f
'constant({channel.decayModeID})':
'decayModeID'})
320 v2EI.set_log_level(basf2.logging.log_level.ERROR)
321 path.add_module(v2EI)
323 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
324 writeOut=
True, path=path)
325 if self.config.monitor:
326 ma.matchMCTruth(channel.name, path=path)
327 bc_variable = channel.preCutConfig.bestCandidateVariable
328 if self.config.monitor ==
'simple':
329 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
330 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
332 hist_variables = [bc_variable,
'mcErrors',
'mcParticleStatus',
333 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
334 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
335 (bc_variable,
'mcErrors'),
336 (bc_variable,
'mcParticleStatus')]
337 for specVar
in channel.mvaConfig.spectators:
338 hist_variables_2d.append((bc_variable, specVar))
339 hist_variables_2d.append((channel.mvaConfig.target, specVar))
340 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_BeforeRanking.root')
341 ma.variablesToHistogram(
343 variables=config.variables2binnings(hist_variables),
344 variables_2d=config.variables2binnings_2d(hist_variables_2d),
346 ignoreCommandLineOverride=
True,
347 directory=f
'{channel.label}',
350 if channel.preCutConfig.bestCandidateMode ==
'lowest':
351 ma.rankByLowest(channel.name,
352 channel.preCutConfig.bestCandidateVariable,
353 channel.preCutConfig.bestCandidateCut,
356 elif channel.preCutConfig.bestCandidateMode ==
'highest':
357 ma.rankByHighest(channel.name,
358 channel.preCutConfig.bestCandidateVariable,
359 channel.preCutConfig.bestCandidateCut,
363 raise RuntimeError(
"Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
365 if 'gamma' in channel.decayString
and channel.pi0veto:
366 ma.buildRestOfEvent(channel.name, path=path)
367 Ddaughter_roe_path = basf2.Path()
368 deadEndPath = basf2.Path()
369 ma.signalSideParticleFilter(channel.name,
'', Ddaughter_roe_path, deadEndPath)
370 ma.fillParticleList(
'gamma:roe',
'isInRestOfEvent == 1', path=Ddaughter_roe_path)
372 matches = list(re.finditer(
'gamma', channel.decayString))
374 for igamma
in range(len(matches)):
375 start, end = matches[igamma-1].span()
376 tempString = channel.decayString[:start] +
'^gamma' + channel.decayString[end:]
377 ma.fillSignalSideParticleList(f
'gamma:sig_{igamma}', tempString, path=Ddaughter_roe_path)
378 ma.reconstructDecay(f
'pi0:veto_{igamma} -> gamma:sig_{igamma} gamma:roe',
'', path=Ddaughter_roe_path)
379 pi0lists.append(f
'pi0:veto_{igamma}')
380 ma.copyLists(
'pi0:veto', pi0lists, writeOut=
False, path=Ddaughter_roe_path)
381 ma.rankByLowest(
'pi0:veto',
'abs(dM)', 1, path=Ddaughter_roe_path)
382 ma.matchMCTruth(
'pi0:veto', path=Ddaughter_roe_path)
383 ma.variableToSignalSideExtraInfo(
386 'InvM':
'pi0vetoMass',
387 'formula((daughter(0,E)-daughter(1,E))/(daughter(0,E)+daughter(1,E)))':
'pi0vetoEnergyAsymmetry',
389 path=Ddaughter_roe_path
391 path.for_each(
'RestOfEvent',
'RestOfEvents', Ddaughter_roe_path)
393 if self.config.monitor:
394 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_AfterRanking.root')
395 if self.config.monitor !=
'simple':
396 hist_variables += [
'extraInfo(preCut_rank)']
397 hist_variables_2d += [(
'extraInfo(preCut_rank)', channel.mvaConfig.target),
398 (
'extraInfo(preCut_rank)',
'mcErrors'),
399 (
'extraInfo(preCut_rank)',
'mcParticleStatus')]
400 for specVar
in channel.mvaConfig.spectators:
401 hist_variables_2d.append((
'extraInfo(preCut_rank)', specVar))
402 ma.variablesToHistogram(
404 variables=config.variables2binnings(hist_variables),
405 variables_2d=config.variables2binnings_2d(hist_variables_2d),
407 ignoreCommandLineOverride=
True,
408 directory=f
'{channel.label}',
412 elif self.config.training:
413 ma.matchMCTruth(channel.name, path=path)
416 pvfit = basf2.register_module(
'ParticleVertexFitter')
417 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
418 pvfit.param(
'listName', channel.name)
419 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
420 pvfit.param(
'vertexFitter',
'KFit')
421 pvfit.param(
'fitType',
'vertex')
422 pvfit.set_log_level(basf2.logging.log_level.ERROR)
423 path.add_module(pvfit)
424 elif re.findall(
r"[\w']+", channel.decayString).count(
'pi0') > 1
and particle.name !=
'pi0':
425 basf2.B2INFO(f
"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
426 elif len(channel.daughters) > 1:
427 pvfit = basf2.register_module(
'ParticleVertexFitter')
428 pvfit.set_name(
'ParticleVertexFitter_' + channel.name)
429 pvfit.param(
'listName', channel.name)
430 pvfit.param(
'confidenceLevel', channel.preCutConfig.vertexCut)
431 pvfit.param(
'vertexFitter',
'KFit')
432 if particle.name
in [
'pi0']:
433 pvfit.param(
'fitType',
'mass')
435 pvfit.param(
'fitType',
'vertex')
436 pvfit.set_log_level(basf2.logging.log_level.ERROR)
437 path.add_module(pvfit)
439 if self.config.monitor:
440 if self.config.monitor ==
'simple':
441 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
442 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
444 hist_variables = [
'chiProb',
'mcErrors',
'mcParticleStatus',
445 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
446 hist_variables_2d = [(
'chiProb', channel.mvaConfig.target),
447 (
'chiProb',
'mcErrors'),
448 (
'chiProb',
'mcParticleStatus')]
449 for specVar
in channel.mvaConfig.spectators:
450 hist_variables_2d.append((
'chiProb', specVar))
451 hist_variables_2d.append((channel.mvaConfig.target, specVar))
452 filename = os.path.join(self.config.monitoring_path,
'Monitor_PreReconstruction_AfterVertex.root')
453 ma.variablesToHistogram(
455 variables=config.variables2binnings(hist_variables),
456 variables_2d=config.variables2binnings_2d(hist_variables_2d),
458 ignoreCommandLineOverride=
True,
459 directory=f
'{channel.label}',
465class PostReconstruction:
467 Steers the reconstruction phase after the mva method was applied
469 - The application of the mva method itself.
470 - Copying all channel lists in a common one for each particle defined in particles
471 - Tag unique signal candidates, to avoid double counting of channels with overlap
474 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
476 Create a new PostReconstruction object
477 @param particles list of config.Particle objects
478 @param config config.FeiConfiguration object
481 self.particles = particles
485 def get_missing_channels(self) -> typing.Sequence[str]:
487 Returns all channels for which the weightfile is missing
490 for particle
in self.particles:
491 for channel
in particle.channels:
493 weightfile = channel.label +
'.xml'
494 if not basf2_mva.available(weightfile):
495 missing += [channel.label]
498 def available(self) -> bool:
500 Check if the relevant information is already available
502 return len(self.get_missing_channels()) == 0
504 def reconstruct(self) -> pybasf2.Path:
506 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
508 path = basf2.create_path()
510 for particle
in self.particles:
511 for channel
in particle.channels:
512 expert = basf2.register_module(
'MVAExpert')
513 expert.set_name(
'MVAExpert_' + channel.name)
514 if self.config.training:
515 expert.param(
'identifier', channel.label +
'.xml')
517 expert.param(
'identifier', self.config.prefix +
'_' + channel.label)
518 expert.param(
'extraInfoName',
'SignalProbability')
519 expert.param(
'listNames', [channel.name])
521 expert.set_log_level(basf2.logging.log_level.ERROR)
522 path.add_module(expert)
524 if self.config.monitor:
525 if self.config.monitor ==
'simple':
526 hist_variables = [channel.mvaConfig.target,
'extraInfo(decayModeID)']
527 hist_variables_2d = [(channel.mvaConfig.target,
'extraInfo(decayModeID)')]
529 hist_variables = [
'mcErrors',
531 'extraInfo(SignalProbability)',
532 channel.mvaConfig.target,
533 'extraInfo(decayModeID)'] + list(channel.mvaConfig.spectators.keys())
534 hist_variables_2d = [(
'extraInfo(SignalProbability)', channel.mvaConfig.target),
535 (
'extraInfo(SignalProbability)',
'mcErrors'),
536 (
'extraInfo(SignalProbability)',
'mcParticleStatus'),
537 (
'extraInfo(decayModeID)', channel.mvaConfig.target),
538 (
'extraInfo(decayModeID)',
'mcErrors'),
539 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
540 for specVar
in channel.mvaConfig.spectators:
541 hist_variables_2d.append((
'extraInfo(SignalProbability)', specVar))
542 hist_variables_2d.append((
'extraInfo(decayModeID)', specVar))
543 hist_variables_2d.append((channel.mvaConfig.target, specVar))
544 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_AfterMVA.root')
545 ma.variablesToHistogram(
547 variables=config.variables2binnings(hist_variables),
548 variables_2d=config.variables2binnings_2d(hist_variables_2d),
550 ignoreCommandLineOverride=
True,
551 directory=f
'{channel.label}',
555 if particle.postCutConfig.value > 0.0:
556 cutstring = str(particle.postCutConfig.value) +
' < extraInfo(SignalProbability)'
558 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name
for c
in particle.channels],
559 variable=
'particleSource', writeOut=
True, path=path)
561 if self.config.monitor:
562 if self.config.monitor ==
'simple':
563 hist_variables = [particle.mvaConfig.target,
'extraInfo(decayModeID)']
564 hist_variables_2d = [(particle.mvaConfig.target,
'extraInfo(decayModeID)')]
566 hist_variables = [
'mcErrors',
568 'extraInfo(SignalProbability)',
569 particle.mvaConfig.target,
570 'extraInfo(decayModeID)'] + list(particle.mvaConfig.spectators.keys())
571 hist_variables_2d = [(
'extraInfo(decayModeID)', particle.mvaConfig.target),
572 (
'extraInfo(decayModeID)',
'mcErrors'),
573 (
'extraInfo(decayModeID)',
'mcParticleStatus')]
574 for specVar
in particle.mvaConfig.spectators:
575 hist_variables_2d.append((
'extraInfo(SignalProbability)', specVar))
576 hist_variables_2d.append((
'extraInfo(decayModeID)', specVar))
577 hist_variables_2d.append((particle.mvaConfig.target, specVar))
578 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_BeforePostCut.root')
579 ma.variablesToHistogram(
581 variables=config.variables2binnings(hist_variables),
582 variables_2d=config.variables2binnings_2d(hist_variables_2d),
584 ignoreCommandLineOverride=
True,
585 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
588 ma.applyCuts(particle.identifier, cutstring, path=path)
590 if self.config.monitor:
591 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_BeforeRanking.root')
592 ma.variablesToHistogram(
594 variables=config.variables2binnings(hist_variables),
595 variables_2d=config.variables2binnings_2d(hist_variables_2d),
597 ignoreCommandLineOverride=
True,
598 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
601 ma.rankByHighest(particle.identifier,
'extraInfo(SignalProbability)',
602 particle.postCutConfig.bestCandidateCut,
'postCut_rank', path=path)
604 uniqueSignal = basf2.register_module(
'TagUniqueSignal')
605 uniqueSignal.param(
'particleList', particle.identifier)
606 uniqueSignal.param(
'target', particle.mvaConfig.target)
607 uniqueSignal.param(
'extraInfoName',
'uniqueSignal')
608 uniqueSignal.set_name(
'TagUniqueSignal_' + particle.identifier)
610 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
611 path.add_module(uniqueSignal)
613 if self.config.monitor:
614 if self.config.monitor !=
'simple':
615 hist_variables += [
'extraInfo(postCut_rank)']
616 hist_variables_2d += [(
'extraInfo(decayModeID)',
'extraInfo(postCut_rank)'),
617 (particle.mvaConfig.target,
'extraInfo(postCut_rank)'),
618 (
'mcErrors',
'extraInfo(postCut_rank)'),
619 (
'mcParticleStatus',
'extraInfo(postCut_rank)')]
620 for specVar
in particle.mvaConfig.spectators:
621 hist_variables_2d.append((
'extraInfo(postCut_rank)', specVar))
622 filename = os.path.join(self.config.monitoring_path,
'Monitor_PostReconstruction_AfterRanking.root')
623 ma.variablesToHistogram(
625 variables=config.variables2binnings(hist_variables),
626 variables_2d=config.variables2binnings_2d(hist_variables_2d),
628 ignoreCommandLineOverride=
True,
629 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
632 filename = os.path.join(self.config.monitoring_path,
'Monitor_Final.root')
633 if self.config.monitor ==
'simple':
634 hist_variables = [
'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
635 hist_variables_2d = [(
'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)')]
636 ma.variablesToHistogram(
638 variables=config.variables2binnings(hist_variables),
639 variables_2d=config.variables2binnings_2d(hist_variables_2d),
641 ignoreCommandLineOverride=
True,
642 directory=config.removeJPsiSlash(f
'{particle.identifier}'),
645 variables = [
'extraInfo(SignalProbability)',
'mcErrors',
'mcParticleStatus', particle.mvaConfig.target,
646 'extraInfo(uniqueSignal)',
'extraInfo(decayModeID)']
647 variables += list(particle.mvaConfig.spectators.keys())
649 ma.variablesToNtuple(
652 treename=config.removeJPsiSlash(f
'{particle.identifier} variables'),
654 ignoreCommandLineOverride=
True,
661 Performs all necessary trainings for all training data files which are
662 available but where there is no weight file available yet.
663 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
664 The trainings are run in parallel using multi-threading of python.
665 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
666 * basf2_mva_teacher, the training will be done directly on the machine
667 * externClustTeacher, the training will be submitted to the batch system of KEKCC
671 MaximumNumberOfMVASamples = int(1e7)
674 MinimumNumberOfMVASamples = int(5e2)
676 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
678 Create a new Teacher object
679 @param particles list of config.Particle objects
680 @param config config.FeiConfiguration object
683 self.particles = particles
688 def create_fake_weightfile(channel: str):
690 Create a fake weight file using the trivial method, it will always return 0.0
691 @param channel for which we create a fake weight file
694 <?xml version="1.0" encoding="utf-8"?>
695 <method>Trivial</method>
696 <weightfile>{channel}.xml</weightfile>
697 <treename>tree</treename>
698 <target_variable>isSignal</target_variable>
699 <weight_variable>__weight__</weight_variable>
700 <signal_class>1</signal_class>
701 <max_events>0</max_events>
702 <number_feature_variables>1</number_feature_variables>
703 <variable0>M</variable0>
704 <number_spectator_variables>0</number_spectator_variables>
705 <number_data_files>1</number_data_files>
706 <datafile0>train.root</datafile0>
707 <Trivial_version>1</Trivial_version>
708 <Trivial_output>0</Trivial_output>
709 <signal_fraction>0.066082567</signal_fraction>
711 with open(channel +
".xml",
"w")
as f:
715 def check_if_weightfile_is_fake(filename: str):
717 Checks if the provided filename is a fake-weight file or not
718 @param filename the filename of the weight file
721 return '<method>Trivial</method>' in open(filename).readlines()[2]
722 except BaseException:
726 def upload(self, channel: str):
728 Upload the weight file into the condition database
729 @param channel whose weight file is uploaded
731 disk = channel +
'.xml'
732 dbase = self.config.prefix +
'_' + channel
733 basf2_mva.upload(disk, dbase)
736 def do_all_trainings(self):
738 Do all trainings for which we find training data
744 ROOT.PyConfig.StartGuiThread =
False
746 filename =
'training_input.root'
747 if not os.path.isfile(filename):
748 B2WARNING(
"Training of MVC failed. Couldn't find ROOT file. "
749 "No weight files will be provided.")
751 f = ROOT.TFile.Open(filename,
'read')
753 B2WARNING(
"Training of MVC failed. ROOT file corrupt. "
754 "No weight files will be provided.")
755 elif len([k.GetName()
for k
in f.GetListOfKeys()]) == 0:
756 B2WARNING(
"Training of MVC failed. ROOT file does not contain any trees. "
757 "No weight files will be provided.")
759 for particle
in self.particles:
760 for channel
in particle.channels:
761 weightfile = channel.label +
'.xml'
762 if not basf2_mva.available(weightfile):
763 keys = [m
for m
in f.GetListOfKeys()
if f
"{channel.label} variables" in m.GetName()]
766 tree = keys[0].ReadObj()
767 nSig = tree.GetEntries(channel.mvaConfig.target +
' == 1')
768 nBg = tree.GetEntries(channel.mvaConfig.target +
' == 0')
769 if nSig < Teacher.MinimumNumberOfMVASamples:
770 B2WARNING(
"Training of MVC failed. "
771 f
"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
772 self.create_fake_weightfile(channel.label)
773 self.upload(channel.label)
775 if nBg < Teacher.MinimumNumberOfMVASamples:
776 B2WARNING(
"Training of MVC failed. "
777 f
"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
778 self.create_fake_weightfile(channel.label)
779 self.upload(channel.label)
781 variable_str =
"' '".join(channel.mvaConfig.variables)
783 spectators = list(channel.mvaConfig.spectators.keys())
784 if channel.mvaConfig.sPlotVariable
is not None:
785 spectators.append(channel.mvaConfig.sPlotVariable)
786 spectators_str =
"' '".join(spectators)
788 command = (f
"{self.config.externTeacher}"
789 f
" --method '{channel.mvaConfig.method}'"
790 f
" --target_variable '{channel.mvaConfig.target}'"
791 f
" --treename '{channel.label} variables' --datafile 'training_input.root'"
792 f
" --signal_class 1 --variables '{variable_str}'"
793 f
" --identifier '{channel.label}.xml'")
794 if len(spectators) > 0:
795 command += f
" --spectators '{spectators_str}'"
796 command += f
" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1"
797 B2INFO(f
"Used following command to invoke teacher: \n {command}")
798 job_list.append((channel.label, command))
800 p = multiprocessing.Pool(
None, maxtasksperchild=1)
801 func = functools.partial(subprocess.call, shell=
True)
802 p.map(func, [c
for _, c
in job_list])
806 for name, _
in job_list:
807 if not basf2_mva.available(name +
'.xml'):
808 B2WARNING(
"Training of MVC failed. For unknown reasons, check the logfile")
809 self.create_fake_weightfile(name)
810 weightfiles.append(self.upload(name))
814def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
816 Convert an old FEI training into the new format.
817 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
818 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
819 @param particles list of config.Particle objects
820 @param config config.FeiConfiguration object
822 summary = pickle.load(open(configuration.legacy,
'rb'))
823 channel2lists = {k: v[2]
for k, v
in summary[
'channel2lists'].items()}
825 teacher = Teacher(particles, configuration)
827 for particle
in particles:
828 for channel
in particle.channels:
829 new_weightfile = configuration.prefix +
'_' + channel.label
830 old_weightfile = configuration.prefix +
'_' + channel2lists[channel.label.replace(
'Jpsi',
'J/psi')]
831 if not basf2_mva.available(new_weightfile):
832 if old_weightfile
is None or not basf2_mva.available(old_weightfile):
833 Teacher.create_fake_weightfile(channel.label)
834 teacher.upload(channel.label)
836 basf2_mva.download(old_weightfile, channel.label +
'.xml')
837 teacher.upload(channel.label)
840def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
842 Returns the hierarchical structure of the FEI.
843 Each stage depends on the particles in the previous stage.
844 The final stage is empty (meaning everything is done, and the training is finished at this point).
845 @param particles list of config.Particle objects
848 [p
for p
in particles
if p.name
in [
'e+',
'K+',
'pi+',
'mu+',
'gamma',
'p+',
'K_L0']],
849 [p
for p
in particles
if p.name
in [
'pi0',
'J/psi',
'Lambda0']],
850 [p
for p
in particles
if p.name
in [
'K_S0',
'Sigma+']],
851 [p
for p
in particles
if (p.name
in [
'D+',
'D0',
'D_s+',
'Lambda_c+'])
and (
"tag" not in p.label.lower())],
852 [p
for p
in particles
if (p.name
in [
'D*+',
'D*0',
'D_s*+'])
and (
"tag" not in p.label.lower())],
853 [p
for p
in particles
if p.name
in [
'B0',
'B+',
'B_s0']
or (
"tag" in p.label.lower())],
858 if p.name
not in [p.name
for stage
in stages
for p
in stage]:
859 raise RuntimeError(f
"Unknown particle {p.name}: Not implemented in FEI")
864def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
866 Performs the training of mva classifiers for all available training data,
867 this function must be either called by the user after each stage of the FEI during training,
868 or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
869 @param particles list of config.Particle objects
870 @param config config.FeiConfiguration object
871 @return list of tuple with weight file on disk and identifier in database for all trained classifiers
873 teacher = Teacher(particles, configuration)
874 return teacher.do_all_trainings()
877def save_summary(particles: typing.Sequence[config.Particle],
878 configuration: config.FeiConfiguration,
880 pickleName: str =
'Summary.pickle'):
882 Creates the Summary.pickle, which is used to keep track of the stage during the training,
883 and can be used later to investigate which configuration was used exactly to create the training.
884 @param particles list of config.Particle objects
885 @param config config.FeiConfiguration object
886 @param cache current cache level
887 @param pickleName name of the pickle file
889 configuration = config.FeiConfiguration(configuration.prefix, cache,
890 configuration.monitor, configuration.legacy, configuration.externTeacher,
891 configuration.training)
893 for i
in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
894 if os.path.isfile(f
'{pickleName}.backup_{i}'):
895 shutil.copyfile(f
'{pickleName}.backup_{i}', f
'{pickleName}.backup_{i + 1}')
896 if os.path.isfile(pickleName):
897 shutil.copyfile(pickleName, f
'{pickleName}.backup_0')
898 pickle.dump((particles, configuration), open(pickleName,
'wb'))
901def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
903 The most important function of the FEI.
904 This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
905 The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
906 and the configuration (see config.py).
909 For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
910 i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
911 All weight files created during the training will be stored in your local database.
912 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
913 to the central database first (see documentation for the Belle2 Condition Database).
916 For application you call this function once, and it returns the whole path which will reconstruct B mesons
917 with an associated signal probability. You have to set configuration.training to False for application mode.
920 You can always turn on the monitoring (configuration.monitor != False),
921 to write out ROOT Histograms of many quantities for each stage,
922 using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.
925 This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
926 and the weight files will be automatically converted to the new naming scheme.
928 @param particles list of config.Particle objects
929 @param config config.FeiConfiguration object
932 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
933 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
934 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
936 Author: Thomas Keck 2014 - 2017
937 Please cite my PhD thesis
946 if configuration.training
and (configuration.monitor
and (configuration.monitoring_path !=
'')):
947 B2ERROR(
"FEI-core: Custom Monitoring path is not allowed during training!")
949 if configuration.cache
is None:
950 pickleName =
'Summary.pickle'
951 if configuration.monitor:
952 pickleName = os.path.join(configuration.monitoring_path, pickleName)
954 if os.path.isfile(pickleName):
955 print(
"Cache: Replaced particles and configuration with the ones from Summary.pickle!")
956 particles, configuration = pickle.load(open(pickleName,
'rb'))
957 cache = configuration.cache
959 if configuration.training:
964 cache = configuration.cache
967 path = basf2.create_path()
972 stages = get_stages_from_particles(particles)
977 if configuration.legacy
is not None:
978 convert_legacy_training(particles, configuration)
986 training_data_information = TrainingDataInformation(particles, outputPath=configuration.monitoring_path)
987 if cache < 0
and configuration.training:
988 print(
"Stage 0: Run over all files to count the number of events and McParticles")
989 path.add_path(training_data_information.reconstruct())
990 if configuration.training:
991 save_summary(particles, configuration, 0)
992 return FeiState(path, 0, [])
993 elif not configuration.training
and configuration.monitor:
994 path.add_path(training_data_information.reconstruct())
1000 loader = FSPLoader(particles, configuration)
1002 print(
"Stage 0: Load FSP particles")
1003 path.add_path(loader.reconstruct())
1026 for stage, stage_particles
in enumerate(stages):
1027 pre_reconstruction = PreReconstruction(stage_particles, configuration)
1029 print(f
"Stage {stage}: PreReconstruct particles: ", [p.name
for p
in stage_particles])
1030 path.add_path(pre_reconstruction.reconstruct())
1032 post_reconstruction = PostReconstruction(stage_particles, configuration)
1033 if configuration.training
and not post_reconstruction.available():
1034 print(f
"Stage {stage}: Create training data for particles: ", [p.name
for p
in stage_particles])
1035 mc_counts = training_data_information.get_mc_counts()
1036 training_data = TrainingData(stage_particles, configuration, mc_counts)
1037 path.add_path(training_data.reconstruct())
1038 used_lists += [channel.name
for particle
in stage_particles
for channel
in particle.channels]
1040 if cache <= stage + 1:
1041 path.add_path(post_reconstruction.reconstruct())
1042 used_lists += [particle.identifier
for particle
in stage_particles]
1046 if configuration.monitor:
1047 output = basf2.register_module(
'RootOutput')
1048 output.param(
'outputFileName', os.path.join(configuration.monitoring_path,
'Monitor_ModuleStatistics.root'))
1049 output.param(
'branchNames', [
'EventMetaData'])
1050 output.param(
'branchNamesPersistent', [
'ProcessStatistics'])
1051 output.param(
'ignoreCommandLineOverride',
True)
1052 path.add_module(output)
1056 if configuration.training
or configuration.monitor:
1057 save_summary(particles, configuration, stage + 1, pickleName=os.path.join(configuration.monitoring_path,
'Summary.pickle'))
1060 return FeiState(path, stage + 1, plists=used_lists)