Belle II Software light-2505-deimos
core.py
1#!/usr/bin/env python3
2
3
10
11# @cond SUPPRESS_DOXYGEN
12
13"""
14 The Full Event Interpretation Algorithm
15
16 Some basic facts:
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
22
23 Read this file if you want to understand the technical details of the FEI.
24
25 The FEI follows a hierarchical approach.
26 There are 7 stages:
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
30 Stage 2: K_S0, Sigma+
31 Stage 3: D and Lambda_c mesons
32 Stage 4: D* mesons
33 Stage 5: B mesons
34 Stage 6: Finish
35
36 Most stages consists of:
37 - Create Particle Candidates
38 - Apply Cuts
39 - Do vertex Fitting
40 - Apply a multivariate classification method
41 - Apply more Cuts
42
43 The FEI will reconstruct these 7 stages during the training phase,
44 since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
45 to train all the necessary multivariate classifiers.
46"""
47
48# Import basf2
49import basf2
50from basf2 import B2INFO, B2WARNING, B2ERROR
51import pybasf2
52import modularAnalysis as ma
53import b2bii
54
55# Should come after basf2 import
56import pdg
57from fei import config
58import basf2_mva
59
60# Standard python modules
61import collections
62import os
63import shutil
64import typing
65import pickle
66import re
67import functools
68import subprocess
69import multiprocessing
70
71# Simple object containing the output of fei
72FeiState = collections.namedtuple('FeiState', 'path, stage, plists')
73
74
75class TrainingDataInformation:
76 """
77 Contains the relevant information about the used training data.
78 Basically we write out the number of MC particles in the whole dataset.
79 This numbers we can use to calculate what fraction of candidates we have to write
80 out as TrainingData to get a reasonable amount of candidates to train on
81 (too few candidates will lead to heavy overtraining, too many won't fit into memory).
82 Secondly we can use this information for the generation of the monitoring pdfs,
83 where we calculate reconstruction efficiencies.
84 """
85
86 def __init__(self, particles: typing.Sequence[config.Particle], outputPath: str = ''):
87 """
88 Create a new TrainingData object
89 @param particles list of config.Particle objects
90 @param outputPath path to the output directory
91 """
92
93 self.particles = particles
94
95 self.filename = os.path.join(outputPath, 'mcParticlesCount.root')
96
97 def available(self) -> bool:
98 """
99 Check if the relevant information is already available
100 """
101 return os.path.isfile(self.filename)
102
103 def reconstruct(self) -> pybasf2.Path:
104 """
105 Returns pybasf2.Path which counts the number of MCParticles in each event.
106 @param particles list of config.Particle objects
107 """
108 # Unique absolute pdg-codes of all particles
109 pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}
110
111 path = basf2.create_path()
112 module = basf2.register_module('VariablesToHistogram')
113 module.set_name("VariablesToHistogram_MCCount")
114 module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
115 module.param('fileName', self.filename)
116 module.param('ignoreCommandLineOverride', True)
117 path.add_module(module)
118 return path
119
120 def get_mc_counts(self):
121 """
122 Read out the number of MC particles from the file created by reconstruct
123 """
124 # Unique absolute pdg-codes of all particles
125 # Always avoid the top-level 'import ROOT'.
126 import ROOT # noqa
127 root_file = ROOT.TFile.Open(self.filename, 'read')
128 mc_counts = {}
129
130 for key in root_file.GetListOfKeys():
131 variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
132 pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
133 hist = key.ReadObj()
134 mc_counts[pdg] = {}
135 mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
136 for bin in range(hist.GetNbinsX()))
137 mc_counts[pdg]['std'] = hist.GetStdDev()
138 mc_counts[pdg]['avg'] = hist.GetMean()
139 mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
140 mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))
141
142 mc_counts[0] = {}
143 mc_counts[0]['sum'] = hist.GetEntries()
144 root_file.Close()
145 return mc_counts
146
147
148class FSPLoader:
149 """
150 Steers the loading of FSP particles.
151 This does NOT include RootInput, Geometry or anything required before loading FSPs,
152 the user has to add this himself (because it depends on the MC campaign and if you want
153 to use Belle or Belle II).
154 """
155
156 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
157 """
158 Create a new FSPLoader object
159 @param particles list of config.Particle objects
160 @param config config.FeiConfiguration object
161 """
162
163 self.particles = particles
164
165 self.config = config
166
167 def reconstruct(self) -> pybasf2.Path:
168 """
169 Returns pybasf2.Path which loads the FSP Particles
170 """
171 path = basf2.create_path()
172
173 if b2bii.isB2BII():
174 ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
175 ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
176 for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
177 ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
178 ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
179 ma.copyParticles(outputList, inputList, writeOut=True, path=path)
180 else:
181 ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
182 ('mu+:FSP', ''), ('gamma:FSP', ''),
183 ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
184 ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
185 ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
186 ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)
187
188 if self.config.monitor:
189 names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
190 filename = os.path.join(self.config.monitoring_path, 'Monitor_FSPLoader.root')
191 pdgs = {abs(pdg.from_name(name)) for name in names}
192 variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
193 ma.variablesToHistogram('', variables=variables, filename=filename, ignoreCommandLineOverride=True, path=path)
194 return path
195
196
197class TrainingData:
198 """
199 Steers the creation of the training data.
200 The training data is used to train a multivariate classifier for each channel.
201 The training of the FEI at its core is just generating this training data for each channel.
202 After we created the training data for a stage, we have to train the classifiers (see Teacher class further down).
203 """
204
205 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
206 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
207 """
208 Create a new TrainingData object
209 @param particles list of config.Particle objects
210 @param config config.FeiConfiguration object
211 @param mc_counts containing number of MC Particles
212 """
213
214 self.particles = particles
215
216 self.config = config
217
218 self.mc_counts = mc_counts
219
220 def reconstruct(self) -> pybasf2.Path:
221 """
222 Returns pybasf2.Path which creates the training data for the given particles
223 """
224 path = basf2.create_path()
225
226 for particle in self.particles:
227 pdgcode = abs(pdg.from_name(particle.name))
228 nSignal = self.mc_counts[pdgcode]['sum']
229 # For D-Mesons we usually have a efficiency of 10^-3 including branching fraction
230 if pdgcode > 400:
231 nSignal /= 1000
232 # For B-Mesons we usually have a efficiency of 10^-4 including branching fraction
233 if pdgcode > 500:
234 nSignal /= 10000
235
236 for channel in particle.channels:
237 filename = 'training_input.root'
238
239 # nBackground = nEvents * nBestCandidates
240 nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
241 inverseSamplingRates = {}
242 # For some very pure channels (Jpsi), this sampling can be too aggressive and training fails.
243 # It can therefore be disabled in the preCutConfig.
244 if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
245 inverseSamplingRates[0] = max(
246 1, int((int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1) * channel.preCutConfig.bkgSamplingFactor))
247 elif channel.preCutConfig.bkgSamplingFactor > 1:
248 inverseSamplingRates[0] = int(channel.preCutConfig.bkgSamplingFactor)
249
250 if nSignal > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noSignalSampling:
251 inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1
252
253 spectators = [channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
254 if channel.mvaConfig.sPlotVariable is not None:
255 spectators.append(channel.mvaConfig.sPlotVariable)
256
257 if self.config.monitor:
258 hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
259 hist_variables_2d = [(x, channel.mvaConfig.target)
260 for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
261 hist_filename = os.path.join(self.config.monitoring_path, 'Monitor_TrainingData.root')
262 ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
263 variables_2d=config.variables2binnings_2d(hist_variables_2d),
264 filename=hist_filename,
265 ignoreCommandLineOverride=True,
266 directory=config.removeJPsiSlash(f'{channel.label}'), path=path)
267
268 teacher = basf2.register_module('VariablesToNtuple')
269 teacher.set_name('VariablesToNtuple_' + channel.name)
270 teacher.param('fileName', filename)
271 teacher.param('treeName', f'{channel.label} variables')
272 teacher.param('variables', channel.mvaConfig.variables + spectators)
273 teacher.param('particleList', channel.name)
274 teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
275 teacher.param('ignoreCommandLineOverride', True)
276 path.add_module(teacher)
277 return path
278
279
280class PreReconstruction:
281 """
282 Steers the reconstruction phase before the mva method was applied
283 It Includes:
284 - The ParticleCombination (for each particle and channel we create candidates using
285 the daughter candidates from the previous stages)
286 - MC Matching
287 - Vertex Fitting (this is the slowest part of the whole FEI, KFit is used by default,
288 but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
289 this will speed up the whole FEI by a factor 2-3)
290 """
291
292 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
293 """
294 Create a new PreReconstruction object
295 @param particles list of config.Particle objects
296 @param config config.FeiConfiguration object
297 """
298
299 self.particles = particles
300
301 self.config = config
302
303 def reconstruct(self) -> pybasf2.Path:
304 """
305 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
306 """
307 path = basf2.create_path()
308
309 for particle in self.particles:
310 for channel in particle.channels:
311
312 if (len(channel.daughters) == 1) and (pdg.from_name(
313 channel.daughters[0].split(':')[0]) == pdg.from_name(particle.name)):
314 ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
315 v2EI = basf2.register_module('VariablesToExtraInfo')
316 v2EI.set_name('VariablesToExtraInfo_' + channel.name)
317 v2EI.param('particleList', channel.name)
318 v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
319 # suppress warning that decay mode ID won't be overwritten if it already exists
320 v2EI.set_log_level(basf2.logging.log_level.ERROR)
321 path.add_module(v2EI)
322 else:
323 ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
324 writeOut=True, path=path)
325 if self.config.monitor:
326 ma.matchMCTruth(channel.name, path=path)
327 bc_variable = channel.preCutConfig.bestCandidateVariable
328 if self.config.monitor == 'simple':
329 hist_variables = [channel.mvaConfig.target, 'extraInfo(decayModeID)']
330 hist_variables_2d = [(channel.mvaConfig.target, 'extraInfo(decayModeID)')]
331 else:
332 hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus',
333 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
334 hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
335 (bc_variable, 'mcErrors'),
336 (bc_variable, 'mcParticleStatus')]
337 for specVar in channel.mvaConfig.spectators:
338 hist_variables_2d.append((bc_variable, specVar))
339 hist_variables_2d.append((channel.mvaConfig.target, specVar))
340 filename = os.path.join(self.config.monitoring_path, 'Monitor_PreReconstruction_BeforeRanking.root')
341 ma.variablesToHistogram(
342 channel.name,
343 variables=config.variables2binnings(hist_variables),
344 variables_2d=config.variables2binnings_2d(hist_variables_2d),
345 filename=filename,
346 ignoreCommandLineOverride=True,
347 directory=f'{channel.label}',
348 path=path)
349
350 if channel.preCutConfig.bestCandidateMode == 'lowest':
351 ma.rankByLowest(channel.name,
352 channel.preCutConfig.bestCandidateVariable,
353 channel.preCutConfig.bestCandidateCut,
354 'preCut_rank',
355 path=path)
356 elif channel.preCutConfig.bestCandidateMode == 'highest':
357 ma.rankByHighest(channel.name,
358 channel.preCutConfig.bestCandidateVariable,
359 channel.preCutConfig.bestCandidateCut,
360 'preCut_rank',
361 path=path)
362 else:
363 raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))
364
365 if 'gamma' in channel.decayString and channel.pi0veto:
366 ma.buildRestOfEvent(channel.name, path=path)
367 Ddaughter_roe_path = basf2.Path()
368 deadEndPath = basf2.Path()
369 ma.signalSideParticleFilter(channel.name, '', Ddaughter_roe_path, deadEndPath)
370 ma.fillParticleList('gamma:roe', 'isInRestOfEvent == 1', path=Ddaughter_roe_path)
371
372 matches = list(re.finditer('gamma', channel.decayString))
373 pi0lists = []
374 for igamma in range(len(matches)):
375 start, end = matches[igamma-1].span()
376 tempString = channel.decayString[:start] + '^gamma' + channel.decayString[end:]
377 ma.fillSignalSideParticleList(f'gamma:sig_{igamma}', tempString, path=Ddaughter_roe_path)
378 ma.reconstructDecay(f'pi0:veto_{igamma} -> gamma:sig_{igamma} gamma:roe', '', path=Ddaughter_roe_path)
379 pi0lists.append(f'pi0:veto_{igamma}')
380 ma.copyLists('pi0:veto', pi0lists, writeOut=False, path=Ddaughter_roe_path)
381 ma.rankByLowest('pi0:veto', 'abs(dM)', 1, path=Ddaughter_roe_path)
382 ma.matchMCTruth('pi0:veto', path=Ddaughter_roe_path)
383 ma.variableToSignalSideExtraInfo(
384 'pi0:veto',
385 {
386 'InvM': 'pi0vetoMass',
387 'formula((daughter(0,E)-daughter(1,E))/(daughter(0,E)+daughter(1,E)))': 'pi0vetoEnergyAsymmetry',
388 },
389 path=Ddaughter_roe_path
390 )
391 path.for_each('RestOfEvent', 'RestOfEvents', Ddaughter_roe_path)
392
393 if self.config.monitor:
394 filename = os.path.join(self.config.monitoring_path, 'Monitor_PreReconstruction_AfterRanking.root')
395 if self.config.monitor != 'simple':
396 hist_variables += ['extraInfo(preCut_rank)']
397 hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
398 ('extraInfo(preCut_rank)', 'mcErrors'),
399 ('extraInfo(preCut_rank)', 'mcParticleStatus')]
400 for specVar in channel.mvaConfig.spectators:
401 hist_variables_2d.append(('extraInfo(preCut_rank)', specVar))
402 ma.variablesToHistogram(
403 channel.name,
404 variables=config.variables2binnings(hist_variables),
405 variables_2d=config.variables2binnings_2d(hist_variables_2d),
406 filename=filename,
407 ignoreCommandLineOverride=True,
408 directory=f'{channel.label}',
409 path=path)
410 # If we are not in monitor mode we do the mc matching now,
411 # otherwise we did it above already!
412 elif self.config.training:
413 ma.matchMCTruth(channel.name, path=path)
414
415 if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
416 pvfit = basf2.register_module('ParticleVertexFitter')
417 pvfit.set_name('ParticleVertexFitter_' + channel.name)
418 pvfit.param('listName', channel.name)
419 pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
420 pvfit.param('vertexFitter', 'KFit')
421 pvfit.param('fitType', 'vertex')
422 pvfit.set_log_level(basf2.logging.log_level.ERROR) # let's not produce gigabytes of uninteresting warnings
423 path.add_module(pvfit)
424 elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
425 basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
426 elif len(channel.daughters) > 1:
427 pvfit = basf2.register_module('ParticleVertexFitter')
428 pvfit.set_name('ParticleVertexFitter_' + channel.name)
429 pvfit.param('listName', channel.name)
430 pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
431 pvfit.param('vertexFitter', 'KFit')
432 if particle.name in ['pi0']:
433 pvfit.param('fitType', 'mass')
434 else:
435 pvfit.param('fitType', 'vertex')
436 pvfit.set_log_level(basf2.logging.log_level.ERROR) # let's not produce gigabytes of uninteresting warnings
437 path.add_module(pvfit)
438
439 if self.config.monitor:
440 if self.config.monitor == 'simple':
441 hist_variables = [channel.mvaConfig.target, 'extraInfo(decayModeID)']
442 hist_variables_2d = [(channel.mvaConfig.target, 'extraInfo(decayModeID)')]
443 else:
444 hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus',
445 channel.mvaConfig.target] + list(channel.mvaConfig.spectators.keys())
446 hist_variables_2d = [('chiProb', channel.mvaConfig.target),
447 ('chiProb', 'mcErrors'),
448 ('chiProb', 'mcParticleStatus')]
449 for specVar in channel.mvaConfig.spectators:
450 hist_variables_2d.append(('chiProb', specVar))
451 hist_variables_2d.append((channel.mvaConfig.target, specVar))
452 filename = os.path.join(self.config.monitoring_path, 'Monitor_PreReconstruction_AfterVertex.root')
453 ma.variablesToHistogram(
454 channel.name,
455 variables=config.variables2binnings(hist_variables),
456 variables_2d=config.variables2binnings_2d(hist_variables_2d),
457 filename=filename,
458 ignoreCommandLineOverride=True,
459 directory=f'{channel.label}',
460 path=path)
461
462 return path
463
464
465class PostReconstruction:
466 """
467 Steers the reconstruction phase after the mva method was applied
468 It Includes:
469 - The application of the mva method itself.
470 - Copying all channel lists in a common one for each particle defined in particles
471 - Tag unique signal candidates, to avoid double counting of channels with overlap
472 """
473
474 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
475 """
476 Create a new PostReconstruction object
477 @param particles list of config.Particle objects
478 @param config config.FeiConfiguration object
479 """
480
481 self.particles = particles
482
483 self.config = config
484
485 def get_missing_channels(self) -> typing.Sequence[str]:
486 """
487 Returns all channels for which the weightfile is missing
488 """
489 missing = []
490 for particle in self.particles:
491 for channel in particle.channels:
492 # weightfile = self.config.prefix + '_' + channel.label
493 weightfile = channel.label + '.xml'
494 if not basf2_mva.available(weightfile):
495 missing += [channel.label]
496 return missing
497
498 def available(self) -> bool:
499 """
500 Check if the relevant information is already available
501 """
502 return len(self.get_missing_channels()) == 0
503
504 def reconstruct(self) -> pybasf2.Path:
505 """
506 Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
507 """
508 path = basf2.create_path()
509
510 for particle in self.particles:
511 for channel in particle.channels:
512 expert = basf2.register_module('MVAExpert')
513 expert.set_name('MVAExpert_' + channel.name)
514 if self.config.training:
515 expert.param('identifier', channel.label + '.xml')
516 else:
517 expert.param('identifier', self.config.prefix + '_' + channel.label)
518 expert.param('extraInfoName', 'SignalProbability')
519 expert.param('listNames', [channel.name])
520 # suppress warning that signal probability won't be overwritten if it already exists
521 expert.set_log_level(basf2.logging.log_level.ERROR)
522 path.add_module(expert)
523
524 if self.config.monitor:
525 if self.config.monitor == 'simple':
526 hist_variables = [channel.mvaConfig.target, 'extraInfo(decayModeID)']
527 hist_variables_2d = [(channel.mvaConfig.target, 'extraInfo(decayModeID)')]
528 else:
529 hist_variables = ['mcErrors',
530 'mcParticleStatus',
531 'extraInfo(SignalProbability)',
532 channel.mvaConfig.target,
533 'extraInfo(decayModeID)'] + list(channel.mvaConfig.spectators.keys())
534 hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
535 ('extraInfo(SignalProbability)', 'mcErrors'),
536 ('extraInfo(SignalProbability)', 'mcParticleStatus'),
537 ('extraInfo(decayModeID)', channel.mvaConfig.target),
538 ('extraInfo(decayModeID)', 'mcErrors'),
539 ('extraInfo(decayModeID)', 'mcParticleStatus')]
540 for specVar in channel.mvaConfig.spectators:
541 hist_variables_2d.append(('extraInfo(SignalProbability)', specVar))
542 hist_variables_2d.append(('extraInfo(decayModeID)', specVar))
543 hist_variables_2d.append((channel.mvaConfig.target, specVar))
544 filename = os.path.join(self.config.monitoring_path, 'Monitor_PostReconstruction_AfterMVA.root')
545 ma.variablesToHistogram(
546 channel.name,
547 variables=config.variables2binnings(hist_variables),
548 variables_2d=config.variables2binnings_2d(hist_variables_2d),
549 filename=filename,
550 ignoreCommandLineOverride=True,
551 directory=f'{channel.label}',
552 path=path)
553
554 cutstring = ''
555 if particle.postCutConfig.value > 0.0:
556 cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'
557
558 ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
559 variable='particleSource', writeOut=True, path=path)
560
561 if self.config.monitor:
562 if self.config.monitor == 'simple':
563 hist_variables = [particle.mvaConfig.target, 'extraInfo(decayModeID)']
564 hist_variables_2d = [(particle.mvaConfig.target, 'extraInfo(decayModeID)')]
565 else:
566 hist_variables = ['mcErrors',
567 'mcParticleStatus',
568 'extraInfo(SignalProbability)',
569 particle.mvaConfig.target,
570 'extraInfo(decayModeID)'] + list(particle.mvaConfig.spectators.keys())
571 hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
572 ('extraInfo(decayModeID)', 'mcErrors'),
573 ('extraInfo(decayModeID)', 'mcParticleStatus')]
574 for specVar in particle.mvaConfig.spectators:
575 hist_variables_2d.append(('extraInfo(SignalProbability)', specVar))
576 hist_variables_2d.append(('extraInfo(decayModeID)', specVar))
577 hist_variables_2d.append((particle.mvaConfig.target, specVar))
578 filename = os.path.join(self.config.monitoring_path, 'Monitor_PostReconstruction_BeforePostCut.root')
579 ma.variablesToHistogram(
580 particle.identifier,
581 variables=config.variables2binnings(hist_variables),
582 variables_2d=config.variables2binnings_2d(hist_variables_2d),
583 filename=filename,
584 ignoreCommandLineOverride=True,
585 directory=config.removeJPsiSlash(f'{particle.identifier}'),
586 path=path)
587
588 ma.applyCuts(particle.identifier, cutstring, path=path)
589
590 if self.config.monitor:
591 filename = os.path.join(self.config.monitoring_path, 'Monitor_PostReconstruction_BeforeRanking.root')
592 ma.variablesToHistogram(
593 particle.identifier,
594 variables=config.variables2binnings(hist_variables),
595 variables_2d=config.variables2binnings_2d(hist_variables_2d),
596 filename=filename,
597 ignoreCommandLineOverride=True,
598 directory=config.removeJPsiSlash(f'{particle.identifier}'),
599 path=path)
600
601 ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
602 particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)
603
604 uniqueSignal = basf2.register_module('TagUniqueSignal')
605 uniqueSignal.param('particleList', particle.identifier)
606 uniqueSignal.param('target', particle.mvaConfig.target)
607 uniqueSignal.param('extraInfoName', 'uniqueSignal')
608 uniqueSignal.set_name('TagUniqueSignal_' + particle.identifier)
609 # suppress warning that unique signal extra info won't be overwritten if it already exists
610 uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
611 path.add_module(uniqueSignal)
612
613 if self.config.monitor:
614 if self.config.monitor != 'simple':
615 hist_variables += ['extraInfo(postCut_rank)']
616 hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
617 (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
618 ('mcErrors', 'extraInfo(postCut_rank)'),
619 ('mcParticleStatus', 'extraInfo(postCut_rank)')]
620 for specVar in particle.mvaConfig.spectators:
621 hist_variables_2d.append(('extraInfo(postCut_rank)', specVar))
622 filename = os.path.join(self.config.monitoring_path, 'Monitor_PostReconstruction_AfterRanking.root')
623 ma.variablesToHistogram(
624 particle.identifier,
625 variables=config.variables2binnings(hist_variables),
626 variables_2d=config.variables2binnings_2d(hist_variables_2d),
627 filename=filename,
628 ignoreCommandLineOverride=True,
629 directory=config.removeJPsiSlash(f'{particle.identifier}'),
630 path=path)
631
632 filename = os.path.join(self.config.monitoring_path, 'Monitor_Final.root')
633 if self.config.monitor == 'simple':
634 hist_variables = ['extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']
635 hist_variables_2d = [('extraInfo(uniqueSignal)', 'extraInfo(decayModeID)')]
636 ma.variablesToHistogram(
637 particle.identifier,
638 variables=config.variables2binnings(hist_variables),
639 variables_2d=config.variables2binnings_2d(hist_variables_2d),
640 filename=filename,
641 ignoreCommandLineOverride=True,
642 directory=config.removeJPsiSlash(f'{particle.identifier}'),
643 path=path)
644 else:
645 variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
646 'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']
647 variables += list(particle.mvaConfig.spectators.keys())
648
649 ma.variablesToNtuple(
650 particle.identifier,
651 variables,
652 treename=config.removeJPsiSlash(f'{particle.identifier} variables'),
653 filename=filename,
654 ignoreCommandLineOverride=True,
655 path=path)
656 return path
657
658
659class Teacher:
660 """
661 Performs all necessary trainings for all training data files which are
662 available but where there is no weight file available yet.
663 This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
664 The trainings are run in parallel using multi-threading of python.
665 Each training is done by a subprocess call, the training command (passed by config.externTeacher) can be either
666 * basf2_mva_teacher, the training will be done directly on the machine
667 * externClustTeacher, the training will be submitted to the batch system of KEKCC
668 """
669
671 MaximumNumberOfMVASamples = int(1e7)
672
674 MinimumNumberOfMVASamples = int(5e2)
675
676 def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
677 """
678 Create a new Teacher object
679 @param particles list of config.Particle objects
680 @param config config.FeiConfiguration object
681 """
682
683 self.particles = particles
684
685 self.config = config
686
687 @staticmethod
688 def create_fake_weightfile(channel: str):
689 """
690 Create a fake weight file using the trivial method, it will always return 0.0
691 @param channel for which we create a fake weight file
692 """
693 content = f"""
694 <?xml version="1.0" encoding="utf-8"?>
695 <method>Trivial</method>
696 <weightfile>{channel}.xml</weightfile>
697 <treename>tree</treename>
698 <target_variable>isSignal</target_variable>
699 <weight_variable>__weight__</weight_variable>
700 <signal_class>1</signal_class>
701 <max_events>0</max_events>
702 <number_feature_variables>1</number_feature_variables>
703 <variable0>M</variable0>
704 <number_spectator_variables>0</number_spectator_variables>
705 <number_data_files>1</number_data_files>
706 <datafile0>train.root</datafile0>
707 <Trivial_version>1</Trivial_version>
708 <Trivial_output>0</Trivial_output>
709 <signal_fraction>0.066082567</signal_fraction>
710 """
711 with open(channel + ".xml", "w") as f:
712 f.write(content)
713
714 @staticmethod
715 def check_if_weightfile_is_fake(filename: str):
716 """
717 Checks if the provided filename is a fake-weight file or not
718 @param filename the filename of the weight file
719 """
720 try:
721 return '<method>Trivial</method>' in open(filename).readlines()[2]
722 except BaseException:
723 return True
724 return True
725
726 def upload(self, channel: str):
727 """
728 Upload the weight file into the condition database
729 @param channel whose weight file is uploaded
730 """
731 disk = channel + '.xml'
732 dbase = self.config.prefix + '_' + channel
733 basf2_mva.upload(disk, dbase)
734 return (disk, dbase)
735
736 def do_all_trainings(self):
737 """
738 Do all trainings for which we find training data
739 """
740 # Always avoid the top-level 'import ROOT'.
741 import ROOT # noqa
742 # FEI uses multi-threading for parallel execution of tasks therefore
743 # the ROOT gui-thread is disabled, which otherwise interferes sometimes
744 ROOT.PyConfig.StartGuiThread = False
745 job_list = []
746 filename = 'training_input.root'
747 if not os.path.isfile(filename):
748 B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
749 "No weight files will be provided.")
750 else:
751 f = ROOT.TFile.Open(filename, 'read')
752 if f.IsZombie():
753 B2WARNING("Training of MVC failed. ROOT file corrupt. "
754 "No weight files will be provided.")
755 elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
756 B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
757 "No weight files will be provided.")
758 else:
759 for particle in self.particles:
760 for channel in particle.channels:
761 weightfile = channel.label + '.xml'
762 if not basf2_mva.available(weightfile):
763 keys = [m for m in f.GetListOfKeys() if f"{channel.label} variables" in m.GetName()]
764 if not keys:
765 continue
766 tree = keys[0].ReadObj()
767 nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1')
768 nBg = tree.GetEntries(channel.mvaConfig.target + ' == 0')
769 if nSig < Teacher.MinimumNumberOfMVASamples:
770 B2WARNING("Training of MVC failed. "
771 f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
772 self.create_fake_weightfile(channel.label)
773 self.upload(channel.label)
774 continue
775 if nBg < Teacher.MinimumNumberOfMVASamples:
776 B2WARNING("Training of MVC failed. "
777 f"Tree contains too few bckgrd events {nBg}. Ignoring channel {channel}.")
778 self.create_fake_weightfile(channel.label)
779 self.upload(channel.label)
780 continue
781 variable_str = "' '".join(channel.mvaConfig.variables)
782
783 spectators = list(channel.mvaConfig.spectators.keys())
784 if channel.mvaConfig.sPlotVariable is not None:
785 spectators.append(channel.mvaConfig.sPlotVariable)
786 spectators_str = "' '".join(spectators)
787
788 command = (f"{self.config.externTeacher}"
789 f" --method '{channel.mvaConfig.method}'"
790 f" --target_variable '{channel.mvaConfig.target}'"
791 f" --treename '{channel.label} variables' --datafile 'training_input.root'"
792 f" --signal_class 1 --variables '{variable_str}'"
793 f" --identifier '{channel.label}.xml'")
794 if len(spectators) > 0:
795 command += f" --spectators '{spectators_str}'"
796 command += f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1"
797 B2INFO(f"Used following command to invoke teacher: \n {command}")
798 job_list.append((channel.label, command))
799 f.Close()
800 p = multiprocessing.Pool(None, maxtasksperchild=1)
801 func = functools.partial(subprocess.call, shell=True)
802 p.map(func, [c for _, c in job_list])
803 p.close()
804 p.join()
805 weightfiles = []
806 for name, _ in job_list:
807 if not basf2_mva.available(name + '.xml'):
808 B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
809 self.create_fake_weightfile(name)
810 weightfiles.append(self.upload(name))
811 return weightfiles
812
813
814def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
815 """
816 Convert an old FEI training into the new format.
817 The old format used hashes for the weight files, the hashes can be converted to the new naming scheme
818 using the Summary.pickle file outputted by the FEIv3. This file must be passes by the parameter configuration.legacy.
819 @param particles list of config.Particle objects
820 @param config config.FeiConfiguration object
821 """
822 summary = pickle.load(open(configuration.legacy, 'rb'))
823 channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}
824
825 teacher = Teacher(particles, configuration)
826
827 for particle in particles:
828 for channel in particle.channels:
829 new_weightfile = configuration.prefix + '_' + channel.label
830 old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
831 if not basf2_mva.available(new_weightfile):
832 if old_weightfile is None or not basf2_mva.available(old_weightfile):
833 Teacher.create_fake_weightfile(channel.label)
834 teacher.upload(channel.label)
835 else:
836 basf2_mva.download(old_weightfile, channel.label + '.xml')
837 teacher.upload(channel.label)
838
839
840def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
841 """
842 Returns the hierarchical structure of the FEI.
843 Each stage depends on the particles in the previous stage.
844 The final stage is empty (meaning everything is done, and the training is finished at this point).
845 @param particles list of config.Particle objects
846 """
847 stages = [
848 [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
849 [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
850 [p for p in particles if p.name in ['K_S0', 'Sigma+']],
851 [p for p in particles if (p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']) and ("tag" not in p.label.lower())],
852 [p for p in particles if (p.name in ['D*+', 'D*0', 'D_s*+']) and ("tag" not in p.label.lower())],
853 [p for p in particles if p.name in ['B0', 'B+', 'B_s0'] or ("tag" in p.label.lower())],
854 []
855 ]
856
857 for p in particles:
858 if p.name not in [p.name for stage in stages for p in stage]:
859 raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")
860
861 return stages
862
863
864def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
865 """
866 Performs the training of mva classifiers for all available training data,
867 this function must be either called by the user after each stage of the FEI during training,
868 or (more likely) is called by the distributed.py script after merging the outputs of all jobs,
869 @param particles list of config.Particle objects
870 @param config config.FeiConfiguration object
871 @return list of tuple with weight file on disk and identifier in database for all trained classifiers
872 """
873 teacher = Teacher(particles, configuration)
874 return teacher.do_all_trainings()
875
876
877def save_summary(particles: typing.Sequence[config.Particle],
878 configuration: config.FeiConfiguration,
879 cache: int,
880 pickleName: str = 'Summary.pickle'):
881 """
882 Creates the Summary.pickle, which is used to keep track of the stage during the training,
883 and can be used later to investigate which configuration was used exactly to create the training.
884 @param particles list of config.Particle objects
885 @param config config.FeiConfiguration object
886 @param cache current cache level
887 @param pickleName name of the pickle file
888 """
889 configuration = config.FeiConfiguration(configuration.prefix, cache,
890 configuration.monitor, configuration.legacy, configuration.externTeacher,
891 configuration.training)
892 # Backup existing Summary.pickle files
893 for i in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
894 if os.path.isfile(f'{pickleName}.backup_{i}'):
895 shutil.copyfile(f'{pickleName}.backup_{i}', f'{pickleName}.backup_{i + 1}')
896 if os.path.isfile(pickleName):
897 shutil.copyfile(pickleName, f'{pickleName}.backup_0')
898 pickle.dump((particles, configuration), open(pickleName, 'wb'))
899
900
901def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
902 """
903 The most important function of the FEI.
904 This creates the FEI path for training/fitting (both terms are equal), and application/inference (both terms are equal).
905 The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
906 and the configuration (see config.py).
907
908 TRAINING
909 For training this function is called multiple times, each time the FEI reconstructs one more stage in the hierarchical structure
910 i.e. we start with FSP, pi0, KS_0, D, D*, and with B mesons. You have to set configuration.training to True for training mode.
911 All weight files created during the training will be stored in your local database.
912 If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
913 to the central database first (see documentation for the Belle2 Condition Database).
914
915 APPLICATION
916 For application you call this function once, and it returns the whole path which will reconstruct B mesons
917 with an associated signal probability. You have to set configuration.training to False for application mode.
918
919 MONITORING
920 You can always turn on the monitoring (configuration.monitor != False),
921 to write out ROOT Histograms of many quantities for each stage,
922 using these histograms you can use the printReporting.py or latexReporting.py scripts to automatically create pdf files.
923
924 LEGACY
925 This function can also use old FEI trainings (version 3), just pass the Summary.pickle file of the old training,
926 and the weight files will be automatically converted to the new naming scheme.
927
928 @param particles list of config.Particle objects
929 @param config config.FeiConfiguration object
930 """
931 print(r"""
932 ____ _ _ _ _ ____ _ _ ____ _ _ ___ _ _ _ ___ ____ ____ ___ ____ ____ ___ ____ ___ _ ____ _ _
933 |___ | | | | |___ | | |___ |\ | | | |\ | | |___ |__/ |__] |__/ |___ | |__| | | | | |\ |
934 | |__| |___ |___ |___ \/ |___ | \| | | | \| | |___ | \ | | \ |___ | | | | | |__| | \|
935
936 Author: Thomas Keck 2014 - 2017
937 Please cite my PhD thesis
938 """)
939
940 # The cache parameter of the configuration object is used during training to keep track,
941 # which reconstruction steps are already performed.
942 # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
943 # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
944 # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
945 # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
946 if configuration.training and (configuration.monitor and (configuration.monitoring_path != '')):
947 B2ERROR("FEI-core: Custom Monitoring path is not allowed during training!")
948
949 if configuration.cache is None:
950 pickleName = 'Summary.pickle'
951 if configuration.monitor:
952 pickleName = os.path.join(configuration.monitoring_path, pickleName)
953
954 if os.path.isfile(pickleName):
955 print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
956 particles, configuration = pickle.load(open(pickleName, 'rb'))
957 cache = configuration.cache
958 else:
959 if configuration.training:
960 cache = -1
961 else:
962 cache = 0
963 else:
964 cache = configuration.cache
965
966 # Now we start building the training or application path
967 path = basf2.create_path()
968
969 # There are in total 7 stages.
970 # For training we start with -1 and go to 7 one stage at a time
971 # For application we can run stage 0 to 7 at once
972 stages = get_stages_from_particles(particles)
973
974 # If the user provided a Summary.pickle file of a FEIv3 training we
975 # convert the old weight files (with hashes), to the new naming scheme.
976 # Afterwards the algorithm runs as usual
977 if configuration.legacy is not None:
978 convert_legacy_training(particles, configuration)
979
980 # During the training we require the number of MC particles in the whole processed
981 # data sample, because we don't want to write out billions of e.g. pion candidates.
982 # Knowing the total amount of MC particles we can write out only every e.g. 10th candidate
983 # That's why we have to write out the TrainingDataInformation before doing anything during the training phase.
984 # During application we only need this if we run in monitor mode, and want to write out a summary in the end,
985 # the summary contains efficiency, and the efficiency calculation requires the total number of MC particles.
986 training_data_information = TrainingDataInformation(particles, outputPath=configuration.monitoring_path)
987 if cache < 0 and configuration.training:
988 print("Stage 0: Run over all files to count the number of events and McParticles")
989 path.add_path(training_data_information.reconstruct())
990 if configuration.training:
991 save_summary(particles, configuration, 0)
992 return FeiState(path, 0, [])
993 elif not configuration.training and configuration.monitor:
994 path.add_path(training_data_information.reconstruct())
995
996 # We load the Final State particles
997 # It is assumed that the user takes care of adding RootInput, Geometry, and everything
998 # which is required to read in data, so we directly start to load the FSP particles
999 # used by the FEI.
1000 loader = FSPLoader(particles, configuration)
1001 if cache < 1:
1002 print("Stage 0: Load FSP particles")
1003 path.add_path(loader.reconstruct())
1004
1005 # Now we reconstruct each stage one after another.
1006 # Each stage consists of two parts:
1007 # PreReconstruction (before the mva method was applied):
1008 # - Particle combination
1009 # - Do vertex fitting
1010 # - Some simple cuts and best candidate selection
1011 # PostReconstruction (after the mva method was applied):
1012 # - Apply the mva method
1013 # - Apply cuts on the mva output and best candidate selection
1014 #
1015 # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
1016 # we have to create the training data. The training itself is done by the do_trainings function which is called
1017 # either by the user after each step, or by the distributed.py script
1018 #
1019 # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
1020 # we keep going, as soon as the user will call process on the produced path he will get an error message that the
1021 # weight files are missing.
1022 #
1023 # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
1024 # intermediate output of RootOutput.
1025 used_lists = []
1026 for stage, stage_particles in enumerate(stages):
1027 pre_reconstruction = PreReconstruction(stage_particles, configuration)
1028 if cache <= stage:
1029 print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
1030 path.add_path(pre_reconstruction.reconstruct())
1031
1032 post_reconstruction = PostReconstruction(stage_particles, configuration)
1033 if configuration.training and not post_reconstruction.available():
1034 print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
1035 mc_counts = training_data_information.get_mc_counts()
1036 training_data = TrainingData(stage_particles, configuration, mc_counts)
1037 path.add_path(training_data.reconstruct())
1038 used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
1039 break
1040 if cache <= stage + 1:
1041 path.add_path(post_reconstruction.reconstruct())
1042 used_lists += [particle.identifier for particle in stage_particles]
1043
1044 # If we run in monitor mode we are interested in the ModuleStatistics,
1045 # these statistics contain the runtime for each module which was run.
1046 if configuration.monitor:
1047 output = basf2.register_module('RootOutput')
1048 output.param('outputFileName', os.path.join(configuration.monitoring_path, 'Monitor_ModuleStatistics.root'))
1049 output.param('branchNames', ['EventMetaData']) # cannot be removed, but of only small effect on file size
1050 output.param('branchNamesPersistent', ['ProcessStatistics'])
1051 output.param('ignoreCommandLineOverride', True)
1052 path.add_module(output)
1053
1054 # As mentioned above the FEI keeps track of the stages which are already reconstructed during the training
1055 # so we write out the Summary.pickle here, and increase the stage by one.
1056 if configuration.training or configuration.monitor:
1057 save_summary(particles, configuration, stage + 1, pickleName=os.path.join(configuration.monitoring_path, 'Summary.pickle'))
1058
1059 # Finally we return the path, the stage and the used lists to the user.
1060 return FeiState(path, stage + 1, plists=used_lists)
1061
1062# @endcond
isB2BII()
Definition b2bii.py:14
from_name(name)
Definition pdg.py:63