#!/usr/bin/env python3


# @cond SUPPRESS_DOXYGEN

13"""
14 The Full Event Interpretation Algorithm
15
16 Some basic facts:
17 - The algorithm will automatically reconstruct B mesons and calculate a signal probability for each candidate.
18 - It can be used for hadronic and semileptonic tagging.
19 - The algorithm has to be trained on MC, and can afterwards be applied on data.
20 - The training requires O(100) million MC events
21 - The weight files are stored in the Belle II Condition database
22
23 Read this file if you want to understand the technical details of the FEI.
24
25 The FEI follows a hierarchical approach.
26 There are 7 stages:
27 (Stage -1: Write out information about the provided data sample)
28 Stage 0: Final State Particles (FSP)
29 Stage 1: pi0, J/Psi, Lambda0
30 Stage 2: K_S0, Sigma+
31 Stage 3: D and Lambda_c mesons
32 Stage 4: D* mesons
33 Stage 5: B mesons
34 Stage 6: Finish
35
 Most stages consist of:
 - Create Particle Candidates
 - Apply Cuts
 - Do vertex Fitting
 - Apply a multivariate classification method
 - Apply more Cuts

 The FEI reconstructs these 7 stages during the training phase.
 Since the stages depend on one another, you have to run basf2 multiple (7) times on the same data
 to train all the necessary multivariate classifiers.
"""

# Import basf2
import basf2
from basf2 import B2INFO, B2WARNING
import pybasf2
import modularAnalysis as ma
import b2bii

# Should come after basf2 import
import pdg
from fei import config
import basf2_mva

# Standard python modules
import collections
import os
import shutil
import typing
import pickle
import re
import functools
import subprocess
import multiprocessing

# Simple object containing the output of fei
FeiState = collections.namedtuple('FeiState', 'path, stage, plists')


class TrainingDataInformation:
    """
    Contains the relevant information about the used training data.
    Basically we write out the number of MC particles in the whole dataset.
    These numbers can be used to calculate what fraction of candidates we have to write
    out as TrainingData to get a reasonable amount of candidates to train on
    (too few candidates will lead to heavy overtraining, too many won't fit into memory).
    Secondly, we can use this information for the generation of the monitoring pdfs,
    where we calculate reconstruction efficiencies.
    """

    def __init__(self, particles: typing.Sequence[config.Particle]):
        """
        Create a new TrainingDataInformation object
        @param particles list of config.Particle objects
        """
        #: list of config.Particle objects
        self.particles = particles
        #: filename used to store the MC particle counts
        self.filename = 'mcParticlesCount.root'

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return os.path.isfile(self.filename)

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which counts the number of MCParticles in each event.
        """
        # Unique absolute pdg-codes of all particles
        pdgs = {abs(pdg.from_name(particle.name)) for particle in self.particles}

        path = basf2.create_path()
        module = basf2.register_module('VariablesToHistogram')
        module.set_name("VariablesToHistogram_MCCount")
        module.param('variables', [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs])
        module.param('fileName', self.filename)
        path.add_module(module)
        return path

    def get_mc_counts(self):
        """
        Read out the number of MC particles from the file created by reconstruct
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        root_file = ROOT.TFile.Open(self.filename, 'read')
        mc_counts = {}

        for key in root_file.GetListOfKeys():
            variable = ROOT.Belle2.MakeROOTCompatible.invertMakeROOTCompatible(key.GetName())
            pdg = abs(int(variable[len('NumberOfMCParticlesInEvent('):-len(")")]))
            hist = key.ReadObj()
            mc_counts[pdg] = {}
            mc_counts[pdg]['sum'] = sum(hist.GetXaxis().GetBinCenter(bin + 1) * hist.GetBinContent(bin + 1)
                                        for bin in range(hist.GetNbinsX()))
            mc_counts[pdg]['std'] = hist.GetStdDev()
            mc_counts[pdg]['avg'] = hist.GetMean()
            mc_counts[pdg]['max'] = hist.GetXaxis().GetBinCenter(hist.FindLastBinAbove(0.0))
            mc_counts[pdg]['min'] = hist.GetXaxis().GetBinCenter(hist.FindFirstBinAbove(0.0))

        # every histogram has one entry per event, so the number of entries
        # of the last histogram equals the number of processed events
        mc_counts[0] = {}
        mc_counts[0]['sum'] = hist.GetEntries()
        root_file.Close()
        return mc_counts
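
# The mapping returned by get_mc_counts has, schematically (all numbers below
# are made up for illustration):
#
#   mc_counts = {
#       511: {'sum': 1.2e9, 'std': 1.1, 'avg': 1.9, 'max': 4.0, 'min': 0.0},  # B0
#       ...
#       0:   {'sum': 6.3e8},  # special key: total number of processed events
#   }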


class FSPLoader:
    """
    Steers the loading of FSP particles.
    This does NOT include RootInput, Geometry or anything else required before loading FSPs;
    the users have to add this themselves (because it depends on the MC campaign and on
    whether they run on Belle or Belle II data).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new FSPLoader object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which loads the FSP Particles
        """
        path = basf2.create_path()

        if b2bii.isB2BII():
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('p+:FSP', '')], writeOut=True, path=path)
            for outputList, inputList in [('gamma:FSP', 'gamma:mdst'), ('K_S0:V0', 'K_S0:mdst'),
                                          ('Lambda0:V0', 'Lambda0:mdst'), ('K_L0:FSP', 'K_L0:mdst'),
                                          ('pi0:FSP', 'pi0:mdst'), ('gamma:V0', 'gamma:v0mdst')]:
                ma.copyParticles(outputList, inputList, writeOut=True, path=path)
        else:
            ma.fillParticleLists([('K+:FSP', ''), ('pi+:FSP', ''), ('e+:FSP', ''),
                                  ('mu+:FSP', ''), ('gamma:FSP', ''),
                                  ('p+:FSP', ''), ('K_L0:FSP', '')], writeOut=True, path=path)
            ma.fillParticleList('K_S0:V0 -> pi+ pi-', '', writeOut=True, path=path)
            ma.fillParticleList('Lambda0:V0 -> p+ pi-', '', writeOut=True, path=path)
            ma.fillConvertedPhotonsList('gamma:V0 -> e+ e-', '', writeOut=True, path=path)

        if self.config.monitor:
            names = ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'K_S0', 'p+', 'K_L0', 'Lambda0', 'pi0']
            filename = 'Monitor_FSPLoader.root'
            pdgs = {abs(pdg.from_name(name)) for name in names}
            variables = [(f'NumberOfMCParticlesInEvent({pdg})', 100, -0.5, 99.5) for pdg in pdgs]
            ma.variablesToHistogram('', variables=variables, filename=filename, path=path)
        return path
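
# Sketch of what the caller is expected to do before adding the FSPLoader path
# (illustrative; the exact input function depends on the release and on b2bii):
#
#   path = basf2.create_path()
#   ma.inputMdst('mc_sample.root', path=path)  # RootInput, Geometry, ... is the user's job
#   path.add_path(FSPLoader(particles, configuration).reconstruct())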


class TrainingData:
    """
    Steers the creation of the training data.
    The training data is used to train a multivariate classifier for each channel.
    At its core, the training of the FEI is just the generation of this training data for each channel.
    After we have created the training data for a stage, we have to train the classifiers (see the Teacher class further down).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration,
                 mc_counts: typing.Mapping[int, typing.Mapping[str, float]]):
        """
        Create a new TrainingData object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        @param mc_counts containing the number of MC particles
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config
        #: containing the number of MC particles
        self.mc_counts = mc_counts
    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which creates the training data for the given particles
        """
        path = basf2.create_path()

        for particle in self.particles:
            pdgcode = abs(pdg.from_name(particle.name))
            nSignal = self.mc_counts[pdgcode]['sum']
            # For D-Mesons we usually have an efficiency of 10^-3 including branching fraction
            if pdgcode > 400:
                nSignal /= 1000
            # For B-Mesons we usually have an efficiency of 10^-4 including branching fraction
            if pdgcode > 500:
                nSignal /= 10000

            for channel in particle.channels:
                filename = 'training_input.root'

                nBackground = self.mc_counts[0]['sum'] * channel.preCutConfig.bestCandidateCut
                inverseSamplingRates = {}
                # For some very pure channels (Jpsi), this sampling can be too aggressive and training fails.
                # It can therefore be disabled in the preCutConfig.
                if nBackground > Teacher.MaximumNumberOfMVASamples and not channel.preCutConfig.noBackgroundSampling:
                    inverseSamplingRates[0] = int(nBackground / Teacher.MaximumNumberOfMVASamples) + 1
                if nSignal > Teacher.MaximumNumberOfMVASamples:
                    inverseSamplingRates[1] = int(nSignal / Teacher.MaximumNumberOfMVASamples) + 1

                spectators = [channel.mvaConfig.target]
                if channel.mvaConfig.sPlotVariable is not None:
                    spectators.append(channel.mvaConfig.sPlotVariable)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus'] + channel.mvaConfig.variables + spectators
                    hist_variables_2d = [(x, channel.mvaConfig.target)
                                         for x in channel.mvaConfig.variables + spectators if x is not channel.mvaConfig.target]
                    hist_filename = 'Monitor_TrainingData.root'
                    ma.variablesToHistogram(channel.name, variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=config.removeJPsiSlash(hist_filename),
                                            directory=config.removeJPsiSlash(f'{channel.label}'), path=path)

                teacher = basf2.register_module('VariablesToNtuple')
                teacher.set_name('VariablesToNtuple_' + channel.name)
                teacher.param('fileName', filename)
                teacher.param('treeName', f'{channel.label} variables')
                teacher.param('variables', channel.mvaConfig.variables + spectators)
                teacher.param('particleList', channel.name)
                teacher.param('sampling', (channel.mvaConfig.target, inverseSamplingRates))
                path.add_module(teacher)
        return path
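
# Worked example of the sampling logic above (made-up numbers): with 10^9
# processed events and a bestCandidateCut of 20 candidates per event,
# nBackground = 1e9 * 20 = 2e10 exceeds Teacher.MaximumNumberOfMVASamples (1e7),
# so inverseSamplingRates[0] = int(2e10 / 1e7) + 1 = 2001, i.e. only every
# 2001st background candidate is written out to the training ntuple.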


class PreReconstruction:
    """
    Steers the reconstruction phase before the mva method is applied.
    It includes:
    - the particle combination (for each particle and channel we create candidates using
      the daughter candidates from the previous stages),
    - MC matching,
    - vertex fitting (this is the slowest part of the whole FEI; KFit is used by default,
      but you can use fastFit as a drop-in replacement https://github.com/thomaskeck/FastFit/,
      which will speed up the whole FEI by a factor of 2-3).
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PreReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config
    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which reconstructs the particles and does the vertex fitting if necessary
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:

                if len(channel.daughters) == 1:
                    ma.cutAndCopyList(channel.name, channel.daughters[0], channel.preCutConfig.userCut, writeOut=True, path=path)
                    v2EI = basf2.register_module('VariablesToExtraInfo')
                    v2EI.set_name('VariablesToExtraInfo_' + channel.name)
                    v2EI.param('particleList', channel.name)
                    v2EI.param('variables', {f'constant({channel.decayModeID})': 'decayModeID'})
                    # suppress warning that decay mode ID won't be overwritten if it already exists
                    v2EI.set_log_level(basf2.logging.log_level.ERROR)
                    path.add_module(v2EI)
                else:
                    ma.reconstructDecay(channel.decayString, channel.preCutConfig.userCut, channel.decayModeID,
                                        writeOut=True, path=path)
                if self.config.monitor:
                    ma.matchMCTruth(channel.name, path=path)
                    bc_variable = channel.preCutConfig.bestCandidateVariable
                    hist_variables = [bc_variable, 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [(bc_variable, channel.mvaConfig.target),
                                         (bc_variable, 'mcErrors'),
                                         (bc_variable, 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_BeforeRanking.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

                if channel.preCutConfig.bestCandidateMode == 'lowest':
                    ma.rankByLowest(channel.name,
                                    channel.preCutConfig.bestCandidateVariable,
                                    channel.preCutConfig.bestCandidateCut,
                                    'preCut_rank',
                                    path=path)
                elif channel.preCutConfig.bestCandidateMode == 'highest':
                    ma.rankByHighest(channel.name,
                                     channel.preCutConfig.bestCandidateVariable,
                                     channel.preCutConfig.bestCandidateCut,
                                     'preCut_rank',
                                     path=path)
                else:
                    raise RuntimeError("Unknown bestCandidateMode " + repr(channel.preCutConfig.bestCandidateMode))

                if self.config.monitor:
                    filename = 'Monitor_PreReconstruction_AfterRanking.root'
                    hist_variables += ['extraInfo(preCut_rank)']
                    hist_variables_2d += [('extraInfo(preCut_rank)', channel.mvaConfig.target),
                                          ('extraInfo(preCut_rank)', 'mcErrors'),
                                          ('extraInfo(preCut_rank)', 'mcParticleStatus')]
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)
                # If we are not in monitor mode we do the mc matching now,
                # otherwise we did it above already!
                elif self.config.training:
                    ma.matchMCTruth(channel.name, path=path)

                if b2bii.isB2BII() and particle.name in ['K_S0', 'Lambda0']:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)
                elif re.findall(r"[\w']+", channel.decayString).count('pi0') > 1 and particle.name != 'pi0':
                    basf2.B2INFO(f"Ignoring vertex fit for {channel.name} because multiple pi0 are not supported yet.")
                elif len(channel.daughters) > 1:
                    pvfit = basf2.register_module('ParticleVertexFitter')
                    pvfit.set_name('ParticleVertexFitter_' + channel.name)
                    pvfit.param('listName', channel.name)
                    pvfit.param('confidenceLevel', channel.preCutConfig.vertexCut)
                    pvfit.param('vertexFitter', 'KFit')
                    if particle.name in ['pi0']:
                        pvfit.param('fitType', 'mass')
                    else:
                        pvfit.param('fitType', 'vertex')
                    pvfit.set_log_level(basf2.logging.log_level.ERROR)  # let's not produce gigabytes of uninteresting warnings
                    path.add_module(pvfit)

                if self.config.monitor:
                    hist_variables = ['chiProb', 'mcErrors', 'mcParticleStatus', channel.mvaConfig.target]
                    hist_variables_2d = [('chiProb', channel.mvaConfig.target),
                                         ('chiProb', 'mcErrors'),
                                         ('chiProb', 'mcParticleStatus')]
                    filename = 'Monitor_PreReconstruction_AfterVertex.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

        return path


class PostReconstruction:
    """
    Steers the reconstruction phase after the mva method was applied.
    It includes:
    - the application of the mva method itself,
    - copying all channel lists into a common one for each particle defined in particles,
    - tagging of unique signal candidates, to avoid double counting of channels with overlap.
    """

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new PostReconstruction object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    def get_missing_channels(self) -> typing.Sequence[str]:
        """
        Returns all channels for which the weight file is missing
        """
        missing = []
        for particle in self.particles:
            for channel in particle.channels:
                # weightfile = self.config.prefix + '_' + channel.label
                weightfile = channel.label + '.xml'
                if not basf2_mva.available(weightfile):
                    missing += [channel.label]
        return missing

    def available(self) -> bool:
        """
        Check if the relevant information is already available
        """
        return len(self.get_missing_channels()) == 0

    def reconstruct(self) -> pybasf2.Path:
        """
        Returns pybasf2.Path which applies the mva method to all channels, merges the channel
        lists into a common list for each particle and applies the post cuts
        """
        path = basf2.create_path()

        for particle in self.particles:
            for channel in particle.channels:
                expert = basf2.register_module('MVAExpert')
                expert.set_name('MVAExpert_' + channel.name)
                if self.config.training:
                    expert.param('identifier', channel.label + '.xml')
                else:
                    expert.param('identifier', self.config.prefix + '_' + channel.label)
                expert.param('extraInfoName', 'SignalProbability')
                expert.param('listNames', [channel.name])
                # suppress warning that signal probability won't be overwritten if it already exists
                expert.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(expert)

                uniqueSignal = basf2.register_module('TagUniqueSignal')
                uniqueSignal.param('particleList', channel.name)
                uniqueSignal.param('target', channel.mvaConfig.target)
                uniqueSignal.param('extraInfoName', 'uniqueSignal')
                uniqueSignal.set_name('TagUniqueSignal_' + channel.name)
                # suppress warning that unique signal extra info won't be overwritten if it already exists
                uniqueSignal.set_log_level(basf2.logging.log_level.ERROR)
                path.add_module(uniqueSignal)

                if self.config.monitor:
                    hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                      channel.mvaConfig.target, 'extraInfo(decayModeID)']
                    hist_variables_2d = [('extraInfo(SignalProbability)', channel.mvaConfig.target),
                                         ('extraInfo(SignalProbability)', 'mcErrors'),
                                         ('extraInfo(SignalProbability)', 'mcParticleStatus'),
                                         ('extraInfo(decayModeID)', channel.mvaConfig.target),
                                         ('extraInfo(decayModeID)', 'mcErrors'),
                                         ('extraInfo(decayModeID)', 'extraInfo(uniqueSignal)'),
                                         ('extraInfo(decayModeID)', 'mcParticleStatus')]
                    filename = 'Monitor_PostReconstruction_AfterMVA.root'
                    ma.variablesToHistogram(channel.name,
                                            variables=config.variables2binnings(hist_variables),
                                            variables_2d=config.variables2binnings_2d(hist_variables_2d),
                                            filename=filename, directory=f'{channel.label}', path=path)

            cutstring = ''
            if particle.postCutConfig.value > 0.0:
                cutstring = str(particle.postCutConfig.value) + ' < extraInfo(SignalProbability)'

            ma.mergeListsWithBestDuplicate(particle.identifier, [c.name for c in particle.channels],
                                           variable='particleSource', writeOut=True, path=path)

            if self.config.monitor:
                hist_variables = ['mcErrors', 'mcParticleStatus', 'extraInfo(uniqueSignal)', 'extraInfo(SignalProbability)',
                                  particle.mvaConfig.target, 'extraInfo(decayModeID)']
                hist_variables_2d = [('extraInfo(decayModeID)', particle.mvaConfig.target),
                                     ('extraInfo(decayModeID)', 'mcErrors'),
                                     ('extraInfo(decayModeID)', 'mcParticleStatus')]
                filename = 'Monitor_PostReconstruction_BeforePostCut.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.applyCuts(particle.identifier, cutstring, path=path)

            if self.config.monitor:
                filename = 'Monitor_PostReconstruction_BeforeRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

            ma.rankByHighest(particle.identifier, 'extraInfo(SignalProbability)',
                             particle.postCutConfig.bestCandidateCut, 'postCut_rank', path=path)

            if self.config.monitor:
                hist_variables += ['extraInfo(postCut_rank)']
                hist_variables_2d += [('extraInfo(decayModeID)', 'extraInfo(postCut_rank)'),
                                      (particle.mvaConfig.target, 'extraInfo(postCut_rank)'),
                                      ('mcErrors', 'extraInfo(postCut_rank)'),
                                      ('mcParticleStatus', 'extraInfo(postCut_rank)')]
                filename = 'Monitor_PostReconstruction_AfterRanking.root'
                ma.variablesToHistogram(
                    particle.identifier,
                    variables=config.variables2binnings(hist_variables),
                    variables_2d=config.variables2binnings_2d(hist_variables_2d),
                    filename=config.removeJPsiSlash(filename),
                    directory=config.removeJPsiSlash(f'{particle.identifier}'),
                    path=path)

                variables = ['extraInfo(SignalProbability)', 'mcErrors', 'mcParticleStatus', particle.mvaConfig.target,
                             'extraInfo(uniqueSignal)', 'extraInfo(decayModeID)']

                if 'B_s0' == particle.name:
                    variables += ['Mbc']
                elif 'B' in particle.name:
                    variables += ['Mbc', 'cosThetaBetweenParticleAndNominalB']

                filename = 'Monitor_Final.root'
                ma.variablesToNtuple(particle.identifier, variables, treename=config.removeJPsiSlash(
                    f'{particle.identifier} variables'), filename=config.removeJPsiSlash(filename), path=path)
        return path
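
# After this path has run, each merged particle list (e.g. 'B+:generic' with the
# default channels) carries the FEI extra infos. A hypothetical downstream
# analysis could write them out like this:
#
#   ma.variablesToNtuple('B+:generic',
#                        ['extraInfo(SignalProbability)', 'extraInfo(decayModeID)',
#                         'extraInfo(uniqueSignal)', 'Mbc', 'deltaE'],
#                        filename='fei_output.root', path=path)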


class Teacher:
    """
    Performs all necessary trainings for all training data files which are
    available but for which no weight file is available yet.
    This class is usually used by the do_trainings function below, to perform the necessary trainings after each stage.
    The trainings are run in parallel using Python's multiprocessing module.
    Each training is done by a subprocess call; the training command (passed by config.externTeacher) can be either
      * basf2_mva_teacher, in which case the training is done directly on the machine
      * externClustTeacher, in which case the training is submitted to the batch system of KEKCC
    """

    #: Maximum number of samples used in the MVA trainings
    MaximumNumberOfMVASamples = int(1e7)

    #: Minimum number of samples required per class, otherwise the channel is skipped
    MinimumNumberOfMVASamples = int(5e2)

    def __init__(self, particles: typing.Sequence[config.Particle], config: config.FeiConfiguration):
        """
        Create a new Teacher object
        @param particles list of config.Particle objects
        @param config config.FeiConfiguration object
        """
        #: list of config.Particle objects
        self.particles = particles
        #: config.FeiConfiguration object
        self.config = config

    @staticmethod
    def create_fake_weightfile(channel: str):
        """
        Create a fake weight file using the trivial method, it will always return 0.0
        @param channel for which we create a fake weight file
        """
        content = f"""
            <?xml version="1.0" encoding="utf-8"?>
            <method>Trivial</method>
            <weightfile>{channel}.xml</weightfile>
            <treename>tree</treename>
            <target_variable>isSignal</target_variable>
            <weight_variable>__weight__</weight_variable>
            <signal_class>1</signal_class>
            <max_events>0</max_events>
            <number_feature_variables>1</number_feature_variables>
            <variable0>M</variable0>
            <number_spectator_variables>0</number_spectator_variables>
            <number_data_files>1</number_data_files>
            <datafile0>train.root</datafile0>
            <Trivial_version>1</Trivial_version>
            <Trivial_output>0</Trivial_output>
            <signal_fraction>0.066082567</signal_fraction>
            """
        with open(channel + ".xml", "w") as f:
            f.write(content)

    @staticmethod
    def check_if_weightfile_is_fake(filename: str):
        """
        Checks if the provided filename belongs to a fake weight file or not
        @param filename the filename of the weight file
        """
        try:
            return '<method>Trivial</method>' in open(filename).readlines()[2]
        except BaseException:
            return True

    def upload(self, channel: str):
        """
        Upload the weight file into the condition database
        @param channel whose weight file is uploaded
        """
        disk = channel + '.xml'
        dbase = self.config.prefix + '_' + channel
        basf2_mva.upload(disk, dbase)
        return (disk, dbase)

    def do_all_trainings(self):
        """
        Do all trainings for which we find training data
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa
        # FEI uses multiprocessing for parallel execution of tasks, therefore
        # the ROOT gui-thread is disabled, which otherwise sometimes interferes
        ROOT.PyConfig.StartGuiThread = False
        job_list = []
        filename = 'training_input.root'
        if not os.path.isfile(filename):
            B2WARNING("Training of MVC failed. Couldn't find ROOT file. "
                      "No weight files will be provided.")
        else:
            f = ROOT.TFile.Open(filename, 'read')
            if f.IsZombie():
                B2WARNING("Training of MVC failed. ROOT file corrupt. "
                          "No weight files will be provided.")
            elif len([k.GetName() for k in f.GetListOfKeys()]) == 0:
                B2WARNING("Training of MVC failed. ROOT file does not contain any trees. "
                          "No weight files will be provided.")
            else:
                for particle in self.particles:
                    for channel in particle.channels:
                        weightfile = channel.label + '.xml'
                        if not basf2_mva.available(weightfile):
                            keys = [m for m in f.GetListOfKeys() if f"{channel.label}" in m.GetName()]
                            if not keys:
                                continue
                            tree = keys[0].ReadObj()
                            nSig = tree.GetEntries(channel.mvaConfig.target + ' == 1.0')
                            nBg = tree.GetEntries(channel.mvaConfig.target + ' != 1.0')
                            if nSig < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few signal events {nSig}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            if nBg < Teacher.MinimumNumberOfMVASamples:
                                B2WARNING("Training of MVC failed. "
                                          f"Tree contains too few background events {nBg}. Ignoring channel {channel}.")
                                self.create_fake_weightfile(channel.label)
                                self.upload(channel.label)
                                continue
                            variable_str = "' '".join(channel.mvaConfig.variables)

                            command = (f"{self.config.externTeacher}"
                                       f" --method '{channel.mvaConfig.method}'"
                                       f" --target_variable '{channel.mvaConfig.target}'"
                                       f" --treename '{channel.label} variables' --datafile 'training_input.root'"
                                       f" --signal_class 1 --variables '{variable_str}'"
                                       f" --identifier '{channel.label}.xml'"
                                       f" {channel.mvaConfig.config} > '{channel.label}'.log 2>&1")
                            B2INFO(f"Used following command to invoke teacher: \n {command}")
                            job_list.append((channel.label, command))
            f.Close()
        p = multiprocessing.Pool(None, maxtasksperchild=1)
        func = functools.partial(subprocess.call, shell=True)
        p.map(func, [c for _, c in job_list])
        p.close()
        p.join()
        weightfiles = []
        for name, _ in job_list:
            if not basf2_mva.available(name + '.xml'):
                B2WARNING("Training of MVC failed. For unknown reasons, check the logfile")
                self.create_fake_weightfile(name)
            weightfiles.append(self.upload(name))
        return weightfiles


def convert_legacy_training(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Convert an old FEI training into the new format.
    The old format used hashes for the weight files; the hashes can be converted to the new naming scheme
    using the Summary.pickle file written by FEIv3. This file must be passed via the parameter configuration.legacy.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    summary = pickle.load(open(configuration.legacy, 'rb'))
    channel2lists = {k: v[2] for k, v in summary['channel2lists'].items()}

    teacher = Teacher(particles, configuration)

    for particle in particles:
        for channel in particle.channels:
            new_weightfile = configuration.prefix + '_' + channel.label
            old_weightfile = configuration.prefix + '_' + channel2lists[channel.label.replace('Jpsi', 'J/psi')]
            if not basf2_mva.available(new_weightfile):
                if old_weightfile is None or not basf2_mva.available(old_weightfile):
                    Teacher.create_fake_weightfile(channel.label)
                    teacher.upload(channel.label)
                else:
                    basf2_mva.download(old_weightfile, channel.label + '.xml')
                    teacher.upload(channel.label)


def get_stages_from_particles(particles: typing.Sequence[config.Particle]):
    """
    Returns the hierarchical structure of the FEI.
    Each stage depends on the particles in the previous stage.
    The final stage is empty (meaning everything is done, and the training is finished at this point).
    @param particles list of config.Particle objects
    """
    stages = [
        [p for p in particles if p.name in ['e+', 'K+', 'pi+', 'mu+', 'gamma', 'p+', 'K_L0']],
        [p for p in particles if p.name in ['pi0', 'J/psi', 'Lambda0']],
        [p for p in particles if p.name in ['K_S0', 'Sigma+']],
        [p for p in particles if p.name in ['D+', 'D0', 'D_s+', 'Lambda_c+']],
        [p for p in particles if p.name in ['D*+', 'D*0', 'D_s*+']],
        [p for p in particles if p.name in ['B0', 'B+', 'B_s0']],
        []
    ]

    for p in particles:
        if p.name not in [q.name for stage in stages for q in stage]:
            raise RuntimeError(f"Unknown particle {p.name}: Not implemented in FEI")

    return stages
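
# For illustration: with a particle list containing only pi+, pi0 and D0,
# get_stages_from_particles would return (schematically)
#
#   [[pi+], [pi0], [], [D0], [], [], []]
#
# i.e. every particle is sorted into its hierarchy level and the remaining
# stages stay empty.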


def do_trainings(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration):
    """
    Performs the training of mva classifiers for all available training data.
    This function must either be called by the user after each stage of the FEI during training,
    or (more likely) it is called by the distributed.py script after merging the outputs of all jobs.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @return list of tuples with weight file on disk and identifier in database for all trained classifiers
    """
    teacher = Teacher(particles, configuration)
    return teacher.do_all_trainings()
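
# Typical invocation between two training iterations (sketch; assumes the
# Summary.pickle of the previous iteration is present in the working directory):
#
#   particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
#   do_trainings(particles, configuration)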


def save_summary(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration, cache: int):
    """
    Creates the Summary.pickle, which is used to keep track of the stage during the training,
    and can be used later to investigate which configuration was used exactly to create the training.
    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    @param cache current cache level
    """
    configuration = config.FeiConfiguration(configuration.prefix, cache,
                                            configuration.monitor, configuration.legacy, configuration.externTeacher,
                                            configuration.training)
    # Backup existing Summary.pickle files
    for i in [8, 7, 6, 5, 4, 3, 2, 1, 0]:
        if os.path.isfile(f'Summary.pickle.backup_{i}'):
            shutil.copyfile(f'Summary.pickle.backup_{i}', f'Summary.pickle.backup_{i + 1}')
    if os.path.isfile('Summary.pickle'):
        shutil.copyfile('Summary.pickle', 'Summary.pickle.backup_0')
    pickle.dump((particles, configuration), open('Summary.pickle', 'wb'))


def get_path(particles: typing.Sequence[config.Particle], configuration: config.FeiConfiguration) -> FeiState:
    """
    The most important function of the FEI.
    This creates the FEI path for training/fitting (both terms are equivalent) and application/inference (both terms are equivalent).
    The whole FEI is defined by the particles which are reconstructed (see default_channels.py)
    and the configuration (see config.py).

    TRAINING
    For training this function is called multiple times; each time the FEI reconstructs one more stage
    in the hierarchical structure, i.e. we start with the FSPs, then pi0, K_S0, ..., D, D*, and finally the B mesons.
    You have to set configuration.training to True for training mode.
    All weight files created during the training will be stored in your local database.
    If you want to use the FEI training everywhere without copying this database by hand, you have to upload your local database
    to the central database first (see the documentation of the Belle II Condition Database).

    APPLICATION
    For application you call this function once, and it returns the whole path which will reconstruct B mesons
    with an associated signal probability. You have to set configuration.training to False for application mode.

    MONITORING
    You can always turn on the monitoring (configuration.monitor = True)
    to write out ROOT histograms of many quantities for each stage;
    using these histograms you can run the printReporting.py or latexReporting.py scripts to automatically create pdf files.

    LEGACY
    This function can also use old FEI trainings (version 3): just pass the Summary.pickle file of the old training,
    and the weight files will be automatically converted to the new naming scheme.

    @param particles list of config.Particle objects
    @param configuration config.FeiConfiguration object
    """
    print(r"""
    ____ _  _ _    _       ____ _  _ ____ _  _ ___    _ _  _ ___ ____ ____ ___  ____ ____ ___ ____ ___ _ ____ _  _
    |___ |  | |    |       |___ |  | |___ |\ |  |     | |\ |  |  |___ |__/ |__] |__/ |___  |  |__|  |  | |  | |\ |
    |    |__| |___ |___    |___  \/  |___ | \|  |     | | \|  |  |___ |  \ |    |  \ |___  |  |  |  |  | |__| | \|

    Author: Thomas Keck 2014 - 2017
    Please cite my PhD thesis
    """)

    # The cache parameter of the configuration object is used during training to keep track
    # of which reconstruction steps have already been performed.
    # For fitting/training we start by default with -1, meaning we still have to create the TrainingDataInformation,
    # which is used to determine the number of candidates we have to write out for the FSP trainings in stage 0.
    # For inference/application we start by default with 0, because we don't need the TrainingDataInformation in stage 0.
    # During the training we save the particles and configuration (including the current cache stage) in the Summary.pickle object.
    if configuration.cache is None:
        if os.path.isfile('Summary.pickle'):
            print("Cache: Replaced particles and configuration with the ones from Summary.pickle!")
            particles, configuration = pickle.load(open('Summary.pickle', 'rb'))
            cache = configuration.cache
        else:
            if configuration.training:
                cache = -1
            else:
                cache = 0
    else:
        cache = configuration.cache

    # Now we start building the training or application path
    path = basf2.create_path()

    # There are in total 7 stages.
    # For training we start with -1 and go up to 7 one stage at a time.
    # For application we can run stages 0 to 7 at once.
    stages = get_stages_from_particles(particles)

    # If the user provided a Summary.pickle file of a FEIv3 training we
    # convert the old weight files (with hashes) to the new naming scheme.
    # Afterwards the algorithm runs as usual.
    if configuration.legacy is not None:
        convert_legacy_training(particles, configuration)

    # During the training we require the number of MC particles in the whole processed
    # data sample, because we don't want to write out billions of e.g. pion candidates.
    # Knowing the total amount of MC particles we can write out only every e.g. 10th candidate.
    # That's why we have to write out the TrainingDataInformation before doing anything else during the training phase.
    # During application we only need this if we run in monitor mode and want to write out a summary in the end;
    # the summary contains efficiencies, and the efficiency calculation requires the total number of MC particles.
    training_data_information = TrainingDataInformation(particles)
    if cache < 0 and configuration.training:
        print("Stage 0: Run over all files to count the number of events and McParticles")
        path.add_path(training_data_information.reconstruct())
        if configuration.training:
            save_summary(particles, configuration, 0)
        return FeiState(path, 0, [])
    elif not configuration.training and configuration.monitor:
        path.add_path(training_data_information.reconstruct())

    # We load the Final State particles.
    # It is assumed that the user takes care of adding RootInput, Geometry, and everything
    # which is required to read in data, so we directly start to load the FSP particles
    # used by the FEI.
    loader = FSPLoader(particles, configuration)
    if cache < 1:
        print("Stage 0: Load FSP particles")
        path.add_path(loader.reconstruct())

    # Now we reconstruct each stage one after another.
    # Each stage consists of two parts:
    # PreReconstruction (before the mva method is applied):
    # - Particle combination
    # - Vertex fitting
    # - Some simple cuts and best candidate selection
    # PostReconstruction (after the mva method is applied):
    # - Apply the mva method
    # - Apply cuts on the mva output and best candidate selection
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are in training mode,
    # we have to create the training data. The training itself is done by the do_trainings function, which is called
    # either by the user after each step, or by the distributed.py script.
    #
    # If the weight files for the PostReconstruction are not available for the current stage and we are not in training mode,
    # we keep going; as soon as the user calls process on the produced path, they will get an error message that the
    # weight files are missing.
    #
    # Finally we keep track of the ParticleLists we use, so the user can run the RemoveParticles module to reduce the size of the
    # intermediate output of RootOutput.
    used_lists = []
    for stage, stage_particles in enumerate(stages):
        pre_reconstruction = PreReconstruction(stage_particles, configuration)
        if cache <= stage:
            print(f"Stage {stage}: PreReconstruct particles: ", [p.name for p in stage_particles])
            path.add_path(pre_reconstruction.reconstruct())

        post_reconstruction = PostReconstruction(stage_particles, configuration)
        if configuration.training and not post_reconstruction.available():
            print(f"Stage {stage}: Create training data for particles: ", [p.name for p in stage_particles])
            mc_counts = training_data_information.get_mc_counts()
            training_data = TrainingData(stage_particles, configuration, mc_counts)
            path.add_path(training_data.reconstruct())
            used_lists += [channel.name for particle in stage_particles for channel in particle.channels]
            break
        if cache <= stage + 1:
            path.add_path(post_reconstruction.reconstruct())
        used_lists += [particle.identifier for particle in stage_particles]

    # If we run in monitor mode we are interested in the ModuleStatistics;
    # these statistics contain the runtime of each module which was run.
    if configuration.monitor:
        output = basf2.register_module('RootOutput')
        output.param('outputFileName', 'Monitor_ModuleStatistics.root')
        output.param('branchNames', ['EventMetaData'])  # cannot be removed, but has only a small effect on file size
        output.param('branchNamesPersistent', ['ProcessStatistics'])
        output.param('ignoreCommandLineOverride', True)
        path.add_module(output)

    # As mentioned above, the FEI keeps track of the stages which are already reconstructed during the training,
    # so we write out the Summary.pickle here and increase the stage by one.
    if configuration.training or configuration.monitor:
        save_summary(particles, configuration, stage + 1)

    # Finally we return the path, the stage and the used lists to the user.
    return FeiState(path, stage + 1, plists=used_lists)
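
# Training mode in a nutshell (sketch): because each stage needs the classifiers
# of the previous one, the same steering file is processed once per stage, and
# do_trainings() is called in between. Schematically:
#
#   for _ in range(8):                                  # stage -1 up to stage 6
#       feistate = get_path(particles, configuration)   # resumes via Summary.pickle
#       path = basf2.create_path()
#       # ... the user adds input modules here ...
#       path.add_path(feistate.path)
#       basf2.process(path)
#       do_trainings(particles, configuration)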

# @endcond