Belle II Software  release-08-01-10
DeepFlavorTagger.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import json
12 import os
13 import basf2_mva
14 from basf2 import B2ERROR, B2FATAL
15 import basf2
16 from variables import variables as vm
17 import modularAnalysis as ma
18 
19 
def get_variables(particle_list, ranked_variable, variables=None, particleNumber=1):
    """ creates the variable name pattern requested by the basf2 variable getVariableByRank()

    One name is produced for every combination of an entry of ``variables`` and a rank
    between 1 and ``particleNumber`` (inclusive). The names are grouped by variable, with
    the ranks in ascending order inside each group.

    :param particle_list: string, particle list name inserted as first argument of the pattern
    :param ranked_variable: string, ranking variable inserted as second argument of the pattern
    :param variables: list[string] or None, variables inserted as third argument of the pattern;
        ``None`` is treated like an empty list
    :param particleNumber: int, highest rank generated for each variable (ranks start at 1)
    :return: list[string], the generated ``getVariableByRank(...)`` names
    """
    if variables is None:
        # The default value cannot be iterated; without any variable there is nothing to build.
        return []

    pattern = 'getVariableByRank({}, {}, {}, {})'
    return [
        pattern.format(particle_list, ranked_variable, var, i_num)
        for var in variables
        for i_num in range(1, particleNumber + 1)
    ]
34 
35 
def construct_default_variable_names(particle_lists=None, ranked_variable='p', variables=None, particleNumber=5):
    """ construct the ROOT compatible default variable names

    The names are generated by ``get_variables`` for every particle list in turn and are
    then passed one by one through ``Belle2.MakeROOTCompatible.makeROOTCompatible``.

    :param particle_lists: list[string] or None, particle lists the names are built for;
        ``None`` selects ``['pi+:pos_charged', 'pi+:neg_charged']``
    :param ranked_variable: string, ranking variable handed on to ``get_variables``
    :param variables: list[string], variables handed on to ``get_variables``
    :param particleNumber: int, number of ranks handed on to ``get_variables``
    :return: list[string], the converted names, ordered by particle list
    """
    from ROOT import Belle2  # noqa

    selected_lists = ['pi+:pos_charged', 'pi+:neg_charged'] if particle_lists is None else particle_lists

    # collect the plain names of all lists first ...
    plain_names = [
        name
        for list_name in selected_lists
        for name in get_variables(list_name, ranked_variable, variables, particleNumber)
    ]

    # ... and convert them afterwards
    return [Belle2.MakeROOTCompatible.makeROOTCompatible(name) for name in plain_names]
58 
59 
def DeepFlavorTagger(particle_lists, mode='expert', working_dir='', uniqueIdentifier='standard', variable_list=None,
                     target='qrCombined', overwrite=False,
                     transform_to_probability=False, signal_fraction=-1.0, classifier_args=None,
                     train_valid_fraction=.92, mva_steering_file='analysis/scripts/dft/tensorflow_dnn_interface.py',
                     maskName='all',
                     path=None):
    """
    Interface for the DeepFlavorTagger. This function can be used for training (``teacher``), preparation of
    training datasets (``sampler``) and inference (``expert``).

    This function requires a reconstructed B meson signal particle list for which a RestOfEvent has been built.

    :param particle_lists: string or list[string], particle list(s) of the reconstructed signal B meson
    :param mode: string, valid modes are ``expert`` (default), ``teacher``, ``sampler``
    :param working_dir: string, working directory for the method
    :param uniqueIdentifier: string, database identifier for the method
    :param variable_list: list[string], name of the basf2 variables used for discrimination.
                          A default list is used in ``sampler`` and ``teacher`` mode if nothing is given;
                          in ``expert`` mode a given list is ignored and an error message is printed.
    :param target: string, target variable
    :param overwrite: bool, overwrite already (locally!) existing training data, checked in ``sampler`` mode
    :param transform_to_probability: bool, enable a purity transformation to compensate potential over-training,
                                     can only be set during training
    :param signal_fraction: float, (experimental) signal fraction override,
                            transform the output to a probability if an uneven signal/background fraction is used
                            in the training data, can only be set during training.
                            NOTE(review): in this function the value is only handed to the ``signalFraction``
                            parameter of the ``MVAExpert`` module in ``expert`` mode - confirm the description.
    :param classifier_args: dictionary, customized arguments for the mlp. The dictionary is copied, the object
                            passed by the caller is not modified.
                            Possible attributes of the dictionary are:
                            lr_dec_rate: learning rate decay rate
                            lr_init: learning rate initial value
                            mom_init: momentum initial value
                            min_epochs: minimal number of epochs
                            max_epochs: maximal number of epochs
                            stop_epochs: epochs to stop without improvements on the validation set for early stopping
                            batch_size: batch size
                            seed: random seed for tensorflow
                            layers: [[layer name, activation function, input_width, output_width, init_bias,
                            init_weights],..]
                            wd_coeffs: weight decay coefficients, length of layers
                            cuda_visible_devices: selection of cuda devices
                            tensorboard_dir: additional directory for logging the training process
    :param train_valid_fraction: float, train-valid fraction (.92). If transform to probability is
                                 enabled, train valid fraction will be split into a test set (.5)
    :param mva_steering_file: string, steering file handed to the Python mva method in ``teacher`` mode and
                              written into the teacher command in ``sampler`` mode
    :param maskName: get ROE particles from a specified ROE mask
    :param path: basf2 path obj, the RestOfEvent path built here is attached to it with ``for_each``
    :return: None
    """

    if isinstance(particle_lists, str):
        particle_lists = [particle_lists]

    if mode not in ['expert', 'teacher', 'sampler']:
        B2FATAL('Invalid mode %s' % mode)

    if variable_list is None and mode in ['sampler', 'teacher']:
        variable_list = [
            'useCMSFrame(p)',
            'useCMSFrame(cosTheta)',
            'useCMSFrame(phi)',
            'kaonID',
            'electronID',
            'muonID',
            'protonID',
            'nCDCHits',
            'nPXDHits',
            'nSVDHits',
            'dz',
            'dr',
            'chiProb']

    if variable_list is not None and mode == 'expert':
        B2ERROR('DFT: Variables from identifier file are used. Input variables will be ignored.')

    if classifier_args is None:
        classifier_args = {}
    elif isinstance(classifier_args, dict):
        # work on a copy, the entry added below must not leak into the dictionary of the caller
        classifier_args = dict(classifier_args)
    else:
        # explicit check instead of an assert, which would vanish when python runs with -O
        B2FATAL('classifier_args has to be a dictionary, got %s' % type(classifier_args).__name__)

    classifier_args['transform_to_prob'] = transform_to_probability

    output_file_name = os.path.join(working_dir, uniqueIdentifier + '_training_data.root')

    # create roe specific paths
    roe_path = basf2.create_path()
    dead_end_path = basf2.create_path()

    # define dft specific lists to enable multiple calls, if someone really wants to do that
    extension = particle_lists[0].replace(':', '_to_')
    roe_particle_list_cut = ''
    roe_particle_list = 'pi+:dft' + '_' + extension

    tree_name = 'dft_variables'

    # filter rest of events only for specific particle list
    ma.signalSideParticleListsFilter(particle_lists, 'hasRestOfEventTracks > 0', roe_path, dead_end_path)

    # TODO: particles with empty rest of events seems not to show up in efficiency statistics anymore

    # create final state particle lists
    ma.fillParticleList(roe_particle_list, roe_particle_list_cut, path=roe_path)

    dft_particle_lists = ['pi+:pos_charged', 'pi+:neg_charged']

    pos_cut = 'charge > 0 and isInRestOfEvent == 1 and passesROEMask(' + maskName + ') > 0.5 and p < infinity'
    neg_cut = 'charge < 0 and isInRestOfEvent == 1 and passesROEMask(' + maskName + ') > 0.5 and p < infinity'

    ma.cutAndCopyList(dft_particle_lists[0], roe_particle_list, pos_cut, writeOut=True, path=roe_path)
    ma.cutAndCopyList(dft_particle_lists[1], roe_particle_list, neg_cut, writeOut=True, path=roe_path)

    # sort pattern for tagging specific variables
    rank_variable = 'p'
    # rank_variable = 'useCMSFrame(p)'

    # create tagging specific variables, the expert takes them from the identifier file instead
    if mode != 'expert':
        features = get_variables(dft_particle_lists[0], rank_variable, variable_list, particleNumber=5)
        features += get_variables(dft_particle_lists[1], rank_variable, variable_list, particleNumber=5)

    for particles in dft_particle_lists:
        ma.rankByHighest(particles, rank_variable, path=roe_path)

    if mode == 'sampler':
        if os.path.isfile(output_file_name) and not overwrite:
            B2FATAL('Outputfile %s already exists. Aborting writeout.' % output_file_name)

        # and add target
        all_variables = features + [target]

        # write to ntuples
        ma.variablesToNtuple('', all_variables, tree_name, output_file_name, roe_path)

        # write the command line output for the extern teacher to a file
        extern_command = 'basf2_mva_teacher --datafile {output_file_name} --treename {tree_name}' \
                         ' --identifier {identifier} --variables "{variables_string}" --target_variable {target}' \
                         ' --method Python --training_fraction {fraction}' \
                         " --config '{classifier_args}' --framework tensorflow" \
                         ' --steering_file {steering_file}'\
                         ''.format(output_file_name=output_file_name, tree_name=tree_name,
                                   identifier=uniqueIdentifier,
                                   variables_string='" "'.join(features), target=target,
                                   classifier_args=json.dumps(classifier_args), fraction=train_valid_fraction,
                                   steering_file=mva_steering_file)

        with open(os.path.join(working_dir, uniqueIdentifier + '_teacher_command'), 'w') as f:
            f.write(extern_command)

    elif mode == 'teacher':
        if not os.path.isfile(output_file_name):
            B2FATAL('There is no training data file available. Run flavor tagger in sampler mode first.')
        general_options = basf2_mva.GeneralOptions()
        general_options.m_datafiles = basf2_mva.vector(output_file_name)

        general_options.m_treename = tree_name
        general_options.m_target_variable = target
        general_options.m_variables = basf2_mva.vector(*features)

        general_options.m_identifier = uniqueIdentifier

        specific_options = basf2_mva.PythonOptions()
        specific_options.m_framework = 'tensorflow'
        specific_options.m_steering_file = mva_steering_file
        specific_options.m_training_fraction = train_valid_fraction

        specific_options.m_config = json.dumps(classifier_args)

        # the training is run right here, not as part of the basf2 path
        basf2_mva.teacher(general_options, specific_options)

    elif mode == 'expert':

        # the info builder is added to the main path, everything else runs inside the roe path
        flavorTaggerInfoBuilder = basf2.register_module('FlavorTaggerInfoBuilder')
        path.add_module(flavorTaggerInfoBuilder)

        expert_module = basf2.register_module('MVAExpert')
        expert_module.param('listNames', particle_lists)
        expert_module.param('identifier', uniqueIdentifier)

        expert_module.param('extraInfoName', 'dnn_output')
        expert_module.param('signalFraction', signal_fraction)

        roe_path.add_module(expert_module)

        flavorTaggerInfoFiller = basf2.register_module('FlavorTaggerInfoFiller')
        flavorTaggerInfoFiller.param('DNNmlp', True)
        roe_path.add_module(flavorTaggerInfoFiller)

        # Create standard alias for the output of the flavor tagger
        vm.addAlias('DNN_qrCombined', 'qrOutput(DNN)')

    # attach the roe path to the main path, this is done in every mode
    path.for_each('RestOfEvent', 'RestOfEvents', roe_path)
static std::string makeROOTCompatible(std::string str)
Remove special characters that ROOT dislikes in branch names, e.g.