Belle II Software  release-06-02-00
basf2_mva_evaluate.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 import basf2_mva_util
13 
14 from basf2_mva_evaluation import plotting
15 import argparse
16 import tempfile
17 
18 import numpy as np
19 from B2Tools import b2latex, format
20 from basf2 import B2INFO
21 
22 import ROOT
23 
24 import os
25 import shutil
26 import collections
27 from typing import List, Any
28 
29 
def get_argument_parser() -> argparse.ArgumentParser:
    """
    Build the command line parser of the MVA evaluation script.

    The options are declared in a table and registered in table order, so the
    generated help lists them in exactly that order. Nothing is parsed here.
    @return the configured argparse.ArgumentParser
    """
    # Each entry: (option strings, keyword arguments handed to add_argument).
    # The list-valued options combine action='append' with nargs='+', so they
    # are parsed into a list of lists (one inner list per occurrence of the flag).
    option_table = [
        (('-id', '--identifiers'),
         dict(dest='identifiers', type=str, required=True, action='append', nargs='+',
              help='DB Identifier or weightfile')),
        (('-train', '--train_datafiles'),
         dict(dest='train_datafiles', type=str, required=False, action='append', nargs='+',
              help='Data file containing ROOT TTree used during training')),
        (('-data', '--datafiles'),
         dict(dest='datafiles', type=str, required=True, action='append', nargs='+',
              help='Data file containing ROOT TTree with independent test data')),
        (('-tree', '--treename'),
         dict(dest='treename', type=str, default='tree', help='Treename in data file')),
        (('-out', '--outputfile'),
         dict(dest='outputfile', type=str, default='output.zip',
              help='Name of the created .zip archive file if not compiling or a pdf file if compilation is successful.')),
        (('-w', '--working_directory'),
         dict(dest='working_directory', type=str, default='',
              help='Working directory where the created images and root files are stored,\n'
                   ' default is to create a temporary directory.')),
        (('-n', '--fillnan'),
         dict(dest='fillnan', action='store_true',
              help='Fill nan and inf values with actual numbers')),
        (('-c', '--compile'),
         dict(dest='compile', action='store_true',
              help='Compile latex to pdf directly')),
    ]

    arg_parser = argparse.ArgumentParser()
    for option_strings, settings in option_table:
        arg_parser.add_argument(*option_strings, **settings)
    return arg_parser
50 
51 
def unique(input_list: List[Any]) -> List[Any]:
    """
    Drops repeated elements while preserving the order of first occurrence.
    Elements are compared by equality, so unhashable elements are accepted.
    @param input_list iterable with the elements to filter
    @return new list holding each distinct element once
    """
    kept = []
    for element in input_list:
        if element in kept:
            # Already recorded by an earlier occurrence
            continue
        kept.append(element)
    return kept
62 
63 
def create_abbreviations(names, length=5):
    """
    Maps every name to a short label made of its first characters.
    A prefix shared by more than one entry of names gets a running number
    (1, 2, ...) appended, in the order in which the entries appear.
    @param names sequence of names to abbreviate
    @param length number of leading characters used as abbreviation
    @return collections.OrderedDict mapping name -> abbreviation, in input order
    """
    prefixes = [name[:length] for name in names]
    # How often each prefix occurs overall decides whether it needs a number
    occurrences = collections.Counter(prefixes)
    # Running index handed out to the entries sharing a prefix
    running_index = collections.Counter()

    shortened = collections.OrderedDict()
    for name, prefix in zip(names, prefixes):
        if occurrences[prefix] > 1:
            running_index[prefix] += 1
            shortened[name] = prefix + str(running_index[prefix])
        else:
            shortened[name] = prefix
    return shortened
83 
84 
if __name__ == '__main__':
    # Script entry point.
    # 1. Load every requested method and apply it to the independent (and optionally the training) data.
    # 2. Read the input variables and spectators of all methods from the data files.
    # 3. Fill a b2latex document with abbreviation tables and one plot per section.
    # 4. Write either the compiled PDF or a .zip archive of the working directory to args.outputfile.

    ROOT.gROOT.SetBatch(True)

    # Remembered because the working directory is changed further down
    old_cwd = os.getcwd()
    parser = get_argument_parser()
    args = parser.parse_args()

    # The list-valued options are parsed with action='append' and nargs='+',
    # i.e. as a list of lists, and are flattened here.
    identifiers = [identifier for group in args.identifiers for identifier in group]
    identifier_abbreviations = create_abbreviations(identifiers)

    datafiles = [datafile for group in args.datafiles for datafile in group]

    print("Load methods")
    methods = [basf2_mva_util.Method(identifier) for identifier in identifiers]

    # All result dictionaries below are keyed by the abbreviated identifier
    print("Apply experts on independent data")
    test_probability = {}
    test_target = {}
    for method in methods:
        p, t = method.apply_expert(datafiles, args.treename)
        test_probability[identifier_abbreviations[method.identifier]] = p
        test_target[identifier_abbreviations[method.identifier]] = t

    print("Apply experts on training data")
    # Both dictionaries stay empty if no training files were given; the
    # train/test comparison plots are skipped in that case.
    train_probability = {}
    train_target = {}
    if args.train_datafiles is not None:
        train_datafiles = [datafile for group in args.train_datafiles for datafile in group]
        for method in methods:
            p, t = method.apply_expert(train_datafiles, args.treename)
            train_probability[identifier_abbreviations[method.identifier]] = p
            train_target[identifier_abbreviations[method.identifier]] = t

    # Union of the variables / spectators of all methods, in order of first appearance
    variables = unique(v for method in methods for v in method.variables)
    variable_abbreviations = create_abbreviations(variables)
    root_variables = unique(v for method in methods for v in method.root_variables)

    spectators = unique(v for method in methods for v in method.spectators)
    spectator_abbreviations = create_abbreviations(spectators)
    root_spectators = unique(v for method in methods for v in method.root_spectators)

    print("Load variables array")
    rootchain = ROOT.TChain(args.treename)
    for datafile in datafiles:
        rootchain.Add(datafile)

    variables_data = basf2_mva_util.tree2dict(rootchain, root_variables, list(variable_abbreviations.values()))

    if args.fillnan:
        # copy=False: the arrays are modified in place
        for column in variable_abbreviations.values():
            np.nan_to_num(variables_data[column], copy=False)

    spectators_data = basf2_mva_util.tree2dict(rootchain, root_spectators, list(spectator_abbreviations.values()))

    print("Create latex file")
    # Change working directory after experts run, because they might want to access
    # a localdb in the current working directory.
    with tempfile.TemporaryDirectory() as tempdir:
        if args.working_directory == '':
            os.chdir(tempdir)
        else:
            os.chdir(args.working_directory)

        o = b2latex.LatexFile()
        o += b2latex.TitlePage(title='Automatic MVA Evaluation',
                               authors=[r'Thomas Keck\\ Moritz Gelb\\ Nils Braun'],
                               abstract='Evaluation plots',
                               add_table_of_contents=True).finish()

        # ---- Classifiers: abbreviation table and description of every method ----
        o += b2latex.Section("Classifiers")
        o += b2latex.String(r"""
 This section contains the GeneralOptions and SpecificOptions of all classifiers represented by an XML tree.
 The same information can be retrieved using the basf2\_mva\_info tool.
 """)

        table = b2latex.LongTable(r"ll", "Abbreviations of identifiers", "{name} & {abbr}", r"Identifier & Abbreviation")
        for identifier in identifiers:
            table.add(name=format.string(identifier), abbr=format.string(identifier_abbreviations[identifier]))
        o += table.finish()

        for method in methods:
            o += b2latex.SubSection(format.string(method.identifier))
            o += b2latex.Listing(language='XML').add(method.description).finish()

        # ---- Variables: abbreviation table, importance, correlation and distributions ----
        o += b2latex.Section("Variables")
        o += b2latex.String("""
 This section contains an overview of the importance and correlation of the variables used by the classifiers.
 And distribution plots of the variables on the independent dataset. The distributions are normed for signal and
 background separately, and only the region +- 3 sigma around the mean is shown.
 """)

        table = b2latex.LongTable(r"ll", "Abbreviations of variables", "{name} & {abbr}", r"Variable & Abbreviation")
        for v in variables:
            table.add(name=format.string(v), abbr=format.string(variable_abbreviations[v]))
        o += table.finish()

        o += b2latex.SubSection("Importance")
        graphics = b2latex.Graphics()
        p = plotting.Importance()
        # A variable a method does not report an importance for counts as 0.0
        p.add({identifier_abbreviations[i.identifier]: np.array([i.importances.get(v, 0.0) for v in variables]) for i in methods},
              identifier_abbreviations.values(), variable_abbreviations.values())
        p.finish()
        p.save('importance.pdf')
        graphics.add('importance.pdf', width=1.0)
        o += graphics.finish()

        o += b2latex.SubSection("Correlation")
        # The signal/background masks of the variable and spectator plots are
        # taken from the target of the first identifier.
        first_identifier_abbr = list(identifier_abbreviations.values())[0]
        graphics = b2latex.Graphics()
        # NOTE(review): the listing drops this line (embedded numbering jumps 194 -> 196), leaving `p`
        # bound to the finished importance plot. Constructor restored from the plotting API as
        # remembered -- confirm against the upstream file.
        p = plotting.CorrelationMatrix()
        p.add(variables_data, variable_abbreviations.values(),
              test_target[first_identifier_abbr] == 1,
              test_target[first_identifier_abbr] == 0)
        p.finish()
        p.save('correlation_plot.pdf')
        graphics.add('correlation_plot.pdf', width=1.0)
        o += graphics.finish()

        for v in variables:
            variable_abbr = variable_abbreviations[v]
            o += b2latex.SubSection(format.string(v))
            graphics = b2latex.Graphics()
            p = plotting.VerboseDistribution(normed=True, range_in_std=3)
            p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 1, label="Signal")
            p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 0, label="Background")
            p.finish()
            # hash() turns an arbitrary variable name into a file name; the same
            # expression is evaluated again when the file is referenced.
            p.save('variable_{}.pdf'.format(hash(v)))
            graphics.add('variable_{}.pdf'.format(hash(v)), width=1.0)
            o += graphics.finish()

        # ---- Classifier plots ----
        o += b2latex.Section("Classifier Plot")
        # Adjacent literals are concatenated verbatim, so each piece carries its own trailing space
        o += b2latex.String("This section contains the receiver operating characteristics (ROC), purity projection, ... "
                            "of the classifiers on training and independent data. "
                            "The legend of each plot contains the shortened identifier and the area under the ROC curve "
                            "in parenthesis.")

        o += b2latex.Section("ROC Plot")
        graphics = b2latex.Graphics()
        # NOTE(review): restored line (listing skips 224); class assumed from the plot title -- confirm upstream.
        p = plotting.RejectionOverEfficiency()
        for identifier in identifier_abbreviations.values():
            p.add(test_probability, identifier, test_target[identifier] == 1, test_target[identifier] == 0)
        p.finish()
        p.axis.set_title("ROC Rejection Plot on independent data")
        p.save('roc_plot_test.pdf')
        graphics.add('roc_plot_test.pdf', width=1.0)
        o += graphics.finish()

        if train_probability:
            # One train-vs-test comparison per classifier
            for identifier in identifiers:
                graphics = b2latex.Graphics()
                # NOTE(review): restored line (listing skips 236); confirm against upstream.
                p = plotting.RejectionOverEfficiency()
                identifier_abbr = identifier_abbreviations[identifier]
                p.add(train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
                      train_target[identifier_abbr] == 0, label='Train')
                p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                      test_target[identifier_abbr] == 0, label='Test')
                p.finish()
                p.axis.set_title(identifier)
                p.save('roc_test_{}.pdf'.format(hash(identifier)))
                graphics.add('roc_test_{}.pdf'.format(hash(identifier)), width=1.0)
                o += graphics.finish()

        o += b2latex.Section("Classification Results")

        for identifier in identifiers:
            identifier_abbr = identifier_abbreviations[identifier]
            o += b2latex.SubSection(format.string(identifier_abbr))
            graphics = b2latex.Graphics()
            # NOTE(review): restored line (listing skips 254). A two-panel plot is implied by the
            # sub_plots[0] / sub_plots[1] accesses below; the panel class is an assumption -- confirm upstream.
            p = plotting.Multiplot(plotting.PurityAndEfficiencyOverCut, 2)
            p.add(0, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                  test_target[identifier_abbr] == 0, normed=True)
            p.sub_plots[0].axis.set_title("Classification result in test data for {identifier}".format(identifier=identifier))

            p.add(1, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                  test_target[identifier_abbr] == 0, normed=False)
            p.sub_plots[1].axis.set_title("Classification result in test data for {identifier}".format(identifier=identifier))
            p.finish()

            p.save('classification_result_{identifier}.pdf'.format(identifier=hash(identifier)))
            graphics.add('classification_result_{identifier}.pdf'.format(identifier=hash(identifier)), width=1)
            o += graphics.finish()

        o += b2latex.Section("Diagonal Plot")
        graphics = b2latex.Graphics()
        p = plotting.Diagonal()
        for identifier in identifiers:
            # Look the abbreviation up before it is used for the heading; reading it first
            # would pick up the value left behind by the previous loop iteration.
            identifier_abbr = identifier_abbreviations[identifier]
            o += b2latex.SubSection(format.string(identifier_abbr))
            p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1, test_target[identifier_abbr] == 0)
        p.finish()
        p.axis.set_title("Diagonal plot on independent data")
        p.save('diagonal_plot_test.pdf')
        graphics.add('diagonal_plot_test.pdf', width=1.0)
        o += graphics.finish()

        if train_probability:
            o += b2latex.SubSection("Overtraining Plot")
            for identifier in identifiers:
                identifier_abbr = identifier_abbreviations[identifier]
                # Training and test samples are concatenated; train_mask is 1 for
                # the training part and 0 for the test part.
                probability = {identifier_abbr: np.r_[train_probability[identifier_abbr], test_probability[identifier_abbr]]}
                target = np.r_[train_target[identifier_abbr], test_target[identifier_abbr]]
                train_mask = np.r_[np.ones(len(train_target[identifier_abbr])), np.zeros(len(test_target[identifier_abbr]))]
                graphics = b2latex.Graphics()
                # NOTE(review): restored line (listing skips 289); class assumed from the section title -- confirm upstream.
                p = plotting.Overtraining()
                p.add(probability, identifier_abbr,
                      train_mask == 1, train_mask == 0,
                      target == 1, target == 0, )
                p.finish()
                p.axis.set_title("Overtraining check for {}".format(identifier))
                p.save('overtraining_plot_{}.pdf'.format(hash(identifier)))
                graphics.add('overtraining_plot_{}.pdf'.format(hash(identifier)), width=1.0)
                o += graphics.finish()

        # ---- Spectators: distributions and dependence on the classifier outputs ----
        o += b2latex.Section("Spectators")
        o += b2latex.String("This section contains the distribution and dependence on the "
                            "classifier outputs of all spectator variables.")

        table = b2latex.LongTable(r"ll", "Abbreviations of spectators", "{name} & {abbr}", r"Spectator & Abbreviation")
        for s in spectators:
            table.add(name=format.string(s), abbr=format.string(spectator_abbreviations[s]))
        o += table.finish()

        for spectator in spectators:
            spectator_abbr = spectator_abbreviations[spectator]
            o += b2latex.SubSection(format.string(spectator))
            graphics = b2latex.Graphics()
            # NOTE(review): restored line (listing skips 312); constructor arguments, if any, are unknown -- confirm upstream.
            p = plotting.VerboseDistribution()
            p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 1, label="Signal")
            p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 0, label="Background")
            p.finish()
            p.save('spectator_{}.pdf'.format(hash(spectator)))
            graphics.add('spectator_{}.pdf'.format(hash(spectator)), width=1.0)
            o += graphics.finish()

            for identifier in identifiers:
                o += b2latex.SubSubSection(format.string(spectator) + " with classifier " + format.string(identifier))
                identifier_abbr = identifier_abbreviations[identifier]
                data = {identifier_abbr: test_probability[identifier_abbr], spectator_abbr: spectators_data[spectator_abbr]}
                graphics = b2latex.Graphics()
                # NOTE(review): restored line (listing skips 325); class assumed from the output file name -- confirm upstream.
                p = plotting.Correlation()
                p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
                      test_target[identifier_abbr] == 1,
                      test_target[identifier_abbr] == 0)
                p.finish()
                p.save('correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)))
                graphics.add('correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)), width=1.0)
                o += graphics.finish()

        # ---- Write the document ----
        if args.compile:
            B2INFO(f"Creating a PDF file at {args.outputfile}. Please remove the '-c' switch if this fails.")
            o.save('latex.tex', compile=True)
        else:
            B2INFO(f"Creating a .zip archive containing plots and a TeX file at {args.outputfile}. "
                   "Please unpack the archive and compile the latex.tex file with pdflatex.")
            o.save('latex.tex', compile=False)

        # Back to the start directory so that a relative args.outputfile is resolved against it.
        # This still happens inside the with-block: the temporary directory is removed when it exits.
        os.chdir(old_cwd)
        if args.working_directory == '':
            working_directory = tempdir
        else:
            working_directory = args.working_directory

        if args.compile:
            shutil.copy(os.path.join(working_directory, 'latex.pdf'), args.outputfile)
        else:
            # make_archive appends '.zip' itself, hence the extension is cut off first
            base_name = os.path.join(old_cwd, args.outputfile.rsplit('.', 1)[0])
            shutil.make_archive(base_name, 'zip', working_directory)
def tree2dict(tree, tree_columns, dict_columns=None)