Belle II Software  release-05-01-25
basf2_mva_evaluate.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 import basf2_mva_util
5 
6 from basf2_mva_evaluation import plotting
7 import argparse
8 import tempfile
9 
10 import numpy as np
11 from B2Tools import b2latex, format
12 from basf2 import B2INFO
13 
14 import ROOT
15 
16 import os
17 import shutil
18 import collections
19 
20 
def getCommandLineOptions():
    """
    Defines the command line interface of the MVA evaluation script and parses sys.argv.

    The options -id, -train and -data may be given several times and accept several values each,
    therefore argparse returns them as a list of lists (train_datafiles is None if never given).
    @return argparse.Namespace with the attributes identifiers, train_datafiles, datafiles, treename,
            outputfile, working_directory, fillnan and compile
    """
    # Settings shared by all options which collect one list of values per occurrence
    multi_value = dict(type=str, action='append', nargs='+')

    option_table = (
        (('-id', '--identifiers'),
         dict(multi_value, dest='identifiers', required=True,
              help='DB Identifier or weightfile')),
        (('-train', '--train_datafiles'),
         dict(multi_value, dest='train_datafiles', required=False,
              help='Data file containing ROOT TTree used during training')),
        (('-data', '--datafiles'),
         dict(multi_value, dest='datafiles', required=True,
              help='Data file containing ROOT TTree with independent test data')),
        (('-tree', '--treename'),
         dict(dest='treename', type=str, default='tree',
              help='Treename in data file')),
        (('-out', '--outputfile'),
         dict(dest='outputfile', type=str, default='output.zip',
              help='Name of the created .zip archive file if not compiling or a pdf file if compilation is successful.')),
        (('-w', '--working_directory'),
         dict(dest='working_directory', type=str, default='',
              help="""Working directory where the created images and root files are stored,
              default is to create a temporary directory.""")),
        (('-n', '--fillnan'),
         dict(dest='fillnan', action='store_true',
              help='Fill nan and inf values with actual numbers')),
        (('-c', '--compile'),
         dict(dest='compile', action='store_true',
              help='Compile latex to pdf')),
    )

    parser = argparse.ArgumentParser()
    # Register the options in the order of the table, this is also the order shown by --help
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
42 
43 
def unique(input):
    """
    Drops repeated elements from an iterable, the order of first appearance is preserved.
    Elements are compared by equality, so they do not need to be hashable.
    @param input iterable containing the elements
    @return new list in which every distinct element occurs exactly once
    """
    distinct = []
    for element in input:
        if element in distinct:
            continue
        distinct.append(element)
    return distinct
54 
55 
def create_abbreviations(names, length=5):
    """
    Maps every name to a short abbreviation consisting of its leading characters.
    Names sharing the same leading characters get a running number appended
    (starting at 1, in order of appearance), so that their abbreviations differ.
    @param names sequence of strings which should be abbreviated, it is traversed twice
    @param length number of leading characters used for the abbreviation
    @return collections.OrderedDict mapping name to abbreviation, in the order of names
    """
    # First pass: how many names share each prefix
    prefix_total = collections.Counter(name[:length] for name in names)

    # Second pass: number the names whose prefix is not unique
    prefix_seen = collections.Counter()
    abbreviations = collections.OrderedDict()
    for name in names:
        prefix = name[:length]
        if prefix_total[prefix] > 1:
            prefix_seen[prefix] += 1
            abbreviations[name] = prefix + str(prefix_seen[prefix])
        else:
            abbreviations[name] = prefix
    return abbreviations
75 
76 
if __name__ == '__main__':
    # Evaluates one or more trained MVA methods on an independent dataset (and optionally on the
    # training dataset) and writes a LaTeX report with the evaluation plots, either as compiled pdf
    # or as zip archive containing the TeX file and all plots.

    ROOT.gROOT.SetBatch(True)

    old_cwd = os.getcwd()
    args = getCommandLineOptions()

    # The repeatable options are delivered as list of lists, flatten them
    identifiers = sum(args.identifiers, [])
    identifier_abbreviations = create_abbreviations(identifiers)

    datafiles = sum(args.datafiles, [])

    print("Load methods")
    methods = [basf2_mva_util.Method(identifier) for identifier in identifiers]

    print("Apply experts on independent data")
    test_probability = {}
    test_target = {}
    for method in methods:
        p, t = method.apply_expert(datafiles, args.treename)
        test_probability[identifier_abbreviations[method.identifier]] = p
        test_target[identifier_abbreviations[method.identifier]] = t

    print("Apply experts on training data")
    # Both dictionaries stay empty if no training data was given, the plots comparing
    # training and test data are skipped in this case.
    train_probability = {}
    train_target = {}
    if args.train_datafiles is not None:
        train_datafiles = sum(args.train_datafiles, [])
        for method in methods:
            p, t = method.apply_expert(train_datafiles, args.treename)
            train_probability[identifier_abbreviations[method.identifier]] = p
            train_target[identifier_abbreviations[method.identifier]] = t

    variables = unique(v for method in methods for v in method.variables)
    variable_abbreviations = create_abbreviations(variables)
    root_variables = unique(v for method in methods for v in method.root_variables)

    spectators = unique(v for method in methods for v in method.spectators)
    spectator_abbreviations = create_abbreviations(spectators)
    root_spectators = unique(v for method in methods for v in method.root_spectators)

    print("Load variables array")
    rootchain = ROOT.TChain(args.treename)
    for datafile in datafiles:
        rootchain.Add(datafile)

    variables_data = basf2_mva_util.tree2dict(rootchain, root_variables, list(variable_abbreviations.values()))

    if args.fillnan:
        for column in variable_abbreviations.values():
            np.nan_to_num(variables_data[column], copy=False)

    spectators_data = basf2_mva_util.tree2dict(rootchain, root_spectators, list(spectator_abbreviations.values()))

    print("Create latex file")
    # Change working directory after experts run, because they might want to access
    # a localdb in the current working directory.
    with tempfile.TemporaryDirectory() as tempdir:
        if args.working_directory == '':
            os.chdir(tempdir)
        else:
            os.chdir(args.working_directory)

        o = b2latex.LatexFile()
        o += b2latex.TitlePage(title='Automatic MVA Evaluation',
                               authors=[r'Thomas Keck\\ Moritz Gelb\\ Nils Braun'],
                               abstract='Evaluation plots',
                               add_table_of_contents=True).finish()

        o += b2latex.Section("Classifiers")
        o += b2latex.String(r"""
        This section contains the GeneralOptions and SpecificOptions of all classifiers represented by an XML tree.
        The same information can be retrieved using the basf2\_mva\_info tool.
        """)

        table = b2latex.LongTable(r"ll", "Abbreviations of identifiers", "{name} & {abbr}", r"Identifier & Abbreviation")
        for identifier in identifiers:
            table.add(name=format.string(identifier), abbr=format.string(identifier_abbreviations[identifier]))
        o += table.finish()

        for method in methods:
            o += b2latex.SubSection(format.string(method.identifier))
            o += b2latex.Listing(language='XML').add(method.description).finish()

        o += b2latex.Section("Variables")
        o += b2latex.String("""
        This section contains an overview of the importance and correlation of the variables used by the classifiers.
        And distribution plots of the variables on the independent dataset. The distributions are normed for signal and
        background separately, and only the region +- 3 sigma around the mean is shown.
        """)

        table = b2latex.LongTable(r"ll", "Abbreviations of variables", "{name} & {abbr}", r"Variable & Abbreviation")
        for v in variables:
            table.add(name=format.string(v), abbr=format.string(variable_abbreviations[v]))
        o += table.finish()

        o += b2latex.SubSection("Importance")
        graphics = b2latex.Graphics()
        p = plotting.Importance()
        # A variable which is not used by a method gets the importance 0.0 for this method
        p.add({identifier_abbreviations[i.identifier]: np.array([i.importances.get(v, 0.0) for v in variables]) for i in methods},
              identifier_abbreviations.values(), variable_abbreviations.values())
        p.finish()
        p.save('importance.pdf')
        graphics.add('importance.pdf', width=1.0)
        o += graphics.finish()

        o += b2latex.SubSection("Correlation")
        # The target of the first identifier defines the signal and background masks
        # of all plots which do not belong to one specific classifier.
        first_identifier_abbr = list(identifier_abbreviations.values())[0]
        graphics = b2latex.Graphics()
        # Every plot is drawn with its own plotting object
        p = plotting.CorrelationMatrix()
        p.add(variables_data, variable_abbreviations.values(),
              test_target[first_identifier_abbr] == 1,
              test_target[first_identifier_abbr] == 0)
        p.finish()
        p.save('correlation_plot.pdf')
        graphics.add('correlation_plot.pdf', width=1.0)
        o += graphics.finish()

        for v in variables:
            variable_abbr = variable_abbreviations[v]
            o += b2latex.SubSection(format.string(v))
            graphics = b2latex.Graphics()
            p = plotting.VerboseDistribution(normed=True, range_in_std=3)
            p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 1, label="Signal")
            p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 0, label="Background")
            p.finish()
            # The hash of the name is used because the name itself may contain characters unsuited for a filename
            plot_file = 'variable_{}.pdf'.format(hash(v))
            p.save(plot_file)
            graphics.add(plot_file, width=1.0)
            o += graphics.finish()

        o += b2latex.Section("Classifier Plot")
        # Adjacent string literals are joined without separator, so each part has to end with a space
        o += b2latex.String("This section contains the receiver operating characteristics (ROC), purity projection, ... "
                            "of the classifiers on training and independent data. "
                            "The legend of each plot contains the shortened identifier and the area under the ROC curve "
                            "in parenthesis.")

        o += b2latex.Section("ROC Plot")
        graphics = b2latex.Graphics()
        p = plotting.RejectionOverEfficiency()
        for identifier in identifier_abbreviations.values():
            p.add(test_probability, identifier, test_target[identifier] == 1, test_target[identifier] == 0)
        p.finish()
        p.axis.set_title("ROC Rejection Plot on independent data")
        p.save('roc_plot_test.pdf')
        graphics.add('roc_plot_test.pdf', width=1.0)
        o += graphics.finish()

        if train_probability:
            # One additional ROC plot per classifier comparing training and independent data
            for identifier in identifiers:
                graphics = b2latex.Graphics()
                p = plotting.RejectionOverEfficiency()
                identifier_abbr = identifier_abbreviations[identifier]
                p.add(train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
                      train_target[identifier_abbr] == 0, label='Train')
                p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                      test_target[identifier_abbr] == 0, label='Test')
                p.finish()
                p.axis.set_title(identifier)
                plot_file = 'roc_test_{}.pdf'.format(hash(identifier))
                p.save(plot_file)
                graphics.add(plot_file, width=1.0)
                o += graphics.finish()

        o += b2latex.Section("Classification Results")

        for identifier in identifiers:
            identifier_abbr = identifier_abbreviations[identifier]
            o += b2latex.SubSection(format.string(identifier_abbr))
            graphics = b2latex.Graphics()
            # Two sub plots showing the same classifier output, once normed (index 0) and once not normed (index 1)
            p = plotting.Multiplot(plotting.PurityAndEfficiencyOverCut, 2)
            p.add(0, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                  test_target[identifier_abbr] == 0, normed=True)
            p.sub_plots[0].axis.set_title("Classification result in test data for {identifier}".format(identifier=identifier))

            p.add(1, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
                  test_target[identifier_abbr] == 0, normed=False)
            p.sub_plots[1].axis.set_title("Classification result in test data for {identifier}".format(identifier=identifier))
            p.finish()

            plot_file = 'classification_result_{identifier}.pdf'.format(identifier=hash(identifier))
            p.save(plot_file)
            graphics.add(plot_file, width=1)
            o += graphics.finish()

        o += b2latex.Section("Diagonal Plot")
        graphics = b2latex.Graphics()
        p = plotting.Diagonal()
        for identifier in identifiers:
            # Look up the abbreviation first, otherwise the title would show the abbreviation
            # left over from the previous loop iteration.
            identifier_abbr = identifier_abbreviations[identifier]
            o += b2latex.SubSection(format.string(identifier_abbr))
            p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1, test_target[identifier_abbr] == 0)
        p.finish()
        p.axis.set_title("Diagonal plot on independent data")
        p.save('diagonal_plot_test.pdf')
        graphics.add('diagonal_plot_test.pdf', width=1.0)
        o += graphics.finish()

        if train_probability:
            o += b2latex.SubSection("Overtraining Plot")
            for identifier in identifiers:
                identifier_abbr = identifier_abbreviations[identifier]
                # Concatenate training and test data, train_mask marks the entries stemming from the training data
                probability = {identifier_abbr: np.r_[train_probability[identifier_abbr], test_probability[identifier_abbr]]}
                target = np.r_[train_target[identifier_abbr], test_target[identifier_abbr]]
                train_mask = np.r_[np.ones(len(train_target[identifier_abbr])), np.zeros(len(test_target[identifier_abbr]))]
                graphics = b2latex.Graphics()
                p = plotting.Overtraining()
                p.add(probability, identifier_abbr,
                      train_mask == 1, train_mask == 0,
                      target == 1, target == 0, )
                p.finish()
                p.axis.set_title("Overtraining check for {}".format(identifier))
                plot_file = 'overtraining_plot_{}.pdf'.format(hash(identifier))
                p.save(plot_file)
                graphics.add(plot_file, width=1.0)
                o += graphics.finish()

        o += b2latex.Section("Spectators")
        o += b2latex.String("This section contains the distribution and dependence on the "
                            "classifier outputs of all spectator variables.")

        table = b2latex.LongTable(r"ll", "Abbreviations of spectators", "{name} & {abbr}", r"Spectator & Abbreviation")
        for s in spectators:
            table.add(name=format.string(s), abbr=format.string(spectator_abbreviations[s]))
        o += table.finish()

        for spectator in spectators:
            spectator_abbr = spectator_abbreviations[spectator]
            o += b2latex.SubSection(format.string(spectator))
            graphics = b2latex.Graphics()
            p = plotting.VerboseDistribution()
            p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 1, label="Signal")
            p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 0, label="Background")
            p.finish()
            plot_file = 'spectator_{}.pdf'.format(hash(spectator))
            p.save(plot_file)
            graphics.add(plot_file, width=1.0)
            o += graphics.finish()

            for identifier in identifiers:
                o += b2latex.SubSubSection(format.string(spectator) + " with classifier " + format.string(identifier))
                identifier_abbr = identifier_abbreviations[identifier]
                data = {identifier_abbr: test_probability[identifier_abbr], spectator_abbr: spectators_data[spectator_abbr]}
                graphics = b2latex.Graphics()
                p = plotting.Correlation()
                p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
                      test_target[identifier_abbr] == 1,
                      test_target[identifier_abbr] == 0)
                p.finish()
                plot_file = 'correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier))
                p.save(plot_file)
                graphics.add(plot_file, width=1.0)
                o += graphics.finish()

        if args.compile:
            B2INFO(f"Creating a PDF file at {args.outputfile}. Please remove the '-c' switch if this fails.")
            o.save('latex.tex', compile=True)
        else:
            B2INFO(f"Creating a .zip archive containing plots and a TeX file at {args.outputfile}. "
                   "Please unpack the archive and compile the latex.tex file with pdflatex.")
            o.save('latex.tex', compile=False)

        # Go back before the results are copied, so that a relative output file
        # is resolved with respect to the directory the user started in.
        os.chdir(old_cwd)
        if args.working_directory == '':
            working_directory = tempdir
        else:
            working_directory = args.working_directory

        # This has to happen inside the with-block, the temporary directory is removed afterwards
        if args.compile:
            shutil.copy(os.path.join(working_directory, 'latex.pdf'), args.outputfile)
        else:
            base_name = os.path.join(old_cwd, args.outputfile.rsplit('.', 1)[0])
            shutil.make_archive(base_name, 'zip', working_directory)
plotting.VerboseDistribution
Definition: plotting.py:937
basf2_mva_util.tree2dict
def tree2dict(tree, tree_columns, dict_columns=None)
Definition: basf2_mva_util.py:16
plotting.Correlation
Definition: plotting.py:1012
plotting.PurityAndEfficiencyOverCut
Definition: plotting.py:241
plotting.RejectionOverEfficiency
Definition: plotting.py:396
plotting.Diagonal
Definition: plotting.py:495
plotting.CorrelationMatrix
Definition: plotting.py:1183
basf2_mva_util.Method
Definition: basf2_mva_util.py:81
plotting.Importance
Definition: plotting.py:1130
plotting.Overtraining
Definition: plotting.py:813
plotting.Multiplot
Definition: plotting.py:445