13from basf2_mva_evaluation
import plotting
14from basf2
import conditions
19from B2Tools
import b2latex, format
20from basf2
import B2INFO
25from typing
import List, Any
def get_argument_parser() -> argparse.ArgumentParser:
    """
    Builds the command line parser of the MVA evaluation tool.

    Nothing is parsed here; the caller is expected to invoke ``parse_args()``
    on the returned object. All options that use ``action='append'`` together
    with ``nargs='+'`` produce a list of lists (one inner list per occurrence
    of the option), which the caller has to flatten.

    @return the fully configured argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser()

    # Inputs: classifiers and the datasets they are evaluated on.
    parser.add_argument('-id', '--identifiers', dest='identifiers', type=str, required=True,
                        action='append', nargs='+',
                        help='DB Identifier or weightfile')
    parser.add_argument('-train', '--train_datafiles', dest='train_datafiles', type=str, required=False,
                        action='append', nargs='+',
                        help='Data file containing ROOT TTree used during training')
    parser.add_argument('-data', '--datafiles', dest='datafiles', type=str, required=True,
                        action='append', nargs='+',
                        help='Data file containing ROOT TTree with independent test data')
    parser.add_argument('-tree', '--treename', dest='treename', type=str, default='tree',
                        help='Treename in data file')

    # Outputs: final artefact and the scratch area used to build it.
    parser.add_argument('-out', '--outputfile', dest='outputfile', type=str, default='output.zip',
                        help='Name of the created .zip archive file if not compiling or a pdf file if compilation is successful.')
    parser.add_argument('-w', '--working_directory', dest='working_directory', type=str, default='',
                        help="""Working directory where the created images and root files are stored,
                              default is to create a temporary directory.""")

    # Conditions database configuration.
    parser.add_argument('-l', '--localdb', dest='localdb', type=str, action='append', nargs='+', required=False,
                        help="""path or list of paths to local database(s) containing the mvas of interest.
                                The testing payloads are prepended and take precedence over payloads in global tags.""")
    parser.add_argument('-g', '--globaltag', dest='globaltag', type=str, action='append', nargs='+', required=False,
                        help='globaltag or list of globaltags containing the mvas of interest. The globaltags are prepended.')

    # Behavioural switches.
    parser.add_argument('-n', '--fillnan', dest='fillnan', action='store_true',
                        help='Fill nan and inf values with actual numbers')
    parser.add_argument('-c', '--compile', dest='compile', action='store_true',
                        help='Compile latex to pdf directly')
    parser.add_argument('-a', '--abbreviation_length', dest='abbreviation_length',
                        action='store', type=int, default=10,
                        help='Number of characters to which variable names are abbreviated.')
    parser.add_argument('-s', '--importance-scale', dest='importance_scale',
                        choices=['normalized', 'hundredzero'], default='normalized',
                        help='Scaling applied to importance values before plotting. '
                             '"normalized" (default): each column is divided by its sum and multiplied by 100; '
                             '"hundredzero": each column is rescaled so the minimum is 0 and the maximum is 100.')
    return parser
def unique(input_list: List[Any]) -> List[Any]:
    """
    Returns a list containing only unique elements, keeps the original order of the list
    @param input_list list containing the elements
    @return new list holding the first occurrence of every element, in input order
    """
    output = []
    for element in input_list:
        # Membership is tested by equality so unhashable elements are accepted too.
        if element not in output:
            output.append(element)
    return output
def flatten(input_list: List[List[Any]]) -> List[Any]:
    """
    Flattens a list of lists
    @param input_list list of lists to be flattened
    @return single list with the items of all sublists, in their original order
    """
    # Exactly one nesting level is removed; deeper nesting is left untouched.
    return [item for sublist in input_list for item in sublist]
def smart_abbreviation(name):
    """
    Shortens a variable name by abbreviating or removing common fragments
    @param name the full name to shorten
    @return the name after all substitutions have been applied
    """
    # The substitutions are applied strictly in this order; each one sees the
    # result of the previous ones, so the sequence must not be reordered.
    substitutions = (
        ("daughter", "d"),
        ("Angle", "Ang"),
        ("useCMSFrame", ""),
        ("useLabFrame", ""),
        ("useRestFrame", ""),
        ("formula", ""),
        ("(", ""),
        (")", ""),
        ("conditionalVariableSelector", ""),
        (",", ""),
        (" ", ""),
    )
    shortName = name
    for fragment, replacement in substitutions:
        shortName = shortName.replace(fragment, replacement)
    return shortName
def create_abbreviations(names, length=5):
    """
    Creates a shortened label for every name
    @param names the names to abbreviate
    @param length maximum number of characters kept from the shortened name
    @return collections.OrderedDict mapping each name to its abbreviation, in input order

    Each name is passed through smart_abbreviation and cut to `length` characters.
    Whenever several names end up with the same shortened form, a running number
    (1, 2, ...) is appended to each of them in order of appearance.
    """
    # Materialise once: the names are walked twice and may come from a one-shot iterable.
    names = list(names)
    shortened = [smart_abbreviation(name)[:length] for name in names]

    # First pass: how often does each shortened form occur overall?
    count = collections.Counter(shortened)

    # Second pass: assign the labels, numbering only the ambiguous ones.
    count2 = collections.Counter()
    abbreviations = collections.OrderedDict()
    for name, abbreviation in zip(names, shortened):
        abbreviations[name] = abbreviation
        if count[abbreviation] > 1:
            count2[abbreviation] += 1
            abbreviations[name] += str(count2[abbreviation])
    return abbreviations
120if __name__ ==
'__main__':
123 ROOT.PyConfig.IgnoreCommandLineOptions =
True
124 ROOT.PyConfig.StartGuiThread =
False
125 ROOT.gROOT.SetBatch(
True)
127 old_cwd = os.getcwd()
128 parser = get_argument_parser()
129 args = parser.parse_args()
131 identifiers = flatten(args.identifiers)
132 identifier_abbreviations = create_abbreviations(identifiers, args.abbreviation_length)
134 datafiles = flatten(args.datafiles)
135 if args.localdb
is not None:
136 for localdb
in flatten(args.localdb):
137 conditions.prepend_testing_payloads(localdb)
139 if args.globaltag
is not None:
140 for tag
in flatten(args.globaltag):
141 conditions.prepend_globaltag(tag)
143 print(
"Load methods")
146 print(
"Apply experts on independent data")
147 test_probability = {}
149 for method
in methods:
150 p, t = method.apply_expert(datafiles, args.treename)
151 test_probability[identifier_abbreviations[method.identifier]] = p
152 test_target[identifier_abbreviations[method.identifier]] = t
154 print(
"Apply experts on training data")
155 train_probability = {}
157 if args.train_datafiles
is not None:
158 train_datafiles = sum(args.train_datafiles, [])
159 for method
in methods:
160 p, t = method.apply_expert(train_datafiles, args.treename)
161 train_probability[identifier_abbreviations[method.identifier]] = p
162 train_target[identifier_abbreviations[method.identifier]] = t
164 variables = unique(v
for method
in methods
for v
in method.variables)
165 variable_abbreviations = create_abbreviations(variables, args.abbreviation_length)
166 root_variables = unique(v
for method
in methods
for v
in method.root_variables)
168 spectators = unique(v
for method
in methods
for v
in method.spectators)
169 spectator_abbreviations = create_abbreviations(spectators, args.abbreviation_length)
170 root_spectators = unique(v
for method
in methods
for v
in method.root_spectators)
172 print(
"Load variables array")
173 rootchain = ROOT.TChain(args.treename)
174 rootchain_spec = ROOT.TChain(args.treename)
175 for datafile
in datafiles:
176 rootchain.Add(datafile)
177 rootchain_spec.Add(datafile)
186 variable_abbreviations.values()) +
188 spectator_abbreviations.values()))
190 if train_probability:
191 rootchain_train = ROOT.TChain(args.treename)
192 rootchain_train_spec = ROOT.TChain(args.treename)
193 for train_datafile
in train_datafiles:
194 rootchain_train.Add(train_datafile)
195 rootchain_train_spec.Add(train_datafile)
197 spectators_train_data =
basf2_mva_util.chain2dict(rootchain_train, root_spectators, list(spectator_abbreviations.values()))
199 rootchain_train_spec,
203 variable_abbreviations.values()) +
205 spectator_abbreviations.values()))
208 for column
in variable_abbreviations.values():
209 np.nan_to_num(variables_data[column], copy=
False)
210 np.nan_to_num(varSpec_data[column], copy=
False)
211 if train_probability:
212 np.nan_to_num(variables_train_data[column], copy=
False)
213 np.nan_to_num(varSpec_train_data[column], copy=
False)
215 for column
in spectator_abbreviations.values():
216 np.nan_to_num(spectators_data[column], copy=
False)
217 np.nan_to_num(varSpec_data[column], copy=
False)
218 if train_probability:
219 np.nan_to_num(spectators_train_data[column], copy=
False)
220 np.nan_to_num(varSpec_train_data[column], copy=
False)
222 print(
"Create latex file")
225 with tempfile.TemporaryDirectory()
as tempdir:
226 if args.working_directory ==
'':
229 os.chdir(args.working_directory)
231 with open(
'abbreviations.txt',
'w')
as f:
232 f.write(
'Identifier Abbreviation : Identifier \n')
233 for name, abbrev
in identifier_abbreviations.items():
234 f.write(f
'\t{abbrev} : {name}\n')
235 f.write(
'\n\n\nVariable Abbreviation : Variable \n')
236 for name, abbrev
in variable_abbreviations.items():
237 f.write(f
'\t{abbrev} : {name}\n')
238 f.write(
'\n\n\nSpectator Abbreviation : Spectator \n')
239 for name, abbrev
in spectator_abbreviations.items():
240 f.write(f
'\t{abbrev} : {name}\n')
242 o = b2latex.LatexFile()
243 o += b2latex.TitlePage(title=
'Automatic MVA Evaluation',
244 authors=[
r'Thomas Keck\\ Moritz Gelb\\ Nils Braun'],
245 abstract=
'Evaluation plots',
246 add_table_of_contents=
True).finish()
248 o += b2latex.Section(
"Classifiers")
249 o += b2latex.String(
r"""
250 This section contains the GeneralOptions and SpecificOptions of all classifiers represented by an XML tree.
251 The same information can be retrieved using the basf2\_mva\_info tool.
254 table = b2latex.LongTable(
r"ll",
"Abbreviations of identifiers",
"{name} & {abbr}",
r"Identifier & Abbreviation")
255 for identifier
in identifiers:
256 table.add(name=format.string(identifier), abbr=format.string(identifier_abbreviations[identifier]))
259 for method
in methods:
260 o += b2latex.SubSection(format.string(method.identifier))
261 o += b2latex.Listing(language=
'XML').add(method.description).finish()
263 o += b2latex.Section(
"Variables")
264 importance_scale_description = {
265 'normalized':
'Each variable\'s importance is shown as a percentage of the total importance '
266 'for that method (column sums to 100), preserving relative magnitudes between variables.',
267 'hundredzero':
'Each column is rescaled so that the least important variable is 0 and the most '
268 'important is 100. Relative magnitudes between variables are not preserved.',
270 o += b2latex.String(f
"""
271 This section contains an overview of the importance and correlation of the variables used by the classifiers.
272 And distribution plots of the variables on the independent dataset. The distributions are normed for signal and
273 background separately, and only the region +- 3 sigma around the mean is shown.
275 {importance_scale_description[args.importance_scale]}
276 If the method does not provide a ranking, all importances will be 0.
279 table = b2latex.LongTable(
r"ll",
"Abbreviations of variables",
"{name} & {abbr}",
r"Variable & Abbreviation")
281 table.add(name=format.string(v), abbr=format.string(variable_abbreviations[v]))
284 o += b2latex.SubSection(
"Importance")
285 graphics = b2latex.Graphics()
287 p.add({identifier_abbreviations[i.identifier]: np.array([i.importances.get(v, 0.0)
for v
in variables])
for i
in methods},
288 identifier_abbreviations.values(), variable_abbreviations.values(),
289 importance_scale=args.importance_scale)
291 p.save(
'importance.pdf')
292 graphics.add(
'importance.pdf', width=1.0)
293 o += graphics.finish()
295 o += b2latex.SubSection(
"Correlation")
296 first_identifier_abbr = list(identifier_abbreviations.values())[0]
297 graphics = b2latex.Graphics()
299 p.add(variables_data, variable_abbreviations.values(),
300 test_target[first_identifier_abbr] == 1,
301 test_target[first_identifier_abbr] == 0)
303 p.save(
'correlation_plot.pdf')
304 graphics.add(
'correlation_plot.pdf', width=1.0)
305 o += graphics.finish()
307 if train_probability:
308 o += b2latex.SubSection(
"Correlation on Training Data")
309 graphics = b2latex.Graphics()
311 p.add(variables_train_data, variable_abbreviations.values(),
312 train_target[first_identifier_abbr] == 1,
313 train_target[first_identifier_abbr] == 0)
315 p.save(
'correlation_plot_train.pdf')
316 graphics.add(
'correlation_plot_train.pdf', width=1.0)
317 o += graphics.finish()
320 variable_abbr = variable_abbreviations[v]
321 o += b2latex.SubSection(format.string(v))
322 graphics = b2latex.Graphics()
324 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 1, label=
"Sig")
325 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 0, label=
"Bkg")
326 if train_probability:
327 p.add(variables_train_data, variable_abbr, train_target[first_identifier_abbr] == 1, label=
"Sig_train")
328 p.add(variables_train_data, variable_abbr, train_target[first_identifier_abbr] == 0, label=
"Bkg_train")
330 p.save(f
'variable_{variable_abbr}_{hash(v)}.pdf')
331 graphics.add(f
'variable_{variable_abbr}_{hash(v)}.pdf', width=1.0)
332 o += graphics.finish()
334 o += b2latex.Section(
"Classifier Plot")
335 o += b2latex.String(
"This section contains the receiver operating characteristics (ROC), purity projection, ..."
336 "of the classifiers on training and independent data."
337 "The legend of each plot contains the shortened identifier and the area under the ROC curve"
347 for plot_class
in plot_classes:
349 o += b2latex.Section(f
"{plot_class.__name__} Plot")
351 graphics = b2latex.Graphics()
353 for i, identifier
in enumerate(identifiers):
354 identifier_abbr = identifier_abbreviations[identifier]
358 test_target[identifier_abbr] == 1,
359 test_target[identifier_abbr] == 0,
360 label=identifier_abbr)
362 p.axis.set_title(f
"{plot_class.__name__} Plot on independent data")
363 p.save(f
'{plot_class.__name__.lower()}_plot_test.pdf')
364 graphics.add(f
'{plot_class.__name__.lower()}_plot_test.pdf', width=1.0)
365 o += graphics.finish()
367 if train_probability:
368 for i, identifier
in enumerate(identifiers):
369 graphics = b2latex.Graphics()
371 identifier_abbr = identifier_abbreviations[identifier]
372 p.add(train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
373 train_target[identifier_abbr] == 0, label=f
'Train {identifier_abbr}')
374 p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
375 test_target[identifier_abbr] == 0, label=f
'Test {identifier_abbr}')
377 p.axis.set_title(f
"{plot_class.__name__} Plot for \n" + identifier)
378 p.save(f
'{plot_class.__name__.lower()}_plot_{hash(identifier)}.pdf')
379 graphics.add(f
'{plot_class.__name__.lower()}_plot_{hash(identifier)}.pdf', width=1.0)
380 o += graphics.finish()
382 o += b2latex.Section(
"Classification Results")
383 for identifier
in identifiers:
384 identifier_abbr = identifier_abbreviations[identifier]
385 o += b2latex.SubSection(format.string(identifier_abbr))
386 graphics = b2latex.Graphics()
387 if train_probability:
391 p.add(0, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
392 test_target[identifier_abbr] == 0, normed=
True)
393 p.sub_plots[0].axis.set_title(f
"Classification result in test data for \n{identifier}")
395 p.add(1, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
396 test_target[identifier_abbr] == 0, normed=
False)
397 p.sub_plots[1].axis.set_title(f
"Classification result in test data for \n{identifier}")
399 if train_probability:
400 p.add(2, train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
401 train_target[identifier_abbr] == 0, normed=
True)
402 p.sub_plots[2].axis.set_title(f
"Classification result in training data for \n{identifier}")
404 p.add(3, train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
405 train_target[identifier_abbr] == 0, normed=
False)
406 p.sub_plots[3].axis.set_title(f
"Classification result in training data for \n{identifier}")
408 p.figure.subplots_adjust(wspace=0.3, hspace=0.3)
410 p.save(f
'classification_result_{hash(identifier)}.pdf')
411 graphics.add(f
'classification_result_{hash(identifier)}.pdf', width=1)
412 o += graphics.finish()
414 o += b2latex.Section(
"Diagonal Plot")
415 graphics = b2latex.Graphics()
417 for identifier
in identifiers:
418 identifier_abbr = identifier_abbreviations[identifier]
422 test_target[identifier_abbr] == 1,
423 test_target[identifier_abbr] == 0,
424 label=identifier_abbr)
426 p.axis.set_title(
"Diagonal plot on independent data")
427 p.save(
'diagonal_plot_test.pdf')
428 graphics.add(
'diagonal_plot_test.pdf', width=1.0)
429 o += graphics.finish()
431 if train_probability:
432 for identifier
in identifiers:
433 identifier_abbr = identifier_abbreviations[identifier]
434 o += b2latex.SubSection(format.string(identifier_abbr))
435 graphics = b2latex.Graphics()
441 train_target[identifier_abbr] == 1,
442 train_target[identifier_abbr] == 0,
447 test_target[identifier_abbr] == 1,
448 test_target[identifier_abbr] == 0,
452 p.axis.set_title(
"Diagonal plot for \n" + identifier)
453 p.save(f
'diagonal_plot_{hash(identifier)}.pdf')
454 graphics.add(f
'diagonal_plot_{hash(identifier)}.pdf', width=1.0)
455 o += graphics.finish()
457 if train_probability:
458 o += b2latex.Section(
"Overtraining Plot")
459 for identifier
in identifiers:
460 identifier_abbr = identifier_abbreviations[identifier]
461 probability = {identifier_abbr: np.r_[train_probability[identifier_abbr], test_probability[identifier_abbr]]}
462 target = np.r_[train_target[identifier_abbr], test_target[identifier_abbr]]
463 train_mask = np.r_[np.ones(len(train_target[identifier_abbr])), np.zeros(len(test_target[identifier_abbr]))]
464 graphics = b2latex.Graphics()
466 p.add(probability, identifier_abbr,
467 train_mask == 1, train_mask == 0,
468 target == 1, target == 0, )
470 p.axis.set_title(f
"Overtraining check for \n{identifier}")
471 p.save(f
'overtraining_plot_{hash(identifier)}.pdf')
472 graphics.add(f
'overtraining_plot_{hash(identifier)}.pdf', width=1.0)
473 o += graphics.finish()
475 o += b2latex.Section(
"Spectators")
476 o += b2latex.String(
"This section contains the distribution and dependence on the"
477 "classifier outputs of all spectator variables.")
479 table = b2latex.LongTable(
r"ll",
"Abbreviations of spectators",
"{name} & {abbr}",
r"Spectator & Abbreviation")
481 table.add(name=format.string(s), abbr=format.string(spectator_abbreviations[s]))
484 for spectator
in spectators:
485 spectator_abbr = spectator_abbreviations[spectator]
486 o += b2latex.SubSection(format.string(spectator))
487 graphics = b2latex.Graphics()
489 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 1, label=
"Sig")
490 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 0, label=
"Bkg")
491 if train_probability:
492 p.add(spectators_train_data, spectator_abbr, train_target[first_identifier_abbr] == 1, label=
"Sig_train")
493 p.add(spectators_train_data, spectator_abbr, train_target[first_identifier_abbr] == 0, label=
"Bkg_train")
495 p.save(f
'spectator_{spectator_abbr}_{hash(spectator)}.pdf')
496 graphics.add(f
'spectator_{spectator_abbr}_{hash(spectator)}.pdf', width=1.0)
497 o += graphics.finish()
499 for identifier
in identifiers:
500 o += b2latex.SubSubSection(format.string(spectator) +
" with classifier " + format.string(identifier))
501 identifier_abbr = identifier_abbreviations[identifier]
502 data = {identifier_abbr: test_probability[identifier_abbr], spectator_abbr: spectators_data[spectator_abbr]}
503 graphics = b2latex.Graphics()
505 p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
506 test_target[identifier_abbr] == 1,
507 test_target[identifier_abbr] == 0)
508 p.figure.subplots_adjust(hspace=0.5)
510 p.save(f
'correlation_plot_{spectator_abbr}_{hash(spectator)}_{hash(identifier)}.pdf')
511 graphics.add(f
'correlation_plot_{spectator_abbr}_{hash(spectator)}_{hash(identifier)}.pdf', width=1.0)
512 o += graphics.finish()
514 if train_probability:
515 o += b2latex.SubSubSection(format.string(spectator) +
" with classifier " +
516 format.string(identifier) +
" on training data")
517 data = {identifier_abbr: train_probability[identifier_abbr],
518 spectator_abbr: spectators_train_data[spectator_abbr]}
519 graphics = b2latex.Graphics()
521 p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
522 train_target[identifier_abbr] == 1,
523 train_target[identifier_abbr] == 0)
524 p.figure.subplots_adjust(hspace=0.5)
526 p.save(f
'correlation_plot_{spectator_abbr}_{hash(spectator)}_{hash(identifier)}_train.pdf')
527 graphics.add(f
'correlation_plot_{spectator_abbr}_{hash(spectator)}_{hash(identifier)}_train.pdf', width=1.0)
528 o += graphics.finish()
530 if len(spectators) > 0:
531 o += b2latex.SubSection(
"Correlation of Spectators")
532 first_identifier_abbr = list(identifier_abbreviations.values())[0]
533 graphics = b2latex.Graphics()
537 list(variable_abbreviations.values()) + list(spectator_abbreviations.values()),
538 test_target[first_identifier_abbr] == 1,
539 test_target[first_identifier_abbr] == 0
542 p.save(
'correlation_spec_plot.pdf')
543 graphics.add(
'correlation_spec_plot.pdf', width=1.0)
544 o += graphics.finish()
546 if train_probability:
547 o += b2latex.SubSection(
"Correlation of Spectators on Training Data")
548 graphics = b2latex.Graphics()
552 list(variable_abbreviations.values()) + list(spectator_abbreviations.values()),
553 train_target[first_identifier_abbr] == 1,
554 train_target[first_identifier_abbr] == 0
557 p.save(
'correlation_spec_plot_train.pdf')
558 graphics.add(
'correlation_spec_plot_train.pdf', width=1.0)
559 o += graphics.finish()
562 B2INFO(f
"Creating a PDF file at {args.outputfile}. Please remove the '-c' switch if this fails.")
563 o.save(
'latex.tex', compile=
True)
565 B2INFO(f
"Creating a .zip archive containing plots and a TeX file at {args.outputfile}."
566 f
"Please unpack the archive and compile the latex.tex file with pdflatex.")
567 o.save(
'latex.tex', compile=
False)
570 if args.working_directory ==
'':
571 working_directory = tempdir
573 working_directory = args.working_directory
576 shutil.copy(os.path.join(working_directory,
'latex.pdf'), args.outputfile)
578 base_name = os.path.join(old_cwd, args.outputfile.rsplit(
'.', 1)[0])
579 shutil.make_archive(base_name,
'zip', working_directory)
chain2dict(chain, tree_columns, dict_columns=None, max_entries=None)