14 from basf2_mva_evaluation
import plotting
19 from B2Tools
import b2latex, format
20 from basf2
import B2INFO
27 from typing
import List, Any
def get_argument_parser() -> argparse.ArgumentParser:
    """
    Builds the command line parser of the MVA evaluation tool.

    The list-valued options (identifiers, train_datafiles, datafiles) combine
    action='append' with nargs='+', so each of them is parsed into a list of
    lists (one inner list per occurrence of the option on the command line).

    @return the configured argparse.ArgumentParser; call parse_args() on it to obtain the arguments
    """
    parser = argparse.ArgumentParser()

    # Classifiers and input files; each option may be repeated and takes one or more values.
    parser.add_argument('-id', '--identifiers', dest='identifiers', type=str, required=True,
                        action='append', nargs='+',
                        help='DB Identifier or weightfile')
    parser.add_argument('-train', '--train_datafiles', dest='train_datafiles', type=str, required=False,
                        action='append', nargs='+',
                        help='Data file containing ROOT TTree used during training')
    parser.add_argument('-data', '--datafiles', dest='datafiles', type=str, required=True,
                        action='append', nargs='+',
                        help='Data file containing ROOT TTree with independent test data')
    parser.add_argument('-tree', '--treename', dest='treename', type=str, default='tree',
                        help='Treename in data file')

    # Output handling.
    parser.add_argument('-out', '--outputfile', dest='outputfile', type=str, default='output.zip',
                        help='Name of the created .zip archive file if not compiling or a pdf file if compilation is successful.')
    # The empty string is the "no directory given" marker that the script checks for.
    parser.add_argument('-w', '--working_directory', dest='working_directory', type=str, default='',
                        help="""Working directory where the created images and root files are stored,
                        default is to create a temporary directory.""")

    # Boolean switches.
    parser.add_argument('-n', '--fillnan', dest='fillnan', action='store_true',
                        help='Fill nan and inf values with actual numbers')
    parser.add_argument('-c', '--compile', dest='compile', action='store_true',
                        help='Compile latex to pdf directly')

    # The caller invokes parse_args() on the result, so the parser has to be handed back.
    return parser
def unique(input_list: List[Any]) -> List[Any]:
    """
    Returns a list containing only unique elements, keeps the original order of the list
    @param input_list list containing the elements (any iterable works, it is traversed once)
    @return new list holding the first occurrence of every element
    """
    output = []
    for element in input_list:
        # Membership is tested on the list itself (equality based), so
        # unhashable elements are supported as well.
        if element not in output:
            output.append(element)
    return output
def create_abbreviations(names, length=5):
    """
    Creates a short abbreviation for every name.

    The abbreviation is the first `length` characters of the name. If more than
    one name yields the same prefix, a running number (starting at 1, in input
    order) is appended to the prefix for each of these names.

    @param names iterable containing the names (strings)
    @param length number of leading characters kept as abbreviation
    @return collections.OrderedDict mapping each name to its abbreviation, in input order
    """
    # The names are walked twice, so materialise them once to also support iterators.
    names = list(names)

    # First pass: how many names share each prefix.
    count = collections.Counter(name[:length] for name in names)

    # Second pass: assign the abbreviations, numbering only the ambiguous prefixes.
    abbreviations = collections.OrderedDict()
    count2 = collections.Counter()
    for name in names:
        abbreviation = name[:length]
        if count[abbreviation] > 1:
            count2[abbreviation] += 1
            abbreviation += str(count2[abbreviation])
        abbreviations[name] = abbreviation
    return abbreviations
85 if __name__ ==
'__main__':
87 ROOT.gROOT.SetBatch(
True)
90 parser = get_argument_parser()
91 args = parser.parse_args()
93 identifiers = sum(args.identifiers, [])
94 identifier_abbreviations = create_abbreviations(identifiers)
96 datafiles = sum(args.datafiles, [])
101 print(
"Apply experts on independent data")
102 test_probability = {}
104 for method
in methods:
105 p, t = method.apply_expert(datafiles, args.treename)
106 test_probability[identifier_abbreviations[method.identifier]] = p
107 test_target[identifier_abbreviations[method.identifier]] = t
109 print(
"Apply experts on training data")
110 train_probability = {}
112 if args.train_datafiles
is not None:
113 train_datafiles = sum(args.train_datafiles, [])
114 for method
in methods:
115 p, t = method.apply_expert(train_datafiles, args.treename)
116 train_probability[identifier_abbreviations[method.identifier]] = p
117 train_target[identifier_abbreviations[method.identifier]] = t
119 variables = unique(v
for method
in methods
for v
in method.variables)
120 variable_abbreviations = create_abbreviations(variables)
121 root_variables = unique(v
for method
in methods
for v
in method.root_variables)
123 spectators = unique(v
for method
in methods
for v
in method.spectators)
124 spectator_abbreviations = create_abbreviations(spectators)
125 root_spectators = unique(v
for method
in methods
for v
in method.root_spectators)
127 print(
"Load variables array")
128 rootchain = ROOT.TChain(args.treename)
129 for datafile
in datafiles:
130 rootchain.Add(datafile)
135 for column
in variable_abbreviations.values():
136 np.nan_to_num(variables_data[column], copy=
False)
140 print(
"Create latex file")
143 with tempfile.TemporaryDirectory()
as tempdir:
144 if args.working_directory ==
'':
147 os.chdir(args.working_directory)
149 o = b2latex.LatexFile()
150 o += b2latex.TitlePage(title=
'Automatic MVA Evaluation',
151 authors=[
r'Thomas Keck\\ Moritz Gelb\\ Nils Braun'],
152 abstract=
'Evaluation plots',
153 add_table_of_contents=
True).finish()
155 o += b2latex.Section(
"Classifiers")
156 o += b2latex.String(
r"""
157 This section contains the GeneralOptions and SpecificOptions of all classifiers represented by an XML tree.
158 The same information can be retrieved using the basf2\_mva\_info tool.
161 table = b2latex.LongTable(
r"ll",
"Abbreviations of identifiers",
"{name} & {abbr}",
r"Identifier & Abbreviation")
162 for identifier
in identifiers:
163 table.add(name=format.string(identifier), abbr=format.string(identifier_abbreviations[identifier]))
166 for method
in methods:
167 o += b2latex.SubSection(format.string(method.identifier))
168 o += b2latex.Listing(language=
'XML').add(method.description).finish()
170 o += b2latex.Section(
"Variables")
171 o += b2latex.String(
"""
172 This section contains an overview of the importance and correlation of the variables used by the classifiers.
173 And distribution plots of the variables on the independent dataset. The distributions are normed for signal and
174 background separately, and only the region +- 3 sigma around the mean is shown.
177 table = b2latex.LongTable(
r"ll",
"Abbreviations of variables",
"{name} & {abbr}",
r"Variable & Abbreviation")
179 table.add(name=format.string(v), abbr=format.string(variable_abbreviations[v]))
182 o += b2latex.SubSection(
"Importance")
183 graphics = b2latex.Graphics()
185 p.add({identifier_abbreviations[i.identifier]: np.array([i.importances.get(v, 0.0)
for v
in variables])
for i
in methods},
186 identifier_abbreviations.values(), variable_abbreviations.values())
188 p.save(
'importance.pdf')
189 graphics.add(
'importance.pdf', width=1.0)
190 o += graphics.finish()
192 o += b2latex.SubSection(
"Correlation")
193 first_identifier_abbr = list(identifier_abbreviations.values())[0]
194 graphics = b2latex.Graphics()
196 p.add(variables_data, variable_abbreviations.values(),
197 test_target[first_identifier_abbr] == 1,
198 test_target[first_identifier_abbr] == 0)
200 p.save(
'correlation_plot.pdf')
201 graphics.add(
'correlation_plot.pdf', width=1.0)
202 o += graphics.finish()
205 variable_abbr = variable_abbreviations[v]
206 o += b2latex.SubSection(format.string(v))
207 graphics = b2latex.Graphics()
209 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 1, label=
"Signal")
210 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 0, label=
"Background")
212 p.save(
'variable_{}.pdf'.format(hash(v)))
213 graphics.add(
'variable_{}.pdf'.format(hash(v)), width=1.0)
214 o += graphics.finish()
216 o += b2latex.Section(
"Classifier Plot")
217 o += b2latex.String(
"This section contains the receiver operating characteristics (ROC), purity projection, ..."
218 "of the classifiers on training and independent data."
219 "The legend of each plot contains the shortened identifier and the area under the ROC curve"
222 o += b2latex.Section(
"ROC Plot")
223 graphics = b2latex.Graphics()
225 for identifier
in identifier_abbreviations.values():
226 p.add(test_probability, identifier, test_target[identifier] == 1, test_target[identifier] == 0)
228 p.axis.set_title(
"ROC Rejection Plot on independent data")
229 p.save(
'roc_plot_test.pdf')
230 graphics.add(
'roc_plot_test.pdf', width=1.0)
231 o += graphics.finish()
233 if train_probability:
234 for i, identifier
in enumerate(identifiers):
235 graphics = b2latex.Graphics()
237 identifier_abbr = identifier_abbreviations[identifier]
238 p.add(train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
239 train_target[identifier_abbr] == 0, label=
'Train')
240 p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
241 test_target[identifier_abbr] == 0, label=
'Test')
243 p.axis.set_title(identifier)
244 p.save(
'roc_test_{}.pdf'.format(hash(identifier)))
245 graphics.add(
'roc_test_{}.pdf'.format(hash(identifier)), width=1.0)
246 o += graphics.finish()
248 o += b2latex.Section(
"Classification Results")
250 for identifier
in identifiers:
251 identifier_abbr = identifier_abbreviations[identifier]
252 o += b2latex.SubSection(format.string(identifier_abbr))
253 graphics = b2latex.Graphics()
255 p.add(0, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
256 test_target[identifier_abbr] == 0, normed=
True)
257 p.sub_plots[0].axis.set_title(
"Classification result in test data for {identifier}".format(identifier=identifier))
259 p.add(1, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
260 test_target[identifier_abbr] == 0, normed=
False)
261 p.sub_plots[1].axis.set_title(
"Classification result in test data for {identifier}".format(identifier=identifier))
264 p.save(
'classification_result_{identifier}.pdf'.format(identifier=hash(identifier)))
265 graphics.add(
'classification_result_{identifier}.pdf'.format(identifier=hash(identifier)), width=1)
266 o += graphics.finish()
268 o += b2latex.Section(
"Diagonal Plot")
269 graphics = b2latex.Graphics()
for identifier in identifiers:
    # Resolve the abbreviation of the current identifier BEFORE it is used;
    # otherwise the sub-section title shows the value left over from the
    # previous iteration (or from the preceding loop on the first pass).
    identifier_abbr = identifier_abbreviations[identifier]
    o += b2latex.SubSection(format.string(identifier_abbr))
    # Add this classifier's output, split by signal (target == 1) and background (target == 0).
    p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1, test_target[identifier_abbr] == 0)
276 p.axis.set_title(
"Diagonal plot on independent data")
277 p.save(
'diagonal_plot_test.pdf')
278 graphics.add(
'diagonal_plot_test.pdf', width=1.0)
279 o += graphics.finish()
281 if train_probability:
282 o += b2latex.SubSection(
"Overtraining Plot")
283 for identifier
in identifiers:
284 identifier_abbr = identifier_abbreviations[identifier]
285 probability = {identifier_abbr: np.r_[train_probability[identifier_abbr], test_probability[identifier_abbr]]}
286 target = np.r_[train_target[identifier_abbr], test_target[identifier_abbr]]
287 train_mask = np.r_[np.ones(len(train_target[identifier_abbr])), np.zeros(len(test_target[identifier_abbr]))]
288 graphics = b2latex.Graphics()
290 p.add(probability, identifier_abbr,
291 train_mask == 1, train_mask == 0,
292 target == 1, target == 0, )
294 p.axis.set_title(
"Overtraining check for {}".format(identifier))
295 p.save(
'overtraining_plot_{}.pdf'.format(hash(identifier)))
296 graphics.add(
'overtraining_plot_{}.pdf'.format(hash(identifier)), width=1.0)
297 o += graphics.finish()
299 o += b2latex.Section(
"Spectators")
# The two literals are joined without a separator, so the space has to be part
# of the text; otherwise the report reads "on theclassifier outputs".
o += b2latex.String("This section contains the distribution and dependence on the "
                    "classifier outputs of all spectator variables.")
303 table = b2latex.LongTable(
r"ll",
"Abbreviations of spectators",
"{name} & {abbr}",
r"Spectator & Abbreviation")
305 table.add(name=format.string(s), abbr=format.string(spectator_abbreviations[s]))
308 for spectator
in spectators:
309 spectator_abbr = spectator_abbreviations[spectator]
310 o += b2latex.SubSection(format.string(spectator))
311 graphics = b2latex.Graphics()
313 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 1, label=
"Signal")
314 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 0, label=
"Background")
316 p.save(
'spectator_{}.pdf'.format(hash(spectator)))
317 graphics.add(
'spectator_{}.pdf'.format(hash(spectator)), width=1.0)
318 o += graphics.finish()
320 for identifier
in identifiers:
321 o += b2latex.SubSubSection(format.string(spectator) +
" with classifier " + format.string(identifier))
322 identifier_abbr = identifier_abbreviations[identifier]
323 data = {identifier_abbr: test_probability[identifier_abbr], spectator_abbr: spectators_data[spectator_abbr]}
324 graphics = b2latex.Graphics()
326 p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
327 test_target[identifier_abbr] == 1,
328 test_target[identifier_abbr] == 0)
330 p.save(
'correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)))
331 graphics.add(
'correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)), width=1.0)
332 o += graphics.finish()
335 B2INFO(f
"Creating a PDF file at {args.outputfile}. Please remove the '-c' switch if this fails.")
336 o.save(
'latex.tex', compile=
True)
338 B2INFO(f
"Creating a .zip archive containing plots and a TeX file at {args.outputfile}."
339 f
"Please unpack the archive and compile the latex.tex file with pdflatex.")
340 o.save(
'latex.tex', compile=
False)
343 if args.working_directory ==
'':
344 working_directory = tempdir
346 working_directory = args.working_directory
349 shutil.copy(os.path.join(working_directory,
'latex.pdf'), args.outputfile)
351 base_name = os.path.join(old_cwd, args.outputfile.rsplit(
'.', 1)[0])
352 shutil.make_archive(base_name,
'zip', working_directory)
def tree2dict(tree, tree_columns, dict_columns=None)