6 from basf2_mva_evaluation
import plotting
11 from B2Tools
import b2latex, format
12 from basf2
import B2INFO
def getCommandLineOptions(argv=None):
    """
    Parse the command line options of this script and return the parsed arguments.

    @param argv optional list of argument strings; None (the default) lets argparse read sys.argv[1:]
    @return argparse.Namespace with the attributes identifiers, train_datafiles, datafiles, treename,
            outputfile, working_directory, fillnan and compile
    """
    parser = argparse.ArgumentParser()
    # The identifier/file options combine action='append' with nargs='+': every occurrence of the
    # option contributes one list, so the parsed value is a list of lists (or None if an optional
    # option was never given).
    parser.add_argument('-id', '--identifiers', dest='identifiers', type=str, required=True, action='append', nargs='+',
                        help='DB Identifier or weightfile')
    parser.add_argument('-train', '--train_datafiles', dest='train_datafiles', type=str, required=False, action='append', nargs='+',
                        help='Data file containing ROOT TTree used during training')
    parser.add_argument('-data', '--datafiles', dest='datafiles', type=str, required=True, action='append', nargs='+',
                        help='Data file containing ROOT TTree with independent test data')
    parser.add_argument('-tree', '--treename', dest='treename', type=str, default='tree', help='Treename in data file')
    parser.add_argument('-out', '--outputfile', dest='outputfile', type=str, default='output.zip',
                        help='Name of the created .zip archive file if not compiling or a pdf file if compilation is successful.')
    parser.add_argument('-w', '--working_directory', dest='working_directory', type=str, default='',
                        help="""Working directory where the created images and root files are stored,
                        default is to create a temporary directory.""")
    parser.add_argument('-n', '--fillnan', dest='fillnan', action='store_true',
                        help='Fill nan and inf values with actual numbers')
    parser.add_argument('-c', '--compile', dest='compile', action='store_true',
                        help='Compile latex to pdf')
    # Hand the namespace back to the caller; the script body reads its attributes.
    return parser.parse_args(argv)
46 Returns a list containing only unique elements, keeps the original order of the list
47 @param input list containing the elements
def create_abbreviations(names, length=5):
    """
    Build a short abbreviation for every name.

    The abbreviation is the first `length` characters of the name. If several names share
    the same prefix, a running number (1, 2, ...) is appended in order of appearance so that
    they stay distinguishable.

    @param names iterable of strings to abbreviate
    @param length number of leading characters kept from each name
    @return collections.OrderedDict mapping each name to its abbreviation, in input order
    """
    # The names are walked twice (count first, then assign), so materialise them once;
    # this also lets a generator be passed in.
    names = list(names)

    # First pass: how many names share each prefix.
    prefix_count = collections.Counter(name[:length] for name in names)

    # Second pass: assign abbreviations, numbering only the prefixes that are ambiguous.
    abbreviations = collections.OrderedDict()
    running_number = collections.Counter()
    for name in names:
        abbreviation = name[:length]
        if prefix_count[abbreviation] > 1:
            running_number[abbreviation] += 1
            abbreviation += str(running_number[abbreviation])
        abbreviations[name] = abbreviation
    return abbreviations
# Script entry point: applies the given classifiers to data files and assembles a LaTeX
# report (b2latex) with plots about them.
# NOTE(review): every statement below still carries the line number of the listing it was
# taken from, and that numbering has gaps. Names such as `methods`, `p`, `old_cwd`,
# `test_probability`, `test_target`, `train_target`, `variables_data` and `spectators_data`
# are used here but assigned on lines that are not part of this view - confirm against the
# complete file before changing any logic.
77 if __name__ ==
'__main__':
# Put ROOT into batch mode.
79 ROOT.gROOT.SetBatch(
True)
82 args = getCommandLineOptions()
# The options are declared with action='append' and nargs='+', so each one is a list of
# lists; sum(..., []) concatenates them into one flat list.
84 identifiers = sum(args.identifiers, [])
85 identifier_abbreviations = create_abbreviations(identifiers)
87 datafiles = sum(args.datafiles, [])
# Apply every method to the test data files; probability and target are stored under the
# abbreviated identifier of the method.
92 print(
"Apply experts on independent data")
95 for method
in methods:
96 p, t = method.apply_expert(datafiles, args.treename)
97 test_probability[identifier_abbreviations[method.identifier]] = p
98 test_target[identifier_abbreviations[method.identifier]] = t
# Training files are optional (required=False): train_probability stays an empty dict when
# none were given, which later disables the `if train_probability:` sections.
100 print(
"Apply experts on training data")
101 train_probability = {}
103 if args.train_datafiles
is not None:
104 train_datafiles = sum(args.train_datafiles, [])
105 for method
in methods:
106 p, t = method.apply_expert(train_datafiles, args.treename)
107 train_probability[identifier_abbreviations[method.identifier]] = p
108 train_target[identifier_abbreviations[method.identifier]] = t
# Gather the variables and spectators of all methods through unique() and abbreviate them.
110 variables = unique(v
for method
in methods
for v
in method.variables)
111 variable_abbreviations = create_abbreviations(variables)
112 root_variables = unique(v
for method
in methods
for v
in method.root_variables)
114 spectators = unique(v
for method
in methods
for v
in method.spectators)
115 spectator_abbreviations = create_abbreviations(spectators)
116 root_spectators = unique(v
for method
in methods
for v
in method.root_spectators)
# Add all test data files to one TChain named after the tree.
118 print(
"Load variables array")
119 rootchain = ROOT.TChain(args.treename)
120 for datafile
in datafiles:
121 rootchain.Add(datafile)
# np.nan_to_num with copy=False modifies each variable column in place.
# NOTE(review): presumably guarded by args.fillnan - the condition is not visible here.
126 for column
in variable_abbreviations.values():
127 np.nan_to_num(variables_data[column], copy=
False)
# The report is produced with a TemporaryDirectory open as `tempdir`; an empty
# working_directory (its default) is the "no directory given" case.
131 print(
"Create latex file")
134 with tempfile.TemporaryDirectory()
as tempdir:
135 if args.working_directory ==
'':
138 os.chdir(args.working_directory)
# `o` is the LaTeX document; all sections below are appended to it with `+=`.
140 o = b2latex.LatexFile()
141 o += b2latex.TitlePage(title=
'Automatic MVA Evaluation',
142 authors=[
r'Thomas Keck\\ Moritz Gelb\\ Nils Braun'],
143 abstract=
'Evaluation plots',
144 add_table_of_contents=
True).finish()
# Section "Classifiers": abbreviation table plus one XML listing per method.
146 o += b2latex.Section(
"Classifiers")
147 o += b2latex.String(
r"""
148 This section contains the GeneralOptions and SpecificOptions of all classifiers represented by an XML tree.
149 The same information can be retreived using the basf2\_mva\_info tool.
152 table = b2latex.LongTable(
r"ll",
"Abbreviations of identifiers",
"{name} & {abbr}",
r"Identifier & Abbreviation")
153 for identifier
in identifiers:
154 table.add(name=format.string(identifier), abbr=format.string(identifier_abbreviations[identifier]))
157 for method
in methods:
158 o += b2latex.SubSection(format.string(method.identifier))
159 o += b2latex.Listing(language=
'XML').add(method.description).finish()
161 o += b2latex.Section(
"Variables")
162 o += b2latex.String(
"""
163 This section contains an overview of the importance and correlation of the variables used by the classifiers.
164 And distribution plots of the variables on the independent dataset. The distributions are normed for signal and
165 background separately, and only the region +- 3 sigma around the mean is shown.
# Table of variable names and their abbreviations.
168 table = b2latex.LongTable(
r"ll",
"Abbreviations of variables",
"{name} & {abbr}",
r"Variable & Abbreviation")
170 table.add(name=format.string(v), abbr=format.string(variable_abbreviations[v]))
# Importance: one array per method, using 0.0 for variables missing from its importances.
173 o += b2latex.SubSection(
"Importance")
174 graphics = b2latex.Graphics()
176 p.add({identifier_abbreviations[i.identifier]: np.array([i.importances.get(v, 0.0)
for v
in variables])
for i
in methods},
177 identifier_abbreviations.values(), variable_abbreviations.values())
179 p.save(
'importance.pdf')
180 graphics.add(
'importance.pdf', width=1.0)
181 o += graphics.finish()
# Correlation: the signal (== 1) and background (== 0) masks come from the target of the
# first identifier only.
183 o += b2latex.SubSection(
"Correlation")
184 first_identifier_abbr = list(identifier_abbreviations.values())[0]
185 graphics = b2latex.Graphics()
187 p.add(variables_data, variable_abbreviations.values(),
188 test_target[first_identifier_abbr] == 1,
189 test_target[first_identifier_abbr] == 0)
191 p.save(
'correlation_plot.pdf')
192 graphics.add(
'correlation_plot.pdf', width=1.0)
193 o += graphics.finish()
# Per-variable distribution plots for signal and background.
# NOTE(review): file names are built from hash() of the name; Python randomises str hashes
# per process unless PYTHONHASHSEED is set, so the names differ between runs.
196 variable_abbr = variable_abbreviations[v]
197 o += b2latex.SubSection(format.string(v))
198 graphics = b2latex.Graphics()
200 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 1, label=
"Signal")
201 p.add(variables_data, variable_abbr, test_target[first_identifier_abbr] == 0, label=
"Background")
203 p.save(
'variable_{}.pdf'.format(hash(v)))
204 graphics.add(
'variable_{}.pdf'.format(hash(v)), width=1.0)
205 o += graphics.finish()
207 o += b2latex.Section(
"Classifier Plot")
208 o += b2latex.String(
"This section contains the receiver operating characteristics (ROC), purity projection, ..."
209 "of the classifiers on training and independent data."
210 "The legend of each plot contains the shortened identifier and the area under the ROC curve"
# ROC plot with all classifiers on the test data.
213 o += b2latex.Section(
"ROC Plot")
214 graphics = b2latex.Graphics()
216 for identifier
in identifier_abbreviations.values():
217 p.add(test_probability, identifier, test_target[identifier] == 1, test_target[identifier] == 0)
219 p.axis.set_title(
"ROC Rejection Plot on independent data")
220 p.save(
'roc_plot_test.pdf')
221 graphics.add(
'roc_plot_test.pdf', width=1.0)
222 o += graphics.finish()
# Only when training data was applied: one train-vs-test ROC plot per identifier.
224 if train_probability:
225 for i, identifier
in enumerate(identifiers):
226 graphics = b2latex.Graphics()
228 identifier_abbr = identifier_abbreviations[identifier]
229 p.add(train_probability, identifier_abbr, train_target[identifier_abbr] == 1,
230 train_target[identifier_abbr] == 0, label=
'Train')
231 p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
232 test_target[identifier_abbr] == 0, label=
'Test')
234 p.axis.set_title(identifier)
235 p.save(
'roc_test_{}.pdf'.format(hash(identifier)))
236 graphics.add(
'roc_test_{}.pdf'.format(hash(identifier)), width=1.0)
237 o += graphics.finish()
# Classification results: two sub plots per identifier, added with normed=True and
# normed=False.
239 o += b2latex.Section(
"Classification Results")
241 for identifier
in identifiers:
242 identifier_abbr = identifier_abbreviations[identifier]
243 o += b2latex.SubSection(format.string(identifier_abbr))
244 graphics = b2latex.Graphics()
246 p.add(0, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
247 test_target[identifier_abbr] == 0, normed=
True)
248 p.sub_plots[0].axis.set_title(
"Classification result in test data for {identifier}".format(identifier=identifier))
250 p.add(1, test_probability, identifier_abbr, test_target[identifier_abbr] == 1,
251 test_target[identifier_abbr] == 0, normed=
False)
252 p.sub_plots[1].axis.set_title(
"Classification result in test data for {identifier}".format(identifier=identifier))
255 p.save(
'classification_result_{identifier}.pdf'.format(identifier=hash(identifier)))
256 graphics.add(
'classification_result_{identifier}.pdf'.format(identifier=hash(identifier)), width=1)
257 o += graphics.finish()
# Diagonal plot with all classifiers on the test data.
# NOTE(review): in the loop below the SubSection is created from `identifier_abbr` one
# statement BEFORE `identifier_abbr` is reassigned for the current identifier, so each
# heading uses the value left over from the previous iteration (or from the preceding
# section on the first pass). Looks like the two statements are swapped - confirm.
259 o += b2latex.Section(
"Diagonal Plot")
260 graphics = b2latex.Graphics()
262 for identifier
in identifiers:
263 o += b2latex.SubSection(format.string(identifier_abbr))
264 identifier_abbr = identifier_abbreviations[identifier]
265 p.add(test_probability, identifier_abbr, test_target[identifier_abbr] == 1, test_target[identifier_abbr] == 0)
267 p.axis.set_title(
"Diagonal plot on independent data")
268 p.save(
'diagonal_plot_test.pdf')
269 graphics.add(
'diagonal_plot_test.pdf', width=1.0)
270 o += graphics.finish()
# Overtraining check (only with training data): train and test arrays are concatenated with
# np.r_, and train_mask is 1 for the training part and 0 for the test part.
272 if train_probability:
273 o += b2latex.SubSection(
"Overtraining Plot")
274 for identifier
in identifiers:
275 identifier_abbr = identifier_abbreviations[identifier]
276 probability = {identifier_abbr: np.r_[train_probability[identifier_abbr], test_probability[identifier_abbr]]}
277 target = np.r_[train_target[identifier_abbr], test_target[identifier_abbr]]
278 train_mask = np.r_[np.ones(len(train_target[identifier_abbr])), np.zeros(len(test_target[identifier_abbr]))]
279 graphics = b2latex.Graphics()
281 p.add(probability, identifier_abbr,
282 train_mask == 1, train_mask == 0,
283 target == 1, target == 0, )
285 p.axis.set_title(
"Overtraining check for {}".format(identifier))
286 p.save(
'overtraining_plot_{}.pdf'.format(hash(identifier)))
287 graphics.add(
'overtraining_plot_{}.pdf'.format(hash(identifier)), width=1.0)
288 o += graphics.finish()
# Section "Spectators": abbreviation table, a distribution plot per spectator, then one plot
# per (spectator, identifier) pair.
290 o += b2latex.Section(
"Spectators")
291 o += b2latex.String(
"This section contains the distribution and dependence on the"
292 "classifier outputs of all spectator variables.")
294 table = b2latex.LongTable(
r"ll",
"Abbreviations of spectators",
"{name} & {abbr}",
r"Spectator & Abbreviation")
296 table.add(name=format.string(s), abbr=format.string(spectator_abbreviations[s]))
299 for spectator
in spectators:
300 spectator_abbr = spectator_abbreviations[spectator]
301 o += b2latex.SubSection(format.string(spectator))
302 graphics = b2latex.Graphics()
304 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 1, label=
"Signal")
305 p.add(spectators_data, spectator_abbr, test_target[first_identifier_abbr] == 0, label=
"Background")
307 p.save(
'spectator_{}.pdf'.format(hash(spectator)))
308 graphics.add(
'spectator_{}.pdf'.format(hash(spectator)), width=1.0)
309 o += graphics.finish()
311 for identifier
in identifiers:
312 o += b2latex.SubSubSection(format.string(spectator) +
" with classifier " + format.string(identifier))
313 identifier_abbr = identifier_abbreviations[identifier]
314 data = {identifier_abbr: test_probability[identifier_abbr], spectator_abbr: spectators_data[spectator_abbr]}
315 graphics = b2latex.Graphics()
317 p.add(data, spectator_abbr, identifier_abbr, list(range(10, 100, 10)),
318 test_target[identifier_abbr] == 1,
319 test_target[identifier_abbr] == 0)
321 p.save(
'correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)))
322 graphics.add(
'correlation_plot_{}_{}.pdf'.format(hash(spectator), hash(identifier)), width=1.0)
323 o += graphics.finish()
# Write latex.tex, either compiled (compile=True) or not (compile=False).
# NOTE(review): the condition choosing between the two save calls is not visible here; the
# log messages suggest it is the -c/--compile switch - confirm.
326 B2INFO(f
"Creating a PDF file at {args.outputfile}. Please remove the '-c' switch if this fails.")
327 o.save(f
'latex.tex', compile=
True)
329 B2INFO(f
"Creating a .zip archive containing plots and a TeX file at {args.outputfile}."
330 f
"Please unpack the archive and compile the latex.tex file with pdflatex.")
331 o.save(f
'latex.tex', compile=
False)
# The results live in tempdir when no working directory was given, otherwise in that
# working directory.
334 if args.working_directory ==
'':
335 working_directory = tempdir
337 working_directory = args.working_directory
# Deliver the result: copy latex.pdf to the output file, or zip the working directory.
# rsplit('.', 1)[0] removes the extension of the output file name because
# shutil.make_archive appends the archive format's extension itself.
340 shutil.copy(os.path.join(working_directory, f
'latex.pdf'), args.outputfile)
342 base_name = os.path.join(old_cwd, args.outputfile.rsplit(
'.', 1)[0])
343 shutil.make_archive(base_name,
'zip', working_directory)