Belle II Software development
validationscript.py
1#!/usr/bin/env python3
2
3
10
11import re
12import os
13from typing import Optional, List
14import logging
15from pathlib import Path
16
17# Import XML Parser. Use C-Version, if available
18try:
19 import xml.etree.cElementTree as XMLTree
20except ImportError:
21 import xml.etree.ElementTree as XMLTree
22
23import json_objects
24
25
# todo [code quality, low prio, easy]: This should be an enum
class ScriptStatus:

    """!
    Enumeration of the states a script can be during its execution
    cycle

    Every state is represented by an empty marker class. States are only
    ever compared with each other (e.g.
    ``script.status == ScriptStatus.failed``); they carry no data.
    """

    # script is waiting for execution
    class waiting:
        pass

    # script is running atm
    class running:
        pass

    # script execution has been successfully finished
    class finished:
        pass

    # script execution has failed
    class failed:
        pass

    # script has been marked to be skipped
    class skipped:
        pass

    # script output is already cached, do not execute script
    class cached:
        pass
58
class Script:

    """!
    The object representation of a steering file.

    @var path: The path to the steering file
    @var name: The name of the file, but without special chars or spaces
    @var package: The package to which the steering file belongs to
    @var _header: The contents of the XML file header
    @var dependencies: On which scripts does the steering file depend
    @var status: The current status, e.g. 'running' or 'finished'
    @var control: Execute locally or on the cluster?
    @var returncode: The returncode of the steering file
    @var _object: Pointer to the object itself. Is this even necessary?
    """

    # Extracts everything between (and including) the <header>-tags.
    # Compiled once for the class instead of once per load_header call.
    _HEADER_PATTERN = re.compile(r"(<header>.*?</header>)", re.DOTALL)

    # The header keywords whose (comma separated) content is parsed into a
    # list; all other keywords are stored as a whitespace-normalised string.
    _LIST_TAGS = ("input", "output", "contact")

    # Recursion depth after which get_recursive_dependencies gives up
    _MAX_DEPENDENCY_DEPTH = 50

    def __init__(
        self, path: str, package: str, log: Optional[logging.Logger] = None
    ):
        """!
        The default constructor.

        @param path: The (absolute) path of the steering file
        @param package: The package to which the steering file belongs
        @param log: Logger used by this object. If None, the logger named
            "script" is used.
        """

        # Pointer to the script object itself
        # Is this necessary?
        self._object = self

        # stores the reference to the logging object used in this validation
        # run
        if log is None:
            # Loggers must be obtained through getLogger, never instantiated
            # directly, otherwise they are detached from the logging
            # configuration of the application.
            log = logging.getLogger("script")
        self.log: logging.Logger = log

        # The (absolute) path of the steering file
        self.path = path

        # The runtime of the script
        self.runtime: Optional[int] = None
        self.start_time: Optional[int] = None

        # useful when displaying the filename to the user
        self.name_not_sanitized = str(os.path.basename(self.path))

        # The name of the steering file. Basically the file name of the
        # steering file, but everything that is not a letter is replaced
        # by an underscore. Useful e.g. for cluster controls.
        self.name = self.sanitize_file_name(self.name_not_sanitized)

        # The package to which the steering file belongs
        self.package = package

        # Information from the file header; filled by load_header
        self._header = {}

        # Set to True by load_header if no header was found or if the
        # header is not valid XML
        self.header_parsing_errors = False

        # Set to True as soon as load_header has run once, so that the
        # file is read at most one time
        self._header_parsing_attempted = False

        # A list of script objects, on which this script depends
        self.dependencies: List["Script"] = []

        # Current status of the script.
        # Possible values: waiting, running, finished, failed, skipped
        self.status = ScriptStatus.waiting

        # Which control is used for executing the script, i.e. cluster or
        # local. Useful when using different script level, e.g. data creation
        # scripts are being run on the cluster, but plotting scripts are
        # executed locally
        self.control = None

        # The returncode of the script. Should be 0 if all went well.
        self.returncode: Optional[int] = None

        # Identifier of the job belonging to this script. It is never set
        # by this class itself (presumably assigned by the control that
        # executes the script -- verify against the callers).
        self.job_id: Optional[str] = None

    @staticmethod
    def sanitize_file_name(file_name):
        """!
        Replaces every run of characters which are not letters or digits
        (including the . between the file name and the extension, and any
        underscores) with one single underscore _

        @param file_name: The file name to sanitize
        @return: The sanitized file name
        """
        return re.sub(r"[\W_]+", "_", file_name)

    def to_json(self, current_tag):
        """!
        Creates the json_objects.Script representation of this script.

        @param current_tag: Not used by this method; only kept so that the
            signature stays compatible with existing callers.
        @return: json_objects.Script object describing this script
        """

        status_names = (
            (ScriptStatus.failed, "failed"),
            (ScriptStatus.finished, "finished"),
            (ScriptStatus.running, "running"),
            (ScriptStatus.skipped, "skipped"),
            (ScriptStatus.waiting, "waiting"),
            (ScriptStatus.cached, "cached"),
        )
        # An unknown status is reported as an empty string
        string_status = next(
            (name for status, name in status_names if self.status == status),
            "",
        )

        # filter for simulated files: only files given with a '../' in
        # their path are reported, reduced to their base name
        input_file_names = [
            ip.split("/")[-1] for ip in self.input_files if "../" in ip
        ]
        output_file_names = [
            op.split("/")[-1] for op in self.output_files if "../" in op
        ]

        return json_objects.Script(
            self.name_not_sanitized,
            self.path,
            string_status,
            log_url=os.path.join(self.package, self.name_not_sanitized)
            + ".log",
            return_code=self.returncode,
            input=input_file_names,
            output=output_file_names,
        )

    def get_recursive_dependencies(self, scripts, level=0):
        """!
        Loops over all dependencies of this script and recursively retrieves
        their sub-dependencies

        @param scripts: List of all registered Script objects, used to look
            up the dependencies by their name
        @param level: Current recursion depth (only set by the recursive
            calls)
        @return: Set with the names of all direct and indirect dependencies.
            The lookup is aborted (an error is logged and the dependencies
            below that point are dropped) once the recursion depth exceeds
            50, which most likely means circular dependencies.
        """

        if level > self._MAX_DEPENDENCY_DEPTH:
            self.log.error(
                f"Recursive dependency lookup reached level {level} and will "
                f"quit now. Possibly circular dependencies in the validation "
                f"scripts ? "
            )
            # Actually quit here: going on would recurse until the
            # interpreter raises a RecursionError.
            return set()

        all_deps = set()
        for dep in self.dependencies:
            all_deps.add(dep.name)

            # find script object
            dep_script = [x for x in scripts if x.name == dep.name]
            if len(dep_script) == 1:
                all_deps.update(
                    dep_script[0].get_recursive_dependencies(
                        scripts, level + 1
                    )
                )
            else:
                self.log.error(
                    f"Depending script with the name {dep.name} could not be "
                    f"found in the list of registered scripts. "
                )

        return all_deps

    def unique_name(self):
        """
        Generates a unique name from the package and name of the script
        which only occurs once in the whole validation suite
        """
        return f"script_unique_name_{self.package}_{self.name}"

    def compute_dependencies(self, scripts):
        """!
        Loops over the input files given in the header and tries to find the
        corresponding Script objects, which will then be stored in the
        script.dependencies-list. If an input file has no creator, the
        status of this script is set to skipped.

        @param scripts: List of all Script objects which are candidates for
            being the creator of an input file
        @return: None
        """
        # Loop over all the dependencies given in the header information
        for root_file in self.input_files:

            # Find the script which is responsible for the creation of
            # the input file (in the same package or in validation folder)
            creator = find_creator(root_file, self.package, scripts, self.log)

            # If no creator could be found, raise an error!
            if creator is None:
                self.log.error(
                    f"Unmatched dependency for {self.path}: {root_file} "
                    f"has no creator! This means that we will have to skip "
                    f"this script."
                )
                self.status = ScriptStatus.skipped

            # If creator(s) could be found, add those scripts to the
            # list of scripts on which self depends
            else:
                self.dependencies += creator

        # remove double entries; dict.fromkeys (unlike set) keeps the order
        # in which the dependencies were found
        self.dependencies = list(dict.fromkeys(self.dependencies))

    def load_header(self):
        """!
        This method opens the file given in self.path, tries to extract the
        XML-header of it and then parse it.
        It then fills the self._header variable with a dict containing the
        values that were read from the XML header.
        The file is only read on the first call, all later calls return
        immediately. If no header is found or if it is not valid XML, an
        error is logged and self.header_parsing_errors is set to True.
        @return: None
        """
        if self._header_parsing_attempted:
            return

        self._header_parsing_attempted = True

        # Read the file as a whole
        # We specify encoding and errors here to avoid exceptions for people
        # with strange preferred encoding settings in their OS
        with open(self.path, encoding="utf-8", errors="replace") as data:
            steering_file_content = data.read()

        # Filter out the (first) <header>...</header> part of the steering
        # file.
        match = self._HEADER_PATTERN.search(steering_file_content)
        if match is None:
            self.log.error("No file header found: " + self.path)
            self.header_parsing_errors = True
            return

        # Create an XML tree from the plain XML code.
        try:
            xml_tree = XMLTree.fromstring(match.group(1).strip())
        except XMLTree.ParseError:
            self.log.error("Invalid XML in header: " + self.path)
            self.header_parsing_errors = True
            return

        # we have a header
        self._header = {}

        # Loop over that tree
        for branch in xml_tree:
            tag = branch.tag.strip()

            # If the tag is empty branch.text is None. Replacing None with an
            # empty string in this case.
            branch_text = branch.text or ""

            # Format the values of each branch
            if tag in self._LIST_TAGS:
                branch_value = [item.strip() for item in branch_text.split(",")]
                if branch_value == [""]:
                    branch_value = []
            else:
                branch_value = re.sub(" +", " ", branch_text.replace("\n", ""))
                branch_value = branch_value.strip()

            # Append the branch and its values to the header-dict. This
            # implementation technically allows multiple occurrences of the
            # same <tag></tag>-pair, which will be bundled to the same key
            # in the header dictionary
            if tag in self._header:
                self._header[tag] += branch_value
            else:
                self._header[tag] = branch_value

    # Below are all of the getter methods for accessing data from the header
    # If the header isn't loaded at the time they are called, we do that.

    @property
    def input_files(self) -> List[str]:
        """
        return a list of input files which this script will read
        (empty, if the header does not list any).
        """
        self.load_header()
        return self._header.get("input", [])

    @property
    def output_files(self) -> List[str]:
        """
        return a list of output files this script will create
        (empty, if the header does not list any).
        """
        self.load_header()
        return self._header.get("output", [])

    @property
    def is_cacheable(self) -> bool:
        """
        Returns true, if the script must not be executed if its output
        files are already present.
        """
        self.load_header()
        return "cacheable" in self._header

    @property
    def noexecute(self) -> bool:
        """ A flag set in the header that tells us to simply ignore this
        script for the purpose of running the validation.
        """
        self.load_header()
        return "noexecute" in self._header

    @property
    def description(self) -> str:
        """ Description of script as set in header """
        self.load_header()
        return self._header.get("description", "")

    @property
    def contact(self):
        """ Contact of script as set in header.

        If the header has a contact tag, this is the list of its comma
        separated entries; without such a tag an empty string is returned.
        """
        # NOTE(review): the fallback is a str while a parsed value is a list.
        # Kept as is, since callers may rely on it.
        self.load_header()
        return self._header.get("contact", "")

    @property
    def interval(self) -> str:
        """ Interval of script execution as set in header
        ("nightly" if the header does not specify one) """
        self.load_header()
        return self._header.get("interval", "nightly")

    def remove_output_files(self) -> None:
        """Remove all output files. This is used to clean up files after a
        script is marked as failed. Leaving the output files in a possible
        corrupted state and risk having them found by the validation framework
        later for crashes isn't sensible.
        """
        for f in map(Path, self.output_files):
            self.log.warning(
                f"Removing output file {f} (if exists) because script failed"
            )
            # missing_ok: the script may have failed before creating the file
            f.unlink(missing_ok=True)
392
def find_creator(
    outputfile: str, package: str, scripts: List["Script"], log: logging.Logger
) -> Optional[List["Script"]]:
    """!
    This function receives the name of a file and tries to find the file
    in the given package which produces this file, i.e. find the file in
    whose header 'outputfile' is listed under ``<output>`` ... ``</output>``.
    It then returns a list of all Scripts who claim to be creating 'outputfile'

    @param outputfile: The file of which we want to know by which script is
        created
    @param package: The package in which we want to search for the creator
    @param scripts: List of all script objects/candidates
    @param log: Logger
    @return: List of all scripts (from the given package or from the
        validation-folder) which list 'outputfile' among their output files,
        or None if there is no such script
    """

    # Only scripts in the given package as well as from the
    # validation-folder are candidates. Keep those which have 'outputfile'
    # listed under their outputs.
    results = [
        script
        for script in scripts
        if script.package in (package, "validation")
        and outputfile in script.output_files
    ]

    # Return our results and warn if there is more than one creator
    if not results:
        return None
    if len(results) > 1:
        log.warning("Found multiple creators for " + outputfile)
    return results
STL class.
Enumeration of the states a script can be during its execution cycle.