Belle II Software development
validationscript.py
1#!/usr/bin/env python3
2
3
10
11import re
12import os
13from typing import Optional, List
14import logging
15from pathlib import Path
16
17# Import XML Parser. Use C-Version, if available
18try:
19 import xml.etree.cElementTree as XMLTree
20except ImportError:
21 import xml.etree.ElementTree as XMLTree
22
23import json_objects
24
25
# todo [code quality, low prio, easy]: This should be an enum
class ScriptStatus:
    """!
    Enumeration of the states a script can be during its execution
    cycle.

    Each state is represented by an empty marker class; code compares
    a script's status against these attributes (e.g.
    ``script.status == ScriptStatus.waiting``).
    """

    # script is waiting for execution
    class waiting:
        pass

    # script is running atm
    class running:
        pass

    # script execution has been successfully finished
    class finished:
        pass

    # script execution has failed
    class failed:
        pass

    # script has been marked to be skipped
    class skipped:
        pass

    # script output is already cached, do not execute script
    class cached:
        pass
57
58
class Script:
    """!
    The object representation of a steering file.

    @var path: The path to the steering file
    @var name: The name of the file, but without special chars or spaces
    @var package: The package to which the steering file belongs to
    @var _header: The contents of the XML file header
    @var dependencies: On which scripts does the steering file depend
    @var status: The current status, e.g. 'running' or 'finished'
    @var control: Execute locally or on the cluster?
    @var returncode: The returncode of the steering file
    @type returncode: Optional[int]
    @var _object: Pointer to the object itself. Is this even necessary?
    """

    def __init__(
        self, path: str, package: str, log: Optional[logging.Logger] = None
    ):
        """!
        The default constructor.

        @param path: Path of the steering file
        @param package: Name of the package the steering file belongs to
        @param log: Logger used for all messages of this object. If None,
            the logger named "script" is used.
        """

        # Pointer to the script object itself
        # Is this necessary?
        self._object = self

        # stores the reference to the logging object used in this validation
        # run. Loggers must be obtained through logging.getLogger so that
        # they take part in the logging hierarchy (handlers/levels configured
        # on the root logger apply); they should never be instantiated
        # directly.
        if log is None:
            log = logging.getLogger("script")
        self.log: logging.Logger = log

        # The (absolute) path of the steering file
        self.path = path

        # The runtime of the script
        self.runtime: Optional[int] = None
        self.start_time: Optional[int] = None

        # The base name of the steering file
        # (useful when displaying the filename to the user)
        self.name_not_sanitized = str(os.path.basename(self.path))

        # The name of the steering file. Basically the file name of the
        # steering file, but every run of characters that are not letters
        # or digits is replaced by a single underscore. Useful e.g. for
        # cluster controls.
        self.name = self.sanitize_file_name(self.name_not_sanitized)

        # The package to which the steering file belongs
        self.package = package

        # Parsed content of the XML header; filled by load_header
        self._header = dict()

        # Set to True by load_header if no header was found or the header
        # contained invalid XML
        self.header_parsing_errors = False

        # Set to True as soon as load_header has been called, so that the
        # file is read and parsed at most once
        self._header_parsing_attempted = False

        # A list of script objects, on which this script depends
        self.dependencies = []

        # Current status of the script.
        # Possible values: waiting, running, finished, failed, skipped,
        # cached
        self.status = ScriptStatus.waiting

        # Which control is used for executing the script, i.e. cluster or
        # local. Useful when using different script level, e.g. data creation
        # scripts are being run on the cluster, but plotting scripts are
        # executed locally
        self.control = None

        # The returncode of the script. Should be 0 if all went well.
        self.returncode = None

        # Id of the job; not assigned anywhere in this class
        # (presumably set by the control that submits the script - confirm)
        self.job_id: Optional[str] = None

    @staticmethod
    def sanitize_file_name(file_name):
        """!
        Replaces every run of characters that are not letters or digits
        (including underscores and the '.' before the extension) with a
        single underscore.

        @param file_name: The file name to sanitize
        @return: The sanitized name
        """
        return re.sub(r"[\W_]+", "_", file_name)

    def to_json(self, current_tag):
        """!
        Creates the json_objects.Script representation of this script.

        @param current_tag: Not used by this method
        @return: json_objects.Script holding name, path, status string,
            log url, return code and the names of those input/output files
            whose header entry contains '../'
        """

        # Translate the status marker into its name; an unknown status
        # yields an empty string.
        status_names = (
            "failed",
            "finished",
            "running",
            "skipped",
            "waiting",
            "cached",
        )
        string_status = next(
            (
                status_name
                for status_name in status_names
                if self.status == getattr(ScriptStatus, status_name)
            ),
            "",
        )

        # filter for simulated files
        input_file_names = [
            ip.split('/')[-1] for ip in self.input_files if '../' in ip
        ]
        output_file_names = [
            op.split('/')[-1] for op in self.output_files if '../' in op
        ]

        return json_objects.Script(
            self.name_not_sanitized,
            self.path,
            string_status,
            log_url=os.path.join(self.package, self.name_not_sanitized)
            + ".log",
            return_code=self.returncode,
            input=input_file_names,
            output=output_file_names,
        )

    def get_recursive_dependencies(self, scripts, level=0):
        """!
        Loops over all dependencies of this script and recursively retrieves
        their sub-dependencies

        @param scripts: List of all registered script objects, used to look
            up the dependencies by name
        @param level: Current recursion depth (0 for the initial call)
        @return: Set with the names of all direct and indirect dependencies
        """

        if level > 50:
            self.log.error(
                f"Recursive dependency lookup reached level {level} and will "
                f"quit now. Possibly circular dependencies in the validation "
                f"scripts ? "
            )
            # Actually stop here: without this return, circular dependencies
            # would recurse until the interpreter raises a RecursionError.
            return set()

        all_deps = set()
        for dep in self.dependencies:
            all_deps.add(dep.name)

            # find script object
            dep_script = [x for x in scripts if x.name == dep.name]
            if len(dep_script) == 1:
                # a set takes care of not adding names twice
                all_deps.update(
                    dep_script[0].get_recursive_dependencies(
                        scripts, level + 1
                    )
                )
            else:
                self.log.error(
                    f"Depending script with the name {dep.name} could not be "
                    f"found in the list of registered scripts. "
                )

        return all_deps

    def unique_name(self):
        """
        Generates a unique name from the package and name of the script
        which only occurs once in the whole validation suite
        """
        return f"script_unique_name_{self.package}_{self.name}"

    def compute_dependencies(self, scripts):
        """!
        Loops over the input files given in the header and tries to find the
        corresponding Script objects, which will then be stored in the
        script.dependencies-list. If an input file has no creator, the
        status of this script is set to skipped.

        @param scripts: List of all script objects/candidates
        @return: None
        """
        # Loop over all the dependencies given in the header information
        for root_file in self.input_files:

            # Find the script which is responsible for the creation of
            # the input file (in the same package or in validation folder)
            creator = find_creator(root_file, self.package, scripts, self.log)

            # If no creator could be found, raise an error!
            if creator is None:
                self.log.error(
                    f"Unmatched dependency for {self.path}: {root_file} "
                    f"has no creator! This means that we will have to skip "
                    f"this script."
                )
                self.status = ScriptStatus.skipped

            # If creator(s) could be found, add those scripts to the
            # list of scripts on which self depends
            else:
                self.dependencies += creator

        # remove double entries; dict.fromkeys keeps the order of first
        # occurrence, so the result is deterministic
        self.dependencies = list(dict.fromkeys(self.dependencies))

    def load_header(self):
        """!
        This method opens the file given in self.path, tries to extract the
        XML-header of it and then parse it.
        It then fills the self._header variable with a dict containing the
        values that were read from the XML header.
        The file is only read on the first call; later calls return
        immediately. If no header is found or the XML is invalid, an error
        is logged and self.header_parsing_errors is set.
        @return: None
        """
        if self._header_parsing_attempted:
            return

        self._header_parsing_attempted = True

        # Read the file as a whole
        # We specify encoding and errors here to avoid exceptions for people
        # with strange preferred encoding settings in their OS
        with open(self.path, encoding="utf-8", errors="replace") as data:
            steering_file_content = data.read()

        # Filter out the first <header>...</header> part of the steering
        # file.
        header_match = re.search(
            r"(<header>.*?</header>)",
            steering_file_content,
            re.DOTALL | re.M,
        )
        if header_match is None:
            self.log.error(f"No file header found: {self.path}")
            self.header_parsing_errors = True
            return
        xml = header_match.group(1).strip()

        # Create an XML tree from the plain XML code.
        try:
            xml_tree = XMLTree.fromstring(xml)
        except XMLTree.ParseError:
            self.log.error(f"Invalid XML in header: {self.path}")
            self.header_parsing_errors = True
            return

        # we have a header
        self._header = {}

        # The keywords that should be parsed into a list
        list_tags = ("input", "output", "contact")

        # Loop over that tree
        for branch in xml_tree:
            tag = branch.tag.strip()

            # If the tag is empty branch.text is None. Replacing None with an
            # empty string in this case.
            branch_text = branch.text or ""

            # Format the values of each branch
            if tag in list_tags:
                # comma separated values; an empty tag gives an empty list
                branch_value = [item.strip() for item in branch_text.split(",")]
                if branch_value == [""]:
                    branch_value = []
            else:
                # drop newlines, then collapse runs of spaces
                branch_value = re.sub(" +", " ", branch_text.replace("\n", ""))
                branch_value = branch_value.strip()

            # Append the branch and its values to the header-dict. This
            # implementation technically allows multiple occurrences of the
            # same <tag></tag>-pair, which will be bundled to the same key in
            # the returned dictionary
            if tag in self._header:
                self._header[tag] += branch_value
            else:
                self._header[tag] = branch_value

    # Below are all of the getter methods for accessing data from the header
    # If the header isn't loaded at the time they are called, we do that.

    @property
    def input_files(self):
        """
        return a list of input files which this script will read.
        The header is loaded on first access; an empty list is returned if
        the header has no input tag.
        """
        self.load_header()
        return self._header.get("input", [])

    @property
    def output_files(self):
        """
        return a list of output files this script will create.
        The header is loaded on first access; an empty list is returned if
        the header has no output tag.
        """
        self.load_header()
        return self._header.get("output", [])

    @property
    def is_cacheable(self):
        """
        Returns true, if the script must not be executed if its output
        files are already present, i.e. if the header contains a
        cacheable tag.
        The header is loaded on first access.
        """
        self.load_header()
        return "cacheable" in self._header

    @property
    def noexecute(self) -> bool:
        """ A flag set in the header that tells us to simply ignore this
        script for the purpose of running the validation.
        """
        self.load_header()
        return "noexecute" in self._header

    @property
    def description(self) -> str:
        """ Description of script as set in header """
        self.load_header()
        return self._header.get("description", "")

    @property
    def contact(self) -> str:
        """ Contact of script as set in header.

        NOTE: the contact tag is parsed as a comma separated list, so a
        list of strings is returned if the tag is present; the empty string
        is only returned if the header has no contact tag.
        """
        self.load_header()
        return self._header.get("contact", "")

    @property
    def interval(self) -> str:
        """ Interval of script execution as set in header
        ("nightly" if the header does not specify one) """
        self.load_header()
        return self._header.get("interval", "nightly")

    def remove_output_files(self) -> None:
        """Remove all output files. This is used to clean up files after a
        script is marked as failed. Leaving the output files in a possible
        corrupted state and risk having them found by the validation framework
        later for crashes isn't sensible. """
        for f in map(Path, self.output_files):
            self.log.warning(
                f"Removing output file {f} (if exists) because script failed"
            )
            f.unlink(missing_ok=True)
391
392
def find_creator(
    outputfile: str,
    package: str,
    scripts: "List[Script]",
    log: logging.Logger,
) -> "Optional[List[Script]]":
    """!
    This function receives the name of a file and tries to find the file
    in the given package which produces this file, i.e. find the file in
    whose header 'outputfile' is listed under <output></output>.
    It then returns a list of all Scripts who claim to be creating 'outputfile'

    @param outputfile: The file of which we want to know by which script is
        created
    @param package: The package in which we want to search for the creator
    @param scripts: List of all script objects/candidates
    @param log: Logger
    @return: List of all scripts from the given package or from the
        "validation" package that list 'outputfile' among their output
        files, or None if there is no such script
    """

    # Only scripts of the given package and of the validation-folder are
    # candidates; of those, keep the ones that have 'outputfile' listed
    # under their outputs. The package test comes first so that the headers
    # of unrelated scripts are not touched.
    results = [
        script
        for script in scripts
        if script.package in (package, "validation")
        and outputfile in script.output_files
    ]

    # Return our results and warn if there is more than one creator
    if not results:
        return None
    if len(results) > 1:
        log.warning(f"Found multiple creators for {outputfile}")
    return results
432
Enumeration of the states a script can be during its execution cycle.