Belle II Software  release-06-00-14
validationscript.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import re
12 import os
13 from typing import Optional, List
14 import logging
15 from pathlib import Path
16 
17 # Import XML Parser. Use C-Version, if available
18 try:
19  import xml.etree.cElementTree as XMLTree
20 except ImportError:
21  import xml.etree.ElementTree as XMLTree
22 
23 import json_objects
24 
25 
# todo [code quality, low prio, easy]: This should be an enum
class ScriptStatus:

    """!
    Enumeration of the states a script can be during its execution
    cycle.

    Every state is an empty marker class nested in this namespace. The
    states are never instantiated; code compares the class objects
    themselves, e.g. ``script.status == ScriptStatus.failed``.
    """

    # script is waiting for execution
    class waiting:
        pass

    # script is running atm
    class running:
        pass

    # script execution has been successfully finished
    class finished:
        pass

    # script execution has failed
    class failed:
        pass

    # script has been marked to be skipped
    class skipped:
        pass

    # script output is already cached, do not execute script
    class cached:
        pass
57 
58 
class Script:

    """!
    The object representation of a steering file.

    @var path: The path to the steering file
    @var name: The name of the file, but without special chars or spaces
    @var name_not_sanitized: The plain file name, for display purposes
    @var package: The package to which the steering file belongs to
    @var _header: The contents of the XML file header (filled lazily by
        load_header)
    @var header_parsing_errors: True if the header was missing or invalid
    @var dependencies: On which scripts does the steering file depend
    @var status: The current status, e.g. 'running' or 'finished'
    @var control: Execute locally or on the cluster?
    @var returncode: The returncode of the steering file
    @var job_id: Identifier of the job running this script, if any
    @var _object: Pointer to the object itself. Is this even necessary?
    """

    def __init__(
        self, path: str, package: str, log: Optional[logging.Logger] = None
    ):
        """!
        The default constructor.

        @param path: The (absolute) path of the steering file
        @param package: The package to which the steering file belongs
        @param log: Logger to report problems to. If omitted, the logger
            named "script" is used.
        """

        # Pointer to the script object itself
        # Is this necessary?
        self._object = self

        # stores the reference to the logging object used in this validation
        # run
        if log is None:
            # Loggers must be obtained through getLogger, never instantiated
            # directly: only then are they part of the logger hierarchy and
            # honour the handlers/levels configured by the application.
            log = logging.getLogger("script")
        self.log: logging.Logger = log

        # The (absolute) path of the steering file
        self.path = path

        # The runtime of the script
        self.runtime: Optional[int] = None
        self.start_time: Optional[int] = None

        # useful when displaying the filename to the user
        self.name_not_sanitized = str(os.path.basename(self.path))

        # The name of the steering file. Basically the file name of the
        # steering file, but everything that is not a letter is replaced
        # by an underscore. Useful e.g. for cluster controls.
        self.name = self.sanitize_file_name(self.name_not_sanitized)

        # The package to which the steering file belongs
        self.package = package

        # Parsed content of the XML header; stays empty until load_header
        # has successfully run
        self._header = dict()

        # Set to True by load_header if no (valid) header could be read
        self.header_parsing_errors = False

        # Guards load_header so that the file is read at most once
        self._header_parsing_attempted = False

        # A list of script objects, on which this script depends
        self.dependencies: List["Script"] = []

        # Current status of the script.
        # Possible values: waiting, running, finished, failed, skipped,
        # cached
        self.status = ScriptStatus.waiting

        # Which control is used for executing the script, i.e. cluster or
        # local. Useful when using different script level, e.g. data creation
        # scripts are being run on the cluster, but plotting scripts are
        # executed locally
        self.control = None

        # The returncode of the script. Should be 0 if all went well.
        self.returncode: Optional[int] = None

        # Identifier of the job executing this script; not set by this class
        self.job_id: Optional[str] = None

    @staticmethod
    def sanitize_file_name(file_name):
        """!
        Replaces every run of characters that are not letters or digits
        (including the . before the extension, spaces and underscores) with
        a single underscore _

        @param file_name: The file name to sanitize
        @return: The sanitized name
        """
        return re.sub(r"[\W_]+", "_", file_name)

    def to_json(self, current_tag):
        """!
        Builds the json_objects.Script representation of this script.

        @param current_tag: Not used by this method; kept for interface
            compatibility
        @return: json_objects.Script carrying name, path, status string,
            log url and return code
        """
        # Status marker -> string written to the json object. An unknown
        # status results in an empty string.
        status_names = {
            ScriptStatus.failed: "failed",
            ScriptStatus.finished: "finished",
            ScriptStatus.running: "running",
            ScriptStatus.skipped: "skipped",
            ScriptStatus.waiting: "waiting",
            ScriptStatus.cached: "cached",
        }
        string_status = status_names.get(self.status, "")

        return json_objects.Script(
            self.name_not_sanitized,
            self.path,
            string_status,
            log_url=os.path.join(self.package, self.name_not_sanitized)
            + ".log",
            return_code=self.returncode,
        )

    def get_recursive_dependencies(self, scripts, level=0):
        """!
        Loops over all dependencies of this script and recursively retrieves
        their sub-dependencies

        @param scripts: List of all registered Script objects; dependencies
            are looked up in it by name
        @param level: Current recursion depth (callers should leave this at
            its default)
        @return: Set with the names of all direct and indirect dependencies
        """

        if level > 50:
            self.log.error(
                f"Recurisve dependency lookup reached level {level} and will "
                f"quit now. Possibly circular dependcencies in the validation "
                f"scripts ? "
            )
            # Actually stop here. Without this return a circular dependency
            # keeps recursing until Python raises a RecursionError.
            return set()

        all_deps = set()
        next_level = level + 1

        for dep in self.dependencies:
            all_deps.add(dep.name)

            # find script object
            dep_script = [x for x in scripts if x.name == dep.name]
            if len(dep_script) == 1:
                all_deps.update(
                    dep_script[0].get_recursive_dependencies(
                        scripts, next_level
                    )
                )
            else:
                self.log.error(
                    f"Depending script with the name {dep.name} could not be "
                    f"found in the list of registered scripts. "
                )

        return all_deps

    def unique_name(self):
        """
        Generates a unique name from the package and name of the script
        which only occurs once in the whole validation suite
        """
        return f"script_unique_name_{self.package}_{self.name}"

    def compute_dependencies(self, scripts):
        """!
        Loops over the input files given in the header and tries to find the
        corresponding Script objects, which will then be stored in the
        script.dependencies-list. If an input file has no creator, the
        status of this script is set to skipped.

        @param scripts: List of all Script objects that are candidates for
            creating the input files
        @return: None
        """
        # Loop over all the dependencies given in the header information
        for root_file in self.input_files:

            # Find the script which is responsible for the creation of
            # the input file (in the same package or in validation folder)
            creator = find_creator(root_file, self.package, scripts, self.log)

            # If no creator could be found, raise an error!
            if creator is None:
                self.log.error(
                    f"Unmatched dependency for {self.path}: {root_file} "
                    f"has no creator! This means that we will have to skip "
                    f"this script."
                )
                self.status = ScriptStatus.skipped

            # If creator(s) could be found, add those scripts to the
            # list of scripts on which self depends
            else:
                self.dependencies += creator

        # remove double entries; dict.fromkeys keeps the first-seen order so
        # the result is deterministic (a set would not be)
        self.dependencies = list(dict.fromkeys(self.dependencies))

    def load_header(self):
        """!
        This method opens the file given in self.path, tries to extract the
        XML-header of it and then parse it.
        It then fills the self._header variable with a dict containing the
        values that were read from the XML header.
        The file is read at most once; later calls return immediately. If no
        header is found or it is not valid XML, an error is logged and
        self.header_parsing_errors is set to True.
        @return: None
        """
        if self._header_parsing_attempted:
            return

        self._header_parsing_attempted = True

        # Read the file as a whole
        # We specify encoding and errors here to avoid exceptions for people
        # with strange preferred encoding settings in their OS
        with open(self.path, encoding="utf-8", errors="replace") as data:
            steering_file_content = data.read()

        # Filter out the first <header>...</header> part of the steering
        # file.
        match = re.search(
            "(<header>.*?</header>)", steering_file_content, re.DOTALL | re.M
        )
        if match is None:
            self.log.error("No file header found: " + self.path)
            self.header_parsing_errors = True
            return
        xml = match.group(1).strip()

        # Create an XML tree from the plain XML code.
        try:
            xml_tree = XMLTree.fromstring(xml)
        except XMLTree.ParseError:
            self.log.error("Invalid XML in header: " + self.path)
            self.header_parsing_errors = True
            return

        # we have a header
        self._header = {}

        # The keywords that should be parsed into a list
        list_tags = ("input", "output", "contact")

        # Loop over that tree
        for branch in xml_tree:
            tag = branch.tag.strip()

            # If the tag is empty branch.text is None. Replacing None with an
            # empty string in this case.
            branch_text = branch.text or ""

            # Format the values of each branch
            if tag in list_tags:
                # comma separated list of values
                branch_value = [part.strip() for part in branch_text.split(",")]
                if branch_value == [""]:
                    branch_value = []
            else:
                # free text: drop line breaks and squeeze repeated spaces
                branch_value = re.sub(" +", " ", branch_text.replace("\n", ""))
                branch_value = branch_value.strip()

            # Append the branch and its values to the header-dict. This
            # implementation technically allows multiple occurrences of the
            # same <tag></tag>-pair, which will be bundled to the same key
            # in the returned dictionary
            if tag in self._header:
                self._header[tag] += branch_value
            else:
                self._header[tag] = branch_value

    # Below are all of the getter methods for accessing data from the header
    # If the header isn't loaded at the time they are called, we do that.

    @property
    def input_files(self):
        """
        return a list of input files which this script will read.
        Loads the header first if that has not been attempted yet.
        """
        self.load_header()
        return self._header.get("input", [])

    @property
    def output_files(self):
        """
        return a list of output files this script will create.
        Loads the header first if that has not been attempted yet.
        """
        self.load_header()
        return self._header.get("output", [])

    @property
    def is_cacheable(self):
        """
        Returns true, if the script must not be executed if its output
        files are already present, i.e. if the header contains a
        "cacheable" tag.
        Loads the header first if that has not been attempted yet.
        """
        self.load_header()
        return "cacheable" in self._header

    @property
    def noexecute(self) -> bool:
        """ A flag set in the header that tells us to simply ignore this
        script for the purpose of running the validation.
        """
        self.load_header()
        return "noexecute" in self._header

    @property
    def description(self) -> str:
        """ Description of script as set in header """
        self.load_header()
        return self._header.get("description", "")

    @property
    def contact(self) -> str:
        """ Contact of script as set in header """
        # NOTE(review): "contact" is parsed as a list tag in load_header, so
        # this returns a list when the tag is present and "" otherwise; the
        # str annotation only matches the fallback. Left unchanged because
        # callers may rely on it - confirm before harmonising.
        self.load_header()
        return self._header.get("contact", "")

    @property
    def interval(self) -> str:
        """ Interval of script execution as set in header; "nightly" if the
        header does not specify one
        """
        self.load_header()
        return self._header.get("interval", "nightly")

    def remove_output_files(self) -> None:
        """Remove all output files. This is used to clean up files after a
        script is marked as failed. Leaving the output files in a possible
        corrupted state and risk having them found by the validation framework
        later for crashes isn't sensible.
        """
        for f in map(Path, self.output_files):
            self.log.warning(
                f"Removing output file {f} (if exists) because script failed"
            )
            # missing_ok: a file that was never created is not an error
            f.unlink(missing_ok=True)
385 
386 
def find_creator(
    outputfile: str,
    package: str,
    scripts: "List[Script]",
    log: logging.Logger,
) -> "Optional[List[Script]]":
    """!
    This function receives the name of a file and tries to find the file
    in the given package which produces this file, i.e. find the file in
    whose header 'outputfile' is listed under <output></output>.
    It then returns a list of all Scripts who claim to be creating 'outputfile'

    @param outputfile: The file of which we want to know by which script is
        created
    @param package: The package in which we want to search for the creator
    @param scripts: List of all script objects/candidates
    @param log: Logger
    @return: List of all scripts (from the given package or from the
        "validation" package) listing 'outputfile' among their output
        files, or None if there is no such script
    """

    # Only scripts of the given package and of the validation folder are
    # considered; of those keep the ones listing 'outputfile' as output.
    allowed_packages = (package, "validation")
    results = [
        script
        for script in scripts
        if script.package in allowed_packages
        and outputfile in script.output_files
    ]

    # Return our results and warn if there is more than one creator
    if not results:
        return None
    if len(results) > 1:
        log.warning(f"Found multiple creators for {outputfile}")
    return results
Enumeration of the states a script can be during its execution cycle.