Belle II Software  release-08-01-10
validationscript.py
#!/usr/bin/env python3


import re
import os
from typing import Optional, List
import logging
from pathlib import Path

# Import XML Parser. Use C-Version, if available
try:
    import xml.etree.cElementTree as XMLTree
except ImportError:
    import xml.etree.ElementTree as XMLTree

import json_objects


# todo [code quality, low prio, easy]: This should be an enum
class ScriptStatus:

    """!
    Enumeration of the states a script can be in during its execution
    cycle
    """

    # script is waiting for execution
    class waiting:
        pass

    # script is running at the moment
    class running:
        pass

    # script execution has been successfully finished
    class finished:
        pass

    # script execution has failed
    class failed:
        pass

    # script has been marked to be skipped
    class skipped:
        pass

    # script output is already cached, do not execute the script
    class cached:
        pass
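
    # Usage sketch (illustrative only): the nested classes above are used as
    # plain markers that are assigned and compared elsewhere in this file,
    # e.g.
    #
    #   script.status = ScriptStatus.waiting
    #   ...
    #   if script.status == ScriptStatus.failed:
    #       script.remove_output_files()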


class Script:

    """!
    The object representation of a steering file.

    @var path: The path to the steering file
    @var name: The name of the file, but without special chars or spaces
    @var package: The package to which the steering file belongs
    @var header: The contents of the XML file header
    @var dependencies: The scripts on which this steering file depends
    @var status: The current status, e.g. 'running' or 'finished'
    @var control: Execute locally or on the cluster?
    @var returncode: The returncode of the steering file
    @var _object: Pointer to the object itself. Is this even necessary?
    """

    def __init__(
        self, path: str, package: str, log: Optional[logging.Logger] = None
    ):
        """!
        The default constructor.
        """

        # Pointer to the script object itself
        # Is this necessary?
        self._object = self

        # stores the reference to the logging object used in this validation
        # run
        if log is None:
            log = logging.Logger("script")
        self.log: logging.Logger = log

        # The (absolute) path of the steering file
        self.path = path

        # The runtime of the script
        self.runtime: Optional[int] = None
        self.start_time: Optional[int] = None

        # The name of the steering file. Basically the file name of the
        # steering file, but every run of characters that is not a letter or
        # digit is replaced by a single underscore. Useful e.g. for cluster
        # controls.
        self.name = self.sanitize_file_name(str(os.path.basename(self.path)))
        # useful when displaying the filename to the user
        self.name_not_sanitized = str(os.path.basename(self.path))

        # The package to which the steering file belongs
        self.package = package

        # Dictionary with the contents of the steering file's XML header
        # (filled lazily by load_header)
        self._header = dict()

        # Set to True if errors occurred while parsing the XML header
        self.header_parsing_errors = False

        # True once we have tried to parse the header (avoids parsing twice)
        self._header_parsing_attempted = False

        # A list of script objects, on which this script depends
        self.dependencies = []

        # Current status of the script.
        # Possible values: waiting, running, finished, failed, skipped, cached
        self.status = ScriptStatus.waiting

        # Which control is used for executing the script, i.e. cluster or
        # local. Useful when using different script levels, e.g. data creation
        # scripts are being run on the cluster, but plotting scripts are
        # executed locally
        self.control = None

        # The returncode of the script. Should be 0 if all went well.
        self.returncode: Optional[int] = None

        # Id of the job, if the script has been submitted to a batch/cluster
        # system (otherwise None)
        self.job_id: Optional[str] = None

    @staticmethod
    def sanitize_file_name(file_name):
        """!
        Replaces every run of characters that are not letters or digits
        (e.g. the dot before the file extension) with a single underscore.
        """
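        # Illustrative examples (file names made up for demonstration):
        #   "validationTestPlots.py" -> "validationTestPlots_py"
        #   "my-script_v2.py"        -> "my_script_v2_py"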
        return re.sub(r"[\W_]+", "_", file_name)

    def to_json(self, current_tag):
        """!
        Convert this script object into the corresponding
        json_objects.Script object.
        """

        string_status = ""

        if self.status == ScriptStatus.failed:
            string_status = "failed"
        elif self.status == ScriptStatus.finished:
            string_status = "finished"
        elif self.status == ScriptStatus.running:
            string_status = "running"
        elif self.status == ScriptStatus.skipped:
            string_status = "skipped"
        elif self.status == ScriptStatus.waiting:
            string_status = "waiting"
        elif self.status == ScriptStatus.cached:
            string_status = "cached"

        # filter for simulated files
        input_file_names = [
            ip.split("/")[-1] for ip in self.input_files if "../" in ip
        ]
        output_file_names = [
            op.split("/")[-1] for op in self.output_files if "../" in op
        ]

        return json_objects.Script(
            self.name_not_sanitized,
            self.path,
            string_status,
            log_url=os.path.join(self.package, self.name_not_sanitized)
            + ".log",
            return_code=self.returncode,
            input=input_file_names,
            output=output_file_names,
        )

    def get_recursive_dependencies(self, scripts, level=0):
        """!
        Loops over all dependencies of this script and recursively retrieves
        their sub-dependencies
        """

        if level > 50:
            self.log.error(
                f"Recursive dependency lookup reached level {level} and "
                f"will quit now. Possibly circular dependencies in the "
                f"validation scripts?"
            )

        all_deps = set()
        for dep in self.dependencies:
            # only add, if not already in the dependencies list
            all_deps.add(dep.name)

            next_level = level + 1

            # find script object
            dep_script = [x for x in scripts if x.name == dep.name]
            rec_deps = []
            if len(dep_script) == 1:
                rec_deps = dep_script[0].get_recursive_dependencies(
                    scripts, next_level
                )
            else:
                self.log.error(
                    f"Depending script with the name {dep.name} could not be "
                    f"found in the list of registered scripts."
                )

            # only add, if not already in the dependencies list
            for rc in rec_deps:
                all_deps.add(rc)

        return all_deps

    def unique_name(self):
        """
        Generates a unique name from the package and name of the script
        which only occurs once in the whole validation suite
        """
        return f"script_unique_name_{self.package}_{self.name}"
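
    # For example, a (made-up) script "my_plots.py" in package "my_package"
    # would get the unique name "script_unique_name_my_package_my_plots_py".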

    def compute_dependencies(self, scripts):
        """!
        Loops over the input files given in the header and tries to find the
        corresponding Script objects, which will then be stored in the
        script.dependencies-list
        @return: None
        """
        # Loop over all the dependencies given in the header information
        for root_file in self.input_files:

            # Find the script which is responsible for the creation of
            # the input file (in the same package or in validation folder)
            creator = find_creator(root_file, self.package, scripts, self.log)

            # If no creator could be found, raise an error!
            if creator is None:
                self.log.error(
                    f"Unmatched dependency for {self.path}: {root_file} "
                    f"has no creator! This means that we will have to skip "
                    f"this script."
                )
                self.status = ScriptStatus.skipped

            # If creator(s) could be found, add those scripts to the
            # list of scripts on which self depends
            else:
                self.dependencies += creator

        # remove double entries
        self.dependencies = list(set(self.dependencies))

    def load_header(self):
        """!
        This method opens the file given in self.path, tries to extract its
        XML header and parses it.
        It then fills the self._header variable with a dict containing the
        values that were read from the XML header.
        @return: None
        """
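        # Illustrative example of a header that this method would parse (the
        # tag names are the ones accessed elsewhere in this class; the values
        # are made up). The <header>...</header> block may appear anywhere in
        # the steering file, e.g. inside a module docstring:
        #
        #   <header>
        #     <input>generated_sample.root</input>
        #     <output>my_validation_plots.root</output>
        #     <contact>someone@example.org</contact>
        #     <interval>nightly</interval>
        #     <description>What this steering file checks.</description>
        #   </header>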
        if self._header_parsing_attempted:
            return

        self._header_parsing_attempted = True

        # Read the file as a whole
        # We specify encoding and errors here to avoid exceptions for people
        # with strange preferred encoding settings in their OS
        with open(self.path, encoding="utf-8", errors="replace") as data:
            steering_file_content = data.read()

        # Define the regex to extract everything between the <header>-tags
        pat = re.compile("(<header>.*?</header>)", re.DOTALL | re.M)

        # Apply the regex, i.e. filter out the <header>...</header> part of
        # each steering file.
        try:
            xml = pat.findall(steering_file_content)[0].strip()
        except IndexError:
            self.log.error("No file header found: " + self.path)
            self.header_parsing_errors = True
            return

        # Create an XML tree from the plain XML code.
        try:
            xml_tree = XMLTree.ElementTree(XMLTree.fromstring(xml)).getroot()
        except XMLTree.ParseError:
            self.log.error("Invalid XML in header: " + self.path)
            self.header_parsing_errors = True
            return

        # we have a header
        self._header = {}

        # Loop over that tree
        for branch in xml_tree:

            # The keywords that should be parsed into a list
            list_tags = ["input", "output", "contact"]

            # If the tag is empty branch.text is None. Replacing None with an
            # empty string in this case.
            branch_text = branch.text or ""

            # Format the values of each branch
            if branch.tag.strip() in list_tags:
                branch_value = [__.strip() for __ in branch_text.split(",")]
                if branch_value == [""]:
                    branch_value = []
            else:
                branch_value = re.sub(" +", " ", branch_text.replace("\n", ""))
                branch_value = branch_value.strip()

            # Append the branch and its values to the header-dict. This
            # implementation technically allows multiple occurrences of the
            # same <tag></tag>-pair, which will be bundled under the same key
            # in the returned dictionary
            if branch.tag.strip() in self._header:
                self._header[branch.tag.strip()] += branch_value
            else:
                self._header[branch.tag.strip()] = branch_value

    # Below are all of the getter methods for accessing data from the header.
    # If the header isn't loaded at the time they are called, we do that first.

    @property
    def input_files(self):
        """
        Returns a list of input files which this script will read.
        The header is loaded on first access.
        """
        self.load_header()
        return self._header.get("input", [])

    @property
    def output_files(self):
        """
        Returns a list of output files this script will create.
        The header is loaded on first access.
        """
        self.load_header()
        return self._header.get("output", [])

    @property
    def is_cacheable(self):
        """
        Returns true if the script does not need to be executed when its
        output files are already present.
        The header is loaded on first access.
        """
        self.load_header()
        return "cacheable" in self._header

    @property
    def noexecute(self) -> bool:
        """ A flag set in the header that tells us to simply ignore this
        script for the purpose of running the validation.
        """
        self.load_header()
        return "noexecute" in self._header

    @property
    def description(self) -> str:
        """ Description of the script as set in the header """
        self.load_header()
        return self._header.get("description", "")

    @property
    def contact(self) -> str:
        """ Contact of the script as set in the header """
        self.load_header()
        return self._header.get("contact", "")

    @property
    def interval(self) -> str:
        """ Interval of script execution as set in the header """
        self.load_header()
        return self._header.get("interval", "nightly")

    def remove_output_files(self) -> None:
        """Remove all output files. This is used to clean up files after a
        script has been marked as failed. Leaving the output files in a
        possibly corrupted state and risking that the validation framework
        picks them up later isn't sensible.
        """
        for f in map(Path, self.output_files):
            self.log.warning(
                f"Removing output file {f} (if it exists) because the script "
                "failed"
            )
            f.unlink(missing_ok=True)


def find_creator(
    outputfile: str, package: str, scripts: List[Script], log: logging.Logger
) -> Optional[List[Script]]:
    """!
    This function receives the name of a file and tries to find the script
    in the given package which produces this file, i.e. the script in
    whose header 'outputfile' is listed under <output></output>.
    It then returns a list of all Scripts that claim to create 'outputfile'.

    @param outputfile: The file for which we want to find the creating script
    @param package: The package in which we want to search for the creator
    @param scripts: List of all script objects/candidates
    @param log: Logger
    """

    # Get a list of all Script objects for scripts in the given package as
    # well as from the validation-folder
    candidates = [
        script
        for script in scripts
        if script.package in [package, "validation"]
    ]

    # Reserve some space for the results we will return
    results = []

    # Loop over all candidates and check if they have 'outputfile' listed
    # under their outputs
    for candidate in candidates:
        if outputfile in candidate.output_files:
            results.append(candidate)

    # Return our results and warn if there is more than one creator
    if len(results) == 0:
        return None
    if len(results) > 1:
        log.warning("Found multiple creators for " + outputfile)
    return results
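
# Minimal usage sketch (illustrative only; the file paths and package name
# below are made up and do not refer to real validation scripts). A caller
# would typically create one Script object per steering file, resolve the
# dependencies between them and then pick the scripts that are ready to run:
#
#   scripts = [
#       Script("some_package/validation/create_sample.py", "some_package"),
#       Script("some_package/validation/make_plots.py", "some_package"),
#   ]
#   for script in scripts:
#       script.compute_dependencies(scripts)
#   ready = [
#       s for s in scripts
#       if not s.dependencies and s.status == ScriptStatus.waiting
#   ]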