Belle II Software  release-05-02-19
validationscript.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 import re
5 import os
6 from typing import Optional, Dict, Any, List
7 import logging
8 
9 # A pretty printer. Prints prettier lists, dicts, etc. :)
10 import pprint
11 
12 # Import XML Parser. Use C-Version, if available
13 try:
14  import xml.etree.cElementTree as XMLTree
15 except ImportError:
16  import xml.etree.ElementTree as XMLTree
17 
18 import json_objects
19 
20 
# Module-wide pretty printer; Script.dump() uses it to print the attributes
# of a script object.
pp = pprint.PrettyPrinter(depth=6, indent=1, width=80)
22 
23 
class ScriptStatus:

    """!
    Enumeration of the states a script can be in during its execution
    cycle.

    Every state is an empty marker class. Script.status holds one of these
    classes and is compared against them with ==.
    """

    # script is waiting for execution
    class waiting:
        pass

    # script is running atm
    class running:
        pass

    # script execution has been successfully finished
    class finished:
        pass

    # script execution has failed
    class failed:
        pass

    # script has been marked to be skipped
    class skipped:
        pass

    # script output is already cached, do not execute script
    class cached:
        pass
54 
55 
class Script:

    """!
    The object representation of a steering file.

    @var path: The path to the steering file
    @var name: The name of the file, but without special chars or spaces
    @var name_not_sanitized: The plain base name of the steering file
    @var package: The package to which the steering file belongs to
    @var header: The contents of the XML file header (None until
        load_header() has parsed one successfully)
    @var dependencies: On which scripts does the steering file depend
    @var status: The current status, e.g. 'running' or 'finished'
    @var control: Execute locally or on the cluster?
    @var returncode: The returncode of the steering file
    @var runtime: The runtime of the script
    @var start_time: Initialised to None; not set by this class
    @var job_id: Initialised to None; not set by this class
    @var log: The logger used by the methods of this class
    @var _object: Pointer to the object itself. Is this even necessary?
    """

    def __init__(self, path: str, package: str, log: Optional[logging.Logger]):
        """!
        The default constructor.

        @param path: The path of the steering file
        @param package: The package to which the steering file belongs
        @param log: The logging object used in this validation run
        """

        # Pointer to the script object itself
        # Is this necessary?
        self._object = self

        # stores the reference to the logging object used in this validation
        # run
        self.log = log

        # The (absolute) path of the steering file
        self.path = path

        # The runtime of the script
        self.runtime = None  # type: Optional[int]
        self.start_time = None  # type: Optional[int]

        # The plain file name; useful when displaying the filename to the
        # user
        self.name_not_sanitized = str(os.path.basename(self.path))

        # The name of the steering file. Basically the file name of the
        # steering file, but everything that is not a letter or digit is
        # replaced by an underscore. Useful e.g. for cluster controls.
        self.name = Script.sanitize_file_name(self.name_not_sanitized)

        # The package to which the steering file belongs
        self.package = package

        # The information from the file header
        self.header = None  # type: Optional[Dict[str, Any]]

        # A list of script objects, on which this script depends
        self.dependencies = []  # type: List[Script]

        # Current status of the script.
        # Possible values: waiting, running, finished, failed, skipped,
        # cached
        self.status = ScriptStatus.waiting

        # Which control is used for executing the script, i.e. cluster or
        # local. Useful when using different script level, e.g. data creation
        # scripts are being run on the cluster, but plotting scripts are
        # executed locally
        self.control = None

        # The returncode of the script. Should be 0 if all went well.
        self.returncode = None  # type: Optional[int]

        # Job identifier. Nothing in this class assigns it; presumably it is
        # filled in by whatever submits the script (verify against callers).
        self.job_id = None  # type: Optional[str]

    @staticmethod
    def sanitize_file_name(file_name):
        """!
        Replaces every run of characters that are not letters or digits
        (including the dot before the extension and existing underscores)
        with a single underscore _

        @param file_name: The file name to sanitize
        @return: The sanitized name
        """
        return re.sub(r'[\W_]+', '_', file_name)

    def dump(self):
        """!
        Print out all properties = attributes of a script.
        @return: None
        """
        print()
        pp.pprint(vars(self))

    def to_json(self, current_tag):
        """!
        Creates the json_objects.Script representation of this script.

        @param current_tag: Not used by this method
        @return: A json_objects.Script built from the file name, path,
            status string, log url and return code of this script
        """

        # Status marker -> string used in the json object. An unknown
        # status is reported as an empty string.
        status_strings = (
            (ScriptStatus.failed, "failed"),
            (ScriptStatus.finished, "finished"),
            (ScriptStatus.running, "running"),
            (ScriptStatus.skipped, "skipped"),
            (ScriptStatus.waiting, "waiting"),
            (ScriptStatus.cached, "cached"),
        )
        string_status = next(
            (text for status, text in status_strings
             if self.status == status),
            ""
        )

        return json_objects.Script(
            self.name_not_sanitized,
            self.path,
            string_status,
            log_url=os.path.join(self.package, self.name_not_sanitized) +
            ".log",
            return_code=self.returncode
        )

    def get_recursive_dependencies(self, scripts, level=0):
        """!
        Loops over all dependencies of this script and recursively retrieves
        their sub-dependencies

        @param scripts: The registered scripts in which the dependencies are
            looked up by name
        @param level: The current recursion depth (callers use the default)
        @return: The set of names of all direct and indirect dependencies.
            The lookup is abandoned with an error message once the recursion
            is deeper than 50 levels.
        """

        if level > 50:
            self.log.error(
                f'Recurisve dependency lookup reached level {level} and will '
                f'quit now. Possibly circular dependcencies in the validation '
                f'scripts ? '
            )
            # Actually stop here: without this return, circular
            # dependencies recurse until the interpreter's limit is hit.
            return set()

        all_deps = set()
        next_level = level + 1

        for dep in self.dependencies:
            all_deps.add(dep.name)

            # find script object
            dep_script = [x for x in scripts if x.name == dep.name]
            if len(dep_script) == 1:
                all_deps.update(
                    dep_script[0].get_recursive_dependencies(
                        scripts, next_level)
                )
            else:
                self.log.error(
                    f'Depending script with the name {dep.name} could not be '
                    f'found in the list of registered scripts. '
                )

        return all_deps

    def unique_name(self):
        """
        Generates a unique name from the package and name of the script
        which only occurs once in the whole validation suite
        """
        return f"script_unique_name_{self.package}_{self.name}"

    def compute_dependencies(self, scripts):
        """!
        Loops over the input files given in the header and tries to find the
        corresponding Script objects, which will then be stored in the
        script.dependencies-list.

        Without a header the script directly preceding this one in its
        package (.py files first, then .C files, each sorted by path) is
        taken as the only dependency.

        @param scripts: All registered Script objects
        @return: None
        """
        # If all necessary header information are available:
        if self.header is not None:

            # Loop over all the dependencies given in the header information
            for root_file in self.header.get('input', []):

                # Find the script which is responsible for the creation of
                # the input file (in the same package or in validation folder)
                creator = find_creator(
                    root_file,
                    self.package,
                    scripts,
                    self.log
                )

                # If no creator could be found, log an error and skip this
                # script
                if creator is None:
                    self.log.error(
                        f'Unmatched dependency for {self.path}:{root_file} '
                        f'has no creator!')
                    self.status = ScriptStatus.skipped

                # If creator(s) could be found, add those scripts to the
                # list of scripts on which self depends
                else:
                    self.dependencies += creator

            # remove double entries, keeping the order of first occurrence
            # so that the result is the same on every run
            self.dependencies = list(dict.fromkeys(self.dependencies))

        # If the necessary header information are not available:
        else:
            # If there is a script whose name comes before this script, this
            # is presumed as a dependency

            # Get a list of all the script in the same directory
            in_same_pkg = [script for script in scripts
                           if script.package == self.package]

            # Divide that list into .py and .C files, because .py files are
            # always executed before .C files, and order each part by path
            py_files = sorted(
                (_ for _ in in_same_pkg if _.path.endswith('py')),
                key=lambda x: x.path
            )
            c_files = sorted(
                (_ for _ in in_same_pkg if _.path.endswith('C')),
                key=lambda x: x.path
            )

            # Now put the two lists back together
            in_same_pkg = py_files + c_files

            try:
                position = in_same_pkg.index(self)
            except ValueError:
                # This script is not among the .py/.C scripts of its package
                # (or not registered at all): no predecessor can be derived.
                return

            if position > 0:
                self.dependencies.append(in_same_pkg[position - 1])

    def get_input_files(self):
        """
        return a list of input files which this script will read.
        This information is only available, if load_header has been called
        """
        if self.header is None:
            return []

        return self.header.get('input', [])

    def get_output_files(self):
        """
        return a list of output files this script will create.
        This information is only available, if load_header has been called
        """
        if self.header is None:
            return []

        return self.header.get('output', [])

    def is_cacheable(self):
        """
        Returns true, if the script must not be executed if its output
        files are already present, i.e. if the header contains a
        'cacheable' entry.
        This information is only available, if load_header has been called
        """
        if self.header is None:
            return False

        return 'cacheable' in self.header

    def load_header(self):
        """!
        This method opens the file given in self.path, tries to extract the
        XML-header of it and then parse it.
        It then fills the self.header variable with a dict containing the
        values that were read from the XML header. If no header is found or
        the XML is invalid, an error is logged and self.header is left
        untouched.
        @return: None
        """

        # Read the file as a whole
        with open(self.path, "r") as data:
            steering_file_content = data.read()

        # Filter out the first <header>...</header> part of the steering
        # file. DOTALL lets the header span several lines.
        header_match = re.search(
            '(<header>.*?</header>)',
            steering_file_content,
            re.DOTALL | re.M
        )
        if header_match is None:
            self.log.error('No file header found: ' + self.path)
            return
        xml = header_match.group(1).strip()

        # Create an XML tree from the plain XML code.
        try:
            xml_tree = XMLTree.fromstring(xml)
        except XMLTree.ParseError:
            self.log.error('Invalid XML in header: ' + self.path)
            return

        # we have a header
        self.header = {}

        # The keywords that should be parsed into a list
        list_tags = ('input', 'output', 'contact')

        # Loop over that tree
        for branch in xml_tree:
            tag = branch.tag.strip()

            # If the tag is empty branch.text is None. Replacing None with an
            # empty string in this case.
            branch_text = branch.text or ""

            # Format the values of each branch
            if tag in list_tags:
                # comma separated values; an empty tag gives an empty list
                branch_value = [__.strip() for __ in branch_text.split(',')]
                if branch_value == ['']:
                    branch_value = []
            else:
                # free text: drop line breaks and collapse runs of spaces
                branch_value = re.sub(' +', ' ', branch_text.replace('\n', ''))
                branch_value = branch_value.strip()

            # Append the branch and its values to the header-dict. This
            # implementation technically allows multiple occurrences of the
            # same <tag></tag>-pair, which will be bundled to the same key
            # in the returned dictionary
            if tag in self.header:
                self.header[tag] += branch_value
            else:
                self.header[tag] = branch_value
366 
367 
def find_creator(
        outputfile: str,
        package: str,
        scripts: List["Script"],
        log: logging.Logger
) -> Optional[List["Script"]]:
    """!
    This function receives the name of a file and tries to find the file
    in the given package which produces this file, i.e. find the file in
    whose header 'outputfile' is listed under <output></output>.
    It then returns a list of all Scripts who claim to be creating 'outputfile'

    @param outputfile: The file of which we want to know by which script is
        created
    @param package: The package in which we want to search for the creator
    @param scripts: The Script objects among which the creator is searched
    @param log: The logger that receives a warning if more than one creator
        is found
    @return: The list of creating scripts, or None if there is no creator
    """

    # Consider all scripts of the given package as well as those from the
    # validation-folder, and keep the ones that have a header listing
    # 'outputfile' under their outputs
    results = [
        script for script in scripts
        if script.package in (package, 'validation')
        and script.header
        and outputfile in script.header.get('output', [])
    ]

    # Return our results and warn if there is more than one creator
    if not results:
        return None
    if len(results) > 1:
        log.warning(f'Found multiple creators for {outputfile}')
    return results
validationscript.ScriptStatus.cached
Definition: validationscript.py:52
validationscript.ScriptStatus.failed
Definition: validationscript.py:44
validationscript.ScriptStatus.waiting
Definition: validationscript.py:32
validationscript.ScriptStatus.finished
Definition: validationscript.py:40
validationscript.ScriptStatus.skipped
Definition: validationscript.py:48
validationscript.ScriptStatus.running
Definition: validationscript.py:36
validationscript.ScriptStatus
Enumeration of the states a script can be during its execution cycle.
Definition: validationscript.py:24
json_objects.Script
Definition: json_objects.py:77