Belle II Software  release-08-01-10
clustercontrolsge.py
#!/usr/bin/env python3

# std
import logging
import os
import subprocess
import stat
import shutil

# ours
import validationfunctions
from validationscript import Script


class Cluster:
    """
    A class that provides the controls for running jobs on a (remote)
    Sun Grid Engine cluster. It provides two methods:
    - is_job_finished(job): Returns True or False, depending on whether the
      job has finished execution
    - execute(job): Takes a job and executes it by sending it to the cluster
    """

    @staticmethod
    def is_supported():
        """
        Check if qsub is available
        """
        return shutil.which("qsub") is not None

    @staticmethod
    def name():
        """
        Returns the name of this job control
        """
        return "cluster-sge"

    @staticmethod
    def description():
        """
        Returns the description of this job control
        """
        return "Batch submission via command line to Grid Engine"

    def __init__(self):
        """!
        The default constructor.
        - Holds the current working directory, which is also the location of
          the shell scripts that are being sent to the cluster.
        - Initializes a logger which writes to validate_basf2.py's log.
        - Finds the revision of basf2 that will be set up on the cluster.
        """

        #: The command to submit a job
        self.submit_command = (
            "qsub -cwd -l h_vmem={requirement_vmem}G,"
            "h_fsize={requirement_storage}G "
            "-oo {logfile} -q {queuename} -V"
        )
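
        # Example (illustrative): with the default requirements set below, the
        # formatted template resolves to roughly
        #   qsub -cwd -l h_vmem=4G,h_fsize=50G -oo <logfile> -q short.q -V <script.sh>
        # where <logfile> and <script.sh> are filled in per job by execute().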

        #: vmem required by the job in GB; required on DESY NAF, otherwise
        #: jobs get killed due to their memory consumption
        self.requirement_vmem = 4

        #: The storage IO in GB which can be performed by each job
        self.requirement_storage = 50

        #: Queue best suited for execution at DESY NAF
        self.queuename = "short.q"

        #: The path where the helper files are being created
        self.path = os.getcwd()

        #: Contains a reference to the logger object from validate_basf2
        self.logger = logging.getLogger("validate_basf2")

        #: We need to set up the same environment on the cluster as on the
        #: local machine (path to the basf2 tools)
        self.tools = self.adjust_path(os.environ["BELLE2_TOOLS"])
        belle2_release_dir = os.environ.get("BELLE2_RELEASE_DIR", None)
        belle2_local_dir = os.environ.get("BELLE2_LOCAL_DIR", None)

        #: The command for b2setup (and b2code-option)
        self.b2setup = "b2setup"
        if belle2_release_dir is not None:
            self.b2setup += " " + belle2_release_dir.split("/")[-1]
        if belle2_local_dir is not None:
            self.b2setup = (
                "MY_BELLE2_DIR="
                + self.adjust_path(belle2_local_dir)
                + " "
                + self.b2setup
            )
        if os.environ.get("BELLE2_OPTION") != "debug":
            self.b2setup += "; b2code-option " + os.environ.get("BELLE2_OPTION")

        # Write to the log which revision we are using
        self.logger.debug(f"Setting up the following release: {self.b2setup}")

        # Define the folder in which the log of the cluster messages will be
        # stored (same folder as the log for validate_basf2.py)
        clusterlog_dir = "./html/logs/__general__/"
        if not os.path.exists(clusterlog_dir):
            os.makedirs(clusterlog_dir)

        #: The file object to which all cluster messages will be written
        self.clusterlog = open(clusterlog_dir + "clusterlog.log", "w+")

    # noinspection PyMethodMayBeStatic
    def adjust_path(self, path: str):
        """!
        This method can be used if path names are different on submission
        and execution hosts.
        @param path: The path that needs to be adjusted
        @return: The adjusted path
        """

        return path

    # noinspection PyMethodMayBeStatic
    def available(self):
        """!
        The cluster should always be available to accept new jobs.
        @return: Will always return True if the function can be called
        """

        return True

    def execute(self, job: Script, options="", dry=False, tag="current"):
        """!
        Takes a Script object and a string with options and runs it on the
        cluster, either with ROOT or with basf2, depending on the file type.

        @param job: The steering file object that should be executed
        @param options: Options that will be given to the basf2 command
        @param dry: Whether to perform a dry run or not
        @param tag: The folder within the results directory
        @return: None
        """

        # Define the folder in which the results (= the ROOT files) should be
        # created. This is where the files containing plots will end up. By
        # convention, data files will be stored in the parent dir.
        # Then make sure the folder exists (create it if it does not) and
        # change the cwd to this folder.
        output_dir = os.path.abspath(f"./results/{tag}/{job.package}")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Path where the log file is supposed to be created
        log_file = output_dir + "/" + os.path.basename(job.path) + ".log"

        # Remove any leftover done files
        donefile_path = f"{self.path}/script_{job.name}.done"
        if os.path.isfile(donefile_path):
            os.remove(donefile_path)

        # Now we need to distinguish between .py and .C files:
        extension = os.path.splitext(job.path)[1]
        if extension == ".C":
            # .C files are executed with ROOT
            command = "root -b -q " + job.path
        else:
            # .py files are executed with basf2
            # 'options' contains an option string for basf2, e.g. '-n 100'
            params = validationfunctions.basf2_command_builder(
                job.path, options.split()
            )
            command = subprocess.list2cmdline(params)

        # Create a helper shell script which contains all the commands that
        # need to be executed by the cluster.
        # First, set up the basf2 tools and perform b2setup with the correct
        # revision. Then execute the command (i.e. run basf2 or ROOT on a
        # steering file). Write the return code of that into a *.done file.
        # Finally, delete the helper shell script.
        tmp_name = self.path + "/" + "script_" + job.name + ".sh"
        with open(tmp_name, "w+") as tmp_file:
            tmp_file.write(
                "#!/bin/bash \n\n"
                + "BELLE2_NO_TOOLS_CHECK=1 \n"
                + f"source {self.tools}/b2setup \n"
                + "cd {} \n".format(self.adjust_path(output_dir))
                + f"{command} \n"
                + "echo $? > {}/script_{}.done \n".format(self.path, job.name)
                + f"rm {tmp_name} \n"
            )
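
        # Illustrative example of the generated helper script, with
        # hypothetical values substituted for the placeholders:
        #
        #   #!/bin/bash
        #
        #   BELLE2_NO_TOOLS_CHECK=1
        #   source /path/to/tools/b2setup
        #   cd /path/to/results/current/<package>
        #   basf2 <steering_file>.py -n 100
        #   echo $? > /path/to/cwd/script_<job>.done
        #   rm /path/to/cwd/script_<job>.sh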

        # Make the helper shell script executable
        st = os.stat(tmp_name)
        os.chmod(tmp_name, st.st_mode | stat.S_IEXEC)

        # Prepare the command line for submission to the cluster
        params = self.submit_command.format(
            queuename=self.queuename,
            requirement_storage=self.requirement_storage,
            requirement_vmem=self.requirement_vmem,
            logfile=log_file,
        ).split() + [tmp_name]

        # Log the command we are about to execute
        self.logger.debug(subprocess.list2cmdline(params))

        # Submit it to the cluster. The steering file output will be written
        # to 'log_file' (see above).
        # If we are performing a dry run, don't send anything to the cluster;
        # just create the *.done file right away and delete the *.sh file.
        if not dry:
            process = subprocess.Popen(
                params, stdout=self.clusterlog, stderr=subprocess.STDOUT
            )

            # Check whether the submission succeeded
            if process.wait() != 0:
                job.status = "failed"
        else:
            os.system(f"echo 0 > {self.path}/script_{job.name}.done")
            os.unlink(tmp_name)

    def is_job_finished(self, job: Script):
        """!
        Checks whether the '.done' file has been created for a job. If so, it
        returns True, else it returns False.
        Also deletes the '.done' file once it has returned True.

        @param job: The job of which we want to know if it finished
        @return: True if the job has finished, otherwise False
        """

        # If there is a file indicating the job is done, this is its name:
        donefile_path = f"{self.path}/script_{job.name}.done"

        # Check if such a file exists. If so, this means that the job has
        # finished.
        if os.path.isfile(donefile_path):

            # Read the returncode/exit_status of the job from the *.done file
            with open(donefile_path) as f:
                try:
                    returncode = int(f.read().strip())
                except ValueError:
                    returncode = -654

            # Delete the *.done file
            os.remove(donefile_path)

            # Return that the job is finished + its return code
            return [True, returncode]

        # If no such file exists, the job has not yet finished
        else:
            return [False, 0]

    # noinspection PyMethodMayBeStatic
    def terminate(self, job: Script):
        """!
        Terminate a running job; not supported with this backend, so the
        call is ignored.
        """
        self.logger.error("Script termination not supported.")

logger: Contains a reference to the logger object from validate_basf2, so that cluster activity is written to validate_basf2.py's log.
queuename: Queue best suited for execution at DESY NAF.
tools: We need to set up the same environment on the cluster as on the local machine.
def available(self): The cluster should always be available to accept new jobs.
b2setup: The command for b2setup (and b2code-option).
clusterlog: The file object to which all cluster messages will be written.
def is_job_finished(self, Script job): Checks whether the '.done' file has been created for a job.
path: The path where the helper files are being created. Maybe there should be a special subfolder for them?
requirement_vmem: vmem required by the job in GB; required on DESY NAF, otherwise jobs get killed due to their memory consumption.
def execute(self, Script job, options="", dry=False, tag="current"): Takes a Script object and a string with options and runs it on the cluster, either with ROOT or with basf2, depending on the file type.
requirement_storage: The storage IO in GB which can be performed by each job.
def terminate(self, Script job): Terminate a running job; not supported with this backend, so the call is ignored.
submit_command: The command to submit a job.
def __init__(self): The default constructor.
def adjust_path(self, str path): This method can be used if path names are different on submission and execution hosts.
List[str] basf2_command_builder(str steering_file, List[str] parameters, use_multi_processing=False): Builds the basf2 command line used by execute() (defined in validationfunctions).
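
For orientation, here is a minimal sketch of how this backend can be driven from the validation side. The helper function run_on_cluster, the polling interval, and the "-n 100" option are illustrative assumptions and not part of the Belle II code; the Script object is expected to come from the validation framework.

    import time
    from clustercontrolsge import Cluster
    from validationscript import Script

    def run_on_cluster(job: Script) -> int:
        """Submit a single validation job and wait for its return code."""
        cluster = Cluster()
        cluster.execute(job, options="-n 100")

        # Poll for the *.done file that the helper script writes on completion
        finished, returncode = cluster.is_job_finished(job)
        while not finished:
            time.sleep(10)
            finished, returncode = cluster.is_job_finished(job)
        return returncode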