release-05-02-19/doxygen/framework_8py_source.html

#!/usr/bin/env python3

# -*- coding: utf-8 -*-


"""

This module implements several objects/functions to configure and run calibrations.

These classes are used to construct the workflow of the calibration job.

The actual processing code is mostly in the `caf.state_machines` module.

"""


__all__ = ["CalibrationBase", "Calibration", "Algorithm", "CAF"]


import os

from threading import Thread

from time import sleep

from pathlib import Path

import shutil

from glob import glob


from basf2 import B2ERROR, B2WARNING, B2INFO, B2FATAL, B2DEBUG

from basf2 import find_file

from basf2 import conditions as b2conditions


from abc import ABC, abstractmethod


import caf

from caf.utils import B2INFO_MULTILINE

from caf.utils import past_from_future_dependencies

from caf.utils import topological_sort

from caf.utils import all_dependencies

from caf.utils import method_dispatch

from caf.utils import temporary_workdir

from caf.utils import find_int_dirs

from caf.utils import LocalDatabase

from caf.utils import CentralDatabase

from caf.utils import parse_file_uri


import caf.strategies as strategies

import caf.runners as runners

from caf import backends

from caf.backends import MaxSubjobsSplitter, MaxFilesSplitter

from caf.state_machines import CalibrationMachine, ConditionError, MachineError

from caf.database import CAFDB


class Collection():

    """

    Keyword Arguments:

        collector (str, basf2.Module): The collector module  or module name for this `Collection`.

        input_files (list[str]): The input files to be used for only this `Collection`.

        pre_collection_path (basf2.Path): The reconstruction `basf2.Path` to be run prior to the Collector module.

        database_chain (list[CentralDatabase, LocalDatabase]): The database chain to be used initially for this `Collection`.

        output_patterns (list[str]): Output patterns of files produced by collector which will be used to pass to the

            `Algorithm.data_input` function. Setting this here, replaces the default completely.

        max_files_for_collector_job (int): Maximum number of input files sent to each collector subjob for this `Collection`.

            Technically this sets the SubjobSplitter to be used, not compatible with max_collector_jobs.

        max_collector_jobs (int): Maximum number of collector subjobs for this `Collection`.

            Input files are split evenly between them. Technically this sets the SubjobSplitter to be used. Not compatible with

            max_files_for_collector_job.

        backend_args (dict): The args for the backend submission of this `Collection`.

    """


    default_max_collector_jobs = 1000


    job_config = "collector_job.json"


    def __init__(self,

                 collector=None,

                 input_files=None,

                 pre_collector_path=None,

                 database_chain=None,

                 output_patterns=None,

                 max_files_per_collector_job=None,

                 max_collector_jobs=None,

                 backend_args=None

                 ):


        self.collector = collector


        self.input_files = []

        if input_files:

            self.input_files = input_files


        self.files_to_iovs = {}


        self.pre_collector_path = None

        if pre_collector_path:

            self.pre_collector_path = pre_collector_path


        self.output_patterns = ["CollectorOutput.root"]

        if output_patterns:

            self.output_patterns = output_patterns


        self.splitter = None

        if max_files_per_collector_job and max_collector_jobs:

            B2FATAL("Cannot set both 'max_files_per_collector_job' and 'max_collector_jobs' of a collection!")

        elif max_files_per_collector_job:

            self.max_files_per_collector_job = max_files_per_collector_job

        elif max_collector_jobs:

            self.max_collector_jobs = max_collector_jobs

        else:

            self.max_collector_jobs = self.default_max_collector_jobs


        self.backend_args = {}

        if backend_args:

            self.backend_args = backend_args


        if database_chain:


            self.database_chain = database_chain

        else:

            self.database_chain = []

            # This may seem weird but the changes to the DB interface mean that they have effectively swapped from being

            # described well by appending to a list to a deque. So we do bit of reversal to translate it back and make the

            # most important GT the last one encountered.

            for tag in reversed(b2conditions.default_globaltags):

                self.use_central_database(tag)


        self.job_script = Path(find_file("calibration/scripts/caf/run_collector_path.py")).absolute()

        """The basf2 steering file that will be used for Collector jobs run by this collection.

This script will be copied into subjob directories as part of the input sandbox."""


        self.job_cmd = ["basf2", self.job_script.name, "--job-information job_info.json"]


    def reset_database(self):

        """

        Remove everything in the database_chain of this Calibration, including the default central database

        tag automatically included from `basf2.conditions.default_globaltags <ConditionsConfiguration.default_globaltags>`.

        """

        self.database_chain = []


    def use_central_database(self, global_tag):

        """

        Parameters:

            global_tag (str): The central database global tag to use for this calibration.


        Using this allows you to add a central database to the head of the global tag database chain for this collection.

        The default database chain is just the central one from `basf2.conditions.default_globaltags`.

        The input file global tag will always be overrided and never used unless explicitly set.


        To turn off central database completely or use a custom tag as the base, you should call `Calibration.reset_database`

        and start adding databases with `Calibration.use_local_database` and `Calibration.use_central_database`.


        Alternatively you could set an empty list as the input database_chain when adding the Collection to the Calibration.


        NOTE!! Since ``release-04-00-00`` the behaviour of basf2 conditions databases has changed.

        All local database files MUST now be at the head of the 'chain', with all central database global tags in their own

        list which will be checked after all local database files have been checked.


        So even if you ask for ``["global_tag1", "localdb/database.txt", "global_tag2"]`` to be the database chain, the real order

        that basf2 will use them is ``["global_tag1", "global_tag2", "localdb/database.txt"]`` where the file is checked first.

        """

        central_db = CentralDatabase(global_tag)

        self.database_chain.append(central_db)


    def use_local_database(self, filename, directory=""):

        """

        Parameters:

            filename (str): The path to the database.txt of the local database

            directory (str): The path to the payloads directory for this local database.


        Append a local database to the chain for this collection.

        You can call this function multiple times and each database will be added to the chain IN ORDER.

        The databases are applied to this collection ONLY.


        NOTE!! Since release-04-00-00 the behaviour of basf2 conditions databases has changed.

        All local database files MUST now be at the head of the 'chain', with all central database global tags in their own

        list which will be checked after all local database files have been checked.


        So even if you ask for ["global_tag1", "localdb/database.txt", "global_tag2"] to be the database chain, the real order

        that basf2 will use them is ["global_tag1", "global_tag2", "localdb/database.txt"] where the file is checked first.

        """

        local_db = LocalDatabase(filename, directory)

        self.database_chain.append(local_db)


    @staticmethod

    def uri_list_from_input_file(input_file):

        """

        Parameters:

            input_file (str): A local file/glob pattern or XROOTD URI


        Returns:

            list: A list of the URIs found from the initial string.

        """

        # By default we assume it is a local file path if no "scheme" is given

        uri = parse_file_uri(input_file)

        if uri.scheme == "file":

            # For local files we also perform a glob just in case it is a wildcard pattern.

            # That way we will have all the uris of files separately

            uris = [parse_file_uri(f).geturl() for f in glob(input_file)]

        else:

            # Just let everything else through and hop the backend can figure it out

            uris = [input_file]

        return uris


    @property

    def input_files(self):

        return self._input_files


    @input_files.setter

    def input_files(self, value):

        if isinstance(value, str):

            # If it's a string, we convert to a list of URIs

            self._input_files = self.uri_list_from_input_file(value)

        elif isinstance(value, list):

            # If it's a list we loop and do the same thing

            total_files = []

            for pattern in value:

                total_files.extend(self.uri_list_from_input_file(pattern))

            self._input_files = total_files

        else:

            raise TypeError("Input files must be a list or string")


    @property

    def collector(self):

        """

        """

        return self._collector


    @collector.setter

    def collector(self, collector):

        """

        """

        # check if collector is already a module or if we need to create one

        # from the name

        if collector:

            from basf2 import Module

            if isinstance(collector, str):

                from basf2 import register_module

                collector = register_module(collector)

            if not isinstance(collector, Module):

                B2ERROR("Collector needs to be either a Module or the name of such a module")


        self._collector = collector


    def is_valid(self):

        if (not self.collector or not self.input_files):

            return False

        else:

            return True


    @property

    def max_collector_jobs(self):

        if self.splitter:

            return self.splitter.max_subjobs

        else:

            return None


    @max_collector_jobs.setter

    def max_collector_jobs(self, value):

        if value is None:

            self.splitter = None

        else:

            self.splitter = MaxSubjobsSplitter(max_subjobs=value)


    @property

    def max_files_per_collector_job(self):

        if self.splitter:

            return self.splitter.max_files_per_subjob

        else:

            return None


    @max_files_per_collector_job.setter

    def max_files_per_collector_job(self, value):

        if value is None:

            self.splitter = None

        else:

            self.splitter = MaxFilesSplitter(max_files_per_subjob=value)


class CalibrationBase(ABC, Thread):

    """

    Abstract base class of Calibration types. The CAF implements the :py:class:`Calibration` class which inherits from

    this and runs the C++ CalibrationCollectorModule and CalibrationAlgorithm classes. But by inheriting from this

    class and providing the minimal necessary methods/attributes you could plug in your own Calibration types

    that doesn't depend on the C++ CAF at all and run everything in your own way.


    .. warning:: Writing your own class inheriting from :py:class:`CalibrationBase` class is not recommended!

                             But it's there if you really need it.


    Parameters:

        name (str): Name of this calibration object. Should be unique if you are going to run it.


    Keyword Arguments:

        input_files (list[str]): Input files for this calibration. May contain wildcard expressions useable by `glob.glob`.

    """


    end_state = "completed"


    fail_state = "failed"


    def __init__(self, name, input_files=None):

        """

        """

        super().__init__()


        self.name = name


        self.future_dependencies = []


        self.dependencies = []


        self.files_to_iovs = {}

        if input_files:


            self.input_files = input_files

        else:

            self.input_files = []


        self.iov = None


        self.output_database_dir = ""


        self.save_payloads = True


        self.jobs_to_submit = []


    @abstractmethod

    def run(self):

        """

        The most important method. Runs inside a new Thread and is called from `CalibrationBase.start`

        once the dependencies of this `CalibrationBase` have returned with state == end_state i.e. "completed".

        """


    @abstractmethod

    def is_valid(self):

        """

        A simple method you should implement that will return True or False depending on whether

        the Calibration has been set up correctly and can be run safely.

        """


    def depends_on(self, calibration):

        """

        Parameters:

            calibration (`CalibrationBase`): The Calibration object which will produce constants that this one depends on.


        Adds dependency of this calibration on another i.e. This calibration

        will not run until the dependency has completed, and the constants produced

        will be used via the database chain.


        You can define multiple dependencies for a single calibration simply

        by calling this multiple times. Be careful when adding the calibration into

        the `CAF` not to add a circular/cyclic dependency. If you do the sort will return an

        empty order and the `CAF` processing  will fail.


        This function appens to the `CalibrationBase.dependencies` and `CalibrationBase.future_dependencies` attributes of this

        `CalibrationBase` and the input one respectively. This prevents us having to do too much recalculation later on.

        """

        # Check that we don't have two calibration names that are the same

        if self.name != calibration.name:

            # Tests if we have the calibrations added as dependencies already and adds if not

            if calibration not in self.dependencies:

                self.dependencies.append(calibration)

            if self not in calibration.dependencies:

                calibration.future_dependencies.append(self)

        else:

            B2WARNING((f"Tried to add {calibration} as a dependency for {self} but they have the same name."

                       "Dependency was not added."))


    def dependencies_met(self):

        """

        Checks if all of the Calibrations that this one depends on have reached a successful end state.

        """

        return all(map(lambda x: x.state == x.end_state, self.dependencies))


    def failed_dependencies(self):

        """

        Returns the list of calibrations in our dependency list that have failed.

        """

        failed = []

        for calibration in self.dependencies:

            if calibration.state == self.fail_state:

                failed.append(calibration)

        return failed


    def _apply_calibration_defaults(self, defaults):

        """

        We pass in default calibration options from the `CAF` instance here if called.

        Won't overwrite any options already set.

        """

        for key, value in defaults.items():

            try:

                if getattr(self, key) is None:

                    setattr(self, key, value)

            except AttributeError:

                print(f"The calibration {self.name} does not support the attribute {key}.")


class Calibration(CalibrationBase):

    """

    Every Calibration object must have at least one collector at least one algorithm.

    You have the option to add in your collector/algorithm by argument here, or add them

    later by changing the properties.


    If you plan to use multiple `Collection` objects I recommend that you only set the name here and add the Collections

    separately via `add_collection()`.


    Parameters:

        name (str): Name of this calibration. It should be unique for use in the `CAF`

    Keyword Arguments:

        collector (str, `basf2.Module`): Should be set to a CalibrationCollectorModule() or a string with the module name.

        algorithms (list, ``ROOT.Belle2.CalibrationAlgorithm``): The algorithm(s) to use for this `Calibration`.

        input_files (str, list[str]): Input files for use by this Calibration. May contain wildcards useable by `glob.glob`


    A Calibration won't be valid in the `CAF` until it has all of these four attributes set. For example:


    >>> cal = Calibration('TestCalibration1')

    >>> col1 = register_module('CaTest')

    >>> cal.add_collection('TestColl', col1)


    or equivalently


    >>> cal = Calibration('TestCalibration1', 'CaTest')


    If you want to run a basf2 :py:class:`path <basf2.Path>` before your collector module when running over data


    >>> cal.pre_collector_path = my_basf2_path


    You don't have to put a RootInput module in this pre-collection path, but you can if

    you need some special parameters. If you want to process sroot files the you have to explicitly add

    SeqRootInput to your pre-collection path.

    The inputFileNames parameter of (Seq)RootInput will be set by the CAF automatically for you.


    You can use optional arguments to pass in some/all during initialisation of the `Calibration` class


    >>> cal = Calibration( 'TestCalibration1', 'CaTest', [alg1,alg2], ['/path/to/file.root'])


    you can change the input file list later on, before running with `CAF`


    >>> cal.input_files = ['path/to/*.root', 'other/path/to/file2.root']


    If you have multiple collections from calling `add_collection()` then you should instead set the pre_collector_path,

    input_files, database chain etc from there. See `Collection`.


    Adding the CalibrationAlgorithm(s) is easy


    >>> alg1 = TestAlgo()

    >>> cal.algorithms = alg1


    Or equivalently


    >>> cal.algorithms = [alg1]


    Or for multiple algorithms for one collector


    >>> alg2 = TestAlgo()

    >>> cal.algorithms = [alg1, alg2]


    Note that when you set the algorithms, they are automatically wrapped and stored as a Python class

    `Algorithm`. To access the C++ algorithm clas underneath directly do:


    >>> cal.algorithms[i].algorithm


    If you have a setup function that you want to run before each of the algorithms, set that with


    >>> cal.pre_algorithms = my_function_object


    If you want a different setup for each algorithm use a list with the same number of elements

    as your algorithm list.


    >>> cal.pre_algorithms = [my_function1, my_function2, ...]


    You can also specify the dependencies of the calibration on others


    >>> cal.depends_on(cal2)


    By doing this, the `CAF` will respect the ordering of the calibrations and will pass the

    calibration constants created by earlier completed calibrations to dependent ones.

    """


    moves = ["submit_collector", "complete", "run_algorithms", "iterate", "fail_fully"]


    alg_output_dir = "algorithm_output"


    checkpoint_states = ["init", "collector_completed", "completed"]


    default_collection_name = "default"


    def __init__(self,

                 name,

                 collector=None,

                 algorithms=None,

                 input_files=None,

                 pre_collector_path=None,

                 database_chain=None,

                 output_patterns=None,

                 max_files_per_collector_job=None,

                 max_collector_jobs=None,

                 backend_args=None

                 ):

        """

        """


        self.collections = {}


        self._algorithms = []


        # Default collection added, will have None type and requires setting later via `self.collector`, or will take the

        # CollectorModule/module name directly.

        self.add_collection(self.default_collection_name,

                            Collection(collector,

                                       input_files,

                                       pre_collector_path,

                                       database_chain,

                                       output_patterns,

                                       max_files_per_collector_job,

                                       max_collector_jobs,

                                       backend_args

                                       ))


        super().__init__(name, input_files)

        if algorithms:


            self.algorithms = algorithms


        self.results = {}


        self.max_iterations = None


        self.ignored_runs = None

        if self.algorithms:


            self.strategies = strategies.SingleIOV

        if database_chain:


            self.database_chain = database_chain

        else:

            self.database_chain = []

            # This database is already applied to the `Collection` automatically, so don't do it again

            for tag in reversed(b2conditions.default_globaltags):

                self.use_central_database(tag, apply_to_default_collection=False)


        self.algorithms_runner = runners.SeqAlgorithmsRunner


        self.backend = None


        self.collector_full_update_interval = 30


        self.heartbeat = 3


        self.machine = None


        self._db_path = None


    def add_collection(self, name, collection):

        """

        Parameters:

            name (str): Unique name of this `Collection` in the Calibration.

            collection (`Collection`): `Collection` object to use.


        Adds a new `Collection` object to the `Calibration`. Any valid Collection will be used in the Calibration.

        A default Collection is automatically added but isn't valid and won't run unless you have assigned a collector

        + input files.

        You can ignore the default one and only add your own custom Collections. You can configure the default from the

        Calibration(...) arguments or after creating the Calibration object via directly setting the cal.collector, cal.input_files

        attributes.

        """

        if name not in self.collections:

            self.collections[name] = collection

        else:

            B2WARNING(f"A Collection with the name '{name}' already exists in this Calibration. It has not been added."

                      "Please use another name.")


    def is_valid(self):

        """

        A full calibration consists of a collector AND an associated algorithm AND input_files.


        Returns False if:

            1) We are missing any of the above.

            2) There are multiple Collections and the Collectors have mis-matched granularities.

            3) Any of our Collectors have granularities that don't match what our Strategy can use.

        """

        if not self.algorithms:

            B2WARNING(f"Empty algorithm list for {self.name}.")

            return False


        if not any([collection.is_valid() for collection in self.collections.values()]):

            B2WARNING(f"No valid Collections for {self.name}.")

            return False


        granularities = []

        for collection in self.collections.values():

            if collection.is_valid():

                collector_params = collection.collector.available_params()

                for param in collector_params:

                    if param.name == "granularity":

                        granularities.append(param.values)

        if len(set(granularities)) > 1:

            B2WARNING("Multiple different granularities set for the Collections in this Calibration.")

            return False


        for alg in self.algorithms:

            alg_type = type(alg.algorithm).__name__

            incorrect_gran = [granularity not in alg.strategy.allowed_granularities for granularity in granularities]

            if any(incorrect_gran):

                B2WARNING(f"Selected strategy for {alg_type} does not match a collector's granularity.")

                return False

        return True


    def reset_database(self, apply_to_default_collection=True):

        """

        Keyword Arguments:

            apply_to_default_collection (bool): Should we also reset the default collection?


        Remove everything in the database_chain of this Calibration, including the default central database

        tag automatically included from `basf2.conditions.default_globaltags`. This will NOT affect the database chain of any

        `Collection` other than the default one. You can prevent the default Collection from having its chain reset by setting

        'apply_to_default_collection' to False.

        """

        self.database_chain = []

        if self.default_collection_name in self.collections and apply_to_default_collection:

            self.collections[self.default_collection_name].reset_database()


    def use_central_database(self, global_tag, apply_to_default_collection=True):

        """

        Parameters:

            global_tag (str): The central database global tag to use for this calibration.


        Keyword Arguments:

            apply_to_default_collection (bool): Should we also call use_central_database on the default collection (if it exists)


        Using this allows you to append a central database to the database chain for this calibration.

        The default database chain is just the central one from `basf2.conditions.default_globaltags`.

        To turn off central database completely or use a custom tag as the base, you should call `Calibration.reset_database`

        and start adding databases with `Calibration.use_local_database` and `Calibration.use_central_database`.


        Note that the database chain attached to the `Calibration` will only affect the default `Collection` (if it exists),

        and the algorithm processes. So calling:


        >> cal.use_central_database("global_tag")


        will modify the database chain used by all the algorithms assigned to this `Calibration`, and modifies the database chain

        assigned to


        >> cal.collections['default'].database_chain


        But calling


        >> cal.use_central_database(file_path, payload_dir, False)


        will add the database to the Algorithm processes, but leave the default Collection database chain untouched.

        So if you have multiple Collections in this Calibration *their database chains are separate*.

        To specify an additional `CentralDatabase` for a different collection, you will have to call:


        >> cal.collections['OtherCollection'].use_central_database("global_tag")

        """

        central_db = CentralDatabase(global_tag)

        self.database_chain.append(central_db)

        if self.default_collection_name in self.collections and apply_to_default_collection:

            self.collections[self.default_collection_name].use_central_database(global_tag)


    def use_local_database(self, filename, directory="", apply_to_default_collection=True):

        """

        Parameters:

            filename (str): The path to the database.txt of the local database


        Keyword Argumemts:

            directory (str): The path to the payloads directory for this local database.

            apply_to_default_collection (bool): Should we also call use_local_database on the default collection (if it exists)


        Append a local database to the chain for this calibration.

        You can call this function multiple times and each database will be added to the chain IN ORDER.

        The databases are applied to this calibration ONLY.

        The Local and Central databases applied via these functions are applied to the algorithm processes and optionally

        the default `Collection` job as a database chain.

        There are other databases applied to the processes later, checked by basf2 in this order:


        1) Local Database from previous iteration of this Calibration.

        2) Local Database chain from output of previous dependent Calibrations.

        3) This chain of Local and Central databases where the last added is checked first.


        Note that this function on the `Calibration` object will only affect the default `Collection` if it exists and if

        'apply_to_default_collection' remains True. So calling:


        >> cal.use_local_database(file_path, payload_dir)


        will modify the database chain used by all the algorithms assigned to this `Calibration`, and modifies the database chain

        assigned to


        >> cal.collections['default'].database_chain


        But calling


        >> cal.use_local_database(file_path, payload_dir, False)


        will add the database to the Algorithm processes, but leave the default Collection database chain untouched.


        If you have multiple Collections in this Calibration *their database chains are separate*.

        To specify an additional `LocalDatabase` for a different collection, you will have to call:


        >> cal.collections['OtherCollection'].use_local_database(file_path, payload_dir)


        """

        local_db = LocalDatabase(filename, directory)

        self.database_chain.append(local_db)

        if self.default_collection_name in self.collections and apply_to_default_collection:

            self.collections[self.default_collection_name].use_local_database(filename, directory)


    def _get_default_collection_attribute(self, attr):

        if self.default_collection_name in self.collections:

            return getattr(self.collections[self.default_collection_name], attr)

        else:

            B2WARNING(f"You tried to get the attribute '{attr}' from the Calibration '{self.name}', "

                      "but the default collection doesn't exist."

                      f"You should use the cal.collections['CollectionName'].{attr} to access a custom "

                      "collection's attributes directly.")

            return None


    def _set_default_collection_attribute(self, attr, value):

        if self.default_collection_name in self.collections:

            setattr(self.collections[self.default_collection_name], attr, value)

        else:

            B2WARNING(f"You tried to set the attribute '{attr}' from the Calibration '{self.name}', "

                      "but the default collection doesn't exist."

                      f"You should use the cal.collections['CollectionName'].{attr} to access a custom "

                      "collection's attributes directly.")


    @property

    def collector(self):

        """

        """

        return self._get_default_collection_attribute("collector")


    @collector.setter

    def collector(self, collector):

        """

        """

        # check if collector is already a module or if we need to create one

        # from the name

        from basf2 import Module

        if isinstance(collector, str):

            from basf2 import register_module

            collector = register_module(collector)

        if not isinstance(collector, Module):

            B2ERROR("Collector needs to be either a Module or the name of such a module")


        self._set_default_collection_attribute("collector", collector)


    @property

    def input_files(self):

        """

        """

        return self._get_default_collection_attribute("input_files")


    @input_files.setter

    def input_files(self, files):

        """

        """

        self._set_default_collection_attribute("input_files", files)


    @property

    def files_to_iovs(self):

        """

        """

        return self._get_default_collection_attribute("files_to_iovs")


    @files_to_iovs.setter

    def files_to_iovs(self, file_map):

        """

        """

        self._set_default_collection_attribute("files_to_iovs", file_map)


    @property

    def pre_collector_path(self):

        """

        """

        return self._get_default_collection_attribute("pre_collector_path")


    @pre_collector_path.setter

    def pre_collector_path(self, path):

        """

        """

        self._set_default_collection_attribute("pre_collector_path", path)


    @property

    def output_patterns(self):

        """

        """

        return self._get_default_collection_attribute("output_patterns")


    @output_patterns.setter

    def output_patterns(self, patterns):

        """

        """

        self._set_default_collection_attribute("output_patterns", patterns)


    @property

    def max_files_per_collector_job(self):

        """

        """

        return self._get_default_collection_attribute("max_files_per_collector_job")


    @max_files_per_collector_job.setter

    def max_files_per_collector_job(self, max_files):

        """

        """

        self._set_default_collection_attribute("max_files_per_collector_job", max_files)


    @property

    def max_collector_jobs(self):

        """

        """

        return self._get_default_collection_attribute("max_collector_jobs")


    @max_collector_jobs.setter

    def max_collector_jobs(self, max_jobs):

        """

        """

        self._set_default_collection_attribute("max_collector_jobs", max_jobs)


    @property

    def backend_args(self):

        """

        """

        return self._get_default_collection_attribute("backend_args")


    @backend_args.setter

    def backend_args(self, args):

        """

        """

        self._set_default_collection_attribute("backend_args", args)


    @property

    def algorithms(self):

        """

        """

        return self._algorithms


    @algorithms.setter

    @method_dispatch

    def algorithms(self, value):

        """

        """

        from ROOT.Belle2 import CalibrationAlgorithm

        if isinstance(value, CalibrationAlgorithm):

            self._algorithms = [Algorithm(value)]

        else:

            B2ERROR(f"Something other than CalibrationAlgorithm instance passed in ({type(value)}). "

                    "Algorithm needs to inherit from Belle2::CalibrationAlgorithm")


    @algorithms.fset.register(tuple)

    @algorithms.fset.register(list)

    def _(self, value):

        """

        Alternate algorithms setter for lists and tuples of CalibrationAlgorithms.

        """

        from ROOT.Belle2 import CalibrationAlgorithm

        if value:

            self._algorithms = []

            for alg in value:

                if isinstance(alg, CalibrationAlgorithm):

                    self._algorithms.append(Algorithm(alg))

                else:

                    B2ERROR((f"Something other than CalibrationAlgorithm instance passed in {type(value)}."

                             "Algorithm needs to inherit from Belle2::CalibrationAlgorithm"))


    @property

    def pre_algorithms(self):

        """

        Callback run prior to each algorithm iteration.

        """

        return [alg.pre_algorithm for alg in self.algorithms]


    @pre_algorithms.setter

    @method_dispatch

    def pre_algorithms(self, func):

        """

        """

        if func:

            for alg in self.algorithms:

                alg.pre_algorithm = func

        else:

            B2ERROR("Something evaluated as False passed in as pre_algorithm function.")


    @pre_algorithms.fset.register(tuple)

    @pre_algorithms.fset.register(list)

    def _(self, values):

        """

        Alternate pre_algorithms setter for lists and tuples of functions, should be one per algorithm.

        """

        if values:

            if len(values) == len(self.algorithms):

                for func, alg in zip(values, self.algorithms):

                    alg.pre_algorithm = func

            else:

                B2ERROR("Number of functions and number of algorithms doesn't match.")

        else:

            B2ERROR("Empty container passed in for pre_algorithm functions")


    @property

    def strategies(self):

        """

        The `caf.strategies.AlgorithmStrategy` or `list` of them used when running the algorithm(s).

        """

        return [alg.strategy for alg in self.algorithms]


    @strategies.setter

    @method_dispatch

    def strategies(self, strategy):

        """

        """

        if strategy:

            for alg in self.algorithms:

                alg.strategy = strategy

        else:

            B2ERROR("Something evaluated as False passed in as a strategy.")


    @strategies.fset.register(tuple)

    @strategies.fset.register(list)

    def _(self, values):

        """

        Alternate strategies setter for lists and tuples of functions, should be one per algorithm.

        """

        if values:

            if len(values) == len(self.algorithms):

                for strategy, alg in zip(strategies, self.algorithms):

                    alg.strategy = strategy

            else:

                B2ERROR("Number of strategies and number of algorithms doesn't match.")

        else:

            B2ERROR("Empty container passed in for strategies list")


    def __repr__(self):

        """

        """

        return self.name


    def run(self):

        """

        Main logic of the Calibration object.

        Will be run in a new Thread by calling the start() method.

        """

        with CAFDB(self._db_path, read_only=True) as db:

            initial_state = db.get_calibration_value(self.name, "checkpoint")

            initial_iteration = db.get_calibration_value(self.name, "iteration")

        B2INFO("Initial status of {} found to be state={}, iteration={}".format(self.name,

                                                                                initial_state,

                                                                                initial_iteration))

        self.machine = CalibrationMachine(self,

                                          iov_to_calibrate=self.iov,

                                          initial_state=initial_state,

                                          iteration=initial_iteration)

        self.state = initial_state

        self.machine.root_dir = Path(os.getcwd(), self.name)

        self.machine.collector_backend = self.backend


        # Before we start running, let's clean up any iteration directories from iterations above our initial one.

        # Should prevent confusion between attempts if we fail again.

        all_iteration_paths = find_int_dirs(self.machine.root_dir)

        for iteration_path in all_iteration_paths:

            if int(iteration_path.name) > initial_iteration:

                shutil.rmtree(iteration_path)


        while self.state != self.end_state and self.state != self.fail_state:

            if self.state == "init":

                try:

                    B2INFO(f"Attempting collector submission for calibration {self.name}.")

                    self.machine.submit_collector()

                except Exception as err:

                    B2FATAL(str(err))


                self._poll_collector()


            # If we failed take us to the final fail state

            if self.state == "collector_failed":

                self.machine.fail_fully()

                return


            # It's possible that we might raise an error while attempting to run due

            # to some problems e.g. Missing collector output files

            # We catch the error and exit with failed state so the CAF will stop

            try:

                B2INFO(f"Attempting to run algorithms for calibration {self.name}.")

                self.machine.run_algorithms()

            except MachineError as err:

                B2ERROR(str(err))

                self.machine.fail()


            # If we failed take us to the final fail state

            if self.machine.state == "algorithms_failed":

                self.machine.fail_fully()

                return


    def _poll_collector(self):

        """

        """

        while self.state == "running_collector":

            try:

                self.machine.complete()

            # ConditionError is thrown when the condtions for the transition have returned false, it's not serious.

            except ConditionError:

                try:

                    B2DEBUG(29, f"Checking if collector jobs for calibration {self.name} have failed.")

                    self.machine.fail()

                except ConditionError:

                    pass

            sleep(self.heartbeat)  # Sleep until we want to check again


    @property

    def state(self):

        """

        The current major state of the calibration in the database file. The machine may have a different state.

        """

        with CAFDB(self._db_path, read_only=True) as db:

            state = db.get_calibration_value(self.name, "state")

        return state


    @state.setter

    def state(self, state):

        """

        """

        B2DEBUG(29, f"Setting {self.name} to state {state}.")

        with CAFDB(self._db_path) as db:

            db.update_calibration_value(self.name, "state", str(state))

            if state in self.checkpoint_states:

                db.update_calibration_value(self.name, "checkpoint", str(state))

        B2DEBUG(29, f"{self.name} set to {state}.")


    @property

    def iteration(self):

        """

        Retrieves the current iteration number in the database file.


        Returns:

            int: The current iteration number

        """

        with CAFDB(self._db_path, read_only=True) as db:

            iteration = db.get_calibration_value(self.name, "iteration")

        return iteration


    @iteration.setter

    def iteration(self, iteration):

        """

        """

        B2DEBUG(29, f"Setting {self.name} to {iteration}.")

        with CAFDB(self._db_path) as db:

            db.update_calibration_value(self.name, "iteration", iteration)

        B2DEBUG(29, f"{self.name} set to {self.iteration}.")


class Algorithm():

    """

    Parameters:

        algorithm: The CalibrationAlgorithm instance that we want to execute.

    Keyword Arguments:

        data_input (types.FunctionType): An optional function that sets the input files of the algorithm.

        pre_algorithm (types.FunctionType): An optional function that runs just prior to execution of the algorithm.

            Useful for set up e.g. module initialisation


    This is a simple wrapper class around the C++ CalibrationAlgorithm class.

    It helps to add functionality to algorithms for use by the Calibration and CAF classes rather

    than separating the logic into those classes directly.


    This is **not** currently a class that a user should interact with much during `CAF`

    setup (unless you're doing something advanced).

    The `Calibration` class should be doing the most of the creation of the defaults for these objects.


    Setting the `data_input` function might be necessary if you have set the `Calibration.output_patterns`.

    Also, setting the `pre_algorithm` to a function that should execute prior to each `strategies.AlgorithmStrategy`

    is often useful i.e. by calling for the Geometry module to initialise.

    """


    def __init__(self, algorithm, data_input=None, pre_algorithm=None):

        """

        """


        self.algorithm = algorithm


        self.name = algorithm.__cppname__[algorithm.__cppname__.rfind('::') + 2:]


        self.data_input = data_input

        if not self.data_input:

            self.data_input = self.default_inputdata_setup


        self.pre_algorithm = pre_algorithm


        self.strategy = strategies.SingleIOV


        self.params = {}


    def default_inputdata_setup(self, input_file_paths):

        """

        Simple setup to set the input file names to the algorithm. Applied to the data_input attribute

        by default. This simply takes all files returned from the `Calibration.output_patterns` and filters

        for only the CollectorOutput.root files. Then it sets them as input files to the CalibrationAlgorithm class.

        """

        collector_output_files = list(filter(lambda file_path: "CollectorOutput.root" == Path(file_path).name,

                                             input_file_paths))

        info_lines = [f"Input files used in {self.name}:"]

        info_lines.extend(collector_output_files)

        B2INFO_MULTILINE(info_lines)

        self.algorithm.setInputFileNames(collector_output_files)


class CAF():

    """

    Parameters:

      calibration_defaults (dict): A dictionary of default options for calibrations run by this `CAF` instance e.g.


                                   >>> calibration_defaults={"max_iterations":2}


    This class holds `Calibration` objects and processes them. It defines the initial configuration/setup

    for the calibrations. But most of the real processing is done through the `caf.state_machines.CalibrationMachine`.


    The `CAF` class essentially does some initial setup, holds the `CalibrationBase` instances and calls the

    `CalibrationBase.start` when the dependencies are met.


    Much of the checking for consistency is done in this class so that no processing is done with an invalid

    setup. Choosing which files to use as input should be done from outside during the setup of the `CAF` and

    `CalibrationBase` instances.

    """


    _db_name = "caf_state.db"


    default_calibration_config = {

                                  "max_iterations": 5,

                                  "ignored_runs": []

                                 }


    def __init__(self, calibration_defaults=None):

        """

        """


        self.calibrations = {}


        self.future_dependencies = {}


        self.dependencies = {}


        self.output_dir = "calibration_results"


        self.order = None


        self._backend = None


        self.heartbeat = 5


        if not calibration_defaults:

            calibration_defaults = {}


        self.calibration_defaults = {**self.default_calibration_config, **calibration_defaults}


        self._db_path = None


    def add_calibration(self, calibration):

        """

        Adds a `Calibration` that is to be used in this program to the list.

        Also adds an empty dependency list to the overall dictionary.

        You should not directly alter a `Calibration` object after it has been

        added here.

        """

        if calibration.is_valid():

            if calibration.name not in self.calibrations:

                self.calibrations[calibration.name] = calibration

            else:

                B2WARNING(f"Tried to add a calibration with the name {calibration.name} twice.")

        else:

            B2WARNING((f"Tried to add incomplete/invalid calibration ({calibration.name}) to the framwork."

                       "It was not added and will not be part of the final process."))


    def _remove_missing_dependencies(self):

        """

        This checks the future and past dependencies of each `Calibration` in the `CAF`.

        If any dependencies are not known to the `CAF` then they are removed from the `Calibration`

        object directly.

        """

        calibration_names = [calibration.name for calibration in self.calibrations.values()]


        def is_dependency_in_caf(dependency):

            """

            Quick function to use with filter() and check dependencies against calibrations known to `CAF`

            """

            dependency_in_caf = dependency.name in calibration_names

            if not dependency_in_caf:

                B2WARNING(f"The calibration {dependency.name} is a required dependency but is not in the CAF."

                          " It has been removed as a dependency.")

            return dependency_in_caf


        # Check that there aren't dependencies on calibrations not added to the framework

        # Remove them from the calibration objects if there are.

        for calibration in self.calibrations.values():

            filtered_future_dependencies = list(filter(is_dependency_in_caf, calibration.future_dependencies))

            calibration.future_dependencies = filtered_future_dependencies


            filtered_dependencies = list(filter(is_dependency_in_caf, calibration.dependencies))

            calibration.dependencies = filtered_dependencies


    def _order_calibrations(self):

        """

        - Uses dependency atrributes of calibrations to create a dependency dictionary and passes it

        to a sorting algorithm.

        - Returns valid OrderedDict if sort was succesful, empty one if it failed (most likely a cyclic dependency)

        """

        # First remove any dependencies on calibrations not added to the CAF

        self._remove_missing_dependencies()

        # Filling dependencies dictionaries of CAF for sorting, only explicit dependencies for now

        # Note that they currently use the names not the calibration objects.

        for calibration in self.calibrations.values():

            future_dependencies_names = [dependency.name for dependency in calibration.future_dependencies]

            past_dependencies_names = [dependency.name for dependency in calibration.dependencies]


            self.future_dependencies[calibration.name] = future_dependencies_names

            self.dependencies[calibration.name] = past_dependencies_names

        # Gives us a list of A (not THE) valid ordering and checks for cyclic dependencies

        order = topological_sort(self.future_dependencies)

        if not order:

            return False


        # Get an ordered dictionary of the sort order but including all implicit dependencies.

        ordered_full_dependencies = all_dependencies(self.future_dependencies, order)


        # Return all the implicit+explicit past dependencies

        full_past_dependencies = past_from_future_dependencies(ordered_full_dependencies)

        # Correct each calibration's dependency list to reflect the implicit dependencies

        for calibration in self.calibrations.values():

            full_deps = full_past_dependencies[calibration.name]

            explicit_deps = [cal.name for cal in calibration.dependencies]

            for dep in full_deps:

                if dep not in explicit_deps:

                    calibration.dependencies.append(self.calibrations[dep])

            # At this point the calibrations have their full dependencies but they aren't in topological

            # sort order. Correct that here

            ordered_dependency_list = []

            for ordered_calibration_name in order:

                if ordered_calibration_name in [dep.name for dep in calibration.dependencies]:

                    ordered_dependency_list.append(self.calibrations[ordered_calibration_name])

            calibration.dependencies = ordered_dependency_list

        order = ordered_full_dependencies

        # We should also patch in all of the implicit dependencies for the calibrations

        return order


    def _check_backend(self):

        """

        Makes sure that the CAF has a valid backend setup. If one isn't set by the user (or if the

        one that is stored isn't a valid Backend object) we should create a default Local backend.

        """

        if not isinstance(self._backend, caf.backends.Backend):


            self.backend = caf.backends.Local()


    def _prune_invalid_collections(self):

        """

        Checks all current calibrations and removes any invalid Collections from their collections list.

        """

        B2INFO("Checking for any invalid Collections in Calibrations.")

        for calibration in self.calibrations.values():

            valid_collections = {}

            for name, collection in calibration.collections.items():

                if collection.is_valid():

                    valid_collections[name] = collection

                else:

                    B2WARNING(f"Removing invalid Collection '{name}' from Calibration '{calibration.name}'.")

            calibration.collections = valid_collections


    def run(self, iov=None):

        """

        Keyword Arguments:

            iov(`caf.utils.IoV`): IoV to calibrate for this processing run. Only the input files necessary to calibrate

                                  this IoV will be used in the collection step.


        This function runs the overall calibration job, saves the outputs to the output_dir directory,

        and creates database payloads.


        Upload of final databases is not done here. This simply creates the local databases in

        the output directory. You should check the validity of your new local database before uploading

        to the conditions DB via the basf2 tools/interface to the DB.

        """

        if not self.calibrations:

            B2FATAL("There were no Calibration objects to run. Maybe you tried to add invalid ones?")

        # Checks whether the dependencies we've added will give a valid order

        order = self._order_calibrations()

        if not order:

            B2FATAL("Couldn't order the calibrations properly. Could be a cyclic dependency.")


        # Check that a backend has been set and use default Local() one if not

        self._check_backend()


        self._prune_invalid_collections()


        # Creates the overall output directory and reset the attribute to use an absolute path to it.

        self.output_dir = self._make_output_dir()


        #  Creates a SQLite DB to save the status of the various calibrations

        self._make_database()


        # Enter the overall output dir during processing and opena  connection to the DB

        with temporary_workdir(self.output_dir):

            db = CAFDB(self._db_path)

            db.open()

            db_initial_calibrations = db.query("select * from calibrations").fetchall()

            for calibration in self.calibrations.values():

                # Apply defaults given to the `CAF` to the calibrations if they aren't set

                calibration._apply_calibration_defaults(self.calibration_defaults)

                calibration._db_path = self._db_path

                calibration.output_database_dir = Path(self.output_dir, calibration.name, "outputdb").as_posix()

                calibration.iov = iov

                if not calibration.backend:

                    calibration.backend = self.backend

                # Do some checking of the db to see if we need to add an entry for this calibration

                if calibration.name not in [db_cal[0] for db_cal in db_initial_calibrations]:

                    db.insert_calibration(calibration.name)

                    db.commit()

                else:

                    for cal_info in db_initial_calibrations:

                        if cal_info[0] == calibration.name:

                            cal_initial_state = cal_info[2]

                            cal_initial_iteration = cal_info[3]

                    B2INFO(f"Previous entry in database found for {calibration.name}.")

                    B2INFO(f"Setting {calibration.name} state to checkpoint state '{cal_initial_state}'.")

                    calibration.state = cal_initial_state

                    B2INFO(f"Setting {calibration.name} iteration to '{cal_initial_iteration}'.")

                    calibration.iteration = cal_initial_iteration

                # Daemonize so that it exits if the main program exits

                calibration.daemon = True


            db.close()


            # Is it possible to keep going?

            keep_running = True

            while keep_running:

                keep_running = False

                # Do we have calibrations that may yet complete?

                remaining_calibrations = []


                for calibration in self.calibrations.values():

                    # Find the currently ended calibrations (may not be joined yet)

                    if (calibration.state == CalibrationBase.end_state or calibration.state == CalibrationBase.fail_state):

                        # Search for any alive Calibrations and join them

                        if calibration.is_alive():

                            B2DEBUG(29, f"Joining {calibration.name}.")

                            calibration.join()

                    else:

                        if calibration.dependencies_met():

                            if not calibration.is_alive():

                                B2DEBUG(29, f"Starting {calibration.name}.")

                                try:

                                    calibration.start()

                                except RuntimeError:

                                    # Catch the case when the calibration just finished so it ended up here

                                    # in the "else" and not above where it should have been joined.

                                    B2DEBUG(29, f"{calibration.name} probably just finished, join it later.")

                            remaining_calibrations.append(calibration)

                        else:

                            if not calibration.failed_dependencies():

                                remaining_calibrations.append(calibration)

                if remaining_calibrations:

                    keep_running = True

                    # Loop over jobs that the calibrations want submitted and submit them.

                    # We do this here because some backends don't like us submitting in parallel from multiple CalibrationThreads

                    # So this is like a mini job queue without getting too clever with it

                    for calibration in remaining_calibrations:

                        for job in calibration.jobs_to_submit[:]:

                            calibration.backend.submit(job)

                            calibration.jobs_to_submit.remove(job)

                sleep(self.heartbeat)


            B2INFO("Printing summary of final CAF status.")

            with CAFDB(self._db_path, read_only=True) as db:

                print(db.output_calibration_table())


    @property

    def backend(self):

        """

        The `backend <backends.Backend>` that runs the collector job.

        When set, this is checked that a `backends.Backend` class instance was passed in.

        """

        return self._backend


    @backend.setter

    def backend(self, backend):

        """

        """

        if isinstance(backend, caf.backends.Backend):

            self._backend = backend

        else:

            B2ERROR('Backend property must inherit from Backend class.')


    def _make_output_dir(self):

        """

        Creates the output directory. If it already exists we are now going to try and restart the program from the last state.


        Returns:

            str: The absolute path of the new output_dir

        """

        p = Path(self.output_dir).resolve()

        if p.is_dir():

            B2INFO(f"{p.as_posix()} output directory already exists. "

                   "We will try to restart from the previous finishing state.")

            return p.as_posix()

        else:

            p.mkdir(parents=True)

            if p.is_dir():

                return p.as_posix()

            else:

                raise FileNotFoundError(f"Attempted to create output_dir {p.as_posix()}, but it didn't work.")


    def _make_database(self):

        """

        Creates the CAF status database. If it already exists we don't overwrite it.

        """

        self._db_path = Path(self.output_dir, self._db_name).absolute()

        if self._db_path.exists():

            B2INFO(f"Previous CAF database found {self._db_path}")

        # Will create a new database + tables, or do nothing but checks we can connect to existing one

        with CAFDB(self._db_path) as db:

            pass