Source code for grafei.model.create_trainer

##########################################################################
# basf2 (Belle II Analysis Software Framework)                           #
# Author: The Belle II Collaboration                                     #
#                                                                        #
# See git log for contributors and copyright holders.                    #
# This file is licensed under LGPL-3.0, see LICENSE.md.                  #
##########################################################################


import ignite
import torch
from torch_geometric.data import Batch
import numpy as np
import collections.abc
from datetime import datetime
from pathlib import Path
import yaml
from .metrics import PerfectLCA, PerfectEvent, PerfectMasses



[docs]
class GraFEIIgniteTrainer:
    """
    Class to setup the ignite trainer and hold all the things associated.

    :param model: The actual PyTorch model.
    :type model: `Model <https://pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html>`_
    :param optimizer: Optimizer used in training.
    :type optimizer: `Optimizer <https://pytorch.org/docs/stable/optim.html#torch.optim.Optimizer>`_
    :param loss_fn: Loss function.
    :type loss_fn: `Loss <https://pytorch.org/docs/stable/nn.html#loss-functions>`_
    :param device: Device to use.
    :type device: `Device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
    :param configs: Dictionary of run configs from loaded yaml config file.
    :type configs: dict
    :param tags: Various tags to sort train and validation evaluators by, e.g. "Training", "Validation".
    :type tags: list
    :param scheduler: Learning rate scheduler.
    :type scheduler: `Scheduler <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_
    :param ignore_index: Label index to ignore when calculating metrics, e.g. padding.
    :type ignore_index: int
    """

    def __init__(
        self,
        model,
        optimizer,
        loss_fn,
        device,
        configs,
        tags,
        scheduler=None,
        ignore_index=-1.0,
    ):
        """
        Initialization.
        """
        #: Model
        self.model = model
        #: Optimizer
        self.optimizer = optimizer
        #: Configs
        self.configs = configs
        #: Tags
        self.tags = tags
        #: Index to ignore
        self.ignore_index = ignore_index
        #: CPU or GPU
        self.device = device

        #: Run timestamp to distinguish trainings
        self.timestamp = datetime.now().strftime("%Y.%m.%d_%H.%M")

        #: Output directory for checkpoints
        self.run_dir = None
        if self.configs["output"] is not None:
            if ("path" in self.configs["output"].keys()) and (
                self.configs["output"]["path"] is not None
            ):
                self.run_dir = Path(
                    self.configs["output"]["path"],
                    self.configs["output"]["run_name"],
                )

        # Setup ignite trainer
        use_amp = configs["train"]["mixed_precision"] and self.device == torch.device(
            "cuda"
        )

        if use_amp:
            from torch.cuda.amp import autocast
            from torch.cuda.amp import GradScaler

            scaler = GradScaler(enabled=True)

        def _update_model(engine, batch):
            # This just sets the training mode
            model.train()

            optimizer.zero_grad()

            batch = (
                Batch.from_data_list(batch).to(device)
                if isinstance(batch, list)
                else batch.to(device)
            )

            x_y, edge_y, u_y = batch.x_y, batch.edge_y, batch.u_y

            if use_amp:
                with autocast(enabled=True):
                    x_pred, e_pred, u_pred = model(batch)
                    loss = loss_fn(x_pred, x_y, e_pred, edge_y, u_pred, u_y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                x_pred, e_pred, u_pred = model(batch)
                loss = loss_fn(x_pred, x_y, e_pred, edge_y, u_pred, u_y)
                loss.backward()
                optimizer.step()

            return loss.item()

        #: Ignite trainer
        self.trainer = ignite.engine.Engine(_update_model)

        if scheduler:
            ig_scheduler = ignite.handlers.param_scheduler.LRScheduler(scheduler)
            self.trainer.add_event_handler(ignite.engine.Events.ITERATION_STARTED, ig_scheduler)

        #: Setup train and validation evaluators
        self.evaluators = {}

        for tag in self.tags:
            # Setup metrics
            metrics = {
                # ignite.metrics.Loss takes (y_pred, y, **kwargs) arguments.
                # MultiTrainLoss needs in total 6 arguments to be computed,
                # so the additional ones are passed in a dictionary.
                "loss": ignite.metrics.Loss(
                    loss_fn,
                    output_transform=lambda x: [
                        x[0], x[3],
                        {
                            "edge_input": x[1],
                            "edge_target": x[4],
                            "u_input": x[2],
                            "u_target": x[5],
                        },
                    ],
                    device=device,
                ),
                "perfectLCA": PerfectLCA(
                    ignore_index=ignore_index,
                    device=device,
                    output_transform=lambda x: [
                        x[1], x[4], x[6], x[5], x[7], x[8],
                    ],
                ),
                "perfectMasses": PerfectMasses(
                    ignore_index=ignore_index,
                    device=device,
                    output_transform=lambda x: [x[0], x[3], x[5], x[7], x[8]],
                ),
                "perfectEvent": PerfectEvent(
                    ignore_index=ignore_index,
                    device=device,
                    output_transform=lambda x: [
                        x[0], x[3], x[1], x[4], x[6], x[5], x[7], x[8],
                    ],
                ),
            }

            def _predict_on_batch(engine, batch):
                model.eval()  # It just enables evaluation mode

                batch = (
                    Batch.from_data_list(batch).to(device)
                    if isinstance(batch, list)
                    else batch.to(device)
                )

                x_y, edge_y, u_y, edge_index, torch_batch = (
                    batch.x_y,
                    batch.edge_y,
                    batch.u_y,
                    batch.edge_index,
                    batch.batch,
                )
                num_graph = batch.batch[torch_batch.shape[0] - 1] + 1

                with torch.no_grad():
                    if use_amp:
                        with autocast(enabled=True):
                            x_pred, e_pred, u_pred = model(batch)
                    else:
                        x_pred, e_pred, u_pred = model(batch)

                return (
                    x_pred,
                    e_pred,
                    u_pred,
                    x_y,
                    edge_y,
                    u_y,
                    edge_index,
                    torch_batch,
                    num_graph,
                )

            self.evaluators[tag] = ignite.engine.Engine(_predict_on_batch)

            for metric_name, metric in zip(metrics.keys(), metrics.values()):
                metric.attach(self.evaluators[tag], metric_name)

    def _score_fn(self, engine):
        """Metric to use for early stoppging"""
        return -engine.state.metrics["loss"]

    def _perfect_score_fn(self, engine):
        """Metric to use for checkpoints"""
        return engine.state.metrics["perfectEvent"]

    def _clean_config_dict(self, configs):
        """
        Clean configs to prepare them for writing to file.
        """
        for k, v in configs.items():
            if isinstance(v, collections.abc.Mapping):
                configs[k] = self._clean_config_dict(configs[k])
            elif isinstance(v, np.ndarray):
                configs[k] = v.tolist()
            else:
                configs[k] = v
        return configs

    def setup_handlers(self, cfg_filename="config.yaml"):
        """
        Creates the various ignite handlers (callbacks).

        Args:
            cfg_filename (str): Name of config yaml file to use when saving configs.
        """
        # Create the output directory
        if self.run_dir is not None:
            self.run_dir.mkdir(parents=True, exist_ok=True)
            # And save the configs, putting here to only save when setting up checkpointing
            with open(
                self.run_dir / f"{self.timestamp}_{cfg_filename}", "w"
            ) as outfile:
                cleaned_configs = self._clean_config_dict(self.configs)
                yaml.dump(cleaned_configs, outfile, default_flow_style=False)

        # Setup early stopping
        early_handler = ignite.handlers.EarlyStopping(
            patience=self.configs["train"]["early_stop_patience"],
            score_function=self._score_fn,
            trainer=self.trainer,
            min_delta=1e-3,
        )
        self.evaluators["Validation"].add_event_handler(
            ignite.engine.Events.EPOCH_COMPLETED, early_handler
        )

        # Configure saving the best performing model
        if self.run_dir is not None:
            to_save = {
                "model": self.model,
                "optimizer": self.optimizer,
                "trainer": self.trainer,
            }
            # Note that we judge early stopping above by the validation loss, but save the best model
            # according to validation perfectEvent score. This lets training continue for perfectEvent plateaus
            # so long as the model is still changing (and hopefully improving again after some time).
            best_model_handler = ignite.handlers.Checkpoint(
                to_save=to_save,
                save_handler=ignite.handlers.DiskSaver(
                    self.run_dir, create_dir=True, require_empty=False
                ),
                filename_prefix=self.timestamp,
                score_function=self._perfect_score_fn,
                score_name="validation_perfectEvent",
                n_saved=1,
                global_step_transform=ignite.handlers.global_step_from_engine(
                    self.evaluators["Validation"]
                ),
            )
            self.evaluators["Validation"].add_event_handler(
                ignite.engine.Events.EPOCH_COMPLETED, best_model_handler
            )

        return

    # Set up end of epoch validation procedure
    # Tell it to print epoch results
    def log_results(self, trainer, mode_tags):
        """
        Callback to run evaluation and report the results.

        :param trainer: Trainer passed by ignite to this method.
        :type trainer: `Engine <https://pytorch.org/ignite/generated/ignite.engine.engine.Engine.html#ignite.engine.engine.Engine>`_
        :param mode_tags: Dictionary of mode tags containing (mode, dataset, dataloader) tuples.
        :type mode_tags: dict
        """

        for tag, values in mode_tags.items():
            evaluator = self.evaluators[tag]

            # Need to wrap this in autocast since it calculates metrics (i.e. loss) without autocast switched on
            # This is mostly fine except it fails to correctly cast the class weights tensor passed to the loss
            if self.configs["train"]["mixed_precision"] and self.device == torch.device("cuda"):
                with torch.cuda.amp.autocast():
                    evaluator.run(values[2], epoch_length=None)
            else:
                evaluator.run(values[2], epoch_length=None)

            metrics = evaluator.state.metrics
            message = [f"{tag} Results - Epoch: {trainer.state.epoch}"]
            message.extend([f"Avg {m}: {metrics[m]:.4f}" for m in metrics])
            print(message)