# geometric_datasets.py
import itertools
from pathlib import Path
import numpy as np
import torch
from .tree_utils import masses_to_classes
from .dataset_utils import populate_avail_samples, preload_root_data
from .edge_features import compute_edge_features
from .normalize_features import normalize_features
from torch_geometric.data import Data, InMemoryDataset
import uproot


def _preload(self):
23 """
24 Creates graph objects and stores them into a python list.
25 """
26
27
28 self.x_files = sorted(self.root.glob("**/*.root"))
29
30 # Select the first N files (useful for testing)
31 if self.n_files is not None:
32 if self.n_files > len(self.x_files):
33 print(
34 f"WARNING: n_files specified ({self.n_files}) is greater than files in path given ({len(self.x_files)})"
35 )
36
37 self.x_files = self.x_files[: self.n_files]
38
39 if len(self.x_files) == 0:
40 raise RuntimeError(f"No files found in {self.root}")

    # Save the features
    with uproot.open(self.x_files[0])["Tree"] as t:
        self.features = [f for f in t.keys() if f.startswith("feat_")]
        self.B_reco = int(t["isB"].array(library="np")[0])
        assert self.B_reco in [0, 1, 2], "B_reco should be 0, 1 or 2, something went wrong"

    self.discarded = [
        f for f in self.features if f[f.find("_") + 1:] not in self.node_features
    ]
    self.features = [
        f"feat_{f}" for f in self.node_features if f"feat_{f}" in self.features
    ]
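    # For example (hypothetical values): if the file provides feat_p, feat_pt
    # and feat_E, and node_features is ["p", "pt"], then discarded becomes
    # ["feat_E"] and features becomes ["feat_p", "feat_pt"].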

    print(f"Input node features: {self.features}")
    print(f"Discarded node features: {self.discarded}")

    self.edge_features = [f"edge_{f}" for f in self.edge_features]
    self.global_features = (
        [f"glob_{f}" for f in self.global_features] if self.global_features else []
    )
    print(f"Input edge features: {self.edge_features}")
    print(f"Input global features: {self.global_features}")

    self.x, self.y = preload_root_data(
        self.x_files,
        self.features,
        self.discarded,
    )

    self.avail_samples = populate_avail_samples(
        self.x,
        self.y,
        self.B_reco,
    )

    # Select a subset of available samples if requested
    if self.samples and self.samples < len(self.avail_samples):
        print(f"Selecting random subset of {self.samples} samples")
        self.avail_samples = [
            self.avail_samples[i]
            for i in np.random.choice(
                len(self.avail_samples), self.samples, replace=False
            )
        ]
    elif self.samples and (self.samples >= len(self.avail_samples)):
        print(
            f"WARNING: No. samples specified ({self.samples}) exceeds number of samples loaded ({len(self.avail_samples)})"
        )

    return len(self.avail_samples)


def _process_graph(self, idx):
    """
    Actually builds the graph object.

    Args:
        idx (int): Index of training example to be processed.

    Returns:
        torch_geometric.data.Data: Graph object to be used in training.
    """
    file_id, evt, p_index = self.avail_samples[idx]

    x_item = self.x[file_id]
    y_item = self.y[file_id][p_index]

    evt_b_index = x_item["b_index"][evt]
    evt_leaves = x_item["leaves"][evt]
    evt_primary = x_item["primary"][evt]

    y_leaves = y_item["LCA_leaves"][evt]
    # Use this to correctly reshape LCA (might be able to just use shape of y_leaves?)
    n_LCA = y_item["n_LCA"][evt]

    # Get the rows of the X inputs to fetch
    # This is a boolean numpy array
    x_rows = (evt_b_index != -1) if not self.B_reco else evt_b_index == int(p_index)
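    # For example (hypothetical values): with evt_b_index = [0, 1, -1, 0] and
    # p_index = 0 in B_reco mode, x_rows = [True, False, False, True].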

    # Find the unmatched particles
    unmatched_rows = evt_b_index == -1

    if np.any(unmatched_rows) and self.B_reco:
        # Create a random boolean array the same size as the number of leaves
        rand_mask = np.random.choice(a=[False, True], size=unmatched_rows.size)
        # AND the mask with the unmatched leaves
        # This selects a random subset of the unmatched leaves
        unmatched_rows = np.logical_and(unmatched_rows, rand_mask)

    # Add the unmatched rows to the current decay's rows
    x_rows = np.logical_or(x_rows, unmatched_rows)
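    # For example (hypothetical values): unmatched_rows = [False, False, True, True]
    # with rand_mask = [True, False, False, True] keeps only the last unmatched
    # row, so x_rows gains one extra (unmatched) particle.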

    # Here we actually load the data

    # Initialise event's X array
    x = np.empty((x_rows.sum(), len(self.features)))
    x_dis = np.empty((x_rows.sum(), len(self.discarded)))

    # And populate it
    for i, feat in enumerate(self.features):
        x[:, i] = x_item["features"][feat][evt][x_rows]
    for i, feat in enumerate(self.discarded):
        x_dis[:, i] = x_item["discarded"][feat][evt][x_rows]

    # Same for edge and global features
    x_edges = (
        compute_edge_features(
            self.edge_features,
            self.features + self.discarded,
            np.concatenate([x, x_dis], axis=1),
        )
        if self.edge_features != []
        else []
    )
    x_global = (
        np.array(
            [
                [
                    x_item["global"][feat + f"_{p_index}"][evt]
                    for feat in self.global_features
                ]
            ]
        )
        if self.global_features != []
        else []
    )

    x_leaves = evt_leaves[x_rows]

    # Set NaNs to zero, this is a surrogate value, may change in future
    np.nan_to_num(x, copy=False)
    np.nan_to_num(x_edges, copy=False)
    np.nan_to_num(x_global, copy=False)

    # Normalize any features that should be
    if self.normalize is not None:
        normalize_features(
            self.normalize,
            self.features,
            x,
            self.edge_features,
            x_edges,
            self.global_features,
            x_global,
        )

    # Reorder LCA

    # Get LCA indices in order that the leaves appear in reconstructed particles
    # Secondaries aren't in the LCA leaves list so they get a 0
    locs = np.array(
        [
            np.where(y_leaves == i)[0].item() if (i in y_leaves) else 0
            for i in x_leaves
        ]
    )
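    # For example (hypothetical values): with y_leaves = [11, 22, 33] and
    # x_leaves = [22, 11, 99], locs = [1, 0, 0]; the secondary leaf 99 gets
    # the dummy index 0.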

    # Get the LCA in the correct subset order
    # If we're not allowing secondaries this is all we need
    # If we are, this will contain duplicates (since secondary locs are set to 0)
    # We can't load the first locs directly (i.e. y_item[locs, :]) because locs is (intentionally) unsorted
    y_edge = y_item["LCA"][evt].reshape((n_LCA, n_LCA)).astype(int)
    # Get the true mcPDG of FSPs
    y_mass = masses_to_classes(x_item["mc_pdg"][evt][x_rows])

    # Get the specified rows/cols, this inserts dummy rows/cols for secondaries
    y_edge = y_edge[locs, :][:, locs]
    # Set everything that's not primary (unmatched and secondaries) rows/cols to 0
    # Note we only consider the subset of leaves that made it into x_rows
    y_edge = np.where(evt_primary[x_rows], y_edge, 0)  # Set the rows
    y_edge = np.where(evt_primary[x_rows][:, None], y_edge, 0)  # Set the columns

    # Set diagonal to -1 (actually not necessary but ok...)
    np.fill_diagonal(y_edge, -1)

    n_nodes = x.shape[0]

    # Target edge tensor: shape [E]
    edge_y = torch.tensor(
        y_edge[np.eye(n_nodes) == 0],
        dtype=torch.long,
    )
    # Fill tensor with edge indices: shape [N*(N-1), 2] == [E, 2]
    edge_index = torch.tensor(
        list(itertools.permutations(range(n_nodes), 2)),
        dtype=torch.long,
    )
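    # For example, with n_nodes = 3 the permutations give the directed edges
    # [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)], which matches the
    # row-major order in which y_edge's off-diagonal entries are flattened
    # above, so edge_y[k] labels edge_index[k].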

    # Target global tensor: shape [B, F_g]
    u_y = torch.tensor([[1]], dtype=torch.float)

    # Target node tensor: shape [N]
    x_y = torch.tensor(y_mass, dtype=torch.long)

    g = Data(
        x=torch.tensor(x, dtype=torch.float),
        edge_index=edge_index.t().contiguous(),
        edge_attr=torch.tensor(x_edges, dtype=torch.float),
        u=torch.tensor(x_global, dtype=torch.float),
        x_y=x_y,
        edge_y=edge_y,
        u_y=u_y,
    )

    return g


class GraphDataSet(InMemoryDataset):
    """
    Dataset handler for converting Belle II data to a PyTorch Geometric InMemoryDataset.

    The ROOT format expects the tree in every file to be named ``Tree``,
    and all node features to have the format ``feat_FEATNAME``.

    .. note:: This expects the files under root to have the structure ``root/**/<file_name>.root``
        where the root path is different for train and val.
        The ``**/`` is to handle subdirectories, e.g. ``sub00``.

    Args:
        root (str): Path to ROOT files.
        n_files (int): Load only ``n_files`` files.
        samples (int): Load only ``samples`` events.
        features (list): List of node feature names.
        edge_features (list): List of edge feature names.
        global_features (list): List of global feature names.
        normalize (bool): Whether to normalize input features.
    """

    def __init__(
        self,
        root,
        n_files=None,
        samples=None,
        features=[],
        edge_features=[],
        global_features=[],
        normalize=None,
        **kwargs,
    ):
        """
        Initialization.
        """
        assert isinstance(
            features, list
        ), f'Argument "features" must be a list and not {type(features)}'
        assert len(features) > 0, "You need to use at least one node feature"

        self.root = Path(root)
        self.normalize = normalize
        self.n_files = n_files
        self.node_features = features
        self.edge_features = edge_features
        self.global_features = global_features
        self.samples = samples

        # Delete processed files, in case
        file_path = Path(self.root, "processed")
        files = list(file_path.glob("*.pt"))
        for f in files:
            f.unlink(missing_ok=True)

        # Needs to be called after having assigned all attributes
        super().__init__(root, None, None, None)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        """
        Name of processed file.
        """
        return ["processed_data.pt"]

    def process(self):
        """
        Processes the data to create graph objects and stores them in ``root/processed/processed_data.pt``,
        where the root path is different for train and val.

        Called internally by PyTorch Geometric.
        """
        num_samples = _preload(self)
        data_list = [_process_graph(self, i) for i in range(num_samples)]
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

        del self.x, self.y, self.avail_samples, data_list, data, slices
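
# Minimal usage sketch (hypothetical path and feature names, assuming ROOT
# files that follow the feat_/edge_/glob_ naming convention described above):
#
#     dataset = GraphDataSet(
#         root="datasets/train",
#         n_files=2,
#         features=["px", "py", "pz"],
#         edge_features=["delta_p"],
#     )
#     graph = dataset[0]  # a torch_geometric.data.Data instance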