# Belle II Software  light-2403-persian
# geometric_datasets.py
import itertools
from pathlib import Path

import numpy as np
import torch
import uproot
from torch_geometric.data import Data, InMemoryDataset

from .dataset_utils import populate_avail_samples, preload_root_data
from .edge_features import compute_edge_features
from .normalize_features import normalize_features
from .tree_utils import masses_to_classes


def _preload(self):
    """
    Loads the ROOT data into memory, builds the list of available samples
    and returns their number.
    """
    self.x_files = sorted(self.root.glob("**/*.root"))

    # Select the first N files (useful for testing)
    if self.n_files is not None:
        if self.n_files > len(self.x_files):
            print(
                f"WARNING: n_files specified ({self.n_files}) is greater than files in path given ({len(self.x_files)})"
            )

        self.x_files = self.x_files[: self.n_files]

    if len(self.x_files) == 0:
        raise RuntimeError(f"No files found in {self.root}")

    # Inspect the first file to find the available node features
    with uproot.open(self.x_files[0])["Tree"] as t:
        self.features = [f for f in t.keys() if f.startswith("feat_")]
        self.B_reco = int(t["isB"].array(library="np")[0])
        assert self.B_reco in [0, 1, 2], "B_reco should be 0, 1 or 2, something went wrong"

    # Split the available node features into requested and discarded
    self.discarded = [
        f for f in self.features if f[f.find("_") + 1:] not in self.node_features
    ]
    self.features = [
        f"feat_{f}" for f in self.node_features if f"feat_{f}" in self.features
    ]
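    # Illustrative example (hypothetical branch names): if the file contains
    # feat_px, feat_py and feat_dz while node_features = ["px", "py"], then
    # self.features == ["feat_px", "feat_py"] and self.discarded == ["feat_dz"]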

    print(f"Input node features: {self.features}")
    print(f"Discarded node features: {self.discarded}")

    # Prepend the branch prefixes to the edge and global feature names
    self.edge_features = [f"edge_{f}" for f in self.edge_features]
    self.global_features = [f"glob_{f}" for f in self.global_features] if self.global_features else []
    print(f"Input edge features: {self.edge_features}")
    print(f"Input global features: {self.global_features}")

    self.x, self.y = preload_root_data(
        self.x_files,
        self.features,
        self.discarded,
    )

    self.avail_samples = populate_avail_samples(
        self.x,
        self.y,
        self.B_reco,
    )
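    # Each entry of avail_samples is a (file_id, evt, p_index) tuple that
    # identifies one decay tree to be turned into a graph by _process_graph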

    # Select a random subset of the available samples if requested
    if self.samples and self.samples < len(self.avail_samples):
        print(f"Selecting random subset of {self.samples} samples")
        self.avail_samples = [
            self.avail_samples[i]
            for i in np.random.choice(
                len(self.avail_samples), self.samples, replace=False
            )
        ]
    elif self.samples and (self.samples >= len(self.avail_samples)):
        print(
            f"WARNING: No. samples specified ({self.samples}) exceeds number of samples loaded ({len(self.avail_samples)})"
        )

    return len(self.avail_samples)


def _process_graph(self, idx):
    """
    Actually builds the graph object.

    Args:
        idx (int): Index of training example to be processed.

    Returns:
        torch_geometric.data.Data: Graph object to be used in training.
    """
    file_id, evt, p_index = self.avail_samples[idx]

    x_item = self.x[file_id]
    y_item = self.y[file_id][p_index]

    evt_b_index = x_item["b_index"][evt]
    evt_leaves = x_item["leaves"][evt]
    evt_primary = x_item["primary"][evt]

    y_leaves = y_item["LCA_leaves"][evt]
    # Use this to correctly reshape the LCA (might be able to just use the shape of y_leaves?)
    n_LCA = y_item["n_LCA"][evt]

    # Get the rows of the X inputs to fetch
    # This is a boolean numpy array
    x_rows = (evt_b_index != -1) if not self.B_reco else evt_b_index == int(p_index)

    # Find the unmatched particles
    unmatched_rows = evt_b_index == -1

    if np.any(unmatched_rows) and self.B_reco:
        # Create a random boolean array the same size as the number of leaves
        rand_mask = np.random.choice(a=[False, True], size=unmatched_rows.size)
        # AND the mask with the unmatched leaves
        # This selects a random subset of the unmatched leaves
        unmatched_rows = np.logical_and(unmatched_rows, rand_mask)

        # Add the selected unmatched rows to the current decay's rows
        x_rows = np.logical_or(x_rows, unmatched_rows)
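        # Illustrative draw: with unmatched_rows = [F, T, F, T], a sampled
        # rand_mask = [T, T, F, F] keeps only the second particle, so on
        # average half of the unmatched particles are attached to this decay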

    # Here we actually load the data

    # Initialise the event's X array
    x = np.empty((x_rows.sum(), len(self.features)))
    x_dis = np.empty((x_rows.sum(), len(self.discarded)))

    # And populate it
    for i, feat in enumerate(self.features):
        x[:, i] = x_item["features"][feat][evt][x_rows]
    for i, feat in enumerate(self.discarded):
        x_dis[:, i] = x_item["discarded"][feat][evt][x_rows]

    # Same for the edge and global features
    x_edges = (
        compute_edge_features(
            self.edge_features,
            self.features + self.discarded,
            np.concatenate([x, x_dis], axis=1),
        )
        if self.edge_features != []
        else []
    )
    x_global = (
        np.array(
            [
                [
                    x_item["global"][feat + f"_{p_index}"][evt]
                    for feat in self.global_features
                ]
            ]
        )
        if self.global_features != []
        else []
    )

    x_leaves = evt_leaves[x_rows]

    # Set NaNs to zero; this is a surrogate value and may change in future
    np.nan_to_num(x, copy=False)
    np.nan_to_num(x_edges, copy=False)
    np.nan_to_num(x_global, copy=False)

    # Normalize any features that should be normalized
    if self.normalize is not None:
        normalize_features(
            self.normalize,
            self.features,
            x,
            self.edge_features,
            x_edges,
            self.global_features,
            x_global,
        )

    # Reorder the LCA

    # Get the LCA indices in the order that the leaves appear in the reconstructed particles
    # Secondaries aren't in the LCA leaves list, so they get a 0
    locs = np.array(
        [
            np.where(y_leaves == i)[0].item() if (i in y_leaves) else 0
            for i in x_leaves
        ]
    )
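    # Illustrative values only: if y_leaves = [11, 13, 17] and
    # x_leaves = [13, 11, 42], then locs = [1, 0, 0]; the secondary leaf 42
    # gets a dummy index 0 and its rows/columns are zeroed out below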

    # Get the LCA in the correct subset order
    # If we're not allowing secondaries this is all we need
    # If we are, it will contain duplicates (since secondary locs are set to 0)
    # We can't load the first locs directly (i.e. y_item[locs, :]) because locs is (intentionally) unsorted
    y_edge = y_item["LCA"][evt].reshape((n_LCA, n_LCA)).astype(int)
    # Get the true mcPDG of FSPs
    y_mass = masses_to_classes(x_item["mc_pdg"][evt][x_rows])

    # Get the specified rows/cols, this inserts dummy rows/cols for secondaries
    y_edge = y_edge[locs, :][:, locs]
    # Set everything that's not primary (unmatched and secondaries) rows/cols to 0
    # Note we only consider the subset of leaves that made it into x_rows
    y_edge = np.where(evt_primary[x_rows], y_edge, 0)  # Set the rows
    y_edge = np.where(evt_primary[x_rows][:, None], y_edge, 0)  # Set the columns

    # Set the diagonal to -1 (not strictly necessary, but explicit)
    np.fill_diagonal(y_edge, -1)

    n_nodes = x.shape[0]

    # Target edge tensor: shape [E]
    edge_y = torch.tensor(
        y_edge[np.eye(n_nodes) == 0],
        dtype=torch.long,
    )
    # Fill the tensor with edge indices: shape [N*(N-1), 2] == [E, 2]
    edge_index = torch.tensor(
        list(itertools.permutations(range(n_nodes), 2)),
        dtype=torch.long,
    )
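    # The two orderings agree: for n_nodes = 3 the off-diagonal mask flattens
    # row-major to entries (0,1), (0,2), (1,0), (1,2), (2,0), (2,1), which is
    # exactly the order produced by itertools.permutations(range(3), 2)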

    # Target global tensor: shape [B, F_g]
    u_y = torch.tensor(
        [[1]], dtype=torch.float
    )

    # Target node tensor: shape [N]
    x_y = torch.tensor(y_mass, dtype=torch.long)

    g = Data(
        x=torch.tensor(x, dtype=torch.float),
        edge_index=edge_index.t().contiguous(),
        edge_attr=torch.tensor(x_edges, dtype=torch.float),
        u=torch.tensor(x_global, dtype=torch.float),
        x_y=x_y,
        edge_y=edge_y,
        u_y=u_y,
    )

    return g


class GraphDataSet(InMemoryDataset):
    """
    Dataset handler for converting Belle II data to a PyTorch Geometric InMemoryDataset.

    The ROOT format expects the tree in every file to be named ``Tree``,
    and all node features to have the format ``feat_FEATNAME``.

    .. note:: This expects the files under root to have the structure ``root/**/<file_name>.root``
        where the root path is different for train and val.
        The ``**/`` is to handle subdirectories, e.g. ``sub00``.

    Args:
        root (str): Path to ROOT files.
        n_files (int): Load only ``n_files`` files.
        samples (int): Load only ``samples`` events.
        features (list): List of node feature names.
        edge_features (list): List of edge feature names.
        global_features (list): List of global feature names.
        normalize (bool): Whether to normalize input features.
    """

    def __init__(
        self,
        root,
        n_files=None,
        samples=None,
        features=[],
        edge_features=[],
        global_features=[],
        normalize=None,
        **kwargs,
    ):
        """
        Initialization.
        """
        assert isinstance(
            features, list
        ), f'Argument "features" must be a list and not {type(features)}'
        assert len(features) > 0, "You need to use at least one node feature"

        self.root = Path(root)
        self.normalize = normalize
        self.n_files = n_files
        self.node_features = features
        self.edge_features = edge_features
        self.global_features = global_features
        self.samples = samples

        # Delete previously processed files, in case they are stale
        file_path = Path(self.root, "processed")
        files = list(file_path.glob("*.pt"))
        for f in files:
            f.unlink(missing_ok=True)

        # Needs to be called after having assigned all attributes
        super().__init__(root, None, None, None)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        """
        Name of the processed file.
        """
        return ["processed_data.pt"]

    def process(self):
        """
        Processes the data to create graph objects and stores them in ``root/processed/processed_data.pt``,
        where the root path is different for train and val.

        Called internally by PyTorch Geometric.
        """
        num_samples = _preload(self)
        data_list = [_process_graph(self, i) for i in range(num_samples)]
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

        del self.x, self.y, self.avail_samples, data_list, data, slices
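

if __name__ == "__main__":
    # Minimal usage sketch. The path and feature names below are hypothetical
    # placeholders: point root at a directory of ROOT files containing a tree
    # named "Tree" with feat_* (and optionally edge_*/glob_*) branches
    dataset = GraphDataSet(
        root="data/train",
        n_files=1,
        features=["px", "py", "pz"],
        edge_features=[],
        global_features=[],
    )
    print(f"Loaded {len(dataset)} graphs")
    print(dataset[0])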