Belle II Software development
priorDataLoaderAndModel.py
1#!/usr/bin/env python3
2
3
10
11from torch.nn import Linear
12from torch.nn import ReLU
13from torch.nn import BatchNorm1d
14from torch.nn.init import kaiming_uniform_
15from torch.nn.init import xavier_uniform_
16from torch.nn import Softmax
17import torch
18from torch.utils.data import Dataset
19from torch.utils.data import random_split
20from sklearn.preprocessing import LabelEncoder
21from sklearn.preprocessing import PolynomialFeatures
22import uproot3 as ur
23import numpy as np
24
25
class PriorDataLoader(Dataset):
    """
    Dataset for PID prior probability training.

    Attributes:
        x (np.array): float32 feature array built from cos(theta), momentum and the
            derived transverse momentum, expanded to all polynomial terms up to
            second order (9 columns, no bias term).
        y (np.array): int64 array containing the label encoded absolute PDG values.

    """

    def __init__(self, path: str, key: str, particlelist: list, labels: list):
        """
        Initialize the dataset for PID prior training.

        Parameters:
            path (str): Path to the root file containing the data.
            key (str): Key (i.e. path) of the tree within the root file.
            particlelist (list(int)): List of particle PDG values for which the model has to be trained.
            labels (list(str)): Labels of pandas columns containing cos(theta), momentum and PDG values
                (in this order).

        Raises:
            ValueError: If no row of the tree matches any PDG value in ``particlelist``.

        """
        data = ur.open(path)
        data = data[key].pandas.df(labels)
        df = data.dropna().reset_index(drop=True)

        # Particles and anti-particles share one class, so only |PDG| is kept.
        pdg_label = labels[2]
        df.loc[:, pdg_label] = df.loc[:, pdg_label].abs()
        df = self._select_particles(df, pdg_label, particlelist)
        if df.empty:
            raise ValueError(
                f"No entries with PDG values {list(particlelist)} found in '{key}' of '{path}'."
            )

        # Columns 0 and 1 are cos(theta) and momentum; the third feature is the
        # transverse momentum p * sin(theta).
        x = df.values[:, 0:2]
        x = np.hstack((x, (np.sin(np.arccos(x[:, 0])) * x[:, 1]).reshape(-1, 1)))
        pol = PolynomialFeatures(2, include_bias=False)
        x = pol.fit_transform(x)

        self.x = x.astype("float32")

        # Map the surviving PDG values onto consecutive integer class labels.
        y = df.values[:, 2]
        le = LabelEncoder()
        y = le.fit_transform(y)

        self.y = y.astype("int64")

    @staticmethod
    def _select_particles(df, pdg_label: str, particlelist: list):
        """
        Keep only the rows whose PDG value is contained in the particle list.

        Parameters:
            df (pandas.DataFrame): Data frame holding the PDG column.
            pdg_label (str): Name of the column containing the (absolute) PDG values.
            particlelist (list(int)): PDG values to keep.

        Returns:
            A new data frame with the selected rows and a fresh consecutive index.
        """
        # One vectorised membership test instead of one drop/re-index pass per
        # unwanted PDG value.
        keep = np.isin(df[pdg_label].values, particlelist)
        return df.loc[keep].reset_index(drop=True)

    def __getitem__(self, index):
        """
        Get the features and the label at the given index location.

        Parameters:
            index (int): The index of the required entry.

        Returns:
            A two element list with the features and the label at the given index.
        """
        return [self.x[index], self.y[index]]

    def __len__(self):
        """
        Get the number of entries in the dataset.

        Parameters:
            None.

        Returns:
            Number of feature sets.
        """
        return len(self.x)

    def get_split(self, n_test: float = 0.1) -> list:
        """
        Split the input data randomly into a training and a validation set.

        Parameters:
            n_test (float): Fraction of the whole dataset to be put into the validation set.

        Returns:
            A list with the training subset followed by the validation subset.

        Raises:
            ValueError: If ``n_test`` is not within [0, 1].
        """
        if not 0 <= n_test <= 1:
            raise ValueError(f"n_test must be within [0, 1], got {n_test}.")
        test_size = round(n_test * len(self.x))
        train_size = len(self.x) - test_size
        return random_split(self, [train_size, test_size])
103
104
class PriorModel(torch.nn.Module):
    """
    Pytorch model for PID prior probability calculation.

    Attributes:
        hidden1: Linear layer with n_input (default 9) inputs and 128 outputs.
        act1: An RELU activation layer.
        hidden2: A batch normalization layer.
        hidden3: Linear layer with 128 inputs and 64 outputs.
        act2: An RELU activation layer.
        hidden4: A batch normalization layer.
        hidden5: Linear layer with 64 inputs and 32 outputs.
        act3: An RELU activation layer.
        hidden6: A batch normalization layer.
        hidden7: Linear layer with 32 inputs and n_output outputs.
        act4: A softmax activation layer.

    """

    def __init__(self, n_output: int, n_input: int = 9):
        """
        Initialize the PID prior probability model.

        Parameters:
            n_output (int): Number of output nodes.
            n_input (int): Number of input features per particle. Defaults to 9.

        """
        super().__init__()

        # Layers are created in a fixed order and keep their attribute names so
        # that state_dict keys and seeded weight initialisation stay the same.
        self.hidden1 = Linear(n_input, 128)
        kaiming_uniform_(self.hidden1.weight, nonlinearity="relu")

        self.act1 = ReLU()

        self.hidden2 = BatchNorm1d(128)

        self.hidden3 = Linear(128, 64)
        kaiming_uniform_(self.hidden3.weight, nonlinearity="relu")

        self.act2 = ReLU()

        self.hidden4 = BatchNorm1d(64)

        self.hidden5 = Linear(64, 32)
        kaiming_uniform_(self.hidden5.weight, nonlinearity="relu")

        self.act3 = ReLU()

        self.hidden6 = BatchNorm1d(32)

        # The output layer feeds a softmax rather than a ReLU, hence the
        # Xavier initialisation instead of Kaiming.
        self.hidden7 = Linear(32, n_output)
        xavier_uniform_(self.hidden7.weight)

        # Normalise over the class dimension so that every row sums to one.
        self.act4 = Softmax(dim=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Gives PID prior probabilities for the input features.

        Parameters:
            x (torch.Tensor): A 2D tensor containing features for a particle as a row.

        Returns:
            A torch tensor containing PID prior probabilities for the provided features.
        """
        # Three blocks of linear -> ReLU -> batch normalization.
        x = self.hidden1(x)
        x = self.act1(x)
        x = self.hidden2(x)
        x = self.hidden3(x)
        x = self.act2(x)
        x = self.hidden4(x)
        x = self.hidden5(x)
        x = self.act3(x)
        x = self.hidden6(x)
        # Output layer followed by the softmax over the classes.
        x = self.hidden7(x)
        x = self.act4(x)
        return x
def __init__(self, str path, str key, list particlelist, list labels)
torch.tensor get_split(self, float n_test=0.1)
hidden5
Linear layer with 64 inputs and 32 outputs.
torch.tensor forward(self, torch.tensor x)
hidden1
Linear layer with 9 inputs and 128 outputs.
hidden7
Linear layer with 32 inputs and outputs for each particle in the particlelist.
hidden3
Linear layer with 128 inputs and 64 outputs.