Belle II Software development
priorDataLoaderAndModel.py
1#!/usr/bin/env python3
2
3
10
11from torch.nn import Linear
12from torch.nn import ReLU
13from torch.nn import BatchNorm1d
14from torch.nn.init import kaiming_uniform_
15from torch.nn.init import xavier_uniform_
16from torch.nn import Softmax
17import torch
18from torch.utils.data import Dataset
19from torch.utils.data import random_split
20from sklearn.preprocessing import LabelEncoder
21from sklearn.preprocessing import PolynomialFeatures
22import uproot as ur
23import numpy as np
24
25
class PriorDataLoader(Dataset):
    """
    Dataloader for PID prior probability training.

    Attributes:
        x (np.array): Array containing feature data with a second order combination of momentum, cos(theta) and transverse momentum.
        y (np.array): Array containing the label encoded PDG values.

    """

    def __init__(self, path: str, key: str, particlelist: list, labels: list):
        """
        Initialize the dataloader for PID prior training.

        Parameters:
            path (str): Path to the root file containing the data.
            key (str): Key (i.e. path) of the tree within the root file.
            particlelist (list(int)): List of particle PDG values for which the model has to be trained.
            labels (str): Labels of pandas columns containing cos(theta), momentum and PDG values (in this order).

        """
        data = ur.open(path)[key].arrays(labels, library="pd")
        data = data.dropna().reset_index(drop=True)
        # Use absolute PDG codes so particles and antiparticles fall into one class.
        data.loc[:, labels[2]] = data.loc[:, labels[2]].abs()
        # Keep only the rows belonging to the requested particle species.
        keep = data[labels[2]].isin(particlelist)
        data = data.loc[keep].reset_index(drop=True)
        # Base features: cos(theta) and momentum, plus the derived
        # transverse momentum p_t = p * sin(theta).
        base = data.values[:, 0:2]
        pt = (np.sin(np.arccos(base[:, 0])) * base[:, 1]).reshape(-1, 1)
        features = np.hstack((base, pt))
        # Expand the 3 base features into all degree-2 monomials
        # (9 features total, no bias column).
        features = PolynomialFeatures(2, include_bias=False).fit_transform(features)
        self.x = features.astype("float32")
        # Encode the PDG values as consecutive integer class labels.
        self.y = LabelEncoder().fit_transform(data.values[:, 2]).astype("int64")

    def __getitem__(self, index):
        """
        Function to get feature and label tensors at the given index location.

        Parameters:
            index (int): The index of required tensors.

        Returns:
            Tensors of features and labels at the given index.
        """
        features, label = self.x[index], self.y[index]
        return [features, label]

    def __len__(self):
        """
        Function to obtain length of a tensor.

        Parameters:
            None.

        Returns:
            Number of feature sets.
        """
        return len(self.x)

    def get_split(self, n_test: float = 0.1) -> torch.tensor:
        """
        Split the input data into training and validation set.

        Parameter:
            n_test (float): Ratio of number of particles to be taken in the validation set to that of training set.

        Return:
            A randomly split data set with the ratio given by 'n_test'.
        """
        n_validation = round(n_test * len(self.x))
        n_train = len(self.x) - n_validation
        return random_split(self, [n_train, n_validation])
106
107
class PriorModel(torch.nn.Module):
    """
    Pytorch model for PID prior probability calculation.

    Attributes:
        hidden1: Linear layer with 9 inputs and 128 outputs.
        act1: A ReLU activation layer.
        hidden2: A batch normalization layer.
        hidden3: Linear layer with 128 inputs and 64 outputs.
        act2: A ReLU activation layer.
        hidden4: A batch normalization layer.
        hidden5: Linear layer with 64 inputs and 32 outputs.
        act3: A ReLU activation layer.
        hidden6: A batch normalization layer.
        hidden7: Linear layer with 32 inputs and one output per particle in the particlelist.
        act4: A softmax activation layer.

    """

    def __init__(self, n_output: int):
        """
        Initialize the PID prior probability model.

        Parameter:
            n_output (int): Number of output nodes.

        """
        super().__init__()

        # Hidden layers use Kaiming (He) initialization, which is suited
        # to the ReLU activations that follow them.
        self.hidden1 = Linear(9, 128)
        kaiming_uniform_(self.hidden1.weight, nonlinearity="relu")

        self.act1 = ReLU()

        self.hidden2 = BatchNorm1d(128)

        self.hidden3 = Linear(128, 64)
        kaiming_uniform_(self.hidden3.weight, nonlinearity="relu")

        self.act2 = ReLU()

        self.hidden4 = BatchNorm1d(64)

        self.hidden5 = Linear(64, 32)
        kaiming_uniform_(self.hidden5.weight, nonlinearity="relu")

        self.act3 = ReLU()

        self.hidden6 = BatchNorm1d(32)

        # Output layer uses Xavier initialization (no ReLU follows it).
        self.hidden7 = Linear(32, n_output)
        xavier_uniform_(self.hidden7.weight)

        # Softmax over the class dimension turns logits into probabilities.
        self.act4 = Softmax(dim=1)

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Gives PID prior probabilities for the input features.

        Parameter:
            x (torch.tensor): A 2D tensor containing features for a particle as a row.

        Returns:
            A torch tensor containing PID prior probabilities for the provided features.
        """
        x = self.hidden1(x)
        x = self.act1(x)
        x = self.hidden2(x)
        x = self.hidden3(x)
        x = self.act2(x)
        x = self.hidden4(x)
        x = self.hidden5(x)
        x = self.act3(x)
        x = self.hidden6(x)
        x = self.hidden7(x)
        x = self.act4(x)
        return x
__init__(self, str path, str key, list particlelist, list labels)
torch.tensor get_split(self, float n_test=0.1)
hidden5
Linear layer with 64 inputs and 32 outputs.
torch.tensor forward(self, torch.tensor x)
hidden1
Linear layer with 9 inputs and 128 outputs.
hidden7
Linear layer with 32 inputs and outputs for each particle in the particlelist.
hidden3
Linear layer with 128 inputs and 64 outputs.