Belle II Software development
dplot.py
#!/usr/bin/env python3

from basf2 import find_file
import numpy as np
import tensorflow as tf
import basf2_mva

# State is the container basf2_mva uses to pass the model between the hook functions
from basf2_mva_python_interface.tensorflow import State

class Prior:
    """
    Calculates prior from signal and background pdfs of the fit variable
    """

    def __init__(self, z, y):
        """
        Constructor of a new prior distribution
        @param z fit variable
        @param y target variable
        """
        #: signal cdf, pdf and binning
        self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
        #: background cdf, pdf and binning
        self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
        # Avoid numerical instabilities: the under/overflow bins of the
        # background pdf are set to 1 so the divisions below never hit zero
        self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1

    def get_signal_pdf(self, X):
        """
        Calculate signal pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_pdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_pdf(self, X):
        """
        Calculate background pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_pdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_signal_cdf(self, X):
        """
        Calculate signal cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_cdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_cdf(self, X):
        """
        Calculate background cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_cdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_prior(self, X):
        """
        Calculate prior signal probability for given fit variable value
        @param X nd-array containing fit variable values
        """
        prior = self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
        prior = np.where(np.isfinite(prior), prior, 0.5)
        return prior

    def get_boost_weights(self, X):
        """
        Calculate boost weights used in dplot boost training step
        @param X nd-array containing fit variable values
        """
        signal_weight = self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
        signal_weight = np.where(np.isfinite(signal_weight), signal_weight, 0)
        # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
        bckgrd_weight = (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
        bckgrd_weight = np.where(np.isfinite(bckgrd_weight), bckgrd_weight, 0)
        return np.r_[signal_weight, bckgrd_weight]

    def get_uncorrelation_weights(self, X, boost_prediction):
        """
        Calculate uncorrelation weights used in dplot classifier training step
        @param X nd-array containing fit variable values
        @param boost_prediction output of the boost classifier
        """
        # Clip the boost prediction into [0.005, 0.995] to avoid division by zero
        reg_boost_prediction = boost_prediction * 0.99 + 0.005
        weights = (self.get_signal_cdf(X) / reg_boost_prediction +
                   (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
        return weights


def calculate_cdf_and_pdf(X):
    """
    Calculates cdf and pdf of given sample and adds under/overflow bins
    @param X 1-d np.array
    """
    pdf, bins = np.histogram(X, bins=200, density=True)
    # multiply the density by the bin widths to obtain the cumulative distribution
    cdf = np.cumsum(pdf * (bins - np.roll(bins, 1))[1:])
    return np.hstack([0.0, cdf, 1.0]), np.hstack([0.0, pdf, 0.0]), bins


def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
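    # Two feed-forward networks with identical architecture are built:
    # 'boost' is trained first, and its output enters the uncorrelation weights
    # used to train 'inference', the final classifier (see partial_fit)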

    @tf.function
    def dense(x, W, b, activation_function):
        return activation_function(tf.matmul(x, W) + b)

    class my_model(tf.Module):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

            self.boost_optimizer = tf.optimizers.Adam(0.01)
            self.inference_optimizer = tf.optimizers.Adam(0.01)

            def create_layer_variables(shape, name, activation_function):
                weights = tf.Variable(
                    tf.random.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))),
                    name=f'{name}_weights')
                biases = tf.Variable(tf.zeros(shape=[shape[1]]), name=f'{name}_biases')
                return weights, biases, activation_function

            self.boost_layer_vars = []
            self.boost_layer_vars.append(create_layer_variables([number_of_features, 20], 'boost_input', tf.nn.sigmoid))
            for i in range(3):
                self.boost_layer_vars.append(create_layer_variables([20, 20], f'boost_hidden{i}', tf.nn.sigmoid))
            self.boost_layer_vars.append(create_layer_variables([20, 1], 'boost_output', tf.nn.sigmoid))

            self.inference_layer_vars = []
            self.inference_layer_vars.append(create_layer_variables([number_of_features, 20], 'inference_input', tf.nn.sigmoid))
            for i in range(3):
                self.inference_layer_vars.append(create_layer_variables([20, 20], f'inference_hidden{i}', tf.nn.sigmoid))
            self.inference_layer_vars.append(create_layer_variables([20, 1], 'inference_output', tf.nn.sigmoid))

            self.n_boost_layers = len(self.boost_layer_vars)
            self.n_inference_layers = len(self.inference_layer_vars)

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def __call__(self, x):
            for i in range(self.n_inference_layers):
                x = dense(x, *self.inference_layer_vars[i])
            return x

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def boost(self, x):
            for i in range(self.n_boost_layers):
                x = dense(x, *self.boost_layer_vars[i])
            return x

        @tf.function
        def loss(self, predicted_y, target_y, w):
            # weighted binary cross entropy; epsilon guards against log(0)
            epsilon = 1e-5
            diff_from_truth = tf.where(target_y == 1., predicted_y, 1. - predicted_y)
            cross_entropy = - tf.reduce_sum(w * tf.math.log(diff_from_truth + epsilon)) / tf.reduce_sum(w)
            return cross_entropy

    state = State(model=my_model())
    return state


def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Pass received data to the tensorflow model
    """
    prior = Prior(S[:, 0], y[:, 0])
    N = 100
    batch_size = 100

    # make sure we have the epochs and batches set correctly
    assert epoch < 2, "There should only be two iterations, one for the boost training,"\
        " one for the dplot training. Check the value of m_nIterations."
    assert batch == 0, "All data should be passed to partial_fit on each call."\
        " The mini batches are handled internally. Check that m_mini_batch_size=0."

    indices = np.arange(len(X))
    for i in range(N):
        np.random.shuffle(indices)
        for pos in range(0, len(indices), batch_size):
            # drop the last incomplete batch
            if pos + batch_size >= len(indices):
                break
            index = indices[pos: pos + batch_size]
            z_batch = S[index, 0]
            x_batch = X[index]

            if epoch == 0:
                # boost training: each event is fed twice, once labelled as
                # signal and once as background, with the corresponding weight
                x_batch = np.r_[x_batch, x_batch]
                w_batch = prior.get_boost_weights(z_batch) * np.r_[w[index, 0], w[index, 0]]
                y_batch = np.r_[np.ones(batch_size), np.zeros(batch_size)]
                y_batch = np.reshape(y_batch, (-1, 1))
                optimizer = state.model.boost_optimizer
                name = 'boost'
            else:
                # inference training: reweight each event with the uncorrelation
                # weights computed from the already trained boost network
                p_batch = state.model.boost(x_batch).numpy()
                w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[index, 0]
                y_batch = y[index]
                optimizer = state.model.inference_optimizer
                name = 'inference'

            w_batch = np.reshape(w_batch, (-1, 1)).astype(np.float32)

            with tf.GradientTape() as tape:
                if epoch == 0:
                    y_predict_batch = state.model.boost(x_batch)
                else:
                    y_predict_batch = state.model(x_batch)
                avg_cost = state.model.loss(y_predict_batch, y_batch, w_batch)

            # only update the variables of the network being trained in this epoch
            trainable_variables = [v for v in state.model.trainable_variables if name in v.name]
            grads = tape.gradient(avg_cost, trainable_variables)
            optimizer.apply_gradients(zip(grads, trainable_variables))

        print("Internal Epoch:", f'{i:04}', "cost=", f"{float(avg_cost):.9f}")
    return True


if __name__ == "__main__":

    train_file = find_file("mva/train_D0toKpipi.root", "examples")
    training_data = basf2_mva.vector(train_file)

    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = training_data
    general_options.m_identifier = "TensorflowDPlot"
    general_options.m_treename = "tree"
    variables = ['p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvM(0, 1)', 'daughterInvM(0, 2)', 'daughterInvM(1, 2)']
    general_options.m_variables = basf2_mva.vector(*variables)
    general_options.m_spectators = basf2_mva.vector('M')
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "tensorflow"
    specific_options.m_steering_file = 'mva/examples/tensorflow/dplot.py'
    specific_options.m_nIterations = 2  # feed the data twice: first for the boost training, then for the dplot training
    specific_options.m_mini_batch_size = 0  # pass all events in each 'batch'
    basf2_mva.teacher(general_options, specific_options)
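
    # A minimal sketch of how one might evaluate the trained method on an
    # independent test sample (assumes a basf2 installation providing
    # basf2_mva_util and the test file shipped with the mva examples):
    #
    #   import basf2_mva_util
    #   test_file = find_file("mva/test_D0toKpipi.root", "examples")
    #   method = basf2_mva_util.Method(general_options.m_identifier)
    #   p, t = method.apply_expert(basf2_mva.vector(test_file), general_options.m_treename)
    #   auc = basf2_mva_util.calculate_auc_efficiency_vs_background_retention(p, t)
    #   print(f"AUC: {auc:.3f}")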