import numpy as np
import tensorflow as tf

import basf2_mva
from basf2 import find_file
from basf2_mva_python_interface.tensorflow import State
21 Calculates prior from signal
and background pdfs of the fit variable
26 Constructor of a new prior distribution
28 @param y target variable
31 self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
33 self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
35 self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1
39 Calculate signal pdf for given fit variable value
40 @param X nd-array containing fit variable values
42 return self.signal_pdf[np.digitize(X, bins=self.
signal_bins)]
46 Calculate background pdf for given fit variable value
47 @param X nd-array containing fit variable values
49 return self.bckgrd_pdf[np.digitize(X, bins=self.
bckgrd_bins)]
53 Calculate signal cdf for given fit variable value
54 @param X nd-array containing fit variable values
56 return self.signal_cdf[np.digitize(X, bins=self.
signal_bins)]
60 Calculate background cdf for given fit variable value
61 @param X nd-array containing fit variable values
63 return self.bckgrd_cdf[np.digitize(X, bins=self.
bckgrd_bins)]
67 Calculate prior signal probability for given fit variable value
68 @param X nd-array containing fit variable values
71 prior = np.where(np.isfinite(prior), prior, 0.5)
76 Calculate boost weights used in dplot boost training step
77 @param X nd-array containing fit variable values
80 signal_weight = np.where(np.isfinite(signal_weight), signal_weight, 0)
83 bckgrd_weight = np.where(np.isfinite(bckgrd_weight), bckgrd_weight, 0)
84 return np.r_[signal_weight, bckgrd_weight]
88 Calculate uncorrelation weights used in dplot classifier training step
89 @param X nd-array containing fit variable values
90 @param boost_prediction output of the boost classifier
92 reg_boost_prediction = boost_prediction * 0.99 + 0.005
98def calculate_cdf_and_pdf(X):
100 Calculates cdf and pdf of given sample
and adds under/overflow bins
101 @param X 1-d np.array
103 pdf, bins = np.histogram(X, bins=200, density=True)
104 cdf = np.cumsum(pdf * (bins - np.roll(bins, 1))[1:])
105 return np.hstack([0.0, cdf, 1.0]), np.hstack([0.0, pdf, 0.0]), bins
108def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
111 def dense(x, W, b, activation_function):
112 return activation_function(tf.matmul(x, W) + b)
114 class my_model(tf.Module):
119 self.boost_optimizer = tf.optimizers.Adam(0.01)
120 self.inference_optimizer = tf.optimizers.Adam(0.01)
122 def create_layer_variables(shape, name, activation_function):
123 weights = tf.Variable(
124 tf.random.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))),
125 name=f
'{name}_weights')
126 biases = tf.Variable(tf.zeros(shape=[shape[1]]), name=f
'{name}_biases')
127 return weights, biases, activation_function
129 self.boost_layer_vars = []
130 self.boost_layer_vars.append(create_layer_variables([number_of_features, 20],
'boost_input', tf.nn.sigmoid))
132 self.boost_layer_vars.append(create_layer_variables([20, 20], f
'boost_hidden{i}', tf.nn.sigmoid))
133 self.boost_layer_vars.append(create_layer_variables([20, 1],
'boost_output', tf.nn.sigmoid))
135 self.inference_layer_vars = []
136 self.inference_layer_vars.append(create_layer_variables([number_of_features, 20],
'inference_input', tf.nn.sigmoid))
138 self.inference_layer_vars.append(create_layer_variables([20, 20], f
'inference_hidden{i}', tf.nn.sigmoid))
139 self.inference_layer_vars.append(create_layer_variables([20, 1],
'inference_output', tf.nn.sigmoid))
141 self.n_boost_layers = len(self.boost_layer_vars)
142 self.n_inference_layers = len(self.inference_layer_vars)
144 @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
145 def __call__(self, x):
146 for i
in range(self.n_inference_layers):
147 x = dense(x, *self.inference_layer_vars[i])
150 @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
152 for i
in range(self.n_boost_layers):
153 x = dense(x, *self.boost_layer_vars[i])
157 def loss(self, predicted_y, target_y, w):
159 diff_from_truth = tf.where(target_y == 1., predicted_y, 1. - predicted_y)
160 cross_entropy = - tf.reduce_sum(w * tf.math.log(diff_from_truth + epsilon)) / tf.reduce_sum(w)
163 state =
State(model=my_model())
167def partial_fit(state, X, S, y, w, epoch, batch):
169 Pass received data to tensorflow session
171 prior = Prior(S[:, 0], y[:, 0])
176 assert epoch < 2,
"There should only be two iterations, one for the boost training,"\
177 " one for the dplot training. Check the value of m_nIterations."
178 assert batch == 0,
"All data should be passed to partial_fit on each call."\
179 " The mini batches are handled internally. Check that m_mini_batch_size=0."
181 indices = np.arange(len(X))
183 np.random.shuffle(indices)
184 for pos
in range(0, len(indices), batch_size):
185 if pos + batch_size >= len(indices):
187 index = indices[pos: pos + batch_size]
188 z_batch = S[index, 0]
192 x_batch = np.r_[x_batch, x_batch]
193 w_batch = prior.get_boost_weights(z_batch) * np.r_[w[index, 0], w[index, 0]]
194 y_batch = np.r_[np.ones(batch_size), np.zeros(batch_size)]
195 y_batch = np.reshape(y_batch, (-1, 1))
196 optimizer = state.model.boost_optimizer
199 p_batch = state.model.boost(x_batch).numpy()
200 w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[index, 0]
202 optimizer = state.model.inference_optimizer
205 w_batch = np.reshape(w_batch, (-1, 1)).astype(np.float32)
207 with tf.GradientTape()
as tape:
209 y_predict_batch = state.model.boost(x_batch)
211 y_predict_batch = state.model(x_batch)
213 avg_cost = state.model.loss(y_predict_batch, y_batch, w_batch)
214 trainable_variables = [v
for v
in state.model.trainable_variables
if name
in v.name]
215 grads = tape.gradient(avg_cost, trainable_variables)
217 optimizer.apply_gradients(zip(grads, trainable_variables))
219 print(
"Internal Epoch:", f
'{int(i):04}',
"cost=", f
"{avg_cost:.9f}")
223if __name__ ==
"__main__":
225 train_file = find_file(
"mva/train_D0toKpipi.root",
"examples")
226 training_data = basf2_mva.vector(train_file)
228 general_options = basf2_mva.GeneralOptions()
229 general_options.m_datafiles = training_data
230 general_options.m_identifier =
"TensorflowDPlot"
231 general_options.m_treename =
"tree"
232 variables = [
'p',
'pt',
'pz',
233 'daughter(0, p)',
'daughter(0, pz)',
'daughter(0, pt)',
234 'daughter(1, p)',
'daughter(1, pz)',
'daughter(1, pt)',
235 'daughter(2, p)',
'daughter(2, pz)',
'daughter(2, pt)',
236 'chiProb',
'dr',
'dz',
237 'daughter(0, dr)',
'daughter(1, dr)',
238 'daughter(0, dz)',
'daughter(1, dz)',
239 'daughter(0, chiProb)',
'daughter(1, chiProb)',
'daughter(2, chiProb)',
240 'daughter(0, kaonID)',
'daughter(0, pionID)',
241 'daughterInvM(0, 1)',
'daughterInvM(0, 2)',
'daughterInvM(1, 2)']
242 general_options.m_variables = basf2_mva.vector(*variables)
243 general_options.m_spectators = basf2_mva.vector(
'M')
244 general_options.m_target_variable =
"isSignal"
246 specific_options = basf2_mva.PythonOptions()
247 specific_options.m_framework =
"tensorflow"
248 specific_options.m_steering_file =
'mva/examples/tensorflow/dplot.py'
249 specific_options.m_nIterations = 2
250 specific_options.m_mini_batch_size = 0
251 basf2_mva.teacher(general_options, specific_options)
# --- Extraction residue (auto-generated index of the Prior class) kept for reference:
#   signal_cdf, signal_pdf, signal_bins  -- signal cdf, pdf and binning
#   bckgrd_cdf, bckgrd_pdf, bckgrd_bins  -- background cdf, pdf and binning
#   get_signal_pdf(self, X), get_bckgrd_pdf(self, X)
#   get_signal_cdf(self, X), get_bckgrd_cdf(self, X)
#   get_boost_weights(self, X), get_uncorrelation_weights(self, X, boost_prediction)