Belle II Software  light-2212-foldex
dplot.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import numpy as np
12 import tensorflow as tf
13 import basf2_mva
14 
16 
17 
class Prior(object):
    """
    Calculates prior from signal and background pdfs of the fit variable
    """

    def __init__(self, z, y):
        """
        Constructor of a new prior distribution
        @param z fit variable
        @param y target variable (1 = signal, 0 = background)
        """
        # signal cdf, pdf and binning
        # (fixed: member names were duplicated by doc extraction, e.g. signal_binssignal_bins)
        self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
        # background cdf, pdf and binning
        self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
        # Avoid numerical instabilities: the under/overflow pdf bins are set to 1
        # so later divisions by the background pdf cannot blow up at the edges
        self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1

    def get_signal_pdf(self, X):
        """
        Calculate signal pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_pdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_pdf(self, X):
        """
        Calculate background pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_pdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_signal_cdf(self, X):
        """
        Calculate signal cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_cdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_cdf(self, X):
        """
        Calculate background cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_cdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_prior(self, X):
        """
        Calculate prior signal probability for given fit variable value
        @param X nd-array containing fit variable values
        """
        prior = self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
        # where both pdfs vanish the ratio is 0/0 -> fall back to 0.5 (no information)
        prior = np.where(np.isfinite(prior), prior, 0.5)
        return prior

    def get_boost_weights(self, X):
        """
        Calculate boost weights used in dplot boost training step.
        Returns the signal weights concatenated with the background weights
        (twice as many entries as X).
        @param X nd-array containing fit variable values
        """
        signal_weight = self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
        signal_weight = np.where(np.isfinite(signal_weight), signal_weight, 0)
        # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrical!
        bckgrd_weight = (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
        bckgrd_weight = np.where(np.isfinite(bckgrd_weight), bckgrd_weight, 0)
        return np.r_[signal_weight, bckgrd_weight]

    def get_uncorrelation_weights(self, X, boost_prediction):
        """
        Calculate uncorrelation weights used in dplot classifier training step
        @param X nd-array containing fit variable values
        @param boost_prediction output of the boost classifier
        """
        # squeeze the boost output into (0.005, 0.995) to avoid division by 0
        reg_boost_prediction = boost_prediction * 0.99 + 0.005
        weights = (self.get_signal_cdf(X) / reg_boost_prediction +
                   (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
        return weights
95 
96 
def calculate_cdf_and_pdf(X):
    """
    Calculates cdf and pdf of given sample and adds under/overflow bins
    @param X 1-d np.array
    @return tuple (cdf, pdf, bin_edges); cdf and pdf are padded with an
            underflow entry in front and an overflow entry at the end
    """
    density, edges = np.histogram(X, bins=200, density=True)
    # integrate the normalized density over each bin width to obtain the cdf
    bin_widths = np.diff(edges)
    cumulative = np.cumsum(density * bin_widths)
    padded_cdf = np.hstack([0.0, cumulative, 1.0])
    padded_pdf = np.hstack([0.0, density, 0.0])
    return padded_cdf, padded_pdf, edges
105 
106 
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    Build the dplot model: two small sigmoid feed-forward networks (a 'boost'
    network and an 'inference' network), each with its own Adam optimizer,
    wrapped in a State object for the basf2_mva tensorflow interface.
    NOTE(review): `State` is presumably imported from the basf2_mva python
    interface — the import is not visible in this chunk; confirm.
    @param number_of_features number of input features (input layer width)
    @param number_of_spectators unused here
    @param number_of_events unused here
    @param training_fraction unused here
    @param parameters unused here
    """

    @tf.function
    def dense(x, W, b, activation_function):
        # one fully-connected layer: activation(x @ W + b)
        return activation_function(tf.matmul(x, W) + b)

    class my_model(tf.Module):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

            # separate optimizers so each training phase only updates its own network
            self.boost_optimizer = tf.optimizers.Adam(0.01)
            self.inference_optimizer = tf.optimizers.Adam(0.01)

            def create_layer_variables(shape, name, activation_function):
                # weights scaled by 1/sqrt(fan_in); name prefix is used later to
                # select which variables each optimizer updates
                weights = tf.Variable(
                    tf.random.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))),
                    name=f'{name}_weights')
                biases = tf.Variable(tf.zeros(shape=[shape[1]]), name=f'{name}_biases')
                return weights, biases, activation_function

            # boost network: input -> 3 hidden layers of 20 -> 1 output, all sigmoid
            self.boost_layer_vars = []
            self.boost_layer_vars.append(create_layer_variables([number_of_features, 20], 'boost_input', tf.nn.sigmoid))
            for i in range(3):
                self.boost_layer_vars.append(create_layer_variables([20, 20], f'boost_hidden{i}', tf.nn.sigmoid))
            self.boost_layer_vars.append(create_layer_variables([20, 1], 'boost_output', tf.nn.sigmoid))

            # inference network: identical architecture, independent variables
            self.inference_layer_vars = []
            self.inference_layer_vars.append(create_layer_variables([number_of_features, 20], 'inference_input', tf.nn.sigmoid))
            for i in range(3):
                self.inference_layer_vars.append(create_layer_variables([20, 20], f'inference_hidden{i}', tf.nn.sigmoid))
            self.inference_layer_vars.append(create_layer_variables([20, 1], 'inference_output', tf.nn.sigmoid))

            self.n_boost_layers = len(self.boost_layer_vars)
            self.n_inference_layers = len(self.inference_layer_vars)

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def __call__(self, x):
            # forward pass through the inference network
            for i in range(self.n_inference_layers):
                x = dense(x, *self.inference_layer_vars[i])
            return x

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def boost(self, x):
            # forward pass through the boost network
            for i in range(self.n_boost_layers):
                x = dense(x, *self.boost_layer_vars[i])
            return x

        @tf.function
        def loss(self, predicted_y, target_y, w):
            # weighted binary cross entropy; epsilon guards log(0)
            epsilon = 1e-5
            diff_from_truth = tf.where(target_y == 1., predicted_y, 1. - predicted_y)
            cross_entropy = - tf.reduce_sum(w * tf.math.log(diff_from_truth + epsilon)) / tf.reduce_sum(w)
            return cross_entropy

    state = State(model=my_model())
    return state
164 
165 
def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Pass received data to tensorflow session.
    Epoch 0 trains the boost network with prior-derived boost weights;
    epoch 1 trains the inference network with uncorrelation weights derived
    from the boost network's predictions.
    @param state State object holding the model (see get_model)
    @param X nd-array of features, shape (n_events, n_features)
    @param S nd-array of spectators; column 0 is the fit variable
    @param y nd-array of targets, shape (n_events, 1)
    @param w nd-array of event weights, shape (n_events, 1)
    @param epoch 0 for the boost step, 1 for the dplot step
    @param batch must be 0 — mini-batching is done internally
    """
    prior = Prior(S[:, 0], y[:, 0])
    N = 100          # number of internal epochs per call
    batch_size = 100

    # make sure we have the epochs and batches set correctly.
    assert epoch < 2, "There should only be two iterations, one for the boost training,"\
        " one for the dplot training. Check the value of m_nIterations."
    assert batch == 0, "All data should be passed to partial_fit on each call."\
        " The mini batches are handled internally. Check that m_mini_batch_size=0."

    indices = np.arange(len(X))
    for i in range(N):
        np.random.shuffle(indices)
        for pos in range(0, len(indices), batch_size):
            # drop the last incomplete batch so all batches have exactly batch_size events
            if pos + batch_size >= len(indices):
                break
            index = indices[pos: pos + batch_size]
            z_batch = S[index, 0]
            x_batch = X[index]

            if epoch == 0:
                # boost step: the batch is duplicated — first half labelled signal,
                # second half background — and weighted with the prior boost weights
                x_batch = np.r_[x_batch, x_batch]
                w_batch = prior.get_boost_weights(z_batch) * np.r_[w[index, 0], w[index, 0]]
                y_batch = np.r_[np.ones(batch_size), np.zeros(batch_size)]
                y_batch = np.reshape(y_batch, (-1, 1))
                optimizer = state.model.boost_optimizer
                name = 'boost'
            else:
                # dplot step: reweight events using the trained boost classifier output
                p_batch = state.model.boost(x_batch).numpy()
                w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[index, 0]
                y_batch = y[index]
                optimizer = state.model.inference_optimizer
                name = 'inference'

            w_batch = np.reshape(w_batch, (-1, 1)).astype(np.float32)

            with tf.GradientTape() as tape:
                if epoch == 0:
                    y_predict_batch = state.model.boost(x_batch)
                else:
                    y_predict_batch = state.model(x_batch)

                avg_cost = state.model.loss(y_predict_batch, y_batch, w_batch)
                # variable names carry a 'boost'/'inference' prefix (see get_model),
                # so this selects only the network being trained in this epoch
                trainable_variables = [v for v in state.model.trainable_variables if name in v.name]
                grads = tape.gradient(avg_cost, trainable_variables)

            optimizer.apply_gradients(zip(grads, trainable_variables))

        print("Internal Epoch:", '%04d' % (i), "cost=", "{:.9f}".format(avg_cost))
    return True
220 
221 
if __name__ == "__main__":
    # Configure a basf2_mva teacher run that uses this file as the python
    # steering file for a dplot (boost + uncorrelated classifier) training.
    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = basf2_mva.vector("train.root")
    general_options.m_identifier = "TensorflowDPlot"
    general_options.m_treename = "tree"
    variables = ['p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvM(0, 1)', 'daughterInvM(0, 2)', 'daughterInvM(1, 2)']
    general_options.m_variables = basf2_mva.vector(*variables)
    # the spectator 'M' is the fit variable the classifier is decorrelated against
    general_options.m_spectators = basf2_mva.vector('M')
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "tensorflow"
    specific_options.m_steering_file = 'mva/examples/tensorflow/dplot.py'
    specific_options.m_nIterations = 2  # Feed data twice (first time for boost training, second time for dplot training)
    specific_options.m_mini_batch_size = 0  # Pass all events each 'batch'
    basf2_mva.teacher(general_options, specific_options)
def get_prior(self, X)
Definition: dplot.py:64
def get_bckgrd_cdf(self, X)
Definition: dplot.py:57
def __init__(self, z, y)
Definition: dplot.py:23
bckgrd_bins
background cdf, pdf and binning
Definition: dplot.py:32
def get_boost_weights(self, X)
Definition: dplot.py:73
def get_bckgrd_pdf(self, X)
Definition: dplot.py:43
signal_bins
signal cdf, pdf and binning
Definition: dplot.py:30
def get_signal_cdf(self, X)
Definition: dplot.py:50
def get_signal_pdf(self, X)
Definition: dplot.py:36
def get_uncorrelation_weights(self, X, boost_prediction)
Definition: dplot.py:85
Definition: Module.h:22