Belle II Software  release-08-01-10
dplot.py
#!/usr/bin/env python3

from basf2 import find_file
import numpy as np
import tensorflow as tf
import basf2_mva
from basf2_mva_python_interface.tensorflow import State
class Prior:
    """
    Calculates prior from signal and background pdfs of the fit variable
    """

    def __init__(self, z, y):
        """
        Constructor of a new prior distribution
        @param z fit variable
        @param y target variable
        """
        #: signal cdf, pdf and binning
        self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
        #: background cdf, pdf and binning
        self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
        # Avoid numerical instabilities
        self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1

    def get_signal_pdf(self, X):
        """
        Calculate signal pdf for given fit variable values
        @param X nd-array containing fit variable values
        """
        return self.signal_pdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_pdf(self, X):
        """
        Calculate background pdf for given fit variable values
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_pdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_signal_cdf(self, X):
        """
        Calculate signal cdf for given fit variable values
        @param X nd-array containing fit variable values
        """
        return self.signal_cdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_cdf(self, X):
        """
        Calculate background cdf for given fit variable values
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_cdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_prior(self, X):
        """
        Calculate prior signal probability for given fit variable values
        @param X nd-array containing fit variable values
        """
        prior = self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
        prior = np.where(np.isfinite(prior), prior, 0.5)
        return prior

    def get_boost_weights(self, X):
        """
        Calculate boost weights used in dplot boost training step
        @param X nd-array containing fit variable values
        """
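        # In the boost step (see partial_fit) the sample is duplicated: the first copy
        # is labelled signal and weighted with F_s(z) / f_b(z), the second copy is
        # labelled background and weighted with (1 - F_s(z)) / f_b(z), where F_s is the
        # signal cdf and f_b the background pdf of the fit variable z.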
        signal_weight = self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
        signal_weight = np.where(np.isfinite(signal_weight), signal_weight, 0)
        # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrically!
        bckgrd_weight = (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
        bckgrd_weight = np.where(np.isfinite(bckgrd_weight), bckgrd_weight, 0)
        return np.r_[signal_weight, bckgrd_weight]

    def get_uncorrelation_weights(self, X, boost_prediction):
        """
        Calculate uncorrelation weights used in dplot classifier training step
        @param X nd-array containing fit variable values
        @param boost_prediction output of the boost classifier
        """
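        # The boost prediction is squeezed from [0, 1] into [0.005, 0.995] below, so
        # that neither denominator in the weight formula can become exactly zero.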
        reg_boost_prediction = boost_prediction * 0.99 + 0.005
        weights = (self.get_signal_cdf(X) / reg_boost_prediction +
                   (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
        return weights


def calculate_cdf_and_pdf(X):
    """
    Calculates cdf and pdf of given sample and adds under/overflow bins
    @param X 1-d np.array
    """
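    # The pdf is padded with 0.0 and the cdf with 0.0 / 1.0 on either side, so that the
    # indices returned by np.digitize (0 for underflow, len(bins) for overflow) can be
    # used directly to look up the corresponding bin content.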
    pdf, bins = np.histogram(X, bins=200, density=True)
    cdf = np.cumsum(pdf * (bins - np.roll(bins, 1))[1:])
    return np.hstack([0.0, cdf, 1.0]), np.hstack([0.0, pdf, 0.0]), bins


def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
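    # The dplot method uses two small feed-forward networks with identical topology:
    # a 'boost' network and an 'inference' network, each with its own Adam optimizer.
    # All variables carry a 'boost_' or 'inference_' name prefix so that partial_fit
    # can select the matching subset of trainable variables for each training step.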

    @tf.function
    def dense(x, W, b, activation_function):
        return activation_function(tf.matmul(x, W) + b)

    class my_model(tf.Module):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

            self.boost_optimizer = tf.optimizers.Adam(0.01)
            self.inference_optimizer = tf.optimizers.Adam(0.01)

            def create_layer_variables(shape, name, activation_function):
                weights = tf.Variable(
                    tf.random.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))),
                    name=f'{name}_weights')
                biases = tf.Variable(tf.zeros(shape=[shape[1]]), name=f'{name}_biases')
                return weights, biases, activation_function

            self.boost_layer_vars = []
            self.boost_layer_vars.append(create_layer_variables([number_of_features, 20], 'boost_input', tf.nn.sigmoid))
            for i in range(3):
                self.boost_layer_vars.append(create_layer_variables([20, 20], f'boost_hidden{i}', tf.nn.sigmoid))
            self.boost_layer_vars.append(create_layer_variables([20, 1], 'boost_output', tf.nn.sigmoid))

            self.inference_layer_vars = []
            self.inference_layer_vars.append(create_layer_variables([number_of_features, 20], 'inference_input', tf.nn.sigmoid))
            for i in range(3):
                self.inference_layer_vars.append(create_layer_variables([20, 20], f'inference_hidden{i}', tf.nn.sigmoid))
            self.inference_layer_vars.append(create_layer_variables([20, 1], 'inference_output', tf.nn.sigmoid))

            self.n_boost_layers = len(self.boost_layer_vars)
            self.n_inference_layers = len(self.inference_layer_vars)

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def __call__(self, x):
            for i in range(self.n_inference_layers):
                x = dense(x, *self.inference_layer_vars[i])
            return x

        @tf.function(input_signature=[tf.TensorSpec(shape=[None, number_of_features], dtype=tf.float32)])
        def boost(self, x):
            for i in range(self.n_boost_layers):
                x = dense(x, *self.boost_layer_vars[i])
            return x

        @tf.function
        def loss(self, predicted_y, target_y, w):
            epsilon = 1e-5
            diff_from_truth = tf.where(target_y == 1., predicted_y, 1. - predicted_y)
            cross_entropy = - tf.reduce_sum(w * tf.math.log(diff_from_truth + epsilon)) / tf.reduce_sum(w)
            return cross_entropy

    state = State(model=my_model())
    return state


def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Pass the received data to the tensorflow model
    """
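    # partial_fit is called once per iteration (m_nIterations = 2): in the first call
    # (epoch 0) the boost network is trained on a duplicated sample with the boost
    # weights, in the second call (epoch 1) the inference network is trained with the
    # uncorrelation weights derived from the boost network's predictions.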
    prior = Prior(S[:, 0], y[:, 0])
    N = 100
    batch_size = 100

    # make sure we have the epochs and batches set correctly.
    assert epoch < 2, "There should only be two iterations, one for the boost training,"\
        " one for the dplot training. Check the value of m_nIterations."
    assert batch == 0, "All data should be passed to partial_fit on each call."\
        " The mini batches are handled internally. Check that m_mini_batch_size=0."

    indices = np.arange(len(X))
    for i in range(N):
        np.random.shuffle(indices)
        for pos in range(0, len(indices), batch_size):
            if pos + batch_size >= len(indices):
                break
            index = indices[pos: pos + batch_size]
            z_batch = S[index, 0]
            x_batch = X[index]

            if epoch == 0:
                x_batch = np.r_[x_batch, x_batch]
                w_batch = prior.get_boost_weights(z_batch) * np.r_[w[index, 0], w[index, 0]]
                y_batch = np.r_[np.ones(batch_size), np.zeros(batch_size)]
                y_batch = np.reshape(y_batch, (-1, 1))
                optimizer = state.model.boost_optimizer
                name = 'boost'
            else:
                p_batch = state.model.boost(x_batch).numpy()
                w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[index, 0]
                y_batch = y[index]
                optimizer = state.model.inference_optimizer
                name = 'inference'

            w_batch = np.reshape(w_batch, (-1, 1)).astype(np.float32)

            with tf.GradientTape() as tape:
                if epoch == 0:
                    y_predict_batch = state.model.boost(x_batch)
                else:
                    y_predict_batch = state.model(x_batch)

                avg_cost = state.model.loss(y_predict_batch, y_batch, w_batch)
                trainable_variables = [v for v in state.model.trainable_variables if name in v.name]
                grads = tape.gradient(avg_cost, trainable_variables)

            optimizer.apply_gradients(zip(grads, trainable_variables))

        print("Internal Epoch:", f'{int(i):04}', "cost=", f"{avg_cost:.9f}")
    return True


if __name__ == "__main__":

    train_file = find_file("mva/train_D0toKpipi.root", "examples")
    training_data = basf2_mva.vector(train_file)

    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = training_data
    general_options.m_identifier = "TensorflowDPlot"
    general_options.m_treename = "tree"
    variables = ['p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvM(0, 1)', 'daughterInvM(0, 2)', 'daughterInvM(1, 2)']
    general_options.m_variables = basf2_mva.vector(*variables)
    general_options.m_spectators = basf2_mva.vector('M')
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "tensorflow"
    specific_options.m_steering_file = 'mva/examples/tensorflow/dplot.py'
    specific_options.m_nIterations = 2  # Feed data twice (first time for boost training, second time for dplot training)
    specific_options.m_mini_batch_size = 0  # Pass all events in each 'batch'
    basf2_mva.teacher(general_options, specific_options)
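
    # A minimal application sketch: the trained "TensorflowDPlot" weightfile can be
    # evaluated on an independent sample with basf2_mva.expert. The test-file and
    # output-file names used here are illustrative assumptions.
    test_file = find_file("mva/test_D0toKpipi.root", "examples")
    basf2_mva.expert(basf2_mva.vector("TensorflowDPlot"),  # identifier of the trained weightfile
                     basf2_mva.vector(test_file),          # datafiles to apply the expert to
                     'tree',                               # input tree name
                     'expert_dplot.root')                  # output ROOT file with the predictions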