Belle II Software  release-06-01-15
dplot.py
1 #!/usr/bin/env python3
2 
3 
10 
11 import numpy as np
12 import tensorflow as tf
13 import basf2_mva
14 
16 
17 
class Prior(object):
    """
    Calculates the prior signal probability from the signal and background
    pdfs of the fit variable, and the per-event weights used by the two
    dplot training steps (boost and uncorrelation).
    """

    def __init__(self, z, y):
        """
        Constructor of a new prior distribution
        @param z fit variable
        @param y target variable
        """
        #: signal cdf, pdf and binning
        self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
        #: background cdf, pdf and binning
        self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
        # Avoid numerical instabilities: the under/overflow pdf bins appear in
        # denominators below, so force them to a non-zero value.
        self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1

    def get_signal_pdf(self, X):
        """
        Calculate signal pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_pdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_pdf(self, X):
        """
        Calculate background pdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_pdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_signal_cdf(self, X):
        """
        Calculate signal cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.signal_cdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_cdf(self, X):
        """
        Calculate background cdf for given fit variable value
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_cdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_prior(self, X):
        """
        Calculate prior signal probability for given fit variable value
        @param X nd-array containing fit variable values
        """
        prior = self.get_signal_pdf(X) / (self.get_signal_pdf(X) + self.get_bckgrd_pdf(X))
        # Where both pdfs vanish the ratio is NaN/inf; fall back to 0.5
        prior = np.where(np.isfinite(prior), prior, 0.5)
        return prior

    def get_boost_weights(self, X):
        """
        Calculate boost weights used in dplot boost training step.
        Returns the signal weights concatenated with the background weights.
        @param X nd-array containing fit variable values
        """
        signal_weight = self.get_signal_cdf(X) / self.get_bckgrd_pdf(X)
        signal_weight = np.where(np.isfinite(signal_weight), signal_weight, 0)
        # NOT self.get_bckgrd_cdf() here, signal and background are handled asymmetrical!
        bckgrd_weight = (1.0 - self.get_signal_cdf(X)) / self.get_bckgrd_pdf(X)
        bckgrd_weight = np.where(np.isfinite(bckgrd_weight), bckgrd_weight, 0)
        return np.r_[signal_weight, bckgrd_weight]

    def get_uncorrelation_weights(self, X, boost_prediction):
        """
        Calculate uncorrelation weights used in dplot classifier training step
        @param X nd-array containing fit variable values
        @param boost_prediction output of the boost classifier
        """
        # Squeeze the prediction into (0.005, 0.995) to avoid division by zero
        reg_boost_prediction = boost_prediction * 0.99 + 0.005
        weights = (self.get_signal_cdf(X) / reg_boost_prediction +
                   (1.0 - self.get_signal_cdf(X)) / (1.0 - reg_boost_prediction)) / 2
        return weights
95 
96 
def calculate_cdf_and_pdf(X):
    """
    Calculates cdf and pdf of given sample and adds under/overflow bins
    @param X 1-d np.array
    """
    density, edges = np.histogram(X, bins=200, density=True)
    # Integrate the normalized histogram: density times bin width, accumulated
    cumulative = np.cumsum(density * np.diff(edges))
    padded_cdf = np.hstack([0.0, cumulative, 1.0])
    padded_pdf = np.hstack([0.0, density, 0.0])
    return padded_cdf, padded_pdf, edges
105 
106 
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    Build the two fully-connected networks used by the dplot method:
    a boost network (trained first) and an inference network (trained with
    uncorrelation weights derived from the boost prediction).
    Returns a State object holding the session and the tensors needed by
    partial_fit.
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, [None, number_of_features], name='x')
    y = tf.placeholder(tf.float32, [None, 1], name='y')
    w = tf.placeholder(tf.float32, [None, 1], name='w')

    def layer(x, shape, name, unit=tf.sigmoid):
        # Fully connected layer: weights scaled by 1/sqrt(fan_in), zero biases
        with tf.name_scope(name):
            weights = tf.Variable(tf.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))), name='weights')
            biases = tf.Variable(tf.constant(0.0, shape=[shape[1]]), name='biases')
            layer = unit(tf.matmul(x, weights) + biases)
        return layer

    # Boost network
    boost_hidden1 = layer(x, [number_of_features, 20], 'boost_hidden1')
    boost_hidden2 = layer(boost_hidden1, [20, 20], 'boost_hidden2')
    boost_hidden3 = layer(boost_hidden2, [20, 20], 'boost_hidden3')
    boost_hidden4 = layer(boost_hidden3, [20, 20], 'boost_hidden4')
    boost_activation = layer(boost_hidden4, [20, 1], 'boost_sigmoid', unit=tf.sigmoid)

    # Weighted binary cross entropy; epsilon guards log(0)
    epsilon = 1e-5
    boost_loss = -tf.reduce_sum(y * w * tf.log(boost_activation + epsilon) +
                                (1.0 - y) * w * tf.log(1 - boost_activation + epsilon)) / tf.reduce_sum(w)

    boost_optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    boost_minimize = boost_optimizer.minimize(boost_loss)

    # Inference network (same architecture, independent variables)
    inference_hidden1 = layer(x, [number_of_features, 20], 'inference_hidden1')
    inference_hidden2 = layer(inference_hidden1, [20, 20], 'inference_hidden2')
    inference_hidden3 = layer(inference_hidden2, [20, 20], 'inference_hidden3')
    inference_hidden4 = layer(inference_hidden3, [20, 20], 'inference_hidden4')
    inference_activation = layer(inference_hidden4, [20, 1], 'inference_sigmoid', unit=tf.sigmoid)

    epsilon = 1e-5
    inference_loss = -tf.reduce_sum(y * w * tf.log(inference_activation + epsilon) +
                                    (1.0 - y) * w * tf.log(1 - inference_activation + epsilon)) / tf.reduce_sum(w)

    inference_optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    inference_minimize = inference_optimizer.minimize(inference_loss)

    init = tf.global_variables_initializer()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    session.run(init)

    state = State(x, y, inference_activation, inference_loss, inference_minimize, session)
    state.boost_cost = boost_loss
    state.boost_optimizer = boost_minimize
    # Bug fix: was inference_activation. partial_fit evaluates
    # state.boost_activation to obtain the BOOST network's prediction for the
    # uncorrelation weights, so it must reference boost_activation here.
    state.boost_activation = boost_activation
    state.w = w
    return state
162 
163 
def partial_fit(state, X, S, y, w, epoch):
    """
    Pass received data to tensorflow session
    """
    # Prior built from the first spectator (the fit variable) and the target
    prior = Prior(S[:, 0], y[:, 0])
    n_passes = 100
    batch_size = 100

    indices = np.arange(len(X))
    for pass_index in range(n_passes):
        np.random.shuffle(indices)
        for start in range(0, len(indices), batch_size):
            # Drop the last incomplete batch
            if start + batch_size >= len(indices):
                break
            selection = indices[start: start + batch_size]
            z_batch = S[selection, 0]
            x_batch = X[selection]

            if epoch == 0:
                # Boost step: duplicate the batch, first half labelled signal,
                # second half background, weighted by the boost weights
                x_batch = np.r_[x_batch, x_batch]
                w_batch = prior.get_boost_weights(z_batch) * np.r_[w[selection, 0], w[selection, 0]]
                y_batch = np.reshape(np.r_[np.ones(batch_size), np.zeros(batch_size)], (-1, 1))
                optimizer = state.boost_optimizer
                cost = state.boost_cost
            else:
                # Inference step: reweight with the boost network's prediction
                p_batch = state.session.run(state.boost_activation, feed_dict={state.x: x_batch})
                w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[selection, 0]
                y_batch = y[selection]
                optimizer = state.optimizer
                cost = state.cost

            feed_dict = {state.x: x_batch, state.y: y_batch, state.w: np.reshape(w_batch, (-1, 1))}
            state.session.run(optimizer, feed_dict=feed_dict)
            avg_cost = state.session.run(cost, feed_dict=feed_dict)
            print("Epoch:", '%04d' % (pass_index), "cost=", "{:.9f}".format(avg_cost))
    return True
202 
203 
if __name__ == "__main__":
    # Teacher configuration: train the dplot method on train.root
    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = basf2_mva.vector("train.root")
    general_options.m_identifier = "TensorflowDPlot"
    general_options.m_treename = "tree"
    variables = ['p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvariantMass(0, 1)', 'daughterInvariantMass(0, 2)', 'daughterInvariantMass(1, 2)']
    general_options.m_variables = basf2_mva.vector(*variables)
    # The spectator 'M' is the fit variable handled by the dplot Prior
    general_options.m_spectators = basf2_mva.vector('M')
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "tensorflow"
    specific_options.m_steering_file = 'mva/examples/tensorflow_dplot.py'
    specific_options.m_nIterations = 2  # Feed data twice (first time for boost training, second time for dplot training)
    specific_options.m_mini_batch_size = 0
    basf2_mva.teacher(general_options, specific_options)
def get_prior(self, X)
Definition: dplot.py:64
def get_bckgrd_cdf(self, X)
Definition: dplot.py:57
def __init__(self, z, y)
Definition: dplot.py:23
bckgrd_bins
background cdf, pdf and binning
Definition: dplot.py:32
def get_boost_weights(self, X)
Definition: dplot.py:73
def get_bckgrd_pdf(self, X)
Definition: dplot.py:43
signal_bins
signal cdf, pdf and binning
Definition: dplot.py:30
def get_signal_cdf(self, X)
Definition: dplot.py:50
def get_signal_pdf(self, X)
Definition: dplot.py:36
def get_uncorrelation_weights(self, X, boost_prediction)
Definition: dplot.py:85