# Belle II Software  release-05-02-19
# dplot.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # Thomas Keck 2017
5 
import numpy as np
import pandas
import tensorflow as tf

import basf2_mva
from basf2_mva_python_interface.tensorflow import State
10 
12 
13 
class Prior(object):
    """
    Estimates the prior signal probability from the signal and background
    pdfs of the fit variable.
    """

    def __init__(self, z, y):
        """
        Build signal and background distributions of the fit variable.
        @param z fit variable
        @param y target variable (1 = signal, 0 = background)
        """
        #: signal cdf, pdf and binning
        self.signal_cdf, self.signal_pdf, self.signal_bins = calculate_cdf_and_pdf(z[y == 1])
        #: background cdf, pdf and binning
        self.bckgrd_cdf, self.bckgrd_pdf, self.bckgrd_bins = calculate_cdf_and_pdf(z[y == 0])
        # Guard the artificial under/overflow entries against numerical
        # instabilities (divisions by the background pdf later on).
        self.bckgrd_pdf[0] = self.bckgrd_pdf[-1] = 1

    def get_signal_pdf(self, X):
        """
        Evaluate the signal pdf at the given fit variable values.
        @param X nd-array containing fit variable values
        """
        return self.signal_pdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_pdf(self, X):
        """
        Evaluate the background pdf at the given fit variable values.
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_pdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_signal_cdf(self, X):
        """
        Evaluate the signal cdf at the given fit variable values.
        @param X nd-array containing fit variable values
        """
        return self.signal_cdf[np.digitize(X, bins=self.signal_bins)]

    def get_bckgrd_cdf(self, X):
        """
        Evaluate the background cdf at the given fit variable values.
        @param X nd-array containing fit variable values
        """
        return self.bckgrd_cdf[np.digitize(X, bins=self.bckgrd_bins)]

    def get_prior(self, X):
        """
        Prior signal probability s / (s + b) at the given fit variable values.
        Non-finite ratios (e.g. 0/0 in empty regions) fall back to 0.5.
        @param X nd-array containing fit variable values
        """
        signal_density = self.get_signal_pdf(X)
        bckgrd_density = self.get_bckgrd_pdf(X)
        ratio = signal_density / (signal_density + bckgrd_density)
        return np.where(np.isfinite(ratio), ratio, 0.5)

    def get_boost_weights(self, X):
        """
        Boost weights used in the dplot boost training step: signal weights
        first, background weights appended behind them.
        @param X nd-array containing fit variable values
        """
        sig_cdf = self.get_signal_cdf(X)
        bkg_pdf = self.get_bckgrd_pdf(X)
        # NOT self.get_bckgrd_cdf() here: signal and background are handled
        # asymmetrically by construction of the method!
        weights = np.r_[sig_cdf / bkg_pdf, (1.0 - sig_cdf) / bkg_pdf]
        return np.where(np.isfinite(weights), weights, 0)

    def get_uncorrelation_weights(self, X, boost_prediction):
        """
        Uncorrelation weights used in the dplot classifier training step.
        @param X nd-array containing fit variable values
        @param boost_prediction output of the boost classifier
        """
        # regularize the prediction away from 0 and 1 to keep divisions finite
        reg_prediction = boost_prediction * 0.99 + 0.005
        sig_cdf = self.get_signal_cdf(X)
        return (sig_cdf / reg_prediction + (1.0 - sig_cdf) / (1.0 - reg_prediction)) / 2
91 
92 
def calculate_cdf_and_pdf(X):
    """
    Calculate cdf and pdf of the given sample and add under/overflow bins.
    @param X 1-d np.array of samples
    @return tuple (cdf, pdf, bins): cdf and pdf carry one artificial
            underflow entry in front and one overflow entry at the end,
            so they can be indexed directly with np.digitize results;
            bins are the 201 histogram bin edges.
    """
    pdf, bins = np.histogram(X, bins=200, density=True)
    # Integrate the normalized density over the bin widths to get the cdf.
    # np.diff(bins) is the idiomatic form of (bins - np.roll(bins, 1))[1:].
    cdf = np.cumsum(pdf * np.diff(bins))
    return np.hstack([0.0, cdf, 1.0]), np.hstack([0.0, pdf, 0.0]), bins
101 
102 
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    Build the two networks needed for the dplot method and return a State.

    A "boost" network is trained first (epoch 0 in partial_fit) with the
    boost weights; its output is then used to compute uncorrelation weights
    for training the "inference" network (epoch 1).

    @param number_of_features number of input features
    @param number_of_spectators number of spectator variables (unused here)
    @param number_of_events total number of events (unused here)
    @param training_fraction fraction used for training (unused here)
    @param parameters user parameters (unused here)
    @return State object carrying the session, placeholders and both networks
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, [None, number_of_features], name='x')
    y = tf.placeholder(tf.float32, [None, 1], name='y')
    w = tf.placeholder(tf.float32, [None, 1], name='w')

    def layer(x, shape, name, unit=tf.sigmoid):
        """Fully connected layer: unit(x @ weights + biases)."""
        with tf.name_scope(name) as scope:
            weights = tf.Variable(tf.truncated_normal(shape, stddev=1.0 / np.sqrt(float(shape[0]))), name='weights')
            biases = tf.Variable(tf.constant(0.0, shape=[shape[1]]), name='biases')
            layer = unit(tf.matmul(x, weights) + biases)
        return layer

    # Boost network
    boost_hidden1 = layer(x, [number_of_features, 20], 'boost_hidden1')
    boost_hidden2 = layer(boost_hidden1, [20, 20], 'boost_hidden2')
    boost_hidden3 = layer(boost_hidden2, [20, 20], 'boost_hidden3')
    boost_hidden4 = layer(boost_hidden3, [20, 20], 'boost_hidden4')
    boost_activation = layer(boost_hidden4, [20, 1], 'boost_sigmoid', unit=tf.sigmoid)

    # Weighted binary cross entropy; epsilon keeps the logs finite.
    epsilon = 1e-5
    boost_loss = -tf.reduce_sum(y * w * tf.log(boost_activation + epsilon) +
                                (1.0 - y) * w * tf.log(1 - boost_activation + epsilon)) / tf.reduce_sum(w)

    boost_optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    boost_minimize = boost_optimizer.minimize(boost_loss)

    # Inference network
    inference_hidden1 = layer(x, [number_of_features, 20], 'inference_hidden1')
    inference_hidden2 = layer(inference_hidden1, [20, 20], 'inference_hidden2')
    inference_hidden3 = layer(inference_hidden2, [20, 20], 'inference_hidden3')
    inference_hidden4 = layer(inference_hidden3, [20, 20], 'inference_hidden4')
    inference_activation = layer(inference_hidden4, [20, 1], 'inference_sigmoid', unit=tf.sigmoid)

    epsilon = 1e-5
    inference_loss = -tf.reduce_sum(y * w * tf.log(inference_activation + epsilon) +
                                    (1.0 - y) * w * tf.log(1 - inference_activation + epsilon)) / tf.reduce_sum(w)

    inference_optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    inference_minimize = inference_optimizer.minimize(inference_loss)

    init = tf.global_variables_initializer()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    session.run(init)

    state = State(x, y, inference_activation, inference_loss, inference_minimize, session)
    # Extra handles for the boost training step (epoch 0 in partial_fit).
    state.boost_cost = boost_loss
    state.boost_optimizer = boost_minimize
    # Bugfix: store the BOOST network's output. The original assigned
    # inference_activation here, so the uncorrelation weights in partial_fit
    # would have been computed from the untrained inference network, making
    # the epoch-0 boost training useless.
    state.boost_activation = boost_activation
    state.w = w
    return state
158 
159 
def partial_fit(state, X, S, y, w, epoch):
    """
    Pass the received data to the tensorflow session.

    Epoch 0 trains the boost network with the boost weights; later epochs
    train the inference network with the uncorrelation weights derived from
    the boost network output.
    @return True (more data may follow)
    """
    prior = Prior(S[:, 0], y[:, 0])
    number_of_epochs = 100
    batch_size = 100

    indices = np.arange(len(X))
    for epoch_index in range(number_of_epochs):
        np.random.shuffle(indices)
        for position in range(0, len(indices), batch_size):
            # skip the final incomplete batch
            if position + batch_size >= len(indices):
                break
            selection = indices[position: position + batch_size]
            z_batch = S[selection, 0]
            x_batch = X[selection]

            if epoch == 0:
                # Boost training: feed every event twice, once labelled as
                # signal and once as background, each with its boost weight.
                x_batch = np.r_[x_batch, x_batch]
                w_batch = prior.get_boost_weights(z_batch) * np.r_[w[selection, 0], w[selection, 0]]
                y_batch = np.reshape(np.r_[np.ones(batch_size), np.zeros(batch_size)], (-1, 1))
                optimizer = state.boost_optimizer
                cost = state.boost_cost
            else:
                # Inference training: weights are derived from the boost
                # network's prediction on this batch.
                p_batch = state.session.run(state.boost_activation, feed_dict={state.x: x_batch})
                w_batch = prior.get_uncorrelation_weights(z_batch, p_batch.flatten()) * w[selection, 0]
                y_batch = y[selection]
                optimizer = state.optimizer
                cost = state.cost

            w_batch = np.reshape(w_batch, (-1, 1))
            feed_dict = {state.x: x_batch, state.y: y_batch, state.w: w_batch}
            state.session.run(optimizer, feed_dict=feed_dict)
        # report the cost of the last processed batch of this pass
        avg_cost = state.session.run(cost, feed_dict=feed_dict)
        print("Epoch:", '%04d' % (epoch_index), "cost=", "{:.9f}".format(avg_cost))
    return True
198 
199 
if __name__ == "__main__":
    # Steering for a dplot training with the tensorflow backend of basf2_mva.
    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = basf2_mva.vector("train.root")
    general_options.m_identifier = "TensorflowDPlot"
    general_options.m_treename = "tree"
    # Kinematic, vertex and PID variables of the candidate and its daughters.
    training_variables = [
        'p', 'pt', 'pz',
        'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
        'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
        'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
        'chiProb', 'dr', 'dz',
        'daughter(0, dr)', 'daughter(1, dr)',
        'daughter(0, dz)', 'daughter(1, dz)',
        'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
        'daughter(0, kaonID)', 'daughter(0, pionID)',
        'daughterInvariantMass(0, 1)', 'daughterInvariantMass(0, 2)', 'daughterInvariantMass(1, 2)']
    general_options.m_variables = basf2_mva.vector(*training_variables)
    general_options.m_spectators = basf2_mva.vector('M')
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "tensorflow"
    specific_options.m_steering_file = 'mva/examples/tensorflow_dplot.py'
    # Feed data twice: first pass trains the boost network,
    # second pass trains the dplot inference network.
    specific_options.m_nIterations = 2
    specific_options.m_mini_batch_size = 0
    basf2_mva.teacher(general_options, specific_options)
# --- Doxygen cross-reference residue (commented out so the file stays valid Python) ---
# dplot.Prior.get_boost_weights
# def get_boost_weights(self, X)
# Definition: dplot.py:69
# dplot.Prior.get_prior
# def get_prior(self, X)
# Definition: dplot.py:60
# dplot.Prior.get_bckgrd_cdf
# def get_bckgrd_cdf(self, X)
# Definition: dplot.py:53
# dplot.Prior.get_uncorrelation_weights
# def get_uncorrelation_weights(self, X, boost_prediction)
# Definition: dplot.py:81
# dplot.Prior.signal_bins
# signal_bins
# signal cdf, pdf and binning
# Definition: dplot.py:26
# basf2_mva_python_interface.tensorflow
# Definition: tensorflow.py:1
# basf2_mva_python_interface.tensorflow.State
# Definition: tensorflow.py:12
# dplot.Prior.get_bckgrd_pdf
# def get_bckgrd_pdf(self, X)
# Definition: dplot.py:39
# dplot.Prior.bckgrd_bins
# bckgrd_bins
# background cdf, pdf and binning
# Definition: dplot.py:28
# dplot.Prior.get_signal_cdf
# def get_signal_cdf(self, X)
# Definition: dplot.py:46
# dplot.Prior.__init__
# def __init__(self, z, y)
# Definition: dplot.py:19
# dplot.Prior
# Definition: dplot.py:14
# dplot.Prior.get_signal_pdf
# def get_signal_pdf(self, X)
# Definition: dplot.py:32