Belle II Software release-08-01-10
preprocessing.py
#!/usr/bin/env python3


# This example shows how to implement a preprocessing step like equal frequency binning
# (an illustrative sketch follows the imports below)

import basf2_mva
import basf2_mva_util
import time

import numpy as np


from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.callbacks import Callback

from basf2_mva_python_interface.keras import State
from basf2_mva_extensions.preprocessing import fast_equal_frequency_binning

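# Illustrative sketch (not the basf2 implementation): equal frequency binning
# maps each feature value to the index of its quantile bin, so every bin holds
# roughly the same number of entries and the transformed feature is
# approximately uniform on [0, 1]. The helper below is made up for this
# example and assumes mostly distinct values; the fast_equal_frequency_binning
# extension imported above does the real work.
def illustrate_equal_frequency_binning(x, n_bins=100):
    """Map a 1D numpy array onto normalized equal frequency bin indices."""
    edges = np.percentile(x, np.linspace(0, 100, n_bins + 1))
    # digitizing against the inner edges yields bin indices 0 .. n_bins - 1
    return np.digitize(x, edges[1:-1]) / n_bins
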
old_time = time.time()


def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    Build a feed forward keras model
    """
    inputs = Input(shape=(number_of_features,))

    net = Dense(units=number_of_features, activation=tanh)(inputs)
    for i in range(7):
        net = Dense(units=number_of_features, activation=tanh)(net)
        net = BatchNormalization()(net)
    for i in range(7):
        net = Dense(units=number_of_features, activation=tanh)(net)
        net = Dropout(rate=0.4)(net)

    output = Dense(units=1, activation=sigmoid)(net)

    # Pass an empty preprocessor state as a kwarg to the State class.
    # The interface automatically saves every kwarg passed to the initializer
    # when end_fit is called.
    state = State(Model(inputs, output), preprocessor_state=None)

    # learning_rate replaces the deprecated lr argument of Adam
    state.model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])

    state.model.summary()

    return state


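# For orientation, the preprocessor state takes the following round trip
# through the interface (a sketch of this file's own flow; the State internals
# live in basf2_mva_python_interface.keras and are not reproduced here):
#
#   get_model:    State(model, preprocessor_state=None)   registers the kwarg
#   partial_fit:  state.preprocessor_state = preprocessor.export_state()
#   end_fit:      the registered kwargs are saved together with the model
#   load/apply:   state.preprocessor_state is restored, so the expert can
#                 rebuild the preprocessor before predicting

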
def begin_fit(state, Xtest, Stest, ytest, wtest, nBatches):
    """
    Store the test data in the state for monitoring during the fit and return the state
    """

    state.Xtest = Xtest
    state.ytest = ytest

    return state


def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Fit the preprocessor on the received data, then train the keras model
    """
    # Fit and apply the preprocessor
    preprocessor = fast_equal_frequency_binning()
    preprocessor.fit(X)
    X = preprocessor.apply(X)
    state.Xtest = preprocessor.apply(state.Xtest)

    # Save the preprocessor state in the State class, so end_fit persists it
    state.preprocessor_state = preprocessor.export_state()

    class TestCallback(Callback):
        def on_epoch_end(self, epoch, logs=None):
            loss, acc = state.model.evaluate(state.Xtest, state.ytest, verbose=0, batch_size=1000)
            loss2, acc2 = state.model.evaluate(X[:10000], y[:10000], verbose=0, batch_size=1000)
            print(f'\nTesting loss: {loss}, acc: {acc}')
            print(f'Training loss: {loss2}, acc: {acc2}')

    state.model.fit(X, y, batch_size=500, epochs=10, callbacks=[TestCallback()])
    # Return False to signal that the training is complete
    return False


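# Optional sanity check (illustrative helper, not part of the basf2 interface;
# the name check_binning_uniformity is made up): after equal frequency binning
# every feature column should be approximately uniform on [0, 1], so a
# histogram should show near-equal counts in every bin.
def check_binning_uniformity(X, n_hist_bins=10):
    """Return the per-feature relative spread of histogram counts."""
    counts = np.stack([np.histogram(X[:, i], bins=n_hist_bins, range=(0, 1))[0]
                       for i in range(X.shape[1])])
    return counts.std(axis=1) / counts.mean(axis=1)

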
def apply(state, X):
    """
    Apply the estimator to the passed data.
    Has to be overwritten because the expert has to apply the preprocessing as well.
    """
    # The preprocessor state is automatically loaded in the load function
    preprocessor = fast_equal_frequency_binning(state.preprocessor_state)
    # Apply preprocessor
    X = preprocessor.apply(X)

    r = state.model.predict(X).flatten()
    return np.require(r, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])


if __name__ == "__main__":
    from basf2 import conditions, find_file
    # NOTE: do not use testing payloads in production! Any results obtained like this WILL NOT BE PUBLISHED
    conditions.testing_payloads = [
        'localdb/database.txt'
    ]

    train_file = find_file("mva/train_D0toKpipi.root", "examples")
    test_file = find_file("mva/test_D0toKpipi.root", "examples")

    training_data = basf2_mva.vector(train_file)
    testing_data = basf2_mva.vector(test_file)

    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = training_data
    general_options.m_identifier = "preprocessed_deep_keras"
    general_options.m_treename = "tree"
    variables = ['M', 'p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvM(0, 1)', 'daughterInvM(0, 2)', 'daughterInvM(1, 2)']
    general_options.m_variables = basf2_mva.vector(*variables)
    general_options.m_target_variable = "isSignal"

    specific_options = basf2_mva.PythonOptions()
    specific_options.m_framework = "keras"
    specific_options.m_steering_file = 'mva/examples/keras/preprocessing.py'
    specific_options.m_normalize = True
    specific_options.m_training_fraction = 0.9

    training_start = time.time()
    basf2_mva.teacher(general_options, specific_options)
    training_stop = time.time()
    training_time = training_stop - training_start
    method = basf2_mva_util.Method(general_options.m_identifier)
    inference_start = time.time()
    p, t = method.apply_expert(testing_data, general_options.m_treename)
    inference_stop = time.time()
    inference_time = inference_stop - inference_start
    auc = basf2_mva_util.calculate_auc_efficiency_vs_background_retention(p, t)

    print("Tensorflow", training_time, inference_time, auc)
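
    # Illustrative cross-check (assumption: binary targets in t): the same ROC
    # AUC can be computed directly from the probabilities via the Mann-Whitney
    # U statistic. This duplicates, not replaces, the basf2_mva_util call
    # above; the name auc_check is made up here.
    ranks = np.argsort(np.argsort(p)) + 1
    n_signal = int(np.sum(t > 0.5))
    n_background = len(t) - n_signal
    auc_check = (np.sum(ranks[t > 0.5]) - n_signal * (n_signal + 1) / 2) / (n_signal * n_background)
    print("numpy AUC cross-check", auc_check)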