# Belle II Software development
# how_to_use_arbitrary_methods.py
1#!/usr/bin/env python3
2
3
10
11# The MVA package does support arbitrary python-based mva frameworks.
12# You just have to:
13# Install them (e.g. via pip3)
14# Provide all necessary hook functions (see below)
15
16import numpy as np
17import basf2_mva
18import basf2_mva_util
19
20
22 """ Let's assume we have written our own classifier (or installed something from github) """
23
24 def __init__(self, *my_fancy_parameters):
25 """ Just print the passed parameters """
26 print(my_fancy_parameters)
27
28 def fit(self, X, y):
29 """ Our method is so good, it doesn't even have to look at the data! """
30 return self
31
32 def predict(self, X):
33 """ Always return 1, this will boost our signal efficiency to the max """
34 return np.ones(len(X))
35
36
37# These are the hooks you should implement
38
39
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    First hook invoked by the MVA framework.

    Must return an in-memory python object representing the method; that
    object is then handed to every subsequent hook. Here we simply construct
    our FancyClassifier.
    @param number_of_features total number of input features
    @param number_of_spectators total number of spectator variables
    @param number_of_events total number of events in the sample
    @param training_fraction signal fraction of the training (only meaningful for a classification)
    @param parameters python object decoded from the json string the user passed via m_config
    """
    model = MyFancyClassifier(parameters)
    return model
52
53
def begin_fit(state, Xtest, Stest, ytest, wtest, nBatches):
    """
    Called once per training, right after get_model; initialize the training here.
    A validation sample is provided as well, usable during training
    (populated when the user sets m_training_fraction != 1.0).
    @param state the object returned by get_model
    @param Xtest numpy array with the validation-sample features
    @param Stest numpy array with the validation-sample spectators
    @param ytest numpy array with the validation-sample target values
    @param wtest numpy array with the validation-sample weights
    @param nBatches int, number of batches handed to partial_fit per epoch

    Our classifier cannot fit out-of-core, so the usual trick applies: attach
    empty lists to the state that will accumulate the data streamed into
    partial_fit. The validation sample is simply ignored by this method.
    """
    state.X, state.y = [], []
    return state
73
74
def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Receives the training data; how often it is called depends on the user's
    configuration:
      m_nIterations == 1, m_mini_batch_size == 0 (the defaults):
        called exactly once with the complete training data
      m_nIterations == 1, m_mini_batch_size != 0:
        called repeatedly with subsets of the requested size until the whole
        dataset has been streamed once
      m_nIterations > 1, m_mini_batch_size == 0:
        called repeatedly, each time with the complete training data
      m_nIterations > 1, m_mini_batch_size != 0:
        called repeatedly with subsets of the requested size until the whole
        dataset has been streamed m_nIterations times
      m_nIterations == 0:
        called until it returns False
    Returning False stops the streaming of data.
    @param state the object returned by begin_fit
    @param X numpy array with the training-sample features
    @param S numpy array with the training-sample spectators
    @param y numpy array with the training-sample target values
    @param w numpy array with the training-sample weights
    @param epoch index of the current pass through the full dataset
    @param batch index of the current mini batch within the pass

    Our method does not use the streaming capability; each chunk is just
    stashed on the state object for end_fit.
    """
    for attr, chunk in (("X", X), ("y", y)):
        getattr(state, attr).append(chunk)
    return True
105
106
def end_fit(state):
    """
    Called once when the training finishes; wrap up the fit here.
    Must return a picklable object — it is stored in the weightfile and
    later handed back to the load hook so the estimator can be recreated.
    @param state the object returned by begin_fit

    All collected data is fitted in one go, and since our state object
    pickles fine we return it directly. Real-world methods usually
    serialize to files and return their contents instead (see e.g. the
    tensorflow implementation for how models are saved and restored).
    """
    state.fit(state.X, state.y)
    return state
123
124
def feature_importance(state):
    """
    Invoked after end_fit; should return a list with the per-feature
    importances, which are stored in the weightfile for the user to read.
    An empty list signals that the method does not estimate importances.
    """
    importances = []
    return importances
133
134
def load(pickable_object_from_weightfile):
    """
    Called once before inference.
    @param pickable_object_from_weightfile the unpickled return value of end_fit,
           read back from the weightfile
    Must return a state object, which is later passed to apply.

    Because we pickled the state directly, nothing needs rebuilding here.
    A real method would typically write temporary files to disk and
    recreate its estimator from them (compare the tensorflow example).
    """
    return pickable_object_from_weightfile
147
148
def apply(state, X):
    """
    Called once per inference; must return the predictions as a numpy array.
    The array has to be 32-bit float and C-ordered — the np.require call
    below enforces exactly that, so it is strongly recommended to keep it.
    @param state the return value of load
    @param X numpy array with the features to evaluate
    """
    predictions = state.predict(X)
    return np.require(predictions, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
160
161
if __name__ == "__main__":
    """
    We have written all the necessary hooks, now we can call the mva framework as usual.
    Other Python-based frameworks like sklearn, tensorflow, xgboost, ... have predefined hooks,
    but you can overwrite all of them.
    """
    from basf2 import conditions, find_file
    # NOTE: do not use testing payloads in production! Any results obtained like this WILL NOT BE PUBLISHED
    conditions.testing_payloads = [
        'localdb/database.txt'
    ]

    # Create the GeneralOptions object as always
    variables = ['M', 'p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvM(0, 1)', 'daughterInvM(0, 2)', 'daughterInvM(1, 2)']

    train_file = find_file("mva/train_D0toKpipi.root", "examples")
    test_file = find_file("mva/test_D0toKpipi.root", "examples")

    training_data = basf2_mva.vector(train_file)
    testing_data = basf2_mva.vector(test_file)

    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = training_data
    general_options.m_treename = "tree"
    general_options.m_identifier = "MyFancyModel"
    general_options.m_variables = basf2_mva.vector(*variables)
    general_options.m_target_variable = "isSignal"

    # With the PythonOptions you can configure some details how the hook functions are called
    # I describe here every option, but there are reasonable defaults, so usually you only
    # have to set m_framework and m_steering_file
    python_options = basf2_mva.PythonOptions()

    # You have to use "custom" as framework,
    # this will raise a RuntimeError if you forgot to implement any of the hooks
    python_options.m_framework = "custom"

    # The path to the file where you implemented all the hooks,
    # in this case this is the same file where we setup the training itself,
    # but in principle it can be any file, this file will be saved in the weightfile
    # and it will be executed as soon as the weightfile is loaded! (so the above if __name__ == "__main__" is very important)
    python_options.m_steering_file = "mva/examples/python/how_to_use_arbitrary_methods.py"

    # You can pass parameters to your get_model hook, in form of a json string
    # You can use json.dumps to find out the right syntax.
    # For example if you want to pass a dictionary with some parameters
    import json
    config_string = json.dumps({'A': 'Python', 'Dictionary': 'With Parameters', 'And': ['A List']})
    print("The json config string", config_string)
    python_options.m_config = config_string

    # You can split the dataset into a training sample (passed to partial_fit) and a validation sample (passed to begin_fit)
    # Here we use 70% for training and 30% as validation, default is 1.0
    python_options.m_training_fraction = 0.7

    # You can normalize the input features before passing them to begin_fit, partial_fit and apply.
    # The normalization is calculated once and saved in the weightfile.
    # Every feature is shifted to mean 0 and a standard deviation of 1
    python_options.m_normalize = False

    # As described in partial_fit, the mva package can stream the data to your method.
    # The following two parameters control the streaming.
    # If you just want the full dataset at once use the following values (which are the default values)
    python_options.m_nIterations = 1
    python_options.m_mini_batch_size = 0

    # Now you can train as usual
    # Of course you can also use the command line command basf2_mva_teacher to do so
    basf2_mva.teacher(general_options, python_options)

    # To validate your method it is convenient to use basf2_mva_util to load a trained method
    method = basf2_mva_util.Method(general_options.m_identifier)

    # Because then it is very easy to apply the method to a test file,
    # of course you can also apply the method using the MVAExpert module directly in basf2
    # Or (if you do reconstruction and not analysis) the corresponding modules.
    p, t = method.apply_expert(testing_data, general_options.m_treename)

    # We calculate the AUC ROC value of the returned probability and target,
    # our method is very simple, so the AUC won't be good :-)
    # FIX: the assignment below was missing, so the print used an undefined name `auc`
    auc = basf2_mva_util.calculate_auc_efficiency_vs_background_retention(p, t)
    print("Custom Method", auc)