Belle II Software  release-05-02-19
howto_use_arbitrary_methods.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 # Thomas Keck 2017
5 
6 # The MVA package does support arbitrary python-based mva frameworks.
7 # You just have to:
8 # Install them (e.g. via pip3)
9 # Provide all necessary hook functions (see below)
10 
11 import numpy as np
12 
13 
class MyFancyClassifier(object):
    """Stand-in for a home-grown classifier (or one installed from github)."""

    def __init__(self, *my_fancy_parameters):
        """Simply echo whatever parameters were handed over."""
        print(my_fancy_parameters)

    def fit(self, X, y):
        """A method this good never needs to look at the data."""
        return self

    def predict(self, X):
        """Predict 1 for everything — maximal signal efficiency guaranteed."""
        return np.ones(len(X))
29 
30 import basf2_mva
31 import basf2_mva_util
32 
33 
34 # These are the hooks you should implement
35 
36 
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    First hook invoked by the MVA framework.
    It must return a python object representing your method in memory;
    that object is then passed to every other hook function.
    Here we construct and return our MyFancyClassifier.
    @param number_of_features the total number of features
    @param number_of_spectators the total number of spectators
    @param number_of_events the total number of events
    @param training_fraction the signal fraction in the training
        (only meaningful for a classification, otherwise ignore it)
    @param parameters python object built from the json string the user supplies via m_config
    """
    return MyFancyClassifier(parameters)
49 
50 
def begin_fit(state, Xtest, Stest, ytest, wtest):
    """
    Called once per training, right after get_model.
    Training can be initialized here. A validation sample is also provided,
    usable during training whenever the user set m_training_fraction != 1.0.
    @param state the return value of get_model
    @param Xtest numpy array with the features of the validation sample
    @param Stest numpy array with the spectators of the validation sample
    @param ytest numpy array with the target values of the validation sample
    @param wtest numpy array with the weights of the validation sample

    Our method cannot fit out-of-core, so the usual approach is to attach
    arrays that accumulate whatever partial_fit receives later.
    The validation sample is ignored since our method has no use for it.
    """
    state.X, state.y = [], []
    return state
69 
70 
def partial_fit(state, X, S, y, w, epoch):
    """
    Invoked one or more times per training, depending on the configuration:
    If m_nIterations == 1 and m_mini_batch_size == 0 (the defaults)
        partial_fit is called a single time with the full training data
    If m_nIterations == 1 and m_mini_batch_size != 0
        partial_fit is called repeatedly with batches of the requested size,
        until the whole dataset has been streamed through once
    If m_nIterations > 1 and m_mini_batch_size == 0
        partial_fit is called repeatedly, each call seeing the full training data
    If m_nIterations > 1 and m_mini_batch_size != 0
        partial_fit is called repeatedly with batches of the requested size,
        until the whole dataset has been streamed m_nIterations times
    If m_nIterations == 0
        partial_fit keeps being called until it returns False
    Returning False stops the data streaming immediately.
    @param state the return value of begin_fit
    @param X numpy array with the features of the training sample
    @param S numpy array with the spectators of the training sample
    @param y numpy array with the target values of the training sample
    @param w numpy array with the weights of the training sample
    @param epoch number of partial_fit calls that happened before this one

    Streaming is not used by our method, so the incoming chunks are simply
    stashed on the state object for later.
    """
    state.y.append(y)
    state.X.append(X)
    return True
100 
101 
def end_fit(state):
    """
    Called exactly once per training to finalize it.
    The return value must be a pickable object; it is stored inside the
    weightfile and later handed back to the load hook, which must be able
    to reconstruct the estimator from it (see below).
    @param state the return value of begin_fit

    Fitting happens here. Because our state object is itself pickable we can
    return it directly. Real-world code would use a sturdier mechanism — see
    the other method implementations (e.g. tensorflow) for how to save models
    to files, read them back, and return them as a pickable object.
    """
    state.fit(state.X, state.y)
    return state
118 
119 
def feature_importance(state):
    """
    Invoked after end_fit.
    Must return a list with the importance of each feature; the list is
    stored in the weightfile where the user can read it back.
    Methods without feature-importance estimation return an empty list.
    """
    return []
128 
129 
def load(pickable_object_from_weightfile):
    """
    Invoked a single time before inference.
    @param obj the return value of end_fit, read from the weightfile and unpickled
    Must hand back a state object, which apply receives later.

    Since the state itself was pickled there is nothing left to do here.
    A real-world method might instead dump files into a temporary directory
    and rebuild its estimator from them — the other methods (e.g. tensorflow)
    show how that is done.
    """
    return pickable_object_from_weightfile
142 
143 
def apply(state, X):
    """
    Invoked once per inference.
    Must return a numpy array with the predicted values, and that array has
    to be in the correct layout (32bit float, C-style ordering)!
    The final line enforces exactly that — strongly recommended to keep it!
    @param state the return value of load
    @param X numpy array with the features to predict on
    """
    predictions = state.predict(X)
    return np.require(predictions, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
155 
156 
if __name__ == "__main__":
    """
    We have written all the necessary hooks, now we can call the mva framework as usual.
    Other Python-based frameworks like sklearn, tensorflow, xgboost, ... have predefined hooks,
    but you can overwrite all of them.
    """
    from basf2 import conditions
    # NOTE: do not use testing payloads in production! Any results obtained like this WILL NOT BE PUBLISHED
    conditions.testing_payloads = [
        'localdb/database.txt'
    ]

    # Create The GeneralOptions object as always
    variables = ['M', 'p', 'pt', 'pz',
                 'daughter(0, p)', 'daughter(0, pz)', 'daughter(0, pt)',
                 'daughter(1, p)', 'daughter(1, pz)', 'daughter(1, pt)',
                 'daughter(2, p)', 'daughter(2, pz)', 'daughter(2, pt)',
                 'chiProb', 'dr', 'dz',
                 'daughter(0, dr)', 'daughter(1, dr)',
                 'daughter(0, dz)', 'daughter(1, dz)',
                 'daughter(0, chiProb)', 'daughter(1, chiProb)', 'daughter(2, chiProb)',
                 'daughter(0, kaonID)', 'daughter(0, pionID)',
                 'daughterInvariantMass(0, 1)', 'daughterInvariantMass(0, 2)', 'daughterInvariantMass(1, 2)']

    general_options = basf2_mva.GeneralOptions()
    general_options.m_datafiles = basf2_mva.vector("train.root")
    general_options.m_treename = "tree"
    general_options.m_identifier = "MyFancyModel"
    general_options.m_variables = basf2_mva.vector(*variables)
    general_options.m_target_variable = "isSignal"

    # With the PythonOptions you can configure some details how the hook functions are called
    # I describe here every option, but there are reasonable defaults, so usually you only
    # have to set m_framework and m_steering_file
    python_options = basf2_mva.PythonOptions()

    # You have to use "custom" as framework,
    # this will raise a RuntimeError if you forgot to implement any of the hooks
    python_options.m_framework = "custom"

    # The path to the file where you implemented all the hooks,
    # in this case this is the same file where we setup the training itself,
    # but in principle it can be any file, this file will be saved in the weightfile
    # and it will be executed as soon as the weightfile is loaded! (so the above if __name__ == "__main__" is very important)
    python_options.m_steering_file = "mva/examples/python/howto_use_arbitrary_methods.py"

    # You can pass parameters to your get_model hook, in form of a json string
    # You can use json.dumps to find out the right syntax.
    # For example if you want to pass a dictionary with some parameters
    import json
    config_string = json.dumps({'A': 'Python', 'Dictionary': 'With Parameters', 'And': ['A List']})
    print("The json config string", config_string)
    python_options.m_config = config_string

    # You can split the dataset into a training sample (passed to partial_fit) and a validation sample (passed to begin_fit)
    # Here we use 70% for training and 30% as validation, default is 1.0
    python_options.m_training_fraction = 0.7

    # You can normalize the input features before passing them to begin_fit, partial_fit and apply.
    # The normalization is calculated once and saved in the weightfile.
    # Every feature is shifted to mean 0 and a standard deviation of 1
    python_options.m_normalize = False

    # As described in partial_fit, the mva package can stream the data to your method.
    # The following two parameters control the streaming.
    # If you just want the full dataset at once use the following values (which are the default values)
    python_options.m_nIterations = 1
    python_options.m_mini_batch_size = 0

    # Now you can train as usual
    # Of course you can also use the command line command basf2_mva_teacher to do so
    basf2_mva.teacher(general_options, python_options)

    # To validate your method it is convenient to use basf2_mva_util to load a trained method
    method = basf2_mva_util.Method(general_options.m_identifier)

    # Because then it is very easy to apply the method to a test file,
    # of course you can also apply the method using the MVAExpert module directly in basf2
    # Or (if you do reconstruction and not analysis) the corresponding modules.
    p, t = method.apply_expert(basf2_mva.vector("test.root"), general_options.m_treename)

    # We calculate the AUC ROC value of the returned probability and target,
    # our method is very simple, so the AUC won't be good :-)
    # BUGFIX: this assignment was missing, leaving `auc` undefined at the print below
    auc = basf2_mva_util.calculate_roc_auc(p, t)
    print("Custom Method", auc)
howto_use_arbitrary_methods.MyFancyClassifier.__init__
def __init__(self, *my_fancy_parameters)
Definition: howto_use_arbitrary_methods.py:17
basf2_mva_util.calculate_roc_auc
def calculate_roc_auc(p, t)
Definition: basf2_mva_util.py:39
basf2_mva_util.Method
Definition: basf2_mva_util.py:81
howto_use_arbitrary_methods.MyFancyClassifier.fit
def fit(self, X, y)
Definition: howto_use_arbitrary_methods.py:21
howto_use_arbitrary_methods.MyFancyClassifier.predict
def predict(self, X)
Definition: howto_use_arbitrary_methods.py:25
howto_use_arbitrary_methods.MyFancyClassifier
Definition: howto_use_arbitrary_methods.py:14