# Belle II Software  light-2212-foldex
# hep_ml.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 import numpy as np
13 
14 try:
15  import pandas
16 except ImportError:
17  print("Please install pandas: pip3 install pandas")
18  import sys
19  sys.exit(1)
20 
21 try:
22  import hep_ml
23  import hep_ml.uboost
24 except ImportError:
25  print("Please install hep_ml: pip3 install hep_ml")
26  import sys
27  sys.exit(1)
28 
import collections
import collections.abc

from basf2 import B2WARNING
31 
32 
class State(object):
    """
    hep_ml state: thin pickable container for the estimator so it can be
    stored and restored by the MVA framework.
    """

    def __init__(self, estimator=None):
        """Constructor of the state object.

        @param estimator pickable sklearn-style estimator
            (e.g. hep_ml.uboost.uBoostClassifier), or None before training
        """
        #: Pickable sklearn estimator
        # Fixed: was the garbled `self.estimatorestimator`, which left
        # `state.estimator` (read by apply() and end_fit()) undefined.
        self.estimator = estimator
42 
43 
def get_model(number_of_features, number_of_spectators, number_of_events, training_fraction, parameters):
    """
    Create a hep_ml uBoost classifier and store it in a State object.

    The features are used as train_features and the spectators as
    uniform_features of the uBoostClassifier.  Additional parameters can be
    passed as a json-encoded dictionary via m_config:
      * the value of the key 'base_estimator' is passed as keyword arguments
        to DecisionTreeClassifier,
      * all other keys are passed as keyword arguments to uBoostClassifier.

    @param number_of_features number of feature columns (the first columns of the data)
    @param number_of_spectators number of spectator columns (appended after the features)
    @param number_of_events total number of training events (unused here)
    @param training_fraction fraction of events used for training (unused here)
    @param parameters user parameter dictionary, or None
    @return State object wrapping the untrained classifier
    """
    # `collections.Mapping` was removed in Python 3.10 -- use the abc module.
    # Copy the mapping so the caller's dictionary is never mutated.
    if isinstance(parameters, collections.abc.Mapping):
        parameters = dict(parameters)
    else:
        parameters = None

    if parameters is not None and 'base_estimator' in parameters:
        base_tree = hep_ml.uboost.DecisionTreeClassifier(**parameters.pop('base_estimator'))
    else:
        base_tree = hep_ml.uboost.DecisionTreeClassifier(max_depth=3)

    # Column layout: features first, spectators directly after (see end_fit).
    train_features = list(range(number_of_features))
    uniform_features = [number_of_features + i for i in range(number_of_spectators)]

    if parameters is not None:
        parameters.setdefault('uniform_label', [0, 1])
        parameters['train_features'] = train_features
        parameters['uniform_features'] = uniform_features
        clf = hep_ml.uboost.uBoostClassifier(base_estimator=base_tree, **parameters)
    else:
        clf = hep_ml.uboost.uBoostClassifier(uniform_features=uniform_features, uniform_label=[0, 1],
                                             base_estimator=base_tree, train_features=train_features)
    return State(clf)
72 
73 
def feature_importance(state):
    """
    Return the feature importances of the trained method.
    Not supported for hep_ml, so an empty list is returned.
    """
    return []
79 
80 
def load(obj):
    """
    Wrap a previously serialized estimator in a fresh State object.
    """
    return State(obj)
86 
87 
def apply(state, X):
    """
    Apply the stored estimator to the passed data.
    Uses predict_proba (signal-class column) when available,
    otherwise falls back to plain predict.
    """
    frame = pandas.DataFrame(X)
    estimator = state.estimator
    if hasattr(estimator, 'predict_proba'):
        scores = estimator.predict_proba(frame)[:, 1]
    else:
        scores = estimator.predict(frame)
    return np.require(scores, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
99 
100 
def begin_fit(state, Xtest, Stest, ytest, wtest, nBatches):
    """
    Prepare empty containers on the state which will accumulate the
    training data delivered batch-wise via partial_fit.
    """
    for attribute in ('X', 'S', 'y', 'w'):
        setattr(state, attribute, [])
    return state
110 
111 
def partial_fit(state, X, S, y, w, epoch, batch):
    """
    Store one batch of training data on the state.
    hep_ml cannot train incrementally, so data is only collected here and
    the actual fit happens in end_fit.
    """
    if epoch > 0:
        B2WARNING("The hep_ml training interface has been called with specific_options.m_nIterations > 1."
                  " This means duplicates of the training sample will be used during training.")

    for container, batch_data in ((state.X, X),
                                  (state.S, S),
                                  (state.y, y.flatten()),
                                  (state.w, w.flatten())):
        container.append(batch_data)
    return True
126 
127 
def end_fit(state):
    """
    Merge the collected batches and fit the estimator.
    Features and spectators are stacked side by side into one DataFrame
    (features first, spectators after), matching the column indices used
    in get_model.
    """
    features = np.vstack(state.X)
    spectators = np.vstack(state.S)
    data = pandas.DataFrame(np.hstack([features, spectators]))
    labels = np.hstack(state.y)
    weights = np.hstack(state.w)
    state.estimator = state.estimator.fit(data, labels, weights)
    return state.estimator
# Doxygen cross-reference residue from extraction:
#   State.estimator -- pickable sklearn estimator (hep_ml.py:41)
#   State.__init__(self, estimator=None) -- hep_ml.py:38