Belle II Software  release-08-01-10
Dataset.h
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 
9 #pragma once
10 #ifndef INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
11 #define INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
12 
13 #include <mva/interface/Options.h>
14 
15 #include <analysis/VariableManager/Manager.h>
16 
17 #include <TFile.h>
18 #include <TChain.h>
19 
20 #include <string>
21 
22 namespace Belle2 {
27  namespace MVA {
28 
33  class Dataset {
34 
35  public:
40  explicit Dataset(const GeneralOptions& general_options);
41 
45  virtual ~Dataset() = default;
46 
50  Dataset(const Dataset&) = delete;
51 
55  Dataset& operator=(const Dataset&) = delete;
56 
60  virtual unsigned int getNumberOfFeatures() const = 0;
61 
65  virtual unsigned int getNumberOfSpectators() const = 0;
66 
70  virtual unsigned int getNumberOfEvents() const = 0;
71 
76  virtual void loadEvent(unsigned int iEvent) = 0;
77 
81  virtual float getSignalFraction();
82 
87  virtual unsigned int getFeatureIndex(const std::string& feature);
88 
93  virtual unsigned int getSpectatorIndex(const std::string& spectator);
94 
99  virtual std::vector<float> getFeature(unsigned int iFeature);
100 
105  virtual std::vector<float> getSpectator(unsigned int iSpectator);
106 
110  virtual std::vector<float> getWeights();
111 
115  virtual std::vector<float> getTargets();
116 
120  virtual std::vector<bool> getSignals();
121 
123  std::vector<float> m_input;
124  std::vector<float> m_spectators;
125  float m_weight;
126  float m_target;
127  bool m_isSignal;
128  };
129 
130 
135  class SingleDataset : public Dataset {
136 
137  public:
145  SingleDataset(const GeneralOptions& general_options, const std::vector<float>& input, float target = 1.0,
146  const std::vector<float>& spectators = std::vector<float>());
147 
151  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
152 
156  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
157 
161  virtual unsigned int getNumberOfEvents() const override { return 1; }
162 
166  virtual void loadEvent(unsigned int) override { };
167 
172  virtual std::vector<float> getFeature(unsigned int iFeature) override { return std::vector<float> {m_input[iFeature]}; }
173 
178  virtual std::vector<float> getSpectator(unsigned int iSpectator) override { return std::vector<float> {m_spectators[iSpectator]}; }
179 
180  };
181 
186  class MultiDataset : public Dataset {
187 
188  public:
197  MultiDataset(const GeneralOptions& general_options, const std::vector<std::vector<float>>& input,
198  const std::vector<std::vector<float>>& spectators,
199  const std::vector<float>& targets = {}, const std::vector<float>& weights = {});
200 
204  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
205 
209  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
210 
214  virtual unsigned int getNumberOfEvents() const override { return m_matrix.size(); }
215 
219  virtual void loadEvent(unsigned int iEvent) override;
220 
221 
222  private:
223  std::vector<std::vector<float>> m_matrix;
224  std::vector<std::vector<float>> m_spectator_matrix;
225  std::vector<float> m_targets;
226  std::vector<float> m_weights;
228  };
229 
234  class SubDataset : public Dataset {
235 
236  public:
243  SubDataset(const GeneralOptions& general_options, const std::vector<bool>& events, Dataset& dataset);
244 
248  virtual unsigned int getNumberOfFeatures() const override { return m_feature_indices.size(); }
249 
253  virtual unsigned int getNumberOfSpectators() const override { return m_spectator_indices.size(); }
254 
258  virtual unsigned int getNumberOfEvents() const override { return m_use_event_indices ? m_event_indices.size() : m_dataset.getNumberOfEvents(); }
259 
264  virtual void loadEvent(unsigned int iEvent) override;
265 
270  virtual std::vector<float> getFeature(unsigned int iFeature) override;
271 
276  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
277 
278  private:
279  bool m_use_event_indices = false;
280  std::vector<unsigned int>
282  std::vector<unsigned int>
284  std::vector<unsigned int>
288  };
289 
294  class CombinedDataset : public Dataset {
295 
296  public:
303  CombinedDataset(const GeneralOptions& general_options, Dataset& signal_dataset, Dataset& background_dataset);
304 
308  virtual unsigned int getNumberOfFeatures() const override { return m_signal_dataset.getNumberOfFeatures(); }
309 
313  virtual unsigned int getNumberOfSpectators() const override { return m_signal_dataset.getNumberOfSpectators(); }
314 
318  virtual unsigned int getNumberOfEvents() const override { return m_signal_dataset.getNumberOfEvents() + m_background_dataset.getNumberOfEvents(); }
319 
324  virtual void loadEvent(unsigned int iEvent) override;
325 
330  virtual std::vector<float> getFeature(unsigned int iFeature) override;
331 
336  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
337 
338  private:
342  };
343 
344 
349  class ROOTDataset : public Dataset {
350 
351  public:
356  explicit ROOTDataset(const GeneralOptions& _general_options);
357 
361  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
362 
366  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
367 
371  virtual unsigned int getNumberOfEvents() const override
372  {
373  return (m_general_options.m_max_events == 0) ? m_tree->GetEntries() : m_general_options.m_max_events;
374  }
375 
380  virtual void loadEvent(unsigned int event) override;
381 
386  virtual std::vector<float> getFeature(unsigned int iFeature) override;
387 
391  virtual std::vector<float> getWeights() override;
392 
397  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
398 
402  virtual ~ROOTDataset();
403 
404  protected:
406  typedef std::variant<double, float, int, bool> RootDatasetVarVariant;
407 
408  TChain* m_tree = nullptr;
409  std::vector<RootDatasetVarVariant> m_input_variant;
410  std::vector<RootDatasetVarVariant>
416  private:
417 
427  template<class T>
428  std::vector<float> getVectorFromTTree(const std::string& variableType, const std::string& branchName, T& memberVariableTarget);
429 
438  std::vector<float> getVectorFromTTreeVariant(const std::string& variableType, const std::string& branchName,
439  RootDatasetVarVariant& memberVariableTarget);
440 
444  void setRootInputType();
445 
453  template<class T>
454  void setScalarVariableAddress(const std::string& variableType, const std::string& variableName, T& variableTarget);
455 
462  void setScalarVariableAddressVariant(const std::string& variableType, const std::string& variableName,
463  RootDatasetVarVariant& variableTarget);
464 
472  template<class T>
473  void setVectorVariableAddress(const std::string& variableType, const std::vector<std::string>& variableName,
474  T& variableTargets);
475 
482  void setVectorVariableAddressVariant(const std::string& variableType, const std::vector<std::string>& variableName,
483  std::vector<RootDatasetVarVariant>& varVariantTargets);
484 
489 
490 
494  void setBranchAddresses();
495 
501  bool checkForBranch(TTree*, const std::string&) const;
502 
508 
514  void initialiseVarVariantType(const std::string, RootDatasetVarVariant&);
515 
522  void initialiseVarVariantForBranch(const std::string, RootDatasetVarVariant&);
523  };
524  }
526 }
527 #endif
Wraps two other Datasets, one containing signal, the other background events. Used by the reweighting ...
Definition: Dataset.h:294
CombinedDataset(const GeneralOptions &general_options, Dataset &signal_dataset, Dataset &background_dataset)
Constructs a new CombinedDataset holding a reference to the wrapped Datasets.
Definition: Dataset.cc:273
virtual unsigned int getNumberOfEvents() const override
Returns the total number of events, i.e. the sum of the events in the wrapped signal and background datasets.
Definition: Dataset.h:318
Dataset & m_background_dataset
Reference to the wrapped dataset containing background events.
Definition: Dataset.h:340
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:306
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:296
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:277
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, as reported by the wrapped signal dataset.
Definition: Dataset.h:313
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, as reported by the wrapped signal dataset.
Definition: Dataset.h:308
Dataset & m_signal_dataset
Reference to the wrapped dataset containing signal events.
Definition: Dataset.h:339
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
virtual unsigned int getNumberOfSpectators() const =0
Returns the number of spectators in this dataset.
Dataset(const Dataset &)=delete
Specify no copy constructor.
virtual std::vector< bool > getSignals()
Returns all isSignal flags.
Definition: Dataset.cc:122
virtual unsigned int getNumberOfFeatures() const =0
Returns the number of features in this dataset.
Dataset & operator=(const Dataset &)=delete
Specify no assignment operator.
virtual unsigned int getFeatureIndex(const std::string &feature)
Return index of feature with the given name.
Definition: Dataset.cc:50
virtual std::vector< float > getSpectator(unsigned int iSpectator)
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:86
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:124
virtual std::vector< float > getTargets()
Returns all targets.
Definition: Dataset.cc:110
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
GeneralOptions m_general_options
GeneralOptions passed to this dataset.
Definition: Dataset.h:122
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:123
Dataset(const GeneralOptions &general_options)
Constructs a new dataset given the general options.
Definition: Dataset.cc:26
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
virtual std::vector< float > getWeights()
Returns all weights.
Definition: Dataset.cc:98
virtual float getSignalFraction()
Returns the signal fraction of the whole sample.
Definition: Dataset.cc:35
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:127
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:125
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:126
virtual ~Dataset()=default
Virtual default destructor.
General options which are shared by all MVA trainings.
Definition: Options.h:62
unsigned int m_max_events
Maximum number of events to process, 0 means all.
Definition: Options.h:92
Wraps the data of multiple events into a Dataset.
Definition: Dataset.h:186
std::vector< float > m_weights
weight vector
Definition: Dataset.h:226
std::vector< std::vector< float > > m_matrix
Feature matrix.
Definition: Dataset.h:223
std::vector< std::vector< float > > m_spectator_matrix
Spectator matrix.
Definition: Dataset.h:224
MultiDataset(const GeneralOptions &general_options, const std::vector< std::vector< float >> &input, const std::vector< std::vector< float >> &spectators, const std::vector< float > &targets={}, const std::vector< float > &weights={})
Constructs a new MultiDataset.
Definition: Dataset.cc:145
std::vector< float > m_targets
target vector
Definition: Dataset.h:225
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:214
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent.
Definition: Dataset.cc:168
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:209
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:204
Provides a dataset from a ROOT file. This is the usually used dataset providing training data to the m...
Definition: Dataset.h:349
void setScalarVariableAddress(const std::string &variableType, const std::string &variableName, T &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:515
void setBranchAddresses()
Sets the branch addresses of all features, weight and target again.
Definition: Dataset.cc:568
void setTargetRootInputType()
Determines the data type of the target variable and sets it to m_target_data_type.
void setScalarVariableAddressVariant(const std::string &variableType, const std::string &variableName, RootDatasetVarVariant &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:536
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:371
void initialiseVarVariantForBranch(const std::string, RootDatasetVarVariant &)
Infers the type (double,float,int,bool) from the TTree and initialises the VarVariant with the correc...
Definition: Dataset.cc:615
TChain * m_tree
Pointer to the TChain containing the data.
Definition: Dataset.h:408
virtual void loadEvent(unsigned int event) override
Load the event number iEvent from the TTree.
Definition: Dataset.cc:391
void setVectorVariableAddressVariant(const std::string &variableType, const std::vector< std::string > &variableName, std::vector< RootDatasetVarVariant > &varVariantTargets)
sets the branch address for a vector of VarVariant to a given target
Definition: Dataset.cc:560
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:440
std::vector< float > getVectorFromTTree(const std::string &variableType, const std::string &branchName, T &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:474
void initialiseVarVariantType(const std::string, RootDatasetVarVariant &)
Initialises the VarVariant.
Definition: Dataset.cc:598
std::vector< RootDatasetVarVariant > m_spectators_variant
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:411
RootDatasetVarVariant m_target_variant
Contains the target value of the currently loaded event.
Definition: Dataset.h:413
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:429
virtual std::vector< float > getWeights() override
Returns all values of the weights in a std::vector<float>
Definition: Dataset.cc:408
std::vector< RootDatasetVarVariant > m_input_variant
Contains all feature values of the currently loaded event.
Definition: Dataset.h:409
ROOTDataset(const GeneralOptions &_general_options)
Creates a new ROOTDataset.
Definition: Dataset.cc:316
void setRootInputType()
Tries to infer the data-type of the spectator and feature variables in a root file.
Definition: Dataset.cc:632
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:366
bool checkForBranch(TTree *, const std::string &) const
Checks if the given branchname exists in the TTree.
Definition: Dataset.cc:507
void setVectorVariableAddress(const std::string &variableType, const std::vector< std::string > &variableName, T &variableTargets)
sets the branch address for a vector variable to a given target
Definition: Dataset.cc:552
virtual ~ROOTDataset()
Virtual destructor.
Definition: Dataset.cc:452
float castVarVariantToFloat(RootDatasetVarVariant &) const
Casts a VarVariant which can contain <double,int,bool,float> to float.
Definition: Dataset.cc:376
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:361
std::variant< double, float, int, bool > RootDatasetVarVariant
Typedef for variable types supported by the mva ROOTDataset, can be one of double,...
Definition: Dataset.h:406
std::vector< float > getVectorFromTTreeVariant(const std::string &variableType, const std::string &branchName, RootDatasetVarVariant &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:458
RootDatasetVarVariant m_weight_variant
Contains the weight of the currently loaded event.
Definition: Dataset.h:412
Wraps the data of a single event into a Dataset.
Definition: Dataset.h:135
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset which is always one.
Definition: Dataset.h:161
SingleDataset(const GeneralOptions &general_options, const std::vector< float > &input, float target=1.0, const std::vector< float > &spectators=std::vector< float >())
Constructs a new SingleDataset.
Definition: Dataset.cc:135
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values (in this case only one) of one feature in a std::vector<float>
Definition: Dataset.h:172
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values (in this case only one) of one spectator in a std::vector<float>
Definition: Dataset.h:178
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:156
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:151
virtual void loadEvent(unsigned int) override
Does nothing in the case of a single dataset, because the only event is already loaded.
Definition: Dataset.h:166
Wraps another Dataset and provides a view to a subset of its features and events.
Definition: Dataset.h:234
Dataset & m_dataset
Reference to the wrapped dataset.
Definition: Dataset.h:286
SubDataset(const GeneralOptions &general_options, const std::vector< bool > &events, Dataset &dataset)
Constructs a new SubDataset holding a reference to the wrapped Dataset.
Definition: Dataset.cc:186
virtual unsigned int getNumberOfEvents() const override
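Returns the number of events in this dataset: the size of the event subset if event indices are used, otherwise the number of events in the wrapped dataset.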
Definition: Dataset.h:258
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:259
std::vector< unsigned int > m_feature_indices
Mapping from the position of a feature in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:281
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:245
std::vector< unsigned int > m_spectator_indices
Mapping from the position of a spectator in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:283
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:225
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, so the size of the given subset of the spectators.
Definition: Dataset.h:253
std::vector< unsigned int > m_event_indices
Mapping from the position of an event in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:285
bool m_use_event_indices
Use only a subset of the wrapped dataset events.
Definition: Dataset.h:279
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, so the size of the given subset of the variables.
Definition: Dataset.h:248
Abstract base class for different kinds of events.