Belle II Software  release-06-00-14
Dataset.h
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 
9 #pragma once
10 #ifndef INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
11 #define INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
12 
13 #include <mva/interface/Options.h>
14 
15 #include <TFile.h>
16 #include <TChain.h>
17 
18 #include <string>
19 
20 namespace Belle2 {
25  namespace MVA {
26 
31  class Dataset {
    /**
     * Abstract base class of all Datasets given to the MVA interface.
     * The values of the currently loaded event are exposed through the
     * public members m_input, m_spectators, m_weight, m_target and m_isSignal.
     */
32 
33  public:
    /**
     * Constructs a new dataset given the general options.
     * @param general_options shared options of all MVA trainings
     */
38  explicit Dataset(const GeneralOptions& general_options);
39 
    /** Virtual default destructor. */
43  virtual ~Dataset() = default;
44 
    /** Specify no copy constructor. */
48  Dataset(const Dataset&) = delete;
49 
    /** Specify no assignment operator. */
53  Dataset& operator=(const Dataset&) = delete;
54 
    /** Returns the number of features in this dataset. */
58  virtual unsigned int getNumberOfFeatures() const = 0;
59 
    /** Returns the number of spectators in this dataset. */
63  virtual unsigned int getNumberOfSpectators() const = 0;
64 
    /** Returns the number of events in this dataset. */
68  virtual unsigned int getNumberOfEvents() const = 0;
69 
    /**
     * Load the event number iEvent.
     * @param iEvent index of the event which should be loaded
     */
74  virtual void loadEvent(unsigned int iEvent) = 0;
75 
    /** Returns the signal fraction of the whole sample. */
79  virtual float getSignalFraction();
80 
    /**
     * Return index of feature with the given name.
     * @param feature name of the feature
     */
85  virtual unsigned int getFeatureIndex(const std::string& feature);
86 
    /**
     * Return index of spectator with the given name.
     * @param spectator name of the spectator
     */
91  virtual unsigned int getSpectatorIndex(const std::string& spectator);
92 
    /**
     * Returns all values of one feature in a std::vector<float>.
     * @param iFeature index of the feature
     */
97  virtual std::vector<float> getFeature(unsigned int iFeature);
98 
    /**
     * Returns all values of one spectator in a std::vector<float>.
     * @param iSpectator index of the spectator
     */
103  virtual std::vector<float> getSpectator(unsigned int iSpectator);
104 
    /** Returns all weights. */
108  virtual std::vector<float> getWeights();
109 
    /** Returns all targets. */
113  virtual std::vector<float> getTargets();
114 
    /** Returns all isSignal values. */
118  virtual std::vector<bool> getSignals();
119 
    /*
     * NOTE(review): this listing skips original line 120; according to the
     * cross-reference index it declares `GeneralOptions m_general_options`,
     * the GeneralOptions passed to this dataset.
     */
    /** Contains all feature values of the currently loaded event. */
121  std::vector<float> m_input;
    /** Contains all spectators values of the currently loaded event. */
122  std::vector<float> m_spectators;
    /** Contains the weight of the currently loaded event. */
123  float m_weight;
    /** Contains the target value of the currently loaded event. */
124  float m_target;
    /** Defines if the currently loaded event is signal or background. */
125  bool m_isSignal;
126  };
127 
128 
133  class SingleDataset : public Dataset {
    /** Wraps the data of a single event into a Dataset. */
134 
135  public:
    /**
     * Constructs a new SingleDataset.
     * @param general_options shared options of all MVA trainings
     * @param input feature values (presumably of the single event; constructor body is not shown here)
     * @param target target value, defaults to 1.0
     * @param spectators spectator values, defaults to an empty vector
     */
143  SingleDataset(const GeneralOptions& general_options, const std::vector<float>& input, float target = 1.0,
144  const std::vector<float>& spectators = std::vector<float>());
145 
    /** Returns the number of features in this dataset (size of m_input). */
149  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
150 
    /** Returns the number of spectators in this dataset (size of m_spectators). */
154  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
155 
    /** Returns the number of events in this dataset which is always one. */
159  virtual unsigned int getNumberOfEvents() const override { return 1; }
160 
    /** Does nothing in the case of a single dataset, because the only event is already loaded. */
164  virtual void loadEvent(unsigned int) override { };
165 
    /**
     * Returns all values (in this case only one) of one feature in a std::vector<float>.
     * @param iFeature index of the feature; used to index m_input without a bounds check
     */
170  virtual std::vector<float> getFeature(unsigned int iFeature) override { return std::vector<float> {m_input[iFeature]}; }
171 
    /**
     * Returns all values (in this case only one) of one spectator in a std::vector<float>.
     * @param iSpectator index of the spectator; used to index m_spectators without a bounds check
     */
176  virtual std::vector<float> getSpectator(unsigned int iSpectator) override { return std::vector<float> {m_spectators[iSpectator]}; }
177 
178  };
179 
184  class MultiDataset : public Dataset {
    /** Wraps the data of multiple events into a Dataset. */
185 
186  public:
    /**
     * Constructs a new MultiDataset.
     * @param general_options shared options of all MVA trainings
     * @param input feature values, one inner vector per entry (presumably per event; constructor body is not shown here)
     * @param spectators spectator values, same layout as input (TODO confirm)
     * @param targets target values, defaults to an empty vector
     * @param weights weight values, defaults to an empty vector
     */
195  MultiDataset(const GeneralOptions& general_options, const std::vector<std::vector<float>>& input,
196  const std::vector<std::vector<float>>& spectators,
197  const std::vector<float>& targets = {}, const std::vector<float>& weights = {});
198 
    /** Returns the number of features in this dataset (size of m_input). */
202  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
203 
    /** Returns the number of spectators in this dataset (size of m_spectators). */
207  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
208 
    /** Returns the number of events in this dataset (size of m_matrix). */
212  virtual unsigned int getNumberOfEvents() const override { return m_matrix.size(); }
213 
    /**
     * Load the event number iEvent (defined out of line).
     * @param iEvent index of the event which should be loaded
     */
217  virtual void loadEvent(unsigned int iEvent) override;
218 
219 
220  private:
    /** Feature matrix. */
221  std::vector<std::vector<float>> m_matrix;
    /** Spectator matrix. */
222  std::vector<std::vector<float>> m_spectator_matrix;
    /** Target vector. */
223  std::vector<float> m_targets;
    /** Weight vector. */
224  std::vector<float> m_weights;
226  };
227 
232  class SubDataset : public Dataset {
    /** Wraps another Dataset and provides a view to a subset of its features and events. */
233 
234  public:
    /**
     * Constructs a new SubDataset holding a reference to the wrapped Dataset.
     * @param general_options shared options of all MVA trainings
     * @param events presumably a per-event flag selecting the events to use (TODO confirm against Dataset.cc)
     * @param dataset the wrapped dataset
     */
241  SubDataset(const GeneralOptions& general_options, const std::vector<bool>& events, Dataset& dataset);
242 
    /** Returns the number of features in this dataset, so the size of the given subset of the variables. */
246  virtual unsigned int getNumberOfFeatures() const override { return m_feature_indices.size(); }
247 
    /** Returns the number of spectators in this dataset, so the size of the given subset of the spectators. */
251  virtual unsigned int getNumberOfSpectators() const override { return m_spectator_indices.size(); }
252 
    /**
     * Returns the number of events in this dataset: the size of the event subset
     * if m_use_event_indices is set, otherwise the number of events of the wrapped dataset.
     */
256  virtual unsigned int getNumberOfEvents() const override { return m_use_event_indices ? m_event_indices.size() : m_dataset.getNumberOfEvents(); }
257 
    /**
     * Load the event number iEvent from the wrapped dataset.
     * @param iEvent index of the event which should be loaded
     */
262  virtual void loadEvent(unsigned int iEvent) override;
263 
    /**
     * Returns all values of one feature in a std::vector<float> of the wrapped dataset.
     * @param iFeature index of the feature
     */
268  virtual std::vector<float> getFeature(unsigned int iFeature) override;
269 
    /**
     * Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
     * @param iSpectator index of the spectator
     */
274  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
275 
276  private:
    /** Use only a subset of the wrapped dataset events. */
277  bool m_use_event_indices = false;
    /*
     * NOTE(review): the listing lost the declarator lines of the next three
     * members. According to the cross-reference index they are, in order:
     *   m_feature_indices   (line 279) - mapping from the position of a feature in the given subset to its position in the wrapped dataset
     *   m_spectator_indices (line 281) - mapping from the position of a spectator in the given subset to its position in the wrapped dataset
     *   m_event_indices     (line 283) - mapping from the position of an event in the given subset to its position in the wrapped dataset
     * followed by `Dataset& m_dataset` (line 284), the reference to the wrapped dataset.
     */
278  std::vector<unsigned int>
280  std::vector<unsigned int>
282  std::vector<unsigned int>
286  };
287 
292  class CombinedDataset : public Dataset {
    /** Wraps two other Datasets, one containing signal, the other background events. */
293 
294  public:
    /**
     * Constructs a new CombinedDataset holding a reference to the wrapped Datasets.
     * @param general_options shared options of all MVA trainings
     * @param signal_dataset dataset containing signal events
     * @param background_dataset dataset containing background events
     */
301  CombinedDataset(const GeneralOptions& general_options, Dataset& signal_dataset, Dataset& background_dataset);
302 
    /** Returns the number of features, taken from the wrapped signal dataset. */
306  virtual unsigned int getNumberOfFeatures() const override { return m_signal_dataset.getNumberOfFeatures(); }
307 
    /** Returns the number of spectators, taken from the wrapped signal dataset. */
311  virtual unsigned int getNumberOfSpectators() const override { return m_signal_dataset.getNumberOfSpectators(); }
312 
    /** Returns the sum of the number of events in the wrapped signal and background datasets. */
316  virtual unsigned int getNumberOfEvents() const override { return m_signal_dataset.getNumberOfEvents() + m_background_dataset.getNumberOfEvents(); }
317 
    /**
     * Load the event number iEvent from the wrapped dataset.
     * @param iEvent index of the event which should be loaded
     */
322  virtual void loadEvent(unsigned int iEvent) override;
323 
    /**
     * Returns all values of one feature in a std::vector<float> of the wrapped dataset.
     * @param iFeature index of the feature
     */
328  virtual std::vector<float> getFeature(unsigned int iFeature) override;
329 
    /**
     * Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
     * @param iSpectator index of the spectator
     */
334  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
335 
336  private:
    /*
     * NOTE(review): the listing lost the member declarations here. According to
     * the cross-reference index they are `Dataset& m_signal_dataset` (line 337,
     * reference to the wrapped dataset containing signal events) and
     * `Dataset& m_background_dataset` (line 338, reference to the wrapped
     * dataset containing background events).
     */
340  };
341 
342 
347  class ROOTDataset : public Dataset {
    /** Provides a dataset from a ROOT file. */
348 
349  public:
    /**
     * Creates a new ROOTDataset.
     * @param _general_options shared options of all MVA trainings
     */
354  explicit ROOTDataset(const GeneralOptions& _general_options);
355 
    /** Returns the number of features in this dataset (size of m_input). */
359  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
360 
    /** Returns the number of spectators in this dataset (size of m_spectators). */
364  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
365 
    /**
     * Returns the number of events in this dataset.
     * If m_max_events is 0 (meaning "all") the number of entries of the tree is
     * returned, otherwise m_max_events itself is returned.
     * NOTE(review): m_max_events is not compared against the number of tree
     * entries in this function - confirm it is validated elsewhere.
     */
369  virtual unsigned int getNumberOfEvents() const override
370  {
371  return (m_general_options.m_max_events == 0) ? m_tree->GetEntries() : m_general_options.m_max_events;
372  }
373 
    /**
     * Load the given event number from the TTree.
     * @param event index of the event which should be loaded
     */
378  virtual void loadEvent(unsigned int event) override;
379 
    /**
     * Returns all values of one feature in a std::vector<float>.
     * @param iFeature index of the feature
     */
384  virtual std::vector<float> getFeature(unsigned int iFeature) override;
385 
    /** Returns all values of the weights in a std::vector<float>. */
389  virtual std::vector<float> getWeights() override;
390 
    /**
     * Returns all values of one spectator in a std::vector<float>.
     * @param iSpectator index of the spectator
     */
395  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
396 
    /**
     * Returns all values for a specified variableType and branchName.
     * @param variableType type of the variable
     * @param branchName name of the branch to read
     * @param memberVariableTarget member variable used as the read target
     */
406  template<class T>
407  std::vector<float> getVectorFromTTree(std::string& variableType, std::string& branchName, T& memberVariableTarget);
408 
    /** Tries to infer the data-type of a root file and sets m_isDoubleInputType. */
412  void setRootInputType();
413 
    /**
     * Sets the branch address for a scalar variable to a given target.
     * @param variableType type of the variable
     * @param variableName name of the variable
     * @param variableTarget variable the branch address is set to
     */
421  template<class T>
422  void setScalarVariableAddress(std::string& variableType, std::string& variableName, T& variableTarget);
423 
    /**
     * Sets the branch address for a vector variable to a given target.
     * @param variableType type of the variables
     * @param variableName names of the variables
     * @param variableTargets variables the branch addresses are set to
     */
431  template<class T>
432  void setVectorVariableAddress(std::string& variableType, std::vector<std::string>& variableName,
433  T& variableTargets);
434 
    /** Virtual destructor (defined out of line). */
438  virtual ~ROOTDataset();
439 
440 
441  private:
    /** Sets the branch addresses of all features, weight and target again. */
445  void setBranchAddresses();
446 
    /** Checks if the given branchname exists in the TTree. */
452  bool checkForBranch(TTree*, const std::string&) const;
453 
454  protected:
    /** Pointer to the TChain containing the data. */
455  TChain* m_tree = nullptr;
    /** Defines the expected datatype in the ROOT file. */
456  bool m_isDoubleInputType = true;
    /** Contains all feature values of the currently loaded event. */
457  std::vector<double> m_input_double;
    /** Contains all spectators values of the currently loaded event. */
458  std::vector<double> m_spectators_double;
    /*
     * NOTE(review): the listing skips original lines 459 and 460; according to
     * the cross-reference index they declare `double m_weight_double` (weight
     * of the currently loaded event) and `double m_target_double` (target
     * value of the currently loaded event).
     */
461  };
462 
463  }
465 }
466 #endif
Wraps two other Datasets, one containing signal, the other background events. Used by the reweighting ...
Definition: Dataset.h:292
CombinedDataset(const GeneralOptions &general_options, Dataset &signal_dataset, Dataset &background_dataset)
Constructs a new CombinedDataset holding a reference to the wrapped Datasets.
Definition: Dataset.cc:273
virtual unsigned int getNumberOfEvents() const override
Returns the total number of events in the wrapped signal and background datasets.
Definition: Dataset.h:316
Dataset & m_background_dataset
Reference to the wrapped dataset containing background events.
Definition: Dataset.h:338
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:306
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:296
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:277
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, taken from the wrapped signal dataset.
Definition: Dataset.h:311
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, taken from the wrapped signal dataset.
Definition: Dataset.h:306
Dataset & m_signal_dataset
Reference to the wrapped dataset containing signal events.
Definition: Dataset.h:337
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
Definition: Dataset.h:31
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
virtual unsigned int getNumberOfSpectators() const =0
Returns the number of spectators in this dataset.
Dataset(const Dataset &)=delete
Specify no copy constructor.
virtual std::vector< bool > getSignals()
Returns the isSignal values of all events.
Definition: Dataset.cc:122
virtual unsigned int getNumberOfFeatures() const =0
Returns the number of features in this dataset.
Dataset & operator=(const Dataset &)=delete
Specify no assignment operator.
virtual unsigned int getFeatureIndex(const std::string &feature)
Return index of feature with the given name.
Definition: Dataset.cc:50
virtual std::vector< float > getSpectator(unsigned int iSpectator)
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:86
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:122
virtual std::vector< float > getTargets()
Returns all targets.
Definition: Dataset.cc:110
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
GeneralOptions m_general_options
GeneralOptions passed to this dataset.
Definition: Dataset.h:120
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:121
Dataset(const GeneralOptions &general_options)
Constructs a new dataset given the general options.
Definition: Dataset.cc:26
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
virtual std::vector< float > getWeights()
Returns all weights.
Definition: Dataset.cc:98
virtual float getSignalFraction()
Returns the signal fraction of the whole sample.
Definition: Dataset.cc:35
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:125
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:123
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:124
virtual ~Dataset()=default
Virtual default destructor.
General options which are shared by all MVA trainings.
Definition: Options.h:62
unsigned int m_max_events
Maximum number of events to process, 0 means all.
Definition: Options.h:91
Wraps the data of multiple events into a Dataset.
Definition: Dataset.h:184
std::vector< float > m_weights
weight vector
Definition: Dataset.h:224
std::vector< std::vector< float > > m_matrix
Feature matrix.
Definition: Dataset.h:221
std::vector< std::vector< float > > m_spectator_matrix
Spectator matrix.
Definition: Dataset.h:222
MultiDataset(const GeneralOptions &general_options, const std::vector< std::vector< float >> &input, const std::vector< std::vector< float >> &spectators, const std::vector< float > &targets={}, const std::vector< float > &weights={})
Constructs a new MultiDataset.
Definition: Dataset.cc:145
std::vector< float > m_targets
target vector
Definition: Dataset.h:223
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:212
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent.
Definition: Dataset.cc:168
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:207
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:202
Provides a dataset from a ROOT file. This is the usually used dataset providing training data to the m...
Definition: Dataset.h:347
void setBranchAddresses()
Sets the branch addresses of all features, weight and target again.
Definition: Dataset.cc:527
void setVectorVariableAddress(std::string &variableType, std::vector< std::string > &variableName, T &variableTargets)
sets the branch address for a vector variable to a given target
Definition: Dataset.cc:520
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:369
TChain * m_tree
Pointer to the TChain containing the data.
Definition: Dataset.h:455
double m_target_double
Contains the target value of the currently loaded event.
Definition: Dataset.h:460
virtual void loadEvent(unsigned int event) override
Load the event number iEvent from the TTree.
Definition: Dataset.cc:375
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:434
std::vector< double > m_spectators_double
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:458
double m_weight_double
Contains the weight of the currently loaded event.
Definition: Dataset.h:459
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:418
void setScalarVariableAddress(std::string &variableType, std::string &variableName, T &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:498
virtual std::vector< float > getWeights() override
Returns all values of the weights in a std::vector<float>
Definition: Dataset.cc:392
ROOTDataset(const GeneralOptions &_general_options)
Creates a new ROOTDataset.
Definition: Dataset.cc:316
void setRootInputType()
Tries to infer the data-type of a root file and sets m_isDoubleInputType.
Definition: Dataset.cc:578
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:364
std::vector< double > m_input_double
Contains all feature values of the currently loaded event.
Definition: Dataset.h:457
bool checkForBranch(TTree *, const std::string &) const
Checks if the given branchname exists in the TTree.
Definition: Dataset.cc:490
std::vector< float > getVectorFromTTree(std::string &variableType, std::string &branchName, T &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:457
virtual ~ROOTDataset()
Virtual destructor.
Definition: Dataset.cc:450
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:359
bool m_isDoubleInputType
Defines the expected datatype in the ROOT file.
Definition: Dataset.h:456
Wraps the data of a single event into a Dataset.
Definition: Dataset.h:133
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset which is always one.
Definition: Dataset.h:159
SingleDataset(const GeneralOptions &general_options, const std::vector< float > &input, float target=1.0, const std::vector< float > &spectators=std::vector< float >())
Constructs a new SingleDataset.
Definition: Dataset.cc:135
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values (in this case only one) of one feature in a std::vector<float>
Definition: Dataset.h:170
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values (in this case only one) of one spectator in a std::vector<float>
Definition: Dataset.h:176
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:154
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:149
virtual void loadEvent(unsigned int) override
Does nothing in the case of a single dataset, because the only event is already loaded.
Definition: Dataset.h:164
Wraps another Dataset and provides a view to a subset of its features and events.
Definition: Dataset.h:232
Dataset & m_dataset
Reference to the wrapped dataset.
Definition: Dataset.h:284
SubDataset(const GeneralOptions &general_options, const std::vector< bool > &events, Dataset &dataset)
Constructs a new SubDataset holding a reference to the wrapped Dataset.
Definition: Dataset.cc:186
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset: the size of the event subset if one is used, otherwise the number of events in the wrapped dataset.
Definition: Dataset.h:256
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:259
std::vector< unsigned int > m_feature_indices
Mapping from the position of a feature in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:279
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:245
std::vector< unsigned int > m_spectator_indices
Mapping from the position of a spectator in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:281
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:225
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, so the size of the given subset of the spectators.
Definition: Dataset.h:251
std::vector< unsigned int > m_event_indices
Mapping from the position of an event in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:283
bool m_use_event_indices
Use only a subset of the wrapped dataset events.
Definition: Dataset.h:277
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, so the size of the given subset of the variables.
Definition: Dataset.h:246
Abstract base class for different kinds of events.