Belle II Software  light-2212-foldex
Dataset.h
1 /**************************************************************************
2  * basf2 (Belle II Analysis Software Framework) *
3  * Author: The Belle II Collaboration *
4  * *
5  * See git log for contributors and copyright holders. *
6  * This file is licensed under LGPL-3.0, see LICENSE.md. *
7  **************************************************************************/
8 
9 #pragma once
10 #ifndef INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
11 #define INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
12 
13 #include <mva/interface/Options.h>
14 
15 #include <analysis/VariableManager/Manager.h>
16 
17 #include <TFile.h>
18 #include <TChain.h>
19 
20 #include <string>
21 
22 namespace Belle2 {
27  namespace MVA {
28 
33  class Dataset {
34 
35  public:
40  explicit Dataset(const GeneralOptions& general_options);
41 
45  virtual ~Dataset() = default;
46 
50  Dataset(const Dataset&) = delete;
51 
55  Dataset& operator=(const Dataset&) = delete;
56 
60  virtual unsigned int getNumberOfFeatures() const = 0;
61 
65  virtual unsigned int getNumberOfSpectators() const = 0;
66 
70  virtual unsigned int getNumberOfEvents() const = 0;
71 
76  virtual void loadEvent(unsigned int iEvent) = 0;
77 
81  virtual float getSignalFraction();
82 
87  virtual unsigned int getFeatureIndex(const std::string& feature);
88 
93  virtual unsigned int getSpectatorIndex(const std::string& spectator);
94 
99  virtual std::vector<float> getFeature(unsigned int iFeature);
100 
105  virtual std::vector<float> getSpectator(unsigned int iSpectator);
106 
110  virtual std::vector<float> getWeights();
111 
115  virtual std::vector<float> getTargets();
116 
120  virtual std::vector<bool> getSignals();
121 
123  std::vector<float> m_input;
124  std::vector<float> m_spectators;
125  float m_weight;
126  float m_target;
127  bool m_isSignal;
128  };
129 
130 
135  class SingleDataset : public Dataset {
136 
137  public:
145  SingleDataset(const GeneralOptions& general_options, const std::vector<float>& input, float target = 1.0,
146  const std::vector<float>& spectators = std::vector<float>());
147 
151  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
152 
156  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
157 
161  virtual unsigned int getNumberOfEvents() const override { return 1; }
162 
166  virtual void loadEvent(unsigned int) override { };
167 
172  virtual std::vector<float> getFeature(unsigned int iFeature) override { return std::vector<float> {m_input[iFeature]}; }
173 
178  virtual std::vector<float> getSpectator(unsigned int iSpectator) override { return std::vector<float> {m_spectators[iSpectator]}; }
179 
180  };
181 
186  class MultiDataset : public Dataset {
187 
188  public:
197  MultiDataset(const GeneralOptions& general_options, const std::vector<std::vector<float>>& input,
198  const std::vector<std::vector<float>>& spectators,
199  const std::vector<float>& targets = {}, const std::vector<float>& weights = {});
200 
204  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
205 
209  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
210 
214  virtual unsigned int getNumberOfEvents() const override { return m_matrix.size(); }
215 
219  virtual void loadEvent(unsigned int iEvent) override;
220 
221 
222  private:
223  std::vector<std::vector<float>> m_matrix;
224  std::vector<std::vector<float>> m_spectator_matrix;
225  std::vector<float> m_targets;
226  std::vector<float> m_weights;
228  };
229 
234  class SubDataset : public Dataset {
235 
236  public:
243  SubDataset(const GeneralOptions& general_options, const std::vector<bool>& events, Dataset& dataset);
244 
248  virtual unsigned int getNumberOfFeatures() const override { return m_feature_indices.size(); }
249 
253  virtual unsigned int getNumberOfSpectators() const override { return m_spectator_indices.size(); }
254 
258  virtual unsigned int getNumberOfEvents() const override { return m_use_event_indices ? m_event_indices.size() : m_dataset.getNumberOfEvents(); }
259 
264  virtual void loadEvent(unsigned int iEvent) override;
265 
270  virtual std::vector<float> getFeature(unsigned int iFeature) override;
271 
276  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
277 
278  private:
279  bool m_use_event_indices = false;
280  std::vector<unsigned int>
282  std::vector<unsigned int>
284  std::vector<unsigned int>
288  };
289 
294  class CombinedDataset : public Dataset {
295 
296  public:
303  CombinedDataset(const GeneralOptions& general_options, Dataset& signal_dataset, Dataset& background_dataset);
304 
308  virtual unsigned int getNumberOfFeatures() const override { return m_signal_dataset.getNumberOfFeatures(); }
309 
313  virtual unsigned int getNumberOfSpectators() const override { return m_signal_dataset.getNumberOfSpectators(); }
314 
318  virtual unsigned int getNumberOfEvents() const override { return m_signal_dataset.getNumberOfEvents() + m_background_dataset.getNumberOfEvents(); }
319 
324  virtual void loadEvent(unsigned int iEvent) override;
325 
330  virtual std::vector<float> getFeature(unsigned int iFeature) override;
331 
336  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
337 
338  private:
342  };
343 
344 
349  class ROOTDataset : public Dataset {
350 
351  public:
356  explicit ROOTDataset(const GeneralOptions& _general_options);
357 
361  virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
362 
366  virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
367 
371  virtual unsigned int getNumberOfEvents() const override
372  {
373  return (m_general_options.m_max_events == 0) ? m_tree->GetEntries() : m_general_options.m_max_events;
374  }
375 
380  virtual void loadEvent(unsigned int event) override;
381 
386  virtual std::vector<float> getFeature(unsigned int iFeature) override;
387 
391  virtual std::vector<float> getWeights() override;
392 
397  virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
398 
408  template<class T>
409  std::vector<float> getVectorFromTTree(std::string& variableType, std::string& branchName, T& memberVariableTarget);
410 
414  void setRootInputType();
415 
423  template<class T>
424  void setScalarVariableAddress(std::string& variableType, std::string& variableName, T& variableTarget);
425 
433  template<class T>
434  void setVectorVariableAddress(std::string& variableType, std::vector<std::string>& variableName,
435  T& variableTargets);
436 
444  void setVectorVariableAddress(std::string& variableType, std::vector<std::string>& variableName,
445  std::vector<Variable::Manager::VarVariant>& varVariantTargets);
446 
450  void setTargetRootInputType();
451 
455  virtual ~ROOTDataset();
456 
457 
458  private:
462  void setBranchAddresses();
463 
469  bool checkForBranch(TTree*, const std::string&) const;
470 
471  protected:
472  TChain* m_tree = nullptr;
473  bool m_isFloatInputType = true;
474  std::vector<Variable::Manager::VarVariant> m_input_variant;
475  std::vector<Variable::Manager::VarVariant>
479  Variable::Manager::VariableDataType::c_double;
483  };
484 
485  }
487 }
488 #endif
Wraps two other Datasets, one containing signal, the other background events. Used by the reweighting ...
Definition: Dataset.h:294
CombinedDataset(const GeneralOptions &general_options, Dataset &signal_dataset, Dataset &background_dataset)
Constructs a new CombinedDataset holding a reference to the wrapped Datasets.
Definition: Dataset.cc:273
virtual unsigned int getNumberOfEvents() const override
Returns the total number of events, i.e. the sum of the events in the wrapped signal and background datasets.
Definition: Dataset.h:318
Dataset & m_background_dataset
Reference to the wrapped dataset containing background events.
Definition: Dataset.h:340
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:306
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:296
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:277
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, taken from the wrapped signal dataset.
Definition: Dataset.h:313
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, taken from the wrapped signal dataset.
Definition: Dataset.h:308
Dataset & m_signal_dataset
Reference to the wrapped dataset containing signal events.
Definition: Dataset.h:339
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
virtual unsigned int getNumberOfSpectators() const =0
Returns the number of spectators in this dataset.
Dataset(const Dataset &)=delete
Specify no copy constructor.
virtual std::vector< bool > getSignals()
Returns the isSignal flags of all events.
Definition: Dataset.cc:122
virtual unsigned int getNumberOfFeatures() const =0
Returns the number of features in this dataset.
Dataset & operator=(const Dataset &)=delete
Specify no assignment operator.
virtual unsigned int getFeatureIndex(const std::string &feature)
Return index of feature with the given name.
Definition: Dataset.cc:50
virtual std::vector< float > getSpectator(unsigned int iSpectator)
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:86
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:124
virtual std::vector< float > getTargets()
Returns all targets.
Definition: Dataset.cc:110
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
GeneralOptions m_general_options
GeneralOptions passed to this dataset.
Definition: Dataset.h:122
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:123
Dataset(const GeneralOptions &general_options)
Constructs a new dataset given the general options.
Definition: Dataset.cc:26
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
virtual std::vector< float > getWeights()
Returns all weights.
Definition: Dataset.cc:98
virtual float getSignalFraction()
Returns the signal fraction of the whole sample.
Definition: Dataset.cc:35
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:127
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:125
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:126
virtual ~Dataset()=default
Virtual default destructor.
General options which are shared by all MVA trainings.
Definition: Options.h:62
unsigned int m_max_events
Maximum number of events to process, 0 means all.
Definition: Options.h:92
Wraps the data of multiple events into a Dataset.
Definition: Dataset.h:186
std::vector< float > m_weights
weight vector
Definition: Dataset.h:226
std::vector< std::vector< float > > m_matrix
Feature matrix.
Definition: Dataset.h:223
std::vector< std::vector< float > > m_spectator_matrix
Spectator matrix.
Definition: Dataset.h:224
MultiDataset(const GeneralOptions &general_options, const std::vector< std::vector< float >> &input, const std::vector< std::vector< float >> &spectators, const std::vector< float > &targets={}, const std::vector< float > &weights={})
Constructs a new MultiDataset.
Definition: Dataset.cc:145
std::vector< float > m_targets
target vector
Definition: Dataset.h:225
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:214
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent.
Definition: Dataset.cc:168
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:209
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:204
Provides a dataset from a ROOT file. This is the usually used dataset providing training data to the m...
Definition: Dataset.h:349
void setBranchAddresses()
Sets the branch addresses of all features, weight and target again.
Definition: Dataset.cc:577
void setVectorVariableAddress(std::string &variableType, std::vector< std::string > &variableName, T &variableTargets)
sets the branch address for a vector variable to a given target
Definition: Dataset.cc:557
void setTargetRootInputType()
Determines the data type of the target variable and sets it to m_target_data_type.
Definition: Dataset.cc:710
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:371
Variable::Manager::VariableDataType m_target_data_type
Data type of target variable.
Definition: Dataset.h:478
TChain * m_tree
Pointer to the TChain containing the data.
Definition: Dataset.h:472
double m_target_double
Contains the target value of the currently loaded event.
Definition: Dataset.h:480
virtual void loadEvent(unsigned int event) override
Load the event number iEvent from the TTree.
Definition: Dataset.cc:378
int m_target_int
Contains the target value of the currently loaded event.
Definition: Dataset.h:481
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:465
double m_weight_double
Contains the weight of the currently loaded event.
Definition: Dataset.h:477
bool m_isFloatInputType
Defines the expected datatype in the ROOT file.
Definition: Dataset.h:473
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:443
void setScalarVariableAddress(std::string &variableType, std::string &variableName, T &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:535
virtual std::vector< float > getWeights() override
Returns all values of the weights in a std::vector<float>
Definition: Dataset.cc:415
ROOTDataset(const GeneralOptions &_general_options)
Creates a new ROOTDataset.
Definition: Dataset.cc:316
void setRootInputType()
Tries to infer the data-type of a root file and sets m_isFloatInputType.
Definition: Dataset.cc:635
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:366
bool checkForBranch(TTree *, const std::string &) const
Checks if the given branchname exists in the TTree.
Definition: Dataset.cc:527
std::vector< float > getVectorFromTTree(std::string &variableType, std::string &branchName, T &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:494
virtual ~ROOTDataset()
Virtual destructor.
Definition: Dataset.cc:487
std::vector< Variable::Manager::VarVariant > m_spectators_variant
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:476
bool m_target_bool
Contains the target value of the currently loaded event.
Definition: Dataset.h:482
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:361
std::vector< Variable::Manager::VarVariant > m_input_variant
Contains all feature values of the currently loaded event.
Definition: Dataset.h:474
Wraps the data of a single event into a Dataset.
Definition: Dataset.h:135
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset which is always one.
Definition: Dataset.h:161
SingleDataset(const GeneralOptions &general_options, const std::vector< float > &input, float target=1.0, const std::vector< float > &spectators=std::vector< float >())
Constructs a new SingleDataset.
Definition: Dataset.cc:135
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values (in this case only one) of one feature in a std::vector<float>
Definition: Dataset.h:172
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values (in this case only one) of one spectator in a std::vector<float>
Definition: Dataset.h:178
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:156
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:151
virtual void loadEvent(unsigned int) override
Does nothing in the case of a single dataset, because the only event is already loaded.
Definition: Dataset.h:166
Wraps another Dataset and provides a view to a subset of its features and events.
Definition: Dataset.h:234
Dataset & m_dataset
Reference to the wrapped dataset.
Definition: Dataset.h:286
SubDataset(const GeneralOptions &general_options, const std::vector< bool > &events, Dataset &dataset)
Constructs a new SubDataset holding a reference to the wrapped Dataset.
Definition: Dataset.cc:186
virtual unsigned int getNumberOfEvents() const override
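Returns the number of events in this dataset: the size of the selected subset if event indices are used, otherwise the number of events in the wrapped dataset.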
Definition: Dataset.h:258
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:259
std::vector< unsigned int > m_feature_indices
Mapping from the position of a feature in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:281
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:245
std::vector< unsigned int > m_spectator_indices
Mapping from the position of a spectator in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:283
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:225
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, so the size of the given subset of the spectators.
Definition: Dataset.h:253
std::vector< unsigned int > m_event_indices
Mapping from the position of a event in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:285
bool m_use_event_indices
Use only a subset of the wrapped dataset events.
Definition: Dataset.h:279
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, so the size of the given subset of the variables.
Definition: Dataset.h:248
VariableDataType
data type of variables
Definition: Manager.h:122
Abstract base class for different kinds of events.
Definition: ClusterUtils.h:23