Belle II Software development
Dataset.h
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#pragma once
10#ifndef INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
11#define INCLUDE_GUARD_BELLE2_MVA_DATASET_HEADER
12
13#include <mva/interface/Options.h>
14
15#include <analysis/VariableManager/Manager.h>
16
17#include <TFile.h>
18#include <TChain.h>
19
20#include <string>
21
22namespace Belle2 {
27 namespace MVA {
28
33 class Dataset {
34
35 public:
40 explicit Dataset(const GeneralOptions& general_options);
41
45 virtual ~Dataset() = default;
46
50 Dataset(const Dataset&) = delete;
51
55 Dataset& operator=(const Dataset&) = delete;
56
60 virtual unsigned int getNumberOfFeatures() const = 0;
61
65 virtual unsigned int getNumberOfSpectators() const = 0;
66
70 virtual unsigned int getNumberOfEvents() const = 0;
71
76 virtual void loadEvent(unsigned int iEvent) = 0;
77
81 virtual float getSignalFraction();
82
87 virtual unsigned int getFeatureIndex(const std::string& feature);
88
93 virtual unsigned int getSpectatorIndex(const std::string& spectator);
94
99 virtual std::vector<float> getFeature(unsigned int iFeature);
100
105 virtual std::vector<float> getSpectator(unsigned int iSpectator);
106
110 virtual std::vector<float> getWeights();
111
115 virtual std::vector<float> getTargets();
116
120 virtual std::vector<bool> getSignals();
121
123 std::vector<float> m_input;
124 std::vector<float> m_spectators;
125 float m_weight;
126 float m_target;
128 };
129
130
135 class SingleDataset : public Dataset {
136
137 public:
145 SingleDataset(const GeneralOptions& general_options, const std::vector<float>& input, float target = 1.0,
146 const std::vector<float>& spectators = std::vector<float>());
147
151 virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
152
156 virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
157
161 virtual unsigned int getNumberOfEvents() const override { return 1; }
162
166 virtual void loadEvent(unsigned int) override { };
167
172 virtual std::vector<float> getFeature(unsigned int iFeature) override { return std::vector<float> {m_input[iFeature]}; }
173
178 virtual std::vector<float> getSpectator(unsigned int iSpectator) override { return std::vector<float> {m_spectators[iSpectator]}; }
179
180 };
181
186 class MultiDataset : public Dataset {
187
188 public:
197 MultiDataset(const GeneralOptions& general_options, const std::vector<std::vector<float>>& input,
198 const std::vector<std::vector<float>>& spectators,
199 const std::vector<float>& targets = {}, const std::vector<float>& weights = {});
200
204 virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
205
209 virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
210
214 virtual unsigned int getNumberOfEvents() const override { return m_matrix.size(); }
215
219 virtual void loadEvent(unsigned int iEvent) override;
220
221
222 private:
223 std::vector<std::vector<float>> m_matrix;
224 std::vector<std::vector<float>> m_spectator_matrix;
225 std::vector<float> m_targets;
226 std::vector<float> m_weights;
228 };
229
234 class SubDataset : public Dataset {
235
236 public:
243 SubDataset(const GeneralOptions& general_options, const std::vector<bool>& events, Dataset& dataset);
244
248 virtual unsigned int getNumberOfFeatures() const override { return m_feature_indices.size(); }
249
253 virtual unsigned int getNumberOfSpectators() const override { return m_spectator_indices.size(); }
254
258 virtual unsigned int getNumberOfEvents() const override { return m_use_event_indices ? m_event_indices.size() : m_dataset.getNumberOfEvents(); }
259
264 virtual void loadEvent(unsigned int iEvent) override;
265
270 virtual std::vector<float> getFeature(unsigned int iFeature) override;
271
276 virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
277
278 private:
279 bool m_use_event_indices = false;
280 std::vector<unsigned int>
282 std::vector<unsigned int>
284 std::vector<unsigned int>
288 };
289
294 class CombinedDataset : public Dataset {
295
296 public:
303 CombinedDataset(const GeneralOptions& general_options, Dataset& signal_dataset, Dataset& background_dataset);
304
308 virtual unsigned int getNumberOfFeatures() const override { return m_signal_dataset.getNumberOfFeatures(); }
309
313 virtual unsigned int getNumberOfSpectators() const override { return m_signal_dataset.getNumberOfSpectators(); }
314
318 virtual unsigned int getNumberOfEvents() const override { return m_signal_dataset.getNumberOfEvents() + m_background_dataset.getNumberOfEvents(); }
319
324 virtual void loadEvent(unsigned int iEvent) override;
325
330 virtual std::vector<float> getFeature(unsigned int iFeature) override;
331
336 virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
337
338 private:
342 };
343
344
349 class ROOTDataset : public Dataset {
350
351 public:
356 explicit ROOTDataset(const GeneralOptions& _general_options);
357
361 virtual unsigned int getNumberOfFeatures() const override { return m_input.size(); }
362
366 virtual unsigned int getNumberOfSpectators() const override { return m_spectators.size(); }
367
371 virtual unsigned int getNumberOfEvents() const override
372 {
374 }
375
380 virtual void loadEvent(unsigned int event) override;
381
386 virtual std::vector<float> getFeature(unsigned int iFeature) override;
387
391 virtual std::vector<float> getWeights() override;
392
397 virtual std::vector<float> getSpectator(unsigned int iSpectator) override;
398
402 virtual ~ROOTDataset();
403
404 protected:
406 typedef std::variant<double, float, int, bool> RootDatasetVarVariant;
407
408 TChain* m_tree = nullptr;
409 std::vector<RootDatasetVarVariant> m_input_variant;
410 std::vector<RootDatasetVarVariant>
416 private:
417
427 template<class T>
428 std::vector<float> getVectorFromTTree(const std::string& variableType, const std::string& branchName, T& memberVariableTarget);
429
438 std::vector<float> getVectorFromTTreeVariant(const std::string& variableType, const std::string& branchName,
439 RootDatasetVarVariant& memberVariableTarget);
440
444 void setRootInputType();
445
453 template<class T>
454 void setScalarVariableAddress(const std::string& variableType, const std::string& variableName, T& variableTarget);
455
462 void setScalarVariableAddressVariant(const std::string& variableType, const std::string& variableName,
463 RootDatasetVarVariant& variableTarget);
464
472 template<class T>
473 void setVectorVariableAddress(const std::string& variableType, const std::vector<std::string>& variableName,
474 T& variableTargets);
475
482 void setVectorVariableAddressVariant(const std::string& variableType, const std::vector<std::string>& variableName,
483 std::vector<RootDatasetVarVariant>& varVariantTargets);
484
489
490
494 void setBranchAddresses();
495
501 bool checkForBranch(TTree*, const std::string&) const;
502
508
514 void initialiseVarVariantType(const std::string, RootDatasetVarVariant&);
515
523 };
524 }
526}
527#endif
Wraps two other Datasets, one containing signal, the other background events. Used by the reweighting ...
Definition: Dataset.h:294
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in the wrapped dataset.
Definition: Dataset.h:318
Dataset & m_background_dataset
Reference to the wrapped dataset containing background events.
Definition: Dataset.h:340
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:306
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:296
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:277
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, so the size of the given subset of the spectators.
Definition: Dataset.h:313
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, so the size of the given subset of the variables.
Definition: Dataset.h:308
Dataset & m_signal_dataset
Reference to the wrapped dataset containing signal events.
Definition: Dataset.h:339
Abstract base class of all Datasets given to the MVA interface. The current event can always be access...
Definition: Dataset.h:33
virtual unsigned int getNumberOfEvents() const =0
Returns the number of events in this dataset.
virtual unsigned int getNumberOfSpectators() const =0
Returns the number of spectators in this dataset.
Dataset(const Dataset &)=delete
Specify no copy constructor.
virtual std::vector< bool > getSignals()
Returns all isSignal flags.
Definition: Dataset.cc:122
virtual unsigned int getNumberOfFeatures() const =0
Returns the number of features in this dataset.
virtual unsigned int getFeatureIndex(const std::string &feature)
Return index of feature with the given name.
Definition: Dataset.cc:50
virtual std::vector< float > getSpectator(unsigned int iSpectator)
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:86
std::vector< float > m_spectators
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:124
virtual std::vector< float > getTargets()
Returns all targets.
Definition: Dataset.cc:110
virtual void loadEvent(unsigned int iEvent)=0
Load the event number iEvent.
GeneralOptions m_general_options
GeneralOptions passed to this dataset.
Definition: Dataset.h:122
std::vector< float > m_input
Contains all feature values of the currently loaded event.
Definition: Dataset.h:123
virtual std::vector< float > getFeature(unsigned int iFeature)
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:74
virtual std::vector< float > getWeights()
Returns all weights.
Definition: Dataset.cc:98
virtual float getSignalFraction()
Returns the signal fraction of the whole sample.
Definition: Dataset.cc:35
bool m_isSignal
Defines if the currently loaded event is signal or background.
Definition: Dataset.h:127
float m_weight
Contains the weight of the currently loaded event.
Definition: Dataset.h:125
virtual unsigned int getSpectatorIndex(const std::string &spectator)
Return index of spectator with the given name.
Definition: Dataset.cc:62
float m_target
Contains the target value of the currently loaded event.
Definition: Dataset.h:126
virtual ~Dataset()=default
Virtual default destructor.
Dataset & operator=(const Dataset &)=delete
Specify no assignment operator.
General options which are shared by all MVA trainings.
Definition: Options.h:62
unsigned int m_max_events
Maximum number of events to process, 0 means all.
Definition: Options.h:92
Wraps the data of multiple events into a Dataset.
Definition: Dataset.h:186
std::vector< float > m_weights
weight vector
Definition: Dataset.h:226
std::vector< std::vector< float > > m_matrix
Feature matrix.
Definition: Dataset.h:223
std::vector< std::vector< float > > m_spectator_matrix
Spectator matrix.
Definition: Dataset.h:224
std::vector< float > m_targets
target vector
Definition: Dataset.h:225
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:214
virtual void loadEvent(unsigned int iEvent) override
Does nothing in the case of a single dataset, because the only event is already loaded.
Definition: Dataset.cc:168
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:209
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:204
Provides a dataset from a ROOT file. This is the usually used dataset providing training data to the m...
Definition: Dataset.h:349
void setScalarVariableAddress(const std::string &variableType, const std::string &variableName, T &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:515
void setBranchAddresses()
Sets the branch addresses of all features, weight and target again.
Definition: Dataset.cc:568
void setTargetRootInputType()
Determines the data type of the target variable and sets it to m_target_data_type.
void setScalarVariableAddressVariant(const std::string &variableType, const std::string &variableName, RootDatasetVarVariant &variableTarget)
sets the branch address for a scalar variable to a given target
Definition: Dataset.cc:536
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset.
Definition: Dataset.h:371
void initialiseVarVariantForBranch(const std::string, RootDatasetVarVariant &)
Infers the type (double,float,int,bool) from the TTree and initialises the VarVariant with the correc...
Definition: Dataset.cc:615
TChain * m_tree
Pointer to the TChain containing the data.
Definition: Dataset.h:408
virtual void loadEvent(unsigned int event) override
Load the event number iEvent from the TTree.
Definition: Dataset.cc:391
void setVectorVariableAddressVariant(const std::string &variableType, const std::vector< std::string > &variableName, std::vector< RootDatasetVarVariant > &varVariantTargets)
sets the branch address for a vector of VarVariant to a given target
Definition: Dataset.cc:560
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float>
Definition: Dataset.cc:440
std::vector< float > getVectorFromTTree(const std::string &variableType, const std::string &branchName, T &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:474
void initialiseVarVariantType(const std::string, RootDatasetVarVariant &)
Initialises the VarVariant.
Definition: Dataset.cc:598
std::vector< RootDatasetVarVariant > m_spectators_variant
Contains all spectators values of the currently loaded event.
Definition: Dataset.h:411
RootDatasetVarVariant m_target_variant
Contains the target value of the currently loaded event.
Definition: Dataset.h:413
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float>
Definition: Dataset.cc:429
virtual std::vector< float > getWeights() override
Returns all values of the weights in a std::vector<float>
Definition: Dataset.cc:408
std::vector< RootDatasetVarVariant > m_input_variant
Contains all feature values of the currently loaded event.
Definition: Dataset.h:409
void setRootInputType()
Tries to infer the data-type of the spectator and feature variables in a root file.
Definition: Dataset.cc:632
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:366
bool checkForBranch(TTree *, const std::string &) const
Checks if the given branchname exists in the TTree.
Definition: Dataset.cc:507
void setVectorVariableAddress(const std::string &variableType, const std::vector< std::string > &variableName, T &variableTargets)
sets the branch address for a vector variable to a given target
Definition: Dataset.cc:552
virtual ~ROOTDataset()
Virtual destructor.
Definition: Dataset.cc:452
float castVarVariantToFloat(RootDatasetVarVariant &) const
Casts a VarVariant which can contain <double,int,bool,float> to float.
Definition: Dataset.cc:376
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:361
std::variant< double, float, int, bool > RootDatasetVarVariant
Typedef for variable types supported by the mva ROOTDataset, can be one of double,...
Definition: Dataset.h:406
std::vector< float > getVectorFromTTreeVariant(const std::string &variableType, const std::string &branchName, RootDatasetVarVariant &memberVariableTarget)
Returns all values for a specified variableType and branchName.
Definition: Dataset.cc:458
RootDatasetVarVariant m_weight_variant
Contains the weight of the currently loaded event.
Definition: Dataset.h:412
Wraps the data of a single event into a Dataset.
Definition: Dataset.h:135
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values (in this case only one) of one feature in a std::vector<float>
Definition: Dataset.h:172
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in this dataset which is always one.
Definition: Dataset.h:161
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset.
Definition: Dataset.h:156
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset.
Definition: Dataset.h:151
virtual void loadEvent(unsigned int) override
Does nothing in the case of a single dataset, because the only event is already loaded.
Definition: Dataset.h:166
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values (in this case only one) of one spectator in a std::vector<float>
Definition: Dataset.h:178
Wraps another Dataset and provides a view to a subset of its features and events.
Definition: Dataset.h:234
Dataset & m_dataset
Reference to the wrapped dataset.
Definition: Dataset.h:286
virtual unsigned int getNumberOfEvents() const override
Returns the number of events in the wrapped dataset.
Definition: Dataset.h:258
virtual std::vector< float > getSpectator(unsigned int iSpectator) override
Returns all values of one spectator in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:259
std::vector< unsigned int > m_feature_indices
Mapping from the position of a feature in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:281
virtual std::vector< float > getFeature(unsigned int iFeature) override
Returns all values of one feature in a std::vector<float> of the wrapped dataset.
Definition: Dataset.cc:245
std::vector< unsigned int > m_spectator_indices
Mapping from the position of a spectator in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:283
virtual void loadEvent(unsigned int iEvent) override
Load the event number iEvent from the wrapped dataset.
Definition: Dataset.cc:225
virtual unsigned int getNumberOfSpectators() const override
Returns the number of spectators in this dataset, so the size of the given subset of the spectators.
Definition: Dataset.h:253
std::vector< unsigned int > m_event_indices
Mapping from the position of an event in the given subset to its position in the wrapped dataset.
Definition: Dataset.h:285
bool m_use_event_indices
Use only a subset of the wrapped dataset events.
Definition: Dataset.h:279
virtual unsigned int getNumberOfFeatures() const override
Returns the number of features in this dataset, so the size of the given subset of the variables.
Definition: Dataset.h:248
Abstract base class for different kinds of events.