release-08-01-10/doxygen/basf2__mva__util_8py_source.html

 import tempfile

 import numpy as np


 from basf2 import B2WARNING

 import basf2_mva


 def chain2dict(chain, tree_columns, dict_columns=None):
     """
     Convert a ROOT.TChain into a dictionary of np.arrays

     If ``RDataFrame`` can be imported from ROOT, the requested branches are
     read in bulk and returned as a numpy record array whose fields are named
     after ``dict_columns``. If that import fails, the chain is looped over
     entry by entry and a plain dictionary of arrays is returned instead.
     Both results can be indexed by column name.

     @param chain the ROOT.TChain
     @param tree_columns the column (or branch) names in the tree
     @param dict_columns the corresponding column names in the dictionary
         (if None the tree column names are used)
     @return an empty dict if no columns are requested, otherwise a mapping
         from dictionary column name to a 1d array with one value per entry
     """
     if len(tree_columns) == 0:
         return dict()
     if dict_columns is None:
         dict_columns = tree_columns
     try:
         from ROOT import RDataFrame
         rdf = RDataFrame(chain)
         d = np.column_stack(list(rdf.AsNumpy(tree_columns).values()))
         # np.rec is the public location of the record-array helpers;
         # going through the private np.core namespace is deprecated.
         d = np.rec.fromarrays(d.transpose(), names=dict_columns)
     except ImportError:
         # Slow fallback: fill one pre-allocated array per column by reading
         # the requested attribute from every entry of the chain.
         d = {column: np.zeros((chain.GetEntries(),)) for column in dict_columns}
         for iEvent, event in enumerate(chain):
             for dict_column, tree_column in zip(dict_columns, tree_columns):
                 d[dict_column][iEvent] = getattr(event, tree_column)
     return d


 def calculate_roc_auc(p, t):
     """
     Deprecated name of ``calculate_auc_efficiency_vs_purity``

     Issues a deprecation notice through B2WARNING and then forwards the
     arguments, without weights, to ``calculate_auc_efficiency_vs_purity``.

     @param p np.array filled with the probability output of a classifier
     @param t np.array filled with the target (0 or 1)
     @return the result of calculate_auc_efficiency_vs_purity(p, t)
     """
     # The escape sequences colour the function names in the printed warning.
     deprecation_notice = (
         "\033[93mcalculate_roc_auc\033[00m has been deprecated and will be removed in future.\n"
         "This change has been made as calculate_roc_auc returned the area under the efficiency-purity curve\n"
         "not the efficiency-background retention curve as expected by users.\n"
         "Please replace calculate_roc_auc with:\n\n"
         "\033[96mcalculate_auc_efficiency_vs_purity(probability, target[, weight])\033[00m:"
         " the current definition of calculate_roc_auc\n"
         "\033[96mcalculate_auc_efficiency_vs_background_retention(probability, target[, weight])\033[00m:"
         " what is commonly known as roc auc\n"
     )
     B2WARNING(deprecation_notice)
     area = calculate_auc_efficiency_vs_purity(p, t)
     return area


 def calculate_auc_efficiency_vs_purity(p, t, w=None):
     """
     Calculates the area under the efficiency-purity curve

     The entries are ordered by increasing probability. For every entry the
     signal efficiency and the purity of the sample that remains after
     removing this entry and all entries with lower probability are
     calculated, and the purity is integrated over the efficiency with the
     trapezoidal rule.

     @param p np.array filled with the probability output of a classifier
     @param t np.array filled with the target (0 or 1)
     @param w None or np.array filled with weights
     @return the absolute value of the integral of the purity over the efficiency
     """
     p = np.asarray(p)
     t = np.asarray(t)
     w = np.ones(t.shape) if w is None else np.asarray(w)

     wt = w * t

     N = np.sum(w)
     T = np.sum(wt)

     index = np.argsort(p)
     signal_left = T - np.cumsum(wt[index])
     total_left = N - np.cumsum(w[index])

     efficiency = signal_left / float(T)
     # Once every entry has been removed the purity is 0/0. That is expected
     # and mapped to 0 below, so do not emit a floating point warning for it.
     with np.errstate(divide='ignore', invalid='ignore'):
         purity = signal_left / total_left
     purity = np.where(np.isnan(purity), 0, purity)

     # np.trapz is deprecated in favour of np.trapezoid (NumPy 2.0); use
     # whichever of the two the installed numpy provides.
     integrate = getattr(np, "trapezoid", None) or np.trapz
     return np.abs(integrate(purity, efficiency))


 def calculate_auc_efficiency_vs_background_retention(p, t, w=None):
     """
     Calculates the area under the efficiency-background_retention curve (AUC ROC)

     The entries are ordered by increasing probability. For every entry the
     signal efficiency and the fraction of retained background of the sample
     that remains after removing this entry and all entries with lower
     probability are calculated, and the efficiency is integrated over the
     background retention with the trapezoidal rule.

     @param p np.array filled with the probability output of a classifier
     @param t np.array filled with the target (0 or 1)
     @param w None or np.array filled with weights
     @return the absolute value of the integral of the efficiency over the background retention
     """
     p = np.asarray(p)
     t = np.asarray(t)
     w = np.ones(t.shape) if w is None else np.asarray(w)

     wt = w * t

     N = np.sum(w)
     T = np.sum(wt)

     index = np.argsort(p)
     # NOTE(review): the first point of the curve is the state after the
     # lowest-probability entry has been removed, so the no-cut point is not
     # part of the integral. Kept as is to leave the returned values unchanged.
     efficiency = (T - np.cumsum(wt[index])) / float(T)
     background_weight = np.abs(1 - t) * w
     background_retention = (N - T - np.cumsum(background_weight[index])) / float(N - T)

     # np.trapz is deprecated in favour of np.trapezoid (NumPy 2.0); use
     # whichever of the two the installed numpy provides.
     integrate = getattr(np, "trapezoid", None) or np.trapz
     return np.abs(integrate(efficiency, background_retention))


 def calculate_flatness(f, p, w=None):
     """
     Calculates the flatness of a feature under cuts on a signal probability

     Feature and probability are both binned at their integer percentiles.
     The cumulative probability distribution inside every feature bin is
     then compared with the cumulative probability distribution of the
     whole sample.

     @param f the feature values
     @param p the probability values
     @param w optional weights
     @return the mean standard deviation between the local and global cut selection efficiency
     """
     percent_points = list(range(101))
     feature_edges = np.unique(np.percentile(f, q=percent_points))
     probability_edges = np.unique(np.percentile(p, q=percent_points))

     # Fewer than two distinct edges cannot define a bin: fall back to one
     # bin that encloses all values.
     if len(feature_edges) < 2:
         feature_edges = np.array([np.min(f) - 1, np.max(f) + 1])
     if len(probability_edges) < 2:
         probability_edges = np.array([np.min(p) - 1, np.max(p) + 1])

     # First axis: probability bins, second axis: feature bins.
     counts, _ = np.histogramdd(np.c_[p, f], bins=[probability_edges, feature_edges], weights=w)

     # Probability distribution of the whole sample, normalised to one.
     inclusive = counts.sum(axis=1)
     inclusive /= inclusive.sum(axis=0)
     # Probability distribution inside each feature bin, normalised to one.
     counts /= counts.sum(axis=0)

     local_cumulative = counts.cumsum(axis=0)
     global_cumulative = inclusive.cumsum(axis=0)

     squared_difference = (local_cumulative.T - global_cumulative)**2
     return np.sqrt(squared_difference.sum() / (100 * 99))


 class Method:
     """
     Wrapper class providing an interface to the method stored under the given identifier.
     It loads the Options, can apply the expert and train new ones using the current as a prototype.
     This class is used by the basf_mva_evaluation tools
     """

     # Maps the method name stored in the general options to the name of the
     # basf2_mva class holding the method-specific options. The class is looked
     # up by name only when it is needed, so only the options class of the
     # loaded method has to exist.
     _SPECIFIC_OPTIONS_BY_METHOD = {
         "FastBDT": "FastBDTOptions",
         "TMVAClassification": "TMVAOptionsClassification",
         "TMVARegression": "TMVAOptionsRegression",
         "FANN": "FANNOptions",
         "Python": "PythonOptions",
         "PDF": "PDFOptions",
         "Combination": "CombinationOptions",
         "Reweighter": "ReweighterOptions",
         "Trivial": "TrivialOptions",
     }

     def __init__(self, identifier):
         """
         Load a method stored under the given identifier
         @param identifier identifying the method
         @raise RuntimeError if the method name stored in the weightfile is not known
         """
         # Always avoid the top-level 'import ROOT'.
         import ROOT  # noqa
         # Initialize all the available interfaces
         ROOT.Belle2.MVA.AbstractInterface.initSupportedInterfaces()

         # Identifier of the method
         self.identifier = identifier

         # Weightfile of the method
         self.weightfile = ROOT.Belle2.MVA.Weightfile.load(self.identifier)

         # General options of the method
         self.general_options = basf2_mva.GeneralOptions()
         self.general_options.load(self.weightfile.getXMLTree())

         # This piece of code should be correct but leads to random segmentation faults
         # inside python, llvm or pyroot, therefore we use the more dirty code below
         # Ideas why this is happening:
         # 1. Ownership of the unique_ptr returned by getOptions()
         # 2. Some kind of object slicing, although pyroot identifies the correct type
         # 3. Bug in pyroot
         # interfaces = ROOT.Belle2.MVA.AbstractInterface.getSupportedInterfaces()
         # self.interface = interfaces[self.general_options.m_method]
         # self.specific_options = self.interface.getOptions()

         # Specific options of the method
         self.specific_options = None
         # Convert to a python string so that the dictionary lookup does not
         # depend on how the object returned by the binding is hashed.
         method_name = str(self.general_options.m_method)
         options_class_name = self._SPECIFIC_OPTIONS_BY_METHOD.get(method_name)
         if options_class_name is None:
             raise RuntimeError("Unknown method " + method_name)
         self.specific_options = getattr(basf2_mva, options_class_name)()
         self.specific_options.load(self.weightfile.getXMLTree())

         variables = [str(v) for v in self.general_options.m_variables]
         importances = self.weightfile.getFeatureImportance()

         # Dictionary of the variable importances calculated by the method
         self.importances = {k: importances[k] for k in variables}

         # List of variables sorted by their importance (ascending)
         self.variables = sorted(variables, key=lambda v: self.importances.get(v, 0.0))

         # The variables in the same order, but with root compatible names
         self.root_variables = [ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(v) for v in self.variables]

         # Dictionary of the importances keyed by the root compatible variable names
         self.root_importances = {k: importances[k] for k in self.root_variables}

         # Description of the method as a string returned by basf2_mva.info
         self.description = str(basf2_mva.info(self.identifier))

         # List of spectators
         self.spectators = [str(v) for v in self.general_options.m_spectators]

         # List of spectators with root compatible names
         self.root_spectators = [ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible(v) for v in self.spectators]

     def train_teacher(self, datafiles, treename, general_options=None, specific_options=None):
         """
         Train a new method using this method as a prototype
         @param datafiles the training datafiles
         @param treename the name of the tree containing the training data
         @param general_options general options given to basf2_mva.teacher
           (if None the options of this method are used)
         @param specific_options specific options given to basf2_mva.teacher
           (if None the options of this method are used)
         @return a new Method loaded from the weightfile written by the teacher
         """
         # Always avoid the top-level 'import ROOT'.
         import ROOT  # noqa
         if isinstance(datafiles, str):
             datafiles = [datafiles]
         if general_options is None:
             general_options = self.general_options
         if specific_options is None:
             specific_options = self.specific_options

         # NOTE(review): treename is not forwarded to the teacher; the tree name
         # already stored in general_options is what gets used. Confirm whether
         # it should be written into the options here.
         with tempfile.TemporaryDirectory() as tempdir:
             identifier = tempdir + "/weightfile.xml"

             # The options object is modified in place. If no general_options
             # were passed, this changes the options of this method.
             general_options.m_datafiles = basf2_mva.vector(*datafiles)
             general_options.m_identifier = identifier

             basf2_mva.teacher(general_options, specific_options)

             # Load the result while the temporary directory still exists.
             method = Method(identifier)
         return method

     def apply_expert(self, datafiles, treename):
         """
         Apply the expert of the method to data and return the calculated probability and the target
         @param datafiles the datafiles
         @param treename the name of the tree containing the data
         @return tuple (probability, target); for more than two classes the
           probability is a 2d array with one column per class
         """
         import ROOT  # noqa
         if isinstance(datafiles, str):
             datafiles = [datafiles]

         make_root_compatible = ROOT.Belle2.MakeROOTCompatible.makeROOTCompatible
         n_classes = self.general_options.m_nClasses
         target_variable = self.general_options.m_target_variable

         with tempfile.TemporaryDirectory() as tempdir:
             # The expert is applied to a temporary copy of the weightfile, so
             # the branches in its output are named after the temporary path.
             identifier = tempdir + "/weightfile.xml"
             ROOT.Belle2.MVA.Weightfile.save(self.weightfile, identifier)

             rootfilename = tempdir + '/expert.root'
             basf2_mva.expert(basf2_mva.vector(identifier),
                              basf2_mva.vector(*datafiles),
                              treename,
                              rootfilename)
             chain = ROOT.TChain("variables")
             chain.Add(rootfilename)

             # Branch names contain the temporary identifier; the returned
             # columns are renamed to use the identifier of this method.
             expert_target = identifier + '_' + target_variable
             stripped_expert_target = self.identifier + '_' + target_variable

             if n_classes > 2:
                 # One output per class, suffixed with the class index.
                 output_names = [self.identifier + f'_{i}' for i in range(n_classes)]
                 branch_names = [make_root_compatible(identifier + f'_{i}') for i in range(n_classes)]
             else:
                 output_names = [self.identifier]
                 branch_names = [make_root_compatible(identifier)]

             # Read everything while the temporary root file still exists.
             d = chain2dict(
                 chain,
                 [*branch_names, make_root_compatible(expert_target)],
                 [*output_names, stripped_expert_target])

         if n_classes > 2:
             probability = np.array([d[x] for x in output_names]).T
         else:
             probability = d[self.identifier]
         return probability, d[stripped_expert_target]

basf2_mva_util.Method
Definition: basf2_mva_util.py:129

basf2_mva_util.Method.specific_options
specific_options
Specific options of the method.
Definition: basf2_mva_util.py:164

basf2_mva_util.Method.apply_expert
def apply_expert(self, datafiles, treename)
Definition: basf2_mva_util.py:236

basf2_mva_util.Method.description
description
Description of the method as an XML string returned by basf2_mva.info.
Definition: basf2_mva_util.py:200

basf2_mva_util.Method.importances
importances
Dictionary of the variable importances calculated by the method.
Definition: basf2_mva_util.py:192

basf2_mva_util.Method.root_importances
root_importances
Dictionary of the variables sorted by their importance but with root compatible variable names.
Definition: basf2_mva_util.py:198

basf2_mva_util.Method.variables
variables
List of variables sorted by their importance.
Definition: basf2_mva_util.py:194

basf2_mva_util.Method.__init__
def __init__(self, identifier)
Definition: basf2_mva_util.py:136

basf2_mva_util.Method.weightfile
weightfile
Weightfile of the method.
Definition: basf2_mva_util.py:148

basf2_mva_util.Method.root_spectators
root_spectators
List of spectators with root compatible names.
Definition: basf2_mva_util.py:204

basf2_mva_util.Method.spectators
spectators
List of spectators.
Definition: basf2_mva_util.py:202

basf2_mva_util.Method.train_teacher
def train_teacher(self, datafiles, treename, general_options=None, specific_options=None)
Definition: basf2_mva_util.py:206

basf2_mva_util.Method.root_variables
root_variables
List of the variables sorted by their importance, but with root compatible variable names.
Definition: basf2_mva_util.py:196

basf2_mva_util.Method.general_options
general_options
General options of the method.
Definition: basf2_mva_util.py:150

basf2_mva_util.Method.identifier
identifier
Identifier of the method.
Definition: basf2_mva_util.py:146