Source code for b2pandas_utils

import basf2
from ROOT import Belle2
import tables
import numpy as np
import warnings
from variables import variables as variable_manager
from variables import std_vector

"""
Python uilities to help create or manage ntuples and work with them in pandas
"""


[docs]class VariablesToHDF5(basf2.Module):
    """
    Dump variables directly to HDF5

    This Module is the equivalent of VariablesToNtuple but creates an hdf5 file
    instead of a root file. It is slower as it is implemented in pure python and
    should currently be considered a proof of concept.
    """
    def __init__(self, listname, variables, filename):
        """Constructor to initialize the internal state

        Arguments:
            listname(str): name of the particle list
            variables(list(str)): list of variables to save for each particle
            filename(str): name of the hdf5 file to be created
        """
        super().__init__()
        #: Output filename
        self._filename = filename
        #: Particle list name
        self._listname = listname
        #: List of variables
        self._variables = variables

[docs]    def initialize(self):
        """Create the hdf5 file and list of variable objects to be used during
        event processing."""
        # get variables from manager
        varnames = variable_manager.resolveCollections(std_vector(*self._variables))
        #: variable objects for each variable
        self._var_objects = [variable_manager.getVariable(n) for n in varnames]

        #: Event metadata
        self._evtmeta = Belle2.PyStoreObj("EventMetaData")
        self._evtmeta.isRequired()
        #: Pointer to the particle list
        self._plist = Belle2.PyStoreObj(self._listname)
        self._plist.isRequired()

        #: The hdf5 file
        self._hdf5file = tables.open_file(self._filename, mode="w", title="Belle2 Variables to HDF5")
        if not self._hdf5file:
            basf2.B2ERROR("Cannot create output file")
            return

        dtype = [("exp", np.int32), ("run", np.int32), ("evt", np.uint32), ("icand", np.uint32), ("ncand", np.uint32)]
        for v in self._var_objects:
            # only float variables for now
            dtype.append((v.name, np.float64))

        #: The data type
        self._dtype = dtype
        filters = tables.Filters(complevel=1, complib='blosc:lz4', fletcher32=False)
        # some variable names are not just A-Za-z0-9 so pytables complains but
        # seems to work. Ignore warning
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            #: The pytable
            self._table = self._hdf5file.create_table("/", self._listname, obj=np.zeros(0, dtype), filters=filters)

[docs]    def event(self):
        """Create a new row in the hdf5 file with for each particle in the list"""
        buf = np.empty(self._plist.getListSize(), dtype=self._dtype)
        # add some extra columns for bookkeeping
        buf["exp"] = self._evtmeta.getExperiment()
        buf["run"] = self._evtmeta.getRun()
        buf["evt"] = self._evtmeta.getEvent()
        buf["ncand"] = len(buf)
        buf["icand"] = np.arange(len(buf))

        for row, p in zip(buf, self._plist):
            for v in self._var_objects:
                # pyroot proxy not working with callables, we should fix this.
                # For now we need to go back by name and call it.
                # should be `row[v.name] = v.func(p)`
                row[v.name] = variable_manager.evaluate(v.name, p)

        self._table.append(buf)

[docs]    def terminate(self):
        """save and close the output"""
        self._table.flush()
        self._hdf5file.close()


def make_mcerrors_readable(dataframe, column="mcErrors"):
    """
    Take a dataframe containing an column with the output of the :b2:var:`mcErrors`
    variable from :b2:mod:`VariablesToNTuple` and convert it to a readable set
    of columns of the form ``{column}_{name}`` where column is the value of the
    ``column`` argument and ``name`` is one of one of the :ref:`mcmatching`
    error flags (without the leading 'c_').

    Arguments:
        dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
                with column containing the output of the  mcErrors variable
        column(str): the name containing the values from the mcErrors variable
    """

    if column not in dataframe:
        raise KeyError(f"Cannot find coulumn '{column}'")

    # convert mcErrors to int to be able to logical operate on it
    mcErrors = dataframe[column].astype(int)

    # and loop over all the c_ constants in the Belle2.MCMatching class
    for flag in (e for e in dir(Belle2.MCMatching) if e.startswith("c_")):
        try:
            value = int(getattr(Belle2.MCMatching, flag))
        except ValueError:
            # probably the extraInfo column name, ignore
            continue

        # and set the column
        name = column + flag[1:]
        if value == 0:
            dataframe[name] = mcErrors == 0
        else:
            dataframe[name] = (mcErrors & value) == value


# This is just for testing, no need for doxygen to weirdly document it
# @cond
if __name__ == "__main__":
    import modularAnalysis

    p = basf2.create_path()
    p.add_module("EventInfoSetter", evtNumList=100)
    p.add_module("EvtGenInput")
    modularAnalysis.fillParticleListsFromMC([("pi-:gen", "")], path=p)
    a = VariablesToHDF5("pi-:gen", ["M", "E", "px", "py", "pz"], "test.hdf5")
    p.add_module(a)
    # Process the events
    basf2.process(p)
    print(basf2.statistics)
# @endcond