Belle II Software development
b2pandas_utils.py
from typing import List, Optional
import basf2
import variables
import tables
import numpy as np
import warnings
from pyarrow.parquet import ParquetWriter
from pyarrow.csv import CSVWriter
from pyarrow import ipc
import pyarrow as pa


"""
Python utilities to help create or manage ntuples and work with them in pandas
"""

numpy_to_pyarrow_type_map = {
    np.int32: pa.int32(),
    np.int64: pa.int64(),
    np.uint32: pa.uint32(),
    np.uint64: pa.uint64(),
    np.float32: pa.float32(),
    np.float64: pa.float64(),
    np.bool_: pa.bool_(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}
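
# Illustrative note (not part of the original file): the map above is how the
# initialize_*_writer methods below translate the numpy dtypes in self._dtypes
# into a pyarrow schema. A minimal sketch, assuming a single float column "p":
#
#   schema = pa.schema([("p", numpy_to_pyarrow_type_map[np.float64])])
#   assert schema.field("p").type == pa.float64()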

class VariablesToTable(basf2.Module):
    """
    Base class to dump ntuples into a non root format of your choosing
    """

    def __init__(
        self,
        listname: str,
        variables: List[str],
        filename: str,
        hdf_table_name: Optional[str] = None,
        event_buffer_size: int = 100,
        **writer_kwargs,
    ):
52 """Constructor to initialize the internal state
53
54 Arguments:
55 listname(str): name of the particle list
56 variables(list(str)): list of variables to save for each particle
57 filename(str): name of the output file to be created.
58 Needs to end with `.csv` for csv output, `.parquet` or `.pq` for parquet output,
59 `.h5`, `.hdf` or `.hdf5` for hdf5 output and `.feather` or `.arrow` for feather output
60 hdf_table_name(str): name of the table in the hdf5 file.
61 If not provided, it will be the same as the listname
62 event_buffer_size(int): number of events to buffer before writing to disk,
63 higher values will use more memory but write faster and result in smaller files
64 **writer_kwargs: additional keyword arguments to pass to the writer.
65 For details, see the documentation of the writer in the apache arrow documentation.
66 Only use, if you know what you are doing!
67 """
        super().__init__()

        self._filename = filename

        self._listname = listname

        self._variables = list(set(variables))

        file_type = self._filename.split(".")[-1]
        if file_type in ["csv"]:
            self._format = "csv"
        elif file_type in ["parquet", "pq"]:
            self._format = "parquet"
        elif file_type in ["h5", "hdf", "hdf5"]:
            self._format = "hdf5"
        elif file_type in ["feather", "arrow"]:
            self._format = "feather"
        else:
            raise ValueError(
                f"Unknown file type ending .{file_type}, supported types are 'csv', "
                "'parquet', 'pq', 'h5', 'hdf', 'hdf5', 'feather' or 'arrow'"
            )

        #: Table name in the hdf5 file
        self._table_name = (
            hdf_table_name if hdf_table_name is not None else self._listname
        )
        #: Event buffer size
        self._event_buffer_size = event_buffer_size
        #: Event buffer counter
        self._event_buffer_counter = 0

        self._writer_kwargs = writer_kwargs

    def initialize(self):
        """
        Setup variable lists, pointers, buffers and file writers
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa

        self._varnames = [
            str(varname)
            for varname in variables.variables.resolveCollections(
                variables.std_vector(*self._variables)
            )
        ]

        #: std::vector of variable names
        self._std_varnames = variables.std_vector(*self._varnames)

        self._evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
        self._evtmeta.isRequired()

        #: Pointer to the particle list
        self._plist = ROOT.Belle2.PyStoreObj(self._listname)
        self._plist.isRequired()

        dtypes = [
            ("__experiment__", np.int32),
            ("__run__", np.int32),
            ("__event__", np.uint32),
            ("__production__", np.uint32),
            ("__candidate__", np.uint32),
            ("__ncandidates__", np.uint32),
        ]
        for name in self._varnames:
            # only float variables for now
            dtypes.append((name, np.float64))

        self._dtypes = dtypes

        #: event variables buffer (will be automatically grown if necessary)
        self._buffer = np.empty(self._event_buffer_size * 10, dtype=self._dtypes)

        #: current start index in the event variables buffer
        self._buffer_index = 0

        if self._format == "hdf5":
            self.initialize_hdf5_writer()
        elif self._format == "parquet":
            self.initialize_parquet_writer()
        elif self._format == "csv":
            self.initialize_csv_writer()
        elif self._format == "feather":
            self.initialize_feather_writer()

    @property
    def buffer(self):
        """
        The buffer slice across multiple entries
        """
        return self._buffer[:self._buffer_index]

    @property
    def event_buffer(self):
        """
        The buffer slice for the current event
        """
        return self._buffer[self._buffer_index - self._plist.getListSize(): self._buffer_index]

    def clear_buffer(self):
        """
        Reset the buffer event counter and index
        """
        self._event_buffer_counter = 0
        self._buffer_index = 0

    def append_buffer(self):
        """
        "Append" a new event to the buffer by moving the buffer index forward by the particle list size

        Automatically replaces the buffer by a larger one if necessary
        """
        plist_size = self._plist.getListSize()
        if (plist_size + self._buffer_index) > len(self._buffer):
            new_buffer = np.empty(
                # factor 1.5 larger or at least as large as necessary
                max(int(len(self._buffer) * 1.5), self._buffer_index + plist_size),
                dtype=self._dtypes,
            )
            new_buffer[:self._buffer_index] = self.buffer
            self._buffer = new_buffer
        self._buffer_index += plist_size
        self._event_buffer_counter += 1
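
    # Illustrative note (not part of the original module): the growth policy
    # above enlarges the buffer to max(1.5 x the current size, exactly what is
    # needed). For example, with len(self._buffer) == 1000, _buffer_index == 950
    # and a particle list of 100 candidates, the new buffer holds
    # max(int(1000 * 1.5), 950 + 100) == 1500 entries.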

    def initialize_feather_writer(self):
        """
        Initialize the feather writer using pyarrow
        """
        #: A list of tuples and pa.DataTypes to define the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: a writer object to write data into a feather file
        self._feather_writer = ipc.RecordBatchFileWriter(
            sink=self._filename,
            schema=pa.schema(self._schema),
            **self._writer_kwargs,
        )

    def initialize_parquet_writer(self):
        """
        Initialize the parquet writer using pyarrow
        """
        #: A list of tuples and pa.DataTypes to define the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: a writer object to write data into a parquet file
        self._parquet_writer = ParquetWriter(
            self._filename, schema=pa.schema(self._schema), **self._writer_kwargs
        )

    def initialize_csv_writer(self):
        """
        Initialize the csv writer using pyarrow
        """
        #: A list of tuples and pa.DataTypes to define the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: a writer object to write data into a csv file
        self._csv_writer = CSVWriter(self._filename, schema=pa.schema(self._schema), **self._writer_kwargs)

    def initialize_hdf5_writer(self):
        """
        Initialize the hdf5 writer using pytables
        """
        self._hdf5_writer = tables.open_file(
            self._filename, mode="w", title="Belle2 Variables to HDF5"
        )
        filters = tables.Filters(complevel=1, complib="blosc:lz4", fletcher32=False)

        # some variable names are not just A-Za-z0-9 so pytables complains but
        # seems to work. Ignore warning
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            self._table = self._hdf5_writer.create_table(
                "/", self._table_name, obj=np.zeros(0, self._dtypes), filters=filters, **self._writer_kwargs
            )
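
    # Illustrative note (not part of the original module): the table written
    # here can be read back as a numpy structured array with pytables. The
    # file and table names below are assumptions for the example only.
    #
    #   import tables
    #   with tables.open_file("ntuple.h5") as f:
    #       arr = f.get_node("/", "my_list").read()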

    def fill_event_buffer(self):
        """
        Assign values for all variables for all particles in the particle list to the current event buffer
        """
        buf = self.event_buffer

        # add some extra columns for bookkeeping
        buf["__experiment__"] = self._evtmeta.getExperiment()
        buf["__run__"] = self._evtmeta.getRun()
        buf["__event__"] = self._evtmeta.getEvent()
        buf["__production__"] = self._evtmeta.getProduction()
        buf["__ncandidates__"] = len(buf)
        buf["__candidate__"] = np.arange(len(buf))

        # fill variables into buffer
        vector = variables.variables.evaluateVariables(self._std_varnames, self._plist)
        values = np.array(vector.data()).reshape(-1, len(self._varnames))
        for name, col in zip(self._varnames, values.T):
            buf[name] = col

    @property
    def buffer_full(self):
        """
        check if the buffer is full
        """
        return self._event_buffer_counter == self._event_buffer_size

    def write_buffer(self):
        """
        write the buffer to the output file
        """
        if self._format == "hdf5":
            # append a new row to the hdf5 table for each particle in the buffer
            self._table.append(self.buffer)
        else:
            table = {name: self.buffer[name] for name, _ in self._dtypes}
            pa_table = pa.table(table, schema=pa.schema(self._schema))
            if self._format == "parquet":
                self._parquet_writer.write_table(pa_table)
            elif self._format == "csv":
                self._csv_writer.write(pa_table)
            elif self._format == "feather":
                self._feather_writer.write_table(pa_table)

    def event(self):
        """
        Event processing function

        executes the fill_event_buffer function and writes the data to the output file
        in chunks of event_buffer_size
        """
        self.append_buffer()
        self.fill_event_buffer()
        if self.buffer_full:
            self.write_buffer()
            self.clear_buffer()

    def terminate(self):
        """save and close the output"""
        import ROOT  # noqa
        if len(self.buffer) > 0:
            self.write_buffer()

        if self._format == "hdf5":
            self._table.flush()
            self._hdf5_writer.close()
        elif self._format == "parquet":
            self._parquet_writer.close()
        elif self._format == "csv":
            self._csv_writer.close()
        elif self._format == "feather":
            self._feather_writer.close()
        ROOT.Belle2.MetadataService.Instance().addNtuple(self._filename)
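
# Illustrative usage sketch (not part of the original module): attach the module
# to a basf2 path to dump candidates into a parquet file. The particle list name
# and variables below are assumptions for the example only.
#
#   import basf2
#   from b2pandas_utils import VariablesToTable
#
#   main = basf2.Path()
#   # ... input and reconstruction modules that fill 'pi+:all' go here ...
#   main.add_module(VariablesToTable("pi+:all", ["p", "E"], "pions.parquet"))
#   basf2.process(main)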


class VariablesToHDF5(VariablesToTable):
    """
    Legacy class to not break existing code
    """

    def __init__(self, listname, variables, filename, hdf_table_name: Optional[str] = None):
        super().__init__(listname, variables, filename, hdf_table_name)
        assert self._filename.split(".")[-1] in ["h5", "hdf", "hdf5"], (
            "Filename must end with .h5, .hdf or .hdf5 for HDF5 output. "
            f"Got {self._filename}"
        )


def make_mcerrors_readable(dataframe, column="mcErrors"):
    """
    Take a dataframe containing a column with the output of the :b2:var:`mcErrors`
    variable from :b2:mod:`VariablesToNTuple` and convert it to a readable set
    of columns of the form ``{column}_{name}`` where column is the value of the
    ``column`` argument and ``name`` is one of the :ref:`mcmatching`
    error flags (without the leading 'c_').

    Arguments:
        dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
            with a column containing the output of the mcErrors variable
        column(str): the name of the column containing the values from the mcErrors variable
    """
    # Always avoid the top-level 'import ROOT'.
    import ROOT  # noqa

    if column not in dataframe:
        raise KeyError(f"Cannot find column '{column}'")

    # convert mcErrors to int to be able to do logical operations on it
    mcErrors = dataframe[column].astype(int)

    # and loop over all the c_ constants in the Belle2.MCMatching class
    for flag in (e for e in dir(ROOT.Belle2.MCMatching) if e.startswith("c_")):
        try:
            value = int(getattr(ROOT.Belle2.MCMatching, flag))
        except ValueError:
            # probably the extraInfo column name, ignore
            continue

        # and set the column
        name = column + flag[1:]
        if value == 0:
            dataframe[name] = mcErrors == 0
        else:
            dataframe[name] = (mcErrors & value) == value
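
# Illustrative usage sketch (not part of the original module): expand the
# mcErrors bitmask of an ntuple loaded into pandas. The file name below is an
# assumption for the example only.
#
#   import pandas as pd
#   df = pd.read_parquet("ntuple.parquet")
#   make_mcerrors_readable(df)
#   # e.g. df["mcErrors_MisID"] is now a boolean flag column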