Belle II Software light-2511-gacrux
b2pandas_utils.py
from typing import List, Optional
import basf2
import variables
import tables
import numpy as np
import warnings
from pyarrow.parquet import ParquetWriter
from pyarrow.csv import CSVWriter
from pyarrow import ipc
import pyarrow as pa


"""
Python utilities to help create or manage ntuples and work with them in pandas
"""

numpy_to_pyarrow_type_map = {
    np.int32: pa.int32(),
    np.int64: pa.int64(),
    np.uint32: pa.uint32(),
    np.uint64: pa.uint64(),
    np.float32: pa.float32(),
    np.float64: pa.float64(),
    np.bool_: pa.bool_(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}
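
# A minimal sketch of how this map is used below (the names are illustrative,
# not part of the module): a list of (column name, numpy dtype) pairs is
# translated into a pyarrow schema, e.g.
#
#     dtypes = [("__event__", np.uint32), ("px", np.float64)]
#     schema = pa.schema([(n, numpy_to_pyarrow_type_map[dt]) for n, dt in dtypes])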


class VariablesToTable(basf2.Module):
    """
    Base class to dump ntuples into a non-ROOT format of your choosing.

    Arguments:
        listname(str): name of the particle list
        variables(list[str]): list of variables to save for each particle
        filename(str): name of the output file to be created.
            Needs to end with ``.csv`` for csv output, ``.parquet`` or ``.pq`` for parquet output,
            ``.h5``, ``.hdf`` or ``.hdf5`` for hdf5 output and ``.feather`` or ``.arrow`` for feather output
        hdf_table_name(str): name of the table in the hdf5 file.
            If not provided, it will be the same as the listname. Defaults to None.
        event_buffer_size(int): number of events to buffer before writing to disk,
            higher values will use more memory but result in smaller files.
            For some formats, like parquet, this also sets the row group size. Defaults to 100.
        **writer_kwargs: additional keyword arguments to pass to the writer.
            For details, see the documentation of the respective writer in the Apache Arrow documentation.
            For HDF5, these are passed to ``tables.File.create_table``.
            Only use if you know what you are doing!
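
    Example:
        A minimal usage sketch; the steering path, particle list and variables
        are illustrative and must already exist in your steering file::

            import basf2
            from b2pandas_utils import VariablesToTable

            path = basf2.Path()
            # ... fill the path and create the particle list "pi+:good" ...
            path.add_module(
                VariablesToTable("pi+:good", ["px", "py", "pz"], "pions.parquet")
            )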
57 """

    def __init__(
        self,
        listname: str,
        variables: List[str],
        filename: str,
        hdf_table_name: Optional[str] = None,
        event_buffer_size: int = 100,
        **writer_kwargs,
    ):
68 """Constructor to initialize the internal state"""
69 super().__init__()
70
71 self._filename = filename
72
73 self._listname = listname
74
75 self._variables = list(set(variables))
76
77 file_type = self._filename.split(".")[-1]
78
79 if file_type in ["csv"]:
80 self._format = "csv"
81 elif file_type in ["parquet", "pq"]:
82 self._format = "parquet"
83 elif file_type in ["h5", "hdf", "hdf5"]:
84 self._format = "hdf5"
85 elif file_type in ["feather", "arrow"]:
86 self._format = "feather"
87 else:
88 raise ValueError(
89 f"Unknown file type ending .{file_type}, supported types are 'csv', "
90 "'parquet', 'pq', 'h5', 'hdf', 'hdf5', 'feather' or 'arrow'"
91 )
92
93 self._table_name = (
94 hdf_table_name if hdf_table_name is not None else self._listname
95 )
96
97 self._event_buffer_size = event_buffer_size
98
100
101 self._writer_kwargs = writer_kwargs
102
    def initialize(self):
        """
        Setup variable lists, pointers, buffers and file writers
        """
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa

        #: Resolved variable names
        self._varnames = [
            str(varname)
            for varname in variables.variables.resolveCollections(
                variables.std_vector(*self._variables)
            )
        ]

        #: std.vector of variable names
        self._std_varnames = variables.std_vector(*self._varnames)

        #: Event metadata object
        self._evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
        self._evtmeta.isRequired()

        #: Pointer to the particle list
        self._plist = ROOT.Belle2.PyStoreObj(self._listname)
        self._plist.isRequired()

        dtypes = [
            ("__experiment__", np.int32),
            ("__run__", np.int32),
            ("__event__", np.uint32),
            ("__production__", np.uint32),
            ("__candidate__", np.uint32),
            ("__ncandidates__", np.uint32),
        ]
        for name in self._varnames:
            # only float variables for now
            dtypes.append((name, np.float64))

        #: The data types of all columns
        self._dtypes = dtypes

        #: Event variables buffer (will be automatically grown if necessary)
        self._buffer = np.empty(self._event_buffer_size * 10, dtype=self._dtypes)

        #: Current start index in the event variables buffer
        self._buffer_index = 0

        if self._format == "hdf5":
            self.initialize_hdf5_writer()
        elif self._format == "parquet":
            self.initialize_parquet_writer()
        elif self._format == "csv":
            self.initialize_csv_writer()
        elif self._format == "feather":
            self.initialize_feather_writer()

    @property
    def buffer(self):
        """
        The buffer slice across multiple entries
        """
        return self._buffer[:self._buffer_index]

    @property
    def event_buffer(self):
        """
        The buffer slice for the current event
        """
        return self._buffer[self._buffer_index - self._plist.getListSize(): self._buffer_index]

    def clear_buffer(self):
        """
        Reset the buffer event counter and index
        """
        self._event_buffer_counter = 0
        self._buffer_index = 0

    def append_buffer(self):
        """
        "Append" a new event to the buffer by moving the buffer index forward by the particle list size.

        Automatically replaces the buffer with a larger one if necessary.
        """
        plist_size = self._plist.getListSize()
        if (plist_size + self._buffer_index) > len(self._buffer):
            new_buffer = np.empty(
                # factor 1.5 larger or at least as large as necessary
                max(int(len(self._buffer) * 1.5), self._buffer_index + plist_size),
                dtype=self._dtypes,
            )
            new_buffer[:self._buffer_index] = self.buffer
            self._buffer = new_buffer
        self._buffer_index += plist_size
        self._event_buffer_counter += 1
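
    # Growth arithmetic, with illustrative numbers: with event_buffer_size=100
    # the initial buffer holds 100 * 10 = 1000 rows; an event with 50 candidates
    # arriving at _buffer_index = 980 grows it to max(int(1000 * 1.5), 1030) = 1500 rows.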

    def initialize_feather_writer(self):
        """
        Initialize the feather writer using pyarrow
        """
        #: A list of (name, pyarrow.DataType) tuples that defines the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: Writer object used to write data into a feather file
        self._feather_writer = ipc.RecordBatchFileWriter(
            sink=self._filename,
            schema=pa.schema(self._schema),
            **self._writer_kwargs,
        )

    def initialize_parquet_writer(self):
        """
        Initialize the parquet writer using pyarrow
        """
        #: A list of (name, pyarrow.DataType) tuples that defines the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: The pyarrow parquet writer object
        self._parquet_writer = ParquetWriter(
            self._filename, schema=pa.schema(self._schema), **self._writer_kwargs
        )

    def initialize_csv_writer(self):
        """
        Initialize the csv writer using pyarrow
        """
        #: A list of (name, pyarrow.DataType) tuples that defines the pyarrow schema
        self._schema = [
            (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
        ]
        #: The pyarrow CSV writer object
        self._csv_writer = CSVWriter(self._filename, schema=pa.schema(self._schema), **self._writer_kwargs)

    def initialize_hdf5_writer(self):
        """
        Initialize the hdf5 writer using pytables
        """
        #: The pytables file object
        self._hdf5_writer = tables.open_file(
            self._filename, mode="w", title="Belle2 Variables to HDF5"
        )
        filters = tables.Filters(complevel=1, complib="blosc:lz4", fletcher32=False)

        # some variable names are not just A-Za-z0-9 so pytables complains but
        # seems to work. Ignore the warning.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            #: The pytables table object
            self._table = self._hdf5_writer.create_table(
                "/", self._table_name, obj=np.zeros(0, self._dtypes), filters=filters, **self._writer_kwargs
            )

    def fill_event_buffer(self):
        """
        Assign values for all variables for all particles in the particle list to the current event buffer
        """
        buf = self.event_buffer

        # add some extra columns for bookkeeping
        buf["__experiment__"] = self._evtmeta.getExperiment()
        buf["__run__"] = self._evtmeta.getRun()
        buf["__event__"] = self._evtmeta.getEvent()
        buf["__production__"] = self._evtmeta.getProduction()
        buf["__ncandidates__"] = len(buf)
        buf["__candidate__"] = np.arange(len(buf))

        # fill variables into buffer
        vector = variables.variables.evaluateVariables(self._std_varnames, self._plist)
        values = np.array(vector.data()).reshape(-1, len(self._varnames))
        for name, col in zip(self._varnames, values.T):
            buf[name] = col

    @property
    def buffer_full(self):
        """
        Check whether the buffer is full
        """
        return self._event_buffer_counter == self._event_buffer_size

    def write_buffer(self):
        """
        Write the buffer to the output file
        """
        if self._format == "hdf5":
            # append one new row per particle in the list to the hdf5 table
            # \cond false positive doxygen warning
            self._table.append(self.buffer)
            # \endcond
        else:
            table = {name: self.buffer[name] for name, _ in self._dtypes}
            pa_table = pa.table(table, schema=pa.schema(self._schema))
            if self._format == "parquet":
                self._parquet_writer.write_table(pa_table)
            elif self._format == "csv":
                self._csv_writer.write(pa_table)
            elif self._format == "feather":
                self._feather_writer.write_table(pa_table)

    def event(self):
        """
        Event processing function

        Executes the fill_event_buffer function and writes the data to the output file
        in chunks of event_buffer_size.
        """
        self.append_buffer()
        self.fill_event_buffer()
        # \cond false positive doxygen warning
        if self.buffer_full:
            self.write_buffer()
            self.clear_buffer()
        # \endcond

    def terminate(self):
        """Save and close the output"""
        import ROOT  # noqa
        # \cond false positive doxygen warning
        if len(self.buffer) > 0:
            self.write_buffer()
        # \endcond

        if self._format == "hdf5":
            self._table.flush()
            self._hdf5_writer.close()
        elif self._format == "parquet":
            self._parquet_writer.close()
        elif self._format == "csv":
            self._csv_writer.close()
        elif self._format == "feather":
            self._feather_writer.close()
        ROOT.Belle2.MetadataService.Instance().addNtuple(self._filename)


class VariablesToHDF5(VariablesToTable):
    """
    Legacy class kept so that existing code does not break.

    This class is a wrapper around `VariablesToTable` that enforces HDF5 output
    and uses default settings for buffer size and writer arguments.
    It mostly exists for legacy reasons; new code should use `VariablesToTable` directly.

    Arguments:
        listname(str): name of the particle list
        variables(list[str]): list of variables to save for each particle
        filename(str): name of the output file to be created.
            Must end with ``.h5``, ``.hdf`` or ``.hdf5``.
        hdf_table_name(str): name of the table in the hdf5 file.
            If not provided, it will be the same as the listname. Defaults to None.
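
    Example:
        A minimal sketch; the steering ``path``, particle list and variables
        are illustrative::

            path.add_module(
                VariablesToHDF5("pi+:good", ["px", "py", "pz"], "pions.h5")
            )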
352 """

    def __init__(self, listname, variables, filename, hdf_table_name: Optional[str] = None):
        """
        Constructor for the legacy HDF5 writer.
        """
        super().__init__(listname, variables, filename, hdf_table_name)
        assert self._filename.split(".")[-1] in ["h5", "hdf", "hdf5"], (
            "Filename must end with .h5, .hdf or .hdf5 for HDF5 output. "
            f"Got {self._filename}"
        )


def make_mcerrors_readable(dataframe, column="mcErrors"):
    """
    Take a dataframe containing a column with the output of the :b2:var:`mcErrors`
    variable from :b2:mod:`VariablesToNTuple` and convert it to a readable set
    of columns of the form ``{column}_{name}``, where ``column`` is the value of the
    ``column`` argument and ``name`` is one of the :ref:`mcmatching`
    error flags (without the leading ``c_``).

    Arguments:
        dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
            with a column containing the output of the mcErrors variable
        column(str): the name of the column containing the values from the mcErrors variable
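
    Example:
        A minimal sketch, assuming ``df`` is a pandas dataframe that already
        contains an ``mcErrors`` column (the selection below is illustrative)::

            make_mcerrors_readable(df)
            # boolean flag columns are added in place, e.g. select correctly
            # matched candidates:
            correct = df[df["mcErrors_Correct"]]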
377 """
    # Always avoid the top-level 'import ROOT'.
    import ROOT  # noqa

    if column not in dataframe:
        raise KeyError(f"Cannot find column '{column}'")

    # convert mcErrors to int to be able to do logical operations on it
    mcErrors = dataframe[column].astype(int)

    # and loop over all the c_ constants in the Belle2.MCMatching class
    for flag in (e for e in dir(ROOT.Belle2.MCMatching) if e.startswith("c_")):
        try:
            value = int(getattr(ROOT.Belle2.MCMatching, flag))
        except ValueError:
            # probably the extraInfo column name, ignore
            continue

        # and set the column
        name = column + flag[1:]
        if value == 0:
            dataframe[name] = mcErrors == 0
        else:
            dataframe[name] = (mcErrors & value) == value