# Source code for b2pandas_utils
##########################################################################
# basf2 (Belle II Analysis Software Framework) #
# Author: The Belle II Collaboration #
# #
# See git log for contributors and copyright holders. #
# This file is licensed under LGPL-3.0, see LICENSE.md. #
##########################################################################
import basf2
import variables
import tables
import numpy as np
import warnings
"""
Python utilities to help create or manage ntuples and work with them in pandas
"""
# [docs]
class VariablesToHDF5(basf2.Module):
    """
    Dump variables directly to HDF5

    This Module is the equivalent of VariablesToNtuple but writes an hdf5 file
    where VariablesToNtuple writes a root file. Being implemented in pure
    python it is slower, and it should currently be considered a proof of
    concept.
    """

    def __init__(self, listname, variables, filename):
        """Constructor: remember the configuration for later use

        Arguments:
            listname(str): name of the particle list
            variables(list(str)): list of variables to save for each particle
            filename(str): name of the hdf5 file to be created
        """
        super().__init__()
        #: Output filename
        self._filename = filename
        #: Particle list name
        self._listname = listname
        #: List of variables
        self._variables = variables

    def initialize(self):
        """Open the hdf5 file, resolve the requested variables and create the
        (still empty) output table that `event` appends to."""
        # Always avoid the top-level 'import ROOT'.
        import ROOT  # noqa

        manager = variables.variables
        requested = variables.std_vector(*self._variables)
        #: variable names
        self._varnames = [str(entry) for entry in manager.resolveCollections(requested)]
        #: variable objects for each variable
        self._var_objects = [manager.getVariable(entry) for entry in self._varnames]

        #: Event metadata
        self._evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
        self._evtmeta.isRequired()
        #: Pointer to the particle list
        self._plist = ROOT.Belle2.PyStoreObj(self._listname)
        self._plist.isRequired()

        #: The hdf5 file
        self._hdf5file = tables.open_file(self._filename, mode="w", title="Belle2 Variables to HDF5")
        if not self._hdf5file:
            basf2.B2ERROR("Cannot create output file")
            return

        # bookkeeping columns come first, then one column per variable
        bookkeeping = [("exp", np.int32), ("run", np.int32), ("evt", np.uint32),
                       ("prod", np.uint32), ("icand", np.uint32), ("ncand", np.uint32)]
        # only float variables for now
        #: The data type
        self._dtype = bookkeeping + [(column, np.float64) for column in self._varnames]

        compression = tables.Filters(complevel=1, complib='blosc:lz4', fletcher32=False)
        # some variable names are not just A-Za-z0-9 so pytables complains but
        # seems to work. Ignore warning
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            #: The pytable
            self._table = self._hdf5file.create_table(
                "/", self._listname, obj=np.zeros(0, self._dtype), filters=compression)

    def event(self):
        """Append one row to the hdf5 table for each particle in the list"""
        records = np.empty(self._plist.getListSize(), dtype=self._dtype)
        n_rows = len(records)

        # add some extra columns for bookkeeping; a scalar is broadcast to
        # every row of the event
        meta = self._evtmeta
        records["exp"] = meta.getExperiment()
        records["run"] = meta.getRun()
        records["evt"] = meta.getEvent()
        records["prod"] = meta.getProduction()
        records["ncand"] = n_rows
        records["icand"] = np.arange(n_rows)

        columns = list(zip(self._varnames, self._var_objects))
        for record, particle in zip(records, self._plist):
            for column, var in columns:
                # pyroot proxy not working with callables, we should fix this.
                # For now we need to go back by name and call it.
                # should be `record[var.name] = var.func(particle)`
                record[column] = variables.variables.evaluate(var.name, particle)

        self._table.append(records)

    def terminate(self):
        """Flush the table, close the output file and pass its name to the
        metadata service"""
        self._table.flush()
        self._hdf5file.close()
        import ROOT
        service = ROOT.Belle2.MetadataService.Instance()
        service.addHDF5File(self._filename)
def make_mcerrors_readable(dataframe, column="mcErrors"):
    """
    Take a dataframe containing a column with the output of the :b2:var:`mcErrors`
    variable from :b2:mod:`VariablesToNtuple` and convert it to a readable set
    of columns of the form ``{column}_{name}`` where column is the value of the
    ``column`` argument and ``name`` is one of the :ref:`mcmatching`
    error flags (without the leading 'c_').

    The new boolean columns are added to ``dataframe`` in place; nothing is
    returned.

    Arguments:
        dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
            with column containing the output of the mcErrors variable
        column(str): the name containing the values from the mcErrors variable

    Raises:
        KeyError: if ``column`` is not present in ``dataframe``
    """
    # Check the input before anything else: the test is cheap and a wrong
    # column name can then be reported without having to load ROOT.
    if column not in dataframe:
        raise KeyError(f"Cannot find column '{column}'")

    # Always avoid the top-level 'import ROOT'.
    import ROOT  # noqa

    mc_matching = ROOT.Belle2.MCMatching
    # convert mcErrors to int to be able to logical operate on it
    mcErrors = dataframe[column].astype(int)

    # and loop over all the c_ constants in the Belle2.MCMatching class
    for flag in dir(mc_matching):
        if not flag.startswith("c_"):
            continue
        try:
            value = int(getattr(mc_matching, flag))
        except ValueError:
            # probably the extraInfo column name, ignore
            continue

        # and set the column: "c_Foo" becomes "{column}_Foo"
        name = column + flag[1:]
        if value == 0:
            # a flag with value 0 has no bit to test, so compare for equality
            dataframe[name] = mcErrors == 0
        else:
            dataframe[name] = (mcErrors & value) == value
# This is just for testing, no need for doxygen to weirdly document it
# @cond
if __name__ == "__main__":
    import modularAnalysis

    # Minimal self-test: build a path, fill the "pi-:gen" list from MC and
    # write a few kinematic variables of it to test.hdf5
    main_path = basf2.create_path()
    main_path.add_module("EventInfoSetter", evtNumList=100)
    main_path.add_module("EvtGenInput")
    modularAnalysis.fillParticleListsFromMC([("pi-:gen", "")], path=main_path)
    main_path.add_module(VariablesToHDF5("pi-:gen", ["M", "E", "px", "py", "pz"], "test.hdf5"))

    # Process the events
    basf2.process(main_path)
    print(basf2.statistics)
# @endcond