Belle II Software development
b2pandas_utils.py
1
8
9import basf2
10import variables
11import tables
12import numpy as np
13import warnings
14
15
16"""
17Python utilities to help create or manage ntuples and work with them in pandas
18"""
19
20
21class VariablesToHDF5(basf2.Module):
22 """
23 Dump variables directly to HDF5
24
25 This Module is the equivalent of VariablesToNtuple but creates an hdf5 file
26 instead of a root file. It is slower as it is implemented in pure python and
27 should currently be considered a proof of concept.
28 """
29
30 def __init__(self, listname, variables, filename):
31 """Constructor to initialize the internal state
32
33 Arguments:
34 listname(str): name of the particle list
35 variables(list(str)): list of variables to save for each particle
36 filename(str): name of the hdf5 file to be created
37 """
38 super().__init__()
39
40 self._filename = filename
41
42 self._listname = listname
43
44 self._variables = variables
45
46 def initialize(self):
47 """Create the hdf5 file and list of variable objects to be used during
48 event processing."""
49 # Always avoid the top-level 'import ROOT'.
50 import ROOT # noqa
51
52 self._varnames = [
53 str(varname) for varname in variables.variables.resolveCollections(
55 *self._variables))]
56
57 self._var_objects = [variables.variables.getVariable(n) for n in self._varnames]
58
59
60 self._evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
61 self._evtmeta.isRequired()
62
63 self._plist = ROOT.Belle2.PyStoreObj(self._listname)
64 self._plist.isRequired()
65
66
67 self._hdf5file = tables.open_file(self._filename, mode="w", title="Belle2 Variables to HDF5")
68 if not self._hdf5file:
69 basf2.B2ERROR("Cannot create output file")
70 return
71
72 dtype = [("exp", np.int32), ("run", np.int32), ("evt", np.uint32),
73 ("prod", np.uint32), ("icand", np.uint32), ("ncand", np.uint32)]
74 for name in self._varnames:
75 # only float variables for now
76 dtype.append((name, np.float64))
77
78
79 self._dtype = dtype
80 filters = tables.Filters(complevel=1, complib='blosc:lz4', fletcher32=False)
81 # some variable names are not just A-Za-z0-9 so pytables complains but
82 # seems to work. Ignore warning
83 with warnings.catch_warnings():
84 warnings.simplefilter("ignore")
85
86 self._table = self._hdf5file.create_table("/", self._listname, obj=np.zeros(0, dtype), filters=filters)
87
88 def event(self):
89 """Create a new row in the hdf5 file with for each particle in the list"""
90 buf = np.empty(self._plist.getListSize(), dtype=self._dtype)
91 # add some extra columns for bookkeeping
92 buf["exp"] = self._evtmeta.getExperiment()
93 buf["run"] = self._evtmeta.getRun()
94 buf["evt"] = self._evtmeta.getEvent()
95 buf["prod"] = self._evtmeta.getProduction()
96 buf["ncand"] = len(buf)
97 buf["icand"] = np.arange(len(buf))
98
99 for row, p in zip(buf, self._plist):
100 for name, v in zip(self._varnames, self._var_objects):
101 # pyroot proxy not working with callables, we should fix this.
102 # For now we need to go back by name and call it.
103 # should be `row[v.name] = v.func(p)`
104 row[name] = variables.variables.evaluate(v.name, p)
105
106 self._table.append(buf)
107
108 def terminate(self):
109 """save and close the output"""
110 self._table.flush()
111 self._hdf5file.close()
112 import ROOT
113 ROOT.Belle2.MetadataService.Instance().addHDF5File(self._filename)
114
115
116def make_mcerrors_readable(dataframe, column="mcErrors"):
117 """
118 Take a dataframe containing an column with the output of the :b2:var:`mcErrors`
119 variable from :b2:mod:`VariablesToNTuple` and convert it to a readable set
120 of columns of the form ``{column}_{name}`` where column is the value of the
121 ``column`` argument and ``name`` is one of one of the :ref:`mcmatching`
122 error flags (without the leading 'c_').
123
124 Arguments:
125 dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
126 with column containing the output of the mcErrors variable
127 column(str): the name containing the values from the mcErrors variable
128 """
129 # Always avoid the top-level 'import ROOT'.
130 import ROOT # noqa
131
132 if column not in dataframe:
133 raise KeyError(f"Cannot find column '{column}'")
134
135 # convert mcErrors to int to be able to logical operate on it
136 mcErrors = dataframe[column].astype(int)
137
138 # and loop over all the c_ constants in the Belle2.MCMatching class
139 for flag in (e for e in dir(ROOT.Belle2.MCMatching) if e.startswith("c_")):
140 try:
141 value = int(getattr(ROOT.Belle2.MCMatching, flag))
142 except ValueError:
143 # probably the extraInfo column name, ignore
144 continue
145
146 # and set the column
147 name = column + flag[1:]
148 if value == 0:
149 dataframe[name] = mcErrors == 0
150 else:
151 dataframe[name] = (mcErrors & value) == value
152
153
154# This is just for testing, no need for doxygen to weirdly document it
155# @cond
156if __name__ == "__main__":
157 import modularAnalysis
158
159 p = basf2.create_path()
160 p.add_module("EventInfoSetter", evtNumList=100)
161 p.add_module("EvtGenInput")
162 modularAnalysis.fillParticleListsFromMC([("pi-:gen", "")], path=p)
163 a = VariablesToHDF5("pi-:gen", ["M", "E", "px", "py", "pz"], "test.hdf5")
164 p.add_module(a)
165 # Process the events
166 basf2.process(p)
167 print(basf2.statistics)
168# @endcond
def std_vector(*args)
Definition: __init__.py:135
_plist
Pointer to the particle list.
_var_objects
variable objects for each variable
_listname
Particle list name.
def __init__(self, listname, variables, filename)
def fillParticleListsFromMC(decayStringsWithCuts, addDaughters=False, skipNonPrimaryDaughters=False, writeOut=False, path=None, skipNonPrimary=False, skipInitial=True)