Belle II Software  release-05-02-19
b2pandas_utils.py
1 import basf2
2 from ROOT import Belle2
3 import tables
4 import numpy as np
5 import warnings
6 from variables import variables as variable_manager
7 from variables import std_vector
8 
9 """
10 Python utilities to help create or manage ntuples and work with them in pandas
11 """
12 
13 
class VariablesToHDF5(basf2.Module):
    """
    Dump variables directly to HDF5

    This Module is the equivalent of VariablesToNtuple but creates an hdf5 file
    instead of a root file. It is slower as it is implemented in pure python and
    should currently be considered a proof of concept.
    """

    def __init__(self, listname, variables, filename):
        """Constructor to initialize the internal state

        Arguments:
            listname(str): name of the particle list
            variables(list(str)): list of variables to save for each particle
            filename(str): name of the hdf5 file to be created
        """
        super().__init__()
        #: Output filename
        self._filename = filename
        #: Particle list name
        self._listname = listname
        #: List of variables
        self._variables = variables

    def initialize(self):
        """Create the hdf5 file and list of variable objects to be used during
        event processing."""
        # get variables from manager (collections are expanded to their members)
        varnames = variable_manager.resolveCollections(std_vector(*self._variables))
        #: variable objects for each variable
        self._var_objects = [variable_manager.getVariable(n) for n in varnames]

        #: Event metadata
        self._evtmeta = Belle2.PyStoreObj("EventMetaData")
        self._evtmeta.isRequired()
        #: Pointer to the particle list
        self._plist = Belle2.PyStoreObj(self._listname)
        self._plist.isRequired()

        #: The hdf5 file
        self._hdf5file = tables.open_file(self._filename, mode="w", title="Belle2 Variables to HDF5")
        if not self._hdf5file:
            basf2.B2ERROR("Cannot create output file")
            return

        # bookkeeping columns first, then one column per requested variable
        dtype = [("exp", np.int32), ("run", np.int32), ("evt", np.uint32), ("icand", np.uint32), ("ncand", np.uint32)]
        # only float variables for now
        dtype.extend((v.name, np.float64) for v in self._var_objects)

        #: The data type
        self._dtype = dtype
        filters = tables.Filters(complevel=1, complib='blosc:lz4', fletcher32=False)
        # some variable names are not just A-Za-z0-9 so pytables complains but
        # seems to work. Ignore warning
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            #: The pytable
            self._table = self._hdf5file.create_table("/", self._listname, obj=np.zeros(0, dtype), filters=filters)

    def event(self):
        """Create a new row in the hdf5 file for each particle in the list"""
        buf = np.empty(self._plist.getListSize(), dtype=self._dtype)
        # add some extra columns for bookkeeping
        buf["exp"] = self._evtmeta.getExperiment()
        buf["run"] = self._evtmeta.getRun()
        buf["evt"] = self._evtmeta.getEvent()
        buf["ncand"] = len(buf)
        buf["icand"] = np.arange(len(buf))

        for row, p in zip(buf, self._plist):
            for v in self._var_objects:
                # pyroot proxy not working with callables, we should fix this.
                # For now we need to go back by name and call it.
                # should be `row[v.name] = v.func(p)`
                row[v.name] = variable_manager.evaluate(v.name, p)

        self._table.append(buf)

    def terminate(self):
        """save and close the output"""
        self._table.flush()
        self._hdf5file.close()
97 
98 
def make_mcerrors_readable(dataframe, column="mcErrors"):
    """
    Take a dataframe containing a column with the output of the :b2:var:`mcErrors`
    variable from :b2:mod:`VariablesToNTuple` and convert it to a readable set
    of columns of the form ``{column}_{name}`` where column is the value of the
    ``column`` argument and ``name`` is one of the :ref:`mcmatching`
    error flags (without the leading 'c_').

    The new boolean columns are added to ``dataframe`` in place; nothing is
    returned.

    Arguments:
        dataframe(pandas.DataFrame): the pandas dataframe containing an ntuple
            with column containing the output of the mcErrors variable
        column(str): the name containing the values from the mcErrors variable

    Raises:
        KeyError: if ``column`` is not present in ``dataframe``
    """

    if column not in dataframe:
        raise KeyError(f"Cannot find column '{column}'")

    # convert mcErrors to int to be able to logical operate on it
    mcErrors = dataframe[column].astype(int)

    # and loop over all the c_ constants in the Belle2.MCMatching class
    for flag in (e for e in dir(Belle2.MCMatching) if e.startswith("c_")):
        try:
            value = int(getattr(Belle2.MCMatching, flag))
        except (ValueError, TypeError):
            # not an integer flag (probably the extraInfo column name), ignore
            continue

        # and set the column: "c_Name" -> "{column}_Name"
        name = column + flag[1:]
        if value == 0:
            # the zero flag has no bits to test, it means "no error bit set at all"
            dataframe[name] = mcErrors == 0
        else:
            # require every bit of the flag to be set
            dataframe[name] = (mcErrors & value) == value
133 
134 
135 # This is just for testing, no need for doxygen to weirdly document it
136 # @cond
if __name__ == "__main__":
    import modularAnalysis

    # Small self-test: generate events, fill a particle list from the MC
    # particles and dump a few kinematic variables to "test.hdf5".
    list_name = "pi-:gen"
    main_path = basf2.create_path()
    main_path.add_module("EventInfoSetter", evtNumList=100)
    main_path.add_module("EvtGenInput")
    modularAnalysis.fillParticleListsFromMC([(list_name, "")], path=main_path)
    hdf5_writer = VariablesToHDF5(list_name, ["M", "E", "px", "py", "pz"], "test.hdf5")
    main_path.add_module(hdf5_writer)
    # Run the path and show the module statistics afterwards
    basf2.process(main_path)
    print(basf2.statistics)
149 # @endcond
b2pandas_utils.VariablesToHDF5.event
def event(self)
Definition: b2pandas_utils.py:74
b2pandas_utils.VariablesToHDF5._plist
_plist
Pointer to the particle list.
Definition: b2pandas_utils.py:50
b2pandas_utils.VariablesToHDF5
Definition: b2pandas_utils.py:14
Belle2::MCMatching
Functions to perform Monte Carlo matching for reconstructed Particles.
Definition: MCMatching.h:40
b2pandas_utils.VariablesToHDF5._evtmeta
_evtmeta
Event metadata.
Definition: b2pandas_utils.py:47
b2pandas_utils.VariablesToHDF5._filename
_filename
Output filename.
Definition: b2pandas_utils.py:32
Belle2::PyStoreObj
a (simplified) python wrapper for StoreObjPtr.
Definition: PyStoreObj.h:69
b2pandas_utils.VariablesToHDF5.__init__
def __init__(self, listname, variables, filename)
Definition: b2pandas_utils.py:22
basf2.process
def process(path, max_event=0)
Definition: __init__.py:25
b2pandas_utils.VariablesToHDF5._listname
_listname
Particle list name.
Definition: b2pandas_utils.py:34
b2pandas_utils.VariablesToHDF5.terminate
def terminate(self)
Definition: b2pandas_utils.py:93
b2pandas_utils.VariablesToHDF5.initialize
def initialize(self)
Definition: b2pandas_utils.py:38
b2pandas_utils.VariablesToHDF5._hdf5file
_hdf5file
The hdf5 file.
Definition: b2pandas_utils.py:54
b2pandas_utils.VariablesToHDF5._table
_table
The pytable.
Definition: b2pandas_utils.py:72
Belle2::getRun
static ExpRun getRun(map< ExpRun, pair< double, double >> runs, double t)
Get exp number + run number from time.
Definition: Splitter.cc:262
b2pandas_utils.VariablesToHDF5._dtype
_dtype
The data type.
Definition: b2pandas_utils.py:65
b2pandas_utils.VariablesToHDF5._var_objects
_var_objects
variable objects for each variable
Definition: b2pandas_utils.py:44
modularAnalysis.fillParticleListsFromMC
def fillParticleListsFromMC(decayStringsWithCuts, addDaughters=False, skipNonPrimaryDaughters=False, writeOut=False, path=None)
Definition: modularAnalysis.py:939
b2pandas_utils.VariablesToHDF5._variables
_variables
List of variables.
Definition: b2pandas_utils.py:36