Belle II Software development
VariablesToTable Class Reference
Inheritance diagram for VariablesToTable:
VariablesToHDF5

Public Member Functions

 __init__ (self, str listname, List[str] variables, str filename, Optional[str] hdf_table_name=None, int event_buffer_size=100, **writer_kwargs)
 
 initialize (self)
 
 buffer (self)
 
 event_buffer (self)
 
 clear_buffer (self)
 
 append_buffer (self)
 
 initialize_feather_writer (self)
 
 initialize_parquet_writer (self)
 
 initialize_csv_writer (self)
 
 initialize_hdf5_writer (self)
 
 fill_event_buffer (self)
 
 buffer_full (self)
 
 write_buffer (self)
 
 event (self)
 
 terminate (self)
 

Protected Attributes

 _filename = filename
 Output filename.
 
 _listname = listname
 Particle list name.
 
 _variables = list(set(variables))
 List of variables.
 
str _format = "csv"
 Output format.
 
str _table_name
 Table name in the hdf5 file.
 
 _event_buffer_size = event_buffer_size
 Event buffer size.
 
int _event_buffer_counter = 0
 Event buffer counter.
 
 _writer_kwargs = writer_kwargs
 writer kwargs
 
list _varnames
 variable names
 
 _std_varnames = variables.std_vector(*self._varnames)
 std.vector of variable names
 
 _evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
 Event metadata.
 
 _plist = ROOT.Belle2.PyStoreObj(self._listname)
 Pointer to the particle list.
 
 _dtypes = dtypes
 The data type.
 
 _buffer = np.empty(self._event_buffer_size * 10, dtype=self._dtypes)
 event variables buffer (will be automatically grown if necessary)
 
int _buffer_index = 0
 current start index in the event variables buffer
 
list _schema
 A list of (name, pa.DataType) tuples defining the pyarrow schema.
 
 _feather_writer
 a writer object to write data into a feather file
 
 _parquet_writer
 A writer object to write data into a parquet file.
 
 _csv_writer = CSVWriter(self._filename, schema=pa.schema(self._schema), **self._writer_kwargs)
 A writer object to write data into a csv file.
 
 _hdf5_writer
 The pytable file.
 
 _table
 The pytable.
 

Detailed Description

Base class to dump ntuples into a non-ROOT format of your choosing

Arguments:
    listname(str): name of the particle list
    variables(list[str]): list of variables to save for each particle
    filename(str): name of the output file to be created.
        Needs to end with ``.csv`` for csv output, ``.parquet`` or ``.pq`` for parquet output,
        ``.h5``, ``.hdf`` or ``.hdf5`` for hdf5 output and ``.feather`` or ``.arrow`` for feather output
    hdf_table_name(str): name of the table in the hdf5 file.
        If not provided, it will be the same as the listname. Defaults to None.
    event_buffer_size(int): number of events to buffer before writing to disk,
        higher values will use more memory but result in smaller files.
        For some formats, like parquet, this also sets the row group size. Defaults to 100.
    **writer_kwargs: additional keyword arguments to pass to the writer.
        For details, see the documentation of the respective writer in the apache arrow documentation.
        For HDF5, these are passed to ``tables.File.create_table``.
        Only use, if you know what you are doing!

Definition at line 38 of file b2pandas_utils.py.

Constructor & Destructor Documentation

◆ __init__()

__init__ ( self,
str listname,
List[str] variables,
str filename,
Optional[str] hdf_table_name = None,
int event_buffer_size = 100,
** writer_kwargs )
Constructor to initialize the internal state

Definition at line 59 of file b2pandas_utils.py.

67 ):
68 """Constructor to initialize the internal state"""
69 super().__init__()
70
71 self._filename = filename
72
73 self._listname = listname
74
75 self._variables = list(set(variables))
76
77 file_type = self._filename.split(".")[-1]
78
79 if file_type in ["csv"]:
80 self._format = "csv"
81 elif file_type in ["parquet", "pq"]:
82 self._format = "parquet"
83 elif file_type in ["h5", "hdf", "hdf5"]:
84 self._format = "hdf5"
85 elif file_type in ["feather", "arrow"]:
86 self._format = "feather"
87 else:
88 raise ValueError(
89 f"Unknown file type ending .{file_type}, supported types are 'csv', "
90 "'parquet', 'pq', 'h5', 'hdf', 'hdf5', 'feather' or 'arrow'"
91 )
92
93 self._table_name = (
94 hdf_table_name if hdf_table_name is not None else self._listname
95 )
96
97 self._event_buffer_size = event_buffer_size
98
99 self._event_buffer_counter = 0
100
101 self._writer_kwargs = writer_kwargs
102

Member Function Documentation

◆ append_buffer()

append_buffer ( self)
"Append" a new event to the buffer by moving the buffer index forward by particle list size

Automatically replaces the buffer by a larger one if necessary

Definition at line 180 of file b2pandas_utils.py.

180 def append_buffer(self):
181 """
182 "Append" a new event to the buffer by moving the buffer index forward by particle list size
183
184 Automatically replaces the buffer by a larger one if necessary
185 """
186 plist_size = self._plist.getListSize()
187 if (plist_size + self._buffer_index) > len(self._buffer):
188 new_buffer = np.empty(
189 # factor 1.5 larger or at least as large as necessary
190 max(int(len(self._buffer) * 1.5), self._buffer_index + plist_size),
191 dtype=self._dtypes,
192 )
193 new_buffer[:self._buffer_index] = self.buffer
194 self._buffer = new_buffer
195 self._buffer_index += plist_size
196 self._event_buffer_counter += 1
197

◆ buffer()

buffer ( self)
The buffer slice across multiple entries

Definition at line 160 of file b2pandas_utils.py.

160 def buffer(self):
161 """
162 The buffer slice across multiple entries
163 """
164 return self._buffer[:self._buffer_index]
165

◆ buffer_full()

buffer_full ( self)
check if the buffer is full

Definition at line 277 of file b2pandas_utils.py.

277 def buffer_full(self):
278 """
279 check if the buffer is full
280 """
281 return self._event_buffer_counter == self._event_buffer_size
282

◆ clear_buffer()

clear_buffer ( self)
Reset the buffer event counter and index

Definition at line 173 of file b2pandas_utils.py.

173 def clear_buffer(self):
174 """
175 Reset the buffer event counter and index
176 """
177 self._event_buffer_counter = 0
178 self._buffer_index = 0
179

◆ event()

event ( self)
Event processing function

executes the fill_buffer function and writes the data to the output file
in chunks of event_buffer_size

Definition at line 302 of file b2pandas_utils.py.

302 def event(self):
303 """
304 Event processing function
305
306 executes the fill_buffer function and writes the data to the output file
307 in chunks of event_buffer_size
308 """
309 self.append_buffer()
310 self.fill_event_buffer()
311 # \cond false positive doxygen warning
312 if self.buffer_full:
313 self.write_buffer()
314 self.clear_buffer()
315 # \endcond
316

◆ event_buffer()

event_buffer ( self)
The buffer slice for the current event

Definition at line 167 of file b2pandas_utils.py.

167 def event_buffer(self):
168 """
169 The buffer slice for the current event
170 """
171 return self._buffer[self._buffer_index - self._plist.getListSize(): self._buffer_index]
172

◆ fill_event_buffer()

fill_event_buffer ( self)
Assign values for all variables for all particles in the particle list to the current event buffer

Definition at line 256 of file b2pandas_utils.py.

256 def fill_event_buffer(self):
257 """
258 Assign values for all variables for all particles in the particle list to the current event buffer
259 """
260 buf = self.event_buffer
261
262 # add some extra columns for bookkeeping
263 buf["__experiment__"] = self._evtmeta.getExperiment()
264 buf["__run__"] = self._evtmeta.getRun()
265 buf["__event__"] = self._evtmeta.getEvent()
266 buf["__production__"] = self._evtmeta.getProduction()
267 buf["__ncandidates__"] = len(buf)
268 buf["__candidate__"] = np.arange(len(buf))
269
270 # fill variables into buffer
271 vector = variables.variables.evaluateVariables(self._std_varnames, self._plist)
272 values = np.array(vector.data()).reshape(-1, len(self._varnames))
273 for name, col in zip(self._varnames, values.T):
274 buf[name] = col
275

◆ initialize()

initialize ( self)
Setup variable lists, pointers, buffers and file writers

Definition at line 103 of file b2pandas_utils.py.

103 def initialize(self):
104 """
105 Setup variable lists, pointers, buffers and file writers
106 """
107 # Always avoid the top-level 'import ROOT'.
108 import ROOT # noqa
109
110
111 self._varnames = [
112 str(varname)
113 for varname in variables.variables.resolveCollections(
114 variables.std_vector(*self._variables)
115 )
116 ]
117
118
119 self._std_varnames = variables.std_vector(*self._varnames)
120
121
122 self._evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
123 self._evtmeta.isRequired()
124
125
126 self._plist = ROOT.Belle2.PyStoreObj(self._listname)
127 self._plist.isRequired()
128
129 dtypes = [
130 ("__experiment__", np.int32),
131 ("__run__", np.int32),
132 ("__event__", np.uint32),
133 ("__production__", np.uint32),
134 ("__candidate__", np.uint32),
135 ("__ncandidates__", np.uint32),
136 ]
137 for name in self._varnames:
138 # only float variables for now
139 dtypes.append((name, np.float64))
140
141
142 self._dtypes = dtypes
143
144
145 self._buffer = np.empty(self._event_buffer_size * 10, dtype=self._dtypes)
146
147
148 self._buffer_index = 0
149
150 if self._format == "hdf5":
151 self.initialize_hdf5_writer()
152 elif self._format == "parquet":
153 self.initialize_parquet_writer()
154 elif self._format == "csv":
155 self.initialize_csv_writer()
156 elif self._format == "feather":
157 self.initialize_feather_writer()
158
std_vector(*args)
Definition __init__.py:144

◆ initialize_csv_writer()

initialize_csv_writer ( self)
Initialize the csv writer using pyarrow

Definition at line 226 of file b2pandas_utils.py.

226 def initialize_csv_writer(self):
227 """
228 Initialize the csv writer using pyarrow
229 """
230
231 self._schema = [
232 (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
233 ]
234
235 self._csv_writer = CSVWriter(self._filename, schema=pa.schema(self._schema), **self._writer_kwargs)
236

◆ initialize_feather_writer()

initialize_feather_writer ( self)
Initialize the feather writer using pyarrow

Definition at line 198 of file b2pandas_utils.py.

198 def initialize_feather_writer(self):
199 """
200 Initialize the feather writer using pyarrow
201 """
202
203 self._schema = [
204 (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
205 ]
206
207 self._feather_writer = ipc.RecordBatchFileWriter(
208 sink=self._filename,
209 schema=pa.schema(self._schema),
210 **self._writer_kwargs,
211 )
212

◆ initialize_hdf5_writer()

initialize_hdf5_writer ( self)
Initialize the hdf5 writer using pytables

Definition at line 237 of file b2pandas_utils.py.

237 def initialize_hdf5_writer(self):
238 """
239 Initialize the hdf5 writer using pytables
240 """
241
242 self._hdf5_writer = tables.open_file(
243 self._filename, mode="w", title="Belle2 Variables to HDF5"
244 )
245 filters = tables.Filters(complevel=1, complib="blosc:lz4", fletcher32=False)
246
247 # some variable names are not just A-Za-z0-9 so pytables complains but
248 # seems to work. Ignore warning
249 with warnings.catch_warnings():
250 warnings.simplefilter("ignore")
251
252 self._table = self._hdf5_writer.create_table(
253 "/", self._table_name, obj=np.zeros(0, self._dtypes), filters=filters, **self._writer_kwargs
254 )
255

◆ initialize_parquet_writer()

initialize_parquet_writer ( self)
Initialize the parquet writer using pyarrow

Definition at line 213 of file b2pandas_utils.py.

213 def initialize_parquet_writer(self):
214 """
215 Initialize the parquet writer using pyarrow
216 """
217
218 self._schema = [
219 (name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
220 ]
221
222 self._parquet_writer = ParquetWriter(
223 self._filename, schema=pa.schema(self._schema), **self._writer_kwargs
224 )
225

◆ terminate()

terminate ( self)
save and close the output

Definition at line 317 of file b2pandas_utils.py.

317 def terminate(self):
318 """save and close the output"""
319 import ROOT # noqa
320 # \cond false positive doxygen warning
321 if len(self.buffer) > 0:
322 self.write_buffer()
323 # \endcond
324
325 if self._format == "hdf5":
326 self._table.flush()
327 self._hdf5_writer.close()
328 elif self._format == "parquet":
329 self._parquet_writer.close()
330 elif self._format == "csv":
331 self._csv_writer.close()
332 elif self._format == "feather":
333 self._feather_writer.close()
334 ROOT.Belle2.MetadataService.Instance().addNtuple(self._filename)
335
336

◆ write_buffer()

write_buffer ( self)
write the buffer to the output file

Definition at line 283 of file b2pandas_utils.py.

283 def write_buffer(self):
284 """
285 write the buffer to the output file
286 """
287 if self._format == "hdf5":
288 """Create a new row in the hdf5 file with for each particle in the list"""
289 # \cond false positive doxygen warning
290 self._table.append(self.buffer)
291 # \endcond
292 else:
293 table = {name: self.buffer[name] for name, _ in self._dtypes}
294 pa_table = pa.table(table, schema=pa.schema(self._schema))
295 if self._format == "parquet":
296 self._parquet_writer.write_table(pa_table)
297 elif self._format == "csv":
298 self._csv_writer.write(pa_table)
299 elif self._format == "feather":
300 self._feather_writer.write_table(pa_table)
301

Member Data Documentation

◆ _buffer

_buffer = np.empty(self._event_buffer_size * 10, dtype=self._dtypes)
protected

event variables buffer (will be automatically grown if necessary)

Definition at line 145 of file b2pandas_utils.py.

◆ _buffer_index

_buffer_index = 0
protected

current start index in the event variables buffer

Definition at line 148 of file b2pandas_utils.py.

◆ _csv_writer

_csv_writer = CSVWriter(self._filename, schema=pa.schema(self._schema), **self._writer_kwargs)
protected

A writer object to write data into a csv file.

Definition at line 235 of file b2pandas_utils.py.

◆ _dtypes

_dtypes = dtypes
protected

The data type.

Definition at line 142 of file b2pandas_utils.py.

◆ _event_buffer_counter

int _event_buffer_counter = 0
protected

Event buffer counter.

Definition at line 99 of file b2pandas_utils.py.

◆ _event_buffer_size

_event_buffer_size = event_buffer_size
protected

Event buffer size.

Definition at line 97 of file b2pandas_utils.py.

◆ _evtmeta

_evtmeta = ROOT.Belle2.PyStoreObj("EventMetaData")
protected

Event metadata.

Definition at line 122 of file b2pandas_utils.py.

◆ _feather_writer

_feather_writer
protected
Initial value:
= ipc.RecordBatchFileWriter(
sink=self._filename,
schema=pa.schema(self._schema),
**self._writer_kwargs,
)

a writer object to write data into a feather file

Definition at line 207 of file b2pandas_utils.py.

◆ _filename

_filename = filename
protected

Output filename.

Definition at line 71 of file b2pandas_utils.py.

◆ _format

str _format = "csv"
protected

Output format.

Definition at line 80 of file b2pandas_utils.py.

◆ _hdf5_writer

_hdf5_writer
protected
Initial value:
= tables.open_file(
self._filename, mode="w", title="Belle2 Variables to HDF5"
)

The pytable file.

Definition at line 242 of file b2pandas_utils.py.

◆ _listname

_listname = listname
protected

Particle list name.

Definition at line 73 of file b2pandas_utils.py.

◆ _parquet_writer

_parquet_writer
protected
Initial value:
= ParquetWriter(
self._filename, schema=pa.schema(self._schema), **self._writer_kwargs
)

A writer object to write data into a parquet file.

Definition at line 222 of file b2pandas_utils.py.

◆ _plist

_plist = ROOT.Belle2.PyStoreObj(self._listname)
protected

Pointer to the particle list.

Definition at line 126 of file b2pandas_utils.py.

◆ _schema

list _schema
protected
Initial value:
= [
(name, numpy_to_pyarrow_type_map[dt]) for name, dt in self._dtypes
]

A list of (name, pa.DataType) tuples defining the pyarrow schema.

Definition at line 203 of file b2pandas_utils.py.

◆ _std_varnames

_std_varnames = variables.std_vector(*self._varnames)
protected

std.vector of variable names

Definition at line 119 of file b2pandas_utils.py.

◆ _table

_table
protected
Initial value:
= self._hdf5_writer.create_table(
"/", self._table_name, obj=np.zeros(0, self._dtypes), filters=filters, **self._writer_kwargs
)

The pytable.

Definition at line 252 of file b2pandas_utils.py.

◆ _table_name

str _table_name
protected
Initial value:
= (
hdf_table_name if hdf_table_name is not None else self._listname
)

Table name in the hdf5 file.

Definition at line 93 of file b2pandas_utils.py.

◆ _variables

_variables = list(set(variables))
protected

List of variables.

Definition at line 75 of file b2pandas_utils.py.

◆ _varnames

_varnames
protected
Initial value:
= [
str(varname)
for varname in variables.variables.resolveCollections(
variables.std_vector(*self._variables)
)
]

variable names

Definition at line 111 of file b2pandas_utils.py.

◆ _writer_kwargs

_writer_kwargs = writer_kwargs
protected

writer kwargs

Definition at line 101 of file b2pandas_utils.py.


The documentation for this class was generated from the following file: