from collections import defaultdict
import random
import re
import string

import awkward as ak
import numpy as np
import pandas as pd

from smartBKG import PREPROC_CONFIG, TOKENIZE_DICT, LIST_FIELDS
def check_status_bit(status_bit):
    """
    Check whether the corresponding particle is usable according to its status_bit,
    which means not virtual, not initial, not ISR or FSR photon.

    Arguments:
        status_bit (short int): 1-based index of particle showing its status.
            More details in `mdst/dataobjects/include/MCParticle.h`

    Returns:
        bool: Whether conditions are satisfied (not an unusable particle).
    """
    # Bitwise `&` (not `and`) is kept deliberately so the check also
    # vectorizes over numpy arrays of status bits.
    not_virtual = status_bit & 1 << 4 == 0
    not_initial = status_bit & 1 << 5 == 0
    not_isr_photon = status_bit & 1 << 6 == 0
    not_fsr_photon = status_bit & 1 << 7 == 0
    return not_virtual & not_initial & not_isr_photon & not_fsr_photon
def load_particle_list(mcplist, **meta_kwargs):
    """
    Collect variables from MC particle list.

    Arguments:
        mcplist: iterable of MCParticle-like objects.
        meta_kwargs: extra event level variables that will be copied through the particle list.

    Returns:
        pandas dataframe: particle list containing all the necessary information.
    """
    particle_dict = defaultdict(list)
    # production time of the root ancestor, propagated down each decay chain
    # NOTE(review): defaultdict(list) means a missing mother entry yields [] rather
    # than a time — presumably mothers always precede daughters; confirm.
    root_prodTime = defaultdict(list)
    for mcp in mcplist:
        prodTime = mcp.getProductionTime()
        # Collect indices for graph building
        arrayIndex = mcp.getArrayIndex()
        mother = mcp.getMother()
        if mother:
            motherArrayIndex = mother.getArrayIndex()
            # inherit the root production time from the mother
            root_prodTime[arrayIndex] = root_prodTime[motherArrayIndex]
            if mother.isVirtual():
                # virtual mothers are dropped: point the particle at itself
                motherArrayIndex = arrayIndex
        else:
            motherArrayIndex = arrayIndex
            root_prodTime[arrayIndex] = prodTime

        if mcp.isPrimaryParticle() and check_status_bit(mcp.getStatus()):
            four_vec = mcp.get4Vector()
            prod_vec = mcp.getProductionVertex()
            # Features at physics level
            particle_dict['arrayIndex'].append(arrayIndex)
            particle_dict['PDG'].append(mcp.getPDG())
            particle_dict['mass'].append(mcp.getMass())
            particle_dict['charge'].append(mcp.getCharge())
            particle_dict['energy'].append(mcp.getEnergy())
            # production time relative to the root of the decay chain
            particle_dict['prodTime'].append(prodTime - root_prodTime[arrayIndex])
            particle_dict['x'].append(prod_vec.x())
            particle_dict['y'].append(prod_vec.y())
            particle_dict['z'].append(prod_vec.z())
            particle_dict['px'].append(four_vec.Px())
            particle_dict['py'].append(four_vec.Py())
            particle_dict['pz'].append(four_vec.Pz())
            particle_dict['motherIndex'].append(motherArrayIndex)
    # broadcast event-level variables across all particle rows
    particle_dict.update(meta_kwargs)
    return pd.DataFrame(particle_dict)
def ak_from_df(df, decorr_df=None, missing_values=False, convert_to_32=True):
    """
    Load pandas data frame stored in parquet into an awkward array.

    Particle-level quantities will be lists of variable length, the other
    variables are assumed to be event-level. Grouping will be done based on the
    `evtNum` column.

    Arguments:
        df (pandas dataframe): particle-level information.
        decorr_df (pandas dataframe): event-level information.
        missing_values (bool): if False, assume there are no missing values in the particle
            lists and drop the masks. For the event-level quantities, replace missing
            values with nan. Avoiding option types might speedup the subsequent processing.
        convert_to_32 (bool): convert int64 to int32 and float64 to float32.

    Returns:
        Awkward array with particle quantities as Lists and event-level quantities
        as flat arrays.

    Examples:
        >>> df = pd.DataFrame({
        ...     "x": [1, 2, 3, 4],
        ...     "y": [5, 6, 7, 8],
        ...     "label": [True, True, False, False],
        ...     "evtNum": [0, 0, 1, 1],
        ... })
        >>> ak_array = ak_from_df(df)
        >>> ak_array.particles.x
        <Array [[1, 2], [3, 4]] type='2 * var * int32'>
        >>> ak_array.label
        <Array [True, False] type='2 * bool'>
    """
    evt_nums = df.evtNum.values
    # NOTE(review): `DataFrame.set_index` is not in-place and the result is
    # discarded here — this line looks like a no-op; confirm whether
    # `df = df.set_index('evtNum')` was intended.
    df.set_index('evtNum')
    if decorr_df is not None:
        # merge event-level information on the event number
        decorr_df = decorr_df.set_index('evtNum')
        df = df.join(other=decorr_df, on='evtNum', how='inner')
    df = df.reset_index().set_index(['label', 'evtNum', 'arrayIndex'])
    unique, indices, counts = np.unique(evt_nums, return_index=True, return_counts=True)
    # np.unique sorts by value - restore the original event ordering
    counts = counts[np.argsort(indices)]
    # split the flat particle records into one list per event
    ak_array = ak.unflatten(ak.Array(df.to_records()), counts)
    out = {"particles": {}}
    for field in ak_array.fields:
        if field not in LIST_FIELDS:
            # event-level quantity: identical for every particle in the event,
            # so keep only the first entry
            out[field] = ak_array[field][:, 0]
            if convert_to_32:
                out[field] = values_as_32(out[field])
        else:
            out["particles"][field] = ak_array[field]
            if convert_to_32:
                out["particles"][field] = values_as_32(out["particles"][field])
    out["particles"] = ak.zip(out["particles"])
    # depth_limit=1 keeps particle lists and event-level arrays side by side
    out = ak.zip(out, depth_limit=1)
    if not missing_values:
        out = remove_masks(out)
    return out
def values_as_32(array):
    """
    Convert int64 to int32 and float64 to float32 in the given array
    for the processing in Pytorch.

    Arguments:
        array (awkward array): any.

    Returns:
        awkward array: the converted array (unchanged for other dtypes).
    """
    ak_type = ak.type(array.layout)
    # descend through the nested list/option types to the leaf primitive type
    while not isinstance(ak_type, ak.types.PrimitiveType):
        ak_type = ak_type.type
    dtype = ak_type.dtype
    if dtype == "int64":
        return ak.values_astype(array, np.int32)
    if dtype == "float64":
        return ak.values_astype(array, np.float32)
    # any other dtype is passed through unchanged
    return array
def remove_masks(array):
    """
    Drop masks for particle-level quantities and replace missing values by nan
    for event-level quantities.

    Arguments:
        array (awkward array): any.

    Returns:
        awkward array: the processed array.

    Raises:
        TypeError: if a particle field is not backed by a ListOffsetArray.
    """
    # shallow copy so the input array is not modified in place
    out = array[:]
    for field in out.fields:
        if field == "particles":
            continue
        # event-level: replace option types by plain nan values
        out[field] = ak.fill_none(out[field], np.nan)
    for field in out.particles.fields:
        masked = out.particles[field].layout
        if not isinstance(masked, ak.layout.ListOffsetArray64):
            raise TypeError(
                "Wrong type - this method only works with ListOffsetArray for the particle fields"
            )
        # rebuild the list array from offsets and content, dropping the mask
        out["particles", field] = ak.Array(
            ak.layout.ListOffsetArray64(masked.offsets, masked.content)
        )
    return out
def mapped_mother_index_flat(array_indices_flat, mother_indices_flat, total, sizes, dict_size):
    """
    Map mother indices for particle arrays to handle removed mothers.

    Mothers that were removed from the particle list are mapped to -1;
    retained mothers are mapped to their position within the event.

    Arguments:
        array_indices_flat (array): flat array indices of the retained particles from MC particle list.
        mother_indices_flat (array): flat array indices of the mother particles of the retained particles.
        total (int): total number of particles in all the events.
        sizes (array): numbers of particles in each event.
        dict_size (int): maximum number of different indices.

    Returns:
        array: flat mother indices after correction.
    """
    out = np.empty(total, dtype=np.int32)
    # reusable scratch lookup table: original array index -> mapped index
    idx_dict = np.empty(dict_size, dtype=np.int32)
    i = 0
    start = 0
    for size in sizes:
        stop = start + size
        array_indices = array_indices_flat[start:stop]
        mother_indices = mother_indices_flat[start:stop]
        # first mark every referenced mother as removed (-1) ...
        for original_index in mother_indices:
            idx_dict[original_index] = -1
        # ... then overwrite the entries of mothers that were retained
        for mapped_index, original_index in enumerate(array_indices):
            idx_dict[original_index] = mapped_index
        for mother_index in mother_indices:
            out[i] = idx_dict[mother_index]
            i += 1
        start = stop
    return out
def mapped_mother_index(array_indices, mother_indices):
    """
    Map mother indices for particle arrays to handle removed mothers
    for awkward arrays.

    Arguments:
        array_indices (awkward array): array indices of the retained particles from MC particle list.
        mother_indices (awkward array): array indices of the mother particles of the retained particles.

    Returns:
        awkward array: mother indices after correction.
    """
    # the lookup table must be able to hold every index that can occur
    max_dict_index = max(ak.max(array_indices), ak.max(mother_indices))
    dict_size = max_dict_index + 1
    flat = mapped_mother_index_flat(
        ak.to_numpy(ak.flatten(ak.fill_none(array_indices, -1))),
        ak.to_numpy(ak.flatten(ak.fill_none(mother_indices, -1))),
        sizes=ak.num(array_indices),
        total=ak.sum(ak.num(array_indices)),
        dict_size=dict_size,
    )
    # restore the per-event list structure
    return ak.unflatten(flat, ak.num(array_indices))
def map_np(array, mapping):
    """
    Apply a value-to-token mapping (e.g. PDG ID -> token) elementwise.

    Arguments:
        array (array): values to be mapped.
        mapping (dict): lookup table from original value to mapped value.

    Returns:
        array: array with every element replaced by its mapped value.
    """
    # map each distinct value once, then scatter back via the inverse index
    unique_values, inverse = np.unique(array, return_inverse=True)
    lookup = np.array([mapping[value] for value in unique_values])
    return lookup[inverse]
def mapped_pdg_id(pdg):
    """
    Map PDG IDs to tokens for awkward arrays.

    Arguments:
        pdg (awkward array): PDG IDs.

    Returns:
        awkward array: awkward array after PDG ID mapping.
    """
    # flatten -> map on the flat numpy view -> restore the list structure
    return ak.unflatten(
        map_np(ak.to_numpy(ak.flatten(pdg)), TOKENIZE_DICT), ak.num(pdg)
    )
def evaluate_query(array, query):
    """
    Evaluate a query on the awkward array, pd.DataFrame.evaluate - style.
    Can also pass a callable that takes an awkward array and returns an awkward array mask.

    Arguments:
        array (awkward array or dataframe): any.
        query (str or list of str or callable): queries for particle selection.

    Returns:
        awkward array: boolean mask for particle selection.
    """
    # a callable performs the selection itself
    if callable(query):
        return query(array)
    # NOTE(review): normalization of a single string into a list reconstructed
    # from the `for q in query` join below — confirm against upstream.
    if isinstance(query, str):
        query = [query]
    # expose event-level and particle-level fields to numexpr by name
    array_dict = {
        **dict(zip(array.fields, ak.unzip(array))),
        **dict(zip(array.particles.fields, ak.unzip(array.particles)))
    }
    joined_query = " & ".join(f"({q})" for q in query)
    # numexpr cannot parse backtick-quoted field names, so substitute each one
    # with a random plain identifier before evaluation
    quoted_fieldnames = set(re.findall("`[^`]*`", joined_query))
    name_mapping = {
        k: "".join(random.choices(string.ascii_lowercase, k=20))
        for k in quoted_fieldnames
    }
    quoted_fields = {
        name_mapping[field]: array_dict[field.replace("`", "")]
        for field in quoted_fieldnames
    }
    for field, rnd_name in name_mapping.items():
        joined_query = joined_query.replace(field, rnd_name)
    return ak.numexpr.evaluate(joined_query, local_dict={**array_dict, **quoted_fields})
def preprocessed(df, decorr_df=None, particle_selection=PREPROC_CONFIG['cuts']):
    """
    Preprocess the input dataframe and return an awkward array that is ready
    for graph building.

    Arguments:
        df (pandas dataframe): containing particle-level information.
        decorr_df (pandas dataframe): containing event-level information.
        particle_selection (str): queries for particle selection.
            NOTE(review): the default is read from PREPROC_CONFIG once at import
            time — fine if the config is constant; confirm.

    Returns:
        awkward array: awkward array after preprocessing.
    """
    # shallow copy so the selection below does not mutate the loaded array
    array = ak_from_df(df, decorr_df)[:]
    # keep only particles passing the selection, then tokenize PDG IDs and
    # remap mother indices to the reduced particle list
    array["particles"] = array.particles[evaluate_query(array, particle_selection)]
    array["particles", "PDG"] = mapped_pdg_id(array.particles.PDG)
    array["particles", "motherIndex"] = mapped_mother_index(
        array.particles.arrayIndex,
        array.particles.motherIndex,
    )
    return array
A (simplified) python wrapper for StoreArray.