from ROOT import Belle2
import modularAnalysis as ma
from skim.WGs.fei import feiHadronicB0
from b2pandas_utils import VariablesToHDF5
from smartBKG.utils.preprocess import load_particle_list, preprocessed
Save event numbers to a Parquet file.
out_file (str): Output file path for saving the event numbers.
This module should be added after the skimming process.
Initialize the SaveFlag module.
:param out_file: Output file path for saving the event numbers.
Initialize the data store and the list to save event numbers before processing events.
Process each event and append event numbers to the pass list.
Finalize the module and save the pass list to a Parquet file.
Save MCParticles to Pandas Dataframe.
output_file (str): Filename to save training data.
Ending with ``parquet`` indicates fast mode, which will generate the final parquet file for training.
Ending with ``h5`` indicates advanced mode, which will produce a temporary h5 file for further preprocessing.
flag_file (str): Filename of the flag file indicating passing events.
Initialize the TrainDataSaver module.
:param output_file: Filename to save training data to.
:param flag_file: Filename of the flag file indicating passing events.
if os.path.exists(output_file):
    os.remove(output_file)
Initialize the data store and the dictionary to save particle features before processing events.
Process each event and append event information to the dictionary.
Append events on disk in either of the two different ways and free memory.
In fast mode, the dataframe containing particle-level information and skim labels is preprocessed
and saved as a parquet file which is ready for NN training.
In advanced mode, the dataframe is saved as a h5 file and waits for combination with
event-level information before preprocessing.
self.df_dict.to_hdf(self.output_file, key='mc_information', mode='a', format='table', append=True)
Process data for training and save to Parquet file. Two modes are provided:
Fast mode: save_vars set to None, produce the dataset with only the necessary information for the training.
Advanced mode: save_vars set to a dictionary of event-level variables,
run through hard-coded b2 steering code in self.process_b2script to produce the required particle lists
and save the required variables; can be used for event-level cuts or evaluations of the NN performance.
in_dir (str): Input directory.
out_dir (str): Output directory.
job_id (int): Job ID for batch processing.
save_vars (dict): Event-level variables to save for different particles.
By default None for fast mode.
In the example script having Y4S and B keys for the corresponding particle list.
def __init__(self, in_dir, out_dir, job_id, save_vars=None):
Initialize the data_production object.
:param in_dir: Input directory.
:param out_dir: Output directory.
:param job_id: Job ID for batch processing.
:param save_vars: Event-level variables to save for different particles.
By default None for fast mode.
In the example script having Y4S and B keys for the corresponding particle list.
dataName = '_submdst'
self.data = f'{in_dir}{dataName}{job_id}.root'
self.flag = f'{in_dir}{flagName}{job_id}.parquet'
if save_vars is not None:
os.makedirs(out_dir, exist_ok=True)
os.makedirs(self.out_temp, exist_ok=True)
'MC': f'{self.out_temp}mc.h5',
'Y4S': f'{self.out_temp}y4s.h5',
'B': f'{self.out_temp}b.h5'
self.out_file = f'{out_dir}preprocessed{job_id}.parquet'
Process the b2 steering file and the data generation.
Skimming process with TrainDataSaver module.
:param num_events: Maximum number of events to process.
path = ma.create_path()
ma.inputMdst(environmentType='default', filename=self.data, path=path)
ma.buildEventShape(path=path)
ma.buildEventKinematics(path=path)
path.add_module(TrainDataSaver_module)
ma.fillParticleListFromMC('Upsilon(4S):mc', '', path=path)
path.add_module(v2hdf5_y4s)
fei_skim = feiHadronicB0(udstOutput=False, analysisGlobaltag=ma.getAnalysisGlobaltag())
fei_skim.postskim_path.add_module(
    "BestCandidateSelection",
    particleList="B0:generic",
    variable="extraInfo(SignalProbability)",
    outputVariable="rank_signalprob",
fei_skim.postskim_path.add_module(v2hdf5_b)
path.add_module(TrainDataSaver_module)
b2.process(path, max_event=num_events)
Merge file of particle-level information (MC) with those of event-level information (Y4S, B).
Preprocess and save to disk as Parquet file in form of Awkward Array.
df = pd.read_hdf(self.temp_file['MC'], key='mc_information')
df_y4s = pd.read_hdf(self.temp_file['Y4S'], key='Upsilon(4S):mc')
df_b = pd.read_hdf(self.temp_file['B'], key='B0:generic')
df_merged = df_y4s.merge(df_b.drop(axis=1, labels=['icand', 'ncand']), how="left")
decorr_df = df_merged.rename({'evt': 'evtNum'}, axis=1)
ak.to_parquet(preprocessed(df, decorr_df), self.out_file)
271 Clean up temporary files.
A (simplified) Python wrapper for StoreArray.
A (simplified) Python wrapper for StoreObjPtr.
out_file
Output file path for saving the event numbers.
pass_list
List to save event numbers of pass events.
eventInfo
Initialise event metadata from data store.
def __init__(self, out_file=None)
fast_mode
Whether use fast mode or advanced mode.
eventExtraInfo
Initialise event extra info from data store.
df_dict
Pandas dataframe to save particle features.
flag_list
Filename of the flag file indicating passing events.
output_file
Filename to save training data to.
def __init__(self, output_file, flag_file)
eventInfo
Initialise event metadata from data store.
out_file
Final output Parquet file.
save_vars
Variables to save for different event levels.
temp_file
Intermediate files.
data
Input root file generated before skimming.
def process_b2script(self, num_events=2500)
out_temp
Temporary directory to keep intermediate files for advanced mode.
flag
Filename of the flag file indicating passing events.
def __init__(self, in_dir, out_dir, job_id, save_vars=None)