import os

import awkward as ak
import basf2 as b2
import pandas as pd

from ROOT import Belle2
import modularAnalysis as ma
from skim.WGs.fei import feiHadronicB0
from b2pandas_utils import VariablesToHDF5
from smartBKG.utils.preprocess import load_particle_list, preprocessed

class SaveFlag(b2.Module):
    """
    Save event numbers to a Parquet file.

    Arguments:
        out_file (str): Output file path for saving the event numbers.

    Note:
        This module should be added after the skimming process.
    """

    def __init__(self, out_file=None):
        """
        Initialize the SaveFlag module.

        :param out_file: Output file path for saving the event numbers.
        """
        # ...

    def initialize(self):
        """
        Initialize the data store and the list to save event numbers before processing events.
        """
        # ...

    def event(self):
        """
        Process each event and append its event number to the pass list.
        """
        # ...

    def terminate(self):
        """
        Finalize the module and save the pass list to a Parquet file.
        """
        # ...
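
# Illustrative usage sketch (not part of the original module): how SaveFlag
# might be attached after a skim, so that only surviving events have their
# event numbers written out. The input filename and the skim content are
# placeholder assumptions.
def _example_save_flag(input_file="mdst.root", flag_file="flag.parquet"):
    """Attach SaveFlag at the end of a (placeholder) skim path."""
    path = ma.create_path()
    ma.inputMdst(filename=input_file, path=path)
    # ... skim modules would be added here ...
    path.add_module(SaveFlag(out_file=flag_file))
    b2.process(path)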

class TrainDataSaver(b2.Module):
    """
    Save MCParticles to a pandas DataFrame.

    Arguments:
        output_file (str): Filename to save training data.
            Ending with ``parquet`` indicates fast mode, which generates the final
            Parquet file for training.
            Ending with ``h5`` indicates advanced mode, which produces a temporary
            h5 file for further preprocessing.
        flag_file (str): Filename of the flag file indicating passing events.
    """

    def __init__(self, output_file, flag_file):
        """
        Initialize the TrainDataSaver module.

        :param output_file: Filename to save training data to.
        :param flag_file: Filename of the flag file indicating passing events.
        """
        # ...
        # Remove stale output, since advanced mode appends to the h5 file.
        if os.path.exists(output_file):
            os.remove(output_file)

    def initialize(self):
        """
        Initialize the data store and the dictionary to save particle features before processing events.
        """
        # ...

    def event(self):
        """
        Process each event and append event information to the dictionary.
        """
        # ...

    def terminate(self):
        """
        Append events on disk in either of the two ways and free memory.

        In fast mode, the dataframe containing particle-level information and skim
        labels is preprocessed and saved as a Parquet file which is ready for NN
        training.

        In advanced mode, the dataframe is saved as an h5 file and waits to be
        combined with the event-level information before preprocessing.
        """
        # ...
        self.df_dict.to_hdf(self.output_file, key='mc_information', mode='a',
                            format='table', append=True)
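
# Illustrative sketch (not part of the original module): the advanced-mode
# output written above is an appendable HDF5 table and can be read back with
# pandas under the same key. The filename is a placeholder assumption.
def _example_read_advanced_output(h5_file="output.h5"):
    """Inspect the particle-level table produced in advanced mode."""
    df = pd.read_hdf(h5_file, key="mc_information")
    print(df.head())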

class data_production():
    """
    Process data for training and save it to a Parquet file. Two modes are provided:

    Fast mode: ``save_vars`` set to None; produces the dataset with only the
    information necessary for the training.

    Advanced mode: ``save_vars`` set to a dictionary of event-level variables;
    runs through the hard-coded b2 steering code in ``self.process_b2script``
    to produce the required particle lists and save the required variables.
    This can be used for event-level cuts or evaluations of the NN performance.
    A usage sketch is given after the class definition below.

    Arguments:
        in_dir (str): Input directory.
        out_dir (str): Output directory.
        job_id (int): Job ID for batch processing.
        save_vars (dict): Event-level variables to save for different particles.
            None by default, for fast mode. In the example script it has ``Y4S``
            and ``B`` keys for the corresponding particle lists.
    """

    def __init__(self, in_dir, out_dir, job_id, save_vars=None):
        """
        Initialize the data_production object.

        :param in_dir: Input directory.
        :param out_dir: Output directory.
        :param job_id: Job ID for batch processing.
        :param save_vars: Event-level variables to save for different particles.
            None by default, for fast mode. In the example script it has ``Y4S``
            and ``B`` keys for the corresponding particle lists.
        """
        dataName = '_submdst'
        # ...
        #: Input root file generated before skimming
        self.data = f'{in_dir}{dataName}{job_id}.root'
        # ...
        #: Filename of the flag file indicating passing events
        self.flag = f'{in_dir}{flagName}{job_id}.parquet'
        if save_vars is not None:
            # ...
            os.makedirs(out_dir, exist_ok=True)
            os.makedirs(self.out_temp, exist_ok=True)
            #: Intermediate files
            self.temp_file = {
                'MC': f'{self.out_temp}mc.h5',
                'Y4S': f'{self.out_temp}y4s.h5',
                'B': f'{self.out_temp}b.h5'
            }
        #: Final output Parquet file
        self.out_file = f'{out_dir}preprocessed{job_id}.parquet'
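        # For example (illustrative values): with in_dir="./input/",
        # out_dir="./output/" and job_id=3, self.data resolves to
        # "./input/_submdst3.root" and self.out_file to
        # "./output/preprocessed3.parquet".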

    def process(self):
        """
        Process the b2 steering file and the data generation.
        """
        # ...

    def process_b2script(self, num_events=2500):
        """
        Skimming process with the TrainDataSaver module.

        :param num_events: Maximum number of events to process.
        """
        path = ma.create_path()
        # ...
        ma.inputMdst(environmentType='default', filename=self.data, path=path)
        ma.buildEventShape(path=path)
        ma.buildEventKinematics(path=path)
        # ...
        path.add_module(TrainDataSaver_module)
        ma.fillParticleListFromMC('Upsilon(4S):mc', '', path=path)
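        # v2hdf5_y4s and v2hdf5_b (added below) are VariablesToHDF5 writer
        # modules from b2pandas_utils; their construction is not shown here.
        # A possible construction, assuming the
        # VariablesToHDF5(listname, variables, filename) interface and
        # placeholder variable lists:
        #
        #     v2hdf5_y4s = VariablesToHDF5("Upsilon(4S):mc", ["nTracks"],
        #                                  self.temp_file["Y4S"])
        #     v2hdf5_b = VariablesToHDF5("B0:generic",
        #                                ["Mbc", "deltaE", "extraInfo(rank_signalprob)"],
        #                                self.temp_file["B"])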
        path.add_module(v2hdf5_y4s)
        # ...
        fei_skim = feiHadronicB0(udstOutput=False, analysisGlobaltag=ma.getAnalysisGlobaltag())
        # ...
        fei_skim.postskim_path.add_module(
            "BestCandidateSelection",
            particleList="B0:generic",
            variable="extraInfo(SignalProbability)",
            outputVariable="rank_signalprob",
            # ...
        )
        # ...
        fei_skim.postskim_path.add_module(v2hdf5_b)
        # ...
        path.add_module(TrainDataSaver_module)
        b2.process(path, max_event=num_events)

    def merge_files(self):
        """
        Merge the file of particle-level information (MC) with those of event-level
        information (Y4S, B). Preprocess and save to disk as a Parquet file in the
        form of an Awkward Array.
        """
        df = pd.read_hdf(self.temp_file['MC'], key='mc_information')
        df_y4s = pd.read_hdf(self.temp_file['Y4S'], key='Upsilon(4S):mc')
        df_b = pd.read_hdf(self.temp_file['B'], key='B0:generic')
        # Drop the candidate bookkeeping columns before merging the event-level tables
        df_merged = df_y4s.merge(df_b.drop(axis=1, labels=['icand', 'ncand']), how="left")
        decorr_df = df_merged.rename({'evt': 'evtNum'}, axis=1)
        ak.to_parquet(preprocessed(df, decorr_df), self.out_file)
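        # The file written here can be read back with
        # ak.from_parquet(self.out_file) to inspect the preprocessed fields.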

    def clean_up(self):
        """
        Clean up temporary files.
        """
        # ...
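
# Illustrative usage sketch (not part of the original module): driving the
# advanced mode end to end. The directory names, the job id and the variable
# names in save_vars are placeholder assumptions.
def _example_data_production():
    """Produce, merge and clean up training data for one batch job."""
    save_vars = {
        "Y4S": ["nTracks"],       # assumed event-level variables
        "B": ["Mbc", "deltaE"],
    }
    prod = data_production(in_dir="./input/", out_dir="./output/",
                           job_id=0, save_vars=save_vars)
    prod.process_b2script(num_events=2500)  # run the skimming steering path
    prod.merge_files()                      # combine particle- and event-level info
    prod.clean_up()                         # remove intermediate h5 files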

# Member summary
#
# (Data-store access uses the simplified Python wrappers for StoreArray and
# StoreObjPtr, i.e. Belle2.PyStoreArray and Belle2.PyStoreObj.)
#
# SaveFlag:
#     __init__(self, out_file=None)
#     out_file -- Output file path for saving the event numbers.
#     pass_list (list) -- List to save event numbers of passing events.
#     eventInfo -- Event metadata, initialised from the data store.
#
# TrainDataSaver:
#     __init__(self, output_file, flag_file)
#     output_file -- Filename to save training data to.
#     fast_mode -- Whether to use fast mode or advanced mode.
#     flag_list -- Filename of the flag file indicating passing events.
#     df_dict -- Pandas dataframe to save particle features.
#     eventInfo -- Event metadata, initialised from the data store.
#     eventExtraInfo -- Event extra info, initialised from the data store.
#
# data_production:
#     __init__(self, in_dir, out_dir, job_id, save_vars=None)
#     process_b2script(self, num_events=2500)
#     data (str) -- Input root file generated before skimming.
#     flag (str) -- Filename of the flag file indicating passing events.
#     out_temp (str) -- Temporary directory to keep intermediate files for advanced mode.
#     temp_file (dict) -- Intermediate files.
#     out_file (str) -- Final output Parquet file.
#     save_vars -- Event-level variables to save for different particles.