Belle II Software development
data_production Class Reference

Public Member Functions

def __init__ (self, in_dir, out_dir, job_id, save_vars=None)
 
def process (self)
 
def process_b2script (self, num_events=2500)
 
def merge_files (self)
 
def clean_up (self)
 

Public Attributes

 data
 Input root file generated before skimming.
 
 flag
 Filename of the flag file indicating passing events.
 
 out_temp
 Temporary directory to keep intermediate files for advanced mode.
 
 temp_file
 Intermediate files.
 
 out_file
 Final output Parquet file.
 
 save_vars
 Variables to save for different event levels.
 

Detailed Description

Process data for training and save to Parquet file. Two modes are provided:
Fast mode: save_vars set to None, produce the dataset with only the necessary information for the training.
Advanced mode: save_vars set to a dictionary of event-level variables,
run through hard-coded b2 steering code in self.process_b2script to produce the required particle lists
and save the required variables, can be used for event-level cuts or evaluations of the NN performance.

Arguments:
    in_dir (str): Input directory.
    out_dir (str): Output directory.
    job_id (int): Job ID for batch processing.
    save_vars (dict): Event-level variables to save for different particles.
        By default None for fast mode.
        In the example script, it has Y4S and B keys for the corresponding particle lists.

Returns:
    None

Definition at line 143 of file NN_trainer_module.py.

Constructor & Destructor Documentation

◆ __init__()

def __init__ (   self,
  in_dir,
  out_dir,
  job_id,
  save_vars = None 
)
Initialize the data_production object.

:param in_dir: Input directory.
:param out_dir: Output directory.
:param job_id: Job ID for batch processing.
:param save_vars: Event-level variables to save for different particles.
By default None for fast mode.
In the example script having Y4S and B keys for the corresponding particle list.

Definition at line 163 of file NN_trainer_module.py.

163 def __init__(self, in_dir, out_dir, job_id, save_vars=None):
164 """
165 Initialize the data_production object.
166
167 :param in_dir: Input directory.
168 :param out_dir: Output directory.
169 :param job_id: Job ID for batch processing.
170 :param save_vars: Event-level variables to save for different particles.
171 By default None for fast mode.
172 In the example script having Y4S and B keys for the corresponding particle list.
173 """
174 dataName = '_submdst'
175 flagName = '_flag'
176
177 self.data = f'{in_dir}{dataName}{job_id}.root'
178
179 self.flag = f'{in_dir}{flagName}{job_id}.parquet'
180 if save_vars is not None:
181
182 self.out_temp = f'{out_dir}_temp{job_id}/'
183 os.makedirs(out_dir, exist_ok=True)
184 os.makedirs(self.out_temp, exist_ok=True)
185
186 self.temp_file = {
187 'MC': f'{self.out_temp}mc.h5',
188 'Y4S': f'{self.out_temp}y4s.h5',
189 'B': f'{self.out_temp}b.h5'
190 }
191
192 self.out_file = f'{out_dir}preprocessed{job_id}.parquet'
193
194 self.save_vars = save_vars
195

Member Function Documentation

◆ clean_up()

def clean_up (   self)
Clean up temporary files.

Definition at line 269 of file NN_trainer_module.py.

269 def clean_up(self):
270 """
271 Clean up temporary files.
272 """
273 # uncomment if needed for batch job
274 # os.remove(self.data)
275 os.remove(self.flag)
276 if self.save_vars is not None:
277 shutil.rmtree(self.out_temp)

◆ merge_files()

def merge_files (   self)
Merge file of particle-level information (MC) with those of event-level information (Y4S, B).
Preprocess and save to disk as Parquet file in form of Awkward Array.

Definition at line 257 of file NN_trainer_module.py.

257 def merge_files(self):
258 """
259 Merge file of particle-level information (MC) with those of event-level information (Y4S, B).
260 Preprocess and save to disk as Parquet file in form of Awkward Array.
261 """
262 df = pd.read_hdf(self.temp_file['MC'], key='mc_information')
263 df_y4s = pd.read_hdf(self.temp_file['Y4S'], key='Upsilon(4S):mc')
264 df_b = pd.read_hdf(self.temp_file['B'], key='B0:generic')
265 df_merged = df_y4s.merge(df_b.drop(axis=1, labels=['icand', 'ncand']), how="left")
266 decorr_df = df_merged.rename({'evt': 'evtNum'}, axis=1)
267 ak.to_parquet(preprocessed(df, decorr_df), self.out_file)
268

◆ process()

def process (   self)
Process the b2 steering file and the data generation.

Definition at line 196 of file NN_trainer_module.py.

196 def process(self):
197 """
198 Process the b2 steering file and the data generation.
199 """
200 self.process_b2script()
201 if self.save_vars is not None:
202 self.merge_files()
203

◆ process_b2script()

def process_b2script (   self,
  num_events = 2500 
)
Skimming process with TrainDataSaver module.

:param num_events: Maximum number of events to process.

Definition at line 204 of file NN_trainer_module.py.

204 def process_b2script(self, num_events=2500):
205 """
206 Skimming process with TrainDataSaver module.
207
208 :param num_events: Maximum number of events to process.
209 """
210 path = ma.create_path()
211
212 ma.inputMdst(environmentType='default', filename=self.data, path=path)
213 ma.buildEventShape(path=path)
214 ma.buildEventKinematics(path=path)
215
216 # process with advanced mode
217 if self.save_vars is not None:
218 TrainDataSaver_module = TrainDataSaver(
219 output_file=self.temp_file['MC'],
220 flag_file=self.flag,
221 )
222 path.add_module(TrainDataSaver_module)
223 ma.fillParticleListFromMC('Upsilon(4S):mc', '', path=path)
224 v2hdf5_y4s = VariablesToHDF5(
225 'Upsilon(4S):mc',
226 self.save_vars['Y4S'],
227 filename=self.temp_file['Y4S'],
228 )
229 path.add_module(v2hdf5_y4s)
230
231 fei_skim = feiHadronicB0(udstOutput=False, analysisGlobaltag=ma.getAnalysisGlobaltag())
232 fei_skim(path=path)
233 fei_skim.postskim_path.add_module(
234 "BestCandidateSelection",
235 particleList="B0:generic",
236 variable="extraInfo(SignalProbability)",
237 outputVariable="rank_signalprob",
238 numBest=1,
239 )
240 # Key of saved table is the name of particle list
241 v2hdf5_b = VariablesToHDF5(
242 'B0:generic',
243 self.save_vars['B'],
244 filename=self.temp_file['B'],
245 )
246 fei_skim.postskim_path.add_module(v2hdf5_b)
247 # process with fast mode
248 else:
249 TrainDataSaver_module = TrainDataSaver(
250 output_file=self.out_file,
251 flag_file=self.flag,
252 )
253 path.add_module(TrainDataSaver_module)
254 b2.process(path, max_event=num_events)
255 print(b2.statistics)
256

Member Data Documentation

◆ data

data

Input root file generated before skimming.

Definition at line 177 of file NN_trainer_module.py.

◆ flag

flag

Filename of the flag file indicating passing events.

Definition at line 179 of file NN_trainer_module.py.

◆ out_file

out_file

Final output Parquet file.

Definition at line 192 of file NN_trainer_module.py.

◆ out_temp

out_temp

Temporary directory to keep intermediate files for advanced mode.

Definition at line 182 of file NN_trainer_module.py.

◆ save_vars

save_vars

Variables to save for different event levels.

Definition at line 194 of file NN_trainer_module.py.

◆ temp_file

temp_file

Intermediate files.

Definition at line 186 of file NN_trainer_module.py.


The documentation for this class was generated from the following file: