import os

import awkward as ak
import basf2 as b2
import pandas as pd

from ROOT import Belle2
import modularAnalysis as ma
from skim.WGs.fei import feiHadronicB0
from b2pandas_utils import VariablesToHDF5
from smartBKG.utils.preprocess import load_particle_list, preprocessed

class SaveFlag(b2.Module):
    """
    Save event numbers to a Parquet file.

    Arguments:
        out_file (str): Output file path for saving the event numbers.

    Note:
        This module should be added after the skimming process.
    """

    def __init__(self, out_file=None):
        """
        Initialize the SaveFlag module.

        :param out_file: Output file path for saving the event numbers.
        """
        # ...

    def initialize(self):
        """
        Initialize the data store and the list to save event numbers before processing events.
        """
        # ...

    def event(self):
        """
        Process each event and append its event number to the pass list.
        """
        # ...

    def terminate(self):
        """
        Finalize the module and save the pass list to a Parquet file.
        """
        # ...
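
# Illustrative usage sketch (not part of the original module): how SaveFlag
# might be attached after a skim, so that only surviving events have their
# event numbers written out. The input filename and the skim content are
# placeholder assumptions.
def _example_save_flag(input_file="mdst.root", flag_file="flag.parquet"):
    """Attach SaveFlag at the end of a (placeholder) skim path."""
    path = ma.create_path()
    ma.inputMdst(filename=input_file, path=path)
    # ... skim modules would be added here ...
    path.add_module(SaveFlag(out_file=flag_file))
    b2.process(path)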

class TrainDataSaver(b2.Module):
    """
    Save MCParticles to a pandas DataFrame.

    Arguments:
        output_file (str): Filename to save training data.
            Ending with ``parquet`` indicates fast mode, which generates the final
            Parquet file for training.
            Ending with ``h5`` indicates advanced mode, which produces a temporary
            h5 file for further preprocessing.
        flag_file (str): Filename of the flag file indicating passing events.
    """

    def __init__(self, output_file, flag_file):
        """
        Initialize the TrainDataSaver module.

        :param output_file: Filename to save training data to.
        :param flag_file: Filename of the flag file indicating passing events.
        """
        # ...
        # Remove stale output, since advanced mode appends to the h5 file.
        if os.path.exists(output_file):
            os.remove(output_file)

    def initialize(self):
        """
        Initialize the data store and the dictionary to save particle features before processing events.
        """
        # ...

    def event(self):
        """
        Process each event and append event information to the dictionary.
        """
        # ...

    def terminate(self):
        """
        Append events on disk in either of the two ways and free memory.

        In fast mode, the dataframe containing particle-level information and skim
        labels is preprocessed and saved as a Parquet file which is ready for NN
        training.

        In advanced mode, the dataframe is saved as an h5 file and waits to be
        combined with the event-level information before preprocessing.
        """
        # ...
        self.df_dict.to_hdf(self.output_file, key='mc_information', mode='a',
                            format='table', append=True)
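
# Illustrative sketch (not part of the original module): the advanced-mode
# output written above is an appendable HDF5 table and can be read back with
# pandas under the same key. The filename is a placeholder assumption.
def _example_read_advanced_output(h5_file="output.h5"):
    """Inspect the particle-level table produced in advanced mode."""
    df = pd.read_hdf(h5_file, key="mc_information")
    print(df.head())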

class data_production():
    """
    Process data for training and save it to a Parquet file. Two modes are provided:

    Fast mode: ``save_vars`` set to None; produces the dataset with only the
    information necessary for the training.

    Advanced mode: ``save_vars`` set to a dictionary of event-level variables;
    runs through the hard-coded b2 steering code in ``self.process_b2script``
    to produce the required particle lists and save the required variables.
    This can be used for event-level cuts or evaluations of the NN performance.
    A usage sketch is given after the class definition below.

    Arguments:
        in_dir (str): Input directory.
        out_dir (str): Output directory.
        job_id (int): Job ID for batch processing.
        save_vars (dict): Event-level variables to save for different particles.
            None by default, for fast mode. In the example script it has ``Y4S``
            and ``B`` keys for the corresponding particle lists.
    """

    def __init__(self, in_dir, out_dir, job_id, save_vars=None):
        """
        Initialize the data_production object.

        :param in_dir: Input directory.
        :param out_dir: Output directory.
        :param job_id: Job ID for batch processing.
        :param save_vars: Event-level variables to save for different particles.
            None by default, for fast mode. In the example script it has ``Y4S``
            and ``B`` keys for the corresponding particle lists.
        """
        dataName = '_submdst'
        # ...
        #: Input root file generated before skimming
        self.data = f'{in_dir}{dataName}{job_id}.root'
        # ...
        #: Filename of the flag file indicating passing events
        self.flag = f'{in_dir}{flagName}{job_id}.parquet'
        if save_vars is not None:
            # ...
            os.makedirs(out_dir, exist_ok=True)
            os.makedirs(self.out_temp, exist_ok=True)
            #: Intermediate files
            self.temp_file = {
                'MC': f'{self.out_temp}mc.h5',
                'Y4S': f'{self.out_temp}y4s.h5',
                'B': f'{self.out_temp}b.h5'
            }
        #: Final output Parquet file
        self.out_file = f'{out_dir}preprocessed{job_id}.parquet'
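        # For example (illustrative values): with in_dir="./input/",
        # out_dir="./output/" and job_id=3, self.data resolves to
        # "./input/_submdst3.root" and self.out_file to
        # "./output/preprocessed3.parquet".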

    def process(self):
        """
        Process the b2 steering file and the data generation.
        """
        # ...

    def process_b2script(self, num_events=2500):
        """
        Skimming process with the TrainDataSaver module.

        :param num_events: Maximum number of events to process.
        """
        path = ma.create_path()
        # ...
        ma.inputMdst(environmentType='default', filename=self.data, path=path)
        ma.buildEventShape(path=path)
        ma.buildEventKinematics(path=path)
        # ...
        path.add_module(TrainDataSaver_module)
        ma.fillParticleListFromMC('Upsilon(4S):mc', '', path=path)
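        # v2hdf5_y4s and v2hdf5_b (added below) are VariablesToHDF5 writer
        # modules from b2pandas_utils; their construction is not shown here.
        # A possible construction, assuming the
        # VariablesToHDF5(listname, variables, filename) interface and
        # placeholder variable lists:
        #
        #     v2hdf5_y4s = VariablesToHDF5("Upsilon(4S):mc", ["nTracks"],
        #                                  self.temp_file["Y4S"])
        #     v2hdf5_b = VariablesToHDF5("B0:generic",
        #                                ["Mbc", "deltaE", "extraInfo(rank_signalprob)"],
        #                                self.temp_file["B"])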
        path.add_module(v2hdf5_y4s)
        # ...
        fei_skim = feiHadronicB0(udstOutput=False, analysisGlobaltag=ma.getAnalysisGlobaltag())
        # ...
        fei_skim.postskim_path.add_module(
            "BestCandidateSelection",
            particleList="B0:generic",
            variable="extraInfo(SignalProbability)",
            outputVariable="rank_signalprob",
            # ...
        )
        # ...
        fei_skim.postskim_path.add_module(v2hdf5_b)
        # ...
        path.add_module(TrainDataSaver_module)
        b2.process(path, max_event=num_events)

    def merge_files(self):
        """
        Merge the file of particle-level information (MC) with those of event-level
        information (Y4S, B). Preprocess and save to disk as a Parquet file in the
        form of an Awkward Array.
        """
        df = pd.read_hdf(self.temp_file['MC'], key='mc_information')
        df_y4s = pd.read_hdf(self.temp_file['Y4S'], key='Upsilon(4S):mc')
        df_b = pd.read_hdf(self.temp_file['B'], key='B0:generic')
        # Drop the candidate bookkeeping columns before merging the event-level tables
        df_merged = df_y4s.merge(df_b.drop(axis=1, labels=['icand', 'ncand']), how="left")
        decorr_df = df_merged.rename({'evt': 'evtNum'}, axis=1)
        ak.to_parquet(preprocessed(df, decorr_df), self.out_file)
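        # The file written here can be read back with
        # ak.from_parquet(self.out_file) to inspect the preprocessed fields.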

    def clean_up(self):
        """
        Clean up temporary files.
        """
        # ...
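
# Illustrative usage sketch (not part of the original module): driving the
# advanced mode end to end. The directory names, the job id and the variable
# names in save_vars are placeholder assumptions.
def _example_data_production():
    """Produce, merge and clean up training data for one batch job."""
    save_vars = {
        "Y4S": ["nTracks"],       # assumed event-level variables
        "B": ["Mbc", "deltaE"],
    }
    prod = data_production(in_dir="./input/", out_dir="./output/",
                           job_id=0, save_vars=save_vars)
    prod.process_b2script(num_events=2500)  # run the skimming steering path
    prod.merge_files()                      # combine particle- and event-level info
    prod.clean_up()                         # remove intermediate h5 files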

# Member summary
#
# (Data-store access uses the simplified Python wrappers for StoreArray and
# StoreObjPtr, i.e. Belle2.PyStoreArray and Belle2.PyStoreObj.)
#
# SaveFlag:
#     __init__(self, out_file=None)
#     out_file -- Output file path for saving the event numbers.
#     pass_list (list) -- List to save event numbers of passing events.
#     eventInfo -- Event metadata, initialised from the data store.
#
# TrainDataSaver:
#     __init__(self, output_file, flag_file)
#     output_file -- Filename to save training data to.
#     fast_mode -- Whether to use fast mode or advanced mode.
#     flag_list -- Filename of the flag file indicating passing events.
#     df_dict -- Pandas dataframe to save particle features.
#     eventInfo -- Event metadata, initialised from the data store.
#     eventExtraInfo -- Event extra info, initialised from the data store.
#
# data_production:
#     __init__(self, in_dir, out_dir, job_id, save_vars=None)
#     process_b2script(self, num_events=2500)
#     data (str) -- Input root file generated before skimming.
#     flag (str) -- Filename of the flag file indicating passing events.
#     out_temp (str) -- Temporary directory to keep intermediate files for advanced mode.
#     temp_file (dict) -- Intermediate files.
#     out_file (str) -- Final output Parquet file.
#     save_vars -- Event-level variables to save for different particles.