Source code for training

# Author: Arrykrishna Mootoovaloo
# Collaborators: Prof. Alan Heavens, Prof. Andrew Jaffe, Dr. Florent Leclercq
# Email : arrykrish@gmail.com/a.mootoovaloo17@imperial.ac.uk
# Affiliation : Imperial Centre for Inference and Cosmology
# Status : Under Development

'''
Routine to train all GPs in parallel to emulate the 3D Power Spectrum
'''

import numpy as np
import multiprocessing as mp

# our scripts
import utils.common as uc
import utils.helpers as hp
import ml.optimisation as op
import settings as st

np.set_printoptions(precision=4, suppress=True)

logger = uc.get_logger('training', 'training', 'logs')


[docs]def train(cosmologies: np.ndarray, target: np.ndarray, folder_name: str, fname: str, kwargs: dict) -> None:
    '''
    Function to train GPs

    :param: cosmologies (np.ndarray) : array of size N_train x N_dim for the inputs to the GP

    :param: target (np.ndarray) : an array for the targets (function)

    :param: folder_name (str) : name of the folder where the outputs are stored

    :param: fname (str) : name of the GP output

    :param: kwargs (dict) : a dictionary with the settings for the GPs, for example, lambda_cap = 1000
    '''

    # Sometimes the GP might break due to numerical instrability
    # hence, try except

    try:
        # train the GP
        gp_model = op.maximise(cosmologies, target, **kwargs)

        # store the GP in the specific folder
        hp.store_pkl_file(gp_model, folder_name, fname)

        # add some important information to the log file
        info1 = folder_name + '/' + fname
        evidence = np.around(gp_model.min_chi_sqr, 4)

        logger.info(info1)
        logger.info(evidence)

    except BaseException:
        pass


[docs]def worker(args: list) -> None:
    '''
    The argument here is simply the name of the output vector

    :param: args (list) : list containing the arguments to be fed for training GPs
    '''
    train(*args)


[docs]def parallel_training(arguments: list) -> None:
    '''
    Call the parallel processing routine here

    :param: arguments (list) - list of arguments (inputs) to train the GPs

    :return: None
    '''

    # count number of CPU
    ncpu = mp.cpu_count()

    # run procedure in parallel
    pool = mp.Pool(processes=ncpu)
    pool.map(worker, arguments)
    pool.close()


[docs]def main(directory: str = 'semigps') -> None:
    '''
    Main function to train all the Gaussian Process models.

    :param: directory (str) - directory where the GPs are stored.

    :return: None
    '''

    # if we are using the 3 components and excluding neutrino
    if st.components and not st.neutrino:

        # the inputs are fixed
        cosmologies = hp.load_arrays('trainingset/components', 'cosmologies')

        # load the growth factor
        growth_factor = hp.load_arrays('trainingset/components', 'growth_factor')

        # load the q functions
        q_function = hp.load_arrays('trainingset/components', 'q_function')

        # load the linear matter power spectrum
        pk_linear = hp.load_arrays('trainingset/components', 'pk_linear')

        # number of GPs for each component
        n_gf = growth_factor.shape[1]
        n_qf = q_function.shape[1]
        n_pl = pk_linear.shape[1]

        # folders where we want to store the GPs
        folder_gf = directory + '/pknl_components' + st.d_one_plus + '/gf'
        folder_qf = directory + '/pknl_components' + st.d_one_plus + '/qf'
        folder_pl = directory + '/pknl_components' + st.d_one_plus + '/pl'

        # generate arguments to be fed to parallel routine
        # arguments for the growth factor
        arg_gf = [[cosmologies, growth_factor[:, i], folder_gf, 'gp_' + str(i), st.gf_args] for i in range(n_gf)]

        # arguments for the q(k,z) function
        if st.emu_one_plus_q:
            arg_qf = [[cosmologies, 1.0 + q_function[:, i], folder_qf, 'gp_' + str(i), st.qf_args] for i in range(n_qf)]
        else:
            arg_qf = [[cosmologies, q_function[:, i], folder_qf, 'gp_' + str(i), st.qf_args] for i in range(n_qf)]

        # arguments for linear matter power spectrum
        arg_pl = [[cosmologies, pk_linear[:, i], folder_pl, 'gp_' + str(i), st.pl_args] for i in range(n_pl)]

        # idea: should we emulate 1 + q(k,z) rather than q(k,z) to prevent Pk to be negative?
        # idea: should we use log-transformation for A(z) and q(k,z) or 1 + q(k,z)?
        # solution: option for the user to try these possibilities in any case;
        # perform training in parallel
        parallel_training(arg_gf)
        parallel_training(arg_qf)
        parallel_training(arg_pl)

    # if we are using the 3 components and excluding neutrino
    if st.components and st.neutrino:

        # the inputs are fixed
        cosmologies = hp.load_arrays('trainingset/components', 'cosmologies_neutrino')

        # load the growth factor
        growth_factor = hp.load_arrays('trainingset/components_neutrino', 'growth_factor')

        # load the q functions
        q_function = hp.load_arrays('trainingset/components_neutrino', 'q_function')

        # load the linear matter power spectrum
        pk_linear = hp.load_arrays('trainingset/components_neutrino', 'pk_linear')

        # number of GPs for each component
        n_gf = growth_factor.shape[1]
        n_qf = q_function.shape[1]
        n_pl = pk_linear.shape[1]

        # folders where we want to store the GPs
        folder_gf = directory + '/pknl_neutrino_components' + st.d_one_plus + '/gf'
        folder_qf = directory + '/pknl_neutrino_components' + st.d_one_plus + '/qf'
        folder_pl = directory + '/pknl_neutrino_components' + st.d_one_plus + '/pl'

        # generate arguments to be fed to parallel routine
        # arguments for the growth factor
        arg_gf = [[cosmologies, growth_factor[:, i], folder_gf, 'gp_' + str(i), st.gf_args] for i in range(n_gf)]

        # arguments for the q(k,z) function
        if st.emu_one_plus_q:
            arg_qf = [[cosmologies, 1.0 + q_function[:, i], folder_qf, 'gp_' + str(i), st.qf_args] for i in range(n_qf)]
        else:
            arg_qf = [[cosmologies, q_function[:, i], folder_qf, 'gp_' + str(i), st.qf_args] for i in range(n_qf)]

        # arguments for linear matter power spectrum
        arg_pl = [[cosmologies, pk_linear[:, i], folder_pl, 'gp_' + str(i), st.pl_args] for i in range(n_pl)]

        # perform training in parallel
        parallel_training(arg_gf)
        parallel_training(arg_qf)
        parallel_training(arg_pl)

    if not st.components and not st.neutrino:

        # the inputs are fixed
        cosmologies = hp.load_arrays('trainingset/pk_nonlinear', 'cosmologies')

        # the non linear matter power spectrum
        pk_nl = hp.load_arrays('trainingset/pk_nonlinear', 'pk_nl')

        # number of GPs
        npk = pk_nl.shape[1]

        # folder where we want to store the GPs
        folder_pknl = directory + '/pknl' + st.d_one_plus + '/'

        args = [[cosmologies, pk_nl[:, i], folder_pknl, 'gp_' + str(i), st.pknl_args] for i in range(npk)]

        # perform training in parallel
        parallel_training(args)

    if not st.components and st.neutrino:

        # the inputs are fixed
        cosmologies = hp.load_arrays('trainingset/pk_nonlinear', 'cosmologies_netrino')

        # the non linear matter power spectrum
        pk_nl = hp.load_arrays('trainingset/pk_nonlinear', 'pk_nl_neutrino')

        # number of GPs
        npk = pk_nl.shape[1]

        # folder where we want to store the GPs
        folder_pknl = directory + '/pknl_neutrino' + st.d_one_plus + '/'

        args = [[cosmologies, pk_nl[:, i], folder_pknl, 'gp_' + str(i), st.pknl_args] for i in range(npk)]

        # perform training in parallel
        parallel_training(args)


# Standard boilerplate to call the main() function to begin the program.
if __name__ == '__main__':
    main()