# Copyright 2019-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# HPCTools Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause
import os
import sys
import reframe as rfm
import reframe.utility.sanity as sn
from reframe.core.backends import getlauncher
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             '../common')))
import sphexa.sanity as sphs  # noqa: E402
# NOTE: Jenkins runs are restricted to 1 compute node
mpi_tasks = [24, 96] # [24, 48, 96, 192]
cubeside_dict = {1: 30, 12: 78, 24: 100, 48: 125, 96: 157, 192: 198}
steps_dict = {1: 1, 12: 1, 24: 1, 48: 1, 96: 1, 192: 1} # use same step
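# The cube side appears to be chosen so that the number of particles per rank
# stays roughly constant (weak scaling), e.g. 24 ranks -> 100^3 ~= 1.0e6
# particles (~42k per rank) and 96 ranks -> 157^3 ~= 3.9e6 (~40k per rank).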
# {{{ class SphExa_Container_Base_Check
class SphExa_Container_Base_Check(rfm.RegressionTest):
# {{{
    '''
    Two parameters can be set for the simulation:

    :arg mpi_task: number of MPI tasks; the size of the cube in the 3D
         square patch test is set with a dictionary depending on mpi_task,
         but cubeside could also be added to the list of parameters,
    :arg step: number of simulation steps.

    Dependencies are:

    - compute: inputs (mpi_task, step) ---srun---> *job.out
    - postprocess logs: inputs (*job.out) ---x---> termgraph.in
    - plot data: inputs (termgraph.in) ---termgraph.py---> termgraph.rpt
    '''
# }}}
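    # The three stages roughly map to the concrete tests defined below:
    #   compute     -> MPI_Compute_Singularity_Test / MPI_Compute_Sarus_Test
    #   postprocess -> MPI_Collect_Logs_Test
    #   plot        -> MPI_Plot_Test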
def __init__(self, mpi_task, step, container_d):
# {{{ pe
self.descr = 'Tool validation'
self.valid_prog_environs = ['builtin', 'PrgEnv-gnu', 'PrgEnv-intel',
'PrgEnv-pgi', 'PrgEnv-cray']
# self.sourcesdir = None
# self.valid_systems = ['daint:gpu', 'dom:gpu']
self.valid_systems = ['*']
self.maintainers = ['JG']
self.tags = {'sph', 'hpctools', 'cpu', 'container'}
# }}}
# {{{ compile
self.testname = 'sqpatch'
self.modules = [container_d['modulefiles']]
self.build_system = 'SingleSource'
self.sourcepath = f'{self.testname}.cpp'
self.executable = f'./{self.testname}.exe'
self.native_executable = self.executable
        # unload xalt to avoid the _buffer_decode error and unload the
        # container module so that the native app can be built:
        self.prebuild_cmds = [
            'module rm xalt', f"module rm {container_d['modulefiles']}",
            'module load cray-mpich'
        ]
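        # presumably the build produces a binary named after self.executable
        # (which is reset to the container runtime further below), hence the
        # rename to the native executable name: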
self.postbuild_cmds = [
f"mv {container_d['runtime']} {self.native_executable}"]
self.prgenv_flags = {
'PrgEnv-gnu': ['-I.', '-I./include', '-std=c++14', '-g', '-O2',
'-DUSE_MPI', '-DNDEBUG'],
'PrgEnv-intel': ['-I.', '-I./include', '-std=c++14', '-g', '-O2',
'-DUSE_MPI', '-DNDEBUG'],
'PrgEnv-cray': ['-I.', '-I./include', '-std=c++17', '-g', '-O2',
'-DUSE_MPI', '-DNDEBUG'],
'PrgEnv-pgi': ['-I.', '-I./include', '-std=c++14', '-g', '-O2',
'-DUSE_MPI', '-DNDEBUG'],
}
# }}}
# {{{ run
ompthread = 1
self.num_tasks = mpi_task
self.cubeside = cubeside_dict[mpi_task]
self.steps = steps_dict[mpi_task]
self.num_tasks_per_node = 24
self.num_tasks_per_core = 2
self.use_multithreading = True
self.num_cpus_per_task = ompthread
self.exclusive = True
self.time_limit = '10m'
self.variables['OMP_NUM_THREADS'] = str(self.num_cpus_per_task)
# Note: do not use "container_platform_options = 'run'"
container_platform_options = container_d['options']
container_platform_projectdir = container_d['projectdir']
container_platform_repo = container_d['scratch']
container_platform_image = f"{container_d['image']}"
container_platform_variables = container_d['variables']
container_platform_executable = container_d['executable']
executable_arguments = container_d['executable_opts']
self.prerun_cmds += [
'module rm xalt',
'module list -t',
f'## rsync -av {container_platform_projectdir} '
f'{container_platform_repo}',
]
self.executable = container_d['runtime']
self.executable_opts = [
container_platform_options, container_platform_image,
'bash', '-c', f"'{container_platform_variables} "
f"{container_platform_executable} {executable_arguments}'", '2>&1']
# }}}
# {{{ sanity
# self.sanity_patterns_l = [
self.sanity_patterns = \
sn.assert_found(r'Total time for iteration\(0\)', self.stdout)
# self.sanity_patterns = sn.all(self.sanity_patterns_l)
# }}}
# {{{ performance
# {{{ internal timers
self.prerun_cmds += ['echo starttime=`date +%s`']
self.postrun_cmds += ['echo stoptime=`date +%s`']
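        # sphexa.sanity.elapsed_time_from_date later turns these start/stop
        # timestamps into the elapsed time reported for each run (see
        # MPI_Collect_Logs_Test.extract_data).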
# }}}
# {{{ perf_patterns:
# self.perf_patterns = sn.evaluate(sphs.basic_perf_patterns(self))
# }}}
# {{{ reference:
# self.reference = sn.evaluate(sphs.basic_reference_scoped_d(self))
# self.reference = sn.evaluate(sphsintel.vtune_tool_reference(self))
# }}}
# }}}
# {{{ hooks
@rfm.run_before('compile')
def set_compiler_flags(self):
self.build_system.cxxflags = \
self.prgenv_flags[self.current_environ.name]
# }}}
# }}}
# {{{ class MPI_Compute_Singularity_Test:
@rfm.parameterized_test(*[[mpi_task] for mpi_task in mpi_tasks])
class MPI_Compute_Singularity_Test(SphExa_Container_Base_Check):
# {{{
'''
    This class runs the executable with Singularity
    (and natively too, for comparison).
'''
# }}}
def __init__(self, mpi_task):
# share args with TestBase class
step = steps_dict[mpi_task]
cubeside = cubeside_dict[mpi_task]
self.name = f'compute_singularity_{mpi_task}mpi_{step}steps'
nativejob_stdout = 'rfm_' + \
self.name.replace("singularity", "native") + '_job.out'
container_d = {
# for now: module use ~/easybuild/dom/haswell/modules/all
'modulefiles': 'singularity/3.5.3-dom',
'runtime': 'singularity',
'options': 'exec',
'projectdir': '/project/csstaff/piccinal/CONTAINERS/sph',
'scratch': '$SCRATCH/CONTAINERS/sph',
'image':
'$SCRATCH/CONTAINERS/sph/ub1804_cuda102_mpich314_gnu8+sph.sif',
'variables': '',
'mount': '', # '-B"/x:/x"'
'executable': '/home/bin/gnu8/mpi+omp.app',
'executable_opts': f'-n {cubeside} -s {step}'
}
self.variables['SINGULARITYENV_LD_LIBRARY_PATH'] = \
'/opt/gcc/8.3.0/snos/lib64:$SINGULARITYENV_LD_LIBRARY_PATH'
super().__init__(mpi_task, step, container_d)
# {{{ --- run the native executable too:
nativejob_launcher = 'srun'
# TODO: self.nativejob_launcher = self.current_partition.launcher
postrun_cmds = [
# native app:
# f'ldd {self.native_executable}',
'# --- native run (no container) ---',
f'echo starttime=`date +%s` > {nativejob_stdout} 2>&1',
f"{nativejob_launcher} {self.native_executable} "
f"{container_d['executable_opts']} >> {nativejob_stdout} 2>&1",
f'echo stoptime=`date +%s` >> {nativejob_stdout} 2>&1',
]
self.postrun_cmds.extend(postrun_cmds)
# }}}
self.rpt_dep = None
# }}}
# {{{ class MPI_Compute_Sarus_Test:
@rfm.parameterized_test(*[[mpi_task] for mpi_task in mpi_tasks])
class MPI_Compute_Sarus_Test(SphExa_Container_Base_Check):
# {{{
'''
    This class runs the executable with Sarus.
'''
# }}}
def __init__(self, mpi_task):
# share args with TestBase class
step = steps_dict[mpi_task]
cubeside = cubeside_dict[mpi_task]
self.name = f'compute_sarus_{mpi_task}mpi_{step}steps'
container_d = {
'modulefiles': 'sarus/1.1.0',
'runtime': 'sarus',
'options': 'run --mpi',
'projectdir': '/project/csstaff/piccinal/CONTAINERS/sph',
'scratch': '$SCRATCH/CONTAINERS/sph',
'localimage': 'ub1804_cuda102_mpich314_gnu8+sph.tar',
# 'scratch': '',
'image': 'load/library/ub1804_cuda102_mpich314_gnu8:sph',
'variables': '',
'mount': '',
'executable': '/home/bin/gnu8/mpi+omp.app',
'executable_opts': f'-n {cubeside} -s {step}'
}
self.prerun_cmds = [
# sarus rmi ...
f"{container_d['runtime']} load "
f"{container_d['scratch']}/{container_d['localimage']} "
f"{container_d['image']}",
f"{container_d['runtime']} images",
]
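        # 'sarus load' imports the tar archive from $SCRATCH into the local
        # sarus registry; the loaded image is then addressed through the
        # load/library/... name given in the 'image' entry above.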
super().__init__(mpi_task, step, container_d)
self.rpt_dep = None
# }}}
# {{{ class MPI_Collect_Logs_Test:
@rfm.simple_test
class MPI_Collect_Logs_Test(rfm.RunOnlyRegressionTest):
def __init__(self):
self.name = 'postproc_containers'
self.valid_systems = ['*']
self.valid_prog_environs = ['*']
self.sourcesdir = None
self.modules = []
self.num_tasks_per_node = 1
self.num_tasks = 1
self.executable = 'echo "collecting jobs stdout"'
self.sanity_patterns = sn.assert_not_found(r'error', self.stdout)
# --- construct list of dependencies from container1 (from testname):
self.testnames_singularity = \
[f'compute_singularity_{mpi_task}mpi_{step}steps'
for step in set(steps_dict.values()) for mpi_task in mpi_tasks]
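        # with mpi_tasks = [24, 96] and a single step value, this expands to
        # ['compute_singularity_24mpi_1steps',
        #  'compute_singularity_96mpi_1steps']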
# print('self.testnames_singularity=', self.testnames_singularity)
for test in self.testnames_singularity:
self.depends_on(test)
# --- construct list of dependencies from container2 (from testname):
self.testnames_sarus = \
[f'compute_sarus_{mpi_task}mpi_{step}steps'
for step in set(steps_dict.values()) for mpi_task in mpi_tasks]
# print('self.testnames_sarus=', self.testnames_sarus)
for test in self.testnames_sarus:
self.depends_on(test)
@rfm.require_deps
def collect_logs(self):
"""
        Copy all the stdout logs of the compute jobs for postprocessing.
"""
job_out = '*_job.out'
# --- singularity test logs:
        for testname in self.testnames_singularity:
            stagedir = self.getdep(testname).stagedir
            self.postrun_cmds.append(f'cp {stagedir}/{job_out} .')
        # --- sarus test logs:
        for testname in self.testnames_sarus:
            stagedir = self.getdep(testname).stagedir
            self.postrun_cmds.append(f'cp {stagedir}/{job_out} .')
@rfm.run_after('run')
def extract_data(self):
"""
        Return the time taken by each compute job by reading its start/stop
        timestamps (Linux date commands) and write the results to timings.rpt.
"""
ftgin = open(os.path.join(self.stagedir, 'timings.rpt'), "w")
# termgraph header:
# ftgin.write('# Elapsed_time (seconds) = f(mpi_tasks)\n')
ftgin.write('@ native,singularity,sarus\n')
        # a title for column 1 is not needed, i.e. '@ mpi,t1,t2\n' would be wrong
job_out = 'job.out'
# TODO: reuse self.testnames_native here
# for step in steps:
for step in set(steps_dict.values()):
for mpi_task in mpi_tasks:
# native (i.e no container) -> res_native
# testname = self.nativejob_stdout
testname = f'compute_native_{mpi_task}mpi_{step}steps'
self.rpt_dep = os.path.join(self.stagedir,
f'rfm_{testname}_{job_out}')
# self.rpt_dep = os.path.join(self.stagedir, nativejob_stdout)
res_native = sn.evaluate(sphs.elapsed_time_from_date(self))
# rfm_postproc_containers_job.out: No such file or directory
# --> update sphs.elapsed_time_from_date with self.rpt
# --- singularity -> res_singularity
testname = f'compute_singularity_{mpi_task}mpi_{step}steps'
self.rpt_dep = os.path.join(self.stagedir,
f'rfm_{testname}_{job_out}')
res_singularity = \
sn.evaluate(sphs.elapsed_time_from_date(self))
# --- sarus -> res_sarus
testname = f'compute_sarus_{mpi_task}mpi_{step}steps'
self.rpt_dep = os.path.join(self.stagedir,
f'rfm_{testname}_{job_out}')
res_sarus = sn.evaluate(sphs.elapsed_time_from_date(self))
# --- termgraph data:
ftgin.write(f'{mpi_task},{res_native},{res_singularity},'
f'{res_sarus}\n')
ftgin.close()
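        # timings.rpt then holds one csv line per mpi_task under the
        # '@ native,singularity,sarus' header, e.g. '24,11,12,12' (values
        # shown are illustrative only); MPI_Plot_Test plots it with termgraph.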
# }}}
# {{{ class MPI_Plot_Test:
@rfm.simple_test
class MPI_Plot_Test(rfm.RunOnlyRegressionTest):
def __init__(self):
self.name = 'performance_containers'
self.sourcesdir = 'src/scripts'
# This test will be skipped if --system does not match:
self.valid_systems = ['dom:mc', 'dom:gpu']
self.valid_prog_environs = ['*']
self.modules = ['termgraph/0.4.2-python3']
self.depends_on('postproc_containers')
self.executable = 'python3'
# TODO: avg time per step
self.sanity_patterns = \
sn.assert_not_found(r'ordinal not in range', self.stderr)
@rfm.require_deps
def plot_logs(self):
stagedir = self.getdep('postproc_containers').stagedir
rpt = os.path.join(stagedir, 'timings.rpt')
tgraph = os.path.join(self.stagedir, 'termgraph_cscs.py')
self.executable_opts = [
f'{tgraph}', f'{rpt}', '--color', '{green,yellow,red}', '--suffix',
's', '--title', '"Elapsed time (seconds)"']
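        # The resulting command is roughly:
        #   python3 <stagedir>/termgraph_cscs.py <stagedir>/timings.rpt \
        #       --color {green,yellow,red} --suffix s \
        #       --title "Elapsed time (seconds)"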
        self.postrun_cmds = ['# cat termgraph.rpt']
# }}}