#!/bin/sh
# set -x
#
#------------------------------------------------------------
# Part of HPCToolkit (hpctoolkit.org)
#------------------------------------------------------------
#
# Copyright (c) 2002-2023, Rice University.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# * Neither the name of Rice University (RICE) nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# This software is provided by RICE and contributors "as is" and any
# express or implied warranties, including, but not limited to, the
# implied warranties of merchantability and fitness for a particular
# purpose are disclaimed. In no event shall RICE or contributors be
# liable for any direct, indirect, incidental, special, exemplary, or
# consequential damages (including, but not limited to, procurement of
# substitute goods or services; loss of use, data, or profits; or
# business interruption) however caused and on any theory of liability,
# whether in contract, strict liability, or tort (including negligence
# or otherwise) arising in any way out of the use of this software, even
# if advised of the possibility of such damage.
#
# hpcrun -- set the environ variables for profiling with HPCToolkit
# and launch the program.  See 'hpcrun -h' for a list of options.
#

#------------------------------------------------------------
# Values from configure
#------------------------------------------------------------

# If this script can't find its install directory, then set 'prefix'
# here.  This should only be necessary if you've moved both the script
# and the install directory.
#
# Set 'hash_value' to empty to disable matching the hash with the
# install directory.  Note: re-running configure creates a new hash.
# If you copy this script out of its install directory and rerun
# configure, then the hashes won't match.

# Install prefix and install hash recorded by configure.  Both are
# consulted further down when locating the install directory.
prefix='/usr/local/hpctoolkit/2024.01.1'
hash_value='e44256d4182c55cef721073272dc98'

# Version details printed by 'hpcrun -V'.
VERSION='2024.01.1-release'
git_version='unknown (not a git repo)'
spack_spec='unknown (not a spack build)'

# Optional features.  The option parser and 'hpcrun -V' compare these
# against the literal string 'yes'.
opt_have_cuda='no'
opt_have_gtpin='no'
opt_have_level0='no'
opt_have_opencl='no'
opt_have_rocm='no'
opt_enable_python='no'

# NVIDIA locations, shown by 'hpcrun -V' when CUDA support is enabled.
cuda_dir='no'
cupti_dir='no'

# AMD locations, shown by 'hpcrun -V' when ROCm support is enabled.
rocm_dir='no'
hip_dir='no'
hsa_dir='no'
roctracer_dir='no'
rocprofiler_dir='no'

# Intel locations, shown by 'hpcrun -V' when Level Zero (and GTPin)
# support is enabled.
level0_dir=''
igc_dir='no'
gtpin_dir='no'

# Relative paths are relative to HPCTOOLKIT.
hpcfnbounds_dir='libexec/hpctoolkit'
hpcrun_dir='lib/hpctoolkit'
# NOTE(review): libmonitor_dir, papi_libdir and perfmon_libdir are not
# referenced in the part of the script visible here; presumably they are
# used when the launch environment is assembled later -- confirm.
libmonitor_dir='/usr/lib64'
papi_libdir=''
perfmon_libdir='/usr/lib64'
# Copied to gtpin_libdir when a 'gpu=level0,inst*' event is requested.
gtpin_lib_path=''
# 'yes' makes the script export HPCRUN_AUDIT_FAKE_AUDITOR=1 by default.
host_arm='no'
# Program name used in the version banner and in the "unable to find
# the install directory" message.
prog_name=hpcrun

#------------------------------------------------------------
# On Cray Slingshot 11 systems, there is not yet complete
# support for demand paging on the Cassini NIC. As a result,
# when using pinned memory with Cray MPI, fork and system
# aren't allowed to return because forking a process using
# pinned memory is unsupported. As a result, libmonitor can't
# wrap system as it always has. Instead, we simply invoke
# glibc system directly instead of using libmonitor's
# reimplementation.
#------------------------------------------------------------

export MONITOR_NO_SYSTEM_OVERRIDE=1

#------------------------------------------------------------
# LD_AUDIT is crippled on Arm systems until at least
# glibc 2.35. On Arm systems, emulate LD_AUDIT support at
# present to avoid failures due to LD_AUDIT bugs.
#------------------------------------------------------------

# The expansion is quoted so that an empty or unset host_arm is compared
# as a plain string instead of making 'test' fail with a syntax error.
if test "$host_arm" = "yes"; then
    export HPCRUN_AUDIT_FAKE_AUDITOR=1
fi

#------------------------------------------------------------
# Recent versions of MPI use UCX as a communication
# substrate. Prevent UCX from catching SEGV by dropping it
# from UCX's list of error signals. If UCX catches SEGV,
# it will terminate the program. We can't allow UCX to
# do that since hpcrun may encounter a SEGV as part of its
# operation and want to drop a sample rather than
# terminate the program.
#------------------------------------------------------------
export UCX_ERROR_SIGNALS=ILL,FPE,BUS

#------------------------------------------------------------
# Find path to this script
#------------------------------------------------------------

# Directory containing this script, with symlinks resolved by realpath.
here="$(dirname "$(realpath "$0")")"
# A .libs/libhpcrun.so next to the script is taken as the sign of an
# uninstalled build tree.  Otherwise the 'else' branch, which runs until
# the relative-path fix-ups for hpcfnbounds_dir and hpcrun_dir, searches
# for the install directory.
if [ -e "$here"/.libs/libhpcrun.so ]; then
  # This is (almost certainly) the build directory, not the installed version.
  # In this case, we don't have HPCTOOLKIT, HPCRUN_FNBOUNDS_CMD needs to be set,
  # and hpcrun_dir is where the libs are built.
  hpcrun_dir="$here"/.libs
  hpcfnbounds_dir=/nonexistent
  if [ -z "$HPCRUN_FNBOUNDS_CMD" ]; then
    echo "hpcrun: Detected as in build directory, must have HPCRUN_FNBOUNDS_CMD set" >&2
    exit 2
  fi
else
  # Use the usual method based around the install prefix
  hpc_path_to_root=..
#
# Find the root of the install directory (prefix) and set HPCTOOLKIT.
#
# hpc_path_to_root was assigned just above, so this default expansion
# leaves it as '..'.  It is the path from the script's directory to the
# install root.
hpc_path_to_root="${hpc_path_to_root:-..}"
# Name of the file under <install>/lib/hpctoolkit that holds the hash.
hash_file='hash-file'

# Test for valid hash value.
# Anything shorter than four characters (including empty) is replaced by
# the sentinel 'no', which disables every hash-based check below.
case "$hash_value" in
    ????* ) ;;
    * ) hash_value=no ;;
esac

# Fall back to $0 for messages only if prog_name is empty.
if test "$prog_name" = "" ; then
    prog_name="$0"
fi

# Check whether a candidate install directory carries the hash that
# configure recorded in this script.
#
# $1 = candidate directory
# $2 = hash value from configure
#
# Reads the global 'hash_file' (file name under lib/hpctoolkit).
# Returns 0 only if $1 is non-empty, $1/lib/hpctoolkit/$hash_file is a
# regular file, and its contents equal $2; returns non-zero otherwise.
valid_hpctoolkit_hash()
{
    # An empty candidate (HPCTOOLKIT unset, or a failed 'cd' in the
    # caller) would otherwise be looked up as /lib/hpctoolkit/... at the
    # filesystem root, which is never a directory the user named.
    test -n "$1" || return 1

    hfile="$1/lib/hpctoolkit/${hash_file}"
    config_hash="$2"
    ret=1

    if test -f "$hfile" ; then
        inst_hash=$(cat "$hfile")
        test "$config_hash" = "$inst_hash"
        ret=$?
    fi
    return $ret
}

# Weaker check used when no hash matches.
#
# $1 = candidate directory
#
# Returns 0 if $1 is non-empty and contains a lib/hpctoolkit
# subdirectory; returns non-zero otherwise.
valid_hpctoolkit_dir()
{
    # Reject an empty candidate (e.g. HPCTOOLKIT unset): without this,
    # "$1/lib/hpctoolkit" would name /lib/hpctoolkit at the filesystem
    # root and an unrelated directory could be accepted.
    test -n "$1" && test -d "$1/lib/hpctoolkit"
}

# 1 -- install prefix + hash.  If the install prefix is the same, then
# the script can be anywhere.
# found_dir stays 'no' until one of the four steps accepts a directory.
found_dir=no

if test "$hash_value" != no ; then
    if valid_hpctoolkit_hash "$prefix" "$hash_value" ; then
        HPCTOOLKIT="$prefix"
        found_dir=yes
    fi
fi

# 2 -- script $0 + hash.  If the launch script is in the bin subdir
# (or libexec), then the install directory can be anywhere.  We expect
# the first two cases to cover almost all valid cases.

if test "$found_dir" = no && test "$hash_value" != no ; then
    script="$0"
    # realpath is applied only when $0 itself is a symlink.
    if test -L "$script" ; then
        script=$(realpath "$script")
    fi
    dir=$(dirname "$script")
    # Candidate root is <script dir>/<hpc_path_to_root>, made absolute.
    # If the cd fails, dir ends up empty.
    dir=$( cd "${dir}/${hpc_path_to_root}" 2>/dev/null && /bin/pwd )

    if valid_hpctoolkit_hash "$dir" "$hash_value" ; then
        HPCTOOLKIT="$dir"
        found_dir=yes
    fi
fi

# 3 -- HPCTOOLKIT + hash.  If you've moved both the script and the
# install directory, then you can set HPCTOOLKIT.
# At this point HPCTOOLKIT still holds whatever the environment supplied,
# because it is only assigned above when a step succeeds.

if test "$found_dir" = no && test "$hash_value" != no ; then
    if valid_hpctoolkit_hash "$HPCTOOLKIT" "$hash_value" ; then
        found_dir=yes
    fi
fi

# 4 -- If can't match the hash value, then try prefix or HPCTOOLKIT.
# If you've moved the script and rerun configure, then configure will
# install a new hash which the script won't have.
# This step only requires that <dir>/lib/hpctoolkit exists; prefix is
# tried before the HPCTOOLKIT value from the environment.

if test "$found_dir" = no ; then
    if valid_hpctoolkit_dir "$prefix" ; then
        HPCTOOLKIT="$prefix"
        found_dir=yes
    elif valid_hpctoolkit_dir "$HPCTOOLKIT" ; then
        found_dir=yes
    fi
fi

# Unable to find install prefix.

if test "$found_dir" = no ; then
    cat <<EOF >&2
$prog_name: Unable to find the hpctoolkit install directory.
Set prefix in this script or else HPCTOOLKIT in the environment
and try again.
EOF
    exit 1
fi

# Make the chosen install directory visible to child processes.
export HPCTOOLKIT

  # Relative paths are relative to HPCTOOLKIT.
  # Values that already start with '/' are left untouched.
  case "$hpcfnbounds_dir" in
      /* ) ;;
      * ) hpcfnbounds_dir="${HPCTOOLKIT}/${hpcfnbounds_dir}" ;;
  esac
  case "$hpcrun_dir" in
      /* ) ;;
      * ) hpcrun_dir="${HPCTOOLKIT}/${hpcrun_dir}" ;;
  esac
# Closes the build-directory test that started this section.
fi

#------------------------------------------------------------
# Usage Message
#------------------------------------------------------------

# Report a fatal command-line error and stop.
#
# $* = words of the error message
#
# Writes "hpcrun: <message>" followed by a pointer to 'hpcrun -h' on
# stderr, then exits the script with status 1.
die()
{
    {
        printf '%s\n' "hpcrun: $*"
        printf '%s\n' "use 'hpcrun -h' for a summary of options"
    } >&2
    exit 1
}

# Print the complete help text on stdout and exit with status 0.
#
# The text is emitted as three here-documents so that the '-a python'
# paragraph can be included only when opt_enable_python is 'yes'.
usage()
{
    cat <<EOF
Usage:
  hpcrun [profiling-options] <command> [command-arguments]
  hpcrun [info-options]

hpcrun profiles the execution of an arbitrary command <command> using
statistical sampling (rather than instrumentation).  It collects
per-thread call path profiles that represent the full calling context of
sample points.  Sample points may be generated from multiple simultaneous
sampling sources.  hpcrun profiles complex applications that use forks,
execs, threads, and dynamic linking/unlinking; it may be used in conjunction
with parallel process launchers such as MPICH's mpiexec and SLURM's srun.

Note that hpcrun is unable to profile static executables.

To configure hpcrun's sampling sources, specify events and periods using
the -e/--event option.  For an event 'e' and period 'p', after every 'p'
instances of 'e', a sample is generated that causes hpcrun to inspect
and record information about the monitored <command>.

When <command> terminates, a profile measurement database will be written to
the directory:
  hpctoolkit-<command>-measurements[-<jobid>]
where <jobid> is a job launcher id associated with the execution, if any.

hpcrun enables a user to abort a process and write the partial profiling
data to disk by sending a signal such as SIGINT (often bound to Ctrl-C).
This can be extremely useful on long-running or misbehaving applications.

Options: Informational
  -l, -L --list-events List available events. (N.B.: some may not be profilable)
  -V, --version        Print version information.
  -h, --help           Print help.

Options: Profiling (Defaults shown in curly brackets {})
  -e <event>[@<howoften>], --event <event>[@<howoften>]
                       <event> may be a Linux system timer (CPUTIME and REALTIME),
                       a specifier for monitoring GPU operations, software or hardware
                       counter events supported by Linux perf, hardware counter events
                       supported by the PAPI library, and more. A complete list of
                       available events can be obtained by running 'hpcrun -L'. The '-e'
                       option may be given multiple times to profile several events at
                       once. If the value for <howoften> is a number, it will be interpreted
                       as a sample period. For Linux perf events, one may specify a sampling
                       frequency for 'howoften' by writing f before a number.  For
                       instance, to sample an event 100 times per second,  specify
                       <howoften>  as '@f100'. For Linux perf events, if no value for
                       <howoften> is specified, hpcrun will monitor the event using
                       frequency-based sampling at 300 samples/second. If no event is
                       specified, hpcrun will collect CPUTIME samples 200 times per
                       second per application thread.

  -a, --attribution <method>
                       Use the given <method> to produce the calling contexts that metrics
                       are attributed to. By default, the calling context is produced by
                       unwinding the C/C++ call stack. Passing this option allows for
                       additive or alternative methods to be used, multiple options may be
                       passed to apply multiple modifications. Available methods:

                           -a flat
                                Don't unwind the call stack, instead attribute metrics to
                                the leaf procedures (i.e. produce a "flat" profile).
EOF
if test "$opt_enable_python" = yes
then cat <<EOF

                           -a python
                                Attribute to Python code and functions, instead of to
                                C functions in the Python interpreter.
                                NOTE: May cause crashes or not function if used with a
                                different Python than HPCToolkit was built with.
                                Highly experimental. Use at your own risk.
EOF
fi
cat <<EOF

  -c, --count <howoften>
                       Only  available  for  events  managed  by Linux perf. This
                       option specifies a default value for how often to sample. The
                       value for <howoften> may be a number that will be used as a
                       default event period or an f followed by a number, e.g. f100,
                       to specify a default sampling frequency in samples/second.

  -t, --trace          Generate a call path trace in addition to a call
                       path profile.

  -tt, --ttrace        Enhanced resolution tracing. Generate a call path trace that
                       includes both sample and kernel launches on the CPU in
                       addition to a call path profile. Since additional non-sample
                       elements are added, any statistical properties of the CPU
                       traces are disturbed.

  --omp-serial-only    When profiling using the OMPT interface for OpenMP,
                       suppress all samples not in serial code.

  -ds, --delay-sampling
                       Delay starting sampling until the application calls
                       hpctoolkit_sampling_start().

  -f <frac>, -fp <frac>, --process-fraction <frac>
                       Measure only a fraction <frac> of the execution's
                       processes.  For each process, enable measurement
                       (of all threads) with probability <frac>; <frac> is a
                       real number (0.10) or a fraction (1/10) between 0 and 1.

  -m, --merge-threads  Merge non-overlapped threads into one virtual thread.
                       This option is to reduce the number of generated
                       profile and trace files as each thread generates its own
                       profile and trace data. The options are:
                       0 : do not merge non-overlapped threads
                       1 : merge non-overlapped threads (default)

  -o <outpath>, --output <outpath>
                       Directory for output data.
                       {hpctoolkit-<command>-measurements[-<jobid>]}

                       Bug: Without a <jobid> or an output option, multiple
                       profiles of the same <command> will be placed in the
                       same output directory.

  -olr <list>, --only-local-ranks <list>
                       Measure only specified MPI local ranks. <list> is a
                       comma-separated series of local ranks. A local rank
                       is the local index of a rank on a node.
                       {all local ranks measured}

  -r, --retain-recursion
                       Normally, hpcrun will collapse (simple) recursive call chains
                       to save space and analysis time. This option disables that
                       behavior: all elements of a recursive call chain will be recorded.
                       NOTE: If the user employs the RETCNT sample source, then this
                             option is enabled: RETCNT implies *all* elements of
                             call chains, including recursive elements, are recorded.

  -nu, --no-unwind     Disable unwinding and collect a flat profile of leaf
                       procedures only instead of full calling contexts.
                       Equivalent to -a flat.

  --rocprofiler-path   Path to the ROCProfiler installation. Usually, this is /opt/rocm
                       or a versioned variant e.g. /opt/rocm-5.4.3. This should match the
                       ROCm installation your application is running with.
                       Defaults to the ROCM_PATH environment variable.

Options to consider only when hpcrun causes an application to fail:

  --disable-auditor    By default, hpcrun uses LD_AUDIT to track dynamic library
                       operations (see NOTES). This option instructs hpcrun
                       to track dynamic library operations by intercepting
                       dlopen and dlclose instead of using LD_AUDIT. Note
                       that this alternate approach can cause problems with
                       load modules that specify a RUNPATH. Due to a glibc bug,
                       LD_AUDIT is disabled by default on ARM.

  --enable-auditor     LD_AUDIT is by default disabled on ARM due to a glibc bug.
                       Use this option to enable LD_AUDIT on ARM.

  --disable-auditor-got-rewriting
                       By default, hpcrun uses LD_AUDIT to help track dynamic
                       library operations using an 'auditor' library. When using
                       an auditor library, glibc unnecessarily intercepts
                       every call to a function in a shared library. hpcrun
                       avoids this overhead by rewriting each shared library's
                       global offset table (GOT). Such rewriting is tricky.
                       This option can be used to disable GOT rewriting if
                       it is believed that the rewriting is causing the
                       application to fail.

  --fnbounds-eager-shutdown
                       Hpcrun uses a helper process 'hpcfnbounds' to calculate
                       function bounds in each load module at runtime.  By
                       default, hpcfnbounds is live throughout a program
                       execution.  If your application runs out of memory when
                       using hpcrun, you can use this option to have hpcrun shut
                       down hpcfnbounds after analyzing an application's initial
                       load modules and each dynamically-loaded shared library.
                       Using this option will likely increase runtime overhead.

  --namespace-single   dlmopen may load a shared library into an alternate
                       namespace.  Use of dlmopen to create multiple namespaces
                       can cause an application to crash when using glibc < 2.32
                       (see NOTES). This option asks HPCToolkit to load all
                       shared libraries within the application namespace, which
                       might help avoid a crash.  (Default when performing
                       instruction-level measurements of Intel GPU binaries.)

  --namespace-multiple dlmopen may load a shared library into an alternate
                       namespace. Force HPCToolkit to allow dlmopen to create
                       alternate namespaces.  (Default unless performing
                       instruction-level measurements of Intel GPU binaries.)

  --disable-gprof      Override and disable gprof instrumentation.  This option
                       is only useful when using hpcrun to add HPCToolkit's
                       measurement subsystem to a dynamically-linked application
                       that has been compiled with -pg. One can't measure
                       performance with HPCToolkit when gprof instrumentation
                       is active in the same execution.


NOTES:

* hpcrun uses preloaded shared libraries to initiate profiling.  For this
  reason, it cannot be used to profile setuid programs.

* By default, hpcrun uses LD_AUDIT to monitor an application's use of dynamic
  libraries. Use of LD_AUDIT is needed to properly interpret a load module's
  RUNPATH attribute. However, use of LD_AUDIT will cause dlmopen to fail
  for glibc < 2.32. Intel's GTPin library for instrumentation of Intel GPU
  binaries uses dlmopen.

EOF
    exit 0
}

#------------------------------------------------------------
# Command Line Options
#------------------------------------------------------------

# Early options that every launch script supports.

# Only the first argument is inspected here.  When it asks for the
# version, print the build configuration (version, install directory and
# each optional GPU/Python feature) on stdout and exit with status 0.
case "$1" in
    -V | -version | --version )
        printf '%s\n' \
            "${prog_name}: A member of HPCToolkit, version $VERSION" \
            "git branch:  $git_version" \
            "spack spec:  $spack_spec" \
            "install dir: $HPCTOOLKIT"

        # NVIDIA
        case "$opt_have_cuda" in
            yes )
                echo "NVIDIA CUDA support: yes"
                echo "- CUDA: $cuda_dir"
                test -z "$cupti_dir" || echo "- CUPTI: $cupti_dir"
                ;;
            * )
                echo "NVIDIA CUDA support: no"
                ;;
        esac

        # AMD: report a single ROCm directory when one is configured or
        # when all four component directories coincide; otherwise list
        # the components separately.
        if test "$opt_have_rocm" != yes ; then
            echo "AMD ROCm support: no"
        else
            echo "AMD ROCm support: yes"
            if test "$rocm_dir" != "no" ; then
                echo "- ROCm: $rocm_dir"
            elif test "$hip_dir" = "$hsa_dir" \
                 && test "$hip_dir" = "$roctracer_dir" \
                 && test "$hip_dir" = "$rocprofiler_dir"
            then
                echo "- ROCm: $hip_dir"
            else
                echo "- HIP: $hip_dir"
                echo "- HSA: $hsa_dir"
                echo "- roctracer: $roctracer_dir"
                echo "- rocprofiler: $rocprofiler_dir"
            fi
        fi

        # Intel
        case "$opt_have_level0" in
            yes )
                echo "Intel Level Zero support: yes"
                echo "- Level Zero: $level0_dir"
                case "$opt_have_gtpin" in
                    yes )
                        echo "- GTPin: $gtpin_dir"
                        echo "- IGC: $igc_dir"
                        ;;
                esac
                ;;
            * )
                echo "Intel Level Zero support: no"
                ;;
        esac

        echo "OpenCL support: $opt_have_opencl"
        echo "Python support: $opt_enable_python"
        exit 0
        ;;
esac

# Decide whether $1 can serve as the value of the preceding option.
# Succeeds (0) when $1 is non-empty and does not begin with '-';
# fails (1) when it is missing, empty, or looks like the next option.
non_empty()
{
    test -n "$1" || return 1
    case "$1" in
        -* ) return 1 ;;
    esac
    return 0
}

# Process options and export environ variables.  LD_LIBRARY_PATH and
# LD_PRELOAD should be delayed until we launch the program, but the
# others can be set now.

# State filled in by the option parser.  Everything starts empty except
# the namespace default and the ROCProfiler path, which is seeded from
# the ROCM_PATH environment variable.
namespace_default=multiple
rocm_envs_rocprofiler_path=${ROCM_PATH}

audit_list=''
preload_list=''
gtpin_libdir=''
namespace_multiple=''
namespace_single=''

# Cleared so that only values given on this command line are used.
HPCRUN_DEBUG_FLAGS=''
HPCRUN_EVENT_LIST=''
HPCRUN_CONTROL_KNOBS=''

# Main option loop.  Each iteration removes one word from the positional
# parameters into 'arg'; options that take a value read it from the new
# "$1" and shift it away as well.  The loop ends at the first empty
# word, at '--', or at the first word that is not an option, which is
# pushed back so that "$@" becomes the command to launch.
while test "x$1" != x
do
    arg="$1" ; shift
    case "$arg" in

        -md | --monitor-debug )
            export MONITOR_DEBUG=1
            ;;

        -d | --debug )
            export HPCRUN_WAIT=1
            ;;

        -dd | --dynamic-debug )
            non_empty "$1" || die "missing argument for $arg"
            # Repeated uses accumulate into one space-separated list.
            export HPCRUN_DEBUG_FLAGS="$HPCRUN_DEBUG_FLAGS $1"
            shift
            ;;

        -ad | --audit-debug )
            export HPCRUN_AUDIT_DEBUG=1
            ;;

        -ck | --control-knob )
            non_empty "$1" || die "missing argument for $arg"
            # Repeated uses accumulate into one space-separated list.
            export HPCRUN_CONTROL_KNOBS="$HPCRUN_CONTROL_KNOBS $1"
            shift
            ;;

        -nu | --no-unwind )
            export HPCRUN_NO_UNWIND=1
            ;;

        -h | -help | --help )
            # usage prints the help text and exits.
            usage
            ;;

        # --------------------------------------------------

        -a | --attribution )
            non_empty "$1" || die "missing argument for $arg"
            case "$1" in
                flat )
                    export HPCRUN_NO_UNWIND=1
                    ;;

                python )
                    if test "$opt_enable_python" != yes; then
                        die "HPCToolkit was not compiled with Python support enabled"
                    fi
                    export HPCRUN_LOGICAL_PYTHON=1
                    ;;

                * )
                    die "Invalid argument for $arg: $1"
                    ;;
            esac
            shift
            ;;

        # --------------------------------------------------

        -e | --event )
            non_empty "$1" || die "missing argument for $arg"
            # Some events need an extra library preloaded or extra
            # environment settings; events that match no pattern here
            # are simply added to the event list below.
            case "$1" in
                GA* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_ga.so"
                    ;;

                IO* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_io.so"
                    ;;

                MEMLEAK* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_memleak.so"
                    ;;

                DATACENTRIC* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_datacentric.so"
                    ;;

                PTHREAD_WAIT* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_pthread.so"
                    ;;

                CPU_GPU_IDLE* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_gpu.so"
                    ;;

                MPI* )
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_mpi.so"
                    ;;

                gpu=amd | rocprof::* )
                    if test "$opt_have_rocm" != yes ; then
                        die "HPCToolkit was not compiled with AMD ROCm support enabled"
                    fi
                    export HSA_ENABLE_INTERRUPTS=0
                    export ROCP_TOOL_LIB=libhpcrun.so
                    export ROCP_HSA_INTERCEPT=1
                    export AMD_DIRECT_DISPATCH=0
                    ;;

                gpu=nvidia* )
                    if test "$opt_have_cuda" != yes ; then
                        die "HPCToolkit was not compiled with NVIDIA CUDA support enabled"
                    fi
                    ;;

                gpu=opencl )
                    if test "$opt_have_opencl" != yes ; then
                        die "HPCToolkit was not compiled with OpenCL support enabled"
                    fi
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_opencl.so"
                    ;;

                gpu=level0 )
                    if test "$opt_have_level0" != yes ; then
                        die "HPCToolkit was not compiled with Level0 support enabled"
                    fi
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_level0.so"
                    ;;

                gpu=level0,inst* )
                    if test "$opt_have_level0" != yes ; then
                        die "HPCToolkit was not compiled with Level0 support enabled"
                    fi
                    if test "$opt_have_gtpin" != yes ; then
                        die "HPCToolkit was not compiled with GTPin support enabled"
                    fi
                    gtpin_libdir="${gtpin_lib_path}"
                    export ZET_ENABLE_PROGRAM_INSTRUMENTATION=1
                    export ZE_ENABLE_TRACING_LAYER=1
                    preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_level0.so"
                    export HPCRUN_AUDIT_FAKE_AUDITOR=1
                    # Provisional value; resolved to 'single' or
                    # 'multiple' once all options have been parsed.
                    namespace_default=single_if_auditing
                    ;;

            esac
            # Append the event to the space-separated event list.
            case "$HPCRUN_EVENT_LIST" in
                '' )
                    HPCRUN_EVENT_LIST="$1"
                    ;;

                * )
                    HPCRUN_EVENT_LIST="$HPCRUN_EVENT_LIST $1"
                    ;;
            esac
            shift
            ;;

        -L | -l | --list-events )
            if test "$opt_have_rocm" = yes; then
                export ROCP_TOOL_LIB=libhpcrun.so
            fi
            export HPCRUN_EVENT_LIST=LIST
            ;;

        -ds | --delay-sampling )
            export HPCRUN_DELAY_SAMPLING=1
            ;;

        # --------------------------------------------------

        -c | --count )
            # Like every other option that takes a value, refuse to run
            # without one; otherwise the shift below would be attempted
            # with no positional parameters left when -c is the last
            # word on the command line.
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_PERF_COUNT="$1"
            shift
            ;;

        # --------------------------------------------------

        -t | --trace )
            if test "$HPCRUN_TRACE"; then
                die "-t/--trace is incompatible with -tt/--ttrace"
            fi
            export HPCRUN_TRACE=1
            ;;

        -tt | --ttrace )
            if test "$HPCRUN_TRACE"; then
                die "-tt/--ttrace is incompatible with -t/--trace"
            fi
            export HPCRUN_TRACE=2
            ;;

        # --------------------------------------------------

        --fnbounds-eager-shutdown )
            export HPCRUN_FNBOUNDS_SHUTDOWN=1
            ;;

        # --------------------------------------------------

        -js | --jobs-symtab )
            # Deprecated
            # The following word is still consumed and ignored, but only
            # when one exists: shifting with nothing left is an error.
            if test $# -gt 0 ; then
                shift
            fi
            ;;

        # --------------------------------------------------

        -o | --output )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_OUT_PATH="$1"
            shift
            ;;

        # --------------------------------------------------

        -olr | --only-local-ranks )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_LOCAL_RANKS="$1"
            shift
            ;;

        # --------------------------------------------------

        --rocprofiler-path )
            non_empty "$1" || die "missing argument for $arg"
            rocm_envs_rocprofiler_path="$1"
            shift
            ;;

        # --------------------------------------------------

        --disable-gprof )
            preload_list="${preload_list:+${preload_list}:}${hpcrun_dir}/libhpcrun_gprof.so"
            ;;

        # --------------------------------------------------

        --omp-serial-only )
            export HPCRUN_OMP_SERIAL_ONLY=1
            ;;

        # --------------------------------------------------

        --namespace-multiple )
            non_empty "$namespace_single" && die "can't use both --namespace-single and --namespace-multiple"
            namespace_multiple=yes
            ;;

        # --------------------------------------------------

        --namespace-single )
            non_empty "$namespace_multiple" && die "can't use both --namespace-single and --namespace-multiple"
            namespace_single=yes
            ;;

        # --------------------------------------------------

        -r | --retain-recursion )
            export HPCRUN_RETAIN_RECURSION=1
            ;;

        # --------------------------------------------------

        -m | --merge-threads )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_MERGE_THREADS="$1"
            shift
            ;;

        # --------------------------------------------------

        -lm | --low-memsize )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_LOW_MEMSIZE="$1"
            shift
            ;;

        -ms | --memsize )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_MEMSIZE="$1"
            shift
            ;;

        # --------------------------------------------------

        -f | -fp | --process-fraction )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_PROCESS_FRACTION="$1"
            shift
            ;;

        -mp | --memleak-prob )
            non_empty "$1" || die "missing argument for $arg"
            export HPCRUN_MEMLEAK_PROB="$1"
            shift
            ;;

        # --------------------------------------------------

        --disable-auditor-got-rewriting )
            export HPCRUN_AUDIT_DISABLE_PLT_CALL_OPT=1
            ;;

        --disable-auditor )
            export HPCRUN_AUDIT_FAKE_AUDITOR=1
            ;;

        --enable-auditor )
            unset HPCRUN_AUDIT_FAKE_AUDITOR
            ;;

        # --------------------------------------------------

        -- )
            # Explicit end of options; what follows is the command.
            break
            ;;

        -* )
            die "unknown or invalid option: $arg"
            ;;

        * )
            # First non-option word: put it back as the command name.
            set -- "$arg" "$@"
            break
            ;;
    esac
done

# Supply the default sampling source.  With no event requested, sample
# CPUTIME with a period of 5000; when RETCNT is the only event, add the
# same CPUTIME source in front of it.  The list is exported either way.
if test -z "$HPCRUN_EVENT_LIST" ; then
    HPCRUN_EVENT_LIST='CPUTIME@5000'
elif test "$HPCRUN_EVENT_LIST" = RETCNT ; then
    HPCRUN_EVENT_LIST='CPUTIME@5000 RETCNT'
fi
export HPCRUN_EVENT_LIST

# A command to profile is mandatory.  The only exception is when the
# event list is exactly LIST: then /bin/ls is substituted as the
# command to launch.
if test -z "$1" ; then
    case "$HPCRUN_EVENT_LIST" in
        LIST ) set -- /bin/ls ;;
        * )    die "no command to profile" ;;
    esac
fi


# collapse namespaces, if necessary
#
# First resolve the 'single_if_auditing' default: it becomes 'multiple'
# when the fake auditor is selected (HPCRUN_AUDIT_FAKE_AUDITOR=1) and
# 'single' otherwise.
if test "$namespace_default" = single_if_auditing; then
  if test "$HPCRUN_AUDIT_FAKE_AUDITOR" = 1; then
    namespace_default=multiple
  else
    namespace_default=single
  fi
fi

# When namespace_multiple is empty, prepend libhpcrun_dlmopen.so to the
# preload list if the default is 'single' or namespace_single is
# non-empty.  All expansions are quoted so that an empty or unset
# namespace_default cannot turn the test into a syntax error.
if test -z "$namespace_multiple" ; then
  if test "$namespace_default" = single || test -n "$namespace_single" ; then
    preload_list="${hpcrun_dir}/libhpcrun_dlmopen.so${preload_list:+:${preload_list}}"
  fi
fi

#------------------------------------------------------------
# Environment setup
#------------------------------------------------------------

# Environment for monitoring ROCm applications.  ROCP_METRICS and
# HSA_TOOLS_LIB are exported empty and then filled in with the first
# candidate file that exists under the ROCProfiler installation.  If no
# candidate is found the variable stays blank, and checks in hpcrun
# will detect and report the error.
if test "$opt_have_rocm" = yes; then
    ROCP_METRICS=
    HSA_TOOLS_LIB=
    export ROCP_METRICS HSA_TOOLS_LIB

    rocprof_root="$rocm_envs_rocprofiler_path"
    if test -n "$rocprof_root"; then
        # Candidate locations of metrics.xml, in order of preference.
        for metrics_xml in \
            "${rocprof_root}/lib/rocprofiler/metrics.xml" \
            "${rocprof_root}/lib/metrics.xml" \
            "${rocprof_root}/rocprofiler/lib/metrics.xml"
        do
            test -f "$metrics_xml" || continue
            ROCP_METRICS="$metrics_xml"
            break
        done

        # Candidate locations of the ROCProfiler tool library, in order
        # of preference.
        for tools_lib in \
            "${rocprof_root}/lib/librocprofiler64.so.1" \
            "${rocprof_root}/lib/librocprofiler64.so.2" \
            "${rocprof_root}/rocprofiler/lib/librocprofiler64.so.1"
        do
            test -f "$tools_lib" || continue
            HSA_TOOLS_LIB="$tools_lib"
            break
        done
    fi
fi

#------------------------------------------------------------
# Pre-Launch Sanity Checks
#------------------------------------------------------------

# Find the command on PATH.  We need to run file and nm on the binary,
# so we need an actual path.
#
# A name containing a slash is used as given.  Otherwise PATH is
# searched for the first element holding a regular, executable file of
# that name.  Directories are skipped explicitly because 'test -x' is
# also true for a searchable directory, and an empty PATH element is
# treated as the current directory.  If nothing matches, 'command'
# keeps the bare name.

command="$1"
case "$command" in
    */* ) ;;
    * )
        OLDIFS="$IFS"
        IFS=:
        for dir in $PATH ; do
            # An empty PATH element stands for the current directory.
            candidate="${dir:-.}/${command}"
            if test -f "$candidate" && test -x "$candidate" ; then
                command="$candidate"
                break
            fi
        done
        IFS="$OLDIFS"
        ;;
esac

# Sanity checks before launch.

# Print the file(1) description of "$1" on stdout, following symlinks.
# The -b (brief) flag omits the leading "name:" from the output, so a
# directory or file name that happens to contain words such as "elf",
# "static" or "dynamic" cannot influence the pattern matches below.
hpcrun_describe_file()
{
    file -b -L "$1" 2>/dev/null
}

# Remember whether the 'file' utility is available at all; the checks
# below are skipped when it is not.
file_exists=no
if type file >/dev/null 2>&1 ; then
    file_exists=yes
fi

if test -x "$command" && test "$file_exists" = yes ; then
    cmd_file_out=$(hpcrun_describe_file "$command")
    #
    # For dynamic binaries, verify that the application and libhpcrun
    # have the same wordsize, both 32-bit or both 64-bit.
    #
    if echo "$cmd_file_out" | grep -E -i -e 'elf.*dynamic' >/dev/null 2>&1 ; then
        # Extract the "NN-bit" token from each description.
        appl_bit=$(expr "$cmd_file_out" : '.*ELF.*\([0-9][0-9].bit\)')
        file_out=$(hpcrun_describe_file "${hpcrun_dir}/libhpcrun.so")
        hpcrun_bit=$(expr "$file_out" : '.*ELF.*\([0-9][0-9].bit\)')
        if test "$appl_bit" != "$hpcrun_bit" ; then
            echo "hpcrun: cannot profile application: $command" 1>&2
            echo "application is $appl_bit but hpctoolkit is $hpcrun_bit" 1>&2
            exit 1
        fi
    fi
    #
    # Static binaries do not work, so fail early.
    #
    if echo "$cmd_file_out" | grep -E -i -e 'elf.*static' >/dev/null 2>&1 ; then
        echo "hpcrun: refusing attempt to launch static binary: $command" 1>&2
        echo "rebuild the application as a dynamic executable to proceed" 1>&2
        exit 1
    fi
fi

#------------------------------------------------------------
# Final Environ Settings and Exec the Binary
#------------------------------------------------------------

# Turn off the darshan I/O library: it intercepts some I/O functions
# inside signal handlers, which can deadlock.
DARSHAN_DISABLE=1
export DARSHAN_DISABLE

# If the nm listing of the binary mentions _mp_init, append
# OMP_SKIP_MSB to HPCRUN_DEBUG_FLAGS.  Failures of nm are ignored.
if nm "$command" 2>/dev/null | grep -e ' _mp_init' >/dev/null 2>&1 ; then
    HPCRUN_DEBUG_FLAGS="$HPCRUN_DEBUG_FLAGS OMP_SKIP_MSB"
    export HPCRUN_DEBUG_FLAGS
fi

# Enable core files by raising the soft limit; a failure here is
# silently ignored.
ulimit -S -c unlimited >/dev/null 2>&1

#------------------------------------------------------------
# assemble hpc_ld_library_path without any empty entries
#------------------------------------------------------------

# Start from the hpcrun directory and append each optional library
# directory that is actually set, so no empty entry is ever produced.
hpc_ld_library_path="${hpcrun_dir}"
for extra_libdir in "$papi_libdir" "$perfmon_libdir" "$gtpin_libdir" ; do
    if test -n "$extra_libdir" ; then
        hpc_ld_library_path="${hpc_ld_library_path}:${extra_libdir}"
    fi
done

#------------------------------------------------------------
# assemble preload list
#------------------------------------------------------------

# Resulting order: libhpcrun.so, then libmonitor.so, then whatever was
# already collected in preload_list.
preload_list="${hpcrun_dir}/libhpcrun.so:${libmonitor_dir}/libmonitor.so${preload_list:+:${preload_list}}"

# Set up the auditor or fake auditor, depending on what's configured.
# The fake auditor goes at the front of the preload list; the real one
# goes at the front of the audit list.
case "$HPCRUN_AUDIT_FAKE_AUDITOR" in
    1 ) preload_list="${hpcrun_dir}/libhpcrun_fake_audit.so:${preload_list}" ;;
    * ) audit_list="${hpcrun_dir}/libhpcrun_audit.so${audit_list:+:${audit_list}}" ;;
esac

# Allow fnbounds command relative to hpcfnbounds_dir.  An absolute path
# is kept as is; anything else is placed under hpcfnbounds_dir, with
# 'hpcfnbounds' as the name when none was given.
case "$HPCRUN_FNBOUNDS_CMD" in
    /* ) ;;
    * )  HPCRUN_FNBOUNDS_CMD="${hpcfnbounds_dir}/${HPCRUN_FNBOUNDS_CMD:-hpcfnbounds}" ;;
esac

# A command that is not executable is reported and replaced by the
# default one.
if ! test -x "$HPCRUN_FNBOUNDS_CMD" ; then
    echo "bad HPCRUN_FNBOUNDS_CMD command: $HPCRUN_FNBOUNDS_CMD" >&2
    HPCRUN_FNBOUNDS_CMD="${hpcfnbounds_dir}/hpcfnbounds"
fi
export HPCRUN_FNBOUNDS_CMD

HPCRUN_AUDIT_MAIN_LIB="${hpcrun_dir}/libhpcrun.so"
export HPCRUN_AUDIT_MAIN_LIB

# For each loader variable: put our entries in front of any value
# inherited from the caller, and unset the variable altogether when it
# would otherwise be empty.
test -z "${hpc_ld_library_path}" ||
  LD_LIBRARY_PATH="${hpc_ld_library_path}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
test -n "${LD_LIBRARY_PATH}" || unset LD_LIBRARY_PATH

test -z "${preload_list}" ||
  LD_PRELOAD="${preload_list}${LD_PRELOAD:+:${LD_PRELOAD}}"
test -n "${LD_PRELOAD}" || unset LD_PRELOAD

test -z "${audit_list}" ||
  LD_AUDIT="${audit_list}${LD_AUDIT:+:${LD_AUDIT}}"
test -n "${LD_AUDIT}" || unset LD_AUDIT

export LD_LIBRARY_PATH LD_PRELOAD LD_AUDIT

# Replace this shell with the command and its arguments.
exec "$@"
