/*
 * Copyright (C) 2002-2021 the Network-Based Computing Laboratory
 * (NBCL), The Ohio State University.
 *
 * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level directory.
 */

#include "osu_util.h"

#ifdef _ENABLE_OPENACC_
#include <openacc.h>
#endif

/*
 * GLOBAL VARIABLES
 */
char const * benchmark_header = NULL;
char const * benchmark_name = NULL;
int accel_enabled = 0;
struct options_t options;

struct bad_usage_t bad_usage;

void
print_header(int rank, int full)
{
    switch(options.bench) {
        case MBW_MR :
        case PT2PT :
            if (0 == rank) {
                switch (options.accel) {
                    case CUDA:
                        printf(benchmark_header, "-CUDA");
                        break;
                    case OPENACC:
                        printf(benchmark_header, "-OPENACC");
                        break;
                    case ROCM:
                        printf(benchmark_header, "-ROCM");
                        break;
                    default:
                        printf(benchmark_header, "");
                        break;
                }

                switch (options.accel) {
                    case CUDA:
                    case OPENACC:
                    case ROCM:
                        fprintf(stdout, "# Send Buffer on %s and Receive Buffer on %s\n",
                                'M' == options.src ? ('D' == options.MMsrc ? "MANAGED (MD)" : "MANAGED (MH)") :
                                ('D' == options.src ? "DEVICE (D)" : "HOST (H)"),
                                'M' == options.dst ? ('D' == options.MMdst ? "MANAGED (MD)" : "MANAGED (MH)"):
                                ('D' == options.dst ? "DEVICE (D)" : "HOST (H)"));
                    default:
                        if (options.subtype == BW && options.bench != MBW_MR) {
                            fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)");
                        } else if (options.subtype == LAT) {
                            fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH, "Latency (us)");
                        } else if (options.subtype == LAT_MP){
                            fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH,"Latency (us)");
                        } else if (options.subtype == LAT_MT){
                            fprintf(stdout, "%-*s%*s", 10, "# Size", FIELD_WIDTH, "Latency (us)");
                        }
                        if (options.validate && !(options.subtype == BW && options.bench == MBW_MR)){
                            fprintf(stdout, "%*s\n", FIELD_WIDTH, "Validation");
                        } else{
                            fprintf(stdout, "\n");
                        }
                        fflush(stdout);
                }
            }
            break;
        case COLLECTIVE :
            if (rank == 0) {
                fprintf(stdout, HEADER, "");

                if (options.show_size) {
                    fprintf(stdout, "%-*s", 10, "# Size");
                    fprintf(stdout, "%*s", FIELD_WIDTH, "Avg Latency(us)");
                }

                else {
                    fprintf(stdout, "# Avg Latency(us)");
                }

                if (full) {
                    fprintf(stdout, "%*s", FIELD_WIDTH, "Min Latency(us)");
                    fprintf(stdout, "%*s", FIELD_WIDTH, "Max Latency(us)");
                    fprintf(stdout, "%*s\n", 12, "Iterations");
                }

                else {
                    fprintf(stdout, "\n");
                }

                fflush(stdout);
            }
            break;
        default:
            break;
    }
}

void print_data (int rank, int full, int size, double avg_time,
                 double min_time, double max_time, int iterations)
{
    if (rank == 0) {
        if (options.show_size) {
            fprintf(stdout, "%-*d", 10, size);
            fprintf(stdout, "%*.*f", FIELD_WIDTH, FLOAT_PRECISION, avg_time);
        } else {
            fprintf(stdout, "%*.*f", 17, FLOAT_PRECISION, avg_time);
        }

        if (full) {
            fprintf(stdout, "%*.*f%*.*f%*d\n",
                    FIELD_WIDTH, FLOAT_PRECISION, min_time,
                    FIELD_WIDTH, FLOAT_PRECISION, max_time,
                    12, iterations);
        } else {
            fprintf(stdout, "\n");

        }

        fflush(stdout);
    }
}


static int
set_min_message_size (long long value)
{
    if (0 >= value) {
        return -1;
    }

    options.min_message_size = value;

    return 0;
}

static int
set_max_message_size (long long value)
{
    if (0 > value) {
        return -1;
    }

    options.max_message_size = value;

    return 0;
}

static int
set_message_size (char *val_str)
{
    int retval = -1;
    int i, count = 0;
    char *val1, *val2;

    for (i=0; val_str[i]; i++) {
        if (val_str[i] == ':')
            count++;
    }

    if (!count) {
        retval = set_max_message_size(atoll(val_str));
    } else if (count == 1) {
        val1 = strtok(val_str, ":");
        val2 = strtok(NULL, ":");

        if (val1 && val2) {
            retval = set_min_message_size(atoll(val1));
            retval = set_max_message_size(atoll(val2));
        } else if (val1) {
            if (val_str[0] == ':') {
                retval = set_max_message_size(atoll(val1));
            } else {
                retval = set_min_message_size(atoll(val1));
            }
        }
    }

    return retval;
}

static int
set_receiver_threads (int value){
    if (MIN_NUM_THREADS > value || value >= MAX_NUM_THREADS) {
        return -1;
    }

    options.num_threads = value;

    return 0;

}

static int
set_sender_threads (int value){
    if (MIN_NUM_THREADS > value || value >= MAX_NUM_THREADS) {
        return -1;
    }

    options.sender_thread = value;

    return 0;

}

static int
set_threads (char *val_str)
{
    int retval = -1;
    int i, count = 0;
    char *val1, *val2;

    for (i=0; val_str[i]; i++) {
        if (val_str[i] == ':')
            count++;
    }

    if (!count) {
        retval = set_receiver_threads(atoi(val_str));
        options.sender_thread = -1;
    } else if (count == 1) {
        val1 = strtok(val_str, ":");
        val2 = strtok(NULL, ":");

        if (val1 && val2) {
            retval = set_sender_threads(atoi(val1));
            if (retval == -1){
                return retval;
            }
            retval = set_receiver_threads(atoi(val2));
            if (retval == -1){
                return retval;
            }
        }

    }

    return retval;
}

static int
set_receiver_processes (int value){
    if (MIN_NUM_PROCESSES > value || value >= MAX_NUM_PROCESSES) {
        return -1;
    }

    options.num_processes = value;

    return 0;

}

static int
set_sender_processes (int value){
    if (MIN_NUM_PROCESSES > value || value >= MAX_NUM_PROCESSES) {
        return -1;
    }

    options.sender_processes = value;

    return 0;

}

static int
set_processes (char *val_str)
{
    int retval = -1;
    int i, count = 0;
    char *val1, *val2;

    for (i=0; val_str[i]; i++) {
        if (val_str[i] == ':')
            count++;
    }

    if (!count) {
        retval = set_receiver_processes(atoi(val_str));
        options.sender_processes = -1;
    } else if (count == 1) {
        val1 = strtok(val_str, ":");
        val2 = strtok(NULL, ":");

        if (val1 && val2) {
            retval = set_sender_processes(atoi(val1));
            if (retval == -1){
                return retval;
            }
            retval = set_receiver_processes(atoi(val2));
            if (retval == -1){
                return retval;
            }
        }

    }

    return retval;
}

static int set_num_warmup (int value)
{
    if (0 > value) {
        return -1;
    }

    options.skip = value;
    options.skip_large = value;

    return 0;
}

static int set_num_warmup_validation (int value)
{
    if (0 > value) {
        return -1;
    }

    options.warmup_validation = value;

    return 0;
}

static int set_num_iterations (int value)
{
    if (1 > value) {
        return -1;
    }

    options.iterations = value;
    options.iterations_large = value;

    return 0;
}

static int set_window_size (int value)
{
    if (1 > value) {
        return -1;
    }

    options.window_size = value;

    return 0;
}

static int set_device_array_size (int value)
{
    if (value < 1 ) {
        return -1;
    }

    options.device_array_size = value;

    return 0;
}

static int set_num_probes (int value)
{
    if (value < 0 ) {
        return -1;
    }

    options.num_probes = value;

    return 0;
}

static int set_max_memlimit (long long value)
{
    options.max_mem_limit = value;

    if (value < MAX_MEM_LOWER_LIMIT) {
        options.max_mem_limit = MAX_MEM_LOWER_LIMIT;
        fprintf(stderr,"Requested memory limit too low, using [%d] instead.",
                MAX_MEM_LOWER_LIMIT);
    }

    return 0;
}

void set_header (const char * header)
{
    benchmark_header = header;
}

void set_benchmark_name (const char * name)
{
    benchmark_name = name;
}

void enable_accel_support (void)
{
    accel_enabled = ((CUDA_ENABLED || OPENACC_ENABLED || ROCM_ENABLED) &&
            !(options.subtype == LAT_MT || options.subtype == LAT_MP));
}

int process_options (int argc, char *argv[])
{
    extern char * optarg;
    extern int optind, optopt;

    char const * optstring = NULL;
    int c, ret = PO_OKAY;

    int option_index = 0;

    static struct option long_options[] = {
            {"help",                no_argument,        0,  'h'},
            {"version",             no_argument,        0,  'v'},
            {"full",                no_argument,        0,  'f'},
            {"message-size",        required_argument,  0,  'm'},
            {"window-size",         required_argument,  0,  'W'},
            {"num-test-calls",      required_argument,  0,  't'},
            {"iterations",          required_argument,  0,  'i'},
            {"warmup",              required_argument,  0,  'x'},
            {"array-size",          required_argument,  0,  'a'},
            {"sync-option",         required_argument,  0,  's'},
            {"win-options",         required_argument,  0,  'w'},
            {"mem-limit",           required_argument,  0,  'M'},
            {"accelerator",         required_argument,  0,  'd'},
            {"cuda-target",         required_argument,  0,  'r'},
            {"print-rate",          required_argument,  0,  'R'},
            {"num-pairs",           required_argument,  0,  'p'},
            {"vary-window",         required_argument,  0,  'V'},
            {"validation",          no_argument,        0,  'c'},
            {"buffer-num",          required_argument,  0,  'b'},
            {"validation-warmup",   required_argument,  0,  'u'},
    };

    enable_accel_support();

    if (options.bench == PT2PT) {
        if (accel_enabled) {
            if (options.subtype == BW) {
                optstring = "+:x:i:t:m:d:W:hvbcu:";
            } else {
                optstring = "+:x:i:m:d:hvcu:";
            }
        } else{
            if (options.subtype == LAT_MT) {
                optstring = "+:hvm:x:i:t:cu:";
            } else if (options.subtype == LAT_MP) {
                optstring = "+:hvm:x:i:t:cu:";
            } else if (options.subtype == BW) {
                optstring = "+:hvm:x:i:t:W:b:cu:";
            } else {
                optstring = "+:hvm:x:i:b:cu:";
            }
        }
    } else if (options.bench == COLLECTIVE) {
        if (options.subtype == LAT ||
                options.subtype == ALLTOALL ||
                options.subtype == GATHER ||
                options.subtype == REDUCE ||
                options.subtype == SCATTER ||
                options.subtype == REDUCE_SCATTER ||
                options.subtype == BCAST) { /* Blocking */
            optstring = "+:hvfm:i:x:M:a:cu:";
            if (accel_enabled) {
                optstring = (CUDA_KERNEL_ENABLED) ? "+:d:hvfm:i:x:M:r:a:cu:" :
                    "+:d:hvfm:i:x:M:a:cu:";
            }
        } else if (options.subtype == NBC) {
            optstring = "+:hvfm:i:x:M:t:a:";
            if (accel_enabled) {
                optstring = (CUDA_KERNEL_ENABLED) ? "+:d:hvfm:i:x:M:t:r:a:" :
                    "+:d:hvfm:i:x:M:t:a:";
            }
        } else { /* Non-Blocking */
            optstring = "+:hvfm:i:x:M:t:a:cu:";
            if (accel_enabled) {
                optstring = (CUDA_KERNEL_ENABLED) ? "+:d:hvfm:i:x:M:t:r:a:cu:" :
                    "+:d:hvfm:i:x:M:t:a:cu:";
            }
        }
    } else if (options.bench == ONE_SIDED) {
        if(options.subtype == BW) {
            optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:W:" : "+:w:s:hvm:x:i:W:";
        } else {
            optstring = (accel_enabled) ? "+:w:s:hvm:d:x:i:" : "+:w:s:hvm:x:i:";
        }

    } else if (options.bench == MBW_MR){
        optstring = (accel_enabled) ? "p:W:R:x:i:m:d:Vhvb:cu:" :
            "p:W:R:x:i:m:Vhvb:cu:";
    } else if (options.bench == OSHM || options.bench == UPC || options.bench == UPCXX) {
        optstring = ":hvfm:i:M:";
    } else {
        fprintf(stderr,"Invalid benchmark type");
        exit(1);
    }

    /* Set default options*/
    options.accel = NONE;
    options.show_size = 1;
    options.show_full = 0;
    options.num_probes = 0;
    options.device_array_size = 32;
    options.target = CPU;
    options.min_message_size = MIN_MESSAGE_SIZE;
    if (options.bench == COLLECTIVE) {
        options.max_message_size = MAX_MSG_SIZE_COLL;
    } else {
        options.max_message_size = MAX_MESSAGE_SIZE;
    }
    options.max_mem_limit = MAX_MEM_LIMIT;
    options.window_size_large = WINDOW_SIZE_LARGE;
    options.window_size = WINDOW_SIZE_LARGE;
    options.window_varied = 0;
    options.print_rate = 1;
    options.validate = 0;
    options.buf_num = SINGLE;

    options.src = 'H';
    options.dst = 'H';

    switch (options.subtype) {
        case BW:
            options.iterations = BW_LOOP_SMALL;
            options.skip = BW_SKIP_SMALL;
            options.iterations_large = BW_LOOP_LARGE;
            options.skip_large = BW_SKIP_LARGE;
            options.warmup_validation = VALIDATION_SKIP_DEFAULT;
            break;
        case LAT_MT:
            options.num_threads = DEF_NUM_THREADS;
            options.min_message_size = 0;
            options.sender_thread=-1;
        case LAT_MP:
            options.num_processes = DEF_NUM_PROCESSES;
            options.min_message_size = 0;
            options.sender_processes = DEF_NUM_PROCESSES;
        case LAT:
        case GATHER:
        case ALLTOALL:
        case NBC_ALLTOALL:
        case NBC_GATHER:
        case NBC_REDUCE:
        case NBC_SCATTER:
        case NBC_BCAST:
        case REDUCE:
        case SCATTER:
        case BCAST:
        case REDUCE_SCATTER:
        case NBC:
            if (options.bench == COLLECTIVE) {
                options.iterations = COLL_LOOP_SMALL;
                options.skip = COLL_SKIP_SMALL;
                options.iterations_large = COLL_LOOP_LARGE;
                options.skip_large = COLL_SKIP_LARGE;
            } else {
                options.iterations = LAT_LOOP_SMALL;
                options.skip = LAT_SKIP_SMALL;
                options.iterations_large = LAT_LOOP_LARGE;
                options.skip_large = LAT_SKIP_LARGE;
            }
            if (options.bench == PT2PT) {
                options.min_message_size = 0;
            }
            options.warmup_validation = VALIDATION_SKIP_DEFAULT;
            break;
        default:
            break;
    }

    switch (options.bench) {
        case UPCXX:
        case UPC:
            options.show_size = 0;
        case OSHM:
            options.iterations = OSHM_LOOP_SMALL;
            options.skip = OSHM_SKIP_SMALL;
            options.iterations_large = OSHM_LOOP_LARGE;
            options.skip_large = OSHM_SKIP_LARGE;
            options.max_message_size = 1<<20;
            break;
        default:
            break;
    }

    while ((c = getopt_long(argc, argv, optstring, long_options, &option_index)) != -1) {
        bad_usage.opt = c;
        bad_usage.optarg = NULL;
        bad_usage.message = NULL;

        switch(c) {
            case 'h':
                return PO_HELP_MESSAGE;
            case 'v':
                return PO_VERSION_MESSAGE;
            case 'm':
                if (set_message_size(optarg)) {
                    bad_usage.message = "Invalid Message Size";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 't':
                if (options.bench == COLLECTIVE) {
                    if (set_num_probes(atoi(optarg))){
                        bad_usage.message = "Invalid Number of Probes";
                        bad_usage.optarg = optarg;

                        return PO_BAD_USAGE;
                    }
                } else if (options.bench == PT2PT) {
                    if (options.subtype == LAT_MT) {
                        if (set_threads(optarg)){
                            bad_usage.message = "Invalid Number of Threads";
                            bad_usage.optarg = optarg;

                            return PO_BAD_USAGE;
                        }
                    } else if (options.subtype == LAT_MP) {
                        if (set_processes(optarg)){
                            bad_usage.message = "Invalid Number of Processes";
                            bad_usage.optarg = optarg;

                            return PO_BAD_USAGE;
                        }
                    }
                }
                break;
            case 'i':
                if (set_num_iterations(atoi(optarg))) {
                    bad_usage.message = "Invalid Number of Iterations";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 'x':
                if (set_num_warmup(atoi(optarg))) {
                    bad_usage.message = "Invalid Number of Warmup Iterations";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 'R':
                options.print_rate = atoi(optarg);
                if (0 != options.print_rate && 1 != options.print_rate) {
                    return PO_BAD_USAGE;
                }
                break;
            case 'W':
                if (set_window_size(atoi(optarg))) {
                    bad_usage.message = "Invalid Window Size";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 'V':
                options.window_varied = 1;
                break;
            case 'p':
                options.pairs = atoi(optarg);
                break;
            case 'a':
                if (set_device_array_size(atoi(optarg))){
                    bad_usage.message = "Invalid Device Array Size";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 'f':
                options.show_full = 1;
                break;
            case 'M':
                /*
                 * This function does not error but prints a warning message if
                 * the value is too low.
                 */
                set_max_memlimit(atoll(optarg));
                break;
            case 'd':
                if (!accel_enabled) {
                    bad_usage.message = "Benchmark Does Not Support "
                            "Accelerator Transfers";
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                } else if (0 == strncasecmp(optarg, "cuda", 10)) {
                    if (CUDA_ENABLED) {
                        options.accel = CUDA;
                    } else {
                        bad_usage.message = "CUDA Support Not Enabled\n"
                                "Please recompile benchmark with CUDA support";
                        bad_usage.optarg = optarg;
                        return PO_BAD_USAGE;
                    }
                } else if (0 == strncasecmp(optarg, "managed", 10)) {
                    if (CUDA_ENABLED) {
                        options.accel = MANAGED;
                    } else {
                        bad_usage.message = "CUDA Managed Memory Support Not Enabled\n"
                                "Please recompile benchmark with CUDA support";
                        bad_usage.optarg = optarg;
                        return PO_BAD_USAGE;
                    }
                } else if (0 == strncasecmp(optarg, "openacc", 10)) {
                    if (OPENACC_ENABLED) {
                        options.accel = OPENACC;
                    } else {
                        bad_usage.message = "OpenACC Support Not Enabled\n"
                                "Please recompile benchmark with OpenACC support";
                        bad_usage.optarg = optarg;
                        return PO_BAD_USAGE;
                    }
                } else if (0 == strncasecmp(optarg, "rocm", 10)) {
                    if (ROCM_ENABLED) {
                        options.accel = ROCM;
                    } else {
                        bad_usage.message = "ROCm Support Not Enabled\n"
                                "Please recompile benchmark with ROCm support";
                        bad_usage.optarg = optarg;
                        return PO_BAD_USAGE;
                    }
                } else {
                    bad_usage.message = "Invalid Accel Type Specified";
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                }
                break;
            case 'r':
                if (CUDA_KERNEL_ENABLED) {
                    if (0 == strncasecmp(optarg, "cpu", 10)) {
                        options.target = CPU;
                    } else if (0 == strncasecmp(optarg, "gpu", 10)) {
                        options.target = GPU;
                    } else if (0 == strncasecmp(optarg, "both", 10)) {
                        options.target = BOTH;
                    } else {
                        bad_usage.message = "Please use cpu, gpu, or both";
                        bad_usage.optarg = optarg;
                        return PO_BAD_USAGE;
                    }
                } else {
                    bad_usage.message = "CUDA Kernel Support Not Enabled\n"
                            "Please recompile benchmark with CUDA Kernel support";
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                }
                break;
            case 'w':
                ret = process_one_sided_options(c, optarg);
                if (ret == PO_BAD_USAGE) {
                    bad_usage.message = "Invalid option or invalid argument";
                    bad_usage.opt = c;
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                }
                break;
            case 's':
                ret = process_one_sided_options(c, optarg);
                if (ret == PO_BAD_USAGE) {
                    bad_usage.message = "Invalid option or invalid argument";
                    bad_usage.opt = c;
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                }
                break;
            case 'c':
                options.validate = 1;
                break;
            case 'u':
                if (set_num_warmup_validation(atoi(optarg))) {
                    bad_usage.message = "Invalid Number of Validation Warmup "
                        " Iterations";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                if (options.warmup_validation >  VALIDATION_SKIP_MAX) {
                    bad_usage.message = "Number of Validation Warmup Iterations"
                        "must be less than 10";
                    bad_usage.optarg = optarg;

                    return PO_BAD_USAGE;
                }
                break;
            case 'b':
                if (0 == strncasecmp(optarg, "single", 10)) {
                    options.buf_num = SINGLE;
                } else if (0 == strncasecmp(optarg, "multiple", 10)) {
                    options.buf_num = MULTIPLE;
                } else {
                    bad_usage.message = "Please use 'single' or 'multiple' for buffer type";
                    bad_usage.optarg = optarg;
                    return PO_BAD_USAGE;
                }
                break;
            case ':':
                bad_usage.message = "Option Missing Required Argument";
                bad_usage.opt = optopt;
                return PO_BAD_USAGE;
            default:
                bad_usage.message = "Invalid option";
                bad_usage.opt = optopt;
                return PO_BAD_USAGE;
        }
    }

    if (!options.validate) {
        options.warmup_validation = 0;
    }

    if (accel_enabled) {
        if ((optind + 2) == argc) {
            options.src = argv[optind][0];
            if (options.src == 'M')
            {
#ifdef _ENABLE_CUDA_KERNEL_
                options.MMsrc = argv[optind][1];
                if (options.MMsrc == '\0') {
                    fprintf(stderr, "The M flag for destination buffer is "
                            "deprecated. Please use MD or MH to set the "
                            "effective location of CUDA Unified Memory buffers "
                            "to be device or host respectively. Currently M "
                            "flag is considered as MH\n");
                    options.MMsrc = 'H';
                } else if (options.MMsrc != 'D' && options.MMsrc != 'H') {
                    fprintf(stderr, "Please use MD or MH to set the effective "
                            "location of CUDA Unified Memory buffers to be "
                            "device or host respectively\n");
                    return PO_BAD_USAGE;
                }
#else
                fprintf(stderr, "Managed memory support requires CUDA kernels.\n");
                return PO_BAD_USAGE;
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
            }
            options.dst = argv[optind + 1][0];
            if (options.dst == 'M')
            {
#ifdef _ENABLE_CUDA_KERNEL_
                options.MMdst = argv[optind+1][1];
                if (options.MMdst == '\0') {
                    fprintf(stderr, "The M flag for destination buffer is "
                            "deprecated. Please use MD or MH to set the "
                            "effective location of CUDA Unified Memory buffers "
                            "to be device or host respectively. Currently M "
                            "flag is considered as MH\n");
                    options.MMdst = 'H';
                } else if (options.MMdst != 'D' && options.MMdst != 'H') {
                    fprintf(stderr, "Please use MD or MH to set the effective "
                            "location of CUDA Unified Memory buffers to be "
                            "device or host respectively\n");
                    return PO_BAD_USAGE;
                }
#else
                fprintf(stderr, "Managed memory support requires CUDA kernels.\n");
                return PO_BAD_USAGE;
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
            }
            /* No need to check if '-d' is given */
            if (NONE == options.accel || options.bench == PT2PT) {
                setAccel(options.src);
                setAccel(options.dst);
            }
        } else if (optind != argc) {
            return PO_BAD_USAGE;
        }
    }

    return PO_OKAY;
}

/* Set the initial accelerator type */
int setAccel(char buf_type)
{
    switch (buf_type) {
        case 'H':
            break;
        case 'M':
            /* For managed memory benchmarks, use multiple buffers to report
             * accurate performance numbers */
            options.buf_num = MULTIPLE;
        case 'D':
            if (options.bench != PT2PT && options.bench != ONE_SIDED && options.bench != MBW_MR) {
                bad_usage.opt = buf_type;
                bad_usage.message = "This argument is only supported for one-sided and pt2pt benchmarks";
                return PO_BAD_USAGE;
            }
            if (NONE == options.accel || MANAGED == options.accel) {
#if defined(_ENABLE_OPENACC_) && !defined(_ENABLE_CUDA_)
                options.accel = OPENACC;
#elif defined(_ENABLE_CUDA_)
                options.accel = CUDA;
#elif defined(_ENABLE_ROCM_)
                options.accel = ROCM;
#endif
            }
            break;
        default:
            return PO_BAD_USAGE;
    }
    return PO_OKAY;
}


double getMicrosecondTimeStamp()
{
    double retval;
    struct timeval tv;
    if (gettimeofday(&tv, NULL)) {
        perror("gettimeofday");
        abort();
    }
    retval = ((double)tv.tv_sec) * 1000000 + tv.tv_usec;
    return retval;
}

void wtime(double *t)
{
    static int sec = -1;
    struct timeval tv;
    //gettimeofday(&tv, (void *)0);
    gettimeofday(&tv, 0);
    if (sec < 0) sec = tv.tv_sec;
    *t = (tv.tv_sec - sec)*1.0e+6 + tv.tv_usec;
}


/* vi:set sw=4 sts=4 tw=80: */