/* * Copyright (C) 2002-2021 the Network-Based Computing Laboratory * (NBCL), The Ohio State University. * * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) * * For detailed copyright and licensing information, please refer to the * copyright file COPYRIGHT in the top level OMB directory. */ #include #include "osu_util.h" #define MPI_CHECK(stmt) \ do { \ int mpi_errno = (stmt); \ if (MPI_SUCCESS != mpi_errno) { \ fprintf(stderr, "[%s:%d] MPI call failed with %d \n", \ __FILE__, __LINE__,mpi_errno); \ exit(EXIT_FAILURE); \ } \ assert(MPI_SUCCESS == mpi_errno); \ } while (0) extern MPI_Aint disp_remote; extern MPI_Aint disp_local; /* * Non-blocking Collectives */ double call_test(int * num_tests, MPI_Request** request); void allocate_device_arrays(int n); double dummy_compute(double target_secs, MPI_Request *request); void init_arrays(double seconds); double do_compute_and_probe(double seconds, MPI_Request *request); void free_host_arrays(); #ifdef _ENABLE_CUDA_KERNEL_ extern void call_kernel(float a, float *d_x, float *d_y, int N, cudaStream_t *stream); void free_device_arrays(); #endif /* * Managed Memory */ #ifdef _ENABLE_CUDA_KERNEL_ void touch_managed(char *buf, size_t length); void launch_empty_kernel(char *buf, size_t length); void create_cuda_stream(); void destroy_cuda_stream(); void synchronize_device(); void synchronize_stream(); void prefetch_data(char *buf, size_t length, int devid); void create_cuda_event(); void destroy_cuda_event(); void event_record_start(); void event_record_stop(); void event_elapsed_time(float *); extern void call_touch_managed_kernel(char *buf, size_t length, cudaStream_t *stream); extern void call_empty_kernel(char *buf, size_t length, cudaStream_t *stream); #define PREFETCH_THRESHOLD 131072 #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */ /* * Print Information */ void print_bad_usage_message (int rank); void print_help_message (int rank); void print_version_message (int rank); void print_preamble (int rank); void print_preamble_nbc (int rank); void print_stats (int rank, int size, double avg, double min, double max); void print_stats_validate(int rank, int size, double avg, double min, double max, int errors); void print_stats_nbc (int rank, int size, double ovrl, double cpu, double avg_comm, double min_comm, double max_comm, double wait, double init, double test, int errors); /* * Memory Management */ int allocate_memory_coll (void ** buffer, size_t size, enum accel_type type); void free_buffer (void * buffer, enum accel_type type); void set_buffer (void * buffer, enum accel_type type, int data, size_t size); void set_buffer_pt2pt (void * buffer, int rank, enum accel_type type, int data, size_t size); void set_buffer_validation(void* s_buf, void* r_buf, size_t size, enum accel_type type, int iter); void set_buffer_float (float * buffer, int is_send_buf, size_t size, int iter, enum accel_type type); void set_buffer_char (char * buffer, int is_send_buf, size_t size, int rank, int num_procs, enum accel_type type, int iter); /* * CUDA Context Management */ int init_accel (void); int cleanup_accel (void); extern MPI_Request request[MAX_REQ_NUM]; extern MPI_Status reqstat[MAX_REQ_NUM]; extern MPI_Request send_request[MAX_REQ_NUM]; extern MPI_Request recv_request[MAX_REQ_NUM]; void usage_mbw_mr(); int allocate_memory_pt2pt (char **sbuf, char **rbuf, int rank); int allocate_memory_pt2pt_size (char **sbuf, char **rbuf, int rank, size_t size); int allocate_memory_pt2pt_mul (char **sbuf, char **rbuf, int rank, int pairs); int allocate_memory_pt2pt_mul_size (char **sbuf, char **rbuf, int rank, int pairs, size_t size); void print_header_pt2pt (int rank, int type); void free_memory (void *sbuf, void *rbuf, int rank); void free_memory_pt2pt_mul (void *sbuf, void *rbuf, int rank, int pairs); void print_header(int rank, int full); void usage_one_sided (char const *); void print_header_one_sided (int, enum WINDOW, enum SYNC); void print_help_message_get_acc_lat (int); extern char const * benchmark_header; extern char const * benchmark_name; extern int accel_enabled; extern struct options_t options; extern struct bad_usage_t bad_usage; void allocate_memory_one_sided(int rank, char **sbuf, char **win_base, size_t size, enum WINDOW type, MPI_Win *win); void free_memory_one_sided (void *user_buf, void *win_baseptr, enum WINDOW win_type, MPI_Win win, int rank); void allocate_atomic_memory(int rank, char **sbuf, char **tbuf, char **cbuf, char **win_base, size_t size, enum WINDOW type, MPI_Win *win); void free_atomic_memory (void *sbuf, void *win_baseptr, void *tbuf, void *cbuf, enum WINDOW type, MPI_Win win, int rank); int omb_get_local_rank(); /* * Data Validation */ #define VALIDATION_STATUS(error) (error > 0) ? "Fail" : "Pass" #define ERROR_DELTA 0.001 uint8_t validate_data(void* r_buf, size_t size, int num_procs, enum accel_type type, int iter); int validate_reduction(float * buffer, size_t size, int iter, int num_procs, enum accel_type type); int validate_collective(char *buffer, size_t size, int value1, int value2, enum accel_type type, int itr); int validate_reduce_scatter(float *buffer, size_t size, int* recvcounts, int rank, int num_procs, enum accel_type type, int iter);