#ifndef UCX_LIBPERF_H
#define UCX_LIBPERF_H
#include <ucs/sys/compiler.h>
BEGIN_C_DECLS
#include <uct/api/uct.h>
#include <ucp/api/ucp.h>
/**
 * Identifies a UCX API layer. A value of this type is stored in
 * ucx_perf_params_t::api.
 */
typedef enum {
    UCX_PERF_API_UCT,   /* UCT layer */
    UCX_PERF_API_UCP,   /* UCP layer */
    UCX_PERF_API_LAST   /* Count of the entries above (sentinel) */
} ucx_perf_api_t;
/**
 * Identifies the communication operation of a test. A value of this type is
 * stored in ucx_perf_params_t::command.
 *
 * The enumerator names abbreviate the operation (AM, PUT, GET, the
 * ADD/FADD/SWAP/CSWAP group, TAG, TAG_SYNC, STREAM). Their exact semantics are
 * defined by the test implementations, which are not part of this header.
 */
typedef enum {
    UCX_PERF_CMD_AM,
    UCX_PERF_CMD_PUT,
    UCX_PERF_CMD_GET,
    UCX_PERF_CMD_ADD,
    UCX_PERF_CMD_FADD,
    UCX_PERF_CMD_SWAP,
    UCX_PERF_CMD_CSWAP,
    UCX_PERF_CMD_TAG,
    UCX_PERF_CMD_TAG_SYNC,
    UCX_PERF_CMD_STREAM,
    UCX_PERF_CMD_LAST       /* Count of the entries above (sentinel) */
} ucx_perf_cmd_t;
/**
 * Identifies the traffic pattern of a test. A value of this type is stored in
 * ucx_perf_params_t::test_type.
 *
 * NOTE(review): the names suggest ping-pong (optionally waiting on memory) and
 * uni-/bi-directional streaming patterns; confirm against the test
 * implementations.
 */
typedef enum {
    UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM,
    UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_TEST_TYPE_STREAM_BI,
    UCX_PERF_TEST_TYPE_LAST     /* Count of the entries above (sentinel) */
} ucx_perf_test_type_t;
/**
 * Datatype selector for UCP tests. Values of this type are stored in
 * ucx_perf_params_t::ucp.send_datatype and ucx_perf_params_t::ucp.recv_datatype.
 *
 * Unlike the other enumerations in this header, this one has no *_LAST
 * sentinel. The last enumerator carries no trailing comma, which matches the
 * other enumerations here and keeps the header acceptable to pedantic
 * C90/C++03 consumers.
 */
typedef enum {
    UCP_PERF_DATATYPE_CONTIG,   /* presumably a contiguous buffer -- confirm */
    UCP_PERF_DATATYPE_IOV       /* presumably an IOV (scatter/gather) list -- confirm */
} ucp_perf_datatype_t;
/**
 * Data layout selector for UCT tests. A value of this type is stored in
 * ucx_perf_params_t::uct.data_layout.
 *
 * NOTE(review): the names follow the UCT short / bcopy / zcopy send variants;
 * the mapping to actual UCT calls lives in the test implementations.
 */
typedef enum {
    UCT_PERF_DATA_LAYOUT_SHORT,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV,
    UCT_PERF_DATA_LAYOUT_BCOPY,
    UCT_PERF_DATA_LAYOUT_ZCOPY,
    UCT_PERF_DATA_LAYOUT_LAST   /* Count of the entries above (sentinel) */
} uct_perf_data_layout_t;
/**
 * Wait-mode selector. A value of this type is stored in
 * ucx_perf_params_t::wait_mode.
 *
 * NOTE(review): how each mode waits (polling, sleeping, spinning) is decided
 * by the implementation and cannot be verified from this header.
 */
typedef enum {
    UCX_PERF_WAIT_MODE_POLL,
    UCX_PERF_WAIT_MODE_SLEEP,
    UCX_PERF_WAIT_MODE_SPIN,
    UCX_PERF_WAIT_MODE_LAST     /* Count of the entries above (sentinel) */
} ucx_perf_wait_mode_t;
/**
 * Flag values for performance tests. Every enumerator is defined through
 * UCS_BIT() with a distinct index (assumes UCS_BIT(i) yields a value with only
 * bit i set -- see its definition in the UCS headers).
 *
 * ucx_perf_params_t has an unsigned "flags" member which presumably holds a
 * bitwise OR of these values; confirm against the users of that field.
 *
 * Indices 0 and 6 are not assigned in this enumeration.
 */
enum ucx_perf_test_flags {
    UCX_PERF_TEST_FLAG_VALIDATE         = UCS_BIT(1),
    UCX_PERF_TEST_FLAG_ONE_SIDED        = UCS_BIT(2),
    UCX_PERF_TEST_FLAG_MAP_NONBLOCK     = UCS_BIT(3),
    UCX_PERF_TEST_FLAG_TAG_WILDCARD     = UCS_BIT(4),
    UCX_PERF_TEST_FLAG_TAG_UNEXP_PROBE  = UCS_BIT(5),
    UCX_PERF_TEST_FLAG_VERBOSE          = UCS_BIT(7),
    UCX_PERF_TEST_FLAG_STREAM_RECV_DATA = UCS_BIT(8),
    UCX_PERF_TEST_FLAG_FLUSH_EP         = UCS_BIT(9),
    UCX_PERF_TEST_FLAG_WAKEUP           = UCS_BIT(10),
    UCX_PERF_TEST_FLAG_ERR_HANDLING     = UCS_BIT(11),
    UCX_PERF_TEST_FLAG_LOOPBACK         = UCS_BIT(12)
};
/*
 * Integer constant 127, exposed as an enumerator rather than a macro.
 * NOTE(review): presumably the upper bound for ucx_perf_params_t::uct.fc_window;
 * confirm where the parameters are validated.
 */
enum {
    UCT_PERF_TEST_MAX_FC_WINDOW = 127
};
/*
 * printf-style helpers for printing the UCT transport/device pair of a
 * parameters object:
 *
 *   UCT_PERF_TEST_PARAMS_FMT          - the format string, "%s/%s"
 *   UCT_PERF_TEST_PARAMS_ARG(_params) - the two matching arguments:
 *                                       uct.tl_name, then uct.dev_name
 *
 * _params must be a pointer expression; it is expanded (and so evaluated)
 * twice. The expansion is a bare comma-separated pair, so it is only
 * meaningful inside a function argument list.
 */
#define UCT_PERF_TEST_PARAMS_FMT             "%s/%s"
#define UCT_PERF_TEST_PARAMS_ARG(_params)    (_params)->uct.tl_name, \
                                             (_params)->uct.dev_name
/*
 * Unsigned 64-bit counter type. In this header it is the type of the
 * iteration and byte counters of ucx_perf_result_t and of the iteration
 * limits (warmup_iter, max_iter) of ucx_perf_params_t.
 */
typedef uint64_t ucx_perf_counter_t;
/**
 * Measurement results. ucx_perf_run() takes a non-const pointer to this
 * structure, and the RTE report callback (ucx_perf_rte_report_func_t) receives
 * a const pointer to it.
 *
 * NOTE(review): units are not stated anywhere in this header; the remarks on
 * the individual fields are inferred from their names and must be confirmed
 * against the code that fills them in.
 */
typedef struct ucx_perf_result {
    ucx_perf_counter_t      iters;          /* Iteration counter */
    double                  elapsed_time;   /* Elapsed time (unit: TODO confirm) */
    ucx_perf_counter_t      bytes;          /* Byte counter */

    /* One anonymous structure type shared by the three metrics below */
    struct {
        double              percentile;     /* presumably the value at
                                               ucx_perf_params_t::percentile_rank */
        double              moment_average; /* presumably a recent/momentary average */
        double              total_average;  /* presumably the whole-run average */
    } latency, bandwidth, msgrate;
} ucx_perf_result_t;
/*
 * Progress callback: receives one opaque argument and returns nothing.
 * This is the type of the "progress" parameter of ucx_perf_rte_barrier_func_t.
 */
typedef void (*ucx_perf_rte_progress_cb_t)(void *arg);
/*
 * RTE callback taking an opaque group handle and returning an unsigned value;
 * presumably the number of members in the group (confirm with an RTE
 * implementation).
 */
typedef unsigned (*ucx_perf_rte_group_size_func_t)(void *rte_group);
/*
 * RTE callback taking an opaque group handle and returning an unsigned value;
 * presumably the caller's own index within the group (confirm with an RTE
 * implementation).
 */
typedef unsigned (*ucx_perf_rte_group_index_func_t)(void *rte_group);
/*
 * RTE barrier callback.
 *
 *   rte_group - opaque group handle
 *   progress  - progress callback supplied by the caller
 *   arg       - opaque argument; presumably what gets passed to progress()
 *
 * NOTE(review): whether and how often progress() is invoked while waiting is
 * up to the RTE implementation and is not visible from this header.
 */
typedef void (*ucx_perf_rte_barrier_func_t)(void *rte_group,
                                            ucx_perf_rte_progress_cb_t progress,
                                            void *arg);
/*
 * RTE callback that is handed a vector of buffers.
 *
 *   rte_group - opaque group handle
 *   iovec     - array of buffer descriptors; not modified through this pointer
 *   iovcnt    - presumably the number of entries in iovec
 *   req       - pointer to an opaque request pointer; presumably an output
 *               that is later passed to the recv / exchange_vec callbacks
 *               (TODO confirm with an RTE implementation)
 */
typedef void (*ucx_perf_rte_post_vec_func_t)(void *rte_group,
                                             const struct iovec *iovec,
                                             int iovcnt,
                                             void **req);
/*
 * RTE receive callback.
 *
 *   rte_group - opaque group handle
 *   src       - unsigned source identifier; presumably a group index
 *   buffer    - destination buffer
 *   max       - presumably the capacity of buffer, in bytes
 *   req       - opaque request pointer
 *
 * NOTE(review): blocking behaviour and the handling of oversized messages are
 * defined by the RTE implementation, not by this header.
 */
typedef void (*ucx_perf_rte_recv_func_t)(void *rte_group,
                                         unsigned src,
                                         void *buffer,
                                         size_t max,
                                         void *req);
/*
 * RTE callback taking an opaque group handle and an opaque request pointer.
 * NOTE(review): presumably completes an exchange started through
 * ucx_perf_rte_post_vec_func_t; confirm with an RTE implementation.
 */
typedef void (*ucx_perf_rte_exchange_vec_func_t)(void *rte_group, void *req);
/*
 * RTE report callback.
 *
 *   rte_group       - opaque group handle
 *   result          - results to report; not modified through this pointer
 *   arg             - opaque argument; presumably ucx_perf_params_t::report_arg
 *   extra_info      - additional text; not modified through this pointer
 *   is_final        - integer used as a boolean; presumably non-zero for the
 *                     last report of a run
 *   is_multi_thread - integer used as a boolean; presumably non-zero when the
 *                     result comes from a multi-threaded run
 */
typedef void (*ucx_perf_rte_report_func_t)(void *rte_group,
                                           const ucx_perf_result_t *result,
                                           void *arg,
                                           const char *extra_info,
                                           int is_final,
                                           int is_multi_thread);
/**
 * Table of run-time environment (RTE) callbacks, one member per callback type
 * declared in this header. ucx_perf_params_t::rte points to such a table, and
 * every callback receives an opaque group handle as its first argument
 * (ucx_perf_params_t::rte_group is presumably that handle).
 */
typedef struct ucx_perf_rte {
    ucx_perf_rte_group_size_func_t   group_size;    /* Group size query */
    ucx_perf_rte_group_index_func_t  group_index;   /* Group index query */
    ucx_perf_rte_barrier_func_t      barrier;       /* Barrier */
    ucx_perf_rte_post_vec_func_t     post_vec;      /* Post a vector of buffers */
    ucx_perf_rte_recv_func_t         recv;          /* Receive into a buffer */
    ucx_perf_rte_exchange_vec_func_t exchange_vec;  /* Exchange for a request */
    ucx_perf_rte_report_func_t       report;        /* Report results */
} ucx_perf_rte_t;
/**
 * Describes a test to run. A const pointer to this structure is the input
 * argument of ucx_perf_run().
 *
 * NOTE(review): the code that consumes these fields is not part of this
 * header, so the per-field remarks below are inferred from names and types
 * only. Units, defaults and valid ranges in particular must be confirmed
 * against the libperf implementation.
 */
typedef struct ucx_perf_params {
    ucx_perf_api_t         api;             /* API layer selector */
    ucx_perf_cmd_t         command;         /* Operation selector */
    ucx_perf_test_type_t   test_type;       /* Traffic pattern selector */
    ucs_thread_mode_t      thread_mode;     /* UCS thread mode */
    unsigned               thread_count;    /* Thread count */
    ucs_async_mode_t       async_mode;      /* UCS async mode */
    ucx_perf_wait_mode_t   wait_mode;       /* Wait mode selector */
    ucs_memory_type_t      send_mem_type;   /* Memory type on the send side */
    ucs_memory_type_t      recv_mem_type;   /* Memory type on the receive side */
    unsigned               flags;           /* presumably an OR of
                                               enum ucx_perf_test_flags values */

    size_t                 *msg_size_list;  /* Message sizes; presumably an array
                                               of msg_size_cnt entries */
    size_t                 msg_size_cnt;    /* Message size count */
    size_t                 iov_stride;      /* IOV stride */
    size_t                 alignment;       /* Alignment */
    unsigned               max_outstanding; /* Outstanding operations limit */
    ucx_perf_counter_t     warmup_iter;     /* Warm-up iteration count */
    ucx_perf_counter_t     max_iter;        /* Iteration limit */
    double                 max_time;        /* Time limit (unit: TODO confirm) */
    double                 report_interval; /* Interval between reports
                                               (unit: TODO confirm) */
    double                 percentile_rank; /* Rank of the reported percentile */

    void                   *rte_group;      /* Opaque RTE group handle */
    ucx_perf_rte_t         *rte;            /* RTE callback table */
    void                   *report_arg;     /* Opaque argument; presumably passed
                                               to the RTE report callback */

    /* Parameters specific to the UCT API */
    struct {
        char                   dev_name[UCT_DEVICE_NAME_MAX]; /* Device name */
        char                   tl_name[UCT_TL_NAME_MAX];      /* Transport name */
        char                   md_name[UCT_MD_NAME_MAX];      /* Memory domain name */
        uct_perf_data_layout_t data_layout;                   /* Data layout selector */
        unsigned               fc_window;                     /* Flow-control window */
        size_t                 am_hdr_size;                   /* Active message
                                                                 header size */
    } uct;

    /* Parameters specific to the UCP API */
    struct {
        unsigned               nonblocking_mode; /* Non-blocking mode switch */
        ucp_perf_datatype_t    send_datatype;    /* Datatype on the send side */
        ucp_perf_datatype_t    recv_datatype;    /* Datatype on the receive side */
        size_t                 am_hdr_size;      /* Active message header size */
    } ucp;
} ucx_perf_params_t;
/* Allocator type; this header declares only its name, so it is opaque here. */
typedef struct ucx_perf_allocator ucx_perf_allocator_t;

/*
 * Array of pointers to constant allocators, defined in another translation
 * unit. Its length is not specified by this declaration.
 * NOTE(review): the name suggests it is indexed by ucs_memory_type_t; confirm
 * at the definition.
 */
extern const ucx_perf_allocator_t *ucx_perf_mem_type_allocators[];
/**
 * Global initialization entry point of libperf. Takes no arguments and
 * returns nothing.
 *
 * NOTE(review): what is initialized, and whether repeated calls are allowed,
 * is decided by the definition, which is not part of this header.
 *
 * The parameter list is spelled (void) on purpose: in C (before C23) an empty
 * list declares a function without a prototype, which disables argument
 * checking at call sites.
 */
void ucx_perf_global_init(void);
/**
 * Entry point for running a test.
 *
 * @param [in]  params  Test description; not modified through this pointer.
 * @param [out] result  Result structure, passed as a non-const pointer and
 *                      presumably filled in with the measurements (confirm
 *                      against the definition).
 *
 * @return A ucs_status_t status code. Which codes are possible is decided by
 *         the definition, which is not part of this header.
 */
ucs_status_t ucx_perf_run(const ucx_perf_params_t *params,
                          ucx_perf_result_t *result);
END_C_DECLS
#endif