#ifndef UCT_IB_IFACE_H
#define UCT_IB_IFACE_H
#include "ib_md.h"
#include <uct/api/uct.h>
#include <uct/base/uct_iface.h>
#include <uct/base/uct_iov.inl>
#include <ucs/sys/compiler.h>
#include <ucs/sys/string.h>
#include <ucs/sys/math.h>
#include <ucs/datastruct/mpool.inl>
#include <ucs/datastruct/string_buffer.h>
#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u
#define UCT_IB_MAX_ATOMIC_SIZE sizeof(uint64_t)
#define UCT_IB_ADDRESS_INVALID_GID_INDEX UINT8_MAX
#define UCT_IB_ADDRESS_INVALID_PATH_MTU ((enum ibv_mtu)0)
#define UCT_IB_ADDRESS_INVALID_PKEY 0
#define UCT_IB_ADDRESS_DEFAULT_PKEY 0xffff
#define UCT_IB_SL_NUM 16
/* Forward typedefs; the corresponding structs are defined later in this header. */
typedef struct uct_ib_iface_config uct_ib_iface_config_t;
typedef struct uct_ib_iface_ops uct_ib_iface_ops_t;
typedef struct uct_ib_iface uct_ib_iface_t;
typedef enum uct_ib_mtu {
UCT_IB_MTU_DEFAULT = 0,
UCT_IB_MTU_512 = 1,
UCT_IB_MTU_1024 = 2,
UCT_IB_MTU_2048 = 3,
UCT_IB_MTU_4096 = 4,
UCT_IB_MTU_LAST
} uct_ib_mtu_t;
/*
 * Traffic direction. In this header it sizes per-direction arrays
 * (cq[], inl[], cq_len[], max_inl_cqe[]) through UCT_IB_DIR_NUM.
 */
typedef enum {
UCT_IB_DIR_RX,
UCT_IB_DIR_TX,
/* Number of directions; used as an array length. */
UCT_IB_DIR_NUM
} uct_ib_dir_t;
/*
 * QP type aliases. UCT_IB_QPT_DCI maps to IBV_EXP_QPT_DC_INI when HAVE_DC_EXP
 * is defined, to IBV_QPT_DRIVER when HAVE_DC_DV evaluates to non-zero, and
 * otherwise equals UCT_IB_QPT_UNKNOWN.
 */
enum {
UCT_IB_QPT_UNKNOWN,
#ifdef HAVE_DC_EXP
UCT_IB_QPT_DCI = IBV_EXP_QPT_DC_INI,
#elif HAVE_DC_DV
UCT_IB_QPT_DCI = IBV_QPT_DRIVER,
#else
UCT_IB_QPT_DCI = UCT_IB_QPT_UNKNOWN,
#endif
};
/*
 * Single-bit flags (bits 0..5).
 * NOTE(review): presumably stored in uct_ib_address_pack_params_t::flags to
 * select which address components are packed - confirm in the implementation.
 */
enum {
UCT_IB_ADDRESS_PACK_FLAG_ETH = UCS_BIT(0),
UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID = UCS_BIT(1),
UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX = UCS_BIT(2),
UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU = UCS_BIT(3),
UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX = UCS_BIT(4),
UCT_IB_ADDRESS_PACK_FLAG_PKEY = UCS_BIT(5)
};
/*
 * Parameter bundle taken by uct_ib_address_size() and uct_ib_address_pack(),
 * and filled in by uct_ib_address_unpack().
 */
typedef struct uct_ib_address_pack_params {
/* Flag word; NOTE(review): presumably a mask of UCT_IB_ADDRESS_PACK_FLAG_* - confirm. */
uint64_t flags;
union ibv_gid gid;
uint16_t lid;
uct_ib_roce_version_info_t roce_info;
enum ibv_mtu path_mtu;
/* 8-bit GID index; UCT_IB_ADDRESS_INVALID_GID_INDEX is UINT8_MAX. */
uint8_t gid_index;
uint16_t pkey;
} uct_ib_address_pack_params_t;
/*
 * IB interface configuration. Extends uct_iface_config_t ("super" is the first
 * member). NOTE(review): the fields are presumably populated from
 * uct_ib_iface_config_table, declared below - confirm. Per-field meanings noted
 * here are inferred from names unless stated otherwise.
 */
struct uct_ib_iface_config {
uct_iface_config_t super;
size_t seg_size;
/* Send-side settings. */
struct {
unsigned queue_len;
unsigned max_batch;
unsigned max_poll;
size_t min_inline;
unsigned min_sge;
/* Memory pool settings. */
uct_iface_mpool_config_t mp;
unsigned cq_moderation_count;
double cq_moderation_period;
} tx;
/* Receive-side settings. */
struct {
unsigned queue_len;
unsigned max_batch;
unsigned max_poll;
/* Memory pool settings. */
uct_iface_mpool_config_t mp;
unsigned cq_moderation_count;
double cq_moderation_period;
} rx;
/* One entry per direction (array length is UCT_IB_DIR_NUM). */
size_t inl[UCT_IB_DIR_NUM];
int addr_type;
int is_global;
unsigned long sl;
unsigned long traffic_class;
unsigned hop_limit;
unsigned long num_paths;
int rocev2_use_netmask;
unsigned roce_path_factor;
/* Array of ucs_range_spec_t entries. */
UCS_CONFIG_ARRAY_FIELD(ucs_range_spec_t, ranges) lid_path_bits;
unsigned pkey;
int enable_res_domain;
uct_ib_mtu_t path_mtu;
};
/* Bit flags tested against uct_ib_iface_init_attr_t::flags in this header. */
enum {
/* Checked by uct_ib_fill_cq_attr() to request IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN. */
UCT_IB_CQ_IGNORE_OVERRUN = UCS_BIT(0),
/* Not referenced in this header; NOTE(review): presumably tag-matching support - confirm. */
UCT_IB_TM_SUPPORTED = UCS_BIT(1),
/* Checked by uct_ib_cq_size(): the TX CQ length is multiplied by the number of paths. */
UCT_IB_TX_OPS_PER_PATH = UCS_BIT(2)
};
/*
 * Initialization attributes passed (as a const pointer) to the uct_ib_iface_t
 * class constructor and to the CQ creation callback.
 */
typedef struct uct_ib_iface_init_attr {
unsigned rx_priv_len;
unsigned rx_hdr_len;
/* Per-direction CQ length; read by uct_ib_cq_size(). */
unsigned cq_len[UCT_IB_DIR_NUM];
size_t seg_size;
unsigned fc_req_size;
int qp_type;
/* Flag word; tested against UCT_IB_CQ_IGNORE_OVERRUN and UCT_IB_TX_OPS_PER_PATH. */
int flags;
} uct_ib_iface_init_attr_t;
/*
 * QP creation attributes, taken by uct_ib_iface_fill_attr() and
 * uct_ib_iface_create_qp(). The type of the trailing "ibv" member depends on
 * which verbs QP-creation API was detected at configure time.
 */
typedef struct uct_ib_qp_attr {
int qp_type;
struct ibv_qp_cap cap;
int port;
struct ibv_srq *srq;
uint32_t srq_num;
unsigned sq_sig_all;
/* One entry per direction (array length is UCT_IB_DIR_NUM). */
unsigned max_inl_cqe[UCT_IB_DIR_NUM];
#if HAVE_DECL_IBV_EXP_CREATE_QP
struct ibv_exp_qp_init_attr ibv;
#elif HAVE_DECL_IBV_CREATE_QP_EX
struct ibv_qp_init_attr_ex ibv;
#else
struct ibv_qp_init_attr ibv;
#endif
} uct_ib_qp_attr_t;
/* CQ creation callback; same signature as uct_ib_verbs_create_cq(). */
typedef ucs_status_t (*uct_ib_iface_create_cq_func_t)(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
const uct_ib_iface_init_attr_t *init_attr,
int preferred_cpu,
size_t inl);
/* CQ arm callback; same signature as uct_ib_iface_arm_cq(). */
typedef ucs_status_t (*uct_ib_iface_arm_cq_func_t)(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
int solicited_only);
/* CQ event callback for the given direction. */
typedef void (*uct_ib_iface_event_cq_func_t)(uct_ib_iface_t *iface,
uct_ib_dir_t dir);
/* Failure callback; the meaning of "arg" is not established in this header. */
typedef void (*uct_ib_iface_handle_failure_func_t)(uct_ib_iface_t *iface, void *arg,
ucs_status_t status);
/* Endpoint-failure callback type; not referenced by uct_ib_iface_ops in this header. */
typedef ucs_status_t (*uct_ib_iface_set_ep_failed_func_t)(uct_ib_iface_t *iface, uct_ep_h ep,
ucs_status_t status);
/*
 * IB-specific operations table. Extends uct_iface_internal_ops_t ("super" is
 * the first member) with CQ-related and failure-handling callbacks.
 */
struct uct_ib_iface_ops {
uct_iface_internal_ops_t super;
uct_ib_iface_create_cq_func_t create_cq;
uct_ib_iface_arm_cq_func_t arm_cq;
uct_ib_iface_event_cq_func_t event_cq;
uct_ib_iface_handle_failure_func_t handle_failure;
};
/*
 * IB interface object. Extends uct_base_iface_t ("super" is the first member).
 * Per-field meanings noted here are limited to what this header itself uses.
 */
struct uct_ib_iface {
uct_base_iface_t super;
/* One completion queue per direction. */
struct ibv_cq *cq[UCT_IB_DIR_NUM];
/* Completion channel; assigned to the CQ attributes by uct_ib_fill_cq_attr(). */
struct ibv_comp_channel *comp_channel;
/* Installed by uct_ib_iface_invoke_am_desc() on descriptors not returned to the pool. */
uct_recv_desc_t release_desc;
uint8_t *path_bits;
unsigned path_bits_count;
/* Multiplier for the TX CQ length when UCT_IB_TX_OPS_PER_PATH is set. */
unsigned num_paths;
uint16_t pkey_index;
uint16_t pkey;
uint8_t addr_size;
uint8_t addr_prefix_bits;
uct_ib_device_gid_info_t gid_info;
struct {
unsigned rx_payload_offset;
/* Byte offset from the receive descriptor to its header (see uct_ib_iface_recv_desc_hdr()). */
unsigned rx_hdr_offset;
/* Byte offset from the receive descriptor used by uct_ib_iface_invoke_am_desc(). */
unsigned rx_headroom_offset;
unsigned rx_max_batch;
unsigned rx_max_poll;
unsigned tx_max_poll;
unsigned seg_size;
unsigned roce_path_factor;
uint8_t max_inl_cqe[UCT_IB_DIR_NUM];
/* Port number; passed to uct_ib_device_port_attr() and printed by UCT_IB_IFACE_ARG. */
uint8_t port_num;
uint8_t sl;
/* Traffic class; uct_ib_iface_roce_dscp() returns this value shifted right by 2. */
uint8_t traffic_class;
uint8_t hop_limit;
uint8_t enable_res_domain;
uint8_t qp_type;
uint8_t force_global_addr;
enum ibv_mtu path_mtu;
} config;
/* IB-specific operations table. */
uct_ib_iface_ops_t *ops;
};
/* Fence state holder; its single field is zeroed by uct_ib_fence_info_init(). */
typedef struct uct_ib_fence_info {
/* 16-bit counter; NOTE(review): update semantics are not visible in this header. */
uint16_t fence_beat;
} uct_ib_fence_info_t;
/*
 * Declares uct_ib_iface_t as a UCS class. The types after the class name are
 * passed to UCS_CLASS_DECLARE; NOTE(review): presumably they are the
 * constructor argument types, in order - confirm against the macro definition.
 */
UCS_CLASS_DECLARE(uct_ib_iface_t, uct_iface_ops_t*, uct_ib_iface_ops_t*,
uct_md_h, uct_worker_h, const uct_iface_params_t*,
const uct_ib_iface_config_t*,
const uct_ib_iface_init_attr_t*);
/*
 * Packed receive descriptor header. Helpers in this header locate other data
 * relative to its address using the rx_hdr_offset / rx_headroom_offset values
 * from the interface configuration.
 */
typedef struct uct_ib_iface_recv_desc {
/* 32-bit local key; NOTE(review): presumably of the memory holding the descriptor - confirm. */
uint32_t lkey;
} UCS_S_PACKED uct_ib_iface_recv_desc_t;
/* Configuration field table; defined in another translation unit. */
extern ucs_config_field_t uct_ib_iface_config_table[];
/* String table; NOTE(review): presumably indexed by uct_ib_mtu_t - confirm. */
extern const char *uct_ib_mtu_values[];
/*
 * Initialize the memory pool "mp" for receive descriptors of "iface".
 * The implementation is not part of this header.
 */
ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
const uct_ib_iface_config_t *config,
const uct_iface_params_t *params,
const char *name, ucs_mpool_t *mp);
/*
 * Descriptor release callback, matching the (uct_recv_desc_t*, void*) shape.
 * NOTE(review): presumably the callback stored in uct_ib_iface::release_desc - confirm.
 */
void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc);
/**
 * Invoke the active message handler for a received message and settle the
 * ownership of its receive descriptor.
 *
 * The handler is called through uct_iface_invoke_am() with
 * UCT_CB_PARAM_FLAG_DESC. If it returns UCS_OK the descriptor is put back to
 * its memory pool; for any other status the interface's release_desc is
 * attached to the descriptor (at rx_headroom_offset bytes past ib_desc)
 * instead of returning it to the pool.
 *
 * @param iface    IB interface on which the message arrived.
 * @param am_id    Active message identifier.
 * @param data     Message payload passed to the handler.
 * @param length   Payload length passed to the handler.
 * @param ib_desc  Receive descriptor that holds the message.
 */
static UCS_F_ALWAYS_INLINE void
uct_ib_iface_invoke_am_desc(uct_ib_iface_t *iface, uint8_t am_id, void *data,
                            unsigned length, uct_ib_iface_recv_desc_t *ib_desc)
{
    void *user_desc = (char*)ib_desc + iface->config.rx_headroom_offset;
    ucs_status_t am_status;

    am_status = uct_iface_invoke_am(&iface->super, am_id, data, length,
                                    UCT_CB_PARAM_FLAG_DESC);
    if (am_status != UCS_OK) {
        /* Descriptor is not recycled here: attach the release callback */
        uct_recv_desc(user_desc) = &iface->release_desc;
        return;
    }

    ucs_mpool_put_inline(ib_desc);
}
/*
 * Address, reachability and query API. The implementations are not part of
 * this header; the notes below are derived from the signatures and from uses
 * visible in this file.
 */

/* Non-zero selects the "RoCE" label in UCT_IB_IFACE_ARG; zero selects "IB". */
int uct_ib_iface_is_roce(uct_ib_iface_t *iface);
int uct_ib_iface_is_ib(uct_ib_iface_t *iface);
/* Size computed from a set of pack parameters. */
size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params);
/* NOTE(review): presumably returns a mask of UCT_IB_ADDRESS_PACK_FLAG_* - confirm. */
unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface);
size_t uct_ib_iface_address_size(uct_ib_iface_t *iface);
/* Pack "params" into the caller-provided "ib_addr". */
void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
uct_ib_address_t *ib_addr);
void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr);
/* Unpack "ib_addr" into the caller-provided "params_p". */
void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
uct_ib_address_pack_params_t *params_p);
/* Format "ib_addr" as text using "buf" of capacity "max"; returns a string pointer. */
const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
size_t max);
ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
uct_device_addr_t *dev_addr);
int uct_ib_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr,
const uct_iface_addr_t *iface_addr);
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
uct_iface_attr_t *iface_attr);
ucs_status_t
uct_ib_iface_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr);
int uct_ib_iface_is_roce_v2(uct_ib_iface_t *iface, uct_ib_device_t *dev);
ucs_status_t uct_ib_iface_init_roce_gid_info(uct_ib_iface_t *iface,
size_t md_config_index);
/**
 * Get the IB memory domain of an interface.
 *
 * @param iface  IB interface.
 *
 * @return The interface's base memory domain, downcast to uct_ib_md_t.
 */
static inline uct_ib_md_t* uct_ib_iface_md(uct_ib_iface_t *iface)
{
    uct_md_h base_md = iface->super.md;

    return ucs_derived_of(base_md, uct_ib_md_t);
}
/**
 * Get the IB device of an interface.
 *
 * @param iface  IB interface.
 *
 * @return Pointer to the "dev" member of the interface's IB memory domain.
 */
static inline uct_ib_device_t* uct_ib_iface_device(uct_ib_iface_t *iface)
{
    uct_ib_md_t *ib_md = uct_ib_iface_md(iface);

    return &ib_md->dev;
}
/**
 * Get the port attributes of the port used by an interface.
 *
 * @param iface  IB interface.
 *
 * @return Result of uct_ib_device_port_attr() for the interface's device and
 *         its configured port number.
 */
static inline struct ibv_port_attr* uct_ib_iface_port_attr(uct_ib_iface_t *iface)
{
    uct_ib_device_t *ib_dev = uct_ib_iface_device(iface);

    return uct_ib_device_port_attr(ib_dev, iface->config.port_num);
}
/**
 * Locate the header that belongs to a receive descriptor.
 *
 * @param iface  IB interface that supplies the header offset.
 * @param desc   Receive descriptor.
 *
 * @return Address rx_hdr_offset bytes past the start of the descriptor.
 */
static inline void* uct_ib_iface_recv_desc_hdr(uct_ib_iface_t *iface,
                                               uct_ib_iface_recv_desc_t *desc)
{
    char *desc_base = (char*)desc;

    return (void*)(desc_base + iface->config.rx_hdr_offset);
}
/*
 * A verbs receive work request bundled with one scatter/gather entry.
 * Arrays of this type are filled by uct_ib_iface_prepare_rx_wrs().
 */
typedef struct uct_ib_recv_wr {
struct ibv_recv_wr ibwr;
struct ibv_sge sg;
} uct_ib_recv_wr_t;
/*
 * Receive, address-handle, CQ and QP API. The implementations are not part of
 * this header; the notes below are derived from the signatures only.
 */

/* Takes an array "wrs" of "n" entries and the memory pool "mp"; returns an int. */
int uct_ib_iface_prepare_rx_wrs(uct_ib_iface_t *iface, ucs_mpool_t *mp,
uct_ib_recv_wr_t *wrs, unsigned n);
/* On success the created address handle is returned through "ah_p". */
ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
struct ibv_ah_attr *ah_attr,
const char *usage, struct ibv_ah **ah_p);
/* Fill "ah_attr" from an explicit LID/GID pair. */
void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
const union ibv_gid *gid,
uint8_t gid_index,
unsigned path_index,
struct ibv_ah_attr *ah_attr);
/* Fill "ah_attr" and "path_mtu" from a packed IB address. */
void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
unsigned path_index,
struct ibv_ah_attr *ah_attr,
enum ibv_mtu *path_mtu);
ucs_status_t uct_ib_iface_pre_arm(uct_ib_iface_t *iface);
/* The event file descriptor is returned through "fd_p". */
ucs_status_t uct_ib_iface_event_fd_get(uct_iface_h iface, int *fd_p);
/* Signature matches uct_ib_iface_arm_cq_func_t. */
ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
int solicited_only);
/* Signature matches uct_ib_iface_create_cq_func_t. */
ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir,
const uct_ib_iface_init_attr_t *init_attr,
int preferred_cpu, size_t inl);
/* On success the created QP is returned through "qp_p". */
ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface,
uct_ib_qp_attr_t *attr,
struct ibv_qp **qp_p);
void uct_ib_iface_fill_attr(uct_ib_iface_t *iface,
uct_ib_qp_attr_t *attr);
/* Returns an 8-bit service level selected from the configuration. */
uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config);
/*
 * printf-style format and matching argument list for identifying an interface
 * as "<device name>:<port number>/<RoCE|IB>". Use the two macros together.
 */
#define UCT_IB_IFACE_FMT \
"%s:%d/%s"
/* Expands to three arguments; note that _iface is evaluated three times. */
#define UCT_IB_IFACE_ARG(_iface) \
uct_ib_device_name(uct_ib_iface_device(_iface)), \
(_iface)->config.port_num, \
uct_ib_iface_is_roce(_iface) ? "RoCE" : "IB"
/**
 * Report a failed verbs work completion through ucs_fatal().
 *
 * The completion entry is addressed through the _i argument, so the caller's
 * index variable may have any name (it is no longer required to be "i").
 *
 * @param _type   String naming the completion kind (printed with %s).
 * @param _iface  IB interface (uct_ib_iface_t*) on which the error occurred.
 * @param _i      Index of the failed entry in _wc.
 * @param _wc     Array of work completions providing the status, vendor_err
 *                and wr_id fields.
 *
 * @note The expansion ends with ';', which is kept for compatibility with
 *       existing callers. Do not use it as the unbraced body of an if/else.
 */
#define UCT_IB_IFACE_VERBS_COMPLETION_ERR(_type, _iface, _i, _wc) \
    ucs_fatal("%s completion[%d] with error on %s/%p: %s, vendor_err 0x%x wr_id 0x%lx", \
              _type, _i, uct_ib_device_name(uct_ib_iface_device(_iface)), _iface, \
              uct_ib_wc_status_str((_wc)[_i].status), (_wc)[_i].vendor_err, \
              (_wc)[_i].wr_id);

/**
 * Loop header that iterates over received work completions.
 *
 * For each index in [0, _wc_count): if the completion status is not
 * IBV_WC_SUCCESS the error is reported with
 * UCT_IB_IFACE_VERBS_COMPLETION_ERR; then _hdr is set to the header of the
 * receive descriptor whose address is stored in wr_id, and byte_len bytes at
 * _hdr are marked as defined for Valgrind. The loop body follows the macro.
 *
 * @param _iface     IB interface (uct_ib_iface_t*).
 * @param _i         Loop index lvalue; may have any name.
 * @param _hdr       Lvalue that receives the header pointer on each iteration.
 * @param _wc        Array of work completions.
 * @param _wc_count  Number of valid entries in _wc.
 */
#define UCT_IB_IFACE_VERBS_FOREACH_RXWQE(_iface, _i, _hdr, _wc, _wc_count) \
    for ((_i) = 0; (_i) < (_wc_count) && ({ \
        if (ucs_unlikely((_wc)[_i].status != IBV_WC_SUCCESS)) { \
            UCT_IB_IFACE_VERBS_COMPLETION_ERR("receive", _iface, _i, _wc); \
        } \
        (_hdr) = (typeof(_hdr))uct_ib_iface_recv_desc_hdr(_iface, \
                     (uct_ib_iface_recv_desc_t *)(uintptr_t)(_wc)[_i].wr_id); \
        VALGRIND_MAKE_MEM_DEFINED(_hdr, (_wc)[_i].byte_len); \
        1; }); ++(_i))
/* Read the max_zcopy_log_sge field of the device that backs _iface. */
#define UCT_IB_MAX_ZCOPY_LOG_SGE(_iface) \
(uct_ib_iface_device(_iface)->max_zcopy_log_sge)
/**
 * Translate a UCT IOV array into verbs scatter/gather entries, skipping IOV
 * elements whose length is zero.
 *
 * @param sge     Output array of scatter/gather entries.
 * @param iov     Input IOV array.
 * @param iovcnt  Number of elements in iov.
 *
 * @return Number of entries filled in sge.
 *
 * @note The length of every IOV element, including a zero length, is stored
 *       into the next free sge slot before the element is tested. The slot at
 *       the returned index may therefore have been written when trailing IOV
 *       elements are empty.
 */
static UCS_F_ALWAYS_INLINE
size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov,
                                 size_t iovcnt)
{
    size_t num_sge = 0;
    const uct_iov_t *src;
    struct ibv_sge *dst;
    size_t idx;

    for (idx = 0; idx < iovcnt; ++idx) {
        src = &iov[idx];
        dst = &sge[num_sge];

        dst->length = uct_iov_get_length(src);
        if (dst->length == 0) {
            /* Empty element: the slot is reused by the next element */
            continue;
        }

        dst->addr = (uintptr_t)(src->buffer);
        if (src->memh != UCT_MEM_HANDLE_NULL) {
            dst->lkey = uct_ib_memh_get_lkey(src->memh);
        } else {
            dst->lkey = 0;
        }

        ++num_sge;
    }

    return num_sge;
}
/**
 * Compute how many bytes remain of max_inline after min_size is taken,
 * clamped at zero.
 *
 * The difference is evaluated as a signed value, so a min_size larger than
 * max_inline yields 0 rather than a wrapped-around size.
 *
 * @param max_inline  Total inline size available.
 * @param min_size    Size to subtract from max_inline.
 *
 * @return max_inline - min_size if that is positive, otherwise 0.
 */
static UCS_F_ALWAYS_INLINE
size_t uct_ib_iface_hdr_size(size_t max_inline, size_t min_size)
{
    ssize_t remaining = (ssize_t)(max_inline - min_size);

    return (remaining > 0) ? (size_t)remaining : 0;
}
/**
 * Reset a fence info object to its initial state.
 *
 * @param fence  Fence info to initialize; its fence_beat is set to 0.
 */
static UCS_F_ALWAYS_INLINE void
uct_ib_fence_info_init(uct_ib_fence_info_t *fence)
{
    fence->fence_beat = 0;
}
/**
 * Select the completion queue length for one direction.
 *
 * @param iface      IB interface; supplies the number of paths.
 * @param init_attr  Initialization attributes with per-direction CQ lengths
 *                   and flags.
 * @param dir        Direction of the CQ.
 *
 * @return The RX length for UCT_IB_DIR_RX. For any other direction, the TX
 *         length, multiplied by the number of paths when
 *         UCT_IB_TX_OPS_PER_PATH is set in init_attr->flags.
 */
static UCS_F_ALWAYS_INLINE unsigned
uct_ib_cq_size(uct_ib_iface_t *iface, const uct_ib_iface_init_attr_t *init_attr,
               uct_ib_dir_t dir)
{
    unsigned tx_len;

    if (dir == UCT_IB_DIR_RX) {
        return init_attr->cq_len[UCT_IB_DIR_RX];
    }

    tx_len = init_attr->cq_len[UCT_IB_DIR_TX];
    if (init_attr->flags & UCT_IB_TX_OPS_PER_PATH) {
        tx_len *= iface->num_paths;
    }

    return tx_len;
}
/**
 * Get the DSCP value of a RoCE interface.
 *
 * Asserts that the interface is RoCE.
 *
 * @param iface  IB interface.
 *
 * @return The configured traffic class with its two low-order bits dropped.
 */
static UCS_F_ALWAYS_INLINE unsigned
uct_ib_iface_roce_dscp(uct_ib_iface_t *iface)
{
    unsigned traffic_class;

    ucs_assert(uct_ib_iface_is_roce(iface));

    traffic_class = iface->config.traffic_class;
    return traffic_class >> 2;
}
#if HAVE_DECL_IBV_CREATE_CQ_EX
/**
 * Fill extended CQ creation attributes for an interface.
 *
 * Sets the CQ length, completion channel and completion vector. When
 * IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN is available, assertions are disabled and
 * init_attr->flags contains UCT_IB_CQ_IGNORE_OVERRUN, it also requests that
 * CQ overrun be ignored. Other fields of cq_attr are left untouched.
 *
 * @param cq_attr        CQ attributes to fill.
 * @param init_attr      Interface initialization attributes (flags are read).
 * @param iface          IB interface; supplies the completion channel.
 * @param preferred_cpu  Value stored as the completion vector.
 * @param cq_size        Number of CQ entries to request.
 */
static UCS_F_ALWAYS_INLINE void
uct_ib_fill_cq_attr(struct ibv_cq_init_attr_ex *cq_attr,
                    const uct_ib_iface_init_attr_t *init_attr,
                    uct_ib_iface_t *iface, int preferred_cpu, unsigned cq_size)
{
    cq_attr->comp_vector = preferred_cpu;
    cq_attr->channel     = iface->comp_channel;
    cq_attr->cqe         = cq_size;

#if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN
    if (UCS_ENABLE_ASSERT || !(init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN)) {
        return;
    }

    cq_attr->flags     = IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN;
    cq_attr->comp_mask = IBV_CQ_INIT_ATTR_MASK_FLAGS;
#endif
}
#endif
#endif