#ifndef UCT_DC_IFACE_H
#define UCT_DC_IFACE_H
#include <uct/ib/rc/base/rc_iface.h>
#include <uct/ib/rc/base/rc_ep.h>
#include <uct/ib/rc/verbs/rc_verbs.h>
#include <uct/ib/rc/accel/rc_mlx5_common.h>
#include <uct/ib/ud/base/ud_iface_common.h>
#include <uct/ib/ud/accel/ud_mlx5_common.h>
#include <ucs/debug/assert.h>
#include <ucs/datastruct/bitmap.h>
#if IBV_HW_TM
# if HAVE_INFINIBAND_TM_TYPES_H
struct ibv_ravh {
uint32_t sl_dct;
uint32_t reserved;
uint64_t dc_access_key;
};
# else
# define ibv_ravh ibv_exp_tmh_ravh
# endif
# define UCT_DC_RNDV_HDR_LEN (sizeof(struct ibv_rvh) + \
sizeof(struct ibv_ravh))
#else
# define UCT_DC_RNDV_HDR_LEN 0
#endif
/* Capacity of the per-pool index stack (uct_dc_mlx5_dci_pool_t::stack) */
#define UCT_DC_MLX5_IFACE_MAX_USER_DCIS 15
/*
 * Number of DCIs that uct_dc_mlx5_iface_total_ndci() adds when
 * UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE is set
 */
#define UCT_DC_MLX5_KEEPALIVE_NUM_DCIS 1
/* Capacity of the tx.dci_pool[] array in struct uct_dc_mlx5_iface */
#define UCT_DC_MLX5_IFACE_MAX_DCI_POOLS 8
/*
 * Capacity of the tx.dcis[] array in struct uct_dc_mlx5_iface:
 * MAX_USER_DCIS * MAX_DCI_POOLS + KEEPALIVE_NUM_DCIS (15 * 8 + 1 = 121)
 */
#define UCT_DC_MLX5_IFACE_MAX_DCIS ((UCT_DC_MLX5_IFACE_MAX_USER_DCIS * \
UCT_DC_MLX5_IFACE_MAX_DCI_POOLS) + \
UCT_DC_MLX5_KEEPALIVE_NUM_DCIS)
/*
 * Evaluates to 1 if the UCT_DC_MLX5_IFACE_ADDR_HW_TM bit is set in
 * (_addr)->flags, and to 0 otherwise. _addr is evaluated once.
 */
#define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \
    ((((_addr)->flags) & UCT_DC_MLX5_IFACE_ADDR_HW_TM) != 0)
/*
 * Store in _txqp and _txwq the addresses of the txqp and txwq members of
 * (_iface)->tx.dcis[_dci].
 *
 * _iface and _dci are each evaluated twice, so they must be free of side
 * effects.
 *
 * NOTE(review): the expansion is a bare brace block rather than
 * do { } while (0), so an unbraced "if (c) MACRO(...); else ..." does not
 * compile. Kept as-is to avoid breaking existing callers.
 */
#define UCT_DC_MLX5_IFACE_TXQP_DCI_GET(_iface, _dci, _txqp, _txwq) \
    { \
        (_txqp) = &((_iface)->tx.dcis[(_dci)].txqp); \
        (_txwq) = &((_iface)->tx.dcis[(_dci)].txwq); \
    }
/*
 * Forward declarations so the types below can refer to the endpoint and the
 * interface by pointer. struct uct_dc_mlx5_ep is not defined in this header.
 */
typedef struct uct_dc_mlx5_ep uct_dc_mlx5_ep_t;
typedef struct uct_dc_mlx5_iface uct_dc_mlx5_iface_t;
/*
 * Bit flags describing a DC interface address.
 * NOTE(review): presumably stored in uct_dc_mlx5_iface_addr_t::flags -
 * confirm where addresses are packed.
 */
typedef enum {
/* Bit tested by UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED() */
UCT_DC_MLX5_IFACE_ADDR_HW_TM = UCS_BIT(0),
/* NOTE(review): V1/V2 look like DC version markers - confirm */
UCT_DC_MLX5_IFACE_ADDR_DC_V1 = UCS_BIT(1),
UCT_DC_MLX5_IFACE_ADDR_DC_V2 = UCS_BIT(2),
/* Mask covering both DC_V1 and DC_V2 bits */
UCT_DC_MLX5_IFACE_ADDR_DC_VERS = UCT_DC_MLX5_IFACE_ADDR_DC_V1 |
UCT_DC_MLX5_IFACE_ADDR_DC_V2
} uct_dc_mlx5_iface_addr_flags_t;
/*
 * Bit flags tested against uct_dc_mlx5_iface::flags by the inline helpers in
 * this header.
 */
typedef enum {
/*
 * When set, uct_dc_mlx5_iface_total_ndci() adds
 * UCT_DC_MLX5_KEEPALIVE_NUM_DCIS to the DCI count
 */
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE = UCS_BIT(0),
/* NOTE(review): not used in this header; semantics to be confirmed */
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE = UCS_BIT(1),
/*
 * When set, uct_dc_mlx5_iface_dci_find() derives the DCI index from the
 * CQE's srqn_uidx field instead of searching by QP number
 */
UCT_DC_MLX5_IFACE_FLAG_UIDX = UCS_BIT(2),
/* NOTE(review): not used in this header; semantics to be confirmed */
UCT_DC_MLX5_IFACE_FLAG_FC_EP_FAILED = UCS_BIT(3),
/* NOTE(review): not used in this header; semantics to be confirmed */
UCT_DC_MLX5_IFACE_IGNORE_DCI_WAITQ_REORDER = UCS_BIT(4)
} uct_dc_mlx5_iface_flags_t;
/*
 * DC interface address. The struct is declared UCS_S_PACKED, so member order
 * and types define its byte layout and must not be changed casually.
 */
typedef struct uct_dc_mlx5_iface_addr {
/* QP number, stored as uct_ib_uint24_t */
uct_ib_uint24_t qp_num;
/* NOTE(review): presumably identifies the memory region used for atomics - confirm */
uint8_t atomic_mr_id;
/* NOTE(review): presumably uct_dc_mlx5_iface_addr_flags_t bits - confirm */
uint8_t flags;
} UCS_S_PACKED uct_dc_mlx5_iface_addr_t;
/*
 * DCI selection (TX) policies.
 * NOTE(review): the behavior of each policy is implemented outside this
 * header; see the implementation before relying on the names.
 */
typedef enum {
UCT_DC_TX_POLICY_DCS,
UCT_DC_TX_POLICY_DCS_QUOTA,
UCT_DC_TX_POLICY_RAND,
/* Last enumerator; presumably a count/sentinel rather than a policy - confirm */
UCT_DC_TX_POLICY_LAST
} uct_dc_tx_policy_t;
/*
 * Configuration of a DC interface. It embeds the RC common, RC mlx5 common,
 * UD common and UD mlx5 common configuration structures alongside the
 * DC-specific fields.
 * NOTE(review): presumably filled from uct_dc_mlx5_iface_config_table -
 * confirm in the implementation.
 */
typedef struct uct_dc_mlx5_iface_config {
uct_rc_iface_common_config_t super;
uct_rc_mlx5_iface_common_config_t rc_mlx5_common;
uct_ud_iface_common_config_t ud_common;
/* NOTE(review): presumably the requested number of DCIs - confirm */
int ndci;
/* NOTE(review): presumably a uct_dc_tx_policy_t value - confirm */
int tx_policy;
/* on/off/auto settings; NOTE(review): exact effect is not visible here */
ucs_on_off_auto_value_t dci_full_handshake;
ucs_on_off_auto_value_t dci_ka_full_handshake;
ucs_on_off_auto_value_t dct_full_handshake;
/* NOTE(review): presumably consumed by uct_dc_mlx5_iface_set_quota() - confirm */
unsigned quota;
unsigned rand_seed;
ucs_time_t fc_hard_req_timeout;
uct_ud_mlx5_iface_common_config_t mlx5_ud;
} uct_dc_mlx5_iface_config_t;
/*
 * Function pointer type taking an interface, a CQE, a DCI index and a
 * status, and returning nothing.
 * NOTE(review): by its name this is a DCI failure handler; no user of the
 * typedef is visible in this header.
 */
typedef void (*uct_dc_dci_handle_failure_func_t)(uct_dc_mlx5_iface_t *iface,
struct mlx5_cqe64 *cqe,
uint8_t dci_index,
ucs_status_t status);
/*
 * State of one DCI; elements of uct_dc_mlx5_iface::tx.dcis[].
 */
typedef struct uct_dc_dci {
/* TX QP state; its available count is queried via uct_rc_txqp_available() */
uct_rc_txqp_t txqp;
/* TX work queue; txwq.super.qp_num is matched by uct_dc_mlx5_iface_dci_find() */
uct_ib_mlx5_txwq_t txwq;
/*
 * Anonymous union: 'ep' and 'arb_group' share storage, so only one of them
 * is meaningful at a time.
 * NOTE(review): which one is active presumably depends on the TX policy -
 * confirm.
 */
union {
uct_dc_mlx5_ep_t *ep;
ucs_arbiter_group_t arb_group;
};
/* NOTE(review): presumably an index into tx.dci_pool[] - confirm */
uint8_t pool_index;
/* NOTE(review): presumably the path index passed when connecting the DCI - confirm */
uint8_t path_index;
} uct_dc_dci_t;
/*
 * Flow-control sender data. Both the outer struct and the nested payload are
 * declared UCS_S_PACKED, so member order and types define the byte layout.
 * NOTE(review): 'is_global' is a plain int inside a packed struct; if this
 * structure is sent between hosts its size depends on the platform's int -
 * confirm before changing.
 */
typedef struct uct_dc_fc_sender_data {
/* NOTE(review): presumably an endpoint identifier/pointer value - confirm */
uint64_t ep;
struct {
uint64_t seq;
int is_global;
union ibv_gid gid;
} UCS_S_PACKED payload;
} UCS_S_PACKED uct_dc_fc_sender_data_t;
/*
 * Flow-control request: an RC pending request extended with the sender data,
 * a DCT number and a LID.
 * NOTE(review): presumably the object handled by
 * uct_dc_mlx5_iface_fc_grant() - confirm.
 */
typedef struct uct_dc_fc_request {
/* Base pending request; kept as the first member */
uct_rc_pending_req_t super;
uct_dc_fc_sender_data_t sender;
uint32_t dct_num;
uint16_t lid;
} uct_dc_fc_request_t;
/*
 * Value type of the flow-control hash: a sequence number and a timestamp.
 * NOTE(review): presumably one entry per endpoint with an outstanding FC
 * request - confirm.
 */
typedef struct uct_dc_mlx5_ep_fc_entry {
uint64_t seq;
ucs_time_t send_time;
} uct_dc_mlx5_ep_fc_entry_t;
/*
 * Instantiates the khash map type "uct_dc_mlx5_fc_hash" with 64-bit integer
 * keys and uct_dc_mlx5_ep_fc_entry_t values; used for
 * uct_dc_mlx5_iface::tx.fc_hash.
 */
KHASH_MAP_INIT_INT64(uct_dc_mlx5_fc_hash, uct_dc_mlx5_ep_fc_entry_t);
/*
 * One DCI pool; elements of uct_dc_mlx5_iface::tx.dci_pool[].
 */
typedef struct {
/* Signed index associated with 'stack'; NOTE(review): exact convention to be confirmed */
int8_t stack_top;
/* Holds up to UCT_DC_MLX5_IFACE_MAX_USER_DCIS byte-sized entries, presumably DCI indexes */
uint8_t stack[UCT_DC_MLX5_IFACE_MAX_USER_DCIS];
/* Arbiter returned by uct_dc_mlx5_iface_dci_waitq() for this pool */
ucs_arbiter_t arbiter;
/* NOTE(review): presumably tracks DCIs pending release - confirm */
int8_t release_stack_top;
} uct_dc_mlx5_dci_pool_t;
/*
 * DC interface object. Extends uct_rc_mlx5_iface_common_t (first member) with
 * DCI/DCT state.
 */
struct uct_dc_mlx5_iface {
/*
 * Base object. The inline helpers in this header read
 * super.super.tx.mp, super.super.tx.reads_available and
 * super.super.tx.arbiter.
 */
uct_rc_mlx5_iface_common_t super;
struct {
/* DCI array; uct_dc_mlx5_iface_dci_find() scans the first total_ndci entries */
uct_dc_dci_t dcis[UCT_DC_MLX5_IFACE_MAX_DCIS];
/* Multiplied by num_dci_pools in uct_dc_mlx5_iface_total_ndci(), i.e. a per-pool count */
uint8_t ndci;
/* DCI pools, indexed by pool_index in uct_dc_mlx5_iface_dci_waitq() */
uct_dc_mlx5_dci_pool_t dci_pool[UCT_DC_MLX5_IFACE_MAX_DCI_POOLS];
uint8_t num_dci_pools;
/* NOTE(review): presumably a uct_dc_tx_policy_t value - confirm */
uint8_t policy;
int16_t available_quota;
/*
 * Threshold used by uct_dc_mlx5_iface_dci_has_outstanding(): a DCI whose
 * available count is below bb_max (cast to int16_t) is reported as having
 * outstanding operations
 */
unsigned bb_max;
/* NOTE(review): presumably set up by uct_dc_mlx5_iface_init_fc_ep() - confirm */
uct_dc_mlx5_ep_t *fc_ep;
/* Hash of uct_dc_mlx5_ep_fc_entry_t values keyed by 64-bit integers */
khash_t(uct_dc_mlx5_fc_hash) fc_hash;
uint64_t fc_seq;
ucs_time_t fc_hard_req_timeout;
unsigned rand_seed;
ucs_arbiter_callback_t pend_cb;
uct_worker_cb_id_t dci_release_prog_id;
/* 8 bits wide, matching UCT_DC_MLX5_IFACE_MAX_DCI_POOLS; presumably one bit per pool - confirm */
uint8_t dci_pool_release_bitmap;
} tx;
struct {
/* NOTE(review): presumably created by uct_dc_mlx5_iface_create_dct() - confirm */
uct_ib_mlx5_qp_t dct;
} rx;
uint8_t version_flag;
/* Tested against uct_dc_mlx5_iface_flags_t bits by the inline helpers below */
uint8_t flags;
/* DCI index compared by uct_dc_mlx5_iface_is_dci_keepalive() */
uint8_t keepalive_dci;
uct_ud_mlx5_iface_common_t ud_common;
};
/*
 * Out-of-line DC interface API. Only the declarations are visible in this
 * header; the short notes below are inferred from names and signatures and
 * should be confirmed against the implementation.
 */

/* Configuration field table; declared extern, so it is defined outside this header */
extern ucs_config_field_t uct_dc_mlx5_iface_config_table[];
/* DCT creation, taking the interface and its (read-only) configuration */
ucs_status_t
uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config);
/* Reachability query for a device/interface address pair; returns int */
int uct_dc_mlx5_iface_is_reachable(const uct_iface_h tl_iface,
const uct_device_addr_t *dev_addr,
const uct_iface_addr_t *iface_addr);
/* Writes the interface address through iface_addr */
ucs_status_t uct_dc_mlx5_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr);
/* Interface-level flush with optional completion */
ucs_status_t uct_dc_mlx5_iface_flush(uct_iface_h tl_iface, unsigned flags, uct_completion_t *comp);
/* NOTE(review): config is taken as non-const; check whether it is modified */
void uct_dc_mlx5_iface_set_quota(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_iface_config_t *config);
/* Flow-control related entry points */
ucs_status_t uct_dc_mlx5_iface_init_fc_ep(uct_dc_mlx5_iface_t *iface);
ucs_status_t uct_dc_mlx5_iface_fc_grant(uct_pending_req_t *self);
ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_num,
uct_rc_hdr_t *hdr, unsigned length,
uint32_t imm_data, uint16_t lid, unsigned flags);
/* DCT teardown and version initialization */
void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface);
void uct_dc_mlx5_iface_init_version(uct_dc_mlx5_iface_t *iface, uct_md_h md);
/* DCI connect / destroy / keepalive setup */
ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface,
uct_dc_dci_t *dci);
/* NOTE(review): 'max' presumably bounds how many DCIs are destroyed - confirm */
void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max);
ucs_status_t uct_dc_mlx5_iface_keepalive_init(uct_dc_mlx5_iface_t *iface);
/* Endpoint failure reporting, given the CQE, TX work queue and status */
void uct_dc_mlx5_iface_set_ep_failed(uct_dc_mlx5_iface_t *iface,
uct_dc_mlx5_ep_t *ep,
struct mlx5_cqe64 *cqe,
uct_ib_mlx5_txwq_t *txwq,
ucs_status_t ep_status);
/* DCI creation, taking the interface and its (read-only) configuration */
ucs_status_t
uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config);
/* Reset of a single DCI selected by index */
void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci_index);
/*
 * DEVX-specific entry points.
 * With HAVE_DEVX they are only declared here. Without it, static stubs with
 * the same names and signatures are defined, each returning
 * UCS_ERR_UNSUPPORTED, so the names exist in both configurations.
 */
#if HAVE_DEVX
ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface,
int full_handshake);
ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface);
ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
uct_ib_mlx5_qp_t *qp,
uint8_t path_index);
#else
/* Stub used without HAVE_DEVX: always returns UCS_ERR_UNSUPPORTED */
static UCS_F_MAYBE_UNUSED ucs_status_t uct_dc_mlx5_iface_devx_create_dct(
uct_dc_mlx5_iface_t *iface, int full_handshake)
{
return UCS_ERR_UNSUPPORTED;
}
/* Stub used without HAVE_DEVX: always returns UCS_ERR_UNSUPPORTED */
static UCS_F_MAYBE_UNUSED ucs_status_t
uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface)
{
return UCS_ERR_UNSUPPORTED;
}
/* Stub used without HAVE_DEVX: always returns UCS_ERR_UNSUPPORTED */
static UCS_F_MAYBE_UNUSED ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(
uct_dc_mlx5_iface_t *iface, uct_ib_mlx5_qp_t *qp, uint8_t path_index)
{
return UCS_ERR_UNSUPPORTED;
}
#endif
#if IBV_HW_TM
/**
 * Fill the sl_dct, dc_access_key and reserved fields of a RAVH header.
 *
 * @param ravh     Header to fill.
 * @param dct_num  DCT number; stored in sl_dct in big-endian byte order.
 */
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num)
{
    /* The three stores are independent of each other */
    ravh->reserved      = 0;
    ravh->dc_access_key = htobe64(UCT_IB_KEY);
    ravh->sl_dct        = htobe32(dct_num);
}
#endif
/**
 * Count the DCIs of the interface.
 *
 * @param iface  DC interface.
 *
 * @return tx.ndci * tx.num_dci_pools, plus UCT_DC_MLX5_KEEPALIVE_NUM_DCIS
 *         when UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE is set in iface->flags.
 *         The sum is converted to uint8_t on return.
 */
static UCS_F_ALWAYS_INLINE uint8_t
uct_dc_mlx5_iface_total_ndci(uct_dc_mlx5_iface_t *iface)
{
    int total = iface->tx.ndci * iface->tx.num_dci_pools;

    if (iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE) {
        total += UCT_DC_MLX5_KEEPALIVE_NUM_DCIS;
    }

    return total;
}
/**
 * Find the index of the DCI that a completion belongs to.
 *
 * If UCT_DC_MLX5_IFACE_FLAG_UIDX is set, the index is taken directly from
 * the CQE's srqn_uidx field (shifted right by UCT_IB_UIDX_SHIFT). Otherwise
 * the QP number is extracted from the CQE and the first
 * uct_dc_mlx5_iface_total_ndci() entries of tx.dcis[] are searched for it.
 *
 * @param iface  DC interface.
 * @param cqe    Completion queue entry to resolve.
 *
 * @return Index of the matching DCI in iface->tx.dcis[]. If no DCI has the
 *         QP number, ucs_fatal() is called and no value is returned.
 */
static UCS_F_ALWAYS_INLINE uint8_t
uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, struct mlx5_cqe64 *cqe)
{
    uint32_t qp_num;
    int i, ndci;

    if (ucs_likely(iface->flags & UCT_DC_MLX5_IFACE_FLAG_UIDX)) {
        return cqe->srqn_uidx >> UCT_IB_UIDX_SHIFT;
    }

    /* Slow path: linear search by QP number */
    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    ndci   = uct_dc_mlx5_iface_total_ndci(iface);
    for (i = 0; i < ndci; i++) {
        if (iface->tx.dcis[i].txwq.super.qp_num == qp_num) {
            return i;
        }
    }

    /* qp_num is unsigned, so it is printed with %u */
    ucs_fatal("DCI (qpnum=%u) does not exist", qp_num);
}
/**
 * Check the interface-level TX resources.
 *
 * @param iface  DC interface.
 *
 * @return Non-zero if the TX memory pool (super.super.tx.mp) is not empty
 *         and super.super.tx.reads_available is positive; 0 otherwise.
 */
static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface)
{
    if (ucs_mpool_is_empty(&iface->super.super.tx.mp)) {
        return 0;
    }

    return iface->super.super.tx.reads_available > 0;
}
/**
 * Check whether a DCI can accept more sends.
 *
 * @param iface      DC interface.
 * @param dci_index  Index into iface->tx.dcis[].
 *
 * @return Non-zero if uct_rc_txqp_available() of the DCI's txqp is positive.
 */
static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface,
                                       uint8_t dci_index)
{
    uct_rc_txqp_t *dci_txqp = &iface->tx.dcis[dci_index].txqp;

    return uct_rc_txqp_available(dci_txqp) > 0;
}
/**
 * Get the interface-level TX arbiter.
 *
 * @param iface  DC interface.
 *
 * @return Address of iface->super.super.tx.arbiter.
 */
static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface)
{
    ucs_arbiter_t *waitq = &iface->super.super.tx.arbiter;

    return waitq;
}
/**
 * Get the arbiter of a DCI pool.
 *
 * @param iface       DC interface.
 * @param pool_index  Index into iface->tx.dci_pool[].
 *
 * @return Address of the selected pool's arbiter.
 */
static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface, uint8_t pool_index)
{
    uct_dc_mlx5_dci_pool_t *pool = &iface->tx.dci_pool[pool_index];

    return &pool->arbiter;
}
/**
 * Check whether a DCI has operations that have not completed yet.
 *
 * @param iface      DC interface.
 * @param dci_index  Index into iface->tx.dcis[].
 *
 * @return Non-zero if the DCI's available count is below tx.bb_max
 *         (converted to int16_t); 0 otherwise.
 */
static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci_index)
{
    int16_t full_capacity = (int16_t)iface->tx.bb_max;

    return uct_rc_txqp_available(&iface->tx.dcis[dci_index].txqp) <
           full_capacity;
}
/**
 * Report the flush state of a single DCI.
 *
 * @param iface      DC interface.
 * @param dci_index  Index into iface->tx.dcis[].
 *
 * @return UCS_OK if uct_dc_mlx5_iface_dci_has_outstanding() reports nothing
 *         outstanding on the DCI, UCS_INPROGRESS otherwise. In the latter
 *         case the function also asserts that the DCI has no unsignaled
 *         sends.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci_index)
{
    if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) {
        return UCS_OK;
    }

    /* tx.bb_max is unsigned, so it is printed with %u */
    ucs_trace_poll("dci %d is not flushed %d/%u", dci_index,
                   iface->tx.dcis[dci_index].txqp.available, iface->tx.bb_max);
    ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci_index].txqp) == 0,
                "unsignalled send is not supported!!!");
    return UCS_INPROGRESS;
}
/**
 * Check whether a DCI index refers to the keepalive DCI.
 *
 * @param iface      DC interface.
 * @param dci_index  DCI index to test.
 *
 * @return Non-zero if dci_index equals iface->keepalive_dci; 0 otherwise.
 */
static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_is_dci_keepalive(uct_dc_mlx5_iface_t *iface, int dci_index)
{
    int keepalive_index = iface->keepalive_dci;

    return keepalive_index == dci_index;
}
#endif