Commit 297ccceb authored by Alex Vesker, committed by Saeed Mahameed

net/mlx5: DR, Expose an internal API to issue RDMA operations



Inserting or deleting a rule is done by RDMA read/write operations to the SW
ICM device memory. This file provides the support for executing these
operations: it allocates the needed resources and exposes an API for writing
steering entries to that memory.
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent 29cf8feb
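Among the functions added below, mlx5dr_send_postsend_ste() is the basic entry point for pushing a single steering entry into the device ICM. As a rough illustration (not part of this commit), a caller elsewhere in the steering code could use it along these lines; example_write_ste() is a hypothetical helper and assumes the domain, the ste and a DR_STE_SIZE byte buffer already exist:

/* Illustrative sketch only: write one full STE into the ICM location
 * backing "ste", starting at offset 0 of the entry.
 */
static int example_write_ste(struct mlx5dr_domain *dmn,
			     struct mlx5dr_ste *ste,
			     u8 *hw_ste_data)
{
	return mlx5dr_send_postsend_ste(dmn, ste, hw_ste_data,
					DR_STE_SIZE, 0);
}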
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include "dr_types.h"
#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
struct dr_data_seg {
u64 addr;
u32 length;
u32 lkey;
unsigned int send_flags;
};
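/* A postsend_info describes a single ICM update: an RDMA WRITE of "write"
 * to remote_addr/rkey, followed by an RDMA READ ("read") of the same
 * location.
 */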
struct postsend_info {
struct dr_data_seg write;
struct dr_data_seg read;
u64 remote_addr;
u32 rkey;
};
struct dr_qp_rtr_attr {
struct mlx5dr_cmd_gid_attr dgid_attr;
enum ib_mtu mtu;
u32 qp_num;
u16 port_num;
u8 min_rnr_timer;
u8 sgid_index;
u16 udp_src_port;
};
struct dr_qp_rts_attr {
u8 timeout;
u8 retry_cnt;
u8 rnr_retry;
};
struct dr_qp_init_attr {
u32 cqn;
u32 pdn;
u32 max_send_wr;
struct mlx5_uars_page *uar;
};
static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
unsigned int idx;
u8 opcode;
opcode = get_cqe_opcode(cqe64);
if (opcode == MLX5_CQE_REQ_ERR) {
idx = be16_to_cpu(cqe64->wqe_counter) &
(dr_cq->qp->sq.wqe_cnt - 1);
dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
} else if (opcode == MLX5_CQE_RESP_ERR) {
++dr_cq->qp->sq.cc;
} else {
idx = be16_to_cpu(cqe64->wqe_counter) &
(dr_cq->qp->sq.wqe_cnt - 1);
dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
return CQ_OK;
}
return CQ_POLL_ERR;
}
static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
struct mlx5_cqe64 *cqe64;
int err;
cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
if (!cqe64)
return CQ_EMPTY;
mlx5_cqwq_pop(&dr_cq->wq);
err = dr_parse_cqe(dr_cq, cqe64);
mlx5_cqwq_update_db_record(&dr_cq->wq);
return err;
}
static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
int npolled;
int err = 0;
for (npolled = 0; npolled < ne; ++npolled) {
err = dr_cq_poll_one(dr_cq);
if (err != CQ_OK)
break;
}
return err == CQ_POLL_ERR ? err : npolled;
}
static void dr_qp_event(struct mlx5_core_qp *mqp, int event)
{
pr_info("DR QP event %u on QP #%u\n", event, mqp->qpn);
}
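/* Create the RC QP used for issuing the RDMA operations towards the device
 * ICM. The SQ is sized from max_send_wr; the RQ is kept minimal since this
 * code never posts receive work.
 */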
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
struct dr_qp_init_attr *attr)
{
u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
struct mlx5_wq_param wqp;
struct mlx5dr_qp *dr_qp;
int inlen;
void *qpc;
void *in;
int err;
dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
if (!dr_qp)
return NULL;
wqp.buf_numa_node = mdev->priv.numa_node;
wqp.db_numa_node = mdev->priv.numa_node;
dr_qp->rq.pc = 0;
dr_qp->rq.cc = 0;
dr_qp->rq.wqe_cnt = 4;
dr_qp->sq.pc = 0;
dr_qp->sq.cc = 0;
dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);
MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
&dr_qp->wq_ctrl);
if (err) {
mlx5_core_info(mdev, "Can't create QP WQ\n");
goto err_wq;
}
dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
sizeof(dr_qp->sq.wqe_head[0]),
GFP_KERNEL);
if (!dr_qp->sq.wqe_head) {
mlx5_core_warn(mdev, "Can't allocate wqe head\n");
goto err_wqe_head;
}
inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
dr_qp->wq_ctrl.buf.npages;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
goto err_in;
}
qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
MLX5_SET(qpc, qpc, pd, attr->pdn);
MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
MLX5_SET(qpc, qpc, log_page_size,
dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET(qpc, qpc, fre, 1);
MLX5_SET(qpc, qpc, rlky, 1);
MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
(__be64 *)MLX5_ADDR_OF(create_qp_in,
in, pas));
err = mlx5_core_create_qp(mdev, &dr_qp->mqp, in, inlen);
kfree(in);
if (err) {
mlx5_core_warn(mdev, " Can't create QP\n");
goto err_in;
}
dr_qp->mqp.event = dr_qp_event;
dr_qp->uar = attr->uar;
return dr_qp;
err_in:
kfree(dr_qp->sq.wqe_head);
err_wqe_head:
mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
kfree(dr_qp);
return NULL;
}
static void dr_destroy_qp(struct mlx5_core_dev *mdev,
struct mlx5dr_qp *dr_qp)
{
mlx5_core_destroy_qp(mdev, &dr_qp->mqp);
kfree(dr_qp->sq.wqe_head);
mlx5_wq_destroy(&dr_qp->wq_ctrl);
kfree(dr_qp);
}
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
dma_wmb();
*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xfffff);
/* After wmb() the hw is aware of the new work */
wmb();
mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}
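/* Build a single RDMA WQE at the current SQ producer index: a control
 * segment, a remote address segment and one data segment (the size passed
 * to HW is in units of 16 bytes). When nreq is set, the doorbell is rung
 * so the HW starts processing the posted work.
 */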
static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
u32 rkey, struct dr_data_seg *data_seg,
u32 opcode, int nreq)
{
struct mlx5_wqe_raddr_seg *wq_raddr;
struct mlx5_wqe_ctrl_seg *wq_ctrl;
struct mlx5_wqe_data_seg *wq_dseg;
unsigned int size;
unsigned int idx;
size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
sizeof(*wq_raddr) / 16;
idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);
wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
wq_ctrl->imm = 0;
wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
MLX5_WQE_CTRL_CQ_UPDATE : 0;
wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
opcode);
wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->mqp.qpn << 8);
wq_raddr = (void *)(wq_ctrl + 1);
wq_raddr->raddr = cpu_to_be64(remote_addr);
wq_raddr->rkey = cpu_to_be32(rkey);
wq_raddr->reserved = 0;
wq_dseg = (void *)(wq_raddr + 1);
wq_dseg->byte_count = cpu_to_be32(data_seg->length);
wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
wq_dseg->addr = cpu_to_be64(data_seg->addr);
dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;
if (nreq)
dr_cmd_notify_hw(dr_qp, wq_ctrl);
}
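/* Each post is an RDMA WRITE of the new data immediately followed by an
 * RDMA READ of the same region; only the READ rings the doorbell, likely
 * serving as a read-back check that the WRITE has reached the ICM.
 */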
static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
&send_info->write, MLX5_OPCODE_RDMA_WRITE, 0);
dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
&send_info->read, MLX5_OPCODE_RDMA_READ, 1);
}
/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with the send_list.
 *
 * @ste:       The ste the data belongs to
 * @size:      Size of the data to write
 * @offset:    Offset of the data from the start of the hw_ste entry
 * @data:      The data to write
 * @ste_info:  ste_info entry to queue on the send_list
 * @send_list: List to append the ste_info to
 * @copy_data: If true, copy the data into ste_info because it is not
 *             backed up anywhere else (e.g. during re-hash);
 *             if false, the data may still be updated after it has
 *             been added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
u16 offset, u8 *data,
struct mlx5dr_ste_send_info *ste_info,
struct list_head *send_list,
bool copy_data)
{
ste_info->size = size;
ste_info->ste = ste;
ste_info->offset = offset;
if (copy_data) {
memcpy(ste_info->data_cont, data, size);
ste_info->data = ste_info->data_cont;
} else {
ste_info->data = data;
}
list_add_tail(&ste_info->send_list, send_list);
}
/* The function tries to consume one wc each time. If the queue is full,
 * meaning the hw is a full queue length behind the sw, the function
 * drains the cq until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
struct mlx5dr_send_ring *send_ring)
{
bool is_drain = false;
int ne;
if (send_ring->pending_wqe < send_ring->signal_th)
return 0;
/* Queue is full, start draining it */
if (send_ring->pending_wqe >=
dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
is_drain = true;
do {
ne = dr_poll_cq(send_ring->cq, 1);
if (ne < 0)
return ne;
else if (ne == 1)
send_ring->pending_wqe -= send_ring->signal_th;
} while (is_drain && send_ring->pending_wqe);
return 0;
}
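/* Account for the WRITE+READ pair of WQEs and request a signaled
 * completion once every signal_th WQEs. The READ targets the same buffer
 * as the WRITE and uses the send ring MR lkey.
 */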
static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
struct postsend_info *send_info)
{
send_ring->pending_wqe++;
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->write.send_flags |= IB_SEND_SIGNALED;
send_ring->pending_wqe++;
send_info->read.length = send_info->write.length;
/* Read into the same write area */
send_info->read.addr = (uintptr_t)send_info->write.addr;
send_info->read.lkey = send_ring->mr->mkey.key;
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->read.send_flags = IB_SEND_SIGNALED;
else
send_info->read.send_flags = 0;
}
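/* Post one ICM update. Payloads larger than the supported inline size are
 * first copied into the send ring's registered MR buffer, and the data
 * segment is redirected to that registered memory.
 */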
static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
struct postsend_info *send_info)
{
struct mlx5dr_send_ring *send_ring = dmn->send_ring;
u32 buff_offset;
int ret;
ret = dr_handle_pending_wc(dmn, send_ring);
if (ret)
return ret;
if (send_info->write.length > dmn->info.max_inline_size) {
buff_offset = (send_ring->tx_head &
(dmn->send_ring->signal_th - 1)) *
send_ring->max_post_send_size;
/* Copy to ring mr */
memcpy(send_ring->buf + buff_offset,
(void *)(uintptr_t)send_info->write.addr,
send_info->write.length);
send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
send_info->write.lkey = send_ring->mr->mkey.key;
}
send_ring->tx_head++;
dr_fill_data_segs(send_ring, send_info);
dr_post_send(send_ring->qp, send_info);
return 0;
}
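/* Work out how a hash table copy must be chunked: tables bigger than
 * max_post_send_size are sent in several iterations of that size, smaller
 * ones in a single shot. A bounce buffer covering one iteration is
 * allocated for the caller.
 */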
static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
struct mlx5dr_ste_htbl *htbl,
u8 **data,
u32 *byte_size,
int *iterations,
int *num_stes)
{
int alloc_size;
if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
*iterations = htbl->chunk->byte_size /
dmn->send_ring->max_post_send_size;
*byte_size = dmn->send_ring->max_post_send_size;
alloc_size = *byte_size;
*num_stes = *byte_size / DR_STE_SIZE;
} else {
*iterations = 1;
*num_stes = htbl->chunk->num_of_entries;
alloc_size = *num_stes * DR_STE_SIZE;
}
*data = kzalloc(alloc_size, GFP_KERNEL);
if (!*data)
return -ENOMEM;
return 0;
}
/**
 * mlx5dr_send_postsend_ste: Write size bytes into offset from the hw icm.
 *
 * @dmn:    Domain
 * @ste:    The ste struct that contains the data (at least part of it)
 * @data:   The real data to send
 * @size:   Number of bytes to write
 * @offset: Offset from the start of the icm mapped data; allows writing
 *          only part of the entry
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
u8 *data, u16 size, u16 offset)
{
struct postsend_info send_info = {};
send_info.write.addr = (uintptr_t)data;
send_info.write.length = size;
send_info.write.lkey = 0;
send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
send_info.rkey = ste->htbl->chunk->rkey;
return dr_postsend_icm_data(dmn, &send_info);
}
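/* Write a whole hash table to ICM: valid entries are written in their
 * reduced form followed by the table bit mask, while entries that are not
 * valid get the pre-formatted default STE.
 */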
int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
struct mlx5dr_ste_htbl *htbl,
u8 *formatted_ste, u8 *mask)
{
u32 byte_size = htbl->chunk->byte_size;
int num_stes_per_iter;
int iterations;
u8 *data;
int ret;
int i;
int j;
ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
&iterations, &num_stes_per_iter);
if (ret)
return ret;
/* Send the data 'iterations' times */
for (i = 0; i < iterations; i++) {
u32 ste_index = i * (byte_size / DR_STE_SIZE);
struct postsend_info send_info = {};
/* Copy all the STEs into the data buffer;
 * the bit_mask needs to be added as well
 */
for (j = 0; j < num_stes_per_iter; j++) {
u8 *hw_ste = htbl->ste_arr[ste_index + j].hw_ste;
u32 ste_off = j * DR_STE_SIZE;
if (mlx5dr_ste_is_not_valid_entry(hw_ste)) {
memcpy(data + ste_off,
formatted_ste, DR_STE_SIZE);
} else {
/* Copy data */
memcpy(data + ste_off,
htbl->ste_arr[ste_index + j].hw_ste,
DR_STE_SIZE_REDUCED);
/* Copy bit_mask */
memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
mask, DR_STE_SIZE_MASK);
}
}
send_info.write.addr = (uintptr_t)data;
send_info.write.length = byte_size;
send_info.write.lkey = 0;
send_info.remote_addr =
mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
send_info.rkey = htbl->chunk->rkey;
ret = dr_postsend_icm_data(dmn, &send_info);
if (ret)
goto out_free;
}
out_free:
kfree(data);
return ret;
}
/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
struct mlx5dr_ste_htbl *htbl,
u8 *ste_init_data,
bool update_hw_ste)
{
u32 byte_size = htbl->chunk->byte_size;
int iterations;
int num_stes;
u8 *data;
int ret;
int i;
ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
&iterations, &num_stes);
if (ret)
return ret;
for (i = 0; i < num_stes; i++) {
u8 *copy_dst;
/* Copy the same ste on the data buffer */
copy_dst = data + i * DR_STE_SIZE;
memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
if (update_hw_ste) {
/* Copy the reduced ste to hash table ste_arr */
copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
}
}
/* Send the data 'iterations' times */
for (i = 0; i < iterations; i++) {
u32 ste_index = i * (byte_size / DR_STE_SIZE);
struct postsend_info send_info = {};
send_info.write.addr = (uintptr_t)data;
send_info.write.length = byte_size;
send_info.write.lkey = 0;
send_info.remote_addr =
mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
send_info.rkey = htbl->chunk->rkey;
ret = dr_postsend_icm_data(dmn, &send_info);
if (ret)
goto out_free;
}
out_free:
kfree(data);
return ret;
}
int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
struct mlx5dr_action *action)
{
struct postsend_info send_info = {};
int ret;
send_info.write.addr = (uintptr_t)action->rewrite.data;
send_info.write.length = action->rewrite.chunk->byte_size;
send_info.write.lkey = 0;
send_info.remote_addr = action->rewrite.chunk->mr_addr;
send_info.rkey = action->rewrite.chunk->rkey;
mutex_lock(&dmn->mutex);
ret = dr_postsend_icm_data(dmn, &send_info);
mutex_unlock(&dmn->mutex);
return ret;
}
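/* First step of bringing the RC QP up: RESET to INIT. The RTR and RTS
 * transitions, parameterized by the dr_qp_rtr_attr/dr_qp_rts_attr
 * structures above, are handled separately.
 */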
static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
struct mlx5dr_qp *dr_qp,
int port)
{
u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
void *qpc;
qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);
MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
MLX5_SET(qpc, qpc, rre, 1);
MLX5_SET(qpc, qpc, rwe, 1);
return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc,
&dr_qp->mqp);