Commit 8221c5eb authored by Björn Töpel's avatar Björn Töpel Committed by Jeff Kirsher
Browse files

ixgbe: add AF_XDP zero-copy Tx support



This patch adds zero-copy Tx support for AF_XDP sockets. It implements
the ndo_xsk_async_xmit netdev ndo and performs all the Tx logic from a
NAPI context. This means pulling egress packets from the Tx ring,
placing the frames on the NIC HW descriptor ring and completing sent
frames back to the application via the completion ring.

The regular XDP Tx ring is used for AF_XDP as well. This rationale for
this is as follows: XDP_REDIRECT guarantees mutual exclusion between
different NAPI contexts based on CPU id. In other words, a netdev can
XDP_REDIRECT to another netdev with a different NAPI context, since
the operation is bound to a specific core and each core has its own
hardware ring.

As the AF_XDP Tx action is running in the same NAPI context and using
the same ring, it will also be protected from XDP_REDIRECT actions
with the exact same mechanism.

As with AF_XDP Rx, all AF_XDP Tx specific functions are added to
ixgbe_xsk.c.
Signed-off-by: default avatarBjörn Töpel <bjorn.topel@intel.com>
Tested-by: default avatarWilliam Tu <u9012063@gmail.com>
Tested-by: default avatarAndrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: default avatarJeff Kirsher <jeffrey.t.kirsher@intel.com>
parent 05ae8614
......@@ -3161,7 +3161,11 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
#endif
ixgbe_for_each_ring(ring, q_vector->tx) {
if (!ixgbe_clean_tx_irq(q_vector, ring, budget))
bool wd = ring->xsk_umem ?
ixgbe_clean_xdp_tx_irq(q_vector, ring, budget) :
ixgbe_clean_tx_irq(q_vector, ring, budget);
if (!wd)
clean_complete = false;
}
......@@ -3470,6 +3474,10 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
u32 txdctl = IXGBE_TXDCTL_ENABLE;
u8 reg_idx = ring->reg_idx;
ring->xsk_umem = NULL;
if (ring_is_xdp(ring))
ring->xsk_umem = ixgbe_xsk_umem(adapter, ring);
/* disable queue to avoid issues while updating state */
IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(reg_idx), 0);
IXGBE_WRITE_FLUSH(hw);
......@@ -5942,6 +5950,11 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
u16 i = tx_ring->next_to_clean;
struct ixgbe_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
if (tx_ring->xsk_umem) {
ixgbe_xsk_clean_tx_ring(tx_ring);
goto out;
}
while (i != tx_ring->next_to_use) {
union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
......@@ -5993,6 +6006,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
if (!ring_is_xdp(tx_ring))
netdev_tx_reset_queue(txring_txq(tx_ring));
out:
/* reset next_to_use and next_to_clean */
tx_ring->next_to_use = 0;
tx_ring->next_to_clean = 0;
......@@ -10348,6 +10362,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_features_check = ixgbe_features_check,
.ndo_bpf = ixgbe_xdp,
.ndo_xdp_xmit = ixgbe_xdp_xmit,
.ndo_xsk_async_xmit = ixgbe_xsk_async_xmit,
};
static void ixgbe_disable_txr_hw(struct ixgbe_adapter *adapter,
......
......@@ -42,5 +42,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget);
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring);
bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *tx_ring, int napi_budget);
int ixgbe_xsk_async_xmit(struct net_device *dev, u32 queue_id);
void ixgbe_xsk_clean_tx_ring(struct ixgbe_ring *tx_ring);
#endif /* #define _IXGBE_TXRX_COMMON_H_ */
......@@ -624,3 +624,178 @@ void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring)
}
}
}
static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
{
union ixgbe_adv_tx_desc *tx_desc = NULL;
struct ixgbe_tx_buffer *tx_bi;
bool work_done = true;
u32 len, cmd_type;
dma_addr_t dma;
while (budget-- > 0) {
if (unlikely(!ixgbe_desc_unused(xdp_ring))) {
work_done = false;
break;
}
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
break;
dma_sync_single_for_device(xdp_ring->dev, dma, len,
DMA_BIDIRECTIONAL);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
tx_bi->bytecount = len;
tx_bi->xdpf = NULL;
tx_desc = IXGBE_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->read.buffer_addr = cpu_to_le64(dma);
/* put descriptor type bits */
cmd_type = IXGBE_ADVTXD_DTYP_DATA |
IXGBE_ADVTXD_DCMD_DEXT |
IXGBE_ADVTXD_DCMD_IFCS;
cmd_type |= len | IXGBE_TXD_CMD;
tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
tx_desc->read.olinfo_status =
cpu_to_le32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
xdp_ring->next_to_use = 0;
}
if (tx_desc) {
ixgbe_xdp_ring_update_tail(xdp_ring);
xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
}
return !!budget && work_done;
}
static void ixgbe_clean_xdp_tx_buffer(struct ixgbe_ring *tx_ring,
struct ixgbe_tx_buffer *tx_bi)
{
xdp_return_frame(tx_bi->xdpf);
dma_unmap_single(tx_ring->dev,
dma_unmap_addr(tx_bi, dma),
dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
dma_unmap_len_set(tx_bi, len, 0);
}
bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *tx_ring, int napi_budget)
{
unsigned int total_packets = 0, total_bytes = 0;
u32 i = tx_ring->next_to_clean, xsk_frames = 0;
unsigned int budget = q_vector->tx.work_limit;
struct xdp_umem *umem = tx_ring->xsk_umem;
union ixgbe_adv_tx_desc *tx_desc;
struct ixgbe_tx_buffer *tx_bi;
bool xmit_done;
tx_bi = &tx_ring->tx_buffer_info[i];
tx_desc = IXGBE_TX_DESC(tx_ring, i);
i -= tx_ring->count;
do {
if (!(tx_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)))
break;
total_bytes += tx_bi->bytecount;
total_packets += tx_bi->gso_segs;
if (tx_bi->xdpf)
ixgbe_clean_xdp_tx_buffer(tx_ring, tx_bi);
else
xsk_frames++;
tx_bi->xdpf = NULL;
total_bytes += tx_bi->bytecount;
tx_bi++;
tx_desc++;
i++;
if (unlikely(!i)) {
i -= tx_ring->count;
tx_bi = tx_ring->tx_buffer_info;
tx_desc = IXGBE_TX_DESC(tx_ring, 0);
}
/* issue prefetch for next Tx descriptor */
prefetch(tx_desc);
/* update budget accounting */
budget--;
} while (likely(budget));
i += tx_ring->count;
tx_ring->next_to_clean = i;
u64_stats_update_begin(&tx_ring->syncp);
tx_ring->stats.bytes += total_bytes;
tx_ring->stats.packets += total_packets;
u64_stats_update_end(&tx_ring->syncp);
q_vector->tx.total_bytes += total_bytes;
q_vector->tx.total_packets += total_packets;
if (xsk_frames)
xsk_umem_complete_tx(umem, xsk_frames);
xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
return budget > 0 && xmit_done;
}
int ixgbe_xsk_async_xmit(struct net_device *dev, u32 qid)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;
if (test_bit(__IXGBE_DOWN, &adapter->state))
return -ENETDOWN;
if (!READ_ONCE(adapter->xdp_prog))
return -ENXIO;
if (qid >= adapter->num_xdp_queues)
return -ENXIO;
if (!adapter->xsk_umems || !adapter->xsk_umems[qid])
return -ENXIO;
ring = adapter->xdp_ring[qid];
if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
u64 eics = BIT_ULL(ring->q_vector->v_idx);
ixgbe_irq_rearm_queues(adapter, eics);
}
return 0;
}
void ixgbe_xsk_clean_tx_ring(struct ixgbe_ring *tx_ring)
{
u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use;
struct xdp_umem *umem = tx_ring->xsk_umem;
struct ixgbe_tx_buffer *tx_bi;
u32 xsk_frames = 0;
while (ntc != ntu) {
tx_bi = &tx_ring->tx_buffer_info[ntc];
if (tx_bi->xdpf)
ixgbe_clean_xdp_tx_buffer(tx_ring, tx_bi);
else
xsk_frames++;
tx_bi->xdpf = NULL;
ntc++;
if (ntc == tx_ring->count)
ntc = 0;
}
if (xsk_frames)
xsk_umem_complete_tx(umem, xsk_frames);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment