/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 1;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		                       " 1 - Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			 (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};

enum {
	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

#define VHOST_NET_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* Number of XDP frames batched */
	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
	/* Batched XDP buffs */
	struct xdp_buff *xdp;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
	/* Private page frag */
	struct page_frag page_frag;
	/* Refcount bias of page frag */
	int refcnt_bias;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

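/* Helpers for the per-RX-virtqueue buffer cache (struct vhost_net_buf).
 * A batch of pointers is pulled from the underlying tap ptr_ring into
 * rxq->queue and then consumed one by one between head and tail.
 */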
static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					      VHOST_NET_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info =
			kmalloc_array(UIO_MAXIOV,
				      sizeof(*n->vqs[i].ubuf_info),
				      GFP_KERNEL);
		if  (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

static bool vhost_sock_xdp(struct socket *sock)
{
	return sock_flag(sock->sk, SOCK_XDP);
}

/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

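/* Coarse time source for busy polling: local_clock() scaled down (>> 10),
 * compared against the virtqueue's busyloop_timeout.
 */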
static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(unsigned long endtime)
{
	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
		      !signal_pending(current));
}

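/* Stop/start polling the backend socket file for a given virtqueue. */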
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

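/* Flush the heads batched in nvq->done_idx to the used ring and signal
 * the guest, if there is anything pending.
 */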
static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

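/* Submit the XDP buffers batched in nvq->xdp with a single sendmsg()
 * (TUN_MSG_PTR), then flush the corresponding used heads.
 */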
static void vhost_tx_batch(struct vhost_net *net,
			   struct vhost_net_virtqueue *nvq,
			   struct socket *sock,
			   struct msghdr *msghdr)
{
	struct tun_msg_ctl ctl = {
		.type = TUN_MSG_PTR,
		.num = nvq->batched_xdp,
		.ptr = nvq->xdp,
	};
	int err;

	if (nvq->batched_xdp == 0)
		goto signal_used;

	msghdr->msg_control = &ctl;
	err = sock->ops->sendmsg(sock, msghdr, 0);
	if (unlikely(err < 0)) {
		vq_err(&nvq->vq, "Fail to batch sending packets\n");
		return;
	}

signal_used:
	vhost_net_signal_used(nvq);
	nvq->batched_xdp = 0;
}

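/* Busy-poll helpers: peek at the backend socket for pending RX data and
 * decide whether the paired virtqueue needs to be kicked again.
 */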
static int sock_has_rx_data(struct socket *sock)
{
	if (unlikely(!sock))
		return 0;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sock->sk->sk_receive_queue);
}

static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
					  struct vhost_virtqueue *vq)
{
	if (!vhost_vq_avail_empty(&net->dev, vq)) {
		vhost_poll_queue(&vq->poll);
	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
		vhost_disable_notify(&net->dev, vq);
		vhost_poll_queue(&vq->poll);
	}
}

static void vhost_net_busy_poll(struct vhost_net *net,
				struct vhost_virtqueue *rvq,
				struct vhost_virtqueue *tvq,
				bool *busyloop_intr,
				bool poll_rx)
{
	unsigned long busyloop_timeout;
	unsigned long endtime;
	struct socket *sock;
	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;

	/* Try to hold the vq mutex of the paired virtqueue. We can't
	 * use mutex_lock() here since we could not guarantee a
	 * consistent lock ordering.
	 */
	if (!mutex_trylock(&vq->mutex))
		return;

	vhost_disable_notify(&net->dev, vq);
	sock = rvq->private_data;

	busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
				     tvq->busyloop_timeout;

	preempt_disable();
	endtime = busy_clock() + busyloop_timeout;

	while (vhost_can_busy_poll(endtime)) {
		if (vhost_has_work(&net->dev)) {
			*busyloop_intr = true;
			break;
		}

		if ((sock_has_rx_data(sock) &&
		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
		    !vhost_vq_avail_empty(&net->dev, tvq))
			break;

		cpu_relax();
	}

	preempt_enable();

	if (poll_rx || sock_has_rx_data(sock))
		vhost_net_busy_poll_try_queue(net, vq);
	else if (!poll_rx) /* On tx here, sock has no rx data. */
		vhost_enable_notify(&net->dev, rvq);

	mutex_unlock(&vq->mutex);
}

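/* Fetch the next TX descriptor; if the avail ring is empty and busy
 * polling is enabled, flush batched packets and busy poll the RX side
 * before retrying.
 */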
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_net_virtqueue *tnvq,
				    unsigned int *out_num, unsigned int *in_num,
				    struct msghdr *msghdr, bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;

	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				  out_num, in_num, NULL, NULL);

	if (r == tvq->num && tvq->busyloop_timeout) {
		/* Flush batched packets first */
		if (!vhost_sock_zcopy(tvq->private_data))
			vhost_tx_batch(net, tnvq, tvq->private_data, msghdr);

		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);

		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				      out_num, in_num, NULL, NULL);
	}

	return r;
}

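/* Limit the number of outstanding zerocopy TX buffers (upend_idx -
 * done_idx) to VHOST_MAX_PEND, capped at a quarter of the ring size.
 */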
static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
			    size_t hdr_size, int out)
{
	/* Skip header. TODO: support TSO. */
	size_t len = iov_length(vq->iov, out);

	iov_iter_init(iter, WRITE, vq->iov, out, len);
	iov_iter_advance(iter, hdr_size);

	return iov_iter_count(iter);
}

static bool vhost_exceeds_weight(int pkts, int total_len)
{
	return total_len >= VHOST_NET_WEIGHT ||
	       pkts >= VHOST_NET_PKT_WEIGHT;
}

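/* Get descriptors for TX and set up the iov iterator past the vnet
 * header; returns the head index, vq->num when nothing is available,
 * or a negative error.
 */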
static int get_tx_bufs(struct vhost_net *net,
		       struct vhost_net_virtqueue *nvq,
		       struct msghdr *msg,
		       unsigned int *out, unsigned int *in,
		       size_t *len, bool *busyloop_intr)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;

	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);

	if (ret < 0 || ret == vq->num)
		return ret;

	if (*in) {
		vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
			*out, *in);
		return -EFAULT;
	}

	/* Sanity check */
	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
	if (*len == 0) {
		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
			*len, nvq->vhost_hlen);
		return -EFAULT;
	}

	return ret;
}

static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
{
	return total_len < VHOST_NET_WEIGHT &&
	       !vhost_vq_avail_empty(vq->dev, vq);
}

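/* Refill the private page frag used for building XDP buffers; a large
 * reference bias (refcnt_bias) avoids atomic ops on every allocation.
 */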
#define SKB_FRAG_PAGE_ORDER     get_order(32768)

static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
				       struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
	}

	pfrag->offset = 0;
	net->refcnt_bias = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			goto done;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		goto done;
	}
	return false;

done:
	net->refcnt_bias = USHRT_MAX;
	page_ref_add(pfrag->page, USHRT_MAX - 1);
	return true;
}

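/* Copy one outgoing packet into the page frag and describe it as an
 * xdp_buff in nvq->xdp[] so it can be submitted later by vhost_tx_batch();
 * returns -ENOSPC when the packet does not fit this path.
 */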
#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
			       struct iov_iter *from)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
					     dev);
	struct socket *sock = vq->private_data;
	struct page_frag *alloc_frag = &net->page_frag;
	struct virtio_net_hdr *gso;
	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
	struct tun_xdp_hdr *hdr;
	size_t len = iov_iter_count(from);
	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
	int sock_hlen = nvq->sock_hlen;
	void *buf;
	int copied;

	if (unlikely(len < nvq->sock_hlen))
		return -EFAULT;

	if (SKB_DATA_ALIGN(len + pad) +
	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
		return -ENOSPC;

	buflen += SKB_DATA_ALIGN(len + pad);
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
						 alloc_frag, GFP_KERNEL)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset +
				     offsetof(struct tun_xdp_hdr, gso),
				     sock_hlen, from);
	if (copied != sock_hlen)
		return -EFAULT;

	hdr = buf;
	gso = &hdr->gso;

	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    vhost16_to_cpu(vq, gso->csum_start) +
	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
	    vhost16_to_cpu(vq, gso->hdr_len)) {
		gso->hdr_len = cpu_to_vhost16(vq,
			       vhost16_to_cpu(vq, gso->csum_start) +
			       vhost16_to_cpu(vq, gso->csum_offset) + 2);

		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
			return -EINVAL;
	}

	len -= sock_hlen;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset + pad,
				     len, from);
	if (copied != len)
		return -EFAULT;

	xdp->data_hard_start = buf;
	xdp->data = buf + pad;
	xdp->data_end = xdp->data + len;
	hdr->buflen = buflen;

	--net->refcnt_bias;
	alloc_frag->offset += buflen;

	++nvq->batched_xdp;

	return 0;
}

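/* TX path for copy mode: descriptors are either built into batched XDP
 * buffers or sent directly with sendmsg(), and used heads are batched
 * in done_idx.
 */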
static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);

	for (;;) {
		bool busyloop_intr = false;

		if (nvq->done_idx == VHOST_NET_BATCH)
			vhost_tx_batch(net, nvq, sock, &msg);

		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev,
								vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		total_len += len;

		/* For simplicity, TX batching is only enabled if
		 * sndbuf is unlimited.
		 */
		if (sock_can_batch) {
			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
			if (!err) {
				goto done;
			} else if (unlikely(err != -ENOSPC)) {
				vhost_tx_batch(net, nvq, sock, &msg);
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}

			/* We can't build XDP buff, go for single
			 * packet path but let's flush batched
			 * packets.
			 */
			vhost_tx_batch(net, nvq, sock, &msg);
			msg.msg_control = NULL;
		} else {
			if (tx_can_batch(vq, total_len))
				msg.msg_flags |= MSG_MORE;
			else
				msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
done:
		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
		vq->heads[nvq->done_idx].len = 0;
		++nvq->done_idx;
		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	vhost_tx_batch(net, nvq, sock, &msg);
}

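/* TX path for zerocopy mode: large packets are handed to the backend
 * with a ubuf_info callback and completed asynchronously by
 * vhost_zerocopy_callback(); small packets fall back to copying.
 */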
static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy_used;
	int sent_pkts = 0;

	for (;;) {
		bool busyloop_intr;

		/* Release DMAs done buffers first */
		vhost_zerocopy_signal_used(net, vq);

		busyloop_intr = false;
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		zcopy_used = len >= VHOST_GOODCOPY_LEN
			     && !vhost_exceeds_maxpend(net)
			     && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			struct ubuf_info *ubuf;
			ubuf = nvq->ubuf_info + nvq->upend_idx;

			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
			msg.msg_control = &ctl;
			ctl.type = TUN_MSG_UBUF;
			ctl.ptr = ubuf;
			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}
		total_len += len;
		if (tx_can_batch(vq, total_len) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
		if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
}

/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	struct socket *sock;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	if (vhost_sock_zcopy(sock))
		handle_tx_zerocopy(net, sock);
	else
		handle_tx_copy(net, sock);

out:
	mutex_unlock(&vq->mutex);
}

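/* Return the length of the next pending RX packet, either from the
 * batched rx_ring cache or by peeking the socket receive queue.
 */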
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

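/* As peek_head_len(), but busy poll the TX virtqueue and the socket
 * when nothing is pending yet.
 */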
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
				      bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;
	int len = peek_head_len(rnvq, sk);

	if (!len && rvq->busyloop_timeout) {
		/* Flush batched heads first */
		vhost_net_signal_used(rnvq);
		/* Both tx vq and rx socket were polled here */
		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);

		len = peek_head_len(rnvq, sk);
	}

	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @vq		- the relevant virtqueue
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota       - headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	bool busyloop_intr = false;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;
	int recv_pkts = 0;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
						      &busyloop_intr))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
					vhost_len, &in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			goto out;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			goto out;
		}
		busyloop_intr = false;
		if (nvq->rx_ring)
			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);