// SPDX-License-Identifier: GPL-2.0-or-later
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <xen/xen.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		BUG();						\
	} while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("%s:in_use = %i\n",		\
			      (_vq)->vq.name, (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
	} while (0)
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#define LAST_ADD_TIME_UPDATE(_vq)				\
	do {							\
		ktime_t now = ktime_get();			\
								\
		/* No kick or get, with .1 second between?  Warn. */ \
		if ((_vq)->last_add_time_valid)			\
			WARN_ON(ktime_to_ms(ktime_sub(now,	\
				(_vq)->last_add_time)) > 100);	\
		(_vq)->last_add_time = now;			\
		(_vq)->last_add_time_valid = true;		\
	} while (0)
#define LAST_ADD_TIME_CHECK(_vq)				\
	do {							\
		if ((_vq)->last_add_time_valid) {		\
			WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \
				      (_vq)->last_add_time)) > 100); \
		}						\
	} while (0)
#define LAST_ADD_TIME_INVALID(_vq)				\
	((_vq)->last_add_time_valid = false)
#else
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&_vq->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		(_vq)->broken = true;				\
	} while (0)
#define START_USE(vq)
#define END_USE(vq)
#define LAST_ADD_TIME_UPDATE(vq)
#define LAST_ADD_TIME_CHECK(vq)
#define LAST_ADD_TIME_INVALID(vq)
#endif

struct vring_desc_state_split {
	void *data;			/* Data for callback. */
	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
};

struct vring_desc_state_packed {
	void *data;			/* Data for callback. */
	struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
	u16 num;			/* Descriptor list length. */
	u16 next;			/* The next desc state in a list. */
	u16 last;			/* The last desc state in a list. */
};

struct vring_desc_extra_packed {
	dma_addr_t addr;		/* Buffer DMA addr. */
	u32 len;			/* Buffer length. */
	u16 flags;			/* Descriptor flags. */
};

struct vring_virtqueue {
	struct virtqueue vq;

	/* Is this a packed ring? */
	bool packed_ring;

	/* Is DMA API used? */
	bool use_dma_api;

	/* Can we use weak barriers? */
	bool weak_barriers;

	/* Other side has made a mess, don't try any more. */
	bool broken;

	/* Host supports indirect buffers */
	bool indirect;

	/* Host publishes avail event idx */
	bool event;

	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
	u16 last_used_idx;

	union {
		/* Available for split ring */
		struct {
			/* Actual memory layout for this queue. */
			struct vring vring;

			/* Last written value to avail->flags */
			u16 avail_flags_shadow;

			/*
			 * Last written value to avail->idx in
			 * guest byte order.
			 */
			u16 avail_idx_shadow;

			/* Per-descriptor state. */
			struct vring_desc_state_split *desc_state;

			/* DMA address and size information */
			dma_addr_t queue_dma_addr;
			size_t queue_size_in_bytes;
		} split;

		/* Available for packed ring */
		struct {
			/* Actual memory layout for this queue. */
			struct {
				unsigned int num;
				struct vring_packed_desc *desc;
				struct vring_packed_desc_event *driver;
				struct vring_packed_desc_event *device;
			} vring;

			/* Driver ring wrap counter. */
			bool avail_wrap_counter;

			/* Device ring wrap counter. */
			bool used_wrap_counter;

			/* Avail used flags. */
			u16 avail_used_flags;

			/* Index of the next avail descriptor. */
			u16 next_avail_idx;

			/*
			 * Last written value to driver->flags in
			 * guest byte order.
			 */
			u16 event_flags_shadow;

			/* Per-descriptor state. */
			struct vring_desc_state_packed *desc_state;
			struct vring_desc_extra_packed *desc_extra;

			/* DMA address and size information */
			dma_addr_t ring_dma_addr;
			dma_addr_t driver_event_dma_addr;
			dma_addr_t device_event_dma_addr;
			size_t ring_size_in_bytes;
			size_t event_size_in_bytes;
		} packed;
	};

	/* How to notify other side. FIXME: commonalize hcalls! */
	bool (*notify)(struct virtqueue *vq);

	/* DMA, allocation, and size information */
	bool we_own_ring;

#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;

	/* Figure out if their kicks are too delayed. */
	bool last_add_time_valid;
	ktime_t last_add_time;
#endif
};


/*
 * Helpers.
 */

#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
					  unsigned int total_sg)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	/*
	 * If the host supports indirect descriptor tables, and we have multiple
	 * buffers, then go indirect. FIXME: tune this threshold
	 */
	return (vq->indirect && total_sg > 1 && vq->vq.num_free);
}

/*
 * Modern virtio devices have feature bits to specify whether they need a
 * quirk and bypass the IOMMU. If not there, just use the DMA API.
 *
 * If there, the interaction between virtio and DMA API is messy.
 *
 * On most systems with virtio, physical addresses match bus addresses,
 * and it doesn't particularly matter whether we use the DMA API.
 *
 * On some systems, including Xen and any system with a physical device
 * that speaks virtio behind a physical IOMMU, we must use the DMA API
 * for virtio DMA to work at all.
 *
 * On other systems, including SPARC and PPC64, virtio-pci devices are
 * enumerated as though they are behind an IOMMU, but the virtio host
 * ignores the IOMMU, so we must either pretend that the IOMMU isn't
 * there or somehow map everything as the identity.
 *
 * For the time being, we preserve historic behavior and bypass the DMA
 * API.
 *
 * TODO: install a per-device DMA ops structure that does the right thing
 * taking into account all the above quirks, and use the DMA API
 * unconditionally on data path.
 */

static bool vring_use_dma_api(struct virtio_device *vdev)
{
	if (!virtio_has_iommu_quirk(vdev))
		return true;

	/* Otherwise, we are left to guess. */
	/*
	 * In theory, it's possible to have a buggy QEMU-supplied
	 * emulated Q35 IOMMU and Xen enabled at the same time.  On
	 * such a configuration, virtio has never worked and will
	 * not work without an even larger kludge.  Instead, enable
	 * the DMA API if we're a Xen guest, which at least allows
	 * all of the sensible Xen configurations to work correctly.
	 */
	if (xen_domain())
		return true;

	return false;
}

size_t virtio_max_dma_size(struct virtio_device *vdev)
{
	size_t max_segment_size = SIZE_MAX;

	if (vring_use_dma_api(vdev))
		max_segment_size = dma_max_mapping_size(&vdev->dev);

	return max_segment_size;
}
EXPORT_SYMBOL_GPL(virtio_max_dma_size);
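
/*
 * For illustration (a hedged sketch, not from this file): a driver can use
 * virtio_max_dma_size() to cap the segment sizes it queues, e.g. when
 * SWIOTLB limits DMA mapping sizes.  Something along these lines, where
 * "q" is the driver's request queue:
 *
 *	u32 max_seg = min_t(u32, UINT_MAX, virtio_max_dma_size(vdev));
 *	blk_queue_max_segment_size(q, max_seg);
 *
 * The exact clamping is driver specific.
 */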

static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
			      dma_addr_t *dma_handle, gfp_t flag)
{
	if (vring_use_dma_api(vdev)) {
		return dma_alloc_coherent(vdev->dev.parent, size,
					  dma_handle, flag);
	} else {
		void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);

		if (queue) {
			phys_addr_t phys_addr = virt_to_phys(queue);
			*dma_handle = (dma_addr_t)phys_addr;

			/*
			 * Sanity check: make sure we didn't truncate
			 * the address.  The only arches I can find that
			 * have 64-bit phys_addr_t but 32-bit dma_addr_t
			 * are certain non-highmem MIPS and x86
			 * configurations, but these configurations
			 * should never allocate physical pages above 32
			 * bits, so this is fine.  Just in case, throw a
			 * warning and abort if we end up with an
			 * unrepresentable address.
			 */
			if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
				free_pages_exact(queue, PAGE_ALIGN(size));
				return NULL;
			}
		}
		return queue;
	}
}

static void vring_free_queue(struct virtio_device *vdev, size_t size,
			     void *queue, dma_addr_t dma_handle)
{
	if (vring_use_dma_api(vdev))
		dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
	else
		free_pages_exact(queue, PAGE_ALIGN(size));
}

/*
 * The DMA ops on various arches are rather gnarly right now, and
 * making all of the arch DMA ops work on the vring device itself
 * is a mess.  For now, we use the parent device for DMA ops.
 */
static inline struct device *vring_dma_dev(const struct vring_virtqueue *vq)
{
	return vq->vq.vdev->dev.parent;
}

/* Map one sg entry. */
static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
				   struct scatterlist *sg,
				   enum dma_data_direction direction)
{
	if (!vq->use_dma_api)
		return (dma_addr_t)sg_phys(sg);

	/*
	 * We can't use dma_map_sg, because we don't use scatterlists in
	 * the way it expects (we don't guarantee that the scatterlist
	 * will exist for the lifetime of the mapping).
	 */
	return dma_map_page(vring_dma_dev(vq),
			    sg_page(sg), sg->offset, sg->length,
			    direction);
}

static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
				   void *cpu_addr, size_t size,
				   enum dma_data_direction direction)
{
	if (!vq->use_dma_api)
		return (dma_addr_t)virt_to_phys(cpu_addr);

	return dma_map_single(vring_dma_dev(vq),
			      cpu_addr, size, direction);
}

static int vring_mapping_error(const struct vring_virtqueue *vq,
			       dma_addr_t addr)
{
	if (!vq->use_dma_api)
		return 0;

	return dma_mapping_error(vring_dma_dev(vq), addr);
}
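
/*
 * For illustration: the helpers above are used by the add paths below in a
 * map / check / unwind pattern, roughly:
 *
 *	dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
 *	if (vring_mapping_error(vq, addr))
 *		goto unmap_release;	(unwind already-mapped descriptors)
 *
 * When use_dma_api is false the "mapping" is just the physical address and
 * the error check is a no-op, so the common case stays cheap.
 */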


/*
 * Split ring specific functions - *_split().
 */

static void vring_unmap_one_split(const struct vring_virtqueue *vq,
				  struct vring_desc *desc)
{
	u16 flags;

	if (!vq->use_dma_api)
		return;

	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

	if (flags & VRING_DESC_F_INDIRECT) {
		dma_unmap_single(vring_dma_dev(vq),
				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
				 virtio32_to_cpu(vq->vq.vdev, desc->len),
				 (flags & VRING_DESC_F_WRITE) ?
				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
	} else {
		dma_unmap_page(vring_dma_dev(vq),
			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
			       virtio32_to_cpu(vq->vq.vdev, desc->len),
			       (flags & VRING_DESC_F_WRITE) ?
			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
	}
}

static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
					       unsigned int total_sg,
					       gfp_t gfp)
{
	struct vring_desc *desc;
	unsigned int i;

	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
	gfp &= ~__GFP_HIGHMEM;

	desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
	if (!desc)
		return NULL;

	for (i = 0; i < total_sg; i++)
		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
	return desc;
}

static inline int virtqueue_add_split(struct virtqueue *_vq,
				      struct scatterlist *sgs[],
				      unsigned int total_sg,
				      unsigned int out_sgs,
				      unsigned int in_sgs,
				      void *data,
				      void *ctx,
				      gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	struct scatterlist *sg;
	struct vring_desc *desc;
	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
	int head;
	bool indirect;

	START_USE(vq);

	BUG_ON(data == NULL);
	BUG_ON(ctx && vq->indirect);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return -EIO;
	}

	LAST_ADD_TIME_UPDATE(vq);

	BUG_ON(total_sg == 0);

	head = vq->free_head;

	if (virtqueue_use_indirect(_vq, total_sg))
		desc = alloc_indirect_split(_vq, total_sg, gfp);
	else {
		desc = NULL;
		WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
	}

	if (desc) {
		/* Use a single buffer which doesn't continue */
		indirect = true;
		/* Set up rest to use this indirect table. */
		i = 0;
		descs_used = 1;
	} else {
		indirect = false;
		desc = vq->split.vring.desc;
		i = head;
		descs_used = total_sg;
	}

	if (vq->vq.num_free < descs_used) {
		pr_debug("Can't add buf len %i - avail = %i\n",
			 descs_used, vq->vq.num_free);
		/* FIXME: for historical reasons, we force a notify here if
		 * there are outgoing parts to the buffer.  Presumably the
		 * host should service the ring ASAP. */
		if (out_sgs)
			vq->notify(&vq->vq);
		if (indirect)
			kfree(desc);
		END_USE(vq);
		return -ENOSPC;
	}

	for (n = 0; n < out_sgs; n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
			prev = i;
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
		}
	}
	for (; n < (out_sgs + in_sgs); n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
			prev = i;
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
		}
	}
	/* Last one doesn't continue. */
	desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

	if (indirect) {
		/* Now that the indirect table is filled in, map it. */
		dma_addr_t addr = vring_map_single(
			vq, desc, total_sg * sizeof(struct vring_desc),
			DMA_TO_DEVICE);
		if (vring_mapping_error(vq, addr))
			goto unmap_release;

		vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev,
				VRING_DESC_F_INDIRECT);
		vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev,
				addr);

		vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev,
				total_sg * sizeof(struct vring_desc));
	}

	/* We're using some buffers from the free list. */
	vq->vq.num_free -= descs_used;

	/* Update free pointer */
	if (indirect)
		vq->free_head = virtio16_to_cpu(_vq->vdev,
					vq->split.vring.desc[head].next);
	else
		vq->free_head = i;

	/* Store token and indirect buffer state. */
	vq->split.desc_state[head].data = data;
	if (indirect)
		vq->split.desc_state[head].indir_desc = desc;
	else
		vq->split.desc_state[head].indir_desc = ctx;

	/* Put entry in available array (but don't update avail->idx until they
	 * do sync). */
	avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
	vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
	virtio_wmb(vq->weak_barriers);
	vq->split.avail_idx_shadow++;
	vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
						vq->split.avail_idx_shadow);
	vq->num_added++;

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

	/* This is very unlikely, but theoretically possible.  Kick
	 * just in case. */
	if (unlikely(vq->num_added == (1 << 16) - 1))
		virtqueue_kick(_vq);

	return 0;

unmap_release:
	err_idx = i;

	if (indirect)
		i = 0;
	else
		i = head;

	for (n = 0; n < total_sg; n++) {
		if (i == err_idx)
			break;
		vring_unmap_one_split(vq, &desc[i]);
		i = virtio16_to_cpu(_vq->vdev, desc[i].next);
	}

	if (indirect)
		kfree(desc);

	END_USE(vq);
	return -EIO;
}
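
/*
 * For illustration (hedged sketch, not from this file): drivers reach
 * virtqueue_add_split() through the public helpers in linux/virtio.h
 * (virtqueue_add_sgs/outbuf/inbuf).  A minimal, hypothetical out-only
 * submission, where req_hdr, buf, len and req are driver-side objects:
 *
 *	struct scatterlist hdr, payload, *sgs[2];
 *
 *	sg_init_one(&hdr, &req_hdr, sizeof(req_hdr));
 *	sg_init_one(&payload, buf, len);
 *	sgs[0] = &hdr;
 *	sgs[1] = &payload;
 *	if (virtqueue_add_sgs(vq, sgs, 2, 0, req, GFP_ATOMIC) == 0)
 *		virtqueue_kick(vq);
 *
 * Error handling is omitted for brevity.
 */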

static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 new, old;
	bool needs_kick;

	START_USE(vq);
	/* We need to expose available array entries before checking avail
	 * event. */
	virtio_mb(vq->weak_barriers);

	old = vq->split.avail_idx_shadow - vq->num_added;
	new = vq->split.avail_idx_shadow;
	vq->num_added = 0;

	LAST_ADD_TIME_CHECK(vq);
	LAST_ADD_TIME_INVALID(vq);

	if (vq->event) {
		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev,
					vring_avail_event(&vq->split.vring)),
					      new, old);
	} else {
		needs_kick = !(vq->split.vring.used->flags &
					cpu_to_virtio16(_vq->vdev,
						VRING_USED_F_NO_NOTIFY));
	}
	END_USE(vq);
	return needs_kick;
}
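
/*
 * For illustration: virtqueue_kick() is just the prepare + notify pair.
 * Callers may split them so the (possibly expensive) notification happens
 * outside their lock; vq_lock and flags below are hypothetical:
 *
 *	spin_lock_irqsave(&vq_lock, flags);
 *	... virtqueue_add_sgs(...) ...
 *	kick = virtqueue_kick_prepare(vq);
 *	spin_unlock_irqrestore(&vq_lock, flags);
 *	if (kick)
 *		virtqueue_notify(vq);
 */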

static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
			     void **ctx)
{
	unsigned int i, j;
	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);

	/* Clear data ptr. */
	vq->split.desc_state[head].data = NULL;

	/* Put back on free list: unmap first-level descriptors and find end */
	i = head;

	while (vq->split.vring.desc[i].flags & nextflag) {
		vring_unmap_one_split(vq, &vq->split.vring.desc[i]);
		i = virtio16_to_cpu(vq->vq.vdev, vq->split.vring.desc[i].next);
		vq->vq.num_free++;
	}

	vring_unmap_one_split(vq, &vq->split.vring.desc[i]);
	vq->split.vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev,
						vq->free_head);
	vq->free_head = head;

	/* Plus final descriptor */
	vq->vq.num_free++;

	if (vq->indirect) {
		struct vring_desc *indir_desc =
				vq->split.desc_state[head].indir_desc;
		u32 len;

		/* Free the indirect table, if any, now that it's unmapped. */
		if (!indir_desc)
			return;

		len = virtio32_to_cpu(vq->vq.vdev,
				vq->split.vring.desc[head].len);

		BUG_ON(!(vq->split.vring.desc[head].flags &
			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
		BUG_ON(len == 0 || len % sizeof(struct vring_desc));

		for (j = 0; j < len / sizeof(struct vring_desc); j++)
			vring_unmap_one_split(vq, &indir_desc[j]);

		kfree(indir_desc);
		vq->split.desc_state[head].indir_desc = NULL;
	} else if (ctx) {
		*ctx = vq->split.desc_state[head].indir_desc;
	}
}

static inline bool more_used_split(const struct vring_virtqueue *vq)
{
	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev,
			vq->split.vring.used->idx);
}

static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
					 unsigned int *len,
					 void **ctx)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *ret;
	unsigned int i;
	u16 last_used;

	START_USE(vq);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return NULL;
	}

	if (!more_used_split(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

	/* Only get used array entries after they have been exposed by host. */
	virtio_rmb(vq->weak_barriers);

	last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
	i = virtio32_to_cpu(_vq->vdev,
			vq->split.vring.used->ring[last_used].id);
	*len = virtio32_to_cpu(_vq->vdev,
			vq->split.vring.used->ring[last_used].len);

	if (unlikely(i >= vq->split.vring.num)) {
		BAD_RING(vq, "id %u out of range\n", i);
		return NULL;
	}
	if (unlikely(!vq->split.desc_state[i].data)) {
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	/* detach_buf_split clears data, so grab it now. */
	ret = vq->split.desc_state[i].data;
	detach_buf_split(vq, i, ctx);
	vq->last_used_idx++;
	/* If we expect an interrupt for the next entry, tell host
	 * by writing event index and flush out the write before
	 * the read in the next get_buf call. */
	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
		virtio_store_mb(vq->weak_barriers,
				&vring_used_event(&vq->split.vring),
				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));

	LAST_ADD_TIME_INVALID(vq);

	END_USE(vq);
	return ret;
}
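
/*
 * For illustration (hedged sketch): a typical driver callback drains the
 * used ring with the public virtqueue_get_buf() wrapper, which lands here
 * for split rings; complete_request() is a hypothetical driver hook:
 *
 *	unsigned int len;
 *	void *token;
 *
 *	while ((token = virtqueue_get_buf(vq, &len)) != NULL)
 *		complete_request(token, len);
 *
 * "token" is whatever pointer the driver passed as "data" when adding the
 * buffer.
 */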

static void virtqueue_disable_cb_split(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
		vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
		if (!vq->event)
			vq->split.vring.avail->flags =
				cpu_to_virtio16(_vq->vdev,
						vq->split.avail_flags_shadow);
	}
}

static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 last_used_idx;

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
	 * entry. Always do both to keep code simple. */
	if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
		vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
		if (!vq->event)
			vq->split.vring.avail->flags =
				cpu_to_virtio16(_vq->vdev,
						vq->split.avail_flags_shadow);
	}
	vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev,
			last_used_idx = vq->last_used_idx);
	END_USE(vq);
	return last_used_idx;
}

static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev,
			vq->split.vring.used->idx);
}

static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 bufs;

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
	 * entry. Always update the event index to keep code simple. */
	if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
		vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
		if (!vq->event)
			vq->split.vring.avail->flags =
				cpu_to_virtio16(_vq->vdev,
						vq->split.avail_flags_shadow);
	}
	/* TODO: tune this threshold */
	bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4;

	virtio_store_mb(vq->weak_barriers,
			&vring_used_event(&vq->split.vring),
			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));

	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx)
					- vq->last_used_idx) > bufs)) {
		END_USE(vq);
		return false;
	}

	END_USE(vq);
	return true;
}
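
/*
 * For illustration: the disable/poll/enable helpers above support a
 * NAPI-style loop that keeps callbacks off while there is work pending;
 * process() is a hypothetical driver hook:
 *
 *	virtqueue_disable_cb(vq);
 *	do {
 *		while ((token = virtqueue_get_buf(vq, &len)))
 *			process(token, len);
 *	} while (!virtqueue_enable_cb_delayed(vq));
 *
 * virtqueue_enable_cb_delayed() returns false when more buffers are already
 * pending, so the loop runs again instead of risking a missed interrupt.
 */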

static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i;
	void *buf;

	START_USE(vq);

	for (i = 0; i < vq->split.vring.num; i++) {
		if (!vq->split.desc_state[i].data)
			continue;
		/* detach_buf_split clears data, so grab it now. */
		buf = vq->split.desc_state[i].data;
		detach_buf_split(vq, i, NULL);
		vq->split.avail_idx_shadow--;
		vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
				vq->split.avail_idx_shadow);
		END_USE(vq);
		return buf;
	}
	/* That should have freed everything. */
	BUG_ON(vq->vq.num_free != vq->split.vring.num);

	END_USE(vq);
	return NULL;
}

static struct virtqueue *vring_create_virtqueue_split(
	unsigned int index,
	unsigned int num,
	unsigned int vring_align,
	struct virtio_device *vdev,
	bool weak_barriers,
	bool may_reduce_num,
	bool context,
	bool (*notify)(struct virtqueue *),
	void (*callback)(struct virtqueue *),
	const char *name)
{
	struct virtqueue *vq;
	void *queue = NULL;
	dma_addr_t dma_addr;
	size_t queue_size_in_bytes;
	struct vring vring;

	/* We assume num is a power of 2. */
	if (num & (num - 1)) {
		dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
		return NULL;
	}

	/* TODO: allocate each queue chunk individually */
	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
					  &dma_addr,
					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
		if (queue)
			break;
		if (!may_reduce_num)
			return NULL;
	}

	if (!num)
		return NULL;

	if (!queue) {
		/* Try to get a single page. You are my only hope! */
		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
	}
	if (!queue)
		return NULL;

	queue_size_in_bytes = vring_size(num, vring_align);
	vring_init(&vring, num, queue, vring_align);

	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
				   notify, callback, name);
	if (!vq) {
		vring_free_queue(vdev, queue_size_in_bytes, queue,
				 dma_addr);
		return NULL;
	}

	to_vvq(vq)->split.queue_dma_addr = dma_addr;
	to_vvq(vq)->split.queue_size_in_bytes = queue_size_in_bytes;
	to_vvq(vq)->we_own_ring = true;

	return vq;
}
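
/*
 * For illustration (hedged sketch): transports normally call
 * vring_create_virtqueue(), which picks the packed or split variant based
 * on VIRTIO_F_RING_PACKED.  The alignment and boolean arguments below are
 * placeholders; each transport (PCI, MMIO, ccw) supplies its own:
 *
 *	vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, vdev,
 *				    true, true, ctx, notify, callback, name);
 */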

/*
 * Packed ring specific functions - *_packed().
 */

static void vring_unmap_state_packed(const struct vring_virtqueue *vq,
				     struct vring_desc_extra_packed *state)
{
	u16 flags;

	if (!vq->use_dma_api)
		return;

	flags = state->flags;

	if (flags & VRING_DESC_F_INDIRECT) {
		dma_unmap_single(vring_dma_dev(vq),
				 state->addr, state->len,
				 (flags & VRING_DESC_F_WRITE) ?
				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
	} else {
		dma_unmap_page(vring_dma_dev(vq),
			       state->addr, state->len,
			       (flags & VRING_DESC_F_WRITE) ?
			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
	}
}

static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
				   struct vring_packed_desc *desc)
{
	u16 flags;

	if (!vq->use_dma_api)
		return;

	flags = le16_to_cpu(desc->flags);

	if (flags & VRING_DESC_F_INDIRECT) {
		dma_unmap_single(vring_dma_dev(vq),
				 le64_to_cpu(desc->addr),
				 le32_to_cpu(desc->len),
				 (flags & VRING_DESC_F_WRITE) ?
				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
	} else {
		dma_unmap_page(vring_dma_dev(vq),
			       le64_to_cpu(desc->addr),
			       le32_to_cpu(desc->len),
			       (flags & VRING_DESC_F_WRITE) ?
			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
	}
}

static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
						       gfp_t gfp)
{
	struct vring_packed_desc *desc;

	/*
	 * We require lowmem mappings for the descriptors because
	 * otherwise virt_to_phys will give us bogus addresses in the
	 * virtqueue.
	 */
	gfp &= ~__GFP_HIGHMEM;

	desc = kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp);

	return desc;
}

static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
				       struct scatterlist *sgs[],
				       unsigned int total_sg,
				       unsigned int out_sgs,
				       unsigned int in_sgs,
				       void *data,
				       gfp_t gfp)
{
	struct vring_packed_desc *desc;
	struct scatterlist *sg;
	unsigned int i, n, err_idx;
	u16 head, id;
	dma_addr_t addr;

	head = vq->packed.next_avail_idx;
	desc = alloc_indirect_packed(total_sg, gfp);

	if (unlikely(vq->vq.num_free < 1)) {
		pr_debug("Can't add buf len 1 - avail = 0\n");
		kfree(desc);
		END_USE(vq);
		return -ENOSPC;
	}

	i = 0;
	id = vq->free_head;
	BUG_ON(id == vq->packed.vring.num);

	for (n = 0; n < out_sgs + in_sgs; n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			addr = vring_map_one_sg(vq, sg, n < out_sgs ?
					DMA_TO_DEVICE : DMA_FROM_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			desc[i].flags = cpu_to_le16(n < out_sgs ?
						0 : VRING_DESC_F_WRITE);
			desc[i].addr = cpu_to_le64(addr);
			desc[i].len = cpu_to_le32(sg->length);
			i++;
		}
	}

	/* Now that the indirect table is filled in, map it. */
	addr = vring_map_single(vq, desc,
			total_sg * sizeof(struct vring_packed_desc),
			DMA_TO_DEVICE);
	if (vring_mapping_error(vq, addr))
		goto unmap_release;

	vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
	vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
				sizeof(struct vring_packed_desc));
	vq->packed.vring.desc[head].id = cpu_to_le16(id);

	if (vq->use_dma_api) {
		vq->packed.desc_extra[id].addr = addr;
		vq->packed.desc_extra[id].len = total_sg *
				sizeof(struct vring_packed_desc);
		vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
						  vq->packed.avail_used_flags;
	}

	/*
	 * A driver MUST NOT make the first descriptor in the list
	 * available before all subsequent descriptors comprising
	 * the list are made available.
	 */
	virtio_wmb(vq->weak_barriers);
	vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT |
						vq->packed.avail_used_flags);

	/* We're using some buffers from the free list. */
	vq->vq.num_free -= 1;

	/* Update free pointer */
	n = head + 1;
	if (n >= vq->packed.vring.num) {
		n = 0;
		vq->packed.avail_wrap_counter ^= 1;
		vq->packed.avail_used_flags ^=
				1 << VRING_PACKED_DESC_F_AVAIL |
				1 << VRING_PACKED_DESC_F_USED;
	}
	vq->packed.next_avail_idx = n;
	vq->free_head = vq->packed.desc_state[id].next;

	/* Store token and indirect buffer state. */
	vq->packed.desc_state[id].num = 1;
	vq->packed.desc_state[id].data = data;
	vq->packed.desc_state[id].indir_desc = desc;
	vq->packed.desc_state[id].last = id;

	vq->num_added += 1;

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

	return 0;

unmap_release:
	err_idx = i;

	for (i = 0; i < err_idx; i++)
		vring_unmap_desc_packed(vq, &desc[i]);

	kfree(desc);

	END_USE(vq);
	return -EIO;
}

static inline int virtqueue_add_packed(struct virtqueue *_vq,
				       struct scatterlist *sgs[],
				       unsigned int total_sg,
				       unsigned int out_sgs,
				       unsigned int in_sgs,
				       void *data,
				       void *ctx,
				       gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	struct vring_packed_desc *desc;
	struct scatterlist *sg;
	unsigned int i, n, c, descs_used, err_idx;
	__le16 uninitialized_var(head_flags), flags;
	u16 head, id, uninitialized_var(prev), curr, avail_used_flags;

	START_USE(vq);

	BUG_ON(data == NULL);
	BUG_ON(ctx && vq->indirect);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return -EIO;
	}

	LAST_ADD_TIME_UPDATE(vq);

	BUG_ON(total_sg == 0);

	if (virtqueue_use_indirect(_vq, total_sg))
		return virtqueue_add_indirect_packed(vq, sgs, total_sg,
				out_sgs, in_sgs, data, gfp);

	head = vq->packed.next_avail_idx;
	avail_used_flags = vq->packed.avail_used_flags;

	WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);

	desc = vq->packed.vring.desc;
	i = head;
	descs_used = total_sg;

	if (unlikely(vq->vq.num_free < descs_used)) {
		pr_debug("Can't add buf len %i - avail = %i\n",
			 descs_used, vq->vq.num_free);
		END_USE(vq);
		return -ENOSPC;
	}

	id = vq->free_head;
	BUG_ON(id == vq->packed.vring.num);

	curr = id;
	c = 0;
	for (n = 0; n < out_sgs + in_sgs; n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
					DMA_TO_DEVICE : DMA_FROM_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			flags = cpu_to_le16(vq->packed.avail_used_flags |
				    (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
				    (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
			if (i == head)
				head_flags = flags;
			else
				desc[i].flags = flags;

			desc[i].addr = cpu_to_le64(addr);
			desc[i].len = cpu_to_le32(sg->length);
			desc[i].id = cpu_to_le16(id);

			if (unlikely(vq->use_dma_api)) {
				vq->packed.desc_extra[curr].addr = addr;
				vq->packed.desc_extra[curr].len = sg->length;
				vq->packed.desc_extra[curr].flags =
					le16_to_cpu(flags);
			}
			prev = curr;
			curr = vq->packed.desc_state[curr].next;

			if ((unlikely(++i >= vq->packed.vring.num))) {
				i = 0;
				vq->packed.avail_used_flags ^=
					1 << VRING_PACKED_DESC_F_AVAIL |
					1 << VRING_PACKED_DESC_F_USED;
			}
		}
	}

	if (i < head)
		vq->packed.avail_wrap_counter ^= 1;

	/* We're using some buffers from the free list. */
	vq->vq.num_free -= descs_used;

	/* Update free pointer */
	vq->packed.next_avail_idx = i;
	vq->free_head = curr;

	/* Store token. */
	vq->packed.desc_state[id].num = descs_used;
	vq->packed.desc_state[id].data = data;
	vq->packed.desc_state[id].indir_desc = ctx;
	vq->packed.desc_state[id].last = prev;

	/*
	 * A driver MUST NOT make the first descriptor in the list
	 * available before all subsequent descriptors comprising
	 * the list are made available.
	 */
	virtio_wmb(vq->weak_barriers);
	vq->packed.vring.desc[head].flags = head_flags;
	vq->num_added += descs_used;

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

	return 0;

unmap_release:
	err_idx = i;
	i = head;

	vq->packed.avail_used_flags = avail_used_flags;

	for (n = 0; n < total_sg; n++) {
		if (i == err_idx)
			break;
		vring_unmap_desc_packed(vq, &desc[i]);
		i++;
		if (i >= vq->packed.vring.num)
			i = 0;
	}

	END_USE(vq);
	return -EIO;
}

static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
	bool needs_kick;
	union {
		struct {
			__le16 off_wrap;
			__le16 flags;
		};
		u32 u32;
	} snapshot;

	START_USE(vq);

	/*
	 * We need to expose the new flags value before checking notification
	 * suppressions.
	 */
	virtio_mb(vq->weak_barriers);

	old = vq->packed.next_avail_idx - vq->num_added;
	new = vq->packed.next_avail_idx;
	vq->num_added = 0;

	snapshot.u32 = *(u32 *)vq->packed.vring.device;
	flags = le16_to_cpu(snapshot.flags);

	LAST_ADD_TIME_CHECK(vq);
	LAST_ADD_TIME_INVALID(vq);

	if (flags != VRING_PACKED_EVENT_FLAG_DESC) {
		needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE);
		goto out;
	}

	off_wrap = le16_to_cpu(snapshot.off_wrap);

	wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR;
	event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR);
	if (wrap_counter != vq->packed.avail_wrap_counter)
		event_idx -= vq->packed.vring.num;

	needs_kick = vring_need_event(event_idx, new, old);
out:
	END_USE(vq);
	return needs_kick;
}

static void detach_buf_packed(struct vring_virtqueue *vq,
			      unsigned int id, void **ctx)
{
	struct vring_desc_state_packed *state = NULL;
	struct vring_packed_desc *desc;
	unsigned int i, curr;

	state = &vq->packed.desc_state[id];

	/* Clear data ptr. */
	state->data = NULL;

	vq->packed.desc_state[state->last].next = vq->free_head;
	vq->free_head = id;
	vq->vq.num_free += state->num;

	if (unlikely(vq->use_dma_api)) {
		curr = id;
		for (i = 0; i < state->num; i++) {
			vring_unmap_state_packed(vq,
				&vq->packed.desc_extra[curr]);
			curr = vq->packed.desc_state[curr].next;
		}
	}

	if (vq->indirect) {
		u32 len;

		/* Free the indirect table, if any, now that it's unmapped. */
		desc = state->indir_desc;
		if (!desc)
			return;

		if (vq->use_dma_api) {
			len = vq->packed.desc_extra[id].len;
			for (i = 0; i < len / sizeof(struct vring_packed_desc);
					i++)
				vring_unmap_desc_packed(vq, &desc[i]);
		}
		kfree(desc);
		state->indir_desc = NULL;
	} else if (ctx) {
		*ctx = state->indir_desc;
	}
}

static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
				       u16 idx, bool used_wrap_counter)
{
	bool avail, used;
	u16 flags;

	flags = le16_to_cpu(vq->packed.vring.desc[idx].flags);
	avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL));
	used = !!(flags & (1 << VRING_PACKED_DESC_F_USED));

	return avail == used && used == used_wrap_counter;
}
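
/*
 * For illustration, a worked example of the check above: a freshly zeroed
 * ring has AVAIL == USED == 0 on every descriptor while used_wrap_counter
 * is 1, so nothing reads as used.  A descriptor the driver makes available
 * carries AVAIL = 1, USED = 0 (avail != used); once the device consumes it,
 * it writes AVAIL = USED = 1, which matches used_wrap_counter, so the entry
 * reads as used.  After the ring wraps, both sides toggle their counters
 * and the same test works with the bits inverted.
 */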

static inline bool more_used_packed(const struct vring_virtqueue *vq)
{
	return is_used_desc_packed(vq, vq->last_used_idx,
			vq->packed.used_wrap_counter);
}

static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
					  unsigned int *len,
					  void **ctx)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 last_used, id;
	void *ret;

	START_USE(vq);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return NULL;
	}

	if (!more_used_packed(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

	/* Only get used elements after they have been exposed by host. */
	virtio_rmb(vq->weak_barriers);

	last_used = vq->last_used_idx;
	id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
	*len = le32_to_cpu(vq->packed.vring.desc[last_used].len);

	if (unlikely(id >= vq->packed.vring.num)) {
		BAD_RING(vq, "id %u out of range\n", id);
		return NULL;
	}
	if (unlikely(!vq->packed.desc_state[id].data)) {
		BAD_RING(vq, "id %u is not a head!\n", id);
		return NULL;
	}

	/* detach_buf_packed clears data, so grab it now. */
	ret = vq->packed.desc_state[id].data;
	detach_buf_packed(vq, id, ctx);

	vq->last_used_idx += vq->packed.desc_state[id].num;
	if (unlikely(vq->last_used_idx >= vq->packed.vring.num)) {
		vq->last_used_idx -= vq->packed.vring.num;
		vq->packed.used_wrap_counter ^= 1;
	}

	/*
	 * If we expect an interrupt for the next entry, tell host
	 * by writing event index and flush out the write before
	 * the read in the next get_buf call.
	 */
	if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
		virtio_store_mb(vq->weak_barriers,
				&vq->packed.vring.driver->off_wrap,
				cpu_to_le16(vq->last_used_idx |
					(vq->packed.used_wrap_counter <<
					 VRING_PACKED_EVENT_F_WRAP_CTR)));

	LAST_ADD_TIME_INVALID(vq);

	END_USE(vq);
	return ret;
}

static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) {
		vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
		vq->packed.vring.driver->flags =
			cpu_to_le16(vq->packed.event_flags_shadow);
	}
}

static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);