// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

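/*
 * Assert that the MMU lock is held for read when @shared is true, or held
 * for write when @shared is false.
 */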
static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

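/* Free a shadow page's page table page and its struct kvm_mmu_page. */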
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

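/*
 * Drop a reference to @root. If this was the last reference, unlink the root
 * from the list of TDP MMU roots, zap the paging structure it points to, and
 * queue the root page for freeing after an RCU grace period.
 */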
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after @prev_root (or the first valid root if
 * @prev_root is NULL), takes a reference on it, and returns that next root.
 * If @prev_root is not NULL, this thread should have already taken a
 * reference on it, and that reference will be dropped. If no valid root is
 * found, this function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
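
/*
 * Example usage (mirrors __kvm_tdp_mmu_zap_gfn_range() below):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
 *		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
 *				      shared);
 */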

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

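/* Compute the page role for a direct TDP MMU page table at @level. */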
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

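/*
 * Allocate a TDP MMU page and its page table page from the vCPU's memory
 * caches and initialize its role and gfn.
 */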
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

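/*
 * Return the physical address of a TDP MMU root for this vCPU, reusing an
 * existing root with a matching role when possible and allocating a new one
 * otherwise.
 */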
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

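/*
 * Propagate the accessed state to the backing pfn when a previously accessed
 * leaf SPTE is zapped or changed to point at a different pfn.
 */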
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

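/*
 * Mark the gfn dirty in its memslot when a 4K SPTE becomes writable or a
 * writable SPTE is changed to map a different pfn.
 */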
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry the exchange until the SPTE is seen going
			 * from some other value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
		if (is_large_pte(old_spte))
			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
		else
			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
	}

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}


	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping, but do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
							struct tdp_iter *iter,
							u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}

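/*
 * Like tdp_mmu_set_spte_atomic_no_dirty_log(), but also handle dirty logging
 * for the change.
 */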
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
		return false;

	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
				      iter->old_spte, new_spte, iter->level);
	return true;
}

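/*
 * Zap a SPTE while holding the MMU lock for read: freeze the SPTE with the
 * special removed value, flush remote TLBs for its range, and only then
 * clear it.
 */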
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

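/*
 * Convenience wrappers around __tdp_mmu_set_spte() that choose whether the
 * accessed and/or dirty state of the change is recorded.
 */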
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

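/*
 * Iterate over only the present leaf SPTEs mapping GFNs in [_start, _end)
 * under the given root.
 */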
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

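/*
 * Iterate over the SPTEs for GFNs in [_start, _end) starting at the vCPU
 * MMU's current root.
 */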
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush,
					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	struct tdp_iter iter;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush,
				 bool shared)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      shared);

	return flush;
}

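/*
 * Zap all SPTEs under every root in every address space, flushing the TLBs
 * if any SPTEs were zapped.
 */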
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
						  flush, false);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

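/*
 * Find the next invalidated root after @prev_root (or the first one if
 * @prev_root is NULL) that still has a non-zero reference count.
 */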
static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
				      true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_roots
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
 * done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					  int map_writable,
					  struct tdp_iter *iter,
					  kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = 0;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					 pfn, iter->old_spte, prefault, true,
					 map_writable, !shadow_accessed_mask,
					 &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*