// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)
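
/*
 * Both helpers above are only active when the kernel is booted with
 * "debug-alternative"; DUMP_BYTES() then hex-dumps the bytes involved,
 * e.g. DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr)
 * as used in apply_alternatives() below.
 */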

/*
 * Each GENERIC_NOPX is X bytes long and is defined as an array of bytes
 * that correspond to that nop. Getting from one nop to the next, we
 * add to the array the offset that is equal to the sum of all sizes of
 * nops preceding the one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	NULL,
	intelnops,
	intelnops + 1,
	intelnops + 1 + 2,
	intelnops + 1 + 2 + 3,
	intelnops + 1 + 2 + 3 + 4,
	intelnops + 1 + 2 + 3 + 4 + 5,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k7nops,
	k7nops + 1,
	k7nops + 1 + 2,
	k7nops + 1 + 2 + 3,
	k7nops + 1 + 2 + 3 + 4,
	k7nops + 1 + 2 + 3 + 4 + 5,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	NULL,
	p6nops,
	p6nops + 1,
	p6nops + 1 + 2,
	p6nops + 1 + 2 + 3,
	p6nops + 1 + 2 + 3 + 4,
	p6nops + 1 + 2 + 3 + 4 + 5,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif

void __init arch_init_ideal_nops(void)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		/*
		 * Due to a decoder implementation quirk, some
		 * specific Intel CPUs actually perform better with
		 * the "k8_nops" than with the SDM-recommended NOPs.
		 */
		if (boot_cpu_data.x86 == 6 &&
		    boot_cpu_data.x86_model >= 0x0f &&
		    boot_cpu_data.x86_model != 0x1c &&
		    boot_cpu_data.x86_model != 0x26 &&
		    boot_cpu_data.x86_model != 0x27 &&
		    boot_cpu_data.x86_model < 0x30) {
			ideal_nops = k8_nops;
		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
			ideal_nops = p6_nops;
		} else {
#ifdef CONFIG_X86_64
			ideal_nops = k8_nops;
#else
			ideal_nops = intel_nops;
#endif
		}
		break;

	case X86_VENDOR_HYGON:
		ideal_nops = p6_nops;
		return;

	case X86_VENDOR_AMD:
		if (boot_cpu_data.x86 > 0xf) {
			ideal_nops = p6_nops;
			return;
		}

		/* fall through */

	default:
#ifdef CONFIG_X86_64
		ideal_nops = k8_nops;
#else
		if (boot_cpu_has(X86_FEATURE_K8))
			ideal_nops = k8_nops;
		else if (boot_cpu_has(X86_FEATURE_K7))
			ideal_nops = k7_nops;
		else
			ideal_nops = intel_nops;
#endif
	}
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, ideal_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
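
/*
 * For example (with ASM_NOP_MAX == 8): add_nops(buf, 12) fills buf with
 * one 8-byte NOP followed by one 4-byte NOP from the ideal_nops table.
 */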

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement?
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}
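
/*
 * recompute_jump() below re-encodes a near JMP found at the start of a
 * 5-byte replacement so its displacement is correct when executed from
 * the original instruction's address: if the new target fits in a signed
 * byte it becomes a 2-byte JMP (0xeb) padded with NOPs, otherwise it
 * stays a 5-byte JMP (0xe9).
 */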

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insn_buff + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insn_buff[0] = 0xeb;
	insn_buff[1] = (s8)n_dspl;
	add_nops(insn_buff + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insn_buff[0] = 0xe9;
	*(s32 *)&insn_buff[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
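
/*
 * When the CPU lacks the feature, apply_alternatives() leaves the original
 * instruction in place and (if there is padding) only calls optimize_nops()
 * below, which rewrites the a->padlen bytes of single-byte (0x90) padding
 * with the fewest NOPs from the ideal_nops table.
 */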

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
	unsigned long flags;
	int i;

	for (i = 0; i < a->padlen; i++) {
		if (instr[i] != 0x90)
			return;
	}

	local_irq_save(flags);
	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
	local_irq_restore(flags);

	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
		   instr, a->instrlen - a->padlen, a->padlen);
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		if (!boot_cpu_has(a->cpuid)) {
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		if (a->instrlen > a->replacementlen) {
			add_nops(insn_buff + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insn_buff_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insn_buff, insn_buff_sz);
	}
}
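
/*
 * Illustrative call site (the names are placeholders; see the macros in
 * <asm/alternative.h>):
 *
 *	alternative("call old_impl", "call new_impl", X86_FEATURE_FOO);
 *
 * This records an alt_instr entry that is consumed above: if the boot CPU
 * has the feature, the replacement bytes are copied over the original
 * (with relative CALL/JMP displacements fixed up) and NOP-padded to the
 * original length.
 */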

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
}
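
/*
 * The two helpers above flip SMP lock prefixes in place: on a UP boot,
 * each LOCK prefix (0xf0) recorded in .smp_locks is replaced with the
 * one-byte DS segment override (0x3e), which acts as a no-op prefix, and
 * it is turned back into LOCK when a second CPU is brought online.
 */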

struct smp_alt_module {
	/* the module owning these lock prefixes, NULL for the core kernel */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text,  void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod	= mod;
	smp->name	= name;
	smp->locks	= locks;
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}
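
/*
 * Example: alternative_instructions() below registers the core kernel's
 * lock sites with
 *
 *	alternatives_smp_module_add(NULL, "core kernel",
 *				    __smp_locks, __smp_locks_end,
 *				    _text, _etext);
 *
 * and the module loader does the same for each module's .smp_locks section.
 */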

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insn_buff[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insn_buff, p->instr, p->len);
		used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insn_buff + used, p->len - used);
		text_poke_early(p->instr, insn_buff, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
"	.pushsection	.init.text, \"ax\", @progbits\n"
"	.type		int3_magic, @function\n"
"int3_magic:\n"
"	movl	$1, (%" _ASM_ARG1 ")\n"
"	ret\n"
"	.size		int3_magic, .-int3_magic\n"
"	.popsection\n"
);

extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */

static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}

static void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * Stick the address of the INT3 instruction into int3_selftest_ip,
	 * then trigger the INT3, padded with NOPs to match a CALL instruction
	 * length.
	 */
	asm volatile ("1: int3; nop; nop; nop; nop\n\t"
		      ".pushsection .init.data,\"aw\"\n\t"
		      ".align " __ASM_SEL(4, 8) "\n\t"
		      ".type int3_selftest_ip, @object\n\t"
		      ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
		      "int3_selftest_ip:\n\t"
		      __ASM_SEL(.long, .quad) " 1b\n\t"
		      ".popsection\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the code being patched.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
	alternatives_patched = 1;
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		local_irq_restore(flags);
		sync_core();

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}

typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;

/*
 * Using a temporary mm allows setting temporary mappings that are not
 * accessible by other CPUs. Such mappings are needed to perform sensitive
 * memory writes that override the kernel memory protections (e.g., W^X),
 * without exposing the temporary page-table mappings that are required for
 * these write operations to other CPUs. Using a temporary mm also avoids
 * TLB shootdowns when the mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core.
 *          To harden security, IRQs must be disabled while the temporary mm
 *          is loaded, thereby preventing interrupt handler bugs from
 *          overriding the kernel memory protection.
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
	temp_mm_state_t temp_state;

	lockdep_assert_irqs_disabled();
	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return temp_state;
}

static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}
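
/*
 * Typical pattern, as used by __text_poke() below (with IRQs disabled):
 *
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the poking_addr mapping ...
 *	unuse_temporary_mm(prev);
 */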

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this avoids open-coding it.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but that is not needed here since the mm is not in use on
	 * any CPU at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(addr, opcode, len);
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(addr, opcode, len);
}

static void do_sync_core(void *info)
{
	sync_core();
}
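
/*
 * text_poke_sync() below runs sync_core() on every CPU (via IPI), which
 * serializes instruction fetch so all CPUs observe the bytes written so far.
 */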

void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

struct text_poke_loc {
	s32 rel_addr; /* addr := _stext + rel_addr */
	s32 rel32;
	u8 opcode;
	const u8 text[POKE_MAX_OPCODE_SIZE];
	u8 old;
};

struct bp_patching_desc {
	struct text_poke_loc *vec;
	int nr_entries;
	atomic_t refs;
};

static struct bp_patching_desc *bp_desc;
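
/*
 * Note: patch addresses are stored as s32 offsets from _stext (see
 * text_poke_addr() below), which keeps struct text_poke_loc compact.
 */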

static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
	struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */

	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
		return NULL;

	return desc;
}

static __always_inline void put_desc(struct bp_patching_desc *desc)
{
	smp_mb__before_atomic();
	arch_atomic_dec(&desc->refs);
}

static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}

static __always_inline int patch_cmp(const void *key, const void *elt)
{
	struct text_poke_loc *tp = (struct text_poke_loc *) elt;

	if (key < text_poke_addr(tp))
		return -1;
	if (key > text_poke_addr(tp))
		return 1;
	return 0;
}

noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int len, ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc:
	 *
	 *	bp_desc = desc			INT3
	 *	WMB				RMB
	 *	write INT3			if (desc)
	 */
	smp_rmb();

	desc = try_get_desc(&bp_desc);
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	len = text_opcode_size(tp->opcode);
	ip += len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->rel32);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->rel32);
		break;

	default:
		BUG();
	}

	ret = 1;

out_put:
	put_desc(desc);
	return ret;
}

#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	struct bp_patching_desc desc = {
		.vec = tp,
		.nr_entries = nr_entries,
		.refs = ATOMIC_INIT(1),
	};
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;