Commit e7a36a6e authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "A landry list of fixes:

   - fix reboot breakage on some PCID-enabled system

   - fix crashes/hangs on some PCID-enabled systems

   - fix microcode loading on certain older CPUs

   - various unwinder fixes

   - extend an APIC quirk to more hardware systems and disable APIC
     related warning on virtualized systems

   - various Hyper-V fixes

   - a macro definition robustness fix

   - remove jprobes IRQ disabling

   - various mem-encryption fixes"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/microcode: Do the family check first
  x86/mm: Flush more aggressively in lazy TLB mode
  x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping
  x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors
  x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c
  x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing
  x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures
  x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs
  x86/unwind: Disable unwinder warnings on 32-bit
  x86/unwind: Align stack pointer in unwinder dump
  x86/unwind: Use MSB for frame pointer encoding on 32-bit
  x86/unwind: Fix dereference of untrusted pointer
  x86/alternatives: Fix alt_max_short macro to really be a max()
  x86/mm/64: Fix reboot interaction with CR4.PCIDE
  kprobes/x86: Remove IRQ disabling from jprobe handlers
  kprobes/x86: Set up frame pointer in kprobe trampoline
parents a339b351 1f161f67
......@@ -176,7 +176,7 @@
/*
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
* frame pointer is replaced with an encoded pointer to pt_regs. The encoding
* is just setting the LSB, which makes it an invalid stack address and is also
* is just clearing the MSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
* NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
......@@ -185,7 +185,7 @@
.macro ENCODE_FRAME_POINTER
#ifdef CONFIG_FRAME_POINTER
mov %esp, %ebp
orl $0x1, %ebp
andl $0x7fffffff, %ebp
#endif
.endm
......
......@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
u32 hv_max_vp_index;
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
......@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
hv_vp_index[smp_processor_id()] = msr_vp_index;
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
return 0;
}
......
......@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
static struct hv_flush_pcpu __percpu *pcpu_flush;
static struct hv_flush_pcpu __percpu **pcpu_flush;
static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
/*
* Fills in gva_list starting from offset. Returns the number of items added.
......@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
{
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
/* valid_bank_mask can represent up to 64 banks */
if (hv_max_vp_index / 64 >= 64)
return 0;
/*
* Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
* structs are not cleared between calls, we risk flushing unneeded
* vCPUs otherwise.
*/
for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
/*
* Some banks may end up being empty but this is acceptable.
*/
......@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
vcpu = hv_cpu_number_to_vp_number(cpu);
vcpu_bank = vcpu / 64;
vcpu_offset = vcpu % 64;
/* valid_bank_mask can represent up to 64 banks */
if (vcpu_bank >= 64)
return 0;
__set_bit(vcpu_offset, (unsigned long *)
&flush->hv_vp_set.bank_contents[vcpu_bank]);
if (vcpu_bank >= nr_bank)
......@@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
struct hv_flush_pcpu **flush_pcpu;
struct hv_flush_pcpu *flush;
u64 status = U64_MAX;
unsigned long flags;
......@@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
local_irq_save(flags);
flush = this_cpu_ptr(pcpu_flush);
flush_pcpu = this_cpu_ptr(pcpu_flush);
if (unlikely(!*flush_pcpu))
*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
flush = *flush_pcpu;
if (unlikely(!flush)) {
local_irq_restore(flags);
goto do_native;
}
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
......@@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
struct hv_flush_pcpu_ex **flush_pcpu;
struct hv_flush_pcpu_ex *flush;
u64 status = U64_MAX;
unsigned long flags;
......@@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
local_irq_save(flags);
flush = this_cpu_ptr(pcpu_flush_ex);
flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
if (unlikely(!*flush_pcpu))
*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
flush = *flush_pcpu;
if (unlikely(!flush)) {
local_irq_restore(flags);
goto do_native;
}
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
......@@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
0, nr_bank + 2, flush, NULL);
0, nr_bank, flush, NULL);
} else if (info->end &&
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
0, nr_bank + 2, flush, NULL);
0, nr_bank, flush, NULL);
} else {
gva_n = fill_gva_list(flush->gva_list, nr_bank,
info->start, info->end);
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
gva_n, nr_bank + 2, flush, NULL);
gva_n, nr_bank, flush, NULL);
}
local_irq_restore(flags);
......@@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
return;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
else
pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
}
......@@ -62,8 +62,10 @@
#define new_len2 145f-144f
/*
* max without conditionals. Idea adapted from:
* gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
*
* The additional "-" is needed because gas uses a "true" value of -1.
*/
#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
......
......@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
alt_end_marker ":\n"
/*
* max without conditionals. Idea adapted from:
* gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
*
* The additional "-" is needed because gas works with s32s.
* The additional "-" is needed because gas uses a "true" value of -1.
*/
#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
/*
* Pad the second replacement alternative with additional NOPs if it is
......
......@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
DEBUG_LOCKS_WARN_ON(preemptible());
}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
cpumask_clear_cpu(cpu, mm_cpumask(mm));
}
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
......
......@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
* to this information.
*/
extern u32 *hv_vp_index;
extern u32 hv_max_vp_index;
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
......
......@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
/*
* If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
* to init_mm when we switch to a kernel thread (e.g. the idle thread). If
* it's false, then we immediately switch CR3 when entering a kernel thread.
*/
DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
......@@ -104,6 +111,23 @@ struct tlb_state {
u16 loaded_mm_asid;
u16 next_asid;
/*
* We can be in one of several states:
*
* - Actively using an mm. Our CPU's bit will be set in
* mm_cpumask(loaded_mm) and is_lazy == false;
*
* - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
* will not be set in mm_cpumask(&init_mm) and is_lazy == false.
*
* - Lazily using a real mm. loaded_mm != &init_mm, our bit
* is set in mm_cpumask(loaded_mm), but is_lazy == true.
* We're heuristically guessing that the CR3 load we
* skipped more than makes up for the overhead added by
* lazy mode.
*/
bool is_lazy;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
......
......@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
static u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_mask) {
case 0x03: return 0x01000136;
case 0x04: return 0x02000014;
}
return ~0U;
}
static const struct x86_cpu_id deadline_match[] = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
......@@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void)
const struct x86_cpu_id *m;
u32 rev;
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
m = x86_match_cpu(deadline_match);
......
......@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
bool *res = &dis_ucode_ldr;
#endif
if (!have_cpuid_p())
return *res;
/*
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
* completely accurate as xen pv guests don't see that CPUID bit set but
......@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
bool intel = true;
if (check_loader_disabled_bsp())
if (!have_cpuid_p())
return;
cpuid_1_eax = native_cpuid_eax(1);
switch (x86_cpuid_vendor()) {
case X86_VENDOR_INTEL:
if (x86_family(cpuid_1_eax) >= 6)
load_ucode_intel_bsp();
if (x86_family(cpuid_1_eax) < 6)
return;
break;
case X86_VENDOR_AMD:
if (x86_family(cpuid_1_eax) >= 0x10)
load_ucode_amd_bsp(cpuid_1_eax);
if (x86_family(cpuid_1_eax) < 0x10)
return;
intel = false;
break;
default:
break;
return;
}
if (check_loader_disabled_bsp())
return;
if (intel)
load_ucode_intel_bsp();
else
load_ucode_amd_bsp(cpuid_1_eax);
}
static bool check_loader_disabled_ap(void)
......
......@@ -3,6 +3,15 @@
/* Kprobes and Optprobes common header */
#include <asm/asm.h>
#ifdef CONFIG_FRAME_POINTER
# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
" mov %" _ASM_SP ", %" _ASM_BP "\n"
#else
# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
#endif
#ifdef CONFIG_X86_64
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax. */ \
......@@ -17,7 +26,7 @@
" pushq %r10\n" \
" pushq %r11\n" \
" pushq %rbx\n" \
" pushq %rbp\n" \
SAVE_RBP_STRING \
" pushq %r12\n" \
" pushq %r13\n" \
" pushq %r14\n" \
......@@ -48,7 +57,7 @@
" pushl %es\n" \
" pushl %ds\n" \
" pushl %eax\n" \
" pushl %ebp\n" \
SAVE_RBP_STRING \
" pushl %edi\n" \
" pushl %esi\n" \
" pushl %edx\n" \
......
......@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* raw stack chunk with redzones:
*/
__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
regs->flags &= ~X86_EFLAGS_IF;
trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
/*
......
......@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
/* Exiting long mode will fail if CR4.PCIDE is set. */
if (static_cpu_has(X86_FEATURE_PCID))
cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
......
......@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
state->stack_info.type, state->stack_info.next_sp,
state->stack_mask, state->graph_idx);
for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
break;
......@@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state)
* This determines if the frame pointer actually contains an encoded pointer to
* pt_regs on the stack. See ENCODE_FRAME_POINTER.
*/
#ifdef CONFIG_X86_64
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
unsigned long regs = (unsigned long)bp;
......@@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
return (struct pt_regs *)(regs & ~0x1);
}
#else
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
unsigned long regs = (unsigned long)bp;
if (regs & 0x80000000)
return NULL;
return (struct pt_regs *)(regs | 0x80000000);
}
#endif
#ifdef CONFIG_X86_32
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
#else
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
#endif
static bool update_stack_state(struct unwind_state *state,
unsigned long *next_bp)
......@@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state,
regs = decode_frame_pointer(next_bp);
if (regs) {
frame = (unsigned long *)regs;
len = regs_size(regs);
len = KERNEL_REGS_SIZE;
state->got_irq = true;
} else {
frame = next_bp;
......@@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state,
frame < prev_frame_end)
return false;
/*
* On 32-bit with user mode regs, make sure the last two regs are safe
* to access:
*/
if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
!on_stack(info, frame, len + 2*sizeof(long)))
return false;
/* Move state to the next frame: */
if (regs) {
state->regs = regs;
......@@ -328,6 +355,13 @@ bool unwind_next_frame(struct unwind_state *state)
state->regs->sp < (unsigned long)task_pt_regs(state->task))
goto the_end;
/*
* There are some known frame pointer issues on 32-bit. Disable
* unwinder warnings on 32-bit until it gets objtool support.
*/
if (IS_ENABLED(CONFIG_X86_32))
goto the_end;
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
......
# Kernel does not boot with instrumentation of tlb.c.
KCOV_INSTRUMENT_tlb.o := n
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
......
......@@ -30,6 +30,8 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
......@@ -80,7 +82,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
......@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
/*
* There's nothing to do: we weren't lazy, and we
* aren't changing our mm. We don't need to flush
* anything, nor do we need to update CR3, CR4, or
* LDTR.
*/
return;
}
/* Resume remote flushes and then read tlb_gen. */
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
next_tlb_gen) {
/*
* Ideally, we'd have a flush_tlb() variant that
* takes the known CR3 value as input. This would
* be faster on Xen PV and on hypothetical CPUs
* on which INVPCID is fast.
*/
this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
next_tlb_gen);
write_cr3(build_cr3(next, prev_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
}
/*
* We just exited lazy mode, which means that CR4 and/or LDTR
* may be stale. (Changes to the required CR4 and LDTR states
* are not reflected in tlb_gen.)
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
} else {
u16 new_asid;
bool need_flush;
......@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/* Stop remote flushes for the previous mm */
if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
real_prev != &init_mm);
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
......@@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
switch_ldt(real_prev, next);
}
/*
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
* kernel thread or other context without an mm. Acceptable implementations
* include doing nothing whatsoever, switching to init_mm, or various clever
* lazy tricks to try to minimize TLB flushes.
*
* The scheduler reserves the right to call enter_lazy_tlb() several times
* in a row. It will notify us that we're going back to a real mm by
* calling switch_mm_irqs_off().
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (static_branch_unlikely(&tlb_use_lazy_mode)) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
}
/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
......@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
if (unlikely(loaded_mm == &init_mm))
return;
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode -- don't flush. We can get here on
* remote flushes due to races and on local flushes if a
* kernel thread coincidentally flushes the mm it's lazily
* still using.
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
......@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[2];
buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
buf[1] = '\n';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
static ssize_t tlblazy_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
bool val;
if (kstrtobool_from_user(user_buf, count, &val))
return -EINVAL;
if (val)
static_branch_enable(&tlb_use_lazy_mode);
else
static_branch_disable(&tlb_use_lazy_mode);
return count;
}
static const struct file_operations fops_tlblazy = {
.read = tlblazy_read_file,
.write = tlblazy_write_file,
.llseek = default_llseek,
};
static int __init init_tlb_use_lazy_mode(void)
{
if (boot_cpu_has(X86_FEATURE_PCID)) {
/*
* Heuristic: with PCID on, switching to and from
* init_mm is reasonably fast, but remote flush IPIs
* as expensive as ever, so turn off lazy TLB mode.
*
* We can't do this in setup_pcid() because static keys
* haven't been initialized yet, and it would blow up
* badly.
*/
static_branch_disable(&tlb_use_lazy_mode);
}
debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlblazy);
return 0;