Commit 86721e4f authored by Stephen Rothwell

Merge remote-tracking branch 'hyperv/hyperv-next'

parents b6f20305 fb5ef351
What: /sys/bus/vmbus/hibernation
Date: Jan 2021
KernelVersion: 5.12
Contact: Dexuan Cui <decui@microsoft.com>
Description: Whether the host supports hibernation for the VM.
Users: Daemon that sets up swap partition/file for hibernation.
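As a rough illustration (not part of this commit; vm_hibernation_supported is a hypothetical helper, and it assumes the attribute reads back as "0" or "1"), such a daemon might gate its swap setup like this:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch: report whether the host advertises hibernation
 * support for this VM via the vmbus attribute documented above.
 */
static bool vm_hibernation_supported(void)
{
	FILE *f = fopen("/sys/bus/vmbus/hibernation", "r");
	int val = 0;

	if (!f)
		return false;
	if (fscanf(f, "%d", &val) != 1)
		val = 0;
	fclose(f);
	return val == 1;
}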
What: /sys/bus/vmbus/devices/<UUID>/id
Date: Jul 2009
KernelVersion: 2.6.31
......
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := hv_init.o mmu.o nested.o
-obj-$(CONFIG_X86_64) += hv_apic.o
+obj-y := hv_init.o mmu.o nested.o irqdomain.o
+obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
......
@@ -10,6 +10,7 @@
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
@@ -26,8 +27,11 @@
#include <linux/cpuhotplug.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
void __percpu **hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);
@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
void **input_arg;
struct page *pg;
-input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
-pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
+pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
if (unlikely(!pg))
return -ENOMEM;
+input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
*input_arg = page_address(pg);
if (hv_root_partition) {
void **output_arg;
output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*output_arg = page_address(pg + 1);
}
hv_get_vp_index(msr_vp_index);
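For orientation, every hypercall site in this series borrows these per-CPU pages with the same pattern. A minimal sketch (hv_example_hypercall is a hypothetical name; the output page exists only when running as the root partition):

static u64 hv_example_hypercall(u64 control)
{
	unsigned long flags;
	void *input, *output;
	u64 status;

	/* Interrupts stay off so a nested hypercall on this CPU cannot
	 * reuse the per-CPU argument pages while we own them.
	 */
	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = hv_root_partition ? *this_cpu_ptr(hyperv_pcpu_output_arg) : NULL;
	status = hv_do_hypercall(control, input, output);
	local_irq_restore(flags);

	return status & HV_HYPERCALL_RESULT_MASK;
}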
@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
unsigned int new_cpu;
unsigned long flags;
void **input_arg;
-void *input_pg = NULL;
+void *pg;
local_irq_save(flags);
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
-input_pg = *input_arg;
+pg = *input_arg;
*input_arg = NULL;
if (hv_root_partition) {
void **output_arg;
output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*output_arg = NULL;
}
local_irq_restore(flags);
-free_page((unsigned long)input_pg);
+free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
if (hv_vp_assist_page && hv_vp_assist_page[cpu])
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -264,6 +287,9 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
if (hv_root_partition)
return -EPERM;
/*
* Reset the hypercall page as it is going to be invalidated
* across hibernation. Setting hv_hypercall_pg to NULL ensures
@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
static void __init hv_get_partition_id(void)
{
struct hv_get_partition_id *output_page;
u64 status;
unsigned long flags;
local_irq_save(flags);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
/* No point in proceeding if this failed */
pr_err("Failed to get partition ID: %lld\n", status);
BUG();
}
hv_current_partition_id = output_page->partition_id;
local_irq_restore(flags);
}
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -368,6 +412,12 @@ void __init hyperv_init(void)
BUG_ON(hyperv_pcpu_input_arg == NULL);
/* Allocate the per-CPU state for output arg for root */
if (hv_root_partition) {
hyperv_pcpu_output_arg = alloc_percpu(void *);
BUG_ON(hyperv_pcpu_output_arg == NULL);
}
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
@@ -408,8 +458,35 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
-hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
-wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
if (hv_root_partition) {
struct page *pg;
void *src, *dst;
/*
* For the root partition, the hypervisor will set up its
* hypercall page. The hypervisor guarantees it will not show
* up in the root's address space. The root can't change the
* location of the hypercall page.
*
* Order is important here. We must enable the hypercall page
* so it is populated with code, then copy the code to an
* executable page.
*/
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
pg = vmalloc_to_page(hv_hypercall_pg);
dst = kmap(pg);
src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
MEMREMAP_WB);
BUG_ON(!(src && dst));
memcpy(dst, src, HV_HYP_PAGE_SIZE);
memunmap(src);
kunmap(pg);
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
}
/*
* hyperv_init() is called before LAPIC is initialized: see
@@ -428,6 +505,21 @@ void __init hyperv_init(void)
register_syscore_ops(&hv_syscore_ops);
hyperv_init_cpuhp = cpuhp;
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
#ifdef CONFIG_PCI_MSI
/*
* If we're running as root, we want to create our own PCI MSI domain.
* We can't set this in hv_pci_init because that would be too late.
*/
if (hv_root_partition)
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif
return;
remove_cpuhp_state:
@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
bool hv_is_hibernation_supported(void)
{
-return acpi_sleep_state_supported(ACPI_STATE_S4);
+return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
enum hv_isolation_type hv_get_isolation_type(void)
{
if (!(ms_hyperv.features_b & HV_ISOLATION))
return HV_ISOLATION_TYPE_NONE;
return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
}
EXPORT_SYMBOL_GPL(hv_get_isolation_type);
bool hv_is_isolation_supported(void)
{
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
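A hedged usage sketch (hv_example_report_isolation is a hypothetical caller, not part of this commit) of how code that must behave differently inside an isolated VM can branch on these helpers:

static void hv_example_report_isolation(void)
{
	/* Illustrative: log the isolation type when the platform reports one */
	if (hv_is_isolation_supported())
		pr_info("Hyper-V: isolation type %d\n", hv_get_isolation_type());
}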
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
#include <linux/acpi.h>
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
#include <linux/minmax.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/apic.h>
#include <asm/trace/hyperv.h>
/*
* See struct hv_deposit_memory. The first u64 is partition ID, the rest
* are GPAs.
*/
#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
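/* Worked example, assuming the usual 4 KiB hypervisor page: 4096 / 8
 * gives 512 u64 slots; the leading slot holds the partition ID of
 * struct hv_deposit_memory, leaving 511 GPAs per call.
 */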
/* Deposits the exact number of pages. Must be called with interrupts enabled. */
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
{
struct page **pages, *page;
int *counts;
int num_allocations;
int i, j, page_count;
int order;
u64 status;
int ret;
u64 base_pfn;
struct hv_deposit_memory *input_page;
unsigned long flags;
if (num_pages > HV_DEPOSIT_MAX)
return -E2BIG;
if (!num_pages)
return 0;
/* One buffer for page pointers and counts */
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
pages = page_address(page);
counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
if (!counts) {
free_page((unsigned long)pages);
return -ENOMEM;
}
/* Allocate all the pages before disabling interrupts */
i = 0;
while (num_pages) {
/* Find highest order we can actually allocate */
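/* 31 - __builtin_clz() is floor(log2(num_pages)): e.g. num_pages == 300
 * (9 significant bits) picks order 8, a 256-page block, leaving 44
 * pages for later loop iterations.
 */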
order = 31 - __builtin_clz(num_pages);
while (1) {
pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
if (pages[i])
break;
if (!order) {
ret = -ENOMEM;
num_allocations = i;
goto err_free_allocations;
}
--order;
}
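/* Break the high-order allocation into order-0 pages so the error
 * path below can return them individually with __free_page().
 */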
split_page(pages[i], order);
counts[i] = 1 << order;
num_pages -= counts[i];
i++;
}
num_allocations = i;
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
input_page->partition_id = partition_id;
/* Populate gpa_page_list - these will fit on the input page */
for (i = 0, page_count = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
for (j = 0; j < counts[i]; ++j, ++page_count)
input_page->gpa_page_list[page_count] = base_pfn + j;
}
status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
page_count, 0, input_page, NULL);
local_irq_restore(flags);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
pr_err("Failed to deposit pages: %lld\n", status);
ret = status;
goto err_free_allocations;
}
ret = 0;
goto free_buf;
err_free_allocations:
for (i = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
for (j = 0; j < counts[i]; ++j)
__free_page(pfn_to_page(base_pfn + j));
}
free_buf:
free_page((unsigned long)pages);
kfree(counts);
return ret;
}
int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
{
struct hv_add_logical_processor_in *input;
struct hv_add_logical_processor_out *output;
u64 status;
unsigned long flags;
int ret = 0;
int pxm = node_to_pxm(node);
/*
* When adding a logical processor, the hypervisor may return
* HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
* pages and retry.
*/
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
/* We don't do anything with the output right now */
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
input->lp_index = lp_index;
input->apic_id = apic_id;
input->flags = 0;
input->proximity_domain_info.domain_id = pxm;
input->proximity_domain_info.flags.reserved = 0;
input->proximity_domain_info.flags.proximity_info_valid = 1;
input->proximity_domain_info.flags.proximity_preferred = 1;
status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
input, output);
local_irq_restore(flags);
status &= HV_HYPERCALL_RESULT_MASK;
if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
lp_index, apic_id, status);
ret = status;
}
break;
}
ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
{
struct hv_create_vp *input;
u64 status;
unsigned long irq_flags;
int ret = 0;
int pxm = node_to_pxm(node);
/* Root VPs don't seem to need pages deposited */
if (partition_id != hv_current_partition_id) {
/* The value 90 is empirically determined. It may change. */
ret = hv_call_deposit_pages(node, partition_id, 90);
if (ret)
return ret;
}
do {
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
input->partition_id = partition_id;
input->vp_index = vp_index;
input->flags = flags;
input->subnode_type = HvSubnodeAny;
if (node != NUMA_NO_NODE) {
input->proximity_domain_info.domain_id = pxm;
input->proximity_domain_info.flags.reserved = 0;
input->proximity_domain_info.flags.proximity_info_valid = 1;
input->proximity_domain_info.flags.proximity_preferred = 1;
} else {
input->proximity_domain_info.as_uint64 = 0;
}
status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
local_irq_restore(irq_flags);
status &= HV_HYPERCALL_RESULT_MASK;
if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
vp_index, flags, status);
ret = status;
}
break;
}
ret = hv_call_deposit_pages(node, partition_id, 1);
} while (!ret);
return ret;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
*
* Authors:
* Sunil Muthuswamy <sunilmut@microsoft.com>
* Wei Liu <wei.liu@kernel.org>
*/
#include <linux/pci.h>
#include <linux/irq.h>
#include <asm/mshyperv.h>
static int hv_map_interrupt(union hv_device_id device_id, bool level,
int cpu, int vector, struct hv_interrupt_entry *entry)
{
struct hv_input_map_device_interrupt *input;
struct hv_output_map_device_interrupt *output;
struct hv_device_interrupt_descriptor *intr_desc;
unsigned long flags;
u64 status;
int nr_bank, var_size;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
intr_desc = &input->interrupt_descriptor;
memset(input, 0, sizeof(*input));
input->partition_id = hv_current_partition_id;
input->device_id = device_id.as_uint64;
intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
intr_desc->vector_count = 1;
intr_desc->target.vector = vector;
if (level)
intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
else
intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
intr_desc->target.vp_set.valid_bank_mask = 0;
intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
if (nr_bank < 0) {
local_irq_restore(flags);
pr_err("%s: unable to generate VP set\n", __func__);
return EINVAL;
}
intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
/*
* var-sized hypercall, var-size starts after vp_mask (thus
* vp_set.format does not count, but vp_set.valid_bank_mask
* does).
*/
var_size = nr_bank + 1;
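/* e.g. a single target CPU in the first 64-VP bank: nr_bank == 1,
 * so var_size == 2 (valid_bank_mask plus one bank word).
 */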
status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
input, output);
*entry = output->interrupt_entry;
local_irq_restore(flags);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
return status & HV_HYPERCALL_RESULT_MASK;
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
{
unsigned long flags;
struct hv_input_unmap_device_interrupt *input;
struct hv_interrupt_entry *intr_entry;
u64 status;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
intr_entry = &input->interrupt_entry;
input->partition_id = hv_current_partition_id;
input->device_id = id;
*intr_entry = *old_entry;
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
return status & HV_HYPERCALL_RESULT_MASK;
}
#ifdef CONFIG_PCI_MSI
struct rid_data {
struct pci_dev *bridge;
u32 rid;
};
static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
{
struct rid_data *rd = data;
u8 bus = PCI_BUS_NUM(rd->rid);
if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
rd->bridge = pdev;
rd->rid = alias;
}
return 0;
}
static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
{
union hv_device_id dev_id;
struct rid_data data = {
.bridge = NULL,
.rid = PCI_DEVID(dev->bus->number, dev->devfn)
};
pci_for_each_dma_alias(dev, get_rid_cb, &data);
dev_id.as_uint64 = 0;
dev_id.device_type = HV_DEVICE_TYPE_PCI;
dev_id.pci.segment = pci_domain_nr(dev->bus);
dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
dev_id.pci.bdf.device = PCI_SLOT(data.rid);
dev_id.pci.bdf.function = PCI_FUNC(data.rid);
dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
if (data.bridge) {
int pos;
/*
* Microsoft Hypervisor requires a bus range when the bridge is
* running in PCI-X mode.
*
* To distinguish conventional vs PCI-X bridge, we can check
* the bridge's PCI-X Secondary Status Register, Secondary Bus
* Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
* Specification Revision 1.0 5.2.2.1.3.
*
* Value zero means it is in conventional mode, otherwise it is
* in PCI-X mode.
*/
pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
if (pos) {
u16 status;
pci_read_config_word(data.bridge, pos +
PCI_X_BRIDGE_SSTATUS, &status);
if (status & PCI_X_SSTATUS_FREQ) {
/* Non-zero, PCI-X mode */
u8 sec_bus, sub_bus;
dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
}
}
}
return dev_id;
}
static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
struct hv_interrupt_entry *entry)
{