Commit 6078a454, authored by Jean-Philippe Brucker, committed by Will Deacon
Browse files

Add PCI device passthrough using VFIO



Assigning devices using VFIO allows the guest to have direct access to the
device, whilst filtering accesses to sensitive areas by trapping config
space accesses and mapping DMA with an IOMMU.

This patch adds a new option to lkvm run: --vfio-pci=<BDF>. Before
assigning a device to a VM, some preparation is required. As described in
Linux Documentation/vfio.txt, the device driver needs to be changed to
vfio-pci:

  $ dev=0000:00:00.0

  $ echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
  $ echo vfio-pci > /sys/bus/pci/devices/$dev/driver_override
  $ echo $dev > /sys/bus/pci/drivers_probe

Adding --vfio-pci=$dev to lkvm-run will pass the device to the guest.
Multiple devices can be passed to the guest by adding more --vfio-pci
parameters.

This patch only implements PCI with INTx. MSI-X routing will be added in a
subsequent patch, and at some point we might add support for passing
platform devices to guests.
Reviewed-by: Punit Agrawal <punit.agrawal@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
parent ac70b5aa
......@@ -59,6 +59,8 @@ OBJS += main.o
OBJS += mmio.o
OBJS += pci.o
OBJS += term.o
OBJS += vfio/core.o
OBJS += vfio/pci.o
OBJS += virtio/blk.o
OBJS += virtio/scsi.o
OBJS += virtio/console.o
......
#include "kvm/devices.h"
#include "kvm/fdt.h"
#include "kvm/kvm.h"
#include "kvm/of_pci.h"
#include "kvm/pci.h"
#include "kvm/util.h"
......
......@@ -146,6 +146,11 @@ void kvm_run_set_wrapper_sandbox(void)
OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \
" DHCP in rootfs mode"), \
\
OPT_GROUP("VFIO options:"), \
OPT_CALLBACK('\0', "vfio-pci", NULL, "[domain:]bus:dev.fn", \
"Assign a PCI device to the virtual machine", \
vfio_device_parser, kvm), \
\
OPT_GROUP("Debug options:"), \
OPT_BOOLEAN('\0', "debug", &do_debug_print, \
"Enable debug messages"), \
......
......@@ -2,6 +2,7 @@
#define KVM_CONFIG_H_
#include "kvm/disk-image.h"
#include "kvm/vfio.h"
#include "kvm/kvm-config-arch.h"
#define DEFAULT_KVM_DEV "/dev/kvm"
......@@ -20,9 +21,11 @@
struct kvm_config {
struct kvm_config_arch arch;
struct disk_image_params disk_image[MAX_DISK_IMAGES];
struct vfio_device_params *vfio_devices;
u64 ram_size;
u8 image_count;
u8 num_net_devices;
u8 num_vfio_devices;
bool virtio_rng;
int active_console;
int debug_iodelay;
......
......@@ -7,7 +7,6 @@
#include <endian.h>
#include "kvm/devices.h"
#include "kvm/kvm.h"
#include "kvm/msi.h"
#include "kvm/fdt.h"
......@@ -22,6 +21,8 @@
#define PCI_IO_SIZE 0x100
#define PCI_CFG_SIZE (1ULL << 24)
struct kvm;
union pci_config_address {
struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
......
#ifndef KVM__VFIO_H
#define KVM__VFIO_H
#include "kvm/parse-options.h"
#include "kvm/pci.h"
#include <linux/vfio.h>
#define vfio_dev_err(vdev, fmt, ...) \
pr_err("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
#define vfio_dev_warn(vdev, fmt, ...) \
pr_warning("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
#define vfio_dev_info(vdev, fmt, ...) \
pr_info("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
#define vfio_dev_dbg(vdev, fmt, ...) \
pr_debug("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
#define vfio_dev_die(vdev, fmt, ...) \
die("%s: " fmt, (vdev)->params->name, ##__VA_ARGS__)
/* Currently limited by num_vfio_devices */
/*
 * NOTE(review): num_vfio_devices is a u8 in struct kvm_config, so a count of
 * 256 would wrap to 0 — confirm whether the limit should be 255.
 */
#define MAX_VFIO_DEVICES 256
/* Bus types a device can be assigned from; only PCI is implemented so far. */
enum vfio_device_type {
VFIO_DEVICE_PCI,
};
/* PCI-specific state: the config space header emulated for the guest. */
struct vfio_pci_device {
struct pci_device_header hdr;
};
/* One memory region of an assigned device (e.g. a PCI BAR). */
struct vfio_region {
struct vfio_region_info info; /* region info reported by the kernel */
u64 guest_phys_addr; /* where the region is mapped in the guest */
void *host_addr; /* host mmap of the region, NULL when not mmap'd */
};
/* One device assigned to the guest. */
struct vfio_device {
struct device_header dev_hdr;
struct vfio_device_params *params; /* user-supplied parameters */
struct vfio_group *group; /* IOMMU group this device belongs to */
int fd; /* VFIO device fd, from VFIO_GROUP_GET_DEVICE_FD */
struct vfio_device_info info;
struct vfio_region *regions; /* array of info.num_regions entries */
char *sysfs_path; /* /sys/bus/<bus>/devices/<name> */
struct vfio_pci_device pci;
};
/* Parameters parsed from a --vfio-pci command line option. */
struct vfio_device_params {
char *name; /* device name in sysfs, e.g. "0000:00:1f.0" */
const char *bus; /* sysfs bus name, e.g. "pci" */
enum vfio_device_type type;
};
/* An IOMMU group; shared by all devices that are isolated together. */
struct vfio_group {
unsigned long id; /* iommu_group number in sysfs */
int fd;
int refs; /* number of vfio_devices using this group */
struct list_head list;
};
int vfio_device_parser(const struct option *opt, const char *arg, int unset);
int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
struct vfio_region *region);
void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region);
int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *device);
void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
#endif /* KVM__VFIO_H */
#include "kvm/kvm.h"
#include "kvm/vfio.h"
#include <linux/list.h>
#define VFIO_DEV_DIR "/dev/vfio"
#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
static int vfio_container;
static LIST_HEAD(vfio_groups);
static struct vfio_device *vfio_devices;
/*
 * Parse a PCI address of the form [domain:]bus:dev.fn and fill @dev with the
 * canonical "dddd:bb:dd.f" sysfs name on the "pci" bus.
 *
 * Returns 0 on success, -EINVAL for a malformed address, -ENOMEM if the name
 * buffer cannot be allocated.
 */
static int vfio_device_pci_parser(const struct option *opt, char *arg,
				  struct vfio_device_params *dev)
{
	unsigned int domain, bus, slot, func;
	int matched;

	/* Try the fully-qualified form first, then fall back to bus:dev.fn. */
	matched = sscanf(arg, "%4x:%2x:%2x.%1x", &domain, &bus, &slot, &func);
	if (matched < 4) {
		domain = 0;
		matched = sscanf(arg, "%2x:%2x.%1x", &bus, &slot, &func);
		if (matched < 3) {
			pr_err("Invalid device identifier %s", arg);
			return -EINVAL;
		}
	}

	dev->type = VFIO_DEVICE_PCI;
	dev->bus = "pci";

	/* "dddd:bb:dd.f" is 12 characters plus the terminating NUL. */
	dev->name = malloc(13);
	if (!dev->name)
		return -ENOMEM;

	snprintf(dev->name, 13, "%04x:%02x:%02x.%x", domain, bus, slot, func);

	return 0;
}
/*
 * Option callback for --vfio-pci=<device>. Appends one entry to
 * kvm->cfg.vfio_devices, to be picked up later by vfio__init().
 *
 * Returns 0 on success, -EINVAL on a malformed argument or unknown option,
 * -ENOMEM when allocation fails.
 */
int vfio_device_parser(const struct option *opt, const char *arg, int unset)
{
int ret = -EINVAL;
/* Number of devices successfully parsed so far, across all invocations. */
static int idx = 0;
struct kvm *kvm = opt->ptr;
struct vfio_device_params *dev, *devs;
/* Work on a copy: strtok() below modifies the string in place. */
char *cur, *buf = strdup(arg);
if (!buf)
return -ENOMEM;
if (idx >= MAX_VFIO_DEVICES) {
pr_warning("Too many VFIO devices");
goto out_free_buf;
}
/* Grow the device array by one slot. */
devs = realloc(kvm->cfg.vfio_devices, sizeof(*dev) * (idx + 1));
if (!devs) {
ret = -ENOMEM;
goto out_free_buf;
}
kvm->cfg.vfio_devices = devs;
dev = &devs[idx];
/* Only the first comma-separated token (the device address) is used. */
cur = strtok(buf, ",");
if (!cur)
goto out_free_buf;
/* Dispatch on the option name; only "vfio-pci" exists for now. */
if (!strcmp(opt->long_name, "vfio-pci"))
ret = vfio_device_pci_parser(opt, cur, dev);
else
ret = -EINVAL;
/* The new slot only counts once its bus-specific parser succeeded. */
if (!ret)
kvm->cfg.num_vfio_devices = ++idx;
out_free_buf:
free(buf);
return ret;
}
/*
 * mmap a device region into the host and register it with KVM so the guest
 * can access it without trapping.
 *
 * Regions the kernel doesn't allow to be mmap'd (e.g. PCI I/O BARs) are
 * skipped and reported as success. Returns 0 on success or skip, a negative
 * error code otherwise.
 */
int vfio_map_region(struct kvm *kvm, struct vfio_device *vdev,
		    struct vfio_region *region)
{
	void *base;
	int ret, prot = 0;
	/* KVM needs page-aligned regions */
	u64 map_size = ALIGN(region->info.size, PAGE_SIZE);

	/*
	 * We don't want to mess about trapping config accesses, so require that
	 * they can be mmap'd. Note that for PCI, this precludes the use of I/O
	 * BARs in the guest (we will hide them from Configuration Space, which
	 * is trapped).
	 */
	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
		vfio_dev_info(vdev, "ignoring region %u, as it can't be mmap'd",
			      region->info.index);
		return 0;
	}

	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
		prot |= PROT_READ;
	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
		prot |= PROT_WRITE;

	base = mmap(NULL, region->info.size, prot, MAP_SHARED, vdev->fd,
		    region->info.offset);
	if (base == MAP_FAILED) {
		ret = -errno;
		vfio_dev_err(vdev, "failed to mmap region %u (0x%llx bytes)",
			     region->info.index, region->info.size);
		return ret;
	}
	region->host_addr = base;

	ret = kvm__register_dev_mem(kvm, region->guest_phys_addr, map_size,
				    region->host_addr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register region with KVM");
		/* Don't leak the mapping when KVM rejects the region. */
		munmap(region->host_addr, region->info.size);
		region->host_addr = NULL;
		return ret;
	}

	return 0;
}
void vfio_unmap_region(struct kvm *kvm, struct vfio_region *region)
{
munmap(region->host_addr, region->info.size);
}
/*
 * Open the VFIO fd for @vdev, query its device info, reset it when supported,
 * and run the bus-specific (PCI) setup.
 *
 * Returns 0 on success, or when the device has no fd (e.g. a PCI bridge);
 * a negative error code otherwise.
 */
static int vfio_configure_device(struct kvm *kvm, struct vfio_device *vdev)
{
int ret;
struct vfio_group *group = vdev->group;
vdev->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
vdev->params->name);
if (vdev->fd < 0) {
vfio_dev_warn(vdev, "failed to get fd");
/* The device might be a bridge without an fd */
return 0;
}
vdev->info.argsz = sizeof(vdev->info);
if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
ret = -errno;
vfio_dev_err(vdev, "failed to get info");
goto err_close_device;
}
/* Put the device into a known state; a failed reset is not fatal. */
if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
vfio_dev_warn(vdev, "failed to reset device");
/* calloc: unmapped regions keep a NULL host_addr (see vfio_map_region). */
vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions));
if (!vdev->regions) {
ret = -ENOMEM;
goto err_close_device;
}
/* Now for the bus-specific initialization... */
switch (vdev->params->type) {
case VFIO_DEVICE_PCI:
BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI));
ret = vfio_pci_setup_device(kvm, vdev);
break;
default:
BUG_ON(1);
ret = -EINVAL;
}
if (ret)
goto err_free_regions;
vfio_dev_info(vdev, "assigned to device number 0x%x in group %lu",
vdev->dev_hdr.dev_num, group->id);
return 0;
err_free_regions:
free(vdev->regions);
err_close_device:
close(vdev->fd);
return ret;
}
/* Configure every assigned device, stopping at the first failure. */
static int vfio_configure_devices(struct kvm *kvm)
{
	int idx, err;

	for (idx = 0; idx < kvm->cfg.num_vfio_devices; idx++) {
		err = vfio_configure_device(kvm, &vfio_devices[idx]);
		if (err)
			return err;
	}

	return 0;
}
/*
 * Pick the IOMMU backend for the container, preferring the newer type-1 v2
 * API over plain type 1. Returns the chosen VFIO IOMMU type, or -ENODEV when
 * the kernel supports neither.
 */
static int vfio_get_iommu_type(void)
{
	const int types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU };
	unsigned int i;

	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++)
		if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, types[i]))
			return types[i];

	return -ENODEV;
}
/*
 * kvm__for_each_mem_bank() callback: make one bank of guest RAM visible to
 * the device through the IOMMU. Returns 0 on success, -errno on failure.
 */
static int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
{
	int err;
	struct vfio_iommu_type1_dma_map dma_map = {
		.argsz	= sizeof(dma_map),
		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr	= (unsigned long)bank->host_addr,
		.iova	= (u64)bank->guest_phys_addr,
		.size	= bank->size,
	};

	/* Map the guest memory for DMA (i.e. provide isolation) */
	if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map) == 0)
		return 0;

	/* Capture errno before pr_err() can clobber it. */
	err = -errno;
	pr_err("Failed to map 0x%llx -> 0x%llx (%llu) for DMA",
	       dma_map.iova, dma_map.vaddr, dma_map.size);

	return err;
}
/*
 * kvm__for_each_mem_bank() callback: tear down the DMA mapping of one RAM
 * bank. Failures are deliberately ignored; always returns 0.
 */
static int vfio_unmap_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
{
	struct vfio_iommu_type1_dma_unmap dma_unmap;

	dma_unmap = (struct vfio_iommu_type1_dma_unmap) {
		.argsz	= sizeof(dma_unmap),
		.iova	= bank->guest_phys_addr,
		.size	= bank->size,
	};

	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);

	return 0;
}
/*
 * Open /dev/vfio/<id>, check that the group is viable (all of its devices
 * are bound to VFIO) and attach it to the container. On success the group is
 * added to the global vfio_groups list with one reference.
 *
 * Returns the new group, or NULL on error.
 */
static struct vfio_group *vfio_group_create(struct kvm *kvm, unsigned long id)
{
	int ret;
	struct vfio_group *group;
	char group_node[PATH_MAX];
	struct vfio_group_status group_status = {
		.argsz = sizeof(group_status),
	};

	group = calloc(1, sizeof(*group));
	if (!group)
		return NULL;

	group->id	= id;
	group->refs	= 1;

	ret = snprintf(group_node, PATH_MAX, VFIO_DEV_DIR "/%lu", id);
	/*
	 * snprintf returns the length it needed, so >= PATH_MAX means
	 * truncation (the old == check missed longer paths). Also free the
	 * group here instead of leaking it with a bare return NULL.
	 */
	if (ret < 0 || ret >= PATH_MAX)
		goto err_free_group;

	group->fd = open(group_node, O_RDWR);
	if (group->fd < 0) {
		pr_err("Failed to open IOMMU group %s", group_node);
		goto err_free_group;
	}

	if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
		pr_err("Failed to determine status of IOMMU group %lu", id);
		goto err_close_group;
	}

	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		pr_err("IOMMU group %lu is not viable", id);
		goto err_close_group;
	}

	if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
		pr_err("Failed to add IOMMU group %lu to VFIO container", id);
		goto err_close_group;
	}

	list_add(&group->list, &vfio_groups);

	return group;

err_close_group:
	close(group->fd);
err_free_group:
	free(group);

	return NULL;
}
/* Drop one reference; tear the group down when the last user is gone. */
static void vfio_group_exit(struct kvm *kvm, struct vfio_group *group)
{
	group->refs--;
	if (group->refs)
		return;

	/* Detach from the container before closing the group fd. */
	ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER);

	list_del(&group->list);
	close(group->fd);
	free(group);
}
/*
 * Find the IOMMU group of @vdev by following its sysfs iommu_group symlink,
 * then either take a reference on an already-opened group or create it.
 *
 * Returns the group, or NULL on error.
 */
static struct vfio_group *
vfio_group_get_for_dev(struct kvm *kvm, struct vfio_device *vdev)
{
	int dirfd;
	ssize_t ret;
	char *group_name;
	unsigned long group_id;
	char group_path[PATH_MAX];
	struct vfio_group *group = NULL;

	/* Find IOMMU group for this device */
	dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY);
	if (dirfd < 0) {
		vfio_dev_err(vdev, "failed to open '%s'", vdev->sysfs_path);
		return NULL;
	}

	ret = readlinkat(dirfd, "iommu_group", group_path, PATH_MAX);
	if (ret < 0) {
		vfio_dev_err(vdev, "no iommu_group");
		goto out_close;
	}
	if (ret == PATH_MAX)
		goto out_close;

	/* readlinkat() does not NUL-terminate. */
	group_path[ret] = '\0';

	/* The link target ends in the group number, e.g. .../iommu_groups/3 */
	group_name = basename(group_path);
	errno = 0;
	group_id = strtoul(group_name, NULL, 10);
	if (errno)
		goto out_close;

	/* Devices in the same group share a single vfio_group. */
	list_for_each_entry(group, &vfio_groups, list) {
		if (group->id == group_id) {
			group->refs++;
			/* was: return group, which leaked dirfd */
			goto out_close;
		}
	}

	group = vfio_group_create(kvm, group_id);

out_close:
	close(dirfd);
	return group;
}
/*
 * Resolve the device's sysfs path and attach it to its IOMMU group.
 * Returns 0 on success, a negative error code otherwise.
 */
static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	char dev_path[PATH_MAX];
	struct vfio_group *group;

	ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s",
		       vdev->params->bus, vdev->params->name);
	/*
	 * snprintf returns the length it needed, so >= PATH_MAX means
	 * truncation (the old == check missed longer paths).
	 */
	if (ret < 0 || ret >= PATH_MAX)
		return -EINVAL;

	vdev->sysfs_path = strndup(dev_path, PATH_MAX);
	if (!vdev->sysfs_path)
		return -errno;

	group = vfio_group_get_for_dev(kvm, vdev);
	if (!group) {
		free(vdev->sysfs_path);
		/* Guard vfio_device_exit() against a double free. */
		vdev->sysfs_path = NULL;
		return -EINVAL;
	}

	vdev->group = group;

	return 0;
}
/* Release everything vfio_device_init()/vfio_configure_device() set up. */
static void vfio_device_exit(struct kvm *kvm, struct vfio_device *vdev)
{
	vfio_group_exit(kvm, vdev->group);

	/* Bus-specific teardown first; only PCI has one today. */
	if (vdev->params->type == VFIO_DEVICE_PCI)
		vfio_pci_teardown_device(kvm, vdev);
	else
		vfio_dev_warn(vdev, "no teardown function for device");

	close(vdev->fd);

	free(vdev->regions);
	free(vdev->sysfs_path);
}
/*
 * Open the VFIO container, check the API version, select an IOMMU type,
 * attach every requested device's group, and finally map all guest RAM for
 * DMA. Returns 0 on success, a negative error code otherwise.
 */
static int vfio_container_init(struct kvm *kvm)
{
	int api, i, ret, iommu_type;	/* was: stray double semicolon */

	/* Create a container for our IOMMU groups */
	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
	if (vfio_container == -1) {
		/* was: returned positive errno, unlike every other path */
		ret = -errno;
		pr_err("Failed to open %s", VFIO_DEV_NODE);
		return ret;
	}

	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
	if (api != VFIO_API_VERSION) {
		pr_err("Unknown VFIO API version %d", api);
		return -ENODEV;
	}

	iommu_type = vfio_get_iommu_type();
	if (iommu_type < 0) {
		pr_err("VFIO type-1 IOMMU not supported on this platform");
		return iommu_type;
	}

	/* Create groups for our devices and add them to the container */
	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
		vfio_devices[i].params = &kvm->cfg.vfio_devices[i];

		ret = vfio_device_init(kvm, &vfio_devices[i]);
		if (ret)
			return ret;
	}

	/* Finalise the container */
	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
		ret = -errno;
		pr_err("Failed to set IOMMU type %d for VFIO container",
		       iommu_type);
		return ret;
	} else {
		pr_info("Using IOMMU type %d for VFIO container", iommu_type);
	}

	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_map_mem_bank,
				      NULL);
}
/*
 * Device-init hook: set up VFIO passthrough when the user requested any
 * --vfio-pci devices. Returns 0 on success or when there is nothing to do.
 */
static int vfio__init(struct kvm *kvm)
{
	int err;

	if (!kvm->cfg.num_vfio_devices)
		return 0;

	vfio_devices = calloc(kvm->cfg.num_vfio_devices, sizeof(*vfio_devices));
	if (!vfio_devices)
		return -ENOMEM;

	err = vfio_container_init(kvm);
	if (err)
		return err;

	return vfio_configure_devices(kvm);
}
dev_base_init(vfio__init);
static int vfio__exit(struct kvm *kvm)
{
int i;
if (!kvm->cfg.num_vfio_devices)
return 0;
for (i = 0; i < kvm->cfg.num_vfio_devices; i++)
vfio_device_exit(kvm, &vfio_devices[i]);
free(vfio_devices);
kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, vfio_unmap_mem_bank, NULL);
close(vfio_container);