Commit a62a8ef9 authored by Stefan Hajnoczi's avatar Stefan Hajnoczi Committed by Miklos Szeredi
Browse files

virtio-fs: add virtiofs filesystem



Add a basic file system module for virtio-fs.  This does not yet contain
shared data support between host and guest or metadata coherency speedups.
However it is already significantly faster than virtio-9p.

Design Overview
===============

With the goal of designing something with better performance and local file
system semantics, a bunch of ideas were proposed.

 - Use fuse protocol (instead of 9p) for communication between guest and
   host.  Guest kernel will be fuse client and a fuse server will run on
   host to serve the requests.

 - For data access inside guest, mmap portion of file in QEMU address space
   and guest accesses this memory using dax.  That way guest page cache is
   bypassed and there is only one copy of data (on host).  This will also
   enable mmap(MAP_SHARED) between guests.

 - For metadata coherency, there is a shared memory region which contains
   version number associated with metadata and any guest changing metadata
   updates version number and other guests refresh metadata on next access.
   This is yet to be implemented.

How virtio-fs differs from existing approaches
==============================================

The unique idea behind virtio-fs is to take advantage of the co-location of
the virtual machine and hypervisor to avoid communication (vmexits).

DAX allows file contents to be accessed without communication with the
hypervisor.  The shared memory region for metadata avoids communication in
the common case where metadata is unchanged.

By replacing expensive communication with cheaper shared memory accesses,
we expect to achieve better performance than approaches based on network
file system protocols.  In addition, this also makes it easier to achieve
local file system semantics (coherency).

These techniques are not applicable to network file system protocols since
the communications channel is bypassed by taking advantage of shared memory
on a local machine.  This is why we decided to build virtio-fs rather than
focus on 9P or NFS.

Caching Modes
=============

Like virtio-9p, different caching modes are supported which determine the
coherency level as well.  The “cache=FOO” and “writeback” options control
the level of coherence between the guest and host filesystems.

 - cache=none
   metadata, data and pathname lookup are not cached in guest.  They are
   always fetched from host and any changes are immediately pushed to host.

 - cache=always
   metadata, data and pathname lookup are cached in guest and never expire.

 - cache=auto
   metadata and pathname lookup cache expires after a configured amount of
   time (default is 1 second).  Data is cached while the file is open
   (close to open consistency).

 - writeback/no_writeback
   These options control the writeback strategy.  If writeback is disabled,
   then normal writes will immediately be synchronized with the host fs.
   If writeback is enabled, then writes may be cached in the guest until
   the file is closed or an fsync(2) performed.  This option has no effect
   on mmap-ed writes or writes going through the DAX mechanism.
Signed-off-by: default avatarStefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: default avatarVivek Goyal <vgoyal@redhat.com>
Acked-by: default avatarMichael S. Tsirkin <mst@redhat.com>
Signed-off-by: default avatarMiklos Szeredi <mszeredi@redhat.com>
parent 2d1d25d0
......@@ -27,3 +27,14 @@ config CUSE
If you want to develop or use a userspace character device
based on CUSE, answer Y or M.
config VIRTIO_FS
tristate "Virtio Filesystem"
depends on FUSE_FS
select VIRTIO
help
The Virtio Filesystem allows guests to mount file systems from the
host.
If you want to share files between guests or with the host, answer Y
or M.
......@@ -5,5 +5,6 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
......@@ -353,6 +353,10 @@ struct fuse_req {
/** Used to wake up the task waiting for completion of request*/
wait_queue_head_t waitq;
#if IS_ENABLED(CONFIG_VIRTIO_FS)
/** virtio-fs's physically contiguous buffer for in and out args */
void *argbuf;
#endif
};
struct fuse_iqueue;
......@@ -383,6 +387,11 @@ struct fuse_iqueue_ops {
*/
void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
__releases(fiq->lock);
/**
* Clean up when fuse_iqueue is destroyed
*/
void (*release)(struct fuse_iqueue *fiq);
};
/** /dev/fuse input queue operations */
......
......@@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
fc->release(fc);
......
// SPDX-License-Identifier: GPL-2.0
/*
* virtio-fs: Virtio Filesystem
* Copyright (C) 2018 Red Hat, Inc.
*/
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
#include <linux/fs_context.h>
#include <linux/highmem.h>
#include "fuse_i.h"
/* List of virtio-fs device instances and a lock for the list. Also provides
* mutual exclusion in device removal and mounting path
*/
static DEFINE_MUTEX(virtio_fs_mutex);
static LIST_HEAD(virtio_fs_instances);
enum {
VQ_HIPRIO,
VQ_REQUEST
};
/* Per-virtqueue state */
struct virtio_fs_vq {
spinlock_t lock;
struct virtqueue *vq; /* protected by ->lock */
struct work_struct done_work;
struct list_head queued_reqs;
struct delayed_work dispatch_work;
struct fuse_dev *fud;
bool connected;
long in_flight;
char name[24];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
struct virtio_fs {
struct kref refcount;
struct list_head list; /* on virtio_fs_instances */
char *tag;
struct virtio_fs_vq *vqs;
unsigned int nvqs; /* number of virtqueues */
unsigned int num_request_queues; /* number of request queues */
};
struct virtio_fs_forget {
struct fuse_in_header ih;
struct fuse_forget_in arg;
/* This request can be temporarily queued on virt queue */
struct list_head list;
};
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
return &fs->vqs[vq->index];
}
static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
{
return &vq_to_fsvq(vq)->fud->pq;
}
static void release_virtio_fs_obj(struct kref *ref)
{
struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
kfree(vfs->vqs);
kfree(vfs);
}
/* Make sure virtiofs_mutex is held */
static void virtio_fs_put(struct virtio_fs *fs)
{
kref_put(&fs->refcount, release_virtio_fs_obj);
}
static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
{
struct virtio_fs *vfs = fiq->priv;
mutex_lock(&virtio_fs_mutex);
virtio_fs_put(vfs);
mutex_unlock(&virtio_fs_mutex);
}
static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
{
WARN_ON(fsvq->in_flight < 0);
/* Wait for in flight requests to finish.*/
while (1) {
spin_lock(&fsvq->lock);
if (!fsvq->in_flight) {
spin_unlock(&fsvq->lock);
break;
}
spin_unlock(&fsvq->lock);
/* TODO use completion instead of timeout */
usleep_range(1000, 2000);
}
flush_work(&fsvq->done_work);
flush_delayed_work(&fsvq->dispatch_work);
}
static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq)
{
struct virtio_fs_forget *forget;
spin_lock(&fsvq->lock);
while (1) {
forget = list_first_entry_or_null(&fsvq->queued_reqs,
struct virtio_fs_forget, list);
if (!forget)
break;
list_del(&forget->list);
kfree(forget);
}
spin_unlock(&fsvq->lock);
}
static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
{
struct virtio_fs_vq *fsvq;
int i;
for (i = 0; i < fs->nvqs; i++) {
fsvq = &fs->vqs[i];
if (i == VQ_HIPRIO)
drain_hiprio_queued_reqs(fsvq);
virtio_fs_drain_queue(fsvq);
}
}
static void virtio_fs_start_all_queues(struct virtio_fs *fs)
{
struct virtio_fs_vq *fsvq;
int i;
for (i = 0; i < fs->nvqs; i++) {
fsvq = &fs->vqs[i];
spin_lock(&fsvq->lock);
fsvq->connected = true;
spin_unlock(&fsvq->lock);
}
}
/* Add a new instance to the list or return -EEXIST if tag name exists*/
static int virtio_fs_add_instance(struct virtio_fs *fs)
{
struct virtio_fs *fs2;
bool duplicate = false;
mutex_lock(&virtio_fs_mutex);
list_for_each_entry(fs2, &virtio_fs_instances, list) {
if (strcmp(fs->tag, fs2->tag) == 0)
duplicate = true;
}
if (!duplicate)
list_add_tail(&fs->list, &virtio_fs_instances);
mutex_unlock(&virtio_fs_mutex);
if (duplicate)
return -EEXIST;
return 0;
}
/* Return the virtio_fs with a given tag, or NULL */
static struct virtio_fs *virtio_fs_find_instance(const char *tag)
{
struct virtio_fs *fs;
mutex_lock(&virtio_fs_mutex);
list_for_each_entry(fs, &virtio_fs_instances, list) {
if (strcmp(fs->tag, tag) == 0) {
kref_get(&fs->refcount);
goto found;
}
}
fs = NULL; /* not found */
found:
mutex_unlock(&virtio_fs_mutex);
return fs;
}
static void virtio_fs_free_devs(struct virtio_fs *fs)
{
unsigned int i;
for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
if (!fsvq->fud)
continue;
fuse_dev_free(fsvq->fud);
fsvq->fud = NULL;
}
}
/* Read filesystem name from virtio config into fs->tag (must kfree()). */
static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
{
char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
char *end;
size_t len;
virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
&tag_buf, sizeof(tag_buf));
end = memchr(tag_buf, '\0', sizeof(tag_buf));
if (end == tag_buf)
return -EINVAL; /* empty tag */
if (!end)
end = &tag_buf[sizeof(tag_buf)];
len = end - tag_buf;
fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
if (!fs->tag)
return -ENOMEM;
memcpy(fs->tag, tag_buf, len);
fs->tag[len] = '\0';
return 0;
}
/* Work function for hiprio completion */
static void virtio_fs_hiprio_done_work(struct work_struct *work)
{
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
done_work);
struct virtqueue *vq = fsvq->vq;
/* Free completed FUSE_FORGET requests */
spin_lock(&fsvq->lock);
do {
unsigned int len;
void *req;
virtqueue_disable_cb(vq);
while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
kfree(req);
fsvq->in_flight--;
}
} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
spin_unlock(&fsvq->lock);
}
static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
{
}
static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
{
struct virtio_fs_forget *forget;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
dispatch_work.work);
struct virtqueue *vq = fsvq->vq;
struct scatterlist sg;
struct scatterlist *sgs[] = {&sg};
bool notify;
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
while (1) {
spin_lock(&fsvq->lock);
forget = list_first_entry_or_null(&fsvq->queued_reqs,
struct virtio_fs_forget, list);
if (!forget) {
spin_unlock(&fsvq->lock);
return;
}
list_del(&forget->list);
if (!fsvq->connected) {
spin_unlock(&fsvq->lock);
kfree(forget);
continue;
}
sg_init_one(&sg, forget, sizeof(*forget));
/* Enqueue the request */
dev_dbg(&vq->vdev->dev, "%s\n", __func__);
ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
if (ret < 0) {
if (ret == -ENOMEM || ret == -ENOSPC) {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
ret);
list_add_tail(&forget->list,
&fsvq->queued_reqs);
schedule_delayed_work(&fsvq->dispatch_work,
msecs_to_jiffies(1));
} else {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
ret);
kfree(forget);
}
spin_unlock(&fsvq->lock);
return;
}
fsvq->in_flight++;
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
if (notify)
virtqueue_notify(vq);
pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
__func__);
}
}
/* Allocate and copy args into req->argbuf */
static int copy_args_to_argbuf(struct fuse_req *req)
{
struct fuse_args *args = req->args;
unsigned int offset = 0;
unsigned int num_in;
unsigned int num_out;
unsigned int len;
unsigned int i;
num_in = args->in_numargs - args->in_pages;
num_out = args->out_numargs - args->out_pages;
len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
fuse_len_args(num_out, args->out_args);
req->argbuf = kmalloc(len, GFP_ATOMIC);
if (!req->argbuf)
return -ENOMEM;
for (i = 0; i < num_in; i++) {
memcpy(req->argbuf + offset,
args->in_args[i].value,
args->in_args[i].size);
offset += args->in_args[i].size;
}
return 0;
}
/* Copy args out of and free req->argbuf */
static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
{
unsigned int remaining;
unsigned int offset;
unsigned int num_in;
unsigned int num_out;
unsigned int i;
remaining = req->out.h.len - sizeof(req->out.h);
num_in = args->in_numargs - args->in_pages;
num_out = args->out_numargs - args->out_pages;
offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
for (i = 0; i < num_out; i++) {
unsigned int argsize = args->out_args[i].size;
if (args->out_argvar &&
i == args->out_numargs - 1 &&
argsize > remaining) {
argsize = remaining;
}
memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
offset += argsize;
if (i != args->out_numargs - 1)
remaining -= argsize;
}
/* Store the actual size of the variable-length arg */
if (args->out_argvar)
args->out_args[args->out_numargs - 1].size = remaining;
kfree(req->argbuf);
req->argbuf = NULL;
}
/* Work function for request completion */
static void virtio_fs_requests_done_work(struct work_struct *work)
{
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
done_work);
struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_conn *fc = fsvq->fud->fc;
struct virtqueue *vq = fsvq->vq;
struct fuse_req *req;
struct fuse_args_pages *ap;
struct fuse_req *next;
struct fuse_args *args;
unsigned int len, i, thislen;
struct page *page;
LIST_HEAD(reqs);
/* Collect completed requests off the virtqueue */
spin_lock(&fsvq->lock);
do {
virtqueue_disable_cb(vq);
while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
spin_lock(&fpq->lock);
list_move_tail(&req->list, &reqs);
spin_unlock(&fpq->lock);
}
} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
spin_unlock(&fsvq->lock);
/* End requests */
list_for_each_entry_safe(req, next, &reqs, list) {
/*
* TODO verify that server properly follows FUSE protocol
* (oh.uniq, oh.len)
*/
args = req->args;
copy_args_from_argbuf(args, req);
if (args->out_pages && args->page_zeroing) {
len = args->out_args[args->out_numargs - 1].size;
ap = container_of(args, typeof(*ap), args);
for (i = 0; i < ap->num_pages; i++) {
thislen = ap->descs[i].length;
if (len < thislen) {
WARN_ON(ap->descs[i].offset);
page = ap->pages[i];
zero_user_segment(page, len, thislen);
len = 0;
} else {
len -= thislen;
}
}
}
spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
spin_unlock(&fpq->lock);
fuse_request_end(fc, req);
spin_lock(&fsvq->lock);
fsvq->in_flight--;
spin_unlock(&fsvq->lock);
}
}
/* Virtqueue interrupt handler */
static void virtio_fs_vq_done(struct virtqueue *vq)
{
struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
schedule_work(&fsvq->done_work);
}
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
{
struct virtqueue **vqs;
vq_callback_t **callbacks;
const char **names;
unsigned int i;
int ret = 0;
virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
&fs->num_request_queues);
if (fs->num_request_queues == 0)
return -EINVAL;
fs->nvqs = 1 + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
GFP_KERNEL);
names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
if (!vqs || !callbacks || !names) {
ret = -ENOMEM;
goto out;
}
callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
"hiprio");
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
virtio_fs_hiprio_dispatch_work);
spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
spin_lock_init(&fs->vqs[i].lock);
INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
virtio_fs_dummy_dispatch_work);
INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
"requests.%u", i - VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
names[i] = fs->vqs[i].name;
}
ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
if (ret < 0)
goto out;
for (i = 0; i < fs->nvqs; i++)
fs->vqs[i].vq = vqs[i];
virtio_fs_start_all_queues(fs);
out:
kfree(names);
kfree(callbacks);
kfree(vqs);
if (ret)
kfree(fs->vqs);
return ret;
}
/* Free virtqueues (device must already be reset) */
static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
{
vdev->config->del_vqs(vdev);
}
static int virtio_fs_probe(struct virtio_device *vdev)
{
struct virtio_fs *fs;
int ret;
fs = kzalloc(sizeof(*fs), GFP_KERNEL);
if (!fs)
return -ENOMEM;
kref_init(&fs->refcount);
vdev->priv = fs;
ret = virtio_fs_read_tag(vdev, fs);