Commit 6b1e6cc7 authored by Jason Wang's avatar Jason Wang Committed by Michael S. Tsirkin

vhost: new device IOTLB API

This patch tries to implement an device IOTLB for vhost. This could be
used with userspace(qemu) implementation of DMA remapping
to emulate an IOMMU for the guest.

The idea is simple, cache the translation in a software device IOTLB
(which is implemented as an interval tree) in vhost and use vhost_net
file descriptor for reporting IOTLB miss and IOTLB
update/invalidation. When vhost meets an IOTLB miss, the fault
address, size and access can be read from the file. After userspace
finishes the translation, it writes the translated address to the
vhost_net file to update the device IOTLB.

When device IOTLB is enabled by setting VIRTIO_F_IOMMU_PLATFORM all vq
addresses set by ioctl are treated as iova instead of virtual address and
the accessing can only be done through IOTLB instead of direct userspace
memory access. Before each round or vq processing, all vq metadata is
prefetched in device IOTLB to make sure no translation fault happens
during vq processing.

In most cases, virtqueues are contiguous even in virtual address space.
The IOTLB translation for virtqueue itself may make it a little
slower. We might add fast path cache on top of this patch.
Signed-off-by: 's avatarJason Wang <jasowang@redhat.com>
[mst: use virtio feature bit: VHOST_F_DEVICE_IOTLB -> VIRTIO_F_IOMMU_PLATFORM ]
[mst: fix build warnings ]
Signed-off-by: 's avatarMichael S. Tsirkin <mst@redhat.com>
[ weiyj.lk: missing unlock on error ]
Signed-off-by: 's avatarWei Yongjun <weiyj.lk@gmail.com>
parent b2fbd8b0
......@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF)
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
(1ULL << VIRTIO_F_IOMMU_PLATFORM)
};
enum {
......@@ -308,7 +309,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
{
unsigned long uninitialized_var(endtime);
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
out_num, in_num, NULL, NULL);
if (r == vq->num && vq->busyloop_timeout) {
preempt_disable();
......@@ -318,7 +319,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
cpu_relax_lowlatency();
preempt_enable();
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
out_num, in_num, NULL, NULL);
}
return r;
......@@ -351,6 +352,9 @@ static void handle_tx(struct vhost_net *net)
if (!sock)
goto out;
if (!vq_iotlb_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
hdr_size = nvq->vhost_hlen;
......@@ -612,6 +616,10 @@ static void handle_rx(struct vhost_net *net)
sock = vq->private_data;
if (!sock)
goto out;
if (!vq_iotlb_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
vhost_hlen = nvq->vhost_hlen;
......@@ -1080,10 +1088,14 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_lock(&n->dev.mutex);
if ((features & (1 << VHOST_F_LOG_ALL)) &&
!vhost_log_access_ok(&n->dev)) {
mutex_unlock(&n->dev.mutex);
return -EFAULT;
!vhost_log_access_ok(&n->dev))
goto out_unlock;
if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
if (vhost_init_device_iotlb(&n->dev, true))
goto out_unlock;
}
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
mutex_lock(&n->vqs[i].vq.mutex);
n->vqs[i].vq.acked_features = features;
......@@ -1093,6 +1105,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_unlock(&n->dev.mutex);
return 0;
out_unlock:
mutex_unlock(&n->dev.mutex);
return -EFAULT;
}
static long vhost_net_set_owner(struct vhost_net *n)
......@@ -1166,9 +1182,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
}
#endif
static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct vhost_net *n = file->private_data;
struct vhost_dev *dev = &n->dev;
int noblock = file->f_flags & O_NONBLOCK;
return vhost_chr_read_iter(dev, to, noblock);
}
static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct vhost_net *n = file->private_data;
struct vhost_dev *dev = &n->dev;
return vhost_chr_write_iter(dev, from);
}
static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait)
{
struct vhost_net *n = file->private_data;
struct vhost_dev *dev = &n->dev;
return vhost_chr_poll(file, dev, wait);
}
static const struct file_operations vhost_net_fops = {
.owner = THIS_MODULE,
.release = vhost_net_release,
.read_iter = vhost_net_chr_read_iter,
.write_iter = vhost_net_chr_write_iter,
.poll = vhost_net_chr_poll,
.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = vhost_net_compat_ioctl,
......
This diff is collapsed.
......@@ -65,13 +65,15 @@ struct vhost_umem_node {
__u64 last;
__u64 size;
__u64 userspace_addr;
__u64 flags_padding;
__u32 perm;
__u32 flags_padding;
__u64 __subtree_last;
};
struct vhost_umem {
struct rb_root umem_tree;
struct list_head umem_list;
int numem;
};
/* The virtqueue structure describes a queue attached to a device. */
......@@ -119,10 +121,12 @@ struct vhost_virtqueue {
u64 log_addr;
struct iovec iov[UIO_MAXIOV];
struct iovec iotlb_iov[64];
struct iovec *indirect;
struct vring_used_elem *heads;
/* Protected by virtqueue mutex. */
struct vhost_umem *umem;
struct vhost_umem *iotlb;
void *private_data;
u64 acked_features;
/* Log write descriptors */
......@@ -139,6 +143,12 @@ struct vhost_virtqueue {
u32 busyloop_timeout;
};
struct vhost_msg_node {
struct vhost_msg msg;
struct vhost_virtqueue *vq;
struct list_head node;
};
struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
......@@ -149,6 +159,11 @@ struct vhost_dev {
struct llist_head work_list;
struct task_struct *worker;
struct vhost_umem *umem;
struct vhost_umem *iotlb;
spinlock_t iotlb_lock;
struct list_head read_list;
struct list_head pending_list;
wait_queue_head_t wait;
};
void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
......@@ -185,6 +200,21 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
void vhost_enqueue_msg(struct vhost_dev *dev,
struct list_head *head,
struct vhost_msg_node *node);
struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
struct list_head *head);
unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
poll_table *wait);
ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
int noblock);
ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
struct iov_iter *from);
int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
......
......@@ -47,6 +47,32 @@ struct vhost_vring_addr {
__u64 log_guest_addr;
};
/* no alignment requirement */
struct vhost_iotlb_msg {
__u64 iova;
__u64 size;
__u64 uaddr;
#define VHOST_ACCESS_RO 0x1
#define VHOST_ACCESS_WO 0x2
#define VHOST_ACCESS_RW 0x3
__u8 perm;
#define VHOST_IOTLB_MISS 1
#define VHOST_IOTLB_UPDATE 2
#define VHOST_IOTLB_INVALIDATE 3
#define VHOST_IOTLB_ACCESS_FAIL 4
__u8 type;
};
#define VHOST_IOTLB_MSG 0x1
struct vhost_msg {
int type;
union {
struct vhost_iotlb_msg iotlb;
__u8 padding[64];
};
};
struct vhost_memory_region {
__u64 guest_phys_addr;
__u64 memory_size; /* bytes */
......@@ -146,6 +172,8 @@ struct vhost_memory {
#define VHOST_F_LOG_ALL 26
/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
#define VHOST_NET_F_VIRTIO_NET_HDR 27
/* Vhost have device IOTLB */
#define VHOST_F_DEVICE_IOTLB 63
/* VHOST_SCSI specific definitions */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment