Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

29
drivers/vfio/Kconfig Normal file
View file

@ -0,0 +1,29 @@
# Type1 IOMMU backend. It has no prompt, so it is only switched on through
# "select" (the VFIO entry below selects it on X86).
config VFIO_IOMMU_TYPE1
tristate
depends on VFIO
default n
# sPAPR TCE IOMMU backend. Promptless; the VFIO entry below selects it on
# PPC_POWERNV or PPC_PSERIES.
config VFIO_IOMMU_SPAPR_TCE
tristate
depends on VFIO && SPAPR_TCE_IOMMU
default n
# EEH support for the sPAPR TCE backend. Promptless; the VFIO entry below
# selects it on PPC_POWERNV or PPC_PSERIES.
config VFIO_SPAPR_EEH
tristate
depends on EEH && VFIO_IOMMU_SPAPR_TCE
default n
# User-visible top-level VFIO option; it selects the backend symbols above
# according to the architecture and then pulls in the PCI bus driver options.
menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
select ANON_INODES
help
VFIO provides a framework for secure userspace device drivers.
See Documentation/vfio.txt for more details.
If you don't know what to do here, say N.
source "drivers/vfio/pci/Kconfig"

5
drivers/vfio/Makefile Normal file
View file

@ -0,0 +1,5 @@
# Core VFIO module.
obj-$(CONFIG_VFIO) += vfio.o
# IOMMU backends and EEH helper, each built only when its Kconfig symbol is set.
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
# Descend into pci/ for the PCI bus driver.
obj-$(CONFIG_VFIO_PCI) += pci/

18
drivers/vfio/pci/Kconfig Normal file
View file

@ -0,0 +1,18 @@
# PCI bus driver for VFIO; needs the VFIO core, PCI and eventfd support.
config VFIO_PCI
tristate "VFIO support for PCI devices"
depends on VFIO && PCI && EVENTFD
help
Support for the PCI VFIO bus driver. This is required to make
use of PCI drivers using the VFIO framework.
If you don't know what to do here, say N.
# Optional legacy VGA region support; X86 with the VGA arbiter only.
config VFIO_PCI_VGA
bool "VFIO PCI support for VGA devices"
depends on VFIO_PCI && X86 && VGA_ARB
help
Support for VGA extension to VFIO PCI. This exposes an additional
region on VGA devices for accessing legacy VGA addresses used by
BIOS and generic video drivers.
If you don't know what to do here, say N.

View file

@ -0,0 +1,4 @@
# vfio-pci.o is a composite object linked from the four objects listed here.
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o

1051
drivers/vfio/pci/vfio_pci.c Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,853 @@
/*
* VFIO PCI interrupt handling
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/eventfd.h>
#include <linux/msi.h>
#include <linux/pci.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include "vfio_pci_private.h"
/*
* IRQfd - generic
*/
/*
 * One eventfd hooked up so that signalling it runs a callback on a device.
 * Allocated by virqfd_enable() and freed by virqfd_shutdown().
 */
struct virqfd {
struct vfio_pci_device *vdev; /* device passed to handler/thread */
struct eventfd_ctx *eventfd; /* reference dropped in virqfd_shutdown() */
/* Called from the wait-queue wakeup; non-zero result requests ->thread */
int (*handler)(struct vfio_pci_device *, void *);
/* Called from the ->inject work item */
void (*thread)(struct vfio_pci_device *, void *);
void *data; /* opaque argument for handler/thread */
struct work_struct inject; /* runs virqfd_inject() */
wait_queue_t wait; /* entry added to the eventfd's wait queue */
poll_table pt; /* used once to register ->wait via ->poll() */
struct work_struct shutdown; /* runs virqfd_shutdown() */
struct virqfd **pvirqfd; /* owner's slot pointing at this virqfd */
};
/* Workqueue on which virqfd shutdown work items are queued. */
static struct workqueue_struct *vfio_irqfd_cleanup_wq;
/*
 * Create the single-threaded workqueue used to release virqfds.
 *
 * Returns 0 on success or -ENOMEM if the workqueue cannot be created.
 */
int __init vfio_pci_virqfd_init(void)
{
	vfio_irqfd_cleanup_wq =
		create_singlethread_workqueue("vfio-irqfd-cleanup");

	return vfio_irqfd_cleanup_wq ? 0 : -ENOMEM;
}
/* Tear down the virqfd cleanup workqueue created at init time. */
void vfio_pci_virqfd_exit(void)
{
	destroy_workqueue(vfio_irqfd_cleanup_wq);
}
/* Hand the virqfd to the cleanup workqueue; virqfd_shutdown() frees it. */
static void virqfd_deactivate(struct virqfd *virqfd)
{
	queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
}
/*
 * Wait-queue callback installed on the eventfd by virqfd_enable().
 *
 * @key carries the poll event mask.  POLLIN runs the handler and, if the
 * handler is absent or returns non-zero, schedules the thread callback.
 * POLLHUP queues the virqfd for release unless that has already happened.
 * Always returns 0.
 */
static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
	unsigned long events = (unsigned long)key;

	if (events & POLLIN) {
		/* An event has been signaled, call function */
		bool need_thread = !virqfd->handler ||
				   virqfd->handler(virqfd->vdev, virqfd->data);

		if (need_thread && virqfd->thread)
			schedule_work(&virqfd->inject);
	}

	if (events & POLLHUP) {
		struct vfio_pci_device *vdev = virqfd->vdev;
		unsigned long irqflags;

		spin_lock_irqsave(&vdev->irqlock, irqflags);
		/*
		 * The eventfd is going away.  The owner's slot still pointing
		 * at us means nobody has queued the release yet, so do it
		 * here.  As with kvm irqfds, holding wqh->lock on the way in
		 * keeps the virqfd from disappearing underneath us.
		 */
		if (*(virqfd->pvirqfd) == virqfd) {
			*(virqfd->pvirqfd) = NULL;
			virqfd_deactivate(virqfd);
		}
		spin_unlock_irqrestore(&vdev->irqlock, irqflags);
	}

	return 0;
}
/*
 * poll_table queueing callback: attach the virqfd's wait entry to the
 * wait queue head that the polled file hands us.
 */
static void virqfd_ptable_queue_proc(struct file *file,
				     wait_queue_head_t *wqh, poll_table *pt)
{
	struct virqfd *owner = container_of(pt, struct virqfd, pt);

	add_wait_queue(wqh, &owner->wait);
}
/*
 * Work item that releases a virqfd: detach it from the eventfd wait queue,
 * wait for any in-flight inject work, drop the eventfd reference and free
 * the structure.
 */
static void virqfd_shutdown(struct work_struct *work)
{
	struct virqfd *dying = container_of(work, struct virqfd, shutdown);
	u64 count;

	eventfd_ctx_remove_wait_queue(dying->eventfd, &dying->wait, &count);
	flush_work(&dying->inject);
	eventfd_ctx_put(dying->eventfd);

	kfree(dying);
}
/* Work item that runs the virqfd's thread callback, if one is set. */
static void virqfd_inject(struct work_struct *work)
{
	struct virqfd *virqfd = container_of(work, struct virqfd, inject);

	if (!virqfd->thread)
		return;

	virqfd->thread(virqfd->vdev, virqfd->data);
}
/*
 * Bind eventfd @fd to a new virqfd and publish it in *@pvirqfd.
 *
 * @handler: optional callback run from the eventfd wakeup; a non-zero
 *           return (or a NULL handler) causes @thread to be scheduled.
 * @thread:  optional callback run from a work item.
 * @data:    opaque argument passed to both callbacks.
 * @pvirqfd: slot that owns the virqfd; must be empty on entry.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, -EBADF if @fd is not
 * an open file, the eventfd_ctx_fileget() error if @fd is not usable as an
 * eventfd, or -EBUSY if *@pvirqfd is already populated.
 */
static int virqfd_enable(struct vfio_pci_device *vdev,
int (*handler)(struct vfio_pci_device *, void *),
void (*thread)(struct vfio_pci_device *, void *),
void *data, struct virqfd **pvirqfd, int fd)
{
struct fd irqfd;
struct eventfd_ctx *ctx;
struct virqfd *virqfd;
int ret = 0;
unsigned int events;
virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
if (!virqfd)
return -ENOMEM;
virqfd->pvirqfd = pvirqfd;
virqfd->vdev = vdev;
virqfd->handler = handler;
virqfd->thread = thread;
virqfd->data = data;
INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
INIT_WORK(&virqfd->inject, virqfd_inject);
irqfd = fdget(fd);
if (!irqfd.file) {
ret = -EBADF;
goto err_fd;
}
ctx = eventfd_ctx_fileget(irqfd.file);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
goto err_ctx;
}
virqfd->eventfd = ctx;
/*
* virqfds can be released by closing the eventfd or directly
* through ioctl. These are both done through a workqueue, so
* we update the pointer to the virqfd under lock to avoid
* pushing multiple jobs to release the same virqfd.
*/
spin_lock_irq(&vdev->irqlock);
if (*pvirqfd) {
spin_unlock_irq(&vdev->irqlock);
ret = -EBUSY;
goto err_busy;
}
*pvirqfd = virqfd;
spin_unlock_irq(&vdev->irqlock);
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd.
*/
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
/* Calling ->poll() with our poll_table invokes virqfd_ptable_queue_proc */
events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
/*
* Check if there was an event already pending on the eventfd
* before we registered and trigger it as if we didn't miss it.
*/
if (events & POLLIN) {
if ((!handler || handler(vdev, data)) && thread)
schedule_work(&virqfd->inject);
}
/*
* Do not drop the file until the irqfd is fully initialized,
* otherwise we might race against the POLLHUP.
*/
fdput(irqfd);
return 0;
/* Error unwinding: each label undoes one more acquisition than the next */
err_busy:
eventfd_ctx_put(ctx);
err_ctx:
fdput(irqfd);
err_fd:
kfree(virqfd);
return ret;
}
/*
 * Release the virqfd stored in *@pvirqfd, if any, and wait until the
 * cleanup workqueue has drained.
 */
static void virqfd_disable(struct vfio_pci_device *vdev,
			   struct virqfd **pvirqfd)
{
	struct virqfd *current_fd;
	unsigned long irqflags;

	spin_lock_irqsave(&vdev->irqlock, irqflags);
	current_fd = *pvirqfd;
	if (current_fd) {
		virqfd_deactivate(current_fd);
		*pvirqfd = NULL;
	}
	spin_unlock_irqrestore(&vdev->irqlock, irqflags);

	/*
	 * Wait for every outstanding shutdown job.  The flush happens even
	 * when nothing was queued here, so that a release queued from
	 * elsewhere has also finished by the time we return.
	 */
	flush_workqueue(vfio_irqfd_cleanup_wq);
}
/*
* INTx
*/
/*
 * Signal the INTx trigger eventfd, provided the device is in INTx mode and
 * the virtual interrupt is not disabled.  @unused is ignored.
 */
static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
{
	if (unlikely(!is_intx(vdev) || vdev->virq_disabled))
		return;

	eventfd_signal(vdev->ctx[0].trigger, 1);
}
/*
 * Mask INTx for the device.
 *
 * Requests arrive from interrupt context, from ioctl, or from config space
 * writes to INTx disable.  The config space path can fire while INTx
 * delivery is not in use; in that case only the physical DisINTx bit is
 * made to follow the virtual one.
 */
void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	unsigned long irqflags;

	spin_lock_irqsave(&vdev->irqlock, irqflags);

	if (unlikely(!is_intx(vdev))) {
		if (vdev->pci_2_3)
			pci_intx(pdev, 0);
		goto out_unlock;
	}

	if (vdev->ctx[0].masked)
		goto out_unlock;

	/*
	 * check_and_mask is not usable here: the mask must be applied
	 * unconditionally, not only when an interrupt is pending.
	 */
	if (vdev->pci_2_3)
		pci_intx(pdev, 0);
	else
		disable_irq_nosync(pdev->irq);
	vdev->ctx[0].masked = true;

out_unlock:
	spin_unlock_irqrestore(&vdev->irqlock, irqflags);
}
/*
* If this is triggered by an eventfd, we can't call eventfd_signal
* or else we'll deadlock on the eventfd wait queue. Return >0 when
* a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller.
*/
/*
 * Unmask INTx under irqlock without signalling the eventfd.
 *
 * Returns 1 when pci_check_and_unmask_intx() reports failure on a PCI 2.3
 * device, in which case the interrupt stays marked as masked and the caller
 * is expected to signal the user; returns 0 otherwise.  @unused is ignored.
 */
static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
void *unused)
{
struct pci_dev *pdev = vdev->pdev;
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&vdev->irqlock, flags);
/*
* Unmasking comes from ioctl or config, so again, have the
* physical bit follow the virtual even when not using INTx.
*/
if (unlikely(!is_intx(vdev))) {
if (vdev->pci_2_3)
pci_intx(pdev, 1);
} else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
/*
* A pending interrupt here would immediately trigger,
* but we can avoid that overhead by just re-sending
* the interrupt to the user.
*/
if (vdev->pci_2_3) {
if (!pci_check_and_unmask_intx(pdev))
ret = 1;
} else
enable_irq(pdev->irq);
/* Stay masked exactly when a re-send was requested (ret == 1) */
vdev->ctx[0].masked = (ret > 0);
}
spin_unlock_irqrestore(&vdev->irqlock, flags);
return ret;
}
/*
 * Unmask INTx and, when the unmask handler asks for it, forward the
 * interrupt to the user through the trigger eventfd.
 */
void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
{
	int resend = vfio_pci_intx_unmask_handler(vdev, NULL);

	if (resend > 0)
		vfio_send_intx_eventfd(vdev, NULL);
}
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
{
struct vfio_pci_device *vdev = dev_id;
unsigned long flags;
int ret = IRQ_NONE;
spin_lock_irqsave(&vdev->irqlock, flags);
if (!vdev->pci_2_3) {
disable_irq_nosync(vdev->pdev->irq);
vdev->ctx[0].masked = true;
ret = IRQ_HANDLED;
} else if (!vdev->ctx[0].masked && /* may be shared */
pci_check_and_mask_intx(vdev->pdev)) {
vdev->ctx[0].masked = true;
ret = IRQ_HANDLED;
}
spin_unlock_irqrestore(&vdev->irqlock, flags);
if (ret == IRQ_HANDLED)
vfio_send_intx_eventfd(vdev, NULL);
return ret;
}
static int vfio_intx_enable(struct vfio_pci_device *vdev)
{
if (!is_irq_none(vdev))
return -EINVAL;
if (!vdev->pdev->irq)
return -ENODEV;
vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
if (!vdev->ctx)
return -ENOMEM;
vdev->num_ctx = 1;
/*
* If the virtual interrupt is masked, restore it. Devices
* supporting DisINTx can be masked at the hardware level
* here, non-PCI-2.3 devices will have to wait until the
* interrupt is enabled.
*/
vdev->ctx[0].masked = vdev->virq_disabled;
if (vdev->pci_2_3)
pci_intx(vdev->pdev, !vdev->ctx[0].masked);
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
return 0;
}
/*
 * Replace the INTx trigger eventfd.
 *
 * Any existing trigger is torn down first (IRQ freed, name freed, eventfd
 * reference dropped).  With @fd < 0 that is all that happens.  Otherwise a
 * new IRQ name is allocated, the eventfd is looked up and the IRQ requested.
 *
 * Returns 0 on success, -ENOMEM if the name cannot be allocated, or the
 * error from eventfd_ctx_fdget() / request_irq().
 */
static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
{
struct pci_dev *pdev = vdev->pdev;
unsigned long irqflags = IRQF_SHARED;
struct eventfd_ctx *trigger;
unsigned long flags;
int ret;
if (vdev->ctx[0].trigger) {
free_irq(pdev->irq, vdev);
kfree(vdev->ctx[0].name);
eventfd_ctx_put(vdev->ctx[0].trigger);
vdev->ctx[0].trigger = NULL;
}
if (fd < 0) /* Disable only */
return 0;
vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
pci_name(pdev));
if (!vdev->ctx[0].name)
return -ENOMEM;
trigger = eventfd_ctx_fdget(fd);
if (IS_ERR(trigger)) {
kfree(vdev->ctx[0].name);
return PTR_ERR(trigger);
}
/* Published before request_irq() because the handler signals it */
vdev->ctx[0].trigger = trigger;
/* Only PCI 2.3 devices request the line as shared */
if (!vdev->pci_2_3)
irqflags = 0;
ret = request_irq(pdev->irq, vfio_intx_handler,
irqflags, vdev->ctx[0].name, vdev);
if (ret) {
vdev->ctx[0].trigger = NULL;
kfree(vdev->ctx[0].name);
eventfd_ctx_put(trigger);
return ret;
}
/*
* INTx disable will stick across the new irq setup,
* disable_irq won't.
*/
spin_lock_irqsave(&vdev->irqlock, flags);
if (!vdev->pci_2_3 && vdev->ctx[0].masked)
disable_irq_nosync(pdev->irq);
spin_unlock_irqrestore(&vdev->irqlock, flags);
return 0;
}
static void vfio_intx_disable(struct vfio_pci_device *vdev)
{
vfio_intx_set_signal(vdev, -1);
virqfd_disable(vdev, &vdev->ctx[0].unmask);
virqfd_disable(vdev, &vdev->ctx[0].mask);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
}
/*
* MSI/MSI-X
*/
/* MSI/MSI-X IRQ handler: @arg is the eventfd context to signal. */
static irqreturn_t vfio_msihandler(int irq, void *arg)
{
	eventfd_signal((struct eventfd_ctx *)arg, 1);

	return IRQ_HANDLED;
}
/*
 * Switch the device into MSI or MSI-X mode with @nvec vectors.
 *
 * @nvec: number of vectors to enable; all of them must be granted.
 * @msix: true for MSI-X, false for MSI.
 *
 * Returns 0 on success, -EINVAL if another interrupt mode is active,
 * -ENOMEM on allocation failure, or the pci_enable_msi{,x}_range() result
 * when fewer than @nvec vectors were obtained (a negative errno, or the
 * positive count that was granted before being rolled back).
 */
static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!is_irq_none(vdev))
		return -EINVAL;

	/*
	 * kcalloc() checks the count * size multiplication for overflow, so
	 * an oversized or negative @nvec fails with -ENOMEM instead of
	 * producing an undersized array.
	 */
	vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
	if (!vdev->ctx)
		return -ENOMEM;

	if (msix) {
		int i;

		vdev->msix = kcalloc(nvec, sizeof(struct msix_entry),
				     GFP_KERNEL);
		if (!vdev->msix) {
			kfree(vdev->ctx);
			return -ENOMEM;
		}

		for (i = 0; i < nvec; i++)
			vdev->msix[i].entry = i;

		ret = pci_enable_msix_range(pdev, vdev->msix, 1, nvec);
		if (ret < nvec) {
			/* A partial grant is not acceptable; undo it */
			if (ret > 0)
				pci_disable_msix(pdev);
			kfree(vdev->msix);
			kfree(vdev->ctx);
			return ret;
		}
	} else {
		ret = pci_enable_msi_range(pdev, 1, nvec);
		if (ret < nvec) {
			/* A partial grant is not acceptable; undo it */
			if (ret > 0)
				pci_disable_msi(pdev);
			kfree(vdev->ctx);
			return ret;
		}
	}

	vdev->num_ctx = nvec;
	vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
				VFIO_PCI_MSI_IRQ_INDEX;

	if (!msix) {
		/*
		 * Compute the virtual hardware field for max msi vectors -
		 * it is the log base 2 of the number of vectors.
		 */
		vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
	}

	return 0;
}
/*
 * Attach, replace or remove the eventfd that one MSI/MSI-X vector signals.
 *
 * @vector: index into vdev->ctx (and vdev->msix for MSI-X).
 * @fd:     eventfd to signal, or a negative value to only tear down.
 * @msix:   true for MSI-X, false for MSI.
 *
 * Returns 0 on success, -EINVAL if @vector is out of range, -ENOMEM if the
 * IRQ name cannot be allocated, or the error from eventfd_ctx_fdget() /
 * request_irq().
 */
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
				      int vector, int fd, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	char *name = msix ? "vfio-msix" : "vfio-msi";
	struct eventfd_ctx *trigger;
	int irq, ret;

	/*
	 * Validate before touching any per-vector array: the IRQ lookup
	 * below indexes vdev->msix[], and a negative @vector would pass an
	 * upper-bound-only check.
	 */
	if (vector < 0 || vector >= vdev->num_ctx)
		return -EINVAL;

	irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;

	/* Tear down whatever is currently attached to this vector */
	if (vdev->ctx[vector].trigger) {
		free_irq(irq, vdev->ctx[vector].trigger);
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(vdev->ctx[vector].trigger);
		vdev->ctx[vector].trigger = NULL;
	}

	if (fd < 0)
		return 0;

	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
					   name, vector, pci_name(pdev));
	if (!vdev->ctx[vector].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[vector].name);
		return PTR_ERR(trigger);
	}

	/*
	 * The MSIx vector table resides in device memory which may be cleared
	 * via backdoor resets. We don't allow direct access to the vector
	 * table so even if a userspace driver attempts to save/restore around
	 * such a reset it would be unsuccessful. To avoid this, restore the
	 * cached value of the message prior to enabling.
	 */
	if (msix) {
		struct msi_msg msg;

		get_cached_msi_msg(irq, &msg);
		write_msi_msg(irq, &msg);
	}

	ret = request_irq(irq, vfio_msihandler, 0,
			  vdev->ctx[vector].name, trigger);
	if (ret) {
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	/* Only publish the trigger once the IRQ is actually wired up */
	vdev->ctx[vector].trigger = trigger;

	return 0;
}
/*
 * Apply vfio_msi_set_vector_signal() to vectors [@start, @start + @count).
 *
 * @fds:  array of @count eventfds, or NULL to tear every vector down.
 * @msix: true for MSI-X, false for MSI.
 *
 * On the first failure every vector from the failing one back down to
 * @start is torn down again and the error is returned.  Returns 0 on
 * success or -EINVAL if the range does not fit in vdev->num_ctx.
 */
static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
			      unsigned count, int32_t *fds, bool msix)
{
	unsigned int nctx = vdev->num_ctx;
	int i, j, ret = 0;

	/* Written so that start + count cannot wrap around */
	if (start > nctx || count > nctx - start)
		return -EINVAL;

	for (i = 0, j = start; i < count && !ret; i++, j++) {
		int fd = fds ? fds[i] : -1;

		ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
	}

	if (ret) {
		/*
		 * Compare as signed: with start == 0 an unsigned comparison
		 * would still be true once j reaches -1, so the loop would
		 * never stop and would index before the start of the array.
		 */
		for (--j; j >= (int)start; j--)
			vfio_msi_set_vector_signal(vdev, j, -1, msix);
	}

	return ret;
}
/*
 * Leave MSI/MSI-X mode: detach every vector's trigger, release all
 * virqfds, disable the capability on the device and free the bookkeeping.
 */
static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int vec;

	vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);

	for (vec = 0; vec < vdev->num_ctx; vec++) {
		struct vfio_pci_irq_ctx *ctx = &vdev->ctx[vec];

		virqfd_disable(vdev, &ctx->unmask);
		virqfd_disable(vdev, &ctx->mask);
	}

	if (!msix) {
		pci_disable_msi(pdev);
	} else {
		pci_disable_msix(pdev);
		kfree(vdev->msix);
	}

	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	vdev->num_ctx = 0;
	kfree(vdev->ctx);
}
/*
* IOCTL support
*/
/*
 * Handle the UNMASK action for the INTx index.
 *
 * DATA_NONE unmasks immediately, DATA_BOOL unmasks when the byte at @data is
 * non-zero, and DATA_EVENTFD installs (fd >= 0) or removes (fd < 0) an
 * eventfd whose signalling performs the unmask.
 *
 * Returns -EINVAL unless the device is in INTx mode with @start == 0 and
 * @count == 1; otherwise 0 or the result of virqfd_enable().
 */
static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_unmask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t *want = data;

		if (*want)
			vfio_pci_intx_unmask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t *pfd = data;
		int32_t fd = *pfd;

		if (fd < 0) {
			virqfd_disable(vdev, &vdev->ctx[0].unmask);
			return 0;
		}

		return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
				     vfio_send_intx_eventfd, NULL,
				     &vdev->ctx[0].unmask, fd);
	}

	return 0;
}
/*
 * Handle the MASK action for the INTx index.
 *
 * DATA_NONE masks immediately and DATA_BOOL masks when the byte at @data is
 * non-zero.  DATA_EVENTFD is not implemented and yields -ENOTTY.
 *
 * Returns -EINVAL unless the device is in INTx mode with @start == 0 and
 * @count == 1.
 */
static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
				  unsigned index, unsigned start,
				  unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_mask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t *want = data;

		if (*want)
			vfio_pci_intx_mask(vdev);
		return 0;
	}

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
		return -ENOTTY; /* XXX implement me */

	return 0;
}
/*
 * Handle the TRIGGER action for the INTx index.
 *
 * - DATA_NONE with @count == 0 while in INTx mode disables INTx entirely.
 * - DATA_EVENTFD sets the trigger eventfd, enabling INTx mode first if no
 *   interrupt mode is active.
 * - DATA_NONE / DATA_BOOL with @count == 1 signal the existing trigger
 *   (loopback).
 *
 * Returns 0 on success, -EINVAL for an invalid mode or range, or the error
 * from vfio_intx_enable() / vfio_intx_set_signal().
 */
static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
vfio_intx_disable(vdev);
return 0;
}
if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
int ret;
/* Already in INTx mode: just swap the trigger */
if (is_intx(vdev))
return vfio_intx_set_signal(vdev, fd);
ret = vfio_intx_enable(vdev);
if (ret)
return ret;
ret = vfio_intx_set_signal(vdev, fd);
/* Do not stay half-enabled if the trigger could not be set */
if (ret)
vfio_intx_disable(vdev);
return ret;
}
/* Loopback signalling below needs INTx to be active already */
if (!is_intx(vdev))
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
vfio_send_intx_eventfd(vdev, NULL);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
vfio_send_intx_eventfd(vdev, NULL);
}
return 0;
}
/*
 * Handle the TRIGGER action for the MSI and MSI-X indexes.
 *
 * - DATA_NONE with @count == 0 while @index is the active mode disables it.
 * - DATA_EVENTFD sets eventfds for [@start, @start + @count), enabling the
 *   mode with @start + @count vectors first if no mode is active.
 * - DATA_NONE / DATA_BOOL signal the existing triggers in the range
 *   (loopback); vectors without a trigger are skipped.
 *
 * Returns 0 on success, -EINVAL for an invalid mode or range, or the result
 * of vfio_msi_enable() / vfio_msi_set_block().
 */
static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
{
int i;
bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false;
if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
vfio_msi_disable(vdev, msix);
return 0;
}
if (!(irq_is(vdev, index) || is_irq_none(vdev)))
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t *fds = data;
int ret;
/* Mode already active: only update the requested vectors */
if (vdev->irq_type == index)
return vfio_msi_set_block(vdev, start, count,
fds, msix);
ret = vfio_msi_enable(vdev, start + count, msix);
if (ret)
return ret;
ret = vfio_msi_set_block(vdev, start, count, fds, msix);
/* Do not stay half-enabled if the triggers could not be set */
if (ret)
vfio_msi_disable(vdev, msix);
return ret;
}
if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
return -EINVAL;
for (i = start; i < start + count; i++) {
if (!vdev->ctx[i].trigger)
continue;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
eventfd_signal(vdev->ctx[i].trigger, 1);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t *bools = data;
/* bools[] is indexed relative to start, not by vector */
if (bools[i - start])
eventfd_signal(vdev->ctx[i].trigger, 1);
}
}
return 0;
}
/*
 * Handle the TRIGGER action for the error IRQ index.
 *
 * - DATA_NONE signals the current error eventfd, if any (loopback).
 * - DATA_BOOL does the same when the byte at @data is non-zero.
 * - Otherwise (DATA_EVENTFD) the int32 at @data is the new eventfd:
 *   -1 removes the current one, a non-negative fd replaces it.
 *
 * @data is only read on the paths that carry a payload, matching the other
 * handlers in this file, which never touch @data for DATA_NONE.
 *
 * Returns 0 on success, -EINVAL for a wrong index, missing data type,
 * missing payload or invalid fd, or the error from eventfd_ctx_fdget().
 */
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	int32_t fd;

	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
		return -EINVAL;

	/* DATA_NONE/DATA_BOOL enables loopback testing */
	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		if (vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		return 0;
	}

	/* Every remaining path dereferences the payload */
	if (!data)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger = *(uint8_t *)data;

		if (trigger && vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		return 0;
	}

	/* Handle SET_DATA_EVENTFD */
	fd = *(int32_t *)data;

	if (fd == -1) {
		if (vdev->err_trigger)
			eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = NULL;
		return 0;
	} else if (fd >= 0) {
		struct eventfd_ctx *efdctx;

		efdctx = eventfd_ctx_fdget(fd);
		if (IS_ERR(efdctx))
			return PTR_ERR(efdctx);
		/* Drop the old reference only after the new one is held */
		if (vdev->err_trigger)
			eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = efdctx;
		return 0;
	} else
		return -EINVAL;
}
/*
 * Dispatch a SET_IRQS request to the handler for the given IRQ @index and
 * the action encoded in @flags.
 *
 * Returns -ENOTTY when no handler exists for the index/action combination
 * (including MSI/MSI-X mask and unmask, and the error index on a non-PCIe
 * device); otherwise returns whatever the selected handler returns.
 */
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data)
{
int (*func)(struct vfio_pci_device *vdev, unsigned index,
unsigned start, unsigned count, uint32_t flags,
void *data) = NULL;
switch (index) {
case VFIO_PCI_INTX_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
func = vfio_pci_set_intx_mask;
break;
case VFIO_IRQ_SET_ACTION_UNMASK:
func = vfio_pci_set_intx_unmask;
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
func = vfio_pci_set_intx_trigger;
break;
}
break;
case VFIO_PCI_MSI_IRQ_INDEX:
case VFIO_PCI_MSIX_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
case VFIO_IRQ_SET_ACTION_UNMASK:
/* XXX Need masking support exported */
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
func = vfio_pci_set_msi_trigger;
break;
}
break;
case VFIO_PCI_ERR_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_TRIGGER:
/* The error index is only offered on PCIe devices */
if (pci_is_pcie(vdev->pdev))
func = vfio_pci_set_err_trigger;
break;
}
}
/* func is still NULL for any unsupported index/action pair */
if (!func)
return -ENOTTY;
return func(vdev, index, start, count, flags, data);
}

View file

@ -0,0 +1,94 @@
/*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#include <linux/mutex.h>
#include <linux/pci.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
/*
 * A file offset encodes a region index in the bits at and above
 * VFIO_PCI_OFFSET_SHIFT and the offset within that region in the bits
 * below it.  Macro arguments are parenthesized so that expression
 * arguments (e.g. "base | off") are shifted as a whole.
 */
#define VFIO_PCI_OFFSET_SHIFT 40
#define VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
/* Per-interrupt (INTx line or MSI/MSI-X vector) state. */
struct vfio_pci_irq_ctx {
struct eventfd_ctx *trigger; /* eventfd signalled when the IRQ fires */
struct virqfd *unmask; /* virqfd slot for eventfd-driven unmask */
struct virqfd *mask; /* virqfd slot for eventfd-driven mask */
char *name; /* kasprintf()'d name passed to request_irq() */
bool masked; /* INTx is currently masked */
};
/*
 * Per-device state of the vfio-pci driver.  Only fields whose use is
 * visible in the interrupt and BAR/VGA access code are annotated.
 */
struct vfio_pci_device {
struct pci_dev *pdev; /* underlying PCI device */
void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; /* lazily created pci_iomap() mappings, per BAR */
u8 *pci_config_map;
u8 *vconfig;
struct perm_bits *msi_perm;
spinlock_t irqlock; /* guards INTx masked state and virqfd slots */
struct mutex igate;
struct msix_entry *msix; /* MSI-X entries, allocated in MSI-X mode */
struct vfio_pci_irq_ctx *ctx; /* array of num_ctx interrupt contexts */
int num_ctx;
int irq_type; /* active VFIO_PCI_*_IRQ_INDEX; VFIO_PCI_NUM_IRQS when none */
u8 msi_qmax; /* log2 of the number of enabled MSI vectors */
u8 msix_bar; /* BAR holding the excluded MSI-X range */
u16 msix_size; /* length of the excluded MSI-X range */
u32 msix_offset; /* start of the excluded MSI-X range in msix_bar */
u32 rbar[7];
bool pci_2_3; /* INTx is masked via pci_intx() rather than disable_irq() */
bool virq_disabled; /* virtual INTx disable state */
bool reset_works;
bool extended_caps;
bool bardirty;
bool has_vga; /* gates access through vfio_pci_vga_rw() */
bool needs_reset;
struct pci_saved_state *pci_saved_state;
int refcnt;
struct eventfd_ctx *err_trigger; /* eventfd for the error IRQ index */
};
/*
 * Predicates on the device's active interrupt mode.  Arguments are
 * parenthesized so that expression arguments (e.g. "&dev" or "a | b")
 * expand with the intended precedence.
 */
#define is_intx(vdev) ((vdev)->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
#define is_msi(vdev) ((vdev)->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
#define is_msix(vdev) ((vdev)->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
#define irq_is(vdev, type) ((vdev)->irq_type == (type))
/* INTx mask/unmask entry points. */
extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);
/* Dispatcher for SET_IRQS requests; returns -ENOTTY for unsupported combos. */
extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
uint32_t flags, unsigned index,
unsigned start, unsigned count, void *data);
/* Region accessors; each takes a user buffer and a file position. */
extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev,
char __user *buf, size_t count,
loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
/* Init/exit pairs. */
extern int vfio_pci_init_perm_bits(void);
extern void vfio_pci_uninit_perm_bits(void);
extern int vfio_pci_virqfd_init(void);
extern void vfio_pci_virqfd_exit(void);
extern int vfio_config_init(struct vfio_pci_device *vdev);
extern void vfio_config_free(struct vfio_pci_device *vdev);
#endif /* VFIO_PCI_PRIVATE_H */

View file

@ -0,0 +1,238 @@
/*
* VFIO PCI I/O Port & MMIO access
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#include <linux/fs.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/vgaarb.h>
#include "vfio_pci_private.h"
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
* range which is inaccessible. The excluded range drops writes and fills
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
/*
 * Copy @count bytes between user buffer @buf and @io starting at @off.
 *
 * @x_start, @x_end: bounds of the excluded range [x_start, x_end); reads
 *                   inside it return 0xFF bytes and writes are dropped.
 * @iswrite:         true to copy user -> device, false for device -> user.
 *
 * Accesses use the widest naturally aligned size (4, 2 or 1 bytes) that
 * fits before the excluded range; 16/32-bit values are exchanged with
 * userspace in little-endian form.
 *
 * Returns the number of bytes processed, or -EFAULT if a user copy fails.
 */
static ssize_t do_io_rw(void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite)
{
ssize_t done = 0;
while (count) {
size_t fillable, filled;
/* fillable = bytes accessible at off before hitting the excluded range */
if (off < x_start)
fillable = min(count, (size_t)(x_start - off));
else if (off >= x_end)
fillable = count;
else
fillable = 0;
if (fillable >= 4 && !(off % 4)) {
__le32 val;
if (iswrite) {
if (copy_from_user(&val, buf, 4))
return -EFAULT;
iowrite32(le32_to_cpu(val), io + off);
} else {
val = cpu_to_le32(ioread32(io + off));
if (copy_to_user(buf, &val, 4))
return -EFAULT;
}
filled = 4;
} else if (fillable >= 2 && !(off % 2)) {
__le16 val;
if (iswrite) {
if (copy_from_user(&val, buf, 2))
return -EFAULT;
iowrite16(le16_to_cpu(val), io + off);
} else {
val = cpu_to_le16(ioread16(io + off));
if (copy_to_user(buf, &val, 2))
return -EFAULT;
}
filled = 2;
} else if (fillable) {
u8 val;
if (iswrite) {
if (copy_from_user(&val, buf, 1))
return -EFAULT;
iowrite8(val, io + off);
} else {
val = ioread8(io + off);
if (copy_to_user(buf, &val, 1))
return -EFAULT;
}
filled = 1;
} else {
/* Fill reads with -1, drop writes */
filled = min(count, (size_t)(x_end - off));
if (!iswrite) {
u8 val = 0xFF;
size_t i;
for (i = 0; i < filled; i++)
if (copy_to_user(buf + i, &val, 1))
return -EFAULT;
}
}
/* Advance past whatever was handled in this iteration */
count -= filled;
done += filled;
off += filled;
buf += filled;
}
return done;
}
/*
 * Read or write a PCI BAR (or the ROM) on behalf of userspace.
 *
 * *@ppos encodes the BAR index in its upper bits and the offset within the
 * BAR in the lower bits.  The request is truncated to the end of the BAR,
 * and *@ppos is advanced by the number of bytes transferred.
 *
 * Returns the byte count from do_io_rw(), -EINVAL if the BAR has no
 * resource or the offset is past its end, -ENOMEM if mapping fails, or the
 * error from pci_request_selected_regions().
 */
ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
struct pci_dev *pdev = vdev->pdev;
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
size_t x_start = 0, x_end = 0;
resource_size_t end;
void __iomem *io;
ssize_t done;
if (!pci_resource_start(pdev, bar))
return -EINVAL;
end = pci_resource_len(pdev, bar);
if (pos >= end)
return -EINVAL;
count = min(count, (size_t)(end - pos));
if (bar == PCI_ROM_RESOURCE) {
/*
* The ROM can fill less space than the BAR, so we start the
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
io = pci_map_rom(pdev, &x_start);
if (!io)
return -ENOMEM;
x_end = end;
} else if (!vdev->barmap[bar]) {
/* First access to this BAR: claim and map it, then cache the mapping */
int ret;
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
if (ret)
return ret;
io = pci_iomap(pdev, bar, 0);
if (!io) {
pci_release_selected_regions(pdev, 1 << bar);
return -ENOMEM;
}
vdev->barmap[bar] = io;
} else
io = vdev->barmap[bar];
/* The MSI-X range of this BAR is excluded from direct access */
if (bar == vdev->msix_bar) {
x_start = vdev->msix_offset;
x_end = vdev->msix_offset + vdev->msix_size;
}
done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
/* The ROM mapping is per-call; other BAR mappings stay cached */
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
return done;
}
/*
 * Read or write the legacy VGA ranges on behalf of userspace.
 *
 * The low bits of *@ppos select one of three windows: memory
 * 0xa0000-0xbffff, I/O ports 0x3b0-0x3bb, or I/O ports 0x3c0-0x3df.  The
 * request is truncated to the end of the window and the window is mapped
 * for the duration of the call, with the matching VGA resource held via
 * vga_get_interruptible()/vga_put().  *@ppos is advanced by the number of
 * bytes transferred.
 *
 * Returns the byte count from do_io_rw(), -EINVAL if the device has no VGA
 * or the offset is outside all windows, -ENOMEM if mapping fails, or the
 * error from vga_get_interruptible().
 */
ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
int ret;
loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
void __iomem *iomem = NULL;
unsigned int rsrc;
bool is_ioport;
ssize_t done;
if (!vdev->has_vga)
return -EINVAL;
switch (pos) {
case 0xa0000 ... 0xbffff:
count = min(count, (size_t)(0xc0000 - pos));
iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
off = pos - 0xa0000;
rsrc = VGA_RSRC_LEGACY_MEM;
is_ioport = false;
break;
case 0x3b0 ... 0x3bb:
count = min(count, (size_t)(0x3bc - pos));
iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
off = pos - 0x3b0;
rsrc = VGA_RSRC_LEGACY_IO;
is_ioport = true;
break;
case 0x3c0 ... 0x3df:
count = min(count, (size_t)(0x3e0 - pos));
iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
off = pos - 0x3c0;
rsrc = VGA_RSRC_LEGACY_IO;
is_ioport = true;
break;
default:
return -EINVAL;
}
if (!iomem)
return -ENOMEM;
ret = vga_get_interruptible(vdev->pdev, rsrc);
if (ret) {
/* Unmap with the routine matching how the window was mapped */
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
return ret;
}
/* No excluded range for VGA windows (x_start == x_end == 0) */
done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
if (done >= 0)
*ppos += done;
return done;
}

1509
drivers/vfio/vfio.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,392 @@
/*
* VFIO: IOMMU DMA mapping support for TCE on POWER
*
* Copyright (C) 2013 IBM Corp. All rights reserved.
* Author: Alexey Kardashevskiy <aik@ozlabs.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio_iommu_type1.c:
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*/
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#define DRIVER_VERSION "0.1"
#define DRIVER_AUTHOR "aik@ozlabs.ru"
#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
/* Forward declaration: tce_iommu_release() calls this before its definition. */
static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
/*
* VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
*/
/*
* The container descriptor supports only a single group per container.
* Required by the API as the container is not supplied with the IOMMU group
* at the moment of initialization.
*/
struct tce_container {
struct mutex lock; /* initialized in tce_iommu_open(), destroyed in tce_iommu_release() */
struct iommu_table *tbl; /* IOMMU table of the attached group; NULL when none */
bool enabled; /* set once locked_vm has been charged by tce_iommu_enable() */
};
static int tce_iommu_enable(struct tce_container *container)
{
int ret = 0;
unsigned long locked, lock_limit, npages;
struct iommu_table *tbl = container->tbl;
if (!container->tbl)
return -ENXIO;
if (!current->mm)
return -ESRCH; /* process exited */
if (container->enabled)
return -EBUSY;
/*
* When userspace pages are mapped into the IOMMU, they are effectively
* locked memory, so, theoretically, we need to update the accounting
* of locked pages on each map and unmap. For powerpc, the map unmap
* paths can be very hot, though, and the accounting would kill
* performance, especially since it would be difficult to impossible
* to handle the accounting in real mode only.
*
* To address that, rather than precisely accounting every page, we
* instead account for a worst case on locked memory when the iommu is
* enabled and disabled. The worst case upper bound on locked memory
* is the size of the whole iommu window, which is usually relatively
* small (compared to total memory sizes) on POWER hardware.
*
* Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
* that would effectively kill the guest at random points, much better
* enforcing the limit based on the max that the guest can map.
*/
down_write(&current->mm->mmap_sem);
npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
locked = current->mm->locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
rlimit(RLIMIT_MEMLOCK));
ret = -ENOMEM;
} else {
current->mm->locked_vm += npages;
container->enabled = true;
}
up_write(&current->mm->mmap_sem);
return ret;
}
static void tce_iommu_disable(struct tce_container *container)
{
if (!container->enabled)
return;
container->enabled = false;
if (!container->tbl || !current->mm)
return;
down_write(&current->mm->mmap_sem);
current->mm->locked_vm -= (container->tbl->it_size <<
IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
up_write(&current->mm->mmap_sem);
}
/*
 * Allocate a zeroed container for the sPAPR TCE IOMMU type.
 *
 * Returns the new container, ERR_PTR(-EINVAL) when @arg is not
 * VFIO_SPAPR_TCE_IOMMU, or ERR_PTR(-ENOMEM) when allocation fails.
 */
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if (arg == VFIO_SPAPR_TCE_IOMMU) {
		container = kzalloc(sizeof(*container), GFP_KERNEL);
		if (container) {
			mutex_init(&container->lock);
			return container;
		}
		return ERR_PTR(-ENOMEM);
	}

	pr_err("tce_vfio: Wrong IOMMU type\n");
	return ERR_PTR(-EINVAL);
}
static void tce_iommu_release(void *iommu_data)
{
struct tce_container *container = iommu_data;
WARN_ON(container->tbl && !container->tbl->it_group);
tce_iommu_disable(container);
if (container->tbl && container->tbl->it_group)
tce_iommu_detach_group(iommu_data, container->tbl->it_group);
mutex_destroy(&container->lock);
kfree(container);
}
/*
* ioctl dispatcher for the sPAPR TCE IOMMU backend.
*
* @iommu_data: container state, used as a struct tce_container
* @cmd: VFIO ioctl number
* @arg: ioctl argument; a user pointer to an argsz-prefixed structure for
* the GET_INFO, MAP_DMA and UNMAP_DMA commands
*
* Returns a command-specific value (0 or 1) on success, a negative errno on
* failure, and -ENOTTY for any command not handled here.
*/
static long tce_iommu_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
struct tce_container *container = iommu_data;
unsigned long minsz;
long ret;
switch (cmd) {
case VFIO_CHECK_EXTENSION:
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
ret = 1;
break;
default:
/* Anything else is delegated to the EEH helper */
ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
break;
}
/* An error from the helper is reported as "not supported" */
return (ret < 0) ? 0 : ret;
case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
struct vfio_iommu_spapr_tce_info info;
struct iommu_table *tbl = container->tbl;
if (WARN_ON(!tbl))
return -ENXIO;
/* Only the fields up to and including dma32_window_size are exchanged */
minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
dma32_window_size);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
/* Table offset and size are in 4K IOMMU pages; report them in bytes */
info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
info.flags = 0;
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;
}
case VFIO_IOMMU_MAP_DMA: {
struct vfio_iommu_type1_dma_map param;
struct iommu_table *tbl = container->tbl;
unsigned long tce, i;
if (!tbl)
return -ENXIO;
BUG_ON(!tbl->it_group);
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
if (copy_from_user(&param, (void __user *)arg, minsz))
return -EFAULT;
if (param.argsz < minsz)
return -EINVAL;
/* Only the READ and WRITE flags are accepted */
if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
/* Size and user address must both be 4K aligned */
if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
(param.vaddr & ~IOMMU_PAGE_MASK_4K))
return -EINVAL;
/* iova is checked by the IOMMU API */
/* The TCE value is the user address with the permission bits ORed in */
tce = param.vaddr;
if (param.flags & VFIO_DMA_MAP_FLAG_READ)
tce |= TCE_PCI_READ;
if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
tce |= TCE_PCI_WRITE;
ret = iommu_tce_put_param_check(tbl, param.iova, tce);
if (ret)
return ret;
/* Install one entry per 4K page, stopping at the first failure */
for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
ret = iommu_put_tce_user_mode(tbl,
(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
tce);
if (ret)
break;
tce += IOMMU_PAGE_SIZE_4K;
}
/* On failure, undo the i entries that were installed before the error */
if (ret)
iommu_clear_tces_and_put_pages(tbl,
param.iova >> IOMMU_PAGE_SHIFT_4K, i);
iommu_flush_tce(tbl);
return ret;
}
case VFIO_IOMMU_UNMAP_DMA: {
struct vfio_iommu_type1_dma_unmap param;
struct iommu_table *tbl = container->tbl;
if (WARN_ON(!tbl))
return -ENXIO;
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
size);
if (copy_from_user(&param, (void __user *)arg, minsz))
return -EFAULT;
if (param.argsz < minsz)
return -EINVAL;
/* No flag is supported now */
if (param.flags)
return -EINVAL;
/* Size must be a multiple of the 4K IOMMU page size */
if (param.size & ~IOMMU_PAGE_MASK_4K)
return -EINVAL;
ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
param.size >> IOMMU_PAGE_SHIFT_4K);
if (ret)
return ret;
ret = iommu_clear_tces_and_put_pages(tbl,
param.iova >> IOMMU_PAGE_SHIFT_4K,
param.size >> IOMMU_PAGE_SHIFT_4K);
iommu_flush_tce(tbl);
return ret;
}
case VFIO_IOMMU_ENABLE:
/* Enable and disable are serialized by the container lock */
mutex_lock(&container->lock);
ret = tce_iommu_enable(container);
mutex_unlock(&container->lock);
return ret;
case VFIO_IOMMU_DISABLE:
mutex_lock(&container->lock);
tce_iommu_disable(container);
mutex_unlock(&container->lock);
return 0;
case VFIO_EEH_PE_OP:
/* EEH operations need an attached table that carries a group */
if (!container->tbl || !container->tbl->it_group)
return -ENODEV;
return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
cmd, arg);
}
return -ENOTTY;
}
/*
 * Attach an IOMMU group to the container by taking ownership of the
 * group's TCE table.
 *
 * Only one group per container is supported, and a group cannot be
 * attached while the container is enabled; both cases return -EBUSY.
 * Otherwise returns the result of iommu_take_ownership().
 */
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table *table = iommu_group_get_iommudata(iommu_group);
	int rc;

	BUG_ON(!table);

	mutex_lock(&container->lock);

	if (container->tbl) {
		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
				iommu_group_id(container->tbl->it_group),
				iommu_group_id(iommu_group));
		rc = -EBUSY;
		goto unlock;
	}

	if (container->enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
				iommu_group_id(iommu_group));
		rc = -EBUSY;
		goto unlock;
	}

	rc = iommu_take_ownership(table);
	if (!rc)
		container->tbl = table;

unlock:
	mutex_unlock(&container->lock);

	return rc;
}
/*
 * Detach an IOMMU group from the container and release ownership of its
 * TCE table.  A group whose table is not the one held by the container is
 * only warned about.  A container that is still enabled is forcibly
 * disabled first.
 */
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table *table = iommu_group_get_iommudata(iommu_group);

	BUG_ON(!table);

	mutex_lock(&container->lock);

	if (table != container->tbl) {
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(table->it_group));
		goto unlock;
	}

	if (container->enabled) {
		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
				iommu_group_id(table->it_group));
		tce_iommu_disable(container);
	}

	container->tbl = NULL;
	iommu_release_ownership(table);

unlock:
	mutex_unlock(&container->lock);
}
/* Callback table handed to vfio_register_iommu_driver() for this backend */
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
.name = "iommu-vfio-powerpc",
.owner = THIS_MODULE,
.open = tce_iommu_open,
.release = tce_iommu_release,
.ioctl = tce_iommu_ioctl,
.attach_group = tce_iommu_attach_group,
.detach_group = tce_iommu_detach_group,
};
/* Module init: register the TCE IOMMU driver ops with the VFIO core */
static int __init tce_iommu_init(void)
{
return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}
/* Module exit: unregister the TCE IOMMU driver ops from the VFIO core */
static void __exit tce_iommu_cleanup(void)
{
vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}
/* Module entry/exit hooks and metadata */
module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);

View file

@ -0,0 +1,996 @@
/*
* VFIO: IOMMU DMA mapping support for Type1 IOMMU
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*
* We arbitrarily define a Type1 IOMMU as one matching the below code.
* It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
* VT-d, but that makes it harder to re-use as theoretically anyone
* implementing a similar IOMMU could make use of this. We expect the
* IOMMU to support the IOMMU API and have few to no restrictions around
* the IOVA range that can be mapped. The Type1 IOMMU is currently
* optimized for relatively static mappings of a userspace process with
* userspace pages pinned into memory. We also assume devices and IOMMU
* domains are PCI based as the IOMMU API is still centered around a
* device/bus interface rather than a group interface.
*/
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "Type1 IOMMU driver for VFIO"
/*
* When false, attaching a group fails with -EPERM on platforms whose IOMMU
* does not report IOMMU_CAP_INTR_REMAP.
*/
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
"Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
/*
* When true, vfio_pin_pages() pins and returns a single page per call
* instead of accumulating a physically contiguous run.
*/
static bool disable_hugepages;
module_param_named(disable_hugepages,
disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
"Disable VFIO IOMMU support for IOMMU hugepages.");
/* Per-container state of the Type1 IOMMU backend */
struct vfio_iommu {
struct list_head domain_list; /* struct vfio_domain entries, linked via ->next */
struct mutex lock; /* taken around domain_list and dma_list accesses */
struct rb_root dma_list; /* rb-tree of struct vfio_dma, ordered by iova */
bool v2; /* set for VFIO_TYPE1v2_IOMMU and VFIO_TYPE1_NESTING_IOMMU */
bool nesting; /* set for VFIO_TYPE1_NESTING_IOMMU */
};
/* One IOMMU domain and the groups attached to it */
struct vfio_domain {
struct iommu_domain *domain; /* domain obtained from iommu_domain_alloc() */
struct list_head next; /* link in vfio_iommu.domain_list */
struct list_head group_list; /* struct vfio_group entries attached to this domain */
int prot; /* IOMMU_CACHE */
};
/* One tracked user DMA mapping, stored in vfio_iommu.dma_list */
struct vfio_dma {
struct rb_node node; /* link in the dma_list rb-tree */
dma_addr_t iova; /* Device address */
unsigned long vaddr; /* Process virtual addr */
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
};
/* An IOMMU group attached to a vfio_domain */
struct vfio_group {
struct iommu_group *iommu_group; /* the attached group */
struct list_head next; /* link in vfio_domain.group_list */
};
/*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
*/
/*
 * Look up a tracked mapping that overlaps [start, start + size).
 *
 * Walks the iova-ordered rb-tree: descends left when the requested range
 * ends at or before the entry, right when it starts at or after the
 * entry's end, and otherwise returns the entry.  Returns NULL when no
 * entry overlaps.
 */
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *cur;

	for (cur = iommu->dma_list.rb_node; cur; ) {
		struct vfio_dma *entry = rb_entry(cur, struct vfio_dma, node);

		if (start + size <= entry->iova) {
			cur = cur->rb_left;
			continue;
		}

		if (start >= entry->iova + entry->size) {
			cur = cur->rb_right;
			continue;
		}

		return entry;
	}

	return NULL;
}
/*
 * Insert @new into the container's rb-tree of mappings, keyed by iova.
 * An entry that ends at or before an existing one goes to its left,
 * anything else to its right.
 */
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **slot = &iommu->dma_list.rb_node;
	struct rb_node *above = NULL;

	while (*slot) {
		struct vfio_dma *entry;

		above = *slot;
		entry = rb_entry(above, struct vfio_dma, node);

		slot = (new->iova + new->size <= entry->iova) ?
			&above->rb_left : &above->rb_right;
	}

	rb_link_node(&new->node, above, slot);
	rb_insert_color(&new->node, &iommu->dma_list);
}
/* Remove @old from the container's rb-tree of mappings; does not free it */
static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
rb_erase(&old->node, &iommu->dma_list);
}
/* Deferred locked_vm adjustment, queued by vfio_lock_acct() */
struct vwork {
struct mm_struct *mm; /* mm to adjust; reference dropped by the worker */
long npage; /* signed page delta added to mm->locked_vm */
struct work_struct work; /* work item running vfio_lock_acct_bg() */
};
/*
 * Work handler that applies a deferred locked_vm adjustment: adds the
 * queued page delta under mmap_sem, then drops the mm reference and frees
 * the work item.
 */
static void vfio_lock_acct_bg(struct work_struct *work)
{
	struct vwork *deferred = container_of(work, struct vwork, work);
	struct mm_struct *mm = deferred->mm;

	down_write(&mm->mmap_sem);
	mm->locked_vm += deferred->npage;
	up_write(&mm->mmap_sem);

	mmput(mm);
	kfree(deferred);
}
static void vfio_lock_acct(long npage)
{
struct vwork *vwork;
struct mm_struct *mm;
if (!current->mm || !npage)
return; /* process exited or nothing to do */
if (down_write_trylock(&current->mm->mmap_sem)) {
current->mm->locked_vm += npage;
up_write(&current->mm->mmap_sem);
return;
}
/*
* Couldn't get mmap_sem lock, so must setup to update
* mm->locked_vm later. If locked_vm were atomic, we
* wouldn't need this silliness
*/
vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
if (!vwork)
return;
mm = get_task_mm(current);
if (!mm) {
kfree(vwork);
return;
}
INIT_WORK(&vwork->work, vfio_lock_acct_bg);
vwork->mm = mm;
vwork->npage = npage;
schedule_work(&vwork->work);
}
/*
* Some mappings aren't backed by a struct page, for example an mmap'd
* MMIO range for our own or another device. These use a different
* pfn conversion and shouldn't be tracked as locked pages.
*
* Returns true when @pfn has no valid struct page or its page is marked
* reserved, false otherwise.
*/
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
if (pfn_valid(pfn)) {
bool reserved;
struct page *tail = pfn_to_page(pfn);
struct page *head = compound_head(tail);
/* Sample the reserved bit of the (possibly compound) head page */
reserved = !!(PageReserved(head));
if (head != tail) {
/*
* "head" is not a dangling pointer
* (compound_head takes care of that)
* but the hugepage may have been split
* from under us (and we may not hold a
* reference count on the head page so it can
* be reused before we run PageReferenced), so
* we have to check PageTail before returning
* what we just read.
*/
smp_rmb();
if (PageTail(tail))
return reserved;
}
/* Not (or no longer) a tail page: use the page's own reserved bit */
return PageReserved(tail);
}
/* No struct page behind this pfn */
return true;
}
/*
 * Drop the page reference behind @pfn, marking the page dirty first when
 * the mapping allowed device writes.
 *
 * Returns 1 when a page reference was released, 0 when @pfn is invalid or
 * reserved and was therefore left alone.
 */
static int put_pfn(unsigned long pfn, int prot)
{
	struct page *page;

	if (is_invalid_reserved_pfn(pfn))
		return 0;

	page = pfn_to_page(pfn);
	if (prot & IOMMU_WRITE)
		SetPageDirty(page);
	put_page(page);

	return 1;
}
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
struct page *page[1];
struct vm_area_struct *vma;
int ret = -EFAULT;
if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
*pfn = page_to_pfn(page[0]);
return 0;
}
down_read(&current->mm->mmap_sem);
vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
if (vma && vma->vm_flags & VM_PFNMAP) {
*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (is_invalid_reserved_pfn(*pfn))
ret = 0;
}
up_read(&current->mm->mmap_sem);
return ret;
}
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
* first page and all consecutive pages with the same locking.
*
* @vaddr: user virtual address of the first page
* @npage: maximum number of pages to pin
* @prot: IOMMU_READ/IOMMU_WRITE protection bits
* @pfn_base: receives the pfn of the first page
*
* Returns the number of pages covered starting at *@pfn_base (at least 1),
* or a negative errno: -ENODEV when the task has no mm, -ENOMEM when the
* first page would exceed RLIMIT_MEMLOCK, or the error from vaddr_get_pfn().
*/
static long vfio_pin_pages(unsigned long vaddr, long npage,
int prot, unsigned long *pfn_base)
{
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, i;
if (!current->mm)
return -ENODEV;
ret = vaddr_get_pfn(vaddr, prot, pfn_base);
if (ret)
return ret;
/* Invalid/reserved pfns are returned one at a time and never accounted */
if (is_invalid_reserved_pfn(*pfn_base))
return 1;
if (!lock_cap && current->mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
return -ENOMEM;
}
/* With hugepages disabled, account and return just the first page */
if (unlikely(disable_hugepages)) {
vfio_lock_acct(1);
return 1;
}
/* Lock all the consecutive pages from pfn_base */
for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;
ret = vaddr_get_pfn(vaddr, prot, &pfn);
if (ret)
break;
/* Stop at the first page that is not physically contiguous or is reserved */
if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
put_pfn(pfn, prot);
break;
}
/* Stop, keeping what was pinned so far, once the memlock limit is reached */
if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
put_pfn(pfn, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
break;
}
}
/* Account the i pages that remain pinned */
vfio_lock_acct(i);
return i;
}
/*
 * Release @npage consecutive pfns starting at @pfn.
 *
 * Returns how many page references were actually dropped (invalid or
 * reserved pfns are skipped by put_pfn()).  When @do_accounting is set,
 * that count is also subtracted from the task's locked_vm.
 */
static long vfio_unpin_pages(unsigned long pfn, long npage,
			     int prot, bool do_accounting)
{
	unsigned long released = 0;
	long remaining;

	for (remaining = npage; remaining > 0; remaining--, pfn++)
		released += put_pfn(pfn, prot);

	if (do_accounting)
		vfio_lock_acct(-released);

	return released;
}
/*
* Remove the IOMMU translations for @dma from every domain and unpin the
* pages behind it, then subtract the released pages from locked_vm in a
* single adjustment. Does nothing for a zero-sized mapping.
*/
static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
long unlocked = 0;
if (!dma->size)
return;
/*
* We use the IOMMU to track the physical addresses, otherwise we'd
* need a much more complicated tracking system. Unfortunately that
* means we need to use one of the iommu domains to figure out the
* pfns to unpin. The rest need to be unmapped in advance so we have
* no iommu translations remaining when the pages are unpinned.
*/
domain = d = list_first_entry(&iommu->domain_list,
struct vfio_domain, next);
/* Unmap the whole range from every domain after the first */
list_for_each_entry_continue(d, &iommu->domain_list, next)
iommu_unmap(d->domain, dma->iova, dma->size);
/* Walk the first domain, translating, unmapping and unpinning as we go */
while (iova < end) {
size_t unmapped;
phys_addr_t phys;
phys = iommu_iova_to_phys(domain->domain, iova);
/* No translation here: warn and skip one page */
if (WARN_ON(!phys)) {
iova += PAGE_SIZE;
continue;
}
/* The unmap may cover more than the PAGE_SIZE requested; use its return */
unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
if (WARN_ON(!unmapped))
break;
/* Accounting is deferred to the single vfio_lock_acct() call below */
unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
dma->prot, false);
iova += unmapped;
}
vfio_lock_acct(-unlocked);
}
/*
* Fully tear down one mapping: unmap and unpin it, remove it from the
* rb-tree, and free the tracking structure.
*/
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
vfio_unmap_unpin(iommu, dma);
vfio_unlink_dma(iommu, dma);
kfree(dma);
}
static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
struct vfio_domain *domain;
unsigned long bitmap = PAGE_MASK;
mutex_lock(&iommu->lock);
list_for_each_entry(domain, &iommu->domain_list, next)
bitmap &= domain->domain->ops->pgsize_bitmap;
mutex_unlock(&iommu->lock);
return bitmap;
}
/*
* Handle a VFIO_IOMMU_UNMAP_DMA request.
*
* @iommu: container state
* @unmap: request; on return @unmap->size holds the number of bytes unmapped
*
* Returns 0 on success, or -EINVAL when iova/size are misaligned, size is
* zero, or (v2 only) the range would bisect an existing mapping.
*/
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_unmap *unmap)
{
uint64_t mask;
struct vfio_dma *dma;
size_t unmapped = 0;
int ret = 0;
/* Alignment mask derived from the smallest supported IOMMU page size */
mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
if (unmap->iova & mask)
return -EINVAL;
if (!unmap->size || unmap->size & mask)
return -EINVAL;
WARN_ON(mask & PAGE_MASK);
mutex_lock(&iommu->lock);
/*
* vfio-iommu-type1 (v1) - User mappings were coalesced together to
* avoid tracking individual mappings. This means that the granularity
* of the original mapping was lost and the user was allowed to attempt
* to unmap any range. Depending on the contiguousness of physical
* memory and page sizes supported by the IOMMU, arbitrary unmaps may
* or may not have worked. We only guaranteed unmap granularity
* matching the original mapping; even though it was untracked here,
* the original mappings are reflected in IOMMU mappings. This
* resulted in a couple unusual behaviors. First, if a range is not
* able to be unmapped, ex. a set of 4k pages that was mapped as a
* 2M hugepage into the IOMMU, the unmap ioctl returns success but with
* a zero sized unmap. Also, if an unmap request overlaps the first
* address of a hugepage, the IOMMU will unmap the entire hugepage.
* This also returns success and the returned unmap size reflects the
* actual size unmapped.
*
* We attempt to maintain compatibility with this "v1" interface, but
* we take control out of the hands of the IOMMU. Therefore, an unmap
* request offset from the beginning of the original mapping will
* return success with zero sized unmap. And an unmap request covering
* the first iova of mapping will unmap the entire range.
*
* The v2 version of this interface intends to be more deterministic.
* Unmap requests must fully cover previous mappings. Multiple
* mappings may still be unmapped by specifying large ranges, but there
* must not be any previous mappings bisected by the range. An error
* will be returned if these conditions are not met. The v2 interface
* will only return success and a size of zero if there were no
* mappings within the range.
*/
if (iommu->v2) {
/* Reject a range that starts inside an existing mapping */
dma = vfio_find_dma(iommu, unmap->iova, 0);
if (dma && dma->iova != unmap->iova) {
ret = -EINVAL;
goto unlock;
}
/* Reject a range that ends inside an existing mapping */
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
goto unlock;
}
}
/* Remove every tracked mapping that overlaps the requested range */
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
/* v1: stop at a mapping that begins before the requested iova */
if (!iommu->v2 && unmap->iova > dma->iova)
break;
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
unlock:
mutex_unlock(&iommu->lock);
/* Report how much was unmapped */
unmap->size = unmapped;
return ret;
}
/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages.  This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE.  Better to map smaller pages than nothing.
 *
 * Maps @npage pages one PAGE_SIZE at a time starting at @iova / @pfn.  If
 * any single-page mapping fails, every page mapped by this call is
 * unmapped again before returning.
 *
 * Returns 0 on success (including npage <= 0, where nothing is mapped) or
 * the error from the failing iommu_map().
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret = 0;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	/*
	 * On failure, i is the index of the page that failed and iova is
	 * its address; pages 0..i-1 are mapped.  Step back before each
	 * unmap so that exactly those pages, including the first one, are
	 * undone and the unmapped failing page is left alone.
	 */
	for (; i < npage && i > 0; i--) {
		iova -= PAGE_SIZE;
		iommu_unmap(domain->domain, iova, PAGE_SIZE);
	}

	return ret;
}
/*
 * Map @npage pages starting at @pfn to @iova in every domain of the
 * container.
 *
 * A domain that rejects the mapping with -EBUSY gets a second attempt via
 * map_try_harder(), one page at a time.  On any other failure, or if the
 * retry fails too, the mapping is removed from the domains already
 * processed and the iommu_map() error is returned.  Returns 0 on success.
 */
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
	size_t len = npage << PAGE_SHIFT;
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, paddr, len, prot | d->prot);
		if (!ret)
			continue;

		if (ret == -EBUSY &&
		    !map_try_harder(d, iova, pfn, npage, prot))
			continue;

		goto unwind;
	}

	return 0;

unwind:
	/* Undo the domains mapped before the one that failed */
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, len);

	return ret;
}
/*
* Handle a VFIO_IOMMU_MAP_DMA request: pin the user pages behind
* [vaddr, vaddr + size) and map them at iova in every domain.
*
* Returns 0 on success, -EINVAL for bad flags/alignment/overflow/wrap,
* -EEXIST when the iova range overlaps an existing mapping, -ENOMEM when
* the tracking structure cannot be allocated, or the error from pinning or
* mapping. On failure everything mapped so far for this request is removed.
*/
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
dma_addr_t iova = map->iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
long npage;
int ret = 0, prot = 0;
uint64_t mask;
struct vfio_dma *dma;
unsigned long pfn;
/* Verify that none of our __u64 fields overflow */
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
/* Alignment mask derived from the smallest supported IOMMU page size */
mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
WARN_ON(mask & PAGE_MASK);
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
/* Need at least one permission, a non-zero size, and aligned arguments */
if (!prot || !size || (size | iova | vaddr) & mask)
return -EINVAL;
/* Don't allow IOVA or virtual address wrap */
if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
return -EINVAL;
mutex_lock(&iommu->lock);
if (vfio_find_dma(iommu, iova, size)) {
mutex_unlock(&iommu->lock);
return -EEXIST;
}
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
mutex_unlock(&iommu->lock);
return -ENOMEM;
}
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
/* Insert zero-sized and grow as we map chunks of it */
vfio_link_dma(iommu, dma);
while (size) {
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages(vaddr + dma->size,
size >> PAGE_SHIFT, prot, &pfn);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
break;
}
/* Map it! */
ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
if (ret) {
/* This chunk is not yet part of dma->size, so unpin it here */
vfio_unpin_pages(pfn, npage, prot, true);
break;
}
size -= npage << PAGE_SHIFT;
dma->size += npage << PAGE_SHIFT;
}
/* On failure, tear down whatever part of the mapping was established */
if (ret)
vfio_remove_dma(iommu, dma);
mutex_unlock(&iommu->lock);
return ret;
}
/*
 * iommu_group_for_each_dev() callback: record the bus type of the group's
 * devices in *@data and fail with -EINVAL if a device sits on a different
 * bus than one recorded earlier.
 */
static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **seen = data;

	if (!*seen || *seen == dev->bus) {
		*seen = dev->bus;
		return 0;
	}

	return -EINVAL;
}
/*
 * Replay every existing mapping of the container into a newly created
 * @domain.
 *
 * Physical addresses are looked up through the first domain already in
 * the container, and physically contiguous pages are merged into a single
 * iommu_map() call.
 *
 * Returns 0 on success (including when there is nothing to replay),
 * -EINVAL if mappings exist but there is no domain to look them up in, or
 * the error from iommu_map().
 */
static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	n = rb_first(&iommu->dma_list);

	/*
	 * If there's not a domain, there better not be any mappings.
	 * list_first_entry() never returns NULL, so the empty list has to
	 * be detected explicitly before the entry is dereferenced.
	 */
	if (list_empty(&iommu->domain_list))
		return WARN_ON(n) ? -EINVAL : 0;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			/* No translation here: warn and skip one page */
			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			/* Extend the chunk while the next page is contiguous */
			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}
/*
* Attach an IOMMU group to the container.
*
* A fresh domain is allocated and the group attached to it. If an existing
* domain with the same iommu_ops and prot accepts the group, the group is
* moved there and the fresh domain is freed; otherwise existing mappings
* are replayed into the fresh domain and it is added to the container.
*
* Returns 0 on success, -EINVAL if the group is already attached or its
* devices span bus types, -ENOMEM/-EIO on allocation failure, -EPERM when
* interrupt remapping is missing and allow_unsafe_interrupts is not set, or
* an error from the IOMMU API.
*/
static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group, *g;
struct vfio_domain *domain, *d;
struct bus_type *bus = NULL;
int ret;
mutex_lock(&iommu->lock);
/* Refuse a group that is already attached to one of our domains */
list_for_each_entry(d, &iommu->domain_list, next) {
list_for_each_entry(g, &d->group_list, next) {
if (g->iommu_group != iommu_group)
continue;
mutex_unlock(&iommu->lock);
return -EINVAL;
}
}
group = kzalloc(sizeof(*group), GFP_KERNEL);
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!group || !domain) {
ret = -ENOMEM;
goto out_free;
}
group->iommu_group = iommu_group;
/* Determine bus_type in order to allocate a domain */
ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
if (ret)
goto out_free;
domain->domain = iommu_domain_alloc(bus);
if (!domain->domain) {
ret = -EIO;
goto out_free;
}
if (iommu->nesting) {
int attr = 1;
ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
&attr);
if (ret)
goto out_domain;
}
ret = iommu_attach_group(domain->domain, iommu_group);
if (ret)
goto out_domain;
INIT_LIST_HEAD(&domain->group_list);
list_add(&group->next, &domain->group_list);
if (!allow_unsafe_interrupts &&
!iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
__func__);
ret = -EPERM;
goto out_detach;
}
if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
domain->prot |= IOMMU_CACHE;
/*
* Try to match an existing compatible domain. We don't want to
* preclude an IOMMU driver supporting multiple bus_types and being
* able to include different bus_types in the same IOMMU domain, so
* we test whether the domains use the same iommu_ops rather than
* testing if they're on the same bus_type.
*/
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
d->prot == domain->prot) {
iommu_detach_group(domain->domain, iommu_group);
if (!iommu_attach_group(d->domain, iommu_group)) {
/* The existing domain took the group; the new domain is not needed */
list_add(&group->next, &d->group_list);
iommu_domain_free(domain->domain);
kfree(domain);
mutex_unlock(&iommu->lock);
return 0;
}
/* The existing domain refused it; go back to the new domain */
ret = iommu_attach_group(domain->domain, iommu_group);
if (ret)
goto out_domain;
}
}
/* replay mappings on new domains */
ret = vfio_iommu_replay(iommu, domain);
if (ret)
goto out_detach;
list_add(&domain->next, &iommu->domain_list);
mutex_unlock(&iommu->lock);
return 0;
/* Error unwinding: each label undoes one more step than the next */
out_detach:
iommu_detach_group(domain->domain, iommu_group);
out_domain:
iommu_domain_free(domain->domain);
out_free:
kfree(domain);
kfree(group);
mutex_unlock(&iommu->lock);
return ret;
}
/*
 * Remove every tracked mapping of the container, unmapping and unpinning
 * each one, until the rb-tree is empty.
 */
static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *first;

	for (first = rb_first(&iommu->dma_list); first;
	     first = rb_first(&iommu->dma_list))
		vfio_remove_dma(iommu, rb_entry(first, struct vfio_dma, node));
}
/*
* Detach an IOMMU group from the container. The group is looked up across
* all domains; if it is not found nothing happens. A domain left without
* groups is freed, and when that was the container's only domain all
* mappings are removed first.
*/
static void vfio_iommu_type1_detach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
mutex_lock(&iommu->lock);
list_for_each_entry(domain, &iommu->domain_list, next) {
list_for_each_entry(group, &domain->group_list, next) {
if (group->iommu_group != iommu_group)
continue;
iommu_detach_group(domain->domain, iommu_group);
list_del(&group->next);
kfree(group);
/*
* Group ownership provides privilege, if the group
* list is empty, the domain goes away. If it's the
* last domain, then all the mappings go away too.
*/
if (list_empty(&domain->group_list)) {
if (list_is_singular(&iommu->domain_list))
vfio_iommu_unmap_unpin_all(iommu);
iommu_domain_free(domain->domain);
list_del(&domain->next);
kfree(domain);
}
/* Entries were deleted while iterating, so leave both loops now */
goto done;
}
}
done:
mutex_unlock(&iommu->lock);
}
/*
 * Allocate and initialise container state for one of the Type1 IOMMU
 * flavours selected by @arg.
 *
 * VFIO_TYPE1_IOMMU uses the v1 unmap semantics, VFIO_TYPE1v2_IOMMU
 * selects v2, and VFIO_TYPE1_NESTING_IOMMU selects v2 plus nesting.
 *
 * Returns the new container, ERR_PTR(-ENOMEM) on allocation failure, or
 * ERR_PTR(-EINVAL) for an unknown type.
 */
static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		/* Nesting implies the v2 interface as well */
		iommu->nesting = true;
		iommu->v2 = true;
		break;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}
/*
* Destroy a container: remove all mappings, detach and free every group,
* free every domain, then free the container itself.
*/
static void vfio_iommu_type1_release(void *iommu_data)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain, *domain_tmp;
struct vfio_group *group, *group_tmp;
/* Mappings go first, while the domains still exist to translate them */
vfio_iommu_unmap_unpin_all(iommu);
/* _safe iteration because entries are deleted and freed inside the loops */
list_for_each_entry_safe(domain, domain_tmp,
&iommu->domain_list, next) {
list_for_each_entry_safe(group, group_tmp,
&domain->group_list, next) {
iommu_detach_group(domain->domain, group->iommu_group);
list_del(&group->next);
kfree(group);
}
iommu_domain_free(domain->domain);
list_del(&domain->next);
kfree(domain);
}
kfree(iommu);
}
/*
 * Report whether every domain in the container has IOMMU_CACHE set in its
 * prot.  Returns 1 if all do (or there are no domains), 0 as soon as one
 * does not.  Takes iommu->lock while walking the list.
 */
static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *d;
	int all_cached = 1;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		all_cached = !!(d->prot & IOMMU_CACHE);
		if (!all_cached)
			break;
	}

	mutex_unlock(&iommu->lock);

	return all_cached;
}
/*
 * ioctl dispatcher for the Type1 IOMMU backend.
 *
 * @iommu_data: container state, used as a struct vfio_iommu; it is tested
 *              for NULL on the VFIO_DMA_CC_IOMMU extension check
 * @cmd: VFIO ioctl number
 * @arg: extension id for VFIO_CHECK_EXTENSION, otherwise a user pointer to
 *       an argsz-prefixed structure
 *
 * Returns 0 or 1 for VFIO_CHECK_EXTENSION, 0 on success for the other
 * commands, a negative errno on failure (-EFAULT for any failed user copy,
 * in either direction), and -ENOTTY for commands not handled here.
 */
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		/*
		 * copy_to_user() returns the number of bytes NOT copied;
		 * turn that into an errno instead of leaking it to the
		 * caller as a positive "success" value.
		 */
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		/* Only the READ and WRITE flags are accepted */
		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		/* No unmap flags are supported */
		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		/* Hand the unmapped size back; see copy_to_user() note above */
		return copy_to_user((void __user *)arg, &unmap, minsz) ?
			-EFAULT : 0;
	}

	return -ENOTTY;
}
/* Callback table handed to vfio_register_iommu_driver() for this backend */
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.name = "vfio-iommu-type1",
.owner = THIS_MODULE,
.open = vfio_iommu_type1_open,
.release = vfio_iommu_type1_release,
.ioctl = vfio_iommu_type1_ioctl,
.attach_group = vfio_iommu_type1_attach_group,
.detach_group = vfio_iommu_type1_detach_group,
};
/* Module init: register the Type1 IOMMU driver ops with the VFIO core */
static int __init vfio_iommu_type1_init(void)
{
return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}
/* Module exit: unregister the Type1 IOMMU driver ops from the VFIO core */
static void __exit vfio_iommu_type1_cleanup(void)
{
vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}
/* Module entry/exit hooks and metadata */
module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);

View file

@ -0,0 +1,100 @@
/*
* EEH functionality support for VFIO devices. The feature is only
* available on sPAPR compatible platforms.
*
* Copyright Gavin Shan, IBM Corporation 2014.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <asm/eeh.h>
#define DRIVER_VERSION "0.1"
#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation"
#define DRIVER_DESC "VFIO IOMMU SPAPR EEH"
/* We might build address mapping here for "fast" path later */
void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
{
eeh_dev_open(pdev);
}
EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open);
/* Exported hook that forwards @pdev to eeh_dev_release() */
void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
{
eeh_dev_release(pdev);
}
EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release);
/*
 * EEH ioctl helper shared with the sPAPR TCE IOMMU backend.
 *
 * @group: IOMMU group whose PE is operated on (not used for
 *         VFIO_CHECK_EXTENSION)
 * @cmd: VFIO_CHECK_EXTENSION or VFIO_EEH_PE_OP
 * @arg: extension id, or a user pointer to struct vfio_eeh_pe_op
 *
 * For VFIO_CHECK_EXTENSION returns 1 when @arg is VFIO_EEH and EEH is
 * enabled, 0 otherwise.  For VFIO_EEH_PE_OP returns the result of the
 * requested EEH operation, -ENODEV when the group has no PE, -EFAULT on a
 * failed user copy, or -EINVAL for a malformed request or unknown op.  Any
 * other @cmd returns -EINVAL.
 */
long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_eeh_pe_op op;
	struct eeh_pe *pe;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION)
		return (arg == VFIO_EEH && eeh_enabled()) ? 1 : 0;

	if (cmd != VFIO_EEH_PE_OP)
		return -EINVAL;

	pe = eeh_iommu_group_to_pe(group);
	if (!pe)
		return -ENODEV;

	minsz = offsetofend(struct vfio_eeh_pe_op, op);
	if (copy_from_user(&op, (void __user *)arg, minsz))
		return -EFAULT;

	/* No flags are defined for PE operations */
	if (op.argsz < minsz || op.flags)
		return -EINVAL;

	switch (op.op) {
	case VFIO_EEH_PE_DISABLE:
		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
	case VFIO_EEH_PE_ENABLE:
		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
	case VFIO_EEH_PE_UNFREEZE_IO:
		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
	case VFIO_EEH_PE_UNFREEZE_DMA:
		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
	case VFIO_EEH_PE_GET_STATE:
		return eeh_pe_get_state(pe);
	case VFIO_EEH_PE_RESET_DEACTIVATE:
		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
	case VFIO_EEH_PE_RESET_HOT:
		return eeh_pe_reset(pe, EEH_RESET_HOT);
	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
	case VFIO_EEH_PE_CONFIGURE:
		return eeh_pe_configure(pe);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl);
/* Module metadata */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);