Mirror of https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git, synced 2025-09-07 16:58:04 -04:00

Commit f6dfaef42e: Fixed MTP to work with TWRP
50820 changed files with 20846062 additions and 0 deletions
drivers/vfio/Kconfig (new file, 29 lines)
@@ -0,0 +1,29 @@
config VFIO_IOMMU_TYPE1
	tristate
	depends on VFIO
	default n

config VFIO_IOMMU_SPAPR_TCE
	tristate
	depends on VFIO && SPAPR_TCE_IOMMU
	default n

config VFIO_SPAPR_EEH
	tristate
	depends on EEH && VFIO_IOMMU_SPAPR_TCE
	default n

menuconfig VFIO
	tristate "VFIO Non-Privileged userspace driver framework"
	depends on IOMMU_API
	select VFIO_IOMMU_TYPE1 if X86
	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
	select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
	select ANON_INODES
	help
	  VFIO provides a framework for secure userspace device drivers.
	  See Documentation/vfio.txt for more details.

	  If you don't know what to do here, say N.

source "drivers/vfio/pci/Kconfig"
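For orientation, the sketch below shows the userspace handshake these options enable: opening the VFIO container, confirming that the Type1 IOMMU backend selected above is present, and attaching an IOMMU group. It is a minimal illustration and not part of this commit; the group number 26 is a made-up placeholder.

/* Minimal sketch (not part of this commit): open a VFIO container and
 * attach an IOMMU group, assuming CONFIG_VFIO and CONFIG_VFIO_IOMMU_TYPE1
 * are enabled.  The group number "26" is a placeholder. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* hypothetical group */

	if (container < 0 || group < 0)
		return 1;

	/* The container must support the Type1 IOMMU backend built above. */
	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return 1;

	/* Bind the group to the container, then pick the IOMMU model. */
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) ||
	    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
		return 1;

	printf("group attached\n");
	return 0;
}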
drivers/vfio/Makefile (new file, 5 lines)
@@ -0,0 +1,5 @@
obj-$(CONFIG_VFIO) += vfio.o
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
obj-$(CONFIG_VFIO_PCI) += pci/
drivers/vfio/pci/Kconfig (new file, 18 lines)
@@ -0,0 +1,18 @@
config VFIO_PCI
	tristate "VFIO support for PCI devices"
	depends on VFIO && PCI && EVENTFD
	help
	  Support for the PCI VFIO bus driver.  This is required to make
	  use of PCI drivers using the VFIO framework.

	  If you don't know what to do here, say N.

config VFIO_PCI_VGA
	bool "VFIO PCI support for VGA devices"
	depends on VFIO_PCI && X86 && VGA_ARB
	help
	  Support for VGA extension to VFIO PCI.  This exposes an additional
	  region on VGA devices for accessing legacy VGA addresses used by
	  BIOS and generic video drivers.

	  If you don't know what to do here, say N.
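For completeness, a device only reaches the VFIO_PCI driver selected here after it has been handed over through sysfs. The sketch below is an illustration, not part of this commit; the PCI address 0000:01:00.0 and the "8086 10d3" vendor:device pair are placeholders for whatever device is actually being assigned.

/* Minimal sketch (not part of this commit): hand a PCI device to vfio-pci
 * via the standard driver sysfs attributes.  Address and IDs are
 * placeholders. */
#include <stdio.h>

static void write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	/* Detach from the current host driver (harmless if already unbound). */
	write_sysfs("/sys/bus/pci/devices/0000:01:00.0/driver/unbind",
		    "0000:01:00.0");
	/* Teach vfio-pci about this vendor:device ID; it then probes and
	 * binds any matching unbound device. */
	write_sysfs("/sys/bus/pci/drivers/vfio-pci/new_id", "8086 10d3");
	return 0;
}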
drivers/vfio/pci/Makefile (new file, 4 lines)
@@ -0,0 +1,4 @@

vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o

obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
drivers/vfio/pci/vfio_pci.c (new file, 1051 lines)
File diff suppressed because it is too large.
drivers/vfio/pci/vfio_pci_config.c (new file, 1595 lines)
File diff suppressed because it is too large.
drivers/vfio/pci/vfio_pci_intrs.c (new file, 853 lines)
@@ -0,0 +1,853 @@
/*
|
||||
* VFIO PCI interrupt handling
|
||||
*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/eventfd.h>
|
||||
#include <linux/msi.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/vfio.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "vfio_pci_private.h"
|
||||
|
||||
/*
|
||||
* IRQfd - generic
|
||||
*/
|
||||
struct virqfd {
|
||||
struct vfio_pci_device *vdev;
|
||||
struct eventfd_ctx *eventfd;
|
||||
int (*handler)(struct vfio_pci_device *, void *);
|
||||
void (*thread)(struct vfio_pci_device *, void *);
|
||||
void *data;
|
||||
struct work_struct inject;
|
||||
wait_queue_t wait;
|
||||
poll_table pt;
|
||||
struct work_struct shutdown;
|
||||
struct virqfd **pvirqfd;
|
||||
};
|
||||
|
||||
static struct workqueue_struct *vfio_irqfd_cleanup_wq;
|
||||
|
||||
int __init vfio_pci_virqfd_init(void)
|
||||
{
|
||||
vfio_irqfd_cleanup_wq =
|
||||
create_singlethread_workqueue("vfio-irqfd-cleanup");
|
||||
if (!vfio_irqfd_cleanup_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void vfio_pci_virqfd_exit(void)
|
||||
{
|
||||
destroy_workqueue(vfio_irqfd_cleanup_wq);
|
||||
}
|
||||
|
||||
static void virqfd_deactivate(struct virqfd *virqfd)
|
||||
{
|
||||
queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
|
||||
}
|
||||
|
||||
static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
|
||||
unsigned long flags = (unsigned long)key;
|
||||
|
||||
if (flags & POLLIN) {
|
||||
/* An event has been signaled, call function */
|
||||
if ((!virqfd->handler ||
|
||||
virqfd->handler(virqfd->vdev, virqfd->data)) &&
|
||||
virqfd->thread)
|
||||
schedule_work(&virqfd->inject);
|
||||
}
|
||||
|
||||
if (flags & POLLHUP) {
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* The eventfd is closing, if the virqfd has not yet been
|
||||
* queued for release, as determined by testing whether the
|
||||
* vdev pointer to it is still valid, queue it now. As
|
||||
* with kvm irqfds, we know we won't race against the virqfd
|
||||
* going away because we hold wqh->lock to get here.
|
||||
*/
|
||||
if (*(virqfd->pvirqfd) == virqfd) {
|
||||
*(virqfd->pvirqfd) = NULL;
|
||||
virqfd_deactivate(virqfd);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void virqfd_ptable_queue_proc(struct file *file,
|
||||
wait_queue_head_t *wqh, poll_table *pt)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
|
||||
add_wait_queue(wqh, &virqfd->wait);
|
||||
}
|
||||
|
||||
static void virqfd_shutdown(struct work_struct *work)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
|
||||
u64 cnt;
|
||||
|
||||
eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
|
||||
flush_work(&virqfd->inject);
|
||||
eventfd_ctx_put(virqfd->eventfd);
|
||||
|
||||
kfree(virqfd);
|
||||
}
|
||||
|
||||
static void virqfd_inject(struct work_struct *work)
|
||||
{
|
||||
struct virqfd *virqfd = container_of(work, struct virqfd, inject);
|
||||
if (virqfd->thread)
|
||||
virqfd->thread(virqfd->vdev, virqfd->data);
|
||||
}
|
||||
|
||||
static int virqfd_enable(struct vfio_pci_device *vdev,
|
||||
int (*handler)(struct vfio_pci_device *, void *),
|
||||
void (*thread)(struct vfio_pci_device *, void *),
|
||||
void *data, struct virqfd **pvirqfd, int fd)
|
||||
{
|
||||
struct fd irqfd;
|
||||
struct eventfd_ctx *ctx;
|
||||
struct virqfd *virqfd;
|
||||
int ret = 0;
|
||||
unsigned int events;
|
||||
|
||||
virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
|
||||
if (!virqfd)
|
||||
return -ENOMEM;
|
||||
|
||||
virqfd->pvirqfd = pvirqfd;
|
||||
virqfd->vdev = vdev;
|
||||
virqfd->handler = handler;
|
||||
virqfd->thread = thread;
|
||||
virqfd->data = data;
|
||||
|
||||
INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
|
||||
INIT_WORK(&virqfd->inject, virqfd_inject);
|
||||
|
||||
irqfd = fdget(fd);
|
||||
if (!irqfd.file) {
|
||||
ret = -EBADF;
|
||||
goto err_fd;
|
||||
}
|
||||
|
||||
ctx = eventfd_ctx_fileget(irqfd.file);
|
||||
if (IS_ERR(ctx)) {
|
||||
ret = PTR_ERR(ctx);
|
||||
goto err_ctx;
|
||||
}
|
||||
|
||||
virqfd->eventfd = ctx;
|
||||
|
||||
/*
|
||||
* virqfds can be released by closing the eventfd or directly
|
||||
* through ioctl. These are both done through a workqueue, so
|
||||
* we update the pointer to the virqfd under lock to avoid
|
||||
* pushing multiple jobs to release the same virqfd.
|
||||
*/
|
||||
spin_lock_irq(&vdev->irqlock);
|
||||
|
||||
if (*pvirqfd) {
|
||||
spin_unlock_irq(&vdev->irqlock);
|
||||
ret = -EBUSY;
|
||||
goto err_busy;
|
||||
}
|
||||
*pvirqfd = virqfd;
|
||||
|
||||
spin_unlock_irq(&vdev->irqlock);
|
||||
|
||||
/*
|
||||
* Install our own custom wake-up handling so we are notified via
|
||||
* a callback whenever someone signals the underlying eventfd.
|
||||
*/
|
||||
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
|
||||
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
|
||||
|
||||
events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
|
||||
|
||||
/*
|
||||
* Check if there was an event already pending on the eventfd
|
||||
* before we registered and trigger it as if we didn't miss it.
|
||||
*/
|
||||
if (events & POLLIN) {
|
||||
if ((!handler || handler(vdev, data)) && thread)
|
||||
schedule_work(&virqfd->inject);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not drop the file until the irqfd is fully initialized,
|
||||
* otherwise we might race against the POLLHUP.
|
||||
*/
|
||||
fdput(irqfd);
|
||||
|
||||
return 0;
|
||||
err_busy:
|
||||
eventfd_ctx_put(ctx);
|
||||
err_ctx:
|
||||
fdput(irqfd);
|
||||
err_fd:
|
||||
kfree(virqfd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void virqfd_disable(struct vfio_pci_device *vdev,
|
||||
struct virqfd **pvirqfd)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
if (*pvirqfd) {
|
||||
virqfd_deactivate(*pvirqfd);
|
||||
*pvirqfd = NULL;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* Block until we know all outstanding shutdown jobs have completed.
|
||||
* Even if we don't queue the job, flush the wq to be sure it's
|
||||
* been released.
|
||||
*/
|
||||
flush_workqueue(vfio_irqfd_cleanup_wq);
|
||||
}
|
||||
|
||||
/*
|
||||
* INTx
|
||||
*/
|
||||
static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
|
||||
{
|
||||
if (likely(is_intx(vdev) && !vdev->virq_disabled))
|
||||
eventfd_signal(vdev->ctx[0].trigger, 1);
|
||||
}
|
||||
|
||||
void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* Masking can come from interrupt, ioctl, or config space
|
||||
* via INTx disable. The latter means this can get called
|
||||
* even when not using intx delivery. In this case, just
|
||||
* try to have the physical bit follow the virtual bit.
|
||||
*/
|
||||
if (unlikely(!is_intx(vdev))) {
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 0);
|
||||
} else if (!vdev->ctx[0].masked) {
|
||||
/*
|
||||
* Can't use check_and_mask here because we always want to
|
||||
* mask, not just when something is pending.
|
||||
*/
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 0);
|
||||
else
|
||||
disable_irq_nosync(pdev->irq);
|
||||
|
||||
vdev->ctx[0].masked = true;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is triggered by an eventfd, we can't call eventfd_signal
|
||||
* or else we'll deadlock on the eventfd wait queue. Return >0 when
|
||||
* a signal is necessary, which can then be handled via a work queue
|
||||
* or directly depending on the caller.
|
||||
*/
|
||||
static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
|
||||
void *unused)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
/*
|
||||
* Unmasking comes from ioctl or config, so again, have the
|
||||
* physical bit follow the virtual even when not using INTx.
|
||||
*/
|
||||
if (unlikely(!is_intx(vdev))) {
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(pdev, 1);
|
||||
} else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
|
||||
/*
|
||||
* A pending interrupt here would immediately trigger,
|
||||
* but we can avoid that overhead by just re-sending
|
||||
* the interrupt to the user.
|
||||
*/
|
||||
if (vdev->pci_2_3) {
|
||||
if (!pci_check_and_unmask_intx(pdev))
|
||||
ret = 1;
|
||||
} else
|
||||
enable_irq(pdev->irq);
|
||||
|
||||
vdev->ctx[0].masked = (ret > 0);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
|
||||
{
|
||||
if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
}
|
||||
|
||||
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
|
||||
{
|
||||
struct vfio_pci_device *vdev = dev_id;
|
||||
unsigned long flags;
|
||||
int ret = IRQ_NONE;
|
||||
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
|
||||
if (!vdev->pci_2_3) {
|
||||
disable_irq_nosync(vdev->pdev->irq);
|
||||
vdev->ctx[0].masked = true;
|
||||
ret = IRQ_HANDLED;
|
||||
} else if (!vdev->ctx[0].masked && /* may be shared */
|
||||
pci_check_and_mask_intx(vdev->pdev)) {
|
||||
vdev->ctx[0].masked = true;
|
||||
ret = IRQ_HANDLED;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
if (ret == IRQ_HANDLED)
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vfio_intx_enable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
if (!is_irq_none(vdev))
|
||||
return -EINVAL;
|
||||
|
||||
if (!vdev->pdev->irq)
|
||||
return -ENODEV;
|
||||
|
||||
vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
|
||||
if (!vdev->ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
vdev->num_ctx = 1;
|
||||
|
||||
/*
|
||||
* If the virtual interrupt is masked, restore it. Devices
|
||||
* supporting DisINTx can be masked at the hardware level
|
||||
* here, non-PCI-2.3 devices will have to wait until the
|
||||
* interrupt is enabled.
|
||||
*/
|
||||
vdev->ctx[0].masked = vdev->virq_disabled;
|
||||
if (vdev->pci_2_3)
|
||||
pci_intx(vdev->pdev, !vdev->ctx[0].masked);
|
||||
|
||||
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
unsigned long irqflags = IRQF_SHARED;
|
||||
struct eventfd_ctx *trigger;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
if (vdev->ctx[0].trigger) {
|
||||
free_irq(pdev->irq, vdev);
|
||||
kfree(vdev->ctx[0].name);
|
||||
eventfd_ctx_put(vdev->ctx[0].trigger);
|
||||
vdev->ctx[0].trigger = NULL;
|
||||
}
|
||||
|
||||
if (fd < 0) /* Disable only */
|
||||
return 0;
|
||||
|
||||
vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
|
||||
pci_name(pdev));
|
||||
if (!vdev->ctx[0].name)
|
||||
return -ENOMEM;
|
||||
|
||||
trigger = eventfd_ctx_fdget(fd);
|
||||
if (IS_ERR(trigger)) {
|
||||
kfree(vdev->ctx[0].name);
|
||||
return PTR_ERR(trigger);
|
||||
}
|
||||
|
||||
vdev->ctx[0].trigger = trigger;
|
||||
|
||||
if (!vdev->pci_2_3)
|
||||
irqflags = 0;
|
||||
|
||||
ret = request_irq(pdev->irq, vfio_intx_handler,
|
||||
irqflags, vdev->ctx[0].name, vdev);
|
||||
if (ret) {
|
||||
vdev->ctx[0].trigger = NULL;
|
||||
kfree(vdev->ctx[0].name);
|
||||
eventfd_ctx_put(trigger);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* INTx disable will stick across the new irq setup,
|
||||
* disable_irq won't.
|
||||
*/
|
||||
spin_lock_irqsave(&vdev->irqlock, flags);
|
||||
if (!vdev->pci_2_3 && vdev->ctx[0].masked)
|
||||
disable_irq_nosync(pdev->irq);
|
||||
spin_unlock_irqrestore(&vdev->irqlock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void vfio_intx_disable(struct vfio_pci_device *vdev)
|
||||
{
|
||||
vfio_intx_set_signal(vdev, -1);
|
||||
virqfd_disable(vdev, &vdev->ctx[0].unmask);
|
||||
virqfd_disable(vdev, &vdev->ctx[0].mask);
|
||||
vdev->irq_type = VFIO_PCI_NUM_IRQS;
|
||||
vdev->num_ctx = 0;
|
||||
kfree(vdev->ctx);
|
||||
}
|
||||
|
||||
/*
|
||||
* MSI/MSI-X
|
||||
*/
|
||||
static irqreturn_t vfio_msihandler(int irq, void *arg)
|
||||
{
|
||||
struct eventfd_ctx *trigger = arg;
|
||||
|
||||
eventfd_signal(trigger, 1);
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
int ret;
|
||||
|
||||
if (!is_irq_none(vdev))
|
||||
return -EINVAL;
|
||||
|
||||
vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
|
||||
if (!vdev->ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
if (msix) {
|
||||
int i;
|
||||
|
||||
vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
|
||||
GFP_KERNEL);
|
||||
if (!vdev->msix) {
|
||||
kfree(vdev->ctx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < nvec; i++)
|
||||
vdev->msix[i].entry = i;
|
||||
|
||||
ret = pci_enable_msix_range(pdev, vdev->msix, 1, nvec);
|
||||
if (ret < nvec) {
|
||||
if (ret > 0)
|
||||
pci_disable_msix(pdev);
|
||||
kfree(vdev->msix);
|
||||
kfree(vdev->ctx);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
ret = pci_enable_msi_range(pdev, 1, nvec);
|
||||
if (ret < nvec) {
|
||||
if (ret > 0)
|
||||
pci_disable_msi(pdev);
|
||||
kfree(vdev->ctx);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
vdev->num_ctx = nvec;
|
||||
vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
|
||||
VFIO_PCI_MSI_IRQ_INDEX;
|
||||
|
||||
if (!msix) {
|
||||
/*
|
||||
* Compute the virtual hardware field for max msi vectors -
|
||||
* it is the log base 2 of the number of vectors.
|
||||
*/
|
||||
vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
|
||||
int vector, int fd, bool msix)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
|
||||
char *name = msix ? "vfio-msix" : "vfio-msi";
|
||||
struct eventfd_ctx *trigger;
|
||||
int ret;
|
||||
|
||||
if (vector >= vdev->num_ctx)
|
||||
return -EINVAL;
|
||||
|
||||
if (vdev->ctx[vector].trigger) {
|
||||
free_irq(irq, vdev->ctx[vector].trigger);
|
||||
kfree(vdev->ctx[vector].name);
|
||||
eventfd_ctx_put(vdev->ctx[vector].trigger);
|
||||
vdev->ctx[vector].trigger = NULL;
|
||||
}
|
||||
|
||||
if (fd < 0)
|
||||
return 0;
|
||||
|
||||
vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
|
||||
name, vector, pci_name(pdev));
|
||||
if (!vdev->ctx[vector].name)
|
||||
return -ENOMEM;
|
||||
|
||||
trigger = eventfd_ctx_fdget(fd);
|
||||
if (IS_ERR(trigger)) {
|
||||
kfree(vdev->ctx[vector].name);
|
||||
return PTR_ERR(trigger);
|
||||
}
|
||||
|
||||
/*
|
||||
* The MSIx vector table resides in device memory which may be cleared
|
||||
* via backdoor resets. We don't allow direct access to the vector
|
||||
* table so even if a userspace driver attempts to save/restore around
|
||||
* such a reset it would be unsuccessful. To avoid this, restore the
|
||||
* cached value of the message prior to enabling.
|
||||
*/
|
||||
if (msix) {
|
||||
struct msi_msg msg;
|
||||
|
||||
get_cached_msi_msg(irq, &msg);
|
||||
write_msi_msg(irq, &msg);
|
||||
}
|
||||
|
||||
ret = request_irq(irq, vfio_msihandler, 0,
|
||||
vdev->ctx[vector].name, trigger);
|
||||
if (ret) {
|
||||
kfree(vdev->ctx[vector].name);
|
||||
eventfd_ctx_put(trigger);
|
||||
return ret;
|
||||
}
|
||||
|
||||
vdev->ctx[vector].trigger = trigger;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
|
||||
unsigned count, int32_t *fds, bool msix)
|
||||
{
|
||||
int i, j, ret = 0;
|
||||
|
||||
if (start + count > vdev->num_ctx)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0, j = start; i < count && !ret; i++, j++) {
|
||||
int fd = fds ? fds[i] : -1;
|
||||
ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
for (--j; j >= start; j--)
|
||||
vfio_msi_set_vector_signal(vdev, j, -1, msix);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
int i;
|
||||
|
||||
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
|
||||
|
||||
for (i = 0; i < vdev->num_ctx; i++) {
|
||||
virqfd_disable(vdev, &vdev->ctx[i].unmask);
|
||||
virqfd_disable(vdev, &vdev->ctx[i].mask);
|
||||
}
|
||||
|
||||
if (msix) {
|
||||
pci_disable_msix(vdev->pdev);
|
||||
kfree(vdev->msix);
|
||||
} else
|
||||
pci_disable_msi(pdev);
|
||||
|
||||
vdev->irq_type = VFIO_PCI_NUM_IRQS;
|
||||
vdev->num_ctx = 0;
|
||||
kfree(vdev->ctx);
|
||||
}
|
||||
|
||||
/*
|
||||
* IOCTL support
|
||||
*/
|
||||
static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
|
||||
unsigned index, unsigned start,
|
||||
unsigned count, uint32_t flags, void *data)
|
||||
{
|
||||
if (!is_intx(vdev) || start != 0 || count != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & VFIO_IRQ_SET_DATA_NONE) {
|
||||
vfio_pci_intx_unmask(vdev);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
|
||||
uint8_t unmask = *(uint8_t *)data;
|
||||
if (unmask)
|
||||
vfio_pci_intx_unmask(vdev);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
|
||||
int32_t fd = *(int32_t *)data;
|
||||
if (fd >= 0)
|
||||
return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
|
||||
vfio_send_intx_eventfd, NULL,
|
||||
&vdev->ctx[0].unmask, fd);
|
||||
|
||||
virqfd_disable(vdev, &vdev->ctx[0].unmask);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
|
||||
unsigned index, unsigned start,
|
||||
unsigned count, uint32_t flags, void *data)
|
||||
{
|
||||
if (!is_intx(vdev) || start != 0 || count != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & VFIO_IRQ_SET_DATA_NONE) {
|
||||
vfio_pci_intx_mask(vdev);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
|
||||
uint8_t mask = *(uint8_t *)data;
|
||||
if (mask)
|
||||
vfio_pci_intx_mask(vdev);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
|
||||
return -ENOTTY; /* XXX implement me */
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
|
||||
unsigned index, unsigned start,
|
||||
unsigned count, uint32_t flags, void *data)
|
||||
{
|
||||
if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
|
||||
vfio_intx_disable(vdev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
|
||||
int32_t fd = *(int32_t *)data;
|
||||
int ret;
|
||||
|
||||
if (is_intx(vdev))
|
||||
return vfio_intx_set_signal(vdev, fd);
|
||||
|
||||
ret = vfio_intx_enable(vdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = vfio_intx_set_signal(vdev, fd);
|
||||
if (ret)
|
||||
vfio_intx_disable(vdev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!is_intx(vdev))
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & VFIO_IRQ_SET_DATA_NONE) {
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
|
||||
uint8_t trigger = *(uint8_t *)data;
|
||||
if (trigger)
|
||||
vfio_send_intx_eventfd(vdev, NULL);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
|
||||
unsigned index, unsigned start,
|
||||
unsigned count, uint32_t flags, void *data)
|
||||
{
|
||||
int i;
|
||||
bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false;
|
||||
|
||||
if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
|
||||
vfio_msi_disable(vdev, msix);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!(irq_is(vdev, index) || is_irq_none(vdev)))
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
|
||||
int32_t *fds = data;
|
||||
int ret;
|
||||
|
||||
if (vdev->irq_type == index)
|
||||
return vfio_msi_set_block(vdev, start, count,
|
||||
fds, msix);
|
||||
|
||||
ret = vfio_msi_enable(vdev, start + count, msix);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = vfio_msi_set_block(vdev, start, count, fds, msix);
|
||||
if (ret)
|
||||
vfio_msi_disable(vdev, msix);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = start; i < start + count; i++) {
|
||||
if (!vdev->ctx[i].trigger)
|
||||
continue;
|
||||
if (flags & VFIO_IRQ_SET_DATA_NONE) {
|
||||
eventfd_signal(vdev->ctx[i].trigger, 1);
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
|
||||
uint8_t *bools = data;
|
||||
if (bools[i - start])
|
||||
eventfd_signal(vdev->ctx[i].trigger, 1);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
|
||||
unsigned index, unsigned start,
|
||||
unsigned count, uint32_t flags, void *data)
|
||||
{
|
||||
int32_t fd = *(int32_t *)data;
|
||||
|
||||
if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
|
||||
!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
|
||||
return -EINVAL;
|
||||
|
||||
/* DATA_NONE/DATA_BOOL enables loopback testing */
|
||||
if (flags & VFIO_IRQ_SET_DATA_NONE) {
|
||||
if (vdev->err_trigger)
|
||||
eventfd_signal(vdev->err_trigger, 1);
|
||||
return 0;
|
||||
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
|
||||
uint8_t trigger = *(uint8_t *)data;
|
||||
if (trigger && vdev->err_trigger)
|
||||
eventfd_signal(vdev->err_trigger, 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Handle SET_DATA_EVENTFD */
|
||||
if (fd == -1) {
|
||||
if (vdev->err_trigger)
|
||||
eventfd_ctx_put(vdev->err_trigger);
|
||||
vdev->err_trigger = NULL;
|
||||
return 0;
|
||||
} else if (fd >= 0) {
|
||||
struct eventfd_ctx *efdctx;
|
||||
efdctx = eventfd_ctx_fdget(fd);
|
||||
if (IS_ERR(efdctx))
|
||||
return PTR_ERR(efdctx);
|
||||
if (vdev->err_trigger)
|
||||
eventfd_ctx_put(vdev->err_trigger);
|
||||
vdev->err_trigger = efdctx;
|
||||
return 0;
|
||||
} else
|
||||
return -EINVAL;
|
||||
}
|
||||
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
|
||||
unsigned index, unsigned start, unsigned count,
|
||||
void *data)
|
||||
{
|
||||
int (*func)(struct vfio_pci_device *vdev, unsigned index,
|
||||
unsigned start, unsigned count, uint32_t flags,
|
||||
void *data) = NULL;
|
||||
|
||||
switch (index) {
|
||||
case VFIO_PCI_INTX_IRQ_INDEX:
|
||||
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
|
||||
case VFIO_IRQ_SET_ACTION_MASK:
|
||||
func = vfio_pci_set_intx_mask;
|
||||
break;
|
||||
case VFIO_IRQ_SET_ACTION_UNMASK:
|
||||
func = vfio_pci_set_intx_unmask;
|
||||
break;
|
||||
case VFIO_IRQ_SET_ACTION_TRIGGER:
|
||||
func = vfio_pci_set_intx_trigger;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case VFIO_PCI_MSI_IRQ_INDEX:
|
||||
case VFIO_PCI_MSIX_IRQ_INDEX:
|
||||
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
|
||||
case VFIO_IRQ_SET_ACTION_MASK:
|
||||
case VFIO_IRQ_SET_ACTION_UNMASK:
|
||||
/* XXX Need masking support exported */
|
||||
break;
|
||||
case VFIO_IRQ_SET_ACTION_TRIGGER:
|
||||
func = vfio_pci_set_msi_trigger;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case VFIO_PCI_ERR_IRQ_INDEX:
|
||||
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
|
||||
case VFIO_IRQ_SET_ACTION_TRIGGER:
|
||||
if (pci_is_pcie(vdev->pdev))
|
||||
func = vfio_pci_set_err_trigger;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!func)
|
||||
return -ENOTTY;
|
||||
|
||||
return func(vdev, index, start, count, flags, data);
|
||||
}
|
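The dispatcher vfio_pci_set_irqs_ioctl() at the end of this file is what userspace reaches through VFIO_DEVICE_SET_IRQS. The sketch below shows one plausible caller that routes MSI vector 0 to an eventfd, exercising the VFIO_IRQ_SET_ACTION_TRIGGER path handled by vfio_pci_set_msi_trigger() above; it is an illustration only, not part of this commit, and assumes "device" is a VFIO device fd obtained elsewhere.

/* Minimal userspace sketch (not part of this commit): wire MSI vector 0 of
 * an already-opened VFIO device fd to an eventfd. */
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int enable_msi_vector0(int device)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
	struct vfio_irq_set *set = (struct vfio_irq_set *)buf;
	int32_t efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(buf, 0, sizeof(buf));
	set->argsz = sizeof(buf);
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_MSI_IRQ_INDEX;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &efd, sizeof(efd));

	/* On success the interrupt shows up as a read()able counter on efd. */
	return ioctl(device, VFIO_DEVICE_SET_IRQS, set);
}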
drivers/vfio/pci/vfio_pci_private.h (new file, 94 lines)
@@ -0,0 +1,94 @@
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/mutex.h>
#include <linux/pci.h>

#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H

#define VFIO_PCI_OFFSET_SHIFT   40

#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

struct vfio_pci_irq_ctx {
	struct eventfd_ctx	*trigger;
	struct virqfd		*unmask;
	struct virqfd		*mask;
	char			*name;
	bool			masked;
};

struct vfio_pci_device {
	struct pci_dev		*pdev;
	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
	u8			*pci_config_map;
	u8			*vconfig;
	struct perm_bits	*msi_perm;
	spinlock_t		irqlock;
	struct mutex		igate;
	struct msix_entry	*msix;
	struct vfio_pci_irq_ctx	*ctx;
	int			num_ctx;
	int			irq_type;
	u8			msi_qmax;
	u8			msix_bar;
	u16			msix_size;
	u32			msix_offset;
	u32			rbar[7];
	bool			pci_2_3;
	bool			virq_disabled;
	bool			reset_works;
	bool			extended_caps;
	bool			bardirty;
	bool			has_vga;
	bool			needs_reset;
	struct pci_saved_state	*pci_saved_state;
	int			refcnt;
	struct eventfd_ctx	*err_trigger;
};

#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
#define irq_is(vdev, type) (vdev->irq_type == type)

extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);

extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
				   uint32_t flags, unsigned index,
				   unsigned start, unsigned count, void *data);

extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev,
				  char __user *buf, size_t count,
				  loff_t *ppos, bool iswrite);

extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
			       size_t count, loff_t *ppos, bool iswrite);

extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
			       size_t count, loff_t *ppos, bool iswrite);

extern int vfio_pci_init_perm_bits(void);
extern void vfio_pci_uninit_perm_bits(void);

extern int vfio_pci_virqfd_init(void);
extern void vfio_pci_virqfd_exit(void);

extern int vfio_config_init(struct vfio_pci_device *vdev);
extern void vfio_config_free(struct vfio_pci_device *vdev);
#endif /* VFIO_PCI_PRIVATE_H */
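The three offset macros above split the 64-bit device file offset into a region index (bits 40 and up) and a byte offset within that region. A small worked example, not part of this commit, with u64 swapped for uint64_t so it builds standalone:

/* Illustration only: region index 2 (BAR2), byte 0x10 inside it. */
#include <stdint.h>
#include <stdio.h>

#define VFIO_PCI_OFFSET_SHIFT	40
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((uint64_t)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_TO_INDEX(off)	((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK	(((uint64_t)1 << VFIO_PCI_OFFSET_SHIFT) - 1)

int main(void)
{
	uint64_t off = VFIO_PCI_INDEX_TO_OFFSET(2) + 0x10;

	/* Prints: index=2 within=0x10 */
	printf("index=%llu within=%#llx\n",
	       (unsigned long long)VFIO_PCI_OFFSET_TO_INDEX(off),
	       (unsigned long long)(off & VFIO_PCI_OFFSET_MASK));
	return 0;
}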
drivers/vfio/pci/vfio_pci_rdwr.c (new file, 238 lines)
@@ -0,0 +1,238 @@
/*
|
||||
* VFIO PCI I/O Port & MMIO access
|
||||
*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/vgaarb.h>
|
||||
|
||||
#include "vfio_pci_private.h"
|
||||
|
||||
/*
|
||||
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
|
||||
* range which is inaccessible. The excluded range drops writes and fills
|
||||
* reads with -1. This is intended for handling MSI-X vector tables and
|
||||
* leftover space for ROM BARs.
|
||||
*/
|
||||
static ssize_t do_io_rw(void __iomem *io, char __user *buf,
|
||||
loff_t off, size_t count, size_t x_start,
|
||||
size_t x_end, bool iswrite)
|
||||
{
|
||||
ssize_t done = 0;
|
||||
|
||||
while (count) {
|
||||
size_t fillable, filled;
|
||||
|
||||
if (off < x_start)
|
||||
fillable = min(count, (size_t)(x_start - off));
|
||||
else if (off >= x_end)
|
||||
fillable = count;
|
||||
else
|
||||
fillable = 0;
|
||||
|
||||
if (fillable >= 4 && !(off % 4)) {
|
||||
__le32 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 4))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite32(le32_to_cpu(val), io + off);
|
||||
} else {
|
||||
val = cpu_to_le32(ioread32(io + off));
|
||||
|
||||
if (copy_to_user(buf, &val, 4))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 4;
|
||||
} else if (fillable >= 2 && !(off % 2)) {
|
||||
__le16 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 2))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite16(le16_to_cpu(val), io + off);
|
||||
} else {
|
||||
val = cpu_to_le16(ioread16(io + off));
|
||||
|
||||
if (copy_to_user(buf, &val, 2))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 2;
|
||||
} else if (fillable) {
|
||||
u8 val;
|
||||
|
||||
if (iswrite) {
|
||||
if (copy_from_user(&val, buf, 1))
|
||||
return -EFAULT;
|
||||
|
||||
iowrite8(val, io + off);
|
||||
} else {
|
||||
val = ioread8(io + off);
|
||||
|
||||
if (copy_to_user(buf, &val, 1))
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
filled = 1;
|
||||
} else {
|
||||
/* Fill reads with -1, drop writes */
|
||||
filled = min(count, (size_t)(x_end - off));
|
||||
if (!iswrite) {
|
||||
u8 val = 0xFF;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < filled; i++)
|
||||
if (copy_to_user(buf + i, &val, 1))
|
||||
return -EFAULT;
|
||||
}
|
||||
}
|
||||
|
||||
count -= filled;
|
||||
done += filled;
|
||||
off += filled;
|
||||
buf += filled;
|
||||
}
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
|
||||
size_t count, loff_t *ppos, bool iswrite)
|
||||
{
|
||||
struct pci_dev *pdev = vdev->pdev;
|
||||
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
|
||||
int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
|
||||
size_t x_start = 0, x_end = 0;
|
||||
resource_size_t end;
|
||||
void __iomem *io;
|
||||
ssize_t done;
|
||||
|
||||
if (!pci_resource_start(pdev, bar))
|
||||
return -EINVAL;
|
||||
|
||||
end = pci_resource_len(pdev, bar);
|
||||
|
||||
if (pos >= end)
|
||||
return -EINVAL;
|
||||
|
||||
count = min(count, (size_t)(end - pos));
|
||||
|
||||
if (bar == PCI_ROM_RESOURCE) {
|
||||
/*
|
||||
* The ROM can fill less space than the BAR, so we start the
|
||||
* excluded range at the end of the actual ROM. This makes
|
||||
* filling large ROM BARs much faster.
|
||||
*/
|
||||
io = pci_map_rom(pdev, &x_start);
|
||||
if (!io)
|
||||
return -ENOMEM;
|
||||
x_end = end;
|
||||
} else if (!vdev->barmap[bar]) {
|
||||
int ret;
|
||||
|
||||
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
io = pci_iomap(pdev, bar, 0);
|
||||
if (!io) {
|
||||
pci_release_selected_regions(pdev, 1 << bar);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
vdev->barmap[bar] = io;
|
||||
} else
|
||||
io = vdev->barmap[bar];
|
||||
|
||||
if (bar == vdev->msix_bar) {
|
||||
x_start = vdev->msix_offset;
|
||||
x_end = vdev->msix_offset + vdev->msix_size;
|
||||
}
|
||||
|
||||
done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
|
||||
|
||||
if (done >= 0)
|
||||
*ppos += done;
|
||||
|
||||
if (bar == PCI_ROM_RESOURCE)
|
||||
pci_unmap_rom(pdev, io);
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
|
||||
size_t count, loff_t *ppos, bool iswrite)
|
||||
{
|
||||
int ret;
|
||||
loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
|
||||
void __iomem *iomem = NULL;
|
||||
unsigned int rsrc;
|
||||
bool is_ioport;
|
||||
ssize_t done;
|
||||
|
||||
if (!vdev->has_vga)
|
||||
return -EINVAL;
|
||||
|
||||
switch (pos) {
|
||||
case 0xa0000 ... 0xbffff:
|
||||
count = min(count, (size_t)(0xc0000 - pos));
|
||||
iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
|
||||
off = pos - 0xa0000;
|
||||
rsrc = VGA_RSRC_LEGACY_MEM;
|
||||
is_ioport = false;
|
||||
break;
|
||||
case 0x3b0 ... 0x3bb:
|
||||
count = min(count, (size_t)(0x3bc - pos));
|
||||
iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
|
||||
off = pos - 0x3b0;
|
||||
rsrc = VGA_RSRC_LEGACY_IO;
|
||||
is_ioport = true;
|
||||
break;
|
||||
case 0x3c0 ... 0x3df:
|
||||
count = min(count, (size_t)(0x3e0 - pos));
|
||||
iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
|
||||
off = pos - 0x3c0;
|
||||
rsrc = VGA_RSRC_LEGACY_IO;
|
||||
is_ioport = true;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!iomem)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = vga_get_interruptible(vdev->pdev, rsrc);
|
||||
if (ret) {
|
||||
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
|
||||
|
||||
vga_put(vdev->pdev, rsrc);
|
||||
|
||||
is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
|
||||
|
||||
if (done >= 0)
|
||||
*ppos += done;
|
||||
|
||||
return done;
|
||||
}
|
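The BAR handler above is reached with plain pread()/pwrite() on the VFIO device fd at the per-region offset. The sketch below, not part of this commit, reads the first dword of BAR0 using the offset reported by VFIO_DEVICE_GET_REGION_INFO; "device" is assumed to be an open VFIO device fd.

/* Minimal userspace sketch (not part of this commit). */
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int read_bar0_dword(int device, uint32_t *val)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR0_REGION_INDEX,
	};

	if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;

	/* info.offset carries the region index in its upper bits, exactly
	 * as VFIO_PCI_INDEX_TO_OFFSET() encodes it in vfio_pci_private.h. */
	if (pread(device, val, sizeof(*val), info.offset) != sizeof(*val))
		return -1;

	return 0;
}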
drivers/vfio/vfio.c (new file, 1509 lines)
File diff suppressed because it is too large.
drivers/vfio/vfio_iommu_spapr_tce.c (new file, 392 lines)
@@ -0,0 +1,392 @@
/*
|
||||
* VFIO: IOMMU DMA mapping support for TCE on POWER
|
||||
*
|
||||
* Copyright (C) 2013 IBM Corp. All rights reserved.
|
||||
* Author: Alexey Kardashevskiy <aik@ozlabs.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio_iommu_type1.c:
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/vfio.h>
|
||||
#include <asm/iommu.h>
|
||||
#include <asm/tce.h>
|
||||
|
||||
#define DRIVER_VERSION "0.1"
|
||||
#define DRIVER_AUTHOR "aik@ozlabs.ru"
|
||||
#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
|
||||
|
||||
static void tce_iommu_detach_group(void *iommu_data,
|
||||
struct iommu_group *iommu_group);
|
||||
|
||||
/*
|
||||
* VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
|
||||
*
|
||||
* This code handles mapping and unmapping of user data buffers
|
||||
* into DMA'ble space using the IOMMU
|
||||
*/
|
||||
|
||||
/*
|
||||
* The container descriptor supports only a single group per container.
|
||||
* Required by the API as the container is not supplied with the IOMMU group
|
||||
* at the moment of initialization.
|
||||
*/
|
||||
struct tce_container {
|
||||
struct mutex lock;
|
||||
struct iommu_table *tbl;
|
||||
bool enabled;
|
||||
};
|
||||
|
||||
static int tce_iommu_enable(struct tce_container *container)
|
||||
{
|
||||
int ret = 0;
|
||||
unsigned long locked, lock_limit, npages;
|
||||
struct iommu_table *tbl = container->tbl;
|
||||
|
||||
if (!container->tbl)
|
||||
return -ENXIO;
|
||||
|
||||
if (!current->mm)
|
||||
return -ESRCH; /* process exited */
|
||||
|
||||
if (container->enabled)
|
||||
return -EBUSY;
|
||||
|
||||
/*
|
||||
* When userspace pages are mapped into the IOMMU, they are effectively
|
||||
* locked memory, so, theoretically, we need to update the accounting
|
||||
* of locked pages on each map and unmap. For powerpc, the map unmap
|
||||
* paths can be very hot, though, and the accounting would kill
|
||||
* performance, especially since it would be difficult to impossible
|
||||
* to handle the accounting in real mode only.
|
||||
*
|
||||
* To address that, rather than precisely accounting every page, we
|
||||
* instead account for a worst case on locked memory when the iommu is
|
||||
* enabled and disabled. The worst case upper bound on locked memory
|
||||
* is the size of the whole iommu window, which is usually relatively
|
||||
* small (compared to total memory sizes) on POWER hardware.
|
||||
*
|
||||
* Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
|
||||
* that would effectively kill the guest at random points, much better
|
||||
* enforcing the limit based on the max that the guest can map.
|
||||
*/
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
|
||||
locked = current->mm->locked_vm + npages;
|
||||
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
||||
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
|
||||
pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
|
||||
rlimit(RLIMIT_MEMLOCK));
|
||||
ret = -ENOMEM;
|
||||
} else {
|
||||
|
||||
current->mm->locked_vm += npages;
|
||||
container->enabled = true;
|
||||
}
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void tce_iommu_disable(struct tce_container *container)
|
||||
{
|
||||
if (!container->enabled)
|
||||
return;
|
||||
|
||||
container->enabled = false;
|
||||
|
||||
if (!container->tbl || !current->mm)
|
||||
return;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
current->mm->locked_vm -= (container->tbl->it_size <<
|
||||
IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
}
|
||||
|
||||
static void *tce_iommu_open(unsigned long arg)
|
||||
{
|
||||
struct tce_container *container;
|
||||
|
||||
if (arg != VFIO_SPAPR_TCE_IOMMU) {
|
||||
pr_err("tce_vfio: Wrong IOMMU type\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
container = kzalloc(sizeof(*container), GFP_KERNEL);
|
||||
if (!container)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
mutex_init(&container->lock);
|
||||
|
||||
return container;
|
||||
}
|
||||
|
||||
static void tce_iommu_release(void *iommu_data)
|
||||
{
|
||||
struct tce_container *container = iommu_data;
|
||||
|
||||
WARN_ON(container->tbl && !container->tbl->it_group);
|
||||
tce_iommu_disable(container);
|
||||
|
||||
if (container->tbl && container->tbl->it_group)
|
||||
tce_iommu_detach_group(iommu_data, container->tbl->it_group);
|
||||
|
||||
mutex_destroy(&container->lock);
|
||||
|
||||
kfree(container);
|
||||
}
|
||||
|
||||
static long tce_iommu_ioctl(void *iommu_data,
|
||||
unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
struct tce_container *container = iommu_data;
|
||||
unsigned long minsz;
|
||||
long ret;
|
||||
|
||||
switch (cmd) {
|
||||
case VFIO_CHECK_EXTENSION:
|
||||
switch (arg) {
|
||||
case VFIO_SPAPR_TCE_IOMMU:
|
||||
ret = 1;
|
||||
break;
|
||||
default:
|
||||
ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
|
||||
break;
|
||||
}
|
||||
|
||||
return (ret < 0) ? 0 : ret;
|
||||
|
||||
case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
|
||||
struct vfio_iommu_spapr_tce_info info;
|
||||
struct iommu_table *tbl = container->tbl;
|
||||
|
||||
if (WARN_ON(!tbl))
|
||||
return -ENXIO;
|
||||
|
||||
minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
|
||||
dma32_window_size);
|
||||
|
||||
if (copy_from_user(&info, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (info.argsz < minsz)
|
||||
return -EINVAL;
|
||||
|
||||
info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
|
||||
info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
|
||||
info.flags = 0;
|
||||
|
||||
if (copy_to_user((void __user *)arg, &info, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
case VFIO_IOMMU_MAP_DMA: {
|
||||
struct vfio_iommu_type1_dma_map param;
|
||||
struct iommu_table *tbl = container->tbl;
|
||||
unsigned long tce, i;
|
||||
|
||||
if (!tbl)
|
||||
return -ENXIO;
|
||||
|
||||
BUG_ON(!tbl->it_group);
|
||||
|
||||
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
|
||||
|
||||
if (copy_from_user(¶m, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (param.argsz < minsz)
|
||||
return -EINVAL;
|
||||
|
||||
if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
|
||||
VFIO_DMA_MAP_FLAG_WRITE))
|
||||
return -EINVAL;
|
||||
|
||||
if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
|
||||
(param.vaddr & ~IOMMU_PAGE_MASK_4K))
|
||||
return -EINVAL;
|
||||
|
||||
/* iova is checked by the IOMMU API */
|
||||
tce = param.vaddr;
|
||||
if (param.flags & VFIO_DMA_MAP_FLAG_READ)
|
||||
tce |= TCE_PCI_READ;
|
||||
if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
|
||||
tce |= TCE_PCI_WRITE;
|
||||
|
||||
ret = iommu_tce_put_param_check(tbl, param.iova, tce);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
|
||||
ret = iommu_put_tce_user_mode(tbl,
|
||||
(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
|
||||
tce);
|
||||
if (ret)
|
||||
break;
|
||||
tce += IOMMU_PAGE_SIZE_4K;
|
||||
}
|
||||
if (ret)
|
||||
iommu_clear_tces_and_put_pages(tbl,
|
||||
param.iova >> IOMMU_PAGE_SHIFT_4K, i);
|
||||
|
||||
iommu_flush_tce(tbl);
|
||||
|
||||
return ret;
|
||||
}
|
||||
case VFIO_IOMMU_UNMAP_DMA: {
|
||||
struct vfio_iommu_type1_dma_unmap param;
|
||||
struct iommu_table *tbl = container->tbl;
|
||||
|
||||
if (WARN_ON(!tbl))
|
||||
return -ENXIO;
|
||||
|
||||
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
|
||||
size);
|
||||
|
||||
if (copy_from_user(¶m, (void __user *)arg, minsz))
|
||||
return -EFAULT;
|
||||
|
||||
if (param.argsz < minsz)
|
||||
return -EINVAL;
|
||||
|
||||
/* No flag is supported now */
|
||||
if (param.flags)
|
||||
return -EINVAL;
|
||||
|
||||
if (param.size & ~IOMMU_PAGE_MASK_4K)
|
||||
return -EINVAL;
|
||||
|
||||
ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
|
||||
param.size >> IOMMU_PAGE_SHIFT_4K);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = iommu_clear_tces_and_put_pages(tbl,
|
||||
param.iova >> IOMMU_PAGE_SHIFT_4K,
|
||||
param.size >> IOMMU_PAGE_SHIFT_4K);
|
||||
iommu_flush_tce(tbl);
|
||||
|
||||
return ret;
|
||||
}
|
||||
case VFIO_IOMMU_ENABLE:
|
||||
mutex_lock(&container->lock);
|
||||
ret = tce_iommu_enable(container);
|
||||
mutex_unlock(&container->lock);
|
||||
return ret;
|
||||
|
||||
|
||||
case VFIO_IOMMU_DISABLE:
|
||||
mutex_lock(&container->lock);
|
||||
tce_iommu_disable(container);
|
||||
mutex_unlock(&container->lock);
|
||||
return 0;
|
||||
case VFIO_EEH_PE_OP:
|
||||
if (!container->tbl || !container->tbl->it_group)
|
||||
return -ENODEV;
|
||||
|
||||
return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
|
||||
cmd, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
}
|
||||
|
||||
static int tce_iommu_attach_group(void *iommu_data,
|
||||
struct iommu_group *iommu_group)
|
||||
{
|
||||
int ret;
|
||||
struct tce_container *container = iommu_data;
|
||||
struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
|
||||
|
||||
BUG_ON(!tbl);
|
||||
mutex_lock(&container->lock);
|
||||
|
||||
/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
|
||||
iommu_group_id(iommu_group), iommu_group); */
|
||||
if (container->tbl) {
|
||||
pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
|
||||
iommu_group_id(container->tbl->it_group),
|
||||
iommu_group_id(iommu_group));
|
||||
ret = -EBUSY;
|
||||
} else if (container->enabled) {
|
||||
pr_err("tce_vfio: attaching group #%u to enabled container\n",
|
||||
iommu_group_id(iommu_group));
|
||||
ret = -EBUSY;
|
||||
} else {
|
||||
ret = iommu_take_ownership(tbl);
|
||||
if (!ret)
|
||||
container->tbl = tbl;
|
||||
}
|
||||
|
||||
mutex_unlock(&container->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void tce_iommu_detach_group(void *iommu_data,
|
||||
struct iommu_group *iommu_group)
|
||||
{
|
||||
struct tce_container *container = iommu_data;
|
||||
struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
|
||||
|
||||
BUG_ON(!tbl);
|
||||
mutex_lock(&container->lock);
|
||||
if (tbl != container->tbl) {
|
||||
pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
|
||||
iommu_group_id(iommu_group),
|
||||
iommu_group_id(tbl->it_group));
|
||||
} else {
|
||||
if (container->enabled) {
|
||||
pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
|
||||
iommu_group_id(tbl->it_group));
|
||||
tce_iommu_disable(container);
|
||||
}
|
||||
|
||||
/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
|
||||
iommu_group_id(iommu_group), iommu_group); */
|
||||
container->tbl = NULL;
|
||||
iommu_release_ownership(tbl);
|
||||
}
|
||||
mutex_unlock(&container->lock);
|
||||
}
|
||||
|
||||
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
|
||||
.name = "iommu-vfio-powerpc",
|
||||
.owner = THIS_MODULE,
|
||||
.open = tce_iommu_open,
|
||||
.release = tce_iommu_release,
|
||||
.ioctl = tce_iommu_ioctl,
|
||||
.attach_group = tce_iommu_attach_group,
|
||||
.detach_group = tce_iommu_detach_group,
|
||||
};
|
||||
|
||||
static int __init tce_iommu_init(void)
|
||||
{
|
||||
return vfio_register_iommu_driver(&tce_iommu_driver_ops);
|
||||
}
|
||||
|
||||
static void __exit tce_iommu_cleanup(void)
|
||||
{
|
||||
vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
|
||||
}
|
||||
|
||||
module_init(tce_iommu_init);
|
||||
module_exit(tce_iommu_cleanup);
|
||||
|
||||
MODULE_VERSION(DRIVER_VERSION);
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR(DRIVER_AUTHOR);
|
||||
MODULE_DESCRIPTION(DRIVER_DESC);
|
||||
|
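Both this SPAPR TCE backend and the Type1 backend that follows serve VFIO_IOMMU_MAP_DMA with the same vfio_iommu_type1_dma_map structure seen in the ioctl handler above. The sketch below, not part of this commit, maps one page of anonymous memory at IOVA 0 through a container fd that already has an IOMMU model set.

/* Minimal userspace sketch (not part of this commit). */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static int map_one_page(int container)
{
	void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (unsigned long)buf,
		.iova = 0,
		.size = 4096,
	};

	if (buf == MAP_FAILED)
		return -1;

	/* The page is pinned and accounted against RLIMIT_MEMLOCK by the
	 * backend, as the enable/accounting comments above describe. */
	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}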
drivers/vfio/vfio_iommu_type1.c (new file, 996 lines)
@@ -0,0 +1,996 @@
/*
|
||||
* VFIO: IOMMU DMA mapping support for Type1 IOMMU
|
||||
*
|
||||
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
||||
* Author: Alex Williamson <alex.williamson@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Derived from original vfio:
|
||||
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Author: Tom Lyon, pugs@cisco.com
|
||||
*
|
||||
* We arbitrarily define a Type1 IOMMU as one matching the below code.
|
||||
* It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
|
||||
* VT-d, but that makes it harder to re-use as theoretically anyone
|
||||
* implementing a similar IOMMU could make use of this. We expect the
|
||||
* IOMMU to support the IOMMU API and have few to no restrictions around
|
||||
* the IOVA range that can be mapped. The Type1 IOMMU is currently
|
||||
* optimized for relatively static mappings of a userspace process with
|
||||
* userspace pages pinned into memory. We also assume devices and IOMMU
|
||||
* domains are PCI based as the IOMMU API is still centered around a
|
||||
* device/bus interface rather than a group interface.
|
||||
*/
|
||||
|
||||
#include <linux/compat.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/iommu.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/vfio.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#define DRIVER_VERSION "0.2"
|
||||
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
|
||||
#define DRIVER_DESC "Type1 IOMMU driver for VFIO"
|
||||
|
||||
static bool allow_unsafe_interrupts;
|
||||
module_param_named(allow_unsafe_interrupts,
|
||||
allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
|
||||
MODULE_PARM_DESC(allow_unsafe_interrupts,
|
||||
"Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
|
||||
|
||||
static bool disable_hugepages;
|
||||
module_param_named(disable_hugepages,
|
||||
disable_hugepages, bool, S_IRUGO | S_IWUSR);
|
||||
MODULE_PARM_DESC(disable_hugepages,
|
||||
"Disable VFIO IOMMU support for IOMMU hugepages.");
|
||||
|
||||
struct vfio_iommu {
|
||||
struct list_head domain_list;
|
||||
struct mutex lock;
|
||||
struct rb_root dma_list;
|
||||
bool v2;
|
||||
bool nesting;
|
||||
};
|
||||
|
||||
struct vfio_domain {
|
||||
struct iommu_domain *domain;
|
||||
struct list_head next;
|
||||
struct list_head group_list;
|
||||
int prot; /* IOMMU_CACHE */
|
||||
};
|
||||
|
||||
struct vfio_dma {
|
||||
struct rb_node node;
|
||||
dma_addr_t iova; /* Device address */
|
||||
unsigned long vaddr; /* Process virtual addr */
|
||||
size_t size; /* Map size (bytes) */
|
||||
int prot; /* IOMMU_READ/WRITE */
|
||||
};
|
||||
|
||||
struct vfio_group {
|
||||
struct iommu_group *iommu_group;
|
||||
struct list_head next;
|
||||
};
|
||||
|
||||
/*
|
||||
* This code handles mapping and unmapping of user data buffers
|
||||
* into DMA'ble space using the IOMMU
|
||||
*/
|
||||
|
||||
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
|
||||
dma_addr_t start, size_t size)
|
||||
{
|
||||
struct rb_node *node = iommu->dma_list.rb_node;
|
||||
|
||||
while (node) {
|
||||
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
|
||||
|
||||
if (start + size <= dma->iova)
|
||||
node = node->rb_left;
|
||||
else if (start >= dma->iova + dma->size)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return dma;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
|
||||
{
|
||||
struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
|
||||
struct vfio_dma *dma;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
dma = rb_entry(parent, struct vfio_dma, node);
|
||||
|
||||
if (new->iova + new->size <= dma->iova)
|
||||
link = &(*link)->rb_left;
|
||||
else
|
||||
link = &(*link)->rb_right;
|
||||
}
|
||||
|
||||
rb_link_node(&new->node, parent, link);
|
||||
rb_insert_color(&new->node, &iommu->dma_list);
|
||||
}
|
||||
|
||||
static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
|
||||
{
|
||||
rb_erase(&old->node, &iommu->dma_list);
|
||||
}
|
||||
|
||||
struct vwork {
|
||||
struct mm_struct *mm;
|
||||
long npage;
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
/* delayed decrement/increment for locked_vm */
|
||||
static void vfio_lock_acct_bg(struct work_struct *work)
|
||||
{
|
||||
struct vwork *vwork = container_of(work, struct vwork, work);
|
||||
struct mm_struct *mm;
|
||||
|
||||
mm = vwork->mm;
|
||||
down_write(&mm->mmap_sem);
|
||||
mm->locked_vm += vwork->npage;
|
||||
up_write(&mm->mmap_sem);
|
||||
mmput(mm);
|
||||
kfree(vwork);
|
||||
}
|
||||
|
||||
static void vfio_lock_acct(long npage)
|
||||
{
|
||||
struct vwork *vwork;
|
||||
struct mm_struct *mm;
|
||||
|
||||
if (!current->mm || !npage)
|
||||
return; /* process exited or nothing to do */
|
||||
|
||||
if (down_write_trylock(¤t->mm->mmap_sem)) {
|
||||
current->mm->locked_vm += npage;
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Couldn't get mmap_sem lock, so must setup to update
|
||||
* mm->locked_vm later. If locked_vm were atomic, we
|
||||
* wouldn't need this silliness
|
||||
*/
|
||||
vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
|
||||
if (!vwork)
|
||||
return;
|
||||
mm = get_task_mm(current);
|
||||
if (!mm) {
|
||||
kfree(vwork);
|
||||
return;
|
||||
}
|
||||
INIT_WORK(&vwork->work, vfio_lock_acct_bg);
|
||||
vwork->mm = mm;
|
||||
vwork->npage = npage;
|
||||
schedule_work(&vwork->work);
|
||||
}
|
||||
|
||||
/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device. These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_head(tail);
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_head takes care of that),
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&current->mm->mmap_sem);

	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&current->mm->mmap_sem);

	return ret;
}

/*
 * Attempt to pin pages. We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages(unsigned long vaddr, long npage,
			   int prot, unsigned long *pfn_base)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	bool lock_cap = capable(CAP_IPC_LOCK);
	long ret, i;

	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
	if (ret)
		return ret;

	if (is_invalid_reserved_pfn(*pfn_base))
		return 1;

	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
		put_pfn(*pfn_base, prot);
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
			limit << PAGE_SHIFT);
		return -ENOMEM;
	}

	if (unlikely(disable_hugepages)) {
		vfio_lock_acct(1);
		return 1;
	}

	/* Lock all the consecutive pages from pfn_base */
	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
		unsigned long pfn = 0;

		ret = vaddr_get_pfn(vaddr, prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, prot);
			break;
		}

		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
			put_pfn(pfn, prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				__func__, limit << PAGE_SHIFT);
			break;
		}
	}

	vfio_lock_acct(i);

	return i;
}

static long vfio_unpin_pages(unsigned long pfn, long npage,
			     int prot, bool do_accounting)
{
	unsigned long unlocked = 0;
	long i;

	for (i = 0; i < npage; i++)
		unlocked += put_pfn(pfn++, prot);

	if (do_accounting)
		vfio_lock_acct(-unlocked);

	return unlocked;
}

static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	long unlocked = 0;

	if (!dma->size)
		return;
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system. Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin. The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, dma->iova, dma->size);

	while (iova < end) {
		size_t unmapped;
		phys_addr_t phys;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
		if (WARN_ON(!unmapped))
			break;

		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
					     unmapped >> PAGE_SHIFT,
					     dma->prot, false);
		iova += unmapped;
	}

	vfio_lock_acct(-unlocked);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma);
	vfio_unlink_dma(iommu, dma);
	kfree(dma);
}

static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = PAGE_MASK;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->ops->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	return bitmap;
}

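The map and unmap paths below turn this bitmap into an alignment mask of "minimum supported page size minus one". A small illustrative helper (not part of this file) showing the same arithmetic a userspace caller might use to pre-check its arguments, assuming it fetched the bitmap as iova_pgsizes via VFIO_IOMMU_GET_INFO:

#include <stdint.h>

/* The smallest supported IOMMU page size is the lowest set bit of the
 * bitmap. Example: bitmap = 0xfffff000 -> min page size 4096, so iova
 * and size must be multiples of 4096 (mask 0xfff). */
static uint64_t min_pagesize(uint64_t pgsize_bitmap)
{
	return pgsize_bitmap & (~pgsize_bitmap + 1);	/* lowest set bit */
}

static int is_aligned(uint64_t iova, uint64_t size, uint64_t pgsize_bitmap)
{
	uint64_t mask = min_pagesize(pgsize_bitmap) - 1;

	return !(iova & mask) && size && !(size & mask);
}
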
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	size_t unmapped = 0;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings. This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range. Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked. We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings. This
	 * resulted in a couple of unusual behaviors. First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap. Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU. Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with a zero sized unmap, and an unmap request
	 * covering the first iova of a mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings. Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range. An error
	 * will be returned if these conditions are not met. The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 0);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

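A hedged userspace sketch of the v2 unmap contract described above: the request must cover whole mappings, and the driver reports back how many bytes were actually unmapped. The container fd is assumed to be set up already and error handling is abbreviated.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Unmap a previously established mapping at 'iova' of 'size' bytes.
 * On return, unmap.size holds the number of bytes actually unmapped
 * (zero if nothing in the range was mapped). */
static int dma_unmap(int container_fd, __u64 iova, __u64 size)
{
	struct vfio_iommu_type1_dma_unmap unmap;

	memset(&unmap, 0, sizeof(unmap));
	unmap.argsz = sizeof(unmap);
	unmap.iova = iova;
	unmap.size = size;

	if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
		return -1;

	return unmap.size == size ? 0 : -1;
}
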
/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages. This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE. Better to map smaller pages than nothing.
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
		iommu_unmap(domain->domain, iova, PAGE_SIZE);

	return ret;
}

static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret) {
			if (ret != -EBUSY ||
			    map_try_harder(d, iova, pfn, npage, prot))
				goto unwind;
		}
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	long npage;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;
	unsigned long pfn;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		mutex_unlock(&iommu->lock);
		return -EEXIST;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		mutex_unlock(&iommu->lock);
		return -ENOMEM;
	}

	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages(vaddr + dma->size,
				       size >> PAGE_SHIFT, prot, &pfn);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
		if (ret) {
			vfio_unpin_pages(pfn, npage, prot, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	if (ret)
		vfio_remove_dma(iommu, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

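A minimal userspace sketch of feeding vfio_dma_do_map() above: allocate a page-aligned buffer, then ask the container to map it at a chosen IOVA. The container fd and the caller's choice of IOVA are assumptions for illustration; the full setup sequence is described in Documentation/vfio.txt.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map 'size' bytes of anonymous memory at device address 'iova',
 * readable and writable by the device. Returns the vaddr or MAP_FAILED. */
static void *dma_map(int container_fd, __u64 iova, size_t size)
{
	struct vfio_iommu_type1_dma_map map;
	void *vaddr;

	vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (vaddr == MAP_FAILED)
		return MAP_FAILED;

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (__u64)(unsigned long)vaddr;
	map.iova = iova;
	map.size = size;

	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map)) {
		munmap(vaddr, size);
		return MAP_FAILED;
	}

	return vaddr;
}
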
static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **bus = data;

	if (*bus && *bus != dev->bus)
		return -EINVAL;

	*bus = dev->bus;

	return 0;
}

static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
	n = rb_first(&iommu->dma_list);

	/* If there's not a domain, there better not be any mappings */
	if (WARN_ON(n && !d))
		return -EINVAL;

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}

static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *g;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL;
	int ret;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			if (g->iommu_group != iommu_group)
				continue;

			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	domain->domain = iommu_domain_alloc(bus);
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);
	if (ret)
		goto out_domain;

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
			__func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain. We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}

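For context, the attach path above runs when userspace binds an IOMMU group to a container and selects this backend. A hedged sketch of that sequence (the group path "/dev/vfio/26" is only an example and error-path cleanup is omitted; the canonical walkthrough is in Documentation/vfio.txt):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Open a container, attach one IOMMU group to it and select the
 * type1 v2 backend implemented in this file. Returns the container fd. */
static int setup_container(const char *group_path)
{
	int container, group;

	container = open("/dev/vfio/vfio", O_RDWR);
	if (container < 0)
		return -1;

	/* e.g. group_path = "/dev/vfio/26" */
	group = open(group_path, O_RDWR);
	if (group < 0)
		return -1;

	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
		return -1;

	/* Selecting the backend attaches the group(s) through the VFIO
	 * core, which ends up in vfio_iommu_type1_attach_group(). */
	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
		return -1;

	return container;
}
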
static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (group->iommu_group != iommu_group)
				continue;

			iommu_detach_group(domain->domain, iommu_group);
			list_del(&group->next);
			kfree(group);
			/*
			 * Group ownership provides privilege, if the group
			 * list is empty, the domain goes away. If it's the
			 * last domain, then all the mappings go away too.
			 */
			if (list_empty(&domain->group_list)) {
				if (list_is_singular(&iommu->domain_list))
					vfio_iommu_unmap_unpin_all(iommu);
				iommu_domain_free(domain->domain);
				list_del(&domain->next);
				kfree(domain);
			}
			goto done;
		}
	}

done:
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}

static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_group *group, *group_tmp;

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		list_for_each_entry_safe(group, group_tmp,
					 &domain->group_list, next) {
			iommu_detach_group(domain->domain, group->iommu_group);
			list_del(&group->next);
			kfree(group);
		}
		iommu_domain_free(domain->domain);
		list_del(&domain->next);
		kfree(domain);
	}

	kfree(iommu);
}

static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		return copy_to_user((void __user *)arg, &unmap, minsz);
	}

	return -ENOTTY;
}

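A short sketch of querying the handler above from userspace; VFIO_IOMMU_GET_INFO returns the merged page-size bitmap computed by vfio_pgsize_bitmap(). The container fd is assumed to have an IOMMU backend already set.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int print_iommu_info(int container_fd)
{
	struct vfio_iommu_type1_info info;

	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);

	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &info))
		return -1;

	printf("supported IOMMU page sizes: 0x%llx\n",
	       (unsigned long long)info.iova_pgsizes);
	return 0;
}
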
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);

100
drivers/vfio/vfio_spapr_eeh.c
Normal file

@@ -0,0 +1,100 @@
/*
 * EEH functionality support for VFIO devices. The feature is only
 * available on sPAPR compatible platforms.
 *
 * Copyright Gavin Shan, IBM Corporation 2014.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <asm/eeh.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"Gavin Shan, IBM Corporation"
#define DRIVER_DESC	"VFIO IOMMU SPAPR EEH"

/* We might build address mapping here for "fast" path later */
void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
{
	eeh_dev_open(pdev);
}
EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open);

void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
{
	eeh_dev_release(pdev);
}
EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release);

long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
				unsigned int cmd, unsigned long arg)
{
	struct eeh_pe *pe;
	struct vfio_eeh_pe_op op;
	unsigned long minsz;
	long ret = -EINVAL;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		if (arg == VFIO_EEH)
			ret = eeh_enabled() ? 1 : 0;
		else
			ret = 0;
		break;
	case VFIO_EEH_PE_OP:
		pe = eeh_iommu_group_to_pe(group);
		if (!pe)
			return -ENODEV;

		minsz = offsetofend(struct vfio_eeh_pe_op, op);
		if (copy_from_user(&op, (void __user *)arg, minsz))
			return -EFAULT;
		if (op.argsz < minsz || op.flags)
			return -EINVAL;

		switch (op.op) {
		case VFIO_EEH_PE_DISABLE:
			ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE);
			break;
		case VFIO_EEH_PE_ENABLE:
			ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE);
			break;
		case VFIO_EEH_PE_UNFREEZE_IO:
			ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
			break;
		case VFIO_EEH_PE_UNFREEZE_DMA:
			ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
			break;
		case VFIO_EEH_PE_GET_STATE:
			ret = eeh_pe_get_state(pe);
			break;
		case VFIO_EEH_PE_RESET_DEACTIVATE:
			ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
			break;
		case VFIO_EEH_PE_RESET_HOT:
			ret = eeh_pe_reset(pe, EEH_RESET_HOT);
			break;
		case VFIO_EEH_PE_RESET_FUNDAMENTAL:
			ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
			break;
		case VFIO_EEH_PE_CONFIGURE:
			ret = eeh_pe_configure(pe);
			break;
		default:
			ret = -EINVAL;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);

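A hedged userspace sketch of driving the EEH handler above on an sPAPR system: check the extension, then issue a PE state query through VFIO_EEH_PE_OP on an already configured container fd (error handling abbreviated):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_pe_state(int container_fd)
{
	struct vfio_eeh_pe_op op;

	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_EEH) <= 0)
		return -1;	/* EEH not available on this platform */

	memset(&op, 0, sizeof(op));
	op.argsz = sizeof(op);
	op.op = VFIO_EEH_PE_GET_STATE;

	/* On success the return value reports the PE state
	 * (one of the VFIO_EEH_PE_STATE_* values). */
	return ioctl(container_fd, VFIO_EEH_PE_OP, &op);
}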