Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

13
drivers/lguest/Kconfig Normal file
View file

@ -0,0 +1,13 @@
config LGUEST
tristate "Linux hypervisor example code"
depends on X86_32 && EVENTFD && TTY
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
"lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.

29
drivers/lguest/Makefile Normal file
View file

@ -0,0 +1,29 @@
# Guest requires the device configuration and probing code.
obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
segments.o lguest_user.o
lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
Preparation Preparation!: PREFIX=P
Guest: PREFIX=G
Drivers: PREFIX=D
Launcher: PREFIX=L
Host: PREFIX=H
Switcher: PREFIX=S
Mastery: PREFIX=M
Beer:
@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
Puppy:
@clear
@printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
@sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n"
@sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n"
@sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear

47
drivers/lguest/README Normal file
View file

@ -0,0 +1,47 @@
Welcome, friend reader, to lguest.
Lguest is an adventure, with you, the reader, as Hero. I can't think of many
5000-line projects which offer both such capability and glimpses of future
potential; it is an exciting time to be delving into the source!
But be warned; this is an arduous journey of several hours or more! And as we
know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
equivalent) to anyone I meet who has completed this documentation.
So get comfortable and keep your wits about you (both quick and humorous).
Along your way to the Noble Goal, you will also gain masterly insight into
lguest, and hypervisors and x86 virtualization in general.
Our Quest is in seven parts: (best read with C highlighting turned on)
I) Preparation
- In which our potential hero is flown quickly over the landscape for a
taste of its scope. Suitable for the armchair coders and other such
persons of faint constitution.
II) Guest
- Where we encounter the first tantalising wisps of code, and come to
understand the details of the life of a Guest kernel.
III) Drivers
- Whereby the Guest finds its voice and becomes useful, and our
understanding of the Guest is completed.
IV) Launcher
- Where we trace back to the creation of the Guest, and thus begin our
understanding of the Host.
V) Host
- Where we master the Host code, through a long and tortuous journey.
Indeed, it is here that our hero is tested in the Bit of Despair.
VI) Switcher
- Where our understanding of the intertwined nature of Guests and Hosts
is completed.
VII) Mastery
- Where our fully fledged hero grapples with the Great Question:
"What next?"
make Preparation!
Rusty Russell.

370
drivers/lguest/core.c Normal file
View file

@ -0,0 +1,370 @@
/*P:400
* This contains run_guest() which actually calls into the Host<->Guest
* Switcher and analyzes the return, such as determining if the Guest wants the
* Host to do something. This file also contains useful helper routines.
:*/
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/stddef.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <asm/paravirt.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/poll.h>
#include <asm/asm-offsets.h>
#include "lg.h"
unsigned long switcher_addr;
struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
/*H:010
* We need to set up the Switcher at a high virtual address. Remember the
* Switcher is a few hundred bytes of assembler code which actually changes the
* CPU to run the Guest, and then changes back to the Host when a trap or
* interrupt happens.
*
* The Switcher code must be at the same virtual address in the Guest as the
* Host since it will be running as the switchover occurs.
*
* Trying to map memory at a particular address is an unusual thing to do, so
* it's not a simple one-liner.
*/
/*
 * map_switcher() - allocate the Switcher pages, map them at switcher_addr
 * and copy the compiled-in Switcher text to the start of that mapping.
 *
 * Returns 0 on success, -EINVAL if the Switcher text is bigger than one page,
 * -ENOMEM if an allocation or the vm area reservation fails, or whatever
 * error map_vm_area() returned.  The labels at the bottom undo the steps
 * already taken, in reverse order.
 */
static __init int map_switcher(void)
{
int i, err;
/*
* Map the Switcher in to high memory.
*
* It turns out that if we choose the address 0xFFC00000 (4MB under the
* top virtual address), it makes setting up the page tables really
* easy.
*/
/* We assume Switcher text fits into a single page. */
if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
end_switcher_text - start_switcher_text);
return -EINVAL;
}
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
* TOTAL_SWITCHER_PAGES,
GFP_KERNEL);
if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
/*
* Now we actually allocate the pages. The Guest will see these pages,
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!lg_switcher_pages[i]) {
err = -ENOMEM;
/* "i" is the index that failed: only pages below it get freed. */
goto free_some_pages;
}
}
/*
* We place the Switcher underneath the fixmap area, which is the
* highest virtual address we can get. This is important, since we
* tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/
switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
* Now we reserve the "virtual memory area" we want. We might
* not get it in theory, but in practice it's worked so far.
* The end address needs +1 because __get_vm_area allocates an
* extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_pages;
}
/*
* This code actually sets up the pages we've allocated to appear at
* switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages.
*/
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
goto free_vma;
}
/*
* Now the Switcher is mapped at the right address, we can't fail!
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/
memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text);
printk(KERN_INFO "lguest: mapped switcher at %p\n",
switcher_vma->addr);
/* And we succeeded... */
return 0;
/* Error unwinding: each label falls through into the ones below it. */
free_vma:
vunmap(switcher_vma->addr);
free_pages:
/* Every page was allocated, so start the loop below from the top. */
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
/* Free pages i-1 down to 0; page i (if any) was never allocated. */
for (--i; i >= 0; i--)
__free_pages(lg_switcher_pages[i], 0);
kfree(lg_switcher_pages);
out:
return err;
}
/*:*/
/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
/*
 * unmap_switcher() - release everything map_switcher() set up: the mapping
 * at switcher_vma->addr, each Switcher page, and the page pointer array.
 */
static void unmap_switcher(void)
{
	unsigned int idx = 0;

	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
	vunmap(switcher_vma->addr);

	/* Give back the order-0 pages the Switcher was copied into. */
	while (idx < TOTAL_SWITCHER_PAGES) {
		__free_pages(lg_switcher_pages[idx], 0);
		idx++;
	}

	kfree(lg_switcher_pages);
}
/*H:032
* Dealing With Guest Memory.
*
* Before we go too much further into the Host, we need to grok the routines
* we use to deal with Guest memory.
*
* When the Guest gives us (what it thinks is) a physical address, we can use
* the normal copy_from_user() & copy_to_user() on the corresponding place in
* the memory region allocated by the Launcher.
*
* But we can't trust the Guest: it might be trying to access the Launcher
* code. We have to check that the range is below the pfn_limit the Launcher
* gave us. We have to make sure that addr + len doesn't give us a false
* positive by overflowing, too.
*/
bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len)
{
return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
}
/*
* This routine copies memory from the Guest. Here we can see how useful the
* kill_lguest() routine we met in the Launcher can be: we return a random
* value (all zeroes) instead of needing to return an error.
*/
/*
 * __lgread() - copy "bytes" bytes from Guest-physical "addr" into "b".
 *
 * If the range fails lguest_address_ok() or the copy faults, "b" is zeroed
 * and the Guest is killed; nothing is reported to the caller.
 */
void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
{
	/* Happy path: range is inside the Guest and the copy fully succeeded. */
	if (lguest_address_ok(cpu->lg, addr, bytes)
	    && copy_from_user(b, cpu->lg->mem_base + addr, bytes) == 0)
		return;

	/* copy_from_user should do this, but as we rely on it... */
	memset(b, 0, bytes);
	kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
}
/* This is the write (copy into Guest) version. */
/*
 * __lgwrite() - copy "bytes" bytes from "b" to Guest-physical "addr".
 *
 * The Guest is killed if the range fails lguest_address_ok() or the copy
 * faults; nothing is reported to the caller.
 */
void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
	       unsigned bytes)
{
	/* Range is inside the Guest and everything was copied: done. */
	if (lguest_address_ok(cpu->lg, addr, bytes)
	    && copy_to_user(cpu->lg->mem_base + addr, b, bytes) == 0)
		return;

	kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
}
/*:*/
/*H:030
* Let's jump straight to the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
* going around and around until something interesting happens.
*/
/*
 * run_guest() - run the Guest on this cpu until something needs attention.
 * @cpu:  the Guest cpu to run
 * @user: userspace location which receives a pending notify value
 *
 * Returns sizeof(cpu->pending_notify) after writing a notify value to @user,
 * -EFAULT if that write fails, -ERESTARTSYS if a signal is pending,
 * -ERESTART if the Guest died asking for a reboot, and -ENOENT if the Guest
 * is otherwise dead.
 */
int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
/* We stop running once the Guest is dead. */
while (!cpu->lg->dead) {
unsigned int irq;
bool more;
/* First we run any hypercalls the Guest wants done. */
if (cpu->hcall)
do_hypercalls(cpu);
/*
* It's possible the Guest did a NOTIFY hypercall to the
* Launcher.
*/
if (cpu->pending_notify) {
/*
* Does it just need to write to a registered
* eventfd (ie. the appropriate virtqueue thread)?
*/
if (!send_notify_to_eventfd(cpu)) {
/* OK, we tell the main Launcher. */
if (put_user(cpu->pending_notify, user))
return -EFAULT;
return sizeof(cpu->pending_notify);
}
}
/*
* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
* it stops us.
*/
try_to_freeze();
/* Check for signals */
if (signal_pending(current))
return -ERESTARTSYS;
/*
* Check if there are any interrupts which can be delivered now:
* if so, this sets up the handler to be executed when we next
* run the Guest.
*/
irq = interrupt_pending(cpu, &more);
/* LGUEST_IRQS (or above) means "nothing deliverable". */
if (irq < LGUEST_IRQS)
try_deliver_interrupt(cpu, irq, more);
/*
* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example.
*/
if (cpu->lg->dead)
break;
/*
* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer will wake us.
*/
if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
/*
* Just before we sleep, make sure no interrupt snuck in
* which we should be doing.
*/
if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
set_current_state(TASK_RUNNING);
else
schedule();
/* Go round again rather than entering the halted Guest. */
continue;
}
/*
* OK, now we're ready to jump into the Guest. First we put up
* the "Do Not Disturb" sign:
*/
local_irq_disable();
/* Actually run the Guest until something happens. */
lguest_arch_run_guest(cpu);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
/* Now we deal with whatever happened to the Guest. */
lguest_arch_handle_trap(cpu);
}
/* Special case: Guest is 'dead' but wants a reboot. */
if (cpu->lg->dead == ERR_PTR(-ERESTART))
return -ERESTART;
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
/*H:000
* Welcome to the Host!
*
* By this point your brain has been tickled by the Guest code and numbed by
* the Launcher code; prepare for it to be stretched by the Host code. This is
* the heart. Let's begin at the initialization routine for the Host's lg
* module.
*/
/*
 * init() - module entry point for the Host side of lguest.
 *
 * Refuses to load (-EPERM) when the kernel is not running at privilege
 * level 0, then maps the Switcher, sets up interrupts, registers the lguest
 * device and finally runs the architecture-specific setup.  If a step
 * fails, the steps before it are undone and its error code is returned.
 */
static int __init init(void)
{
	int ret;

	/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
	if (get_kernel_rpl() != 0) {
		printk("lguest is afraid of being a guest\n");
		return -EPERM;
	}

	/* Nothing to undo yet if the Switcher cannot be mapped. */
	ret = map_switcher();
	if (ret)
		return ret;

	/* A syscall vector may have to be reserved. */
	ret = init_interrupts();
	if (ret)
		goto undo_switcher;

	/* Register /dev/lguest. */
	ret = lguest_device_init();
	if (ret)
		goto undo_interrupts;

	/* Architecture-specific setup comes last. */
	lguest_arch_host_init();
	return 0;

undo_interrupts:
	free_interrupts();
undo_switcher:
	unmap_switcher();
	return ret;
}
/* Cleaning up is just the same code, backwards. With a little French. */
/*
 * fini() - module exit: remove the lguest device, release the interrupt
 * reservation, unmap the Switcher and run the architecture-specific
 * teardown, in that order.
 */
static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
unmap_switcher();
lguest_arch_host_fini();
}
/*:*/
/*
* The Host side of lguest can be a module. This is a nice way for people to
* play with it.
*/
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");

312
drivers/lguest/hypercalls.c Normal file
View file

@ -0,0 +1,312 @@
/*P:500
* Just as userspace programs request kernel operations through a system
* call, the Guest requests Host operations through a "hypercall". You might
* notice this nomenclature doesn't really follow any logic, but the name has
* been around for long enough that we're stuck with it. As you'd expect, this
* code is basically one big switch statement.
:*/
/* Copyright (C) 2006 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/ktime.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "lg.h"
/*H:120
* This is the core hypercall routine: where the Guest gets what it wants.
* Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
*/
/*
 * do_hcall() - carry out one hypercall for the Guest.
 * @cpu:  the Guest cpu making the call
 * @args: the call number (arg0) and its arguments (arg1..arg4)
 *
 * Dispatches on args->arg0.  Numbers not handled here are offered to
 * lguest_arch_do_hcall(); if that returns non-zero the Guest is killed.
 */
static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_FLUSH_ASYNC:
/*
* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls.
*/
break;
case LHCALL_SEND_INTERRUPTS:
/*
* This call does nothing too, but by breaking out of the Guest
* it makes us process any pending interrupts.
*/
break;
case LHCALL_LGUEST_INIT:
/*
* You can't get here unless you're already initialized. Don't
* do that.
*/
kill_guest(cpu, "already have lguest_data");
break;
case LHCALL_SHUTDOWN: {
char msg[128];
/*
* Shutdown is such a trivial hypercall that we do it in five
* lines right here.
*
* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored.
*/
__lgread(cpu, msg, args->arg1, sizeof(msg));
/* The Guest's text may not be terminated: force it. */
msg[sizeof(msg)-1] = '\0';
kill_guest(cpu, "CRASH: %s", msg);
/* A restart request replaces the "dead" marker checked by run_guest(). */
if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
cpu->lg->dead = ERR_PTR(-ERESTART);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the argument: */
if (args->arg1)
guest_pagetable_clear_all(cpu);
else
guest_pagetable_flush_user(cpu);
break;
/*
* All these calls simply pass the arguments through to the right
* routines.
*/
case LHCALL_NEW_PGTABLE:
guest_new_pagetable(cpu, args->arg1);
break;
case LHCALL_SET_STACK:
guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
#ifdef CONFIG_X86_PAE
/* With PAE the entry is 64 bits wide: arg3 is the low half, arg4 the high. */
guest_set_pte(cpu, args->arg1, args->arg2,
__pte(args->arg3 | (u64)args->arg4 << 32));
#else
guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
#endif
break;
case LHCALL_SET_PGD:
guest_set_pgd(cpu->lg, args->arg1, args->arg2);
break;
#ifdef CONFIG_X86_PAE
case LHCALL_SET_PMD:
guest_set_pmd(cpu->lg, args->arg1, args->arg2);
break;
#endif
case LHCALL_SET_CLOCKEVENT:
guest_set_clockevent(cpu, args->arg1);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
cpu->ts = args->arg1;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
cpu->halted = 1;
break;
case LHCALL_NOTIFY:
/* Only recorded here; run_guest() acts on pending_notify. */
cpu->pending_notify = args->arg1;
break;
default:
/* It should be an architecture-specific hypercall. */
if (lguest_arch_do_hcall(cpu, args))
kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
}
}
/*H:124
* Asynchronous hypercalls are easy: we just look in the array in the
* Guest's "struct lguest_data" to see if any new ones are marked "ready".
*
* We are careful to do these in order: obviously we respect the order the
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall).
*/
/*
 * do_async_hcalls() - run the hypercalls queued in the Guest's hcalls[] ring.
 *
 * Works from a single snapshot of hcall_status taken on entry, starting at
 * cpu->next_hcall, and stops at the first empty slot (0xFF), on a failed
 * copy to or from the Guest (which kills it), when a call sets
 * cpu->pending_notify, or after one full pass over the ring.
 */
static void do_async_hcalls(struct lg_cpu *cpu)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"'s hcalls[] ring once. */
for (i = 0; i < ARRAY_SIZE(st); i++) {
struct hcall_args args;
/*
* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring.
*/
unsigned int n = cpu->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
break;
/*
* OK, we have hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end.
*/
if (++cpu->next_hcall == LHCALL_RING_SIZE)
cpu->next_hcall = 0;
/*
* Copy the hypercall arguments into a local copy of the
* hcall_args struct.
*/
if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
sizeof(struct hcall_args))) {
kill_guest(cpu, "Fetching async hypercalls");
break;
}
/* Do the hypercall, same as a normal one. */
do_hcall(cpu, &args);
/* Mark the hypercall done. */
if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
kill_guest(cpu, "Writing result for async hypercall");
break;
}
/*
* Stop doing hypercalls if they want to notify the Launcher:
* it needs to service this first.
*/
if (cpu->pending_notify)
break;
}
}
/*
* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up:
*/
/*
 * initialize() - handle the Guest's very first hypercall.
 *
 * The call must be LHCALL_LGUEST_INIT, otherwise the Guest is killed and
 * nothing else is done.  After that check the remaining steps all run even
 * if an earlier one called kill_guest(): none of them returns early.
 */
static void initialize(struct lg_cpu *cpu)
{
/*
* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here.
*/
if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
return;
}
/* Architecture-specific part of the setup; non-zero means failure. */
if (lguest_arch_init_hypercalls(cpu))
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/*
* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data".
*/
if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
|| get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/*
* We write the current time into the Guest's data page once so it can
* set its clock.
*/
write_timestamp(cpu);
/* page_tables.c will also do some setup. */
page_table_guest_data_init(cpu);
/*
* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the old page might be (read-only) in the Guest
* pagetable.
*/
guest_pagetable_clear_all(cpu);
}
/*:*/
/*M:013
* If a Guest reads from a page (so creates a mapping) that it has never
* written to, and then the Launcher writes to it (ie. the output of a virtual
* device), the Guest will still see the old page. In practice, this never
* happens: why would the Guest read a page which it has never written to? But
* a similar scenario might one day bite us, so it's worth mentioning.
*
* Note that if we used a shared anonymous mapping in the Launcher instead of
* mapping /dev/zero private, we wouldn't worry about copy-on-write. And we
* need that to switch the Launcher to processes (away from threads) anyway.
:*/
/*H:100
* Hypercalls
*
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both types.
*/
/*
 * do_hypercalls() - service the hypercall the Guest has pending in
 * cpu->hcall, after any asynchronous ones queued in its ring.
 *
 * While the Guest has no lguest_data yet, the pending call is handed to
 * initialize() instead.
 */
void do_hypercalls(struct lg_cpu *cpu)
{
	/* Not initialized yet? This hypercall must do it, and is then done. */
	if (unlikely(!cpu->lg->lguest_data)) {
		initialize(cpu);
		cpu->hcall = NULL;
		return;
	}

	/* The Guest has initialized: the async ring goes first. */
	do_async_hcalls(cpu);

	/*
	 * If the ring stopped on a NOTIFY for the Launcher, leave the normal
	 * hypercall pending (cpu->hcall stays set) and return now.
	 */
	if (cpu->pending_notify)
		return;

	do_hcall(cpu, cpu->hcall);

	/*
	 * Tricky point: clearing the hcall pointer is what marks the
	 * hypercall as "done"; the pointer, not the trap number, says whether
	 * one is pending.  Normally the Guest runs again and updates the trap
	 * number before we get back here, but if we are signalled or the
	 * Guest sends I/O to the Launcher, run_guest() exits without running
	 * the Guest and would otherwise re-run this hypercall on return.
	 */
	cpu->hcall = NULL;
}
/*
* This routine supplies the Guest with time: it's used for wallclock time at
* initial boot and as a rough time source if the TSC isn't available.
*/
void write_timestamp(struct lg_cpu *cpu)
{
struct timespec now;
ktime_get_real_ts(&now);
if (copy_to_user(&cpu->lg->lguest_data->time,
&now, sizeof(struct timespec)))
kill_guest(cpu, "Writing timestamp");
}

View file

@ -0,0 +1,657 @@
/*P:800
* Interrupts (traps) are complicated enough to earn their own file.
* There are three classes of interrupts:
*
* 1) Real hardware interrupts which occur while we're running the Guest,
* 2) Interrupts for virtual devices attached to the Guest, and
* 3) Traps and faults from the Guest.
*
* Real hardware interrupts must be delivered to the Host, not the Guest.
* Virtual interrupts must be delivered to the Guest, but we make them look
* just like real hardware would deliver them. Traps from the Guest can be set
* up to go directly back into the Guest, but sometimes the Host wants to see
* them first, so we also have a way of "reflecting" them into the Guest as if
* they had been delivered to it directly.
:*/
#include <linux/uaccess.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "lg.h"
/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
static unsigned int syscall_vector = SYSCALL_VECTOR;
module_param(syscall_vector, uint, 0444);
/* The address of the interrupt handler is split into two bits: */
/*
 * idt_address() - rebuild the handler address from the two words of an IDT
 * entry: bits 0-15 come from "lo", bits 16-31 from "hi".
 */
static unsigned long idt_address(u32 lo, u32 hi)
{
	const u32 low_half = lo & 0x0000FFFF;
	const u32 high_half = hi & 0xFFFF0000;

	return low_half | high_half;
}
/*
* The "type" of the interrupt handler is a 4 bit field: we only support a
* couple of types.
*/
/*
 * idt_type() - extract the 4-bit gate type, held in bits 8-11 of the high
 * word of an IDT entry.  "lo" is accepted for symmetry but not used.
 */
static int idt_type(u32 lo, u32 hi)
{
	const u32 shifted = hi >> 8;

	return shifted & 0xF;
}
/* An IDT entry can't be used unless the "present" bit is set. */
/*
 * idt_present() - true if the "present" bit (0x8000 of the high word) is set
 * in an IDT entry.  "lo" is accepted for symmetry but not used.
 */
static bool idt_present(u32 lo, u32 hi)
{
	const u32 present_bit = 0x8000;

	return (hi & present_bit) != 0;
}
/*
* We need a helper to "push" a value onto the Guest's stack, since that's a
* big part of what delivering an interrupt does.
*/
/*
 * push_guest_stack() - push one 32-bit value onto the Guest's stack.
 * @cpu:    the Guest cpu
 * @gstack: in/out stack position as used by lgwrite(); lowered by 4
 * @val:    the value to store
 */
static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
{
	/* A push moves the stack position down one slot, then fills it. */
	*gstack = *gstack - 4;
	lgwrite(cpu, *gstack, u32, val);
}
/*H:210
* The set_guest_interrupt() routine actually delivers the interrupt or
* trap. The mechanics of delivering traps and interrupts to the Guest are the
* same, except some traps have an "error code" which gets pushed onto the
* stack as well: the caller tells us if this is one.
*
* "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
* interrupt or trap. It's split into two parts for traditional reasons: gcc
* on i386 used to be frightened by 64 bit numbers.
*
* We set up the stack just like the CPU does for a real interrupt, so it's
* identical for the Guest (and the standard "iret" instruction will undo
* it).
*/
/*
 * set_guest_interrupt() - divert the Guest into an interrupt/trap handler.
 * @cpu:     the Guest cpu
 * @lo:      low word of the IDT entry for the handler
 * @hi:      high word of the IDT entry for the handler
 * @has_err: true if cpu->regs->errcode must be pushed as well
 *
 * Pushes the interrupted context onto the Guest stack and rewrites
 * ss/esp/cs/eip in cpu->regs so the Guest resumes in the handler.
 */
static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
bool has_err)
{
unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
unsigned long virtstack;
/*
* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
* userspace. We check the privilege level to find out.
*/
if ((cpu->regs->ss&0x3) != GUEST_PL) {
/*
* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment.
*/
virtstack = cpu->esp1;
ss = cpu->ss1;
origstack = gstack = guest_pa(cpu, virtstack);
/*
* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
* levels and expect these here.
*/
push_guest_stack(cpu, &gstack, cpu->regs->ss);
push_guest_stack(cpu, &gstack, cpu->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
virtstack = cpu->regs->esp;
ss = cpu->regs->ss;
origstack = gstack = guest_pa(cpu, virtstack);
}
/*
* Remember that we never let the Guest actually disable interrupts, so
* the "Interrupt Flag" bit is always set. We copy that bit from the
* Guest's "irq_enabled" field into the eflags word: we saw the Guest
* copy it back in "lguest_iret".
*/
/* Only this local copy (the one pushed below) has IF adjusted. */
eflags = cpu->regs->eflags;
if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
&& !(irq_enable & X86_EFLAGS_IF))
eflags &= ~X86_EFLAGS_IF;
/*
* An interrupt is expected to push three things on the stack: the old
* "eflags" word, the old code segment, and the old instruction
* pointer.
*/
push_guest_stack(cpu, &gstack, eflags);
push_guest_stack(cpu, &gstack, cpu->regs->cs);
push_guest_stack(cpu, &gstack, cpu->regs->eip);
/* For the six traps which supply an error code, we push that, too. */
if (has_err)
push_guest_stack(cpu, &gstack, cpu->regs->errcode);
/*
* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute.
*/
cpu->regs->ss = ss;
/* (gstack - origstack) is how far the pushes moved the stack position. */
cpu->regs->esp = virtstack + (gstack - origstack);
cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
cpu->regs->eip = idt_address(lo, hi);
/*
* Trapping always clears these flags:
* TF: Trap flag
* VM: Virtual 8086 mode
* RF: Resume
* NT: Nested task.
*/
cpu->regs->eflags &=
~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
/*
* There are two kinds of interrupt handlers: 0xE is an "interrupt
* gate" which expects interrupts to be disabled on entry.
*/
if (idt_type(lo, hi) == 0xE)
if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
kill_guest(cpu, "Disabling interrupts");
}
/*H:205
* Virtual Interrupts.
*
* interrupt_pending() returns the first pending interrupt which isn't blocked
* by the Guest. It is called before every entry to the Guest, and just before
* we go to sleep when the Guest has halted itself.
*/
/*
 * interrupt_pending() - find the first pending interrupt the Guest has not
 * blocked.
 * @cpu:  the Guest cpu
 * @more: set to true if a further unblocked interrupt is pending after the
 *        one returned, false otherwise
 *
 * Returns the interrupt number, or LGUEST_IRQS if there is nothing to
 * deliver.  If the Guest is not initialized yet, or its blocked_interrupts
 * bitmap cannot be read, LGUEST_IRQS is returned and *more is left untouched.
 */
unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
{
	unsigned int irq;
	DECLARE_BITMAP(blk, LGUEST_IRQS);

	/* If the Guest hasn't even initialized yet, we can do nothing. */
	if (!cpu->lg->lguest_data)
		return LGUEST_IRQS;

	/*
	 * Take our "irqs_pending" array and remove any interrupts the Guest
	 * wants blocked: the result ends up in "blk".
	 */
	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
			   sizeof(blk)))
		return LGUEST_IRQS;
	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);

	/* Find the first interrupt. */
	irq = find_first_bit(blk, LGUEST_IRQS);

	/*
	 * find_next_bit() returns a bit index and reports "none found" as
	 * LGUEST_IRQS, which is non-zero.  Storing that straight into a bool
	 * made *more always true, so the caller never cleared the Guest's
	 * irq_pending flag.  Compare against the bitmap size instead.
	 */
	*more = irq < LGUEST_IRQS &&
		find_next_bit(blk, LGUEST_IRQS, irq + 1) < LGUEST_IRQS;

	return irq;
}
/*
* This actually diverts the Guest to running an interrupt handler, once an
* interrupt has been identified by interrupt_pending().
*/
/*
 * try_deliver_interrupt() - try to divert the Guest into the handler for irq.
 * @cpu:  the Guest cpu
 * @irq:  interrupt number, must be below LGUEST_IRQS
 * @more: true if further interrupts are waiting after this one
 *
 * Nothing is delivered while the Guest's eip is inside its no-interrupt
 * range or (unless it is halted) it has interrupts disabled.  The interrupt
 * stays pending if the Guest has no present IDT entry for it.
 */
void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
{
struct desc_struct *idt;
BUG_ON(irq >= LGUEST_IRQS);
/*
* They may be in the middle of an iret, where they asked us never to
* deliver interrupts.
*/
if (cpu->regs->eip >= cpu->lg->noirq_start &&
(cpu->regs->eip < cpu->lg->noirq_end))
return;
/* If they're halted, interrupts restart them. */
if (cpu->halted) {
/* Re-enable interrupts. */
if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
kill_guest(cpu, "Re-enabling interrupts");
cpu->halted = 0;
} else {
/* Otherwise we check if they have interrupts disabled. */
u32 irq_enabled;
/* An unreadable flag is treated the same as "disabled". */
if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
irq_enabled = 0;
if (!irq_enabled) {
/* Make sure they know an IRQ is pending. */
/* NOTE(review): the result of this put_user() is not checked. */
put_user(X86_EFLAGS_IF,
&cpu->lg->lguest_data->irq_pending);
return;
}
}
/*
* Look at the IDT entry the Guest gave us for this interrupt. The
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
* over them.
*/
idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt->a, idt->b)) {
/* OK, mark it no longer pending and deliver it. */
clear_bit(irq, cpu->irqs_pending);
/*
* set_guest_interrupt() takes the interrupt descriptor and a
* flag to say whether this interrupt pushes an error code onto
* the stack as well: virtual interrupts never do.
*/
set_guest_interrupt(cpu, idt->a, idt->b, false);
}
/*
* Every time we deliver an interrupt, we update the timestamp in the
* Guest's lguest_data struct. It would be better for the Guest if we
* did this more often, but it can actually be quite slow: doing it
* here is a compromise which means at least it gets updated every
* timer interrupt.
*/
/* Note: this also runs when the IDT entry above was not present. */
write_timestamp(cpu);
/*
* If there are no other interrupts we want to deliver, clear
* the pending flag.
*/
/* NOTE(review): the result of this put_user() is not checked either. */
if (!more)
put_user(0, &cpu->lg->lguest_data->irq_pending);
}
/* And this is the routine when we want to set an interrupt for the Guest. */
/*
 * set_interrupt() - mark "irq" pending for the Guest and prod the task
 * running it so the interrupt gets noticed.
 */
void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
{
	/*
	 * Next time the Guest runs, the core code will see if it can deliver
	 * this interrupt.
	 */
	set_bit(irq, cpu->irqs_pending);

	/* The task may be asleep (eg. the Guest halted): waking it is enough. */
	if (wake_up_process(cpu->tsk))
		return;

	/*
	 * Nothing was woken, so it may be running the Guest right now:
	 * kick_process() will knock it out.
	 */
	kick_process(cpu->tsk);
}
/*:*/
/*
* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
* me a patch, so we support that too. It'd be a big step for lguest if half
* the Plan 9 user base were to start using it.
*
* Actually now I think of it, it's possible that Ron *is* half the Plan 9
* userbase. Oh well.
*/
static bool could_be_syscall(unsigned int num)
{
/* Normal Linux SYSCALL_VECTOR or reserved vector? */
return num == SYSCALL_VECTOR || num == syscall_vector;
}
/* The syscall vector it wants must be unused by Host. */
/*
 * check_syscall_vector() - read the syscall vector the Guest asked for from
 * its lguest_data and check it with could_be_syscall().
 *
 * Returns false if the value cannot be read or is not an acceptable vector.
 */
bool check_syscall_vector(struct lguest *lg)
{
	u32 wanted;

	if (get_user(wanted, &lg->lguest_data->syscall_vec) != 0)
		return false;

	return could_be_syscall(wanted);
}
int init_interrupts(void)
{
	/* The standard system call vector needs no reservation. */
	if (syscall_vector == SYSCALL_VECTOR)
		return 0;

	/* A non-standard vector must not already be in use by the Host. */
	if (test_bit(syscall_vector, used_vectors) ||
	    vector_used_by_percpu_irq(syscall_vector)) {
		printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
		       syscall_vector);
		return -EBUSY;
	}

	/* Claim it; free_interrupts() releases it again. */
	set_bit(syscall_vector, used_vectors);
	return 0;
}
void free_interrupts(void)
{
	/* Only a non-standard vector was ever reserved by init_interrupts(). */
	if (syscall_vector == SYSCALL_VECTOR)
		return;

	clear_bit(syscall_vector, used_vectors);
}
/*H:220
* Now we've got the routines to deliver interrupts, delivering traps like
* page fault is easy. The only trick is that Intel decided that some traps
* should have error codes:
*/
static bool has_err(unsigned int trap)
{
	/* The two isolated trap numbers which push an error code. */
	if (trap == 8 || trap == 17)
		return true;

	/* Plus the contiguous run from 10 to 14 inclusive. */
	return trap >= 10 && trap <= 14;
}
/* deliver_trap() returns true if it could deliver the trap. */
bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
{
/*
* Trap numbers are always 8 bit, but we set an impossible trap number
* for traps inside the Switcher, so check that here.
*/
if (num >= ARRAY_SIZE(cpu->arch.idt))
return false;
/*
* Early on the Guest hasn't set the IDT entries (or maybe it put a
* bogus one in): if we fail here, the Guest will be killed.
*/
if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
return false;
set_guest_interrupt(cpu, cpu->arch.idt[num].a,
cpu->arch.idt[num].b, has_err(num));
return true;
}
/*H:250
* Here's the hard part: returning to the Host every time a trap happens
* and then calling deliver_trap() and re-entering the Guest is slow.
* Particularly because Guest userspace system calls are traps (usually trap
* 128).
*
* So we'd like to set up the IDT to tell the CPU to deliver traps directly
* into the Guest. This is possible, but the complexities cause the size of
* this file to double! However, 150 lines of code is worth writing for taking
* system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
* the other hypervisors would beat it up at lunchtime.
*
* This routine indicates if a particular trap number could be delivered
* directly.
*/
static bool direct_trap(unsigned int num)
{
	/*
	 * Vectors in the hardware interrupt range never go to the Guest,
	 * with the single exception of a system call vector.
	 */
	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
		return false;

	/*
	 * The Host must see device-not-available (7, TS handling), general
	 * protection faults (13, in/out emulation) and page faults (14,
	 * shadow paging and saving the fault address) itself.
	 */
	if (num == 7 || num == 13 || num == 14)
		return false;

	/* And of course the hypercall trap always belongs to the Host. */
	return num != LGUEST_TRAP_ENTRY;
}
/*:*/
/*M:005
* The Guest has the ability to turn its interrupt gates into trap gates,
* if it is careful. The Host will let trap gates go directly to the
* Guest, but the Guest needs the interrupts atomically disabled for an
* interrupt gate. It can do this by pointing the trap gate at instructions
* within noirq_start and noirq_end, where it can safely disable interrupts.
*/
/*M:006
* The Guests do not use the sysenter (fast system call) instruction,
* because it's hardcoded to enter privilege level 0 and so can't go direct.
* It's about twice as fast as the older "int 0x80" system call, so it might
* still be worthwhile to handle it in the Switcher and lcall down to the
* Guest. The sysenter semantics are hairy tho: search for that keyword in
* entry.S
:*/
/*H:260
* When we make traps go directly into the Guest, we need to make sure
* the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
* CPU trying to deliver the trap will fault while trying to push the interrupt
* words on the stack: this is called a double fault, and it forces us to kill
* the Guest.
*
* Which is deeply unfair, because (literally!) it wasn't the Guest's fault.
*/
void pin_stack_pages(struct lg_cpu *cpu)
{
	unsigned int page = 0;

	/*
	 * esp1 is the address just past the Guest kernel stack (ie. the start
	 * of the following page), so "esp1 - 1" lies inside the topmost stack
	 * page and every further stack page sits PAGE_SIZE below the previous
	 * one.  The Guest told us how many pages (one or two) there are.
	 */
	while (page < cpu->lg->stack_pages) {
		unsigned long vaddr = cpu->esp1 - 1 - page * PAGE_SIZE;

		pin_page(cpu, vaddr);
		page++;
	}
}
/*
* Direct traps also mean that we need to know whenever the Guest wants to use
* a different kernel stack, so we can change the guest TSS to use that
* stack. The TSS entries expect a virtual address, so unlike most addresses
* the Guest gives us, the "esp" (stack pointer) value here is virtual, not
* physical.
*
* In Linux each process has its own kernel stack, so this happens a lot: we
* change stacks on each context switch.
*/
void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
{
	/*
	 * You're not allowed a stack segment with privilege level 0: bad Guest!
	 *
	 * The Guest is dead once kill_guest() has been called, so there is no
	 * point recording or pinning anything for it: bail out.
	 */
	if ((seg & 0x3) != GUEST_PL) {
		kill_guest(cpu, "bad stack segment %i", seg);
		return;
	}

	/*
	 * We only expect one or two stack pages.  We must not fall through
	 * here: "pages" comes straight from the Guest, and pin_stack_pages()
	 * loops that many times, so a bogus value would otherwise keep the
	 * Host pinning pages for up to 2^32 iterations.
	 */
	if (pages > 2) {
		kill_guest(cpu, "bad stack pages %u", pages);
		return;
	}

	/* Save where the stack is, and how many pages */
	cpu->ss1 = seg;
	cpu->esp1 = esp;
	cpu->lg->stack_pages = pages;

	/* Make sure the new stack pages are mapped */
	pin_stack_pages(cpu);
}
/*
* All this reference to mapping stacks leads us neatly into the other complex
* part of the Host: page table handling.
*/
/*H:235
* This is the routine which actually checks the Guest's IDT entry and
* transfers it into the entry in "struct lguest":
*/
static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
		     unsigned int num, u32 lo, u32 hi)
{
	/* "num" is not consulted here; only the descriptor words matter. */
	u8 gate_type = idt_type(lo, hi);

	if (idt_present(lo, hi)) {
		/* Only interrupt gates (0xE) and trap gates (0xF) are supported. */
		if (!(gate_type == 0xE || gate_type == 0xF))
			kill_guest(cpu, "bad IDT type %i", gate_type);

		/*
		 * Keep just the handler address, present bit, privilege level
		 * and type from what the Guest gave us, and force the code
		 * segment to the Guest kernel's.  The privilege level decides
		 * who may trigger the trap with an "int" instruction: usually
		 * GUEST_PL, except for system calls which userspace can use.
		 */
		trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
		trap->b = (hi&0xFFFFEF00);
		return;
	}

	/* A not-present entry is simply stored as all zeroes. */
	trap->a = trap->b = 0;
}
/*H:230
* While we're here, dealing with delivering traps and interrupts to the
* Guest, we might as well complete the picture: how the Guest tells us where
* it wants them to go. This would be simple, except making traps fast
* requires some tricks.
*
* We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
* LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
*/
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
{
	/*
	 * The Guest never handles NMI (2), doublefault (8), spurious
	 * interrupt (15) or the hypercall trap: quietly ignore attempts to
	 * set them.
	 */
	bool host_only = (num == 2 || num == 8 || num == 15 ||
			  num == LGUEST_TRAP_ENTRY);

	if (host_only)
		return;

	/*
	 * Flag the IDT as changed so it gets copied again before the Guest
	 * next runs.
	 */
	cpu->changed |= CHANGED_IDT;

	/* An in-range entry is validated and stored... */
	if (num < ARRAY_SIZE(cpu->arch.idt)) {
		set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
		return;
	}

	/* ...while stepping outside the table is fatal for the Guest. */
	kill_guest(cpu, "Setting idt entry %u", num);
}
/*
* The default entry for each interrupt points into the Switcher routines which
* simply return to the Host. The run_guest() loop will then call
* deliver_trap() to bounce it back into the Guest.
*/
static void default_idt_entry(struct desc_struct *idt,
			      int trap,
			      const unsigned long handler,
			      const struct desc_struct *base)
{
	/* Privilege-level bits to merge into the gate; none by default. */
	u32 dpl = 0;

	if (trap == LGUEST_TRAP_ENTRY) {
		/*
		 * The hypercall gate gets the Guest's privilege level so the
		 * Guest can trigger it with an "int" instruction.
		 */
		dpl = (GUEST_PL << 13);
	} else if (base) {
		/*
		 * Otherwise honour the privilege level the Guest asked for,
		 * which (for example) lets Guest userspace use int 3.
		 */
		dpl = (base->b & 0x6000);
	}

	/*
	 * Pack a present interrupt gate (0x8e00) in the IDT's split format:
	 * the low word carries the segment and handler bits 0-15, the high
	 * word carries handler bits 16-31 plus the flags.
	 */
	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
	idt->b = (handler&0xFFFF0000) | (0x8e00 | dpl);
}
/* When the Guest first starts, we put default entries into the IDT. */
void setup_default_idt_entries(struct lguest_ro_state *state,
			       const unsigned long *def)
{
	unsigned int vec = 0;

	/*
	 * Point every slot of the Guest IDT at its default handler; there is
	 * no Guest-supplied entry to copy a privilege level from yet.
	 */
	while (vec < ARRAY_SIZE(state->guest_idt)) {
		default_idt_entry(&state->guest_idt[vec], vec, def[vec], NULL);
		vec++;
	}
}
/*H:240
* We don't use the IDT entries in the "struct lguest" directly, instead
* we copy them into the IDT which we've set up for Guests on this CPU, just
* before we run the Guest. This routine does that copy.
*/
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
		const unsigned long *def)
{
	unsigned int vec;

	for (vec = 0; vec < ARRAY_SIZE(cpu->arch.idt); vec++) {
		const struct desc_struct *guest_gate;

		/* Entries the Guest can never override are left untouched. */
		if (direct_trap(vec)) {
			guest_gate = &cpu->arch.idt[vec];

			/*
			 * Only a trap gate (type 0xF) may go straight to the
			 * Guest.  Interrupt gates disable interrupts on entry,
			 * which the Guest is never allowed to do, and
			 * not-present entries obviously cannot be used.  Those
			 * get the default Switcher handler instead, but still
			 * inherit the Guest's privilege level so userspace can
			 * be given access to a software interrupt.
			 */
			if (idt_type(guest_gate->a, guest_gate->b) != 0xF)
				default_idt_entry(&idt[vec], vec, def[vec],
						  guest_gate);
			else
				idt[vec] = *guest_gate;
		}
	}
}
/*H:200
* The Guest Clock.
*
* There are two sources of virtual interrupts. We saw one in lguest_user.c:
* the Launcher sending interrupts for virtual devices. The other is the Guest
* timer interrupt.
*
* The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
* the next timer interrupt (in nanoseconds). We use the high-resolution timer
* infrastructure to set a callback at that time.
*
* 0 means "turn off the clock".
*/
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
{
	if (likely(delta != 0)) {
		/*
		 * The deadline is in wallclock time, so the Guest may not
		 * actually be running for the whole interval it asked for.
		 * This is almost always the right thing to do.
		 */
		ktime_t deadline = ktime_add_ns(ktime_get_real(), delta);

		hrtimer_start(&cpu->hrt, deadline, HRTIMER_MODE_ABS);
		return;
	}

	/* A delta of zero means the clock event device is shutting down. */
	hrtimer_cancel(&cpu->hrt);
}
/* This is the function called when the Guest's timer expires. */
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
{
	/*
	 * The timer is embedded in the virtual CPU; virtual interrupt 0 is
	 * the Guest's timer interrupt.
	 */
	set_interrupt(container_of(timer, struct lg_cpu, hrt), 0);

	/* One-shot: the Guest re-arms it via guest_set_clockevent(). */
	return HRTIMER_NORESTART;
}
/* This sets up the timer for this Guest. */
void init_clockdev(struct lg_cpu *cpu)
{
hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
cpu->hrt.function = clockdev_fn;
}

260
drivers/lguest/lg.h Normal file
View file

@ -0,0 +1,260 @@
#ifndef _LGUEST_H
#define _LGUEST_H
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/init.h>
#include <linux/stringify.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
#include <linux/hrtimer.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <asm/lguest.h>
/*
 * One page-directory record; struct lguest holds a fixed array of four of
 * these (pgdirs[4]).
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only -- confirm against page_tables.c.
 */
struct pgdir {
/* Presumably the Guest's own address for this page directory -- TODO confirm */
unsigned long gpgdir;
/* Presumably whether the Switcher is mapped in this directory -- TODO confirm */
bool switcher_mapped;
/* Presumably the Host CPU this directory was last used on -- TODO confirm */
int last_host_cpu;
/* Presumably the Host-side page directory backing gpgdir -- TODO confirm */
pgd_t *pgdir;
};
/* We have two pages shared with guests, per cpu. */
struct lguest_pages {
/*
 * This is the stack page mapped rw in guest.  "spare" is sized so that
 * spare + regs together fill exactly PAGE_SIZE bytes, which places "regs"
 * at the very end of this first page.
 */
char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
struct lguest_regs regs;
/* This is the host state & guest descriptor page, ro in guest */
struct lguest_ro_state state;
} __attribute__((aligned(PAGE_SIZE)));
/* Flag bits stored in lg_cpu.changed. */
#define CHANGED_IDT 1
#define CHANGED_GDT 2
#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
/* CHANGED_IDT | CHANGED_GDT; the TLS-only bit is deliberately not part of it. */
#define CHANGED_ALL 3
/* Per-virtual-CPU state; struct lguest embeds an array of these (cpus[]). */
struct lg_cpu {
unsigned int id;
/* Back-pointer to the Guest this virtual CPU belongs to. */
struct lguest *lg;
/* The task set_interrupt() wakes (or kicks) when an interrupt is raised. */
struct task_struct *tsk;
struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
u32 cr2;
int ts;
/* Guest kernel stack pointer and segment, recorded by guest_set_stack(). */
u32 esp1;
u16 ss1;
/* Bitmap of what has changed: see CHANGED_* above. */
int changed;
unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
/* At end of a page shared mapped over lguest_pages in guest. */
unsigned long regs_page;
struct lguest_regs *regs;
struct lguest_pages *last_pages;
/* Initialization mode: linear map everything. */
bool linear_pages;
int cpu_pgd; /* Which pgd this cpu is currently using */
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
u32 next_hcall;
/*
 * Virtual clock device: armed by guest_set_clockevent(); when it expires,
 * clockdev_fn() raises virtual interrupt 0.
 */
struct hrtimer hrt;
/* Did the Guest tell us to halt? */
int halted;
/* Pending virtual interrupts: one bit per irq, set by set_interrupt(). */
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
/* Architecture-specific state; arch.idt holds the Guest's IDT entries. */
struct lg_cpu_arch arch;
};
/*
 * Pairs an address with an eventfd context.
 * NOTE(review): presumably the eventfd is signalled when the Guest notifies
 * on "addr" -- confirm against lguest_user.c.
 */
struct lg_eventfd {
unsigned long addr;
struct eventfd_ctx *event;
};
/*
 * A counted table of lg_eventfd entries: "map" is a flexible array member,
 * so the structure must be allocated with room for its entries.
 * NOTE(review): "num" is presumably the number of valid entries in map[] --
 * confirm against the code which allocates it.
 */
struct lg_eventfd_map {
unsigned int num;
struct lg_eventfd map[];
};
/* The private info the thread maintains about the guest. */
struct lguest {
/*
 * Userspace pointer to the Guest's struct lguest_data; it is accessed with
 * get_user()/put_user() (eg. syscall_vec, irq_pending).
 */
struct lguest_data __user *lguest_data;
/* Virtual CPU state; nr_cpus presumably counts those in use -- TODO confirm */
struct lg_cpu cpus[NR_CPUS];
unsigned int nr_cpus;
u32 pfn_limit;
/*
 * This provides the offset to the base of guest-physical memory in the
 * Launcher.
 */
void __user *mem_base;
unsigned long kernel_address;
struct pgdir pgdirs[4];
unsigned long noirq_start, noirq_end;
/* Number of Guest kernel stack pages, as set by guest_set_stack(). */
unsigned int stack_pages;
u32 tsc_khz;
struct lg_eventfd_map *eventfds;
/*
 * Dead?  NULL while the Guest is alive.  kill_guest() sets it to a
 * kasprintf()-allocated reason string, or to ERR_PTR(-ENOMEM) if that
 * allocation failed.
 */
const char *dead;
};
/* NOTE(review): defined outside this header; what it protects is not visible here. */
extern struct mutex lguest_lock;
/* core.c: */
bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
/*
 * Raw Guest-memory copy helpers behind lgread()/lgwrite().  As used by
 * those macros the arguments are: __lgread(cpu, destination buffer, Guest
 * address, byte count) and __lgwrite(cpu, Guest address, source buffer,
 * byte count).
 */
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvenient, so we
* have the following helper macros which read and write a specific type (often
* an unsigned long).
*
* This reads into a variable of the given type then returns that.
*/
/*
 * lgread(cpu, addr, type): declares a temporary of "type", fills it with
 * sizeof(type) bytes via __lgread() and evaluates to that value.  It relies
 * on the GNU statement-expression extension.
 */
#define lgread(cpu, addr, type) \
({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
/* This checks that the variable is of the given type, then writes it out. */
/*
 * lgwrite(cpu, addr, type, val): "val" must be an lvalue of exactly "type",
 * because its address is taken and typecheck() compares the types; sizeof(val)
 * bytes are written via __lgwrite().
 */
#define lgwrite(cpu, addr, type, val) \
do { \
typecheck(type, val); \
__lgwrite((cpu), (addr), &(val), sizeof(val)); \
} while(0)
/* (end of memory access helper routines) :*/
/*
 * The run_guest() loop; among other things it calls deliver_trap() to bounce
 * traps caught by the default Switcher handlers back into the Guest.
 * NOTE(review): the meaning of "user" and of the return value is defined in
 * core.c and not visible here.
 */
int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
/*
* Helper macros to obtain the first 12 or the last 20 bits, this is only the
* first step in the migration to the kernel types. pte_pfn is already defined
* in the kernel.
*/
/* *_flags(): the bits below PAGE_SHIFT of the entry (everything ~PAGE_MASK keeps). */
#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
/* *_pfn(): the entry shifted right by PAGE_SHIFT, ie. the page frame number. */
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
/* interrupts_and_traps.c: */
unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
/* Mark "irq" pending for the Guest and wake (or kick) the task running it. */
void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
/* Returns true if trap "num" could be delivered through the Guest's IDT. */
bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
/* Validate and store one Guest IDT entry; out-of-range "i" kills the Guest. */
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
u32 low, u32 hi);
/* Record the Guest kernel stack (segment, pointer, page count) and pin it. */
void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
/* Pin every page of the Guest kernel stack recorded by guest_set_stack(). */
void pin_stack_pages(struct lg_cpu *cpu);
/* Fill state->guest_idt with default gates pointing at the handlers in "def". */
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def);
/* Copy the Guest's directly-deliverable trap gates into "idt". */
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
/* Arm the Guest timer "delta" nanoseconds from now; 0 turns it off. */
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
bool send_notify_to_eventfd(struct lg_cpu *cpu);
/* Initialise the per-CPU hrtimer used as the Guest clock. */
void init_clockdev(struct lg_cpu *cpu);
/* True if the syscall vector requested in lguest_data is one we allow. */
bool check_syscall_vector(struct lguest *lg);
/* Reserve / release a non-default syscall vector; init returns 0 or -EBUSY. */
int init_interrupts(void);
void free_interrupts(void);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
void setup_guest_gdt(struct lg_cpu *cpu);
void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
u32 low, u32 hi);
void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
/* page_tables.c: */
int init_guest_pagetable(struct lguest *lg);
void free_guest_pagetable(struct lguest *lg);
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
/* guest_set_pmd() only exists when the kernel is built with PAE. */
#ifdef CONFIG_X86_PAE
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
#endif
void guest_pagetable_clear_all(struct lg_cpu *cpu);
void guest_pagetable_flush_user(struct lg_cpu *cpu);
void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
unsigned long vaddr, pte_t val);
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
/* Takes a Guest virtual address; pin_stack_pages() calls it once per stack page. */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
void page_table_guest_data_init(struct lg_cpu *cpu);
/* <arch>/core.c: */
void lguest_arch_host_init(void);
void lguest_arch_host_fini(void);
void lguest_arch_run_guest(struct lg_cpu *cpu);
void lguest_arch_handle_trap(struct lg_cpu *cpu);
int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
/* <arch>/switcher.S: */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
/* lguest_user.c: */
int lguest_device_init(void);
void lguest_device_remove(void);
/* hypercalls.c: */
void do_hypercalls(struct lg_cpu *cpu);
/* Updates the timestamp in the Guest's lguest_data; called on interrupt delivery. */
void write_timestamp(struct lg_cpu *cpu);
/*L:035
* Let's step aside for the moment, to study one important routine that's used
* widely in the Host code.
*
* There are many cases where the Guest can do something invalid, like pass crap
* to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
* acceptable to simply terminate the Guest and give the Launcher a nicely
* formatted reason. It's also simpler for the Guest itself, which doesn't
* need to check most hypercalls for "success"; if you're still running, it
* succeeded.
*
* Once this is called, the Guest will never run again, so most Host code can
* call this then continue as if nothing had happened. This means many
* functions don't have to explicitly return an error code, which keeps the
* code simple.
*
* It also means that this can be called more than once: only the first one is
* remembered. The only trick is that we still need to kill the Guest even if
* we can't allocate memory to store the reason. Linux has a neat way of
* packing error codes into invalid pointers, so we use that here.
*
* Like any macro which uses an "if", it is safely wrapped in a run-once "do {
* } while(0)".
*/
/*
 * Usage notes: "cpu" is expanded several times, so pass an expression
 * without side effects.  The reason is formatted with
 * kasprintf(GFP_ATOMIC, ...); if that allocation fails, ERR_PTR(-ENOMEM) is
 * stored instead so that lg->dead is still non-NULL.  A reason which is
 * already set is never overwritten.
 */
#define kill_guest(cpu, fmt...) \
do { \
if (!(cpu)->lg->dead) { \
(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
if (!(cpu)->lg->dead) \
(cpu)->lg->dead = ERR_PTR(-ENOMEM); \
} \
} while(0)
/* (End of aside) :*/
#endif /* __ASSEMBLY__ */
#endif /* _LGUEST_H */

View file

@ -0,0 +1,535 @@
/*P:050
* Lguest guests use a very simple method to describe devices. It's a
* series of device descriptors contained just above the top of normal Guest
* memory.
*
* We use the standard "virtio" device infrastructure, which provides us with a
* console, a network and a block driver. Each one expects some configuration
* information and a "virtqueue" or two to send and receive data.
:*/
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/lguest_launcher.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/interrupt.h>
#include <linux/virtio_ring.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <asm/io.h>
#include <asm/paravirt.h>
#include <asm/lguest_hcall.h>
/*
 * The pointer to our page of device descriptions.  It is mapped by
 * lguest_devices_init(), walked by scan_devices(), and used by
 * status_notify() to turn a descriptor pointer back into an offset.
 */
static void *lguest_devices;
/*
* For Guests, device memory can be used as normal memory, so we cast away the
* __iomem to quieten sparse.
*/
static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
{
return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
}
static inline void lguest_unmap(void *addr)
{
iounmap((__force void __iomem *)addr);
}
/*D:100
* Each lguest device is just a virtio device plus a pointer to its entry
* in the lguest_devices page.
*/
struct lguest_device {
/* The embedded virtio device; to_lgdev() maps a pointer to it back to us. */
struct virtio_device vdev;
/* The entry in the lguest_devices page for this device. */
struct lguest_device_desc *desc;
};
/*
* Since the virtio infrastructure hands us a pointer to the virtio_device all
* the time, it helps to have a curt macro to get a pointer to the struct
* lguest_device it's enclosed in.
*/
#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
/*D:130
* Device configurations
*
* The configuration information for a device consists of one or more
* virtqueues, a feature bitmap, and some configuration bytes. The
* configuration bytes don't really matter to us: the Launcher sets them up, and
* the driver will look at them during setup.
*
* A convenient routine to return the device's virtqueue config array:
* immediately after the descriptor.
*/
static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
{
	/* The virtqueue configs start right where the descriptor ends. */
	return (void *)&desc[1];
}
/* The features come immediately after the virtqueues. */
static u8 *lg_features(const struct lguest_device_desc *desc)
{
	struct lguest_vqconfig *vqs = lg_vq(desc);

	/* Step over all num_vq virtqueue configs to reach the feature bits. */
	return (void *)&vqs[desc->num_vq];
}
/* The config space comes after the two feature bitmasks. */
static u8 *lg_config(const struct lguest_device_desc *desc)
{
	u8 *features = lg_features(desc);

	/* Two bitmaps of feature_len bytes each precede the config space. */
	return features + desc->feature_len * 2;
}
/* The total size of the config page used by this device (incl. desc) */
static unsigned desc_size(const struct lguest_device_desc *desc)
{
return sizeof(*desc)
+ desc->num_vq * sizeof(struct lguest_vqconfig)
+ desc->feature_len * 2
+ desc->config_len;
}
/* This gets the device's feature bits. */
static u32 lg_get_features(struct virtio_device *vdev)
{
	unsigned int i;
	u32 features = 0;
	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
	u8 *in_features = lg_features(desc);
	/* We can report at most 32 features, however many the device has. */
	unsigned int nbits = min(desc->feature_len * 8, 32);

	/*
	 * We do this the slow but generic way: the device publishes a plain
	 * byte array of bits, which we repack into a u32.
	 */
	for (i = 0; i < nbits; i++)
		if (in_features[i / 8] & (1 << (i % 8)))
			/*
			 * Use an unsigned constant: for bit 31, "1 << i" would
			 * shift into the sign bit of a signed int, which is
			 * undefined behaviour in C.
			 */
			features |= (1U << i);
	return features;
}
/*
* To notify on reset or feature finalization, we (ab)use the NOTIFY
* hypercall, with the descriptor address of the device.
*/
static void status_notify(struct virtio_device *vdev)
{
	void *desc = to_lgdev(vdev)->desc;
	/* How far into the device page this device's descriptor sits. */
	unsigned long offset = desc - lguest_devices;

	/*
	 * The device page is mapped from max_pfn << PAGE_SHIFT, so adding the
	 * offset gives the descriptor's address for the Host to identify.
	 */
	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
}
/*
* The virtio core takes the features the Host offers, and copies the ones
* supported by the driver into the vdev->features array. Once that's all
* sorted out, this routine is called so we can tell the Host which features we
* understand and accept.
*/
static void lg_finalize_features(struct virtio_device *vdev)
{
	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
	/* The second feature bitmap holds the features we accept. */
	u8 *accepted = lg_features(desc) + desc->feature_len;
	unsigned int bit, nbits;

	/* Let virtio_ring have its say on transport features first. */
	vring_transport_features(vdev);

	/*
	 * vdev->features is a Linux bitmask, which is not laid out like the
	 * simple byte array of bits lguest devices use, so convert one bit
	 * at a time.  Slow, but completely general.
	 */
	memset(accepted, 0, desc->feature_len);
	nbits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
	for (bit = 0; bit < nbits; bit++) {
		if (!test_bit(bit, vdev->features))
			continue;
		accepted[bit / 8] |= (1 << (bit % 8));
	}

	/* Tell Host we've finished with this device's feature negotiation */
	status_notify(vdev);
}
/* Once they've found a field, getting a copy of it is easy. */
static void lg_get(struct virtio_device *vdev, unsigned int offset,
		   void *buf, unsigned len)
{
	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;

	/*
	 * Check they didn't ask for more than the length of the config!
	 * Compare without adding offset and len together: that sum could
	 * wrap around and sneak an out-of-range request past the check.
	 */
	BUG_ON(offset > desc->config_len ||
	       len > desc->config_len - offset);

	/* Copy "len" bytes out of the device's config space into buf. */
	memcpy(buf, lg_config(desc) + offset, len);
}
/* Setting the contents is also trivial. */
static void lg_set(struct virtio_device *vdev, unsigned int offset,
		   const void *buf, unsigned len)
{
	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;

	/*
	 * Check they didn't ask for more than the length of the config!
	 * Compare without adding offset and len together: that sum could
	 * wrap around and sneak an out-of-range request past the check.
	 */
	BUG_ON(offset > desc->config_len ||
	       len > desc->config_len - offset);

	/* Copy "len" bytes from buf into the device's config space. */
	memcpy(lg_config(desc) + offset, buf, len);
}
/*
* The operations to get and set the status word just access the status field
* of the device descriptor.
*/
static u8 lg_get_status(struct virtio_device *vdev)
{
return to_lgdev(vdev)->desc->status;
}
static void lg_set_status(struct virtio_device *vdev, u8 status)
{
	struct lguest_device_desc *desc;

	/* Zero means "reset", which has its own entry point: lg_reset(). */
	BUG_ON(status == 0);

	desc = to_lgdev(vdev)->desc;
	desc->status = status;

	/* Only a failure needs to be reported to the Host straight away. */
	if (!(status & VIRTIO_CONFIG_S_FAILED))
		return;
	status_notify(vdev);
}
static void lg_reset(struct virtio_device *vdev)
{
/* 0 status means "reset" */
to_lgdev(vdev)->desc->status = 0;
status_notify(vdev);
}
/*
* Virtqueues
*
* The other piece of infrastructure virtio needs is a "virtqueue": a way of
* the Guest device registering buffers for the other side to read from or
* write into (ie. send and receive buffers). Each device can have multiple
* virtqueues: for example the console driver uses one queue for sending and
* another for receiving.
*
* Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
* already exists in virtio_ring.c. We just need to connect it up.
*
* We start with the information we need to keep about each virtqueue.
*/
/*D:140 This is the information we remember about each virtqueue. */
/* Allocated in lg_find_vq(), hung off vq->priv, and freed in lg_del_vq(). */
struct lguest_vq_info {
/* A copy of the information contained in the device config. */
struct lguest_vqconfig config;
/* The address where we mapped the virtio ring, so we can unmap it. */
void *pages;
};
/*
* When the virtio_ring code wants to prod the Host, it calls us here and we
* make a hypercall. We hand the physical address of the virtqueue so the Host
* knows which virtqueue we're talking about.
*/
static bool lg_notify(struct virtqueue *vq)
{
/*
* We store our virtqueue information in the "priv" pointer of the
* virtqueue structure.
*/
struct lguest_vq_info *lvq = vq->priv;
hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
return true;
}
/*
 * An extern declaration inside a C file is bad form. Don't do it.
 *
 * lg_find_vq() calls this to make sure the virtqueue's interrupt is
 * allocated before requesting it; a non-zero return is treated as an error.
 */
extern int lguest_setup_irq(unsigned int irq);
/*
* This routine finds the Nth virtqueue described in the configuration of
* this device and sets it up.
*
* This is kind of an ugly duckling. It'd be nicer to have a standard
* representation of a virtqueue in the configuration space, but it seems that
* everyone wants to do it differently. The KVM coders want the Guest to
* allocate its own pages and tell the Host where they are, but for lguest it's
* simpler for the Host to simply tell us where the pages are.
*/
/*
 * Returns the new virtqueue on success, NULL if no name was given for this
 * index, or an ERR_PTR() on failure (-ENOENT for an index the device does
 * not have, -ENOMEM if an allocation or the mapping fails, or the error
 * from setting up or requesting the interrupt).
 */
static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
unsigned index,
void (*callback)(struct virtqueue *vq),
const char *name)
{
struct lguest_device *ldev = to_lgdev(vdev);
struct lguest_vq_info *lvq;
struct virtqueue *vq;
int err;
/* A NULL name means the caller does not want this virtqueue. */
if (!name)
return NULL;
/* We must have this many virtqueues. */
if (index >= ldev->desc->num_vq)
return ERR_PTR(-ENOENT);
lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
if (!lvq)
return ERR_PTR(-ENOMEM);
/*
 * Make a copy of the "struct lguest_vqconfig" entry, which sits after
 * the descriptor. We need a copy because the config space might not
 * be aligned correctly.
 */
memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
printk("Mapping virtqueue %i addr %lx\n", index,
(unsigned long)lvq->config.pfn << PAGE_SHIFT);
/* Figure out how many pages the ring will take, and map that memory */
lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
DIV_ROUND_UP(vring_size(lvq->config.num,
LGUEST_VRING_ALIGN),
PAGE_SIZE));
if (!lvq->pages) {
err = -ENOMEM;
goto free_lvq;
}
/*
 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
 * and we've got a pointer to its pages. Note that we set weak_barriers
 * to 'true': the host is just a(nother) SMP CPU, so we only need
 * inter-cpu barriers.
 */
vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
true, lvq->pages, lg_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto unmap;
}
/* Make sure the interrupt is allocated. */
err = lguest_setup_irq(lvq->config.irq);
if (err)
goto destroy_vring;
/*
 * Tell the interrupt for this virtqueue to go to the virtio_ring
 * interrupt handler.
 *
 * FIXME: We used to have a flag for the Host to tell us we could use
 * the interrupt as a source of randomness: it'd be nice to have that
 * back.
 */
err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
dev_name(&vdev->dev), vq);
if (err)
goto free_desc;
/*
 * Last of all we hook up our "struct lguest_vq_info" to the
 * virtqueue's priv pointer.
 */
vq->priv = lvq;
return vq;
/*
 * Error unwinding: each label undoes one setup step, in the reverse of the
 * order in which the steps were taken, then falls through to the next.
 */
free_desc:
irq_free_desc(lvq->config.irq);
destroy_vring:
vring_del_virtqueue(vq);
unmap:
lguest_unmap(lvq->pages);
free_lvq:
kfree(lvq);
return ERR_PTR(err);
}
/*:*/
/* Cleaning up a virtqueue is easy */
static void lg_del_vq(struct virtqueue *vq)
{
	struct lguest_vq_info *info = vq->priv;

	/* Stop interrupts arriving for this queue before tearing it down. */
	free_irq(info->config.irq, vq);

	/* Hand the virtqueue itself back to virtio_ring.c. */
	vring_del_virtqueue(vq);

	/* The ring's pages are no longer needed: unmap them. */
	lguest_unmap(info->pages);

	/* And finally release our own bookkeeping. */
	kfree(info);
}
static void lg_del_vqs(struct virtio_device *vdev)
{
	struct virtqueue *cur, *next;

	/*
	 * Use the _safe iterator, which keeps a pointer to the next entry, as
	 * lg_del_vq() destroys the virtqueue we are currently standing on.
	 */
	list_for_each_entry_safe(cur, next, &vdev->vqs, list)
		lg_del_vq(cur);
}
static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
		       struct virtqueue *vqs[],
		       vq_callback_t *callbacks[],
		       const char *names[])
{
	struct lguest_device *ldev = to_lgdev(vdev);
	int i;

	/* The device must actually have as many virtqueues as requested. */
	if (nvqs > ldev->desc->num_vq)
		return -ENOENT;

	for (i = 0; i < nvqs; ++i) {
		vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
		if (IS_ERR(vqs[i])) {
			/* Undo every queue set up so far, then report why. */
			lg_del_vqs(vdev);
			return PTR_ERR(vqs[i]);
		}
	}

	return 0;
}
/* The bus_name operation: lguest devices report an empty bus name. */
static const char *lg_bus_name(struct virtio_device *vdev)
{
return "";
}
/* The ops structure which hooks everything together. */
/* add_lguest_device() installs this table as vdev.config on every device. */
static const struct virtio_config_ops lguest_config_ops = {
.get_features = lg_get_features,
.finalize_features = lg_finalize_features,
.get = lg_get,
.set = lg_set,
.get_status = lg_get_status,
.set_status = lg_set_status,
.reset = lg_reset,
.find_vqs = lg_find_vqs,
.del_vqs = lg_del_vqs,
.bus_name = lg_bus_name,
};
/*
 * The root device for the lguest virtio devices. This makes them appear as
 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
 *
 * It is registered by lguest_devices_init() and becomes the parent of every
 * device created by add_lguest_device().
 */
static struct device *lguest_root;
/*D:120
* This is the core of the lguest bus: actually adding a new device.
* It's a separate function because it's neater that way, and because an
* earlier version of the code supported hotplug and unplug. They were removed
* early on because they were never used.
*
* As Andrew Tridgell says, "Untested code is buggy code".
*
* It's worth reading this carefully: we start with a pointer to the new device
* descriptor in the "lguest_devices" page, and the offset into the device
* descriptor page so we can uniquely identify it if things go badly wrong.
*/
static void add_lguest_device(struct lguest_device_desc *d,
unsigned int offset)
{
struct lguest_device *ldev;
/* Start with zeroed memory; Linux's device layer counts on it. */
ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
if (!ldev) {
printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
offset, d->type);
return;
}
/* This devices' parent is the lguest/ dir. */
ldev->vdev.dev.parent = lguest_root;
/*
* The device type comes straight from the descriptor. There's also a
* device vendor field in the virtio_device struct, which we leave as
* 0.
*/
ldev->vdev.id.device = d->type;
/*
* We have a simple set of routines for querying the device's
* configuration information and setting its status.
*/
ldev->vdev.config = &lguest_config_ops;
/* And we remember the device's descriptor for lguest_config_ops. */
ldev->desc = d;
/*
* register_virtio_device() sets up the generic fields for the struct
* virtio_device and calls device_register(). This makes the bus
* infrastructure look for a matching driver.
*/
if (register_virtio_device(&ldev->vdev) != 0) {
printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
offset, d->type);
kfree(ldev);
}
}
/*D:110
* scan_devices() simply iterates through the device page. The type 0 is
* reserved to mean "end of devices".
*/
static void scan_devices(void)
{
	struct lguest_device_desc *desc;
	unsigned int off = 0;

	/* Walk the device page one variable-sized descriptor at a time. */
	while (off < PAGE_SIZE) {
		desc = lguest_devices + off;

		/* Type 0 marks the end of the device list. */
		if (desc->type == 0)
			break;

		printk("Device at %i has size %u\n", off, desc_size(desc));
		add_lguest_device(desc, off);

		/* Step over this descriptor to reach the next one. */
		off += desc_size(desc);
	}
}
/*D:105
* Fairly early in boot, lguest_devices_init() is called to set up the
* lguest device infrastructure. We check that we are a Guest by checking
* pv_info.name: there are other ways of checking, but this seems most
* obvious to me.
*
* So we can access the "struct lguest_device_desc"s easily, we map that memory
* and store the pointer in the global "lguest_devices". Then we register a
* root device from which all our devices will hang (this seems to be the
* correct sysfs incantation).
*
* Finally we call scan_devices() which adds all the devices found in the
* lguest_devices page.
*/
static int __init lguest_devices_init(void)
{
	/* Only do anything when we are actually running as an lguest Guest. */
	if (strcmp(pv_info.name, "lguest") == 0) {
		/* All lguest devices hang off this root device in sysfs. */
		lguest_root = root_device_register("lguest");
		if (IS_ERR(lguest_root))
			panic("Could not register lguest root");

		/* The descriptors occupy one page just above "normal" mem. */
		lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);

		/* Register every device described in that page. */
		scan_devices();
	}

	return 0;
}
/* Run once the core is up, but ahead of the device drivers. */
postcore_initcall(lguest_devices_init);
/*D:150
* At this point in the journey we used to now wade through the lguest
* devices themselves: net, block and console. Since they're all now virtio
* devices rather than lguest-specific, I've decided to ignore them. Mostly,
* they're kind of boring. But this does mean you'll never experience the
* thrill of reading the forbidden love scene buried deep in the block driver.
*
* "make Launcher" beckons, where we answer questions like "Where do Guests
* come from?", and "What do you do when someone asks for optimization?".
*/

View file

@ -0,0 +1,542 @@
/*P:200 This contains all the /dev/lguest code, whereby the userspace
* launcher controls and communicates with the Guest. For example,
* the first write will tell us the Guest's memory layout and entry
* point. A read will run the Guest until something happens, such as
* a signal or the Guest doing a NOTIFY out to the Launcher. There is
* also a way for the Launcher to attach eventfds to particular NOTIFY
* values instead of returning from the read() call.
:*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/export.h>
#include "lg.h"
/*L:056
* Before we move on, let's jump ahead and look at what the kernel does when
* it needs to look up the eventfds. That will complete our picture of how we
* use RCU.
*
* The notification value is in cpu->pending_notify: we return true if it went
* to an eventfd.
*/
bool send_notify_to_eventfd(struct lg_cpu *cpu)
{
	struct lg_eventfd_map *evmap;
	unsigned int idx;

	/*
	 * Mark ourselves as a reader of the RCU-managed eventfds array so a
	 * concurrent updater knows not to free it under us.
	 */
	rcu_read_lock();

	/*
	 * rcu_dereference() pairs with rcu_assign_pointer() in the updater:
	 * it guarantees we see the array contents that were written before
	 * the pointer was published.
	 */
	evmap = rcu_dereference(cpu->lg->eventfds);

	/*
	 * Linear scan. If an eventfd is added meanwhile we simply keep using
	 * the array we already have and miss the newcomer.
	 */
	for (idx = 0; idx < evmap->num; idx++) {
		if (evmap->map[idx].addr != cpu->pending_notify)
			continue;

		/* Found it: kick the eventfd and consume the notification. */
		eventfd_signal(evmap->map[idx].event, 1);
		cpu->pending_notify = 0;
		break;
	}

	/* Finished with the RCU-protected array. */
	rcu_read_unlock();

	/* A cleared pending_notify means an eventfd took the notification. */
	return !cpu->pending_notify;
}
/*L:055
* One of the more tricksy tricks in the Linux Kernel is a technique called
* Read Copy Update. Since one point of lguest is to teach lguest journeyers
* about kernel coding, I use it here. (In case you're curious, other purposes
* include learning about virtualization and instilling a deep appreciation for
* simplicity and puppies).
*
* We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
* add new eventfds without ever blocking readers from accessing the array.
* The current Launcher only does this during boot, so that never happens. But
* Read Copy Update is cool, and adding a lock risks damaging even more puppies
* than this code does.
*
* We allocate a brand new one-larger array, copy the old one and add our new
* element. Then we make the lg eventfd pointer point to the new array.
* That's the easy part: now we need to free the old one, but we need to make
* sure no slow CPU somewhere is still looking at it. That's what
* synchronize_rcu does for us: it waits until every CPU has indicated that it
* has moved on, so we know no one is still using the old one.
*
* If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
*/
static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
{
struct lg_eventfd_map *new, *old = lg->eventfds;
/*
* We don't allow notifications on value 0 anyway (pending_notify of
* 0 means "nothing pending").
*/
if (!addr)
return -EINVAL;
/*
* Replace the old array with the new one, carefully: others can
* be accessing it at the same time.
*/
new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
GFP_KERNEL);
if (!new)
return -ENOMEM;
/* First make identical copy. */
memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
new->num = old->num;
/* Now append new entry. */
new->map[new->num].addr = addr;
new->map[new->num].event = eventfd_ctx_fdget(fd);
if (IS_ERR(new->map[new->num].event)) {
int err = PTR_ERR(new->map[new->num].event);
kfree(new);
return err;
}
new->num++;
/*
* Now put new one in place: rcu_assign_pointer() is a fancy way of
* doing "lg->eventfds = new", but it uses memory barriers to make
* absolutely sure that the contents of "new" written above is nailed
* down before we actually do the assignment.
*
* We have to think about these kinds of things when we're operating on
* live data without locks.
*/
rcu_assign_pointer(lg->eventfds, new);
/*
* We're not in a big hurry. Wait until no one's looking at old
* version, then free it.
*/
synchronize_rcu();
kfree(old);
return 0;
}
/*L:052
* Receiving notifications from the Guest is usually done by attaching a
* particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will
* become readable when the Guest does an LHCALL_NOTIFY with that value.
*
* This is really convenient for processing each virtqueue in a separate
* thread.
*/
static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
{
	unsigned long addr, fd;
	int ret;

	/* The request carries two words: the notify address, then the fd. */
	if (get_user(addr, input) != 0)
		return -EFAULT;
	if (get_user(fd, input + 1) != 0)
		return -EFAULT;

	/*
	 * Serialize against other callers adding eventfds. A per-Guest lock
	 * would do, but this is setup code, so the Big Lguest Lock is fine.
	 */
	mutex_lock(&lguest_lock);
	ret = add_eventfd(lg, addr, fd);
	mutex_unlock(&lguest_lock);

	return ret;
}
/*L:050
* Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest.
*/
static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
{
	unsigned long irq_nr;

	if (get_user(irq_nr, input) != 0)
		return -EFAULT;

	if (irq_nr < LGUEST_IRQS) {
		/*
		 * Just flag it: the core code checks whether it can deliver
		 * the interrupt the next time the Guest runs.
		 */
		set_interrupt(cpu, irq_nr);
		return 0;
	}

	/* Interrupt number is out of range. */
	return -EINVAL;
}
/*L:040
* Once our Guest is initialized, the Launcher makes it run by reading
* from /dev/lguest.
*/
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
{
	struct lguest *lg = file->private_data;
	unsigned int cpu_id = *o;
	struct lg_cpu *cpu;
	size_t len;

	/*
	 * Nothing to run until LHREQ_INITIALIZE has been written, and the
	 * file offset must name a vcpu that actually exists.
	 */
	if (!lg || cpu_id >= lg->nr_cpus)
		return -EINVAL;

	/* Only the task which owns this vcpu may run it. */
	cpu = &lg->cpus[cpu_id];
	if (cpu->tsk != current)
		return -EPERM;

	if (!lg->dead) {
		/*
		 * A previous read() may have returned because of Guest I/O:
		 * forget that notification before running again.
		 */
		if (cpu->pending_notify)
			cpu->pending_notify = 0;

		/* Run the Guest until something interesting happens. */
		return run_guest(cpu, (unsigned long __user *)user);
	}

	/* The Guest is dead: lg->dead is either an error code... */
	if (IS_ERR(lg->dead))
		return PTR_ERR(lg->dead);

	/* ...or a string, of which we copy as much as the buffer allows. */
	len = min(size, strlen(lg->dead)+1);
	if (copy_to_user(user, lg->dead, len) != 0)
		return -EFAULT;
	return len;
}
/*L:025
* This actually initializes a CPU. For the moment, a Guest is only
* uniprocessor, so "id" is always 0.
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
/* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
init_clockdev(cpu);
/*
* We need a complete page for the Guest registers: they are accessible
* to the Guest and we can only grant it access to whole pages.
*/
cpu->regs_page = get_zeroed_page(GFP_KERNEL);
if (!cpu->regs_page)
return -ENOMEM;
/* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
* Now we initialize the Guest's registers, handing it the start
* address.
*/
lguest_arch_setup_regs(cpu, start_ip);
/*
* We keep a pointer to the Launcher task (ie. current task) for when
* other Guests want to wake this one (eg. console input).
*/
cpu->tsk = current;
/*
* We need to keep a pointer to the Launcher's memory map, because if
* the Launcher dies we need to clean it up. If we don't keep a
* reference, it is destroyed before close() is called.
*/
cpu->mm = get_task_mm(cpu->tsk);
/*
* We remember which CPU's pages this Guest used last, for optimization
* when the same Guest runs on the same CPU twice.
*/
cpu->last_pages = NULL;
/* No error == success. */
return 0;
}
/*L:020
* The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
* addition to the LHREQ_INITIALIZE value). These are:
*
* base: The start of the Guest-physical memory inside the Launcher memory.
*
* pfnlimit: The highest (Guest-physical) page number the Guest should be
* allowed to access. The Guest memory lives inside the Launcher, so it sets
* this to ensure the Guest can only reach its own memory.
*
* start: The first instruction to execute ("eip" in x86-speak).
*/
static int initialize(struct file *file, const unsigned long __user *input)
{
	/* "struct lguest" contains all we (the Host) know about a Guest. */
	struct lguest *lg;
	int err;
	/* args[0] is base, args[1] is pfnlimit, args[2] is start. */
	unsigned long args[3];

	/*
	 * We grab the Big Lguest lock, which protects against multiple
	 * simultaneous initializations.
	 */
	mutex_lock(&lguest_lock);

	/* You can't initialize twice!  Close the device and start again... */
	if (file->private_data) {
		err = -EBUSY;
		goto unlock;
	}

	if (copy_from_user(args, input, sizeof(args)) != 0) {
		err = -EFAULT;
		goto unlock;
	}

	lg = kzalloc(sizeof(*lg), GFP_KERNEL);
	if (!lg) {
		err = -ENOMEM;
		goto unlock;
	}

	/* Start with an empty eventfd map; add_eventfd() grows it later. */
	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
	if (!lg->eventfds) {
		err = -ENOMEM;
		goto free_lg;
	}
	lg->eventfds->num = 0;

	/* Populate the easy fields of our "struct lguest" */
	lg->mem_base = (void __user *)args[0];
	lg->pfn_limit = args[1];

	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
	if (err)
		goto free_eventfds;

	/*
	 * Initialize the Guest's shadow page tables.  This allocates
	 * memory, so can fail.
	 */
	err = init_guest_pagetable(lg);
	if (err)
		goto free_regs;

	/* We keep our "struct lguest" in the file's private_data. */
	file->private_data = lg;

	mutex_unlock(&lguest_lock);

	/* And because this is a write() call, we return the length used. */
	return sizeof(args);

free_regs:
	/* FIXME: This should be in free_vcpu */
	free_page(lg->cpus[0].regs_page);
	/*
	 * lg_cpu_start() succeeded, so it took a reference on the Launcher's
	 * mm with get_task_mm().  close() will never run for this Guest
	 * (private_data was not set), so drop that reference here or the mm
	 * is leaked.
	 */
	if (lg->cpus[0].mm)
		mmput(lg->cpus[0].mm);
free_eventfds:
	kfree(lg->eventfds);
free_lg:
	kfree(lg);
unlock:
	mutex_unlock(&lguest_lock);
	return err;
}
/*L:010
* The first operation the Launcher does must be a write. All writes
* start with an unsigned long number: for the first write this must be
* LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
* writes of other values to send interrupts or set up receipt of notifications.
*
* Note that we overload the "offset" in the /dev/lguest file to indicate what
* CPU number we're dealing with. Currently this is always 0 since we only
* support uniprocessor Guests, but you can see the beginnings of SMP support
* here.
*/
static ssize_t write(struct file *file, const char __user *in,
		     size_t size, loff_t *off)
{
	/* After initialization the "struct lguest" lives in private_data. */
	struct lguest *lg = file->private_data;
	const unsigned long __user *input = (const unsigned long __user *)in;
	unsigned int cpu_id = *off;
	unsigned long req;

	/* Every request starts with a word saying what kind it is. */
	if (get_user(req, input) != 0)
		return -EFAULT;
	input++;

	/* Initialization is the one request which needs no existing Guest. */
	if (req == LHREQ_INITIALIZE)
		return initialize(file, input);

	/* Anything else needs an initialized Guest and a valid vcpu index. */
	if (!lg || (cpu_id >= lg->nr_cpus))
		return -EINVAL;

	/* Once the Guest is dead, you can only read() why it died. */
	if (lg->dead)
		return -ENOENT;

	switch (req) {
	case LHREQ_IRQ:
		return user_send_irq(&lg->cpus[cpu_id], input);
	case LHREQ_EVENTFD:
		return attach_eventfd(lg, input);
	default:
		return -EINVAL;
	}
}
/*L:060
* The final piece of interface code is the close() routine. It reverses
* everything done in initialize(). This is usually called because the
* Launcher exited.
*
* Note that the close routine returns 0 or a negative error number: it can't
* really fail, but it can whine. I blame Sun for this wart, and K&R C for
* letting them do it.
:*/
static int close(struct inode *inode, struct file *file)
{
struct lguest *lg = file->private_data;
unsigned int i;
/* If we never successfully initialized, there's nothing to clean up */
if (!lg)
return 0;
/*
* We need the big lock, to protect from inter-guest I/O and other
* Launchers initializing guests.
*/
mutex_lock(&lguest_lock);
/* Free up the shadow page tables for the Guest. */
free_guest_pagetable(lg);
for (i = 0; i < lg->nr_cpus; i++) {
/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
hrtimer_cancel(&lg->cpus[i].hrt);
/* We can free up the register page we allocated. */
free_page(lg->cpus[i].regs_page);
/*
* Now all the memory cleanups are done, it's safe to release
* the Launcher's memory management structure.
*/
mmput(lg->cpus[i].mm);
}
/* Release any eventfds they registered. */
for (i = 0; i < lg->eventfds->num; i++)
eventfd_ctx_put(lg->eventfds->map[i].event);
kfree(lg->eventfds);
/*
* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree().
*/
if (!IS_ERR(lg->dead))
kfree(lg->dead);
/* Free the memory allocated to the lguest_struct */
kfree(lg);
/* Release lock and exit. */
mutex_unlock(&lguest_lock);
return 0;
}
/*L:000
* Welcome to our journey through the Launcher!
*
* The Launcher is the Host userspace program which sets up, runs and services
* the Guest. In fact, many comments in the Drivers which refer to "the Host"
* doing things are inaccurate: the Launcher does all the device handling for
* the Guest, but the Guest can't know that.
*
* Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
* shall see more of that later.
*
* We begin our understanding with the Host kernel interface which the Launcher
* uses: reading and writing a character device called /dev/lguest. All the
* work happens in the read(), write() and close() routines:
*/
static const struct file_operations lguest_fops = {
	.owner	 = THIS_MODULE,
	/* The file offset is overloaded as the vcpu number read/write use. */
	.llseek	 = default_llseek,
	/* read() runs the Guest (or reports why it died). */
	.read	 = read,
	/* write() carries the LHREQ_* requests from the Launcher. */
	.write	 = write,
	/* The last close tears the Guest down. */
	.release = close,
};
/*:*/
/*
* This is a textbook example of a "misc" character device. Populate a "struct
* miscdevice" and register it with misc_register().
*/
static struct miscdevice lguest_dev = {
	/* File operations implementing the Launcher interface. */
	.fops	= &lguest_fops,
	/* Registered under the name "lguest". */
	.name	= "lguest",
	/* No fixed minor number: take whichever one is free. */
	.minor	= MISC_DYNAMIC_MINOR,
};
int __init lguest_device_init(void)
{
	int err;

	/* Register the misc device; pass on misc_register()'s verdict. */
	err = misc_register(&lguest_dev);
	return err;
}
void __exit lguest_device_remove(void)
{
	/* Undo lguest_device_init(): take the misc device away again. */
	misc_deregister(&lguest_dev);
	return;
}

1196
drivers/lguest/page_tables.c Normal file

File diff suppressed because it is too large Load diff

228
drivers/lguest/segments.c Normal file
View file

@ -0,0 +1,228 @@
/*P:600
* The x86 architecture has segments, which involve a table of descriptors
* which can be used to do funky things with virtual address interpretation.
* We originally used to use segments so the Guest couldn't alter the
* Guest<->Host Switcher, and then we had to trim Guest segments, and restore
* for userspace per-thread segments, but trim again on userspace->kernel
* transitions... This nightmarish creation was contained within this file,
* where we knew not to tread without heavy armament and a change of underwear.
*
* In these modern times, the segment handling code consists of simple sanity
* checks, and the worst you'll experience reading this code is butterfly-rash
* from frolicking through its parklike serenity.
:*/
#include "lg.h"
/*H:600
* Segments & The Global Descriptor Table
*
* (That title sounds like a bad Nerdcore group. Not to suggest that there are
* any good Nerdcore groups, but in high school a friend of mine had a band
* called Joe Fish and the Chips, so there are definitely worse band names).
*
* To refresh: the GDT is a table of 8-byte values describing segments. Once
* set up, these segments can be loaded into one of the 6 "segment registers".
*
* GDT entries are passed around as "struct desc_struct"s, which like IDT
* entries are split into two 32-bit members, "a" and "b". One day, someone
* will clean that up, and be declared a Hero. (No pressure, I'm just saying).
*
* Anyway, the GDT entry contains a base (the start address of the segment), a
* limit (the size of the segment - 1), and some flags. Sounds simple, and it
* would be, except those zany Intel engineers decided that it was too boring
* to put the base at one end, the limit at the other, and the flags in
* between. They decided to shotgun the bits at random throughout the 8 bytes,
* like so:
*
* 0 16 40 48 52 56 63
* [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
* mit ags part 2
* part 2
*
* As a result, this file contains a certain amount of magic numeracy. Let's
* begin.
*/
/*
* There are several entries we don't let the Guest set. The TSS entry is the
* "Task State Segment" which controls all kinds of delicate things. The
* LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
* Guest can't be trusted to deal with double faults.
*/
static bool ignored_gdt(unsigned int num)
{
	/* The TSS entry is ours to manage. */
	if (num == GDT_ENTRY_TSS)
		return true;
	/* So are the two entries set aside for the Switcher. */
	if (num == GDT_ENTRY_LGUEST_CS)
		return true;
	if (num == GDT_ENTRY_LGUEST_DS)
		return true;
	/* And the double-fault TSS is the last entry the Guest can't set. */
	if (num == GDT_ENTRY_DOUBLEFAULT_TSS)
		return true;
	return false;
}
/*H:630
* Once the Guest gave us new GDT entries, we fix them up a little. We
* don't care if they're invalid: the worst that can happen is a General
* Protection Fault in the Switcher when it restores a Guest segment register
* which tries to use that entry. Then we kill the Guest for causing such a
* mess: the message will be "unhandled trap 256".
*/
static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
{
	unsigned int idx = start;

	while (idx < end) {
		/*
		 * Entries the Guest may not set are never copied to the real
		 * GDT, so only the others need fixing.
		 */
		if (!ignored_gdt(idx)) {
			/*
			 * The Guest runs at privilege level 1 yet sometimes
			 * leaves the descriptor privilege level at 0: bump it.
			 */
			if (cpu->arch.gdt[idx].dpl == 0)
				cpu->arch.gdt[idx].dpl |= GUEST_PL;

			/*
			 * Pre-set the "accessed" bit. Otherwise the CPU would
			 * try to set it when the entry is first loaded, and
			 * the Guest cannot write to the GDT.
			 */
			cpu->arch.gdt[idx].type |= 0x1;
		}
		idx++;
	}
}
/*H:610
* Like the IDT, we never simply use the GDT the Guest gives us. We keep
* a GDT for each CPU, and copy across the Guest's entries each time we want to
* run the Guest on that CPU.
*
* This routine is called at boot or modprobe time for each CPU to set up the
* constant GDT entries: the ones which are the same no matter what Guest we're
* running.
*/
void setup_default_gdt_entries(struct lguest_ro_state *state)
{
struct desc_struct *gdt = state->guest_gdt;
unsigned long tss = (unsigned long)&state->guest_tss;
/* The Switcher segments are full 0-4G segments, privilege level 0 */
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
/*
* The TSS segment refers to the TSS entry for this particular CPU.
*/
gdt[GDT_ENTRY_TSS].a = 0;
gdt[GDT_ENTRY_TSS].b = 0;
gdt[GDT_ENTRY_TSS].limit0 = 0x67;
gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF;
gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF;
gdt[GDT_ENTRY_TSS].base2 = tss >> 24;
gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */
gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */
gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */
gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */
}
/*
* This routine sets up the initial Guest GDT for booting. All entries start
* as 0 (unusable).
*/
void setup_guest_gdt(struct lg_cpu *cpu)
{
	/*
	 * Kernel code segment: a full 0-4G executable segment, with the
	 * privilege level adjusted so the Guest is allowed to use it.
	 */
	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;

	/* Kernel data segment: the same treatment. */
	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
}
/*H:650
* An optimization of copy_gdt(), for just the three "thread-local storage"
* entries.
*/
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
	unsigned int slot = GDT_ENTRY_TLS_MIN;

	/* Copy the TLS slots, first to last inclusive, and nothing else. */
	while (slot <= GDT_ENTRY_TLS_MAX) {
		gdt[slot] = cpu->arch.gdt[slot];
		slot++;
	}
}
/*H:640
* When the Guest is run on a different CPU, or the GDT entries have changed,
* copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
* GDT.
*/
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
	unsigned int idx;

	for (idx = 0; idx < GDT_ENTRIES; idx++) {
		/*
		 * Entries filled in by setup_default_gdt_entries() must be
		 * left alone: ignored_gdt() knows which ones they are.
		 */
		if (ignored_gdt(idx))
			continue;

		gdt[idx] = cpu->arch.gdt[idx];
	}
}
/*H:620
* This is where the Guest asks us to load a new GDT entry
* (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in.
*/
void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
{
	/*
	 * The Guest GDT is a fixed-size array, so the Guest gets no more
	 * entries than it holds; anything beyond that kills the Guest.
	 */
	if (num < ARRAY_SIZE(cpu->arch.gdt)) {
		/* Store the raw halves, then tidy the entry up. */
		cpu->arch.gdt[num].a = lo;
		cpu->arch.gdt[num].b = hi;
		fixup_gdt_table(cpu, num, num+1);

		/*
		 * Flag the GDT as changed so it is copied across again even
		 * when the Guest next runs on the same CPU.
		 */
		cpu->changed |= CHANGED_GDT;
		return;
	}

	kill_guest(cpu, "too many gdt entries %i", num);
}
/*
* This is the fast-track version for just changing the three TLS entries.
* Remember that this happens on every context switch, so it's worth
* optimizing. But wouldn't it be neater to have a single hypercall to cover
* both cases?
*/
void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
{
	/* Read the Guest's TLS descriptors straight into our TLS slots. */
	__lgread(cpu, &cpu->arch.gdt[GDT_ENTRY_TLS_MIN], gtls,
		 sizeof(cpu->arch.gdt[0]) * GDT_ENTRY_TLS_ENTRIES);

	/* Give the freshly read entries the usual fix-ups. */
	fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);

	/* Only the TLS entries changed, so only they need copying later. */
	cpu->changed |= CHANGED_GDT_TLS;
}
/*H:660
* With this, we have finished the Host.
*
* Five of the seven parts of our task are complete. You have made it through
* the Bit of Despair (I think that's somewhere in the page table code,
* myself).
*
* Next, we examine "make Switcher". It's short, but intense.
*/

720
drivers/lguest/x86/core.c Normal file
View file

@ -0,0 +1,720 @@
/*
* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
* Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*P:450
* This file contains the x86-specific lguest code. It used to be all
* mixed in with drivers/lguest/core.c but several foolhardy code slashers
* wrestled most of the dependencies out to here in preparation for porting
* lguest to other architectures (see what I mean by foolhardy?).
*
* This also contains a couple of non-obvious setup and teardown pieces which
* were implemented after days of debugging pain.
:*/
#include <linux/kernel.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
#include <linux/console.h>
#include <linux/screen_info.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <asm/paravirt.h>
#include <asm/param.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/lguest.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include "../lg.h"
/*
 * NOTE(review): cpu_had_pge is not referenced in the part of the file visible
 * here; presumably it records whether the host CPU had PGE enabled -- confirm
 * against the setup/teardown code further down.
 */
static int cpu_had_pge;
/*
 * Target of the indirect far call ("lcall *") in run_guest_once(): an offset
 * plus a segment selector, passed to the asm as a memory operand.
 */
static struct {
unsigned long offset;
unsigned short segment;
} lguest_entry;
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
	/* Address the Switcher text was compiled at... */
	unsigned long compiled_at = (unsigned long)start_switcher_text;

	/* ...subtracted from the address we copied it to. */
	return switcher_addr - compiled_at;
}
/* This cpu's struct lguest_pages (after the Switcher text page) */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
}
/*
 * Per host CPU: the lg_cpu which copy_in_guest_info() last prepared on this
 * CPU, used there to detect when a full state copy can be skipped.
 */
static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
/*S:010
* We approach the Switcher.
*
* Remember that each CPU has two pages which are visible to the Guest when it
* runs on that CPU. This has to contain the state for that Guest: we copy the
* state in just before we run the Guest.
*
* Each Guest has "changed" flags which indicate what has changed in the Guest
* since it last ran. We saw this set in interrupts_and_traps.c and
* segments.c.
*/
static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
{
	/*
	 * Copying everything is expensive, and normally we are re-running the
	 * Guest we ran last time, on the same pages. Only when that is not
	 * so do we treat every piece of Guest state as changed.
	 */
	if (!(__this_cpu_read(lg_last_cpu) == cpu &&
	      cpu->last_pages == pages)) {
		cpu->changed = CHANGED_ALL;
		cpu->last_pages = pages;
		__this_cpu_write(lg_last_cpu, cpu);
	}

	/* The next few copies are cheap, so they are done unconditionally. */

	/* Save the current Host top-level page directory. */
	pages->state.host_cr3 = __pa(current->mm->pgd);

	/*
	 * Make the Guest's page tables map this CPU's pages (and no other
	 * CPU's pages).
	 */
	map_switcher_in_guest(cpu, pages);

	/*
	 * Fill in the two "TSS" members which tell the CPU what stack to use
	 * for traps which go directly into the Guest (ie. traps at privilege
	 * level 1).
	 */
	pages->state.guest_tss.ss1 = cpu->ss1;
	pages->state.guest_tss.sp1 = cpu->esp1;

	/* Direct-to-Guest trap entries, if they changed. */
	if (cpu->changed & CHANGED_IDT)
		copy_traps(cpu, pages->state.guest_idt, default_idt_entries);

	/*
	 * Either every GDT entry the Guest can change, or, when only the TLS
	 * entries changed, just those.
	 */
	if (cpu->changed & CHANGED_GDT)
		copy_gdt(cpu, pages->state.guest_gdt);
	else if (cpu->changed & CHANGED_GDT_TLS)
		copy_gdt_tls(cpu, pages->state.guest_gdt);

	/* Everything is up to date now: start next time with a clean slate. */
	cpu->changed = 0;
}
/* Finally: the code to actually call into the Switcher to run the Guest. */
static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
{
/* This is a dummy value we need for GCC's sake. */
unsigned int clobber;
/*
* Copy the guest-specific information into this CPU's "struct
* lguest_pages".
*/
copy_in_guest_info(cpu, pages);
/*
* Set the trap number to 256 (impossible value). If we fault while
* switching to the Guest (bad segment registers or bug), this will
* cause us to abort the Guest.
*/
cpu->regs->trapnum = 256;
/*
* Now: we push the "eflags" register on the stack, then do an "lcall".
* This is how we change from using the kernel code segment to using
* the dedicated lguest code segment, as well as jumping into the
* Switcher.
*
* The lcall also pushes the old code segment (KERNEL_CS) onto the
* stack, then the address of this call. This stack layout happens to
* exactly match the stack layout created by an interrupt...
*
* Operand numbering: %0 and %1 are the two outputs, %2 and %3 the two
* register inputs tied to them, and %4 is the "m"(lguest_entry) memory
* operand, so "lcall *%4" is an indirect far call through the
* offset/segment pair stored in lguest_entry.
*/
asm volatile("pushf; lcall *%4"
/*
* This is how we tell GCC that %eax ("a") and %ebx ("b")
* are changed by this routine. The "=" means output.
*/
: "=a"(clobber), "=b"(clobber)
/*
* %eax contains the pages pointer. ("0" refers to the
* 0-th argument above, ie "a"). %ebx contains the
* physical address of the Guest's top-level page
* directory.
*/
: "0"(pages),
"1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
"m"(lguest_entry)
/*
* We tell gcc that all these registers could change,
* which means we don't have to save and restore them in
* the Switcher.
*/
: "memory", "%edx", "%ecx", "%edi", "%esi");
}
/*:*/
/*M:002
* There are hooks in the scheduler which we can register to tell when we
* get kicked off the CPU (preempt_notifier_register()). This would allow us
* to lazily disable SYSENTER which would regain some performance, and should
* also simplify copy_in_guest_info(). Note that we'd still need to restore
* things when we exit to Launcher userspace, but that's fairly easy.
*
* We could also try using these hooks for PGE, but that might be too expensive.
*
* The hooks were designed for KVM, but we can also put them to good use.
:*/
/*H:040
* This is the i386-specific code to setup and run the Guest. Interrupts
* are disabled: we own the CPU.
*/
void lguest_arch_run_guest(struct lg_cpu *cpu)
{
	/*
	 * Sample the SYSENTER feature bit once: both the disable before the
	 * Guest runs and the restore afterwards key off the same answer.
	 */
	const int have_sep = boot_cpu_has(X86_FEATURE_SEP);

	/*
	 * The Guest asked for the TS bit to be set and the FPU is currently
	 * in use: set TS so that the Guest's next FPU instruction traps and
	 * we can hand that trap on to it.
	 */
	if (cpu->ts && user_has_fpu())
		stts();

	/*
	 * SYSENTER always jumps to privilege level 0, so nothing inside the
	 * Guest may be allowed to use it.  We never advertise it via CPUID,
	 * but a hostile Guest (or Guest userspace) could try anyway, so zero
	 * the SYSENTER code segment for the duration.
	 */
	if (have_sep)
		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

	/*
	 * Run the Guest on this CPU's "struct lguest_pages".  We return when
	 * something interesting happened; cpu->regs then also carries the
	 * trap number and error code filled in by the Switcher.
	 */
	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));

	/* Back in the Host: give SYSENTER its kernel code segment again. */
	if (have_sep)
		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);

	/* Undo the stts() done before entering the Guest. */
	if (cpu->ts && user_has_fpu())
		clts();

	switch (cpu->regs->trapnum) {
	case 14:
		/*
		 * Guest page fault: cr2 holds the faulting address.  Read it
		 * right now; once interrupts are back on, another fault could
		 * overwrite it or we could migrate to a different CPU.
		 */
		cpu->arch.last_pagefault = read_cr2();
		break;
	case 7:
		/*
		 * The Guest touched the FPU: restore the state it expects.
		 * math_state_restore() may sleep and we may change CPU, which
		 * is why everything critical was done above.
		 */
		if (!user_has_fpu())
			math_state_restore();
		break;
	default:
		break;
	}
}
/*H:130
* Now we've examined the hypercall code; our Guest can make requests.
* Our Guest is usually so well behaved; it never tries to do things it isn't
* allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
* infrastructure isn't quite complete, because it doesn't contain replacements
* for the Intel I/O instructions. As a result, the Guest sometimes fumbles
* across one during the boot process as it probes for various things which are
* usually attached to a PC.
*
* When the Guest uses one of these instructions, we get a trap (General
* Protection Fault) and come here. We see if it's one of those troublesome
* instructions and skip over it. We return true if we did.
*/
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 opcode;
	unsigned int len = 0, is_in = 0, op16 = 0;
	/*
	 * eip is a Guest *virtual* address; resolve it through the Guest's
	 * page tables to the "physical" address we can read from.
	 */
	unsigned long pa = guest_pa(cpu, cpu->regs->eip);

	/*
	 * Only the Guest kernel gets this treatment.  The low two bits of CS
	 * hold the privilege level, which must be the Guest kernel's.
	 */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Fetch the first instruction byte and start decoding. */
	opcode = lgread(cpu, pa, u8);

	/*
	 * 0xfa is "cli".  Since about 2.6.33 the early-boot cmpxchg8b
	 * emulation, which is not paravirtualized, tries to disable
	 * interrupts.  We simply step over the instruction, which Mostly
	 * Works.
	 */
	if (opcode == 0xfa) {
		cpu->regs->eip++;
		return 1;
	}

	/*
	 * 0x66 is the operand-size prefix: the in/out that follows is 16 bit
	 * rather than 32 bit.  Count the prefix byte and fetch the real opcode.
	 */
	if (opcode == 0x66) {
		op16 = 1;
		len = 1;
		opcode = lgread(cpu, pa + len, u8);
	}

	/*
	 * Bit 0 only selects the access width, so mask it off and recognise
	 * the four in/out encodings.  The immediate-port forms are two bytes
	 * long, the %dx forms one byte.
	 */
	switch (opcode & 0xFE) {
	case 0xE4: /* in <next byte>,%al */
		is_in = 1;
		/* fall through */
	case 0xE6: /* out %al,<next byte> */
		len += 2;
		break;
	case 0xEC: /* in (%dx),%al */
		is_in = 1;
		/* fall through */
	case 0xEE: /* out %al,(%dx) */
		len += 1;
		break;
	default:
		/* Not something we know how to emulate. */
		return 0;
	}

	/*
	 * An "in" expects its result in %eax.  We answer all-ones, the
	 * traditional "nothing there" value, touching only as many bits as
	 * the access width covers: bit 0 clear means a byte access, bit 0 set
	 * means 16 bit (with the 0x66 prefix) or 32 bit (without).
	 */
	if (is_in) {
		if (!(opcode & 0x1))
			cpu->regs->eax |= 0xFF;
		else if (op16)
			cpu->regs->eax |= 0xFFFF;
		else
			cpu->regs->eax = 0xFFFFFFFF;
	}

	/* The instruction is "done": step the Guest past it. */
	cpu->regs->eip += len;
	return 1;
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
	switch (cpu->regs->trapnum) {
	case 7:
		/*
		 * Device Not Available.  If the Guest did not ask to see
		 * these (cpu->ts clear), the FPU was already restored for it
		 * in lguest_arch_run_guest() and there is nothing to report.
		 */
		if (!cpu->ts)
			return;
		break;
	case 13:
		/*
		 * General Protection Fault.  With a zero error code this may
		 * be one of the instructions emulate_insn() knows how to step
		 * over; if it did so, just resume the Guest.
		 */
		if (cpu->regs->errcode == 0 && emulate_insn(cpu))
			return;
		break;
	case 14:
		/*
		 * Page Fault.  Most Guest page tables are populated lazily,
		 * so first try to map the page behind the Guest's back; the
		 * error code says whether this was a read or a write and
		 * whether it came from kernel or userspace.  On success the
		 * Guest never hears about the fault.
		 */
		if (demand_page(cpu, cpu->arch.last_pagefault,
				cpu->regs->errcode))
			return;
		/*
		 * A genuine fault: publish the cr2 value so the Guest knows
		 * where it happened.  A badly broken Guest can fault before
		 * LHCALL_LGUEST_INIT, in which case lguest_data is still NULL.
		 */
		if (cpu->lg->lguest_data &&
		    put_user(cpu->arch.last_pagefault,
			     &cpu->lg->lguest_data->cr2))
			kill_guest(cpu, "Writing cr2");
		break;
	case LGUEST_TRAP_ENTRY:
		/*
		 * Hypercall: "struct hcall_args" overlays the saved registers,
		 * so pointing cpu->hcall at them marks a hypercall as pending.
		 */
		cpu->hcall = (struct hcall_args *)cpu->regs;
		return;
	case 32 ... 255:
		/*
		 * A real interrupt, already serviced by the Host's handler.
		 * Be polite and let another process run if one is due, then
		 * go back to the Guest.
		 */
		cond_resched();
		return;
	}

	/* Anything that gets here has to be reflected into the Guest. */
	if (deliver_trap(cpu, cpu->regs->trapnum))
		return;

	/*
	 * The Guest has no handler for this trap (none registered yet, or a
	 * fault it is not allowed to handle): it dies with this message.
	 */
	kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
		   cpu->regs->trapnum, cpu->regs->eip,
		   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
		   : cpu->regs->errcode);
}
/*
* Now we can look at each of the routines this calls, in increasing order of
* complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
* deliver_trap() and demand_page(). After all those, we'll be ready to
* examine the Switcher, and our philosophical understanding of the Host/Guest
* duality will be complete.
:*/
/*
 * Set (on != NULL) or clear (on == NULL) the Page Global Enable bit in this
 * CPU's cr4.  The void * signature lets it be passed to on_each_cpu().
 */
static void adjust_pge(void *on)
{
	unsigned long cr4 = read_cr4();

	if (on)
		cr4 |= X86_CR4_PGE;
	else
		cr4 &= ~X86_CR4_PGE;
	write_cr4(cr4);
}
/*H:020
* Now the Switcher is mapped and every thing else is ready, we need to do
* some more i386-specific initialization.
*/
void __init lguest_arch_host_init(void)
{
	int i, cpu_nr;

	/*
	 * Nearly all of x86/switcher_32.S is indifferent to having been
	 * moved: jumps are relative and it references no external code or
	 * data.  The exception is default_idt_entries[], which holds the
	 * absolute addresses of the interrupt stubs.  Shift every entry by
	 * switcher_offset(), the distance between the compiled-in Switcher
	 * and its high-mapped copy.
	 */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] += switcher_offset();

	/*
	 * Prepare each possible CPU's "struct lguest_pages".  Everything that
	 * does not depend on the particular Guest is filled in here; the rest
	 * is done by copy_in_guest_info() just before a Guest runs.
	 */
	for_each_possible_cpu(cpu_nr) {
		struct lguest_pages *pg = lguest_pages(cpu_nr);
		struct lguest_ro_state *ro = &pg->state;

		/*
		 * Host GDT: one per CPU.  A descriptor's "size" field is
		 * really the offset of the last byte, hence the -1.
		 */
		ro->host_gdt_desc.size = GDT_SIZE-1;
		ro->host_gdt_desc.address = (long)get_cpu_gdt_table(cpu_nr);

		/*
		 * Host IDT: shared by all CPUs, so the descriptor of the CPU
		 * we are running on will do for every one of them.
		 */
		store_idt(&ro->host_idt_desc);

		/*
		 * Guest GDT and IDT descriptors point at the tables kept in
		 * this same structure; the tables themselves are copied in
		 * before the Guest actually runs.
		 */
		ro->guest_idt_desc.size = sizeof(ro->guest_idt)-1;
		ro->guest_idt_desc.address = (long)&ro->guest_idt;
		ro->guest_gdt_desc.size = sizeof(ro->guest_gdt)-1;
		ro->guest_gdt_desc.address = (long)&ro->guest_gdt;

		/*
		 * When the Guest traps into the Switcher the stack must land
		 * in pg->regs.  Pushes move towards lower addresses, so the
		 * initial stack pointer is the end of that structure.
		 */
		ro->guest_tss.sp0 = (long)(&pg->regs + 1);
		/* ...and that stack uses one of our special LGUEST segments. */
		ro->guest_tss.ss0 = LGUEST_DS;

		/*
		 * The I/O permission bitmap offset is set to the end of the
		 * TSS, which means "no ports allowed".
		 */
		ro->guest_tss.io_bitmap_base = sizeof(ro->guest_tss);

		/* GDT and IDT entries which are identical for every Guest. */
		setup_default_gdt_entries(ro);
		setup_default_idt_entries(ro, default_idt_entries);

		/*
		 * The Host itself must be able to use the LGUEST segments on
		 * this CPU, so install them in the Host GDT too.
		 */
		get_cpu_gdt_table(cpu_nr)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(cpu_nr)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/*
	 * The Switcher runs with %cs set to the LGUEST_CS entry, which is
	 * present in both the Host and the Guest GDT and so survives the
	 * switch.  lguest_entry is the far pointer handed to "lcall" to load
	 * that segment and jump to the relocated switch_to_guest.
	 */
	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	/*
	 * Last job: turn off Page Global Enable.  PGE marks page table
	 * entries as never changing, and the Host uses it for all kernel
	 * pages.  Lguest breaks that assumption by switching to the Guest
	 * kernel behind the Host's back; leaving PGE on gives really weird
	 * bugs.  Toggling it around every switch was tried and slowed the
	 * Switcher down noticeably, so it is disabled for good.
	 *
	 * Hold off CPU hotplug while we visit every online CPU.
	 */
	get_online_cpus();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		/* Remember the original setting so the exit path can restore it. */
		cpu_had_pge = 1;
		/* A NULL argument makes adjust_pge() clear the bit on each CPU. */
		on_each_cpu(adjust_pge, (void *)0, 1);
		/* Drop the feature from the global feature set as well. */
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
	}
	put_online_cpus();
}
/*:*/
void __exit lguest_arch_host_fini(void)
{
	/* Keep the set of online CPUs stable while we (maybe) touch them all. */
	get_online_cpus();

	/* Nothing to undo unless init found PGE enabled and switched it off. */
	if (!cpu_had_pge)
		goto out;

	/* Re-advertise the feature, then set the cr4 bit on every CPU. */
	set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
	/* A non-NULL argument makes adjust_pge() set PGE. */
	on_each_cpu(adjust_pge, (void *)1, 1);
out:
	put_online_cpus();
}
/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
/*
 * Dispatch an i386-specific hypercall.  args->arg0 selects the call and the
 * remaining arguments are passed straight to the function implementing it.
 * Returns 0 if the call was recognised, -EIO otherwise.
 */
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
	int ret = 0;

	switch (args->arg0) {
	case LHCALL_LOAD_TLS:
		guest_load_tls(cpu, args->arg1);
		break;
	case LHCALL_LOAD_IDT_ENTRY:
		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
		break;
	case LHCALL_LOAD_GDT_ENTRY:
		load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
		break;
	default:
		/* Not one of ours: the Guest is misbehaving. */
		ret = -EIO;
		break;
	}

	return ret;
}
/*H:126 i386-specific hypercall initialization: */
int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
{
	/* Stays 0 ("don't rely on the TSC") unless the checks below pass. */
	u32 khz = 0;

	/*
	 * The hypercall's single argument is the Guest address of its
	 * "struct lguest_data"; make sure the whole structure is in bounds.
	 */
	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
			       sizeof(*cpu->lg->lguest_data)))
		return -EFAULT;

	/*
	 * From now on we keep a direct pointer into the Launcher's memory and
	 * use the user-copy helpers on it instead of lgread/lgwrite.
	 */
	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;

	/*
	 * Only report a TSC speed if the counter does not vary with CPU
	 * frequency and the kernel has not flagged it as unstable.
	 */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
		khz = tsc_khz;

	if (put_user(khz, &cpu->lg->lguest_data->tsc_khz))
		return -EFAULT;

	/* The interrupt code may object to the Guest's system call vector. */
	if (!check_syscall_vector(cpu->lg))
		kill_guest(cpu, "bad syscall vector");

	return 0;
}
/*:*/
/*L:030
* Most of the Guest's registers are left alone: we used get_zeroed_page() to
* allocate the structure, so they will be 0.
*/
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
{
	/*
	 * Where the Guest starts executing: the "Extended Instruction
	 * Pointer".
	 */
	cpu->regs->eip = start;

	/*
	 * Bit 1 of eflags (X86_EFLAGS_FIXED) is supposed to always be set;
	 * X86_EFLAGS_IF keeps interrupts enabled, as they always are while
	 * the Guest runs.
	 */
	cpu->regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;

	/*
	 * The Guest boots on the kernel segments: __KERNEL_CS for code and
	 * __KERNEL_DS for stack, extra and data.  The requested privilege
	 * level lives in the low bits of a selector, and the Guest runs at
	 * GUEST_PL.
	 */
	cpu->regs->cs = __KERNEL_CS|GUEST_PL;
	cpu->regs->ss = __KERNEL_DS|GUEST_PL;
	cpu->regs->es = __KERNEL_DS|GUEST_PL;
	cpu->regs->ds = __KERNEL_DS|GUEST_PL;

	/*
	 * %esi is deliberately left alone: it points to the boot information
	 * at physical address 0.
	 */

	/* Finally, the few GDT entries the Guest expects to find at boot. */
	setup_guest_gdt(cpu);
}

View file

@ -0,0 +1,388 @@
/*P:900
* This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
* both the Host and Guest to do the low-level Guest<->Host switch. It is as
* simple as it can be made, but it's naturally very specific to x86.
*
* You have now completed Preparation. If this has whet your appetite; if you
* are feeling invigorated and refreshed then the next, more challenging stage
* can be found in "make Guest".
:*/
/*M:012
* Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
* gain at least 1% more performance. Since neither LOC nor performance can be
* measured beforehand, it generally means implementing a feature then deciding
* if it's worth it. And once it's implemented, who can say no?
*
* This is why I haven't implemented this idea myself. I want to, but I
* haven't. You could, though.
*
* The main place where lguest performance sucks is Guest page faulting. When
* a Guest userspace process hits an unmapped page we switch back to the Host,
* walk the page tables, find it's not mapped, switch back to the Guest page
* fault handler, which calls a hypercall to set the page table entry, then
* finally returns to userspace. That's two round-trips.
*
* If we had a small walker in the Switcher, we could quickly check the Guest
* page table and if the page isn't mapped, immediately reflect the fault back
* into the Guest. This means the Switcher would have to know the top of the
* Guest page table and the page fault handler address.
*
* For simplicity, the Guest should only handle the case where the privilege
* level of the fault is 3 and probably only not present or write faults. It
* should also detect recursive faults, and hand the original fault to the
* Host (which is actually really easy).
*
* Two questions remain. Would the performance gain outweigh the complexity?
* And who would write the verse documenting it?
:*/
/*M:011
* Lguest64 handles NMI. This gave me NMI envy (until I looked at their
* code). It's worth doing though, since it would let us use oprofile in the
* Host when a Guest is running.
:*/
/*S:100
* Welcome to the Switcher itself!
*
* This file contains the low-level code which changes the CPU to run the Guest
* code, and returns to the Host when something happens. Understand this, and
* you understand the heart of our journey.
*
* Because this is in assembler rather than C, our tale switches from prose to
* verse. First I tried limericks:
*
* There once was an eax reg,
* To which our pointer was fed,
* It needed an add,
* Which asm-offsets.h had
* But this limerick is hurting my head.
*
* Next I tried haikus, but fitting the required reference to the seasons in
* every stanza was quickly becoming tiresome:
*
* The %eax reg
* Holds "struct lguest_pages" now:
* Cherry blossoms fall.
*
* Then I started with Heroic Verse, but the rhyming requirement leeched away
* the content density and led to some uniquely awful oblique rhymes:
*
* These constants are coming from struct offsets
* For use within the asm switcher text.
*
* Finally, I settled for something between heroic hexameter, and normal prose
* with inappropriate linebreaks. Anyway, it aint no Shakespeare.
*/
// Not all kernel headers work from assembler
// But these ones are needed: the ENTRY() define
// And constants extracted from struct offsets
// To avoid magic numbers and breakage:
// Should they change the compiler can't save us
// Down here in the depths of assembler code.
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
#include <asm/segment.h>
#include <asm/lguest.h>
// We mark the start of the code to copy
// It's placed in .text tho it's never run here
// You'll see the trick macro at the end
// Which interleaves data and text to effect.
.text
ENTRY(start_switcher_text)
// start_switcher_text marks where the code to be copied begins;
// end_switcher_text, at the bottom of this file, marks where it stops.
// When we reach switch_to_guest we have just left
// The safe and comforting shores of C code
// %eax has the "struct lguest_pages" to use
// Where we save state and still see it from the Guest
// And %ebx holds the Guest shadow pagetable:
// Once set we have truly left Host behind.
ENTRY(switch_to_guest)
// We told gcc all its regs could fade,
// Clobbered by our journey into the Guest
// We could have saved them, if we tried
// But time is our master and cycles count.
// Segment registers must be saved for the Host
// We push them on the Host stack for later
// (SWITCH_TO_HOST pops these, and %ebp, in reverse order.)
pushl %es
pushl %ds
pushl %gs
pushl %fs
// But the compiler is fickle, and heeds
// No warning of %ebp clobbers
// When frame pointers are used. That register
// Must be saved and restored or chaos strikes.
pushl %ebp
// The Host's stack is done, now save it away
// In our "struct lguest_pages" at offset
// Distilled into asm-offsets.h
movl %esp, LGUEST_PAGES_host_sp(%eax)
// All saved and there's now five steps before us:
// Stack, GDT, IDT, TSS
// Then last of all the page tables are flipped.
// Yet beware that our stack pointer must be
// Always valid lest an NMI hits
// %edx does the duty here as we juggle
// %eax is lguest_pages: our stack lies within.
// (%esp = %eax + LGUEST_PAGES_regs, computed in %edx so that
// %esp itself goes from one valid value straight to the next.)
movl %eax, %edx
addl $LGUEST_PAGES_regs, %edx
movl %edx, %esp
// The Guest's GDT we so carefully
// Placed in the "struct lguest_pages" before
lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
// The Guest's IDT we did partially
// Copy to "struct lguest_pages" as well.
lidt LGUEST_PAGES_guest_idt_desc(%eax)
// The TSS entry which controls traps
// Must be loaded up with "ltr" now:
// The GDT entry that TSS uses
// Changes type when we load it: damn Intel!
// For after we switch over our page tables
// That entry will be read-only: we'd crash.
movl $(GDT_ENTRY_TSS*8), %edx
ltr %dx
// Look back now, before we take this last step!
// The Host's TSS entry was also marked used;
// Let's clear it again for our return.
// The GDT descriptor of the Host
// Points to the table after two "size" bytes
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
// Clear "used" from type field (byte 5, bit 2)
// (Mask 0xFD clears the second-lowest bit of that byte and
// leaves the rest of the Host's TSS descriptor untouched.)
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
// Once our page table's switched, the Guest is live!
// The Host fades as we run this final step.
// Our "struct lguest_pages" is now read-only.
movl %ebx, %cr3
// The page table change did one tricky thing:
// The Guest's register page has been mapped
// Writable under our %esp (stack) --
// We can simply pop off all Guest regs.
// (The pops mirror, in reverse, the pushes in SWITCH_TO_HOST.)
popl %eax
popl %ebx
popl %ecx
popl %edx
popl %esi
popl %edi
popl %ebp
popl %gs
popl %fs
popl %ds
popl %es
// Near the base of the stack lurk two strange fields
// Which we fill as we exit the Guest
// These are the trap number and its error
// We can simply step past them on our way.
// (Two four-byte slots, hence the eight.)
addl $8, %esp
// The last five stack slots hold return address
// And everything needed to switch privilege
// From Switcher's level 0 to Guest's 1,
// And the stack where the Guest had last left it.
// Interrupts are turned back on: we are Guest.
iret
// We tread two paths to switch back to the Host
// Yet both must save Guest state and restore Host
// So we put the routine in a macro.
#define SWITCH_TO_HOST \
/* We save the Guest state: all registers first \
* Laid out just as "struct lguest_regs" defines */ \
pushl %es; \
pushl %ds; \
pushl %fs; \
pushl %gs; \
pushl %ebp; \
pushl %edi; \
pushl %esi; \
pushl %edx; \
pushl %ecx; \
pushl %ebx; \
pushl %eax; \
/* Our stack and our code are using segments \
* Set in the TSS and IDT \
* Yet if we were to touch data we'd use \
* Whatever data segment the Guest had. \
* Load the lguest ds segment for now. */ \
movl $(LGUEST_DS), %eax; \
movl %eax, %ds; \
/* So where are we? Which CPU, which struct? \
* The stack is our clue: our TSS starts \
* It at the end of "struct lguest_pages". \
* Or we may have stumbled while restoring \
* Our Guest segment regs while in switch_to_guest, \
* The fault pushed atop that part-unwound stack. \
* If we round the stack down to the page start \
* We're at the start of "struct lguest_pages". \
* (The assembler binds "<<" tighter than "-", so the \
* mask below is ~((1 << PAGE_SHIFT) - 1): it clears \
* the offset-within-page bits of the stack pointer.) */ \
movl %esp, %eax; \
andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
/* Save our trap number: the switch will obscure it \
* (In the Host the Guest regs are not mapped here) \
* %ebx holds it safe for deliver_to_host */ \
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
/* The Host GDT, IDT and stack! \
* All these lie safely hidden from the Guest: \
* We must return to the Host page tables \
* (Hence that was saved in struct lguest_pages) */ \
movl LGUEST_PAGES_host_cr3(%eax), %edx; \
movl %edx, %cr3; \
/* As before, when we looked back at the Host \
* As we left and marked TSS unused \
* So must we now for the Guest left behind. */ \
andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
/* Switch to Host's GDT, IDT. */ \
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
lidt LGUEST_PAGES_host_idt_desc(%eax); \
/* Restore the Host's stack where its saved regs lie */ \
movl LGUEST_PAGES_host_sp(%eax), %esp; \
/* Last the TSS: our Host is returned */ \
movl $(GDT_ENTRY_TSS*8), %edx; \
ltr %dx; \
/* Restore now the regs saved right at the first \
* (switch_to_guest pushed es, ds, gs, fs, ebp; \
* here they come off in the opposite order). \
* On exit %eax is still "struct lguest_pages" \
* and %ebx still holds the trap number. */ \
popl %ebp; \
popl %fs; \
popl %gs; \
popl %ds; \
popl %es
// The first path is trod when the Guest has trapped:
// (Which trap it was has been pushed on the stack).
// We need only switch back, and the Host will decode
// Why we came home, and what needs to be done.
// (The "iret" consumes the frame which run_guest_once() built
// with its "pushf; lcall", landing just after that lcall.)
return_to_host:
SWITCH_TO_HOST
iret
// We are led to the second path like so:
// An interrupt, with some cause external
// Has jerked us rudely from the Guest's code
// Again we must return home to the Host
deliver_to_host:
SWITCH_TO_HOST
// But now we must go home via that place
// Where that interrupt was supposed to go
// Had we not been ensconced, running the Guest.
// Here we see the trickiness of run_guest_once():
// The Host stack is formed like an interrupt
// With EIP, CS and EFLAGS layered.
// Interrupt handlers end with "iret"
// And that will take us home at long long last.
// But first we must find the handler to call!
// The IDT descriptor for the Host
// Has two bytes for size, and four for address:
// %edx will hold it for us for now.
// (%eax is still "struct lguest_pages" after SWITCH_TO_HOST.)
movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
// We now know the table address we need,
// And saved the trap's number inside %ebx.
// Yet the pointer to the handler is smeared
// Across the bits of the table entry.
// What oracle can tell us how to extract
// From such a convoluted encoding?
// I consulted gcc, and it gave
// These instructions, which I gladly credit:
// In prose: %eax = table + 8 * trap number, the entry's address;
// %edx = its first 16 bits, the low half of the handler address;
// %eax = its second dword with the low 16 bits zeroed, which
// leaves the high half of the address; or the two together.
leal (%edx,%ebx,8), %eax
movzwl (%eax),%edx
movl 4(%eax), %eax
xorw %ax, %ax
orl %eax, %edx
// Now the address of the handler's in %edx
// We call it now: its "iret" drops us home.
jmp *%edx
// Every interrupt can come to us here
// But we must truly tell each apart.
// They number two hundred and fifty six
// And each must land in a different spot,
// Push its number on stack, and join the stream.
// And worse, a mere six of the traps stand apart
// And push on their stack an addition:
// An error number, thirty two bits long
// So we punish the other two fifty
// And make them push a zero so they match.
// Yet two fifty six entries is long
// And all will look most the same as the last
// So we create a macro which can make
// As many entries as we need to fill.
// Note the change to .data then .text:
// We plant the address of each entry
// Into a (data) table for the Host
// To know where each Guest interrupt should go.
.macro IRQ_STUB N TARGET
// One stub for trap or interrupt number N, which ends by jumping to TARGET.
// The stub's own address (local label 1) is appended to .data,
// building the default_idt_entries table one slot per stub.
.data; .long 1f; .text; 1:
// Trap eight, ten through fourteen and seventeen
// Supply an error number. Else zero.
.if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
pushl $0
.endif
// Every stub then pushes its own number, so the stack always
// carries an error code slot followed by the trap number.
pushl $\N
jmp \TARGET
ALIGN
.endm
// This macro creates numerous entries
// Using GAS macros which out-power C's.
.macro IRQ_STUBS FIRST LAST TARGET
// Emit one IRQ_STUB for each number from FIRST to LAST inclusive,
// all jumping to TARGET; "irq" is the running assembler counter.
irq=\FIRST
.rept \LAST-\FIRST+1
IRQ_STUB irq \TARGET
irq=irq+1
.endr
.endm
// Here's the marker for our pointer table
// Laid in the data section just before
// Each macro places the address of code
// Forming an array: each one points to text
// Which handles interrupt in its turn.
.data
.global default_idt_entries
// The label itself holds no data: every IRQ_STUB expanded below
// appends one .long to .data, in order, directly after it.
default_idt_entries:
.text
// The first two traps go straight back to the Host
IRQ_STUBS 0 1 return_to_host
// We'll say nothing, yet, about NMI
IRQ_STUB 2 handle_nmi
// Other traps also return to the Host
IRQ_STUBS 3 31 return_to_host
// All interrupts go via their handlers
IRQ_STUBS 32 127 deliver_to_host
// 'Cept system calls coming from userspace
// Are to go to the Guest, never the Host.
// (Vector 128 therefore takes the return_to_host path.)
IRQ_STUB 128 return_to_host
IRQ_STUBS 129 255 deliver_to_host
// The NMI, what a fabulous beast
// Which swoops in and stops us no matter that
// We're suspended between heaven and hell,
// (Or more likely between the Host and Guest)
// When in it comes! We are dazed and confused
// So we do the simplest thing which one can.
// Though we've pushed the trap number and zero
// We discard them, return, and hope we live.
handle_nmi:
// Drop the two words our stub pushed (the zero and the trap
// number, four bytes each), then return from the interrupt.
addl $8, %esp
iret
// We are done; all that's left is Mastery
// And "make Mastery" is a journey long
// Designed to make your fingers itch to code.
// Here ends the text, the file and poem.
// End marker: pairs with start_switcher_text at the top of the code.
ENTRY(end_switcher_text)