Commit ecda85e7, authored by Juergen Gross, committed by Ingo Molnar

x86/lguest: Remove lguest support

Lguest seems to be rather unused these days. Over the last two years it
has seen only patches ensuring it still builds, and its official state is
"Odd Fixes".

Remove it in order to be able to clean up the paravirt code.
Signed-off-by: Juergen Gross <>
Acked-by: Rusty Russell <>
Acked-by: Thomas Gleixner <>
Cc: Linus Torvalds <>
Cc: Peter Zijlstra <>
Link: default avatarIngo Molnar <>
parent edcb5cf8
......@@ -7640,17 +7640,6 @@ T: git git://
S: Maintained
F: drivers/media/dvb-frontends/lgdt3305.*
M: Rusty Russell <>
S: Odd Fixes
F: arch/x86/include/asm/lguest*.h
F: arch/x86/lguest/
F: drivers/lguest/
F: include/linux/lguest*.h
F: tools/lguest/
M: Viresh Kumar <>
......@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
# Hyper-V paravirtualization support
# lguest paravirtualization support
obj-$(CONFIG_LGUEST_GUEST) += lguest/
obj-y += realmode/
obj-y += kernel/
obj-y += mm/
......@@ -777,8 +777,6 @@ config KVM_DEBUG_FS
Statistics are displayed in debugfs filesystem. Enabling this option
may incur significant overhead.
source "arch/x86/lguest/Kconfig"
bool "Paravirtual steal time accounting"
depends on PARAVIRT
#ifndef _ASM_X86_LGUEST_H
#define _ASM_X86_LGUEST_H
#ifndef __ASSEMBLY__
#include <asm/desc.h>
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
/* Declarations for definitions in arch/x86/lguest/head_32.S */
extern char lguest_noirq_iret[];
extern const char lgstart_cli[], lgend_cli[];
extern const char lgstart_pushf[], lgend_pushf[];
extern void lguest_iret(void);
extern void lguest_init(void);
struct lguest_regs {
/* Manually saved part. */
unsigned long eax, ebx, ecx, edx;
unsigned long esi, edi, ebp;
unsigned long gs;
unsigned long fs, ds, es;
unsigned long trapnum, errcode;
/* Trap pushed part */
unsigned long eip;
unsigned long cs;
unsigned long eflags;
unsigned long esp;
unsigned long ss;
/* This is a guest-specific page (mapped ro) into the guest. */
struct lguest_ro_state {
/* Host information we need to restore when we switch back. */
u32 host_cr3;
struct desc_ptr host_idt_desc;
struct desc_ptr host_gdt_desc;
u32 host_sp;
/* Fields which are used when guest is running. */
struct desc_ptr guest_idt_desc;
struct desc_ptr guest_gdt_desc;
struct x86_hw_tss guest_tss;
struct desc_struct guest_idt[IDT_ENTRIES];
struct desc_struct guest_gdt[GDT_ENTRIES];
struct lg_cpu_arch {
/* The GDT entries copied into lguest_ro_state when running. */
struct desc_struct gdt[GDT_ENTRIES];
/* The IDT entries: some copied into lguest_ro_state when running. */
struct desc_struct idt[IDT_ENTRIES];
/* The address of the last guest-visible pagefault (ie. cr2). */
unsigned long last_pagefault;
static inline void lguest_set_ts(void)
u32 cr0;
cr0 = read_cr0();
if (!(cr0 & 8))
write_cr0(cr0 | 8);
/* Full 4G segment descriptors, suitable for CS and DS. */
((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_LGUEST_H */
/* Architecture specific portion of the lguest hypercalls */
#define LHCALL_HALT 10
#define LHCALL_SET_PMD 13
#define LHCALL_SET_PTE 14
#define LHCALL_SET_PGD 15
#define LHCALL_LOAD_TLS 16
/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
* But first, how does our Guest contact the Host to ask for privileged
* operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself.
* Our hypercall mechanism uses the highest unused trap code (traps 32 and
* above are used by real hardware interrupts). Seventeen hypercalls are
* available: the hypercall number is put in the %eax register, and the
* arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
* If a return value makes sense, it's returned in %eax.
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's
* definition of a gentleman: "someone who is only rude intentionally".
static inline unsigned long
hcall(unsigned long call,
unsigned long arg1, unsigned long arg2, unsigned long arg3,
unsigned long arg4)
/* "int" is the Intel instruction to trigger a trap. */
asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
/* The call in %eax (aka "a") might be overwritten */
: "=a"(call)
/* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
: "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
/* "memory" means this might write somewhere in memory.
* This isn't true for all calls, but it's safe to tell
* gcc that it might happen so it doesn't get clever. */
: "memory");
return call;
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
struct hcall_args {
/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
unsigned long arg0, arg1, arg2, arg3, arg4;
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_LGUEST_HCALL_H */
......@@ -662,7 +662,7 @@ static inline void sync_core(void)
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
* but it will fault at CPL3 (i.e. Xen PV and lguest).
* but it will fault at CPL3 (i.e. Xen PV).
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
......@@ -201,7 +201,7 @@ struct boot_params {
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
......@@ -4,9 +4,6 @@
#include <asm/ucontext.h>
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
......@@ -62,23 +59,6 @@ void foo(void)
OFFSET(stack_canary_offset, stack_canary, canary);
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
......@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
/* Unknown implementation; there's really
nothing we can do at this point. */
......@@ -165,7 +164,6 @@ WEAK(xen_entry)
.long .Ldefault_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
......@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.legacy.reserve_bios_regions = 1;
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
......@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
source drivers/lguest/Kconfig
bool "Lguest guest support"
depends on X86_32 && PARAVIRT && PCI
select TTY
select VIRTIO
Lguest is a tiny in-kernel hypervisor. Selecting this will
allow your kernel to boot under lguest. This option will increase
your kernel size by about 10k. If in doubt, say N.
If you say Y here, make sure you say Y (or M) to the virtio block
and net drivers which lguest needs.
obj-y := head_32.o boot.o
CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
This diff is collapsed.
#include <linux/linkage.h>
#include <linux/lguest.h>
#include <asm/lguest_hcall.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
* Our story starts with the bzImage: booting starts at startup_32 in
* arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
* kernel in place and then jumps into it: startup_32 in
* arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
* register, which is created by the bootloader (the Launcher in our case).
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
* header and kernel command line somewhere safe, and populates some initial
* page tables. Finally it checks the 'hardware_subarch' field. This was
* introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
* assigned number), then it calls us here.
* WARNING: be very careful here! We're running at addresses equal to physical
* addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
* The .section line puts this code in .init.text so it will be discarded after
* boot.
.section .init.text, "ax", @progbits
* We make the "initialization" hypercall now to tell the Host where
* our lguest_data struct is.
movl $lguest_data - __PAGE_OFFSET, %ebx
/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
movl $(initial_page_table - __PAGE_OFFSET), %ebx
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
/* Jumps are relative: we're running __PAGE_OFFSET too low. */
jmp lguest_init+__PAGE_OFFSET
* We create a macro which puts the assembler code between lgstart_ and lgend_
* markers. These templates are put in the .text section: they can't be
* discarded after boot as we may need to patch modules, too.
#define LGUEST_PATCH(name, insns...) \
lgstart_##name: insns; lgend_##name:; \
.globl lgstart_##name; .globl lgend_##name
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
* But using those wrappers is inefficient (we'll see why that doesn't matter
* for save_fl and irq_disable later). If we write our routines carefully in
* assembler, we can avoid clobbering any registers and avoid jumping through
* the wrapper functions.
* I skipped over our first piece of assembler, but this one is worth studying
* in a bit more detail so I'll describe in easy stages. First, the routine to
* enable interrupts:
* The reverse of irq_disable, this sets lguest_data.irq_enabled to
* X86_EFLAGS_IF (ie. "Interrupts enabled").
movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
* But now we need to check if the Host wants to know: there might have
* been interrupts waiting to be delivered, in which case it will have
* set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
* jump to send_interrupts, otherwise we're done.
cmpl $0, lguest_data+LGUEST_DATA_irq_pending
jnz send_interrupts
* One cool thing about x86 is that you can do many things without using
* a register. In this case, the normal path hasn't needed to save or
* restore any registers at all!
* OK, now we need a register: eax is used for the hypercall number,
* We used not to bother with this pending detection at all, which was
* much simpler. Sooner or later the Host would realize it had to
* send us an interrupt. But that turns out to make performance 7
* times worse on a simple tcp benchmark. So now we do this the hard
* way.
pushl %eax
/* This is the actual hypercall trap. */
/* Put eax back the way we found it. */
popl %eax
* Finally, the "popf" or "restore flags" routine. The %eax register holds the
* flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
* enabling interrupts again, if it's 0 we're leaving them off.
/* This is just "lguest_data.irq_enabled = flags;" */
movl %eax, lguest_data+LGUEST_DATA_irq_enabled
* Now, if the %eax value has enabled interrupts and
* lguest_data.irq_pending is set, we want to tell the Host so it can
* deliver any outstanding interrupts. Fortunately, both values will
* be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
* instruction will AND them together for us. If both are set, we
* jump to send_interrupts.
testl lguest_data+LGUEST_DATA_irq_pending, %eax
jnz send_interrupts
/* Again, the normal path has used no extra registers. Clever, huh? */
/* These demark the EIP where host should never deliver interrupts. */
.global lguest_noirq_iret
* When the Host reflects a trap or injects an interrupt into the Guest, it
* sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
* so the Guest iret logic does the right thing when restoring it. However,
* when the Host sets the Guest up for direct traps, such as system calls, the
* processor is the one to push eflags onto the stack, and the interrupt bit
* will be 1 (in reality, interrupts are always enabled in the Guest).
* This turns out to be harmless: the only trap which should happen under Linux
* with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
* regions), which has to be reflected through the Host anyway. If another
* trap *does* go off when interrupts are disabled, the Guest will panic, and
* we'll never get to this iret!
* There is one final paravirt_op that the Guest implements, and glancing at it
* you can see why I left it to last. It's *cool*! It's in *assembler*!
* The "iret" instruction is used to return from an interrupt or trap. The
* stack looks like this:
* old address
* old code segment & privilege level
* old processor flags ("eflags")
* The "iret" instruction pops those values off the stack and restores them all
* at once. The only problem is that eflags includes the Interrupt Flag which
* the Guest can't change: the CPU will simply ignore it when we do an "iret".
* So we have to copy eflags from the stack to lguest_data.irq_enabled before
* we do the "iret".
* There are two problems with this: firstly, we can't clobber any registers
* and secondly, the whole thing needs to be atomic. The first problem
* is solved by using "push memory"/"pop memory" instruction pair for copying.
* The second is harder: copying eflags to lguest_data.irq_enabled will turn
* interrupts on before we're finished, so we could be interrupted before we
* return to userspace or wherever. Our solution to this is to tell the
* Host that it is *never* to interrupt us there, even if interrupts seem to be
* enabled. (It's not necessary to protect the pop instruction, since
* data gets updated only after it completes, so we only need to protect
* one instruction, iret).
pushl 2*4(%esp)
* Note the %ss: segment prefix here. Normal data accesses use the
* "ds" segment, but that will have already been restored for whatever
* we're returning to (such as userspace): we can't trust it. The %ss:
* prefix makes sure we use the stack segment, which is still valid.
popl %ss:lguest_data+LGUEST_DATA_irq_enabled
......@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/
obj-$(CONFIG_ISDN) += isdn/
obj-$(CONFIG_EDAC) += edac/
obj-$(CONFIG_EISA) += eisa/
obj-y += lguest/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-y += mmc/
......@@ -470,7 +470,7 @@ config VIRTIO_BLK
depends on VIRTIO
This is the virtual block driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
QEMU based VMMs (like KVM or Xen). Say Y or M.
bool "SCSI passthrough request for the Virtio block driver"
......@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
depends on VIRTIO && TTY
Virtio console for use with lguest and other hypervisors.
Virtio console for use with hypervisors.
Also serves as a general-purpose serial device for data
transfer between the guest and host. Character devices at
......@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
* We turn the characters into a scatter-gather list, add it to the
* output queue and then kick the Host. Then we sit here waiting for
* it to finish: inefficient in theory, but in practice
* implementations will do it immediately (lguest's Launcher does).
* implementations will do it immediately.
static int put_chars(u32 vtermno, const char *buf, int count)
config LGUEST
tristate "Linux hypervisor example code"
depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
"lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
segments.o lguest_user.o
lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
Preparation Preparation!: PREFIX=P
Drivers: PREFIX=D
Launcher: PREFIX=L
Switcher: PREFIX=S
Mastery: PREFIX=M
@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
@printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
@sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n"
@sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n"
@sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear
Welcome, friend reader, to lguest.
Lguest is an adventure, with you, the reader, as Hero. I can't think of many
5000-line projects which offer both such capability and glimpses of future
potential; it is an exciting time to be delving into the source!
But be warned; this is an arduous journey of several hours or more! And as we
know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
equivalent) to anyone I meet who has completed this documentation.
So get comfortable and keep your wits about you (both quick and humorous).
Along your way to the Noble Goal, you will also gain masterly insight into
lguest, and hypervisors and x86 virtualization in general.
Our Quest is in seven parts: (best read with C highlighting turned on)
I) Preparation
- In which our potential hero is flown quickly over the landscape for a
taste of its scope. Suitable for the armchair coders and other such
persons of faint constitution.
II) Guest
- Where we encounter the first tantalising wisps of code, and come to
understand the details of the life of a Guest kernel.
III) Drivers
- Whereby the Guest finds its voice and becomes useful, and our
understanding of the Guest is completed.
IV) Launcher
- Where we trace back to the creation of the Guest, and thus begin our
understanding of the Host.
V) Host
- Where we master the Host code, through a long and tortuous journey.
Indeed, it is here that our hero is tested in the Bit of Despair.
VI) Switcher
- Where our understanding of the intertwined nature of Guests and Hosts
is completed.
VII) Mastery
- Where our fully fledged hero grapples with the Great Question:
"What next?"
make Preparation!
Rusty Russell.
This diff is collapsed.
* Just as userspace programs request kernel operations through a system
* call, the Guest requests Host operations through a "hypercall". You might
* notice this nomenclature doesn't really follow any logic, but the name has
* been around for long enough that we're stuck with it. As you'd expect, this
* code is basically one big switch statement.
/* Copyright (C) 2006 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/ktime.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "lg.h"
* This is the core hypercall routine: where the Guest gets what it wants.
* Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
switch (args->arg0) {
* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls.
* This call does nothing too, but by breaking out of the Guest
* it makes us process any pending interrupts.
* You can't get here unless you're already initialized. Don't
* do that.
kill_guest(cpu, "already have lguest_data");
char msg[128];
* Shutdown is such a trivial hypercall that we do it in five
* lines right here.
* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored.
__lgread(cpu, msg, args->arg1, sizeof(msg));
msg[sizeof(msg)-1] = '\0';