From 3a006f5b596da90b876d320b8d48f278b88a5ec1 Mon Sep 17 00:00:00 2001
From: iximeow <me@iximeow.net>
Date: Sun, 24 May 2026 00:49:32 +0000
Subject: move tests out to the bottom

---
 src/x86_64.rs | 2883 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 1442 insertions(+), 1441 deletions(-)

(limited to 'src')

diff --git a/src/x86_64.rs b/src/x86_64.rs
index ff23b34..0e08446 100644
--- a/src/x86_64.rs
+++ b/src/x86_64.rs
@@ -437,1722 +437,1723 @@ fn test_check_range_exact() {
     assert!(mapping.check_range(GuestAddress(0x4000), 0x1000));
 }
 
-#[test]
-fn test_xor_runs() {
-    let mut vm = Vm::create(128 * 1024).expect("can create vm");
-    let mut regs = vm.get_regs().expect("can get regs");
+/// a selector for the execution mode the VM should be initialized to.
+///
+/// different `IsaMode` will configure the VM wildly differently; generally any VM/vCPU state not
+/// directly required for the requested mode will be left untouched.
+///
+/// in all modes, CPUID leaves and xcr0 are set up to support any ISA extensions supported by the
+/// host CPU.
+///
+/// in all modes, an IDT is installed with interrupt handlers pointed to the 256 bytes from
+/// `interrupt_handlers_start()`.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum IsaMode {
+    /// request that the VM be configured to run x86-64 instructions, aka "AMD64", or "IA-32e" (and
+    /// specifically "IA-32e 64-bit mode") in some Intel nomenclature.
+    ///
+    /// this configures identity paging, selectors sufficient for long mode (with all vCPU
+    /// execution at CPL=0), prepares some MSRs for syscalls, and of course configures `cr0`
+    /// for long mode.
+    Long,
+    /// request that the VM be configured to run 32-bit instructions, with long mode neither
+    /// enabled nor active.
+    ///
+    /// this configures identity paging and selectors covering all 32-bit address space and with
+    /// CPL=0.
+    Protected,
+    /// request that the VM be configured to run 16-bit instructions.
+    ///
+    /// this configures code/data selectors covering all 24 bits of address space and an interrupt
+    /// descriptor table, and CPUID for any host-supported ISA extensions, but that's about it.
+    Real,
+}
 
-    vm.program(&[0x33, 0xc0], &mut regs);
+/// the settings to configure a [`Vm::create_by_settings`]. see `VmSettings::new` for top-level
+/// configuration.
+pub struct VmSettings {
+    mem_size: usize,
+    isa_mode: IsaMode,
+}
 
-    regs.rax = 0x1234;
-    let rip_before = regs.rip;
+impl VmSettings {
+    /// provide the bare-minimum configuration for a VM: the size of its memory and what execution
+    /// mode the resulting VM should be set for.
+    ///
+    /// VM control settings (IDT, `cs`, `ds`, other selectors, syscalls, page tables, etc) vary
+    /// substantially across different `IsaMode`. in all cases code can be written into the VM with
+    /// [`Vm::program()`], then run with [`Vm::run()`].
+    pub fn new(mem_size: usize, isa_mode: IsaMode) -> Self {
+        Self { mem_size, isa_mode }
+    }
+}
 
-    vm.set_regs(&regs).expect("can set regs");
+impl Vm {
+    pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> {
+        Self::create_by_settings(VmSettings::new(mem_size, IsaMode::Long))
+    }
 
-    vm.set_single_step(true).expect("can set single-step");
+    /// create a VM with one vCPU and `settings.mem_size` bytes of memory at guest-physical address
+    /// 0, then configure it for `settings.isa_mode`. fails with `TooSmall` below 128k of memory.
+    pub fn create_by_settings(settings: VmSettings) -> Result<Vm, VmCreateError> {
+        let kvm = Kvm::new().map_err(|e| VmError::from_kvm("Kvm::new()", e))?;
 
-    let res = vm.run().expect("can run vm");
+        let vm = kvm.create_vm().map_err(|e| VmError::from_kvm("create_vm", e))?;
 
-    let expected_rip = rip_before + 2;
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
+        let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)
+            .map_err(|e| VmError::from_kvm("get_supported_cpuid", e))?;
+
+        // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do
+        if settings.mem_size < 128 * 1024 {
+            return Err(VmCreateError::TooSmall {
+                requested: settings.mem_size,
+                required: 128 * 1024,
+            });
         }
-    };
 
-    let regs_after = vm.get_regs().expect("can get regs");
-    assert_eq!(regs_after.rax, 0);
-}
+        let mapping = Mapping::create_shared(0, settings.mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?;
 
-#[test]
-fn test_protected_mode_runs() {
-    let settings = VmSettings::new(128 * 1024, IsaMode::Protected);
-    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
-    let mut regs = vm.get_regs().expect("can get regs");
+        let region = kvm_userspace_memory_region {
+            slot: 0,
+            guest_phys_addr: 0x0000,
+            memory_size: mapping.size.get() as u64,
+            userspace_addr: mapping.addr.as_ptr() as u64,
+            flags: 0,
+        };
 
-    let buf = &[
-        0xc5, 0xe0, 0x54, 0xc3, // vandps xmm0, xmm3, xmm3
-        0x33, 0xc0,             // xor eax, eax
-        0x8b, 0x09,             // mov ecx, [ecx]
-        0xf4                    // hlt
-    ];
-    vm.program(buf, &mut regs);
+        let set_res = unsafe { vm.set_user_memory_region(region) };
+        set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?;
 
-    regs.rax = 0x1234;
-    regs.rcx = 0x4;
+        let vcpu = vm.create_vcpu(0).map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?;
 
-    vm.set_regs(&regs).expect("can set regs");
+        let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES)
+            .map_err(|e| VmError::from_kvm("get_cpuid2", e))?;
 
-    let res = vm.run().expect("can run vm");
+        let mem_ceiling = mapping.size.get().try_into().unwrap();
 
-    match res {
-        VcpuExit::Hlt => {
-            // expected exit from the `0xf4` above.
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
-        }
-    };
+        let mut this = Vm {
+            settings,
+            vm,
+            vcpu,
+            supported_cpuid,
+            current_cpuid,
+            idt_configured: false,
+            syscall_configured: false,
+            memory: mapping,
+            aux_memories: Vec::new(),
+            mem_ceiling,
+        };
 
-    let regs_after = vm.get_regs().expect("can get regs");
-    assert_eq!(regs_after.rax, 0);
-    assert_eq!(regs_after.rcx, 0);
-}
+        let mut vcpu_regs = this.get_regs()?;
+        let mut vcpu_sregs = this.get_sregs()?;
 
-#[test]
-fn test_pusha_runs() {
-    let settings = VmSettings::new(128 * 1024, IsaMode::Real);
-    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
-    let mut regs = vm.get_regs().expect("can get regs");
+        assert!(this.cpuid_supports(Feature::Base));
+        this.cpuid_set(Feature::Base, true);
 
-    vm.program(&[0x60], &mut regs);
+        match this.settings.isa_mode {
+            IsaMode::Long => {
+                unsafe {
+                    this.configure_identity_paging(Some(&mut vcpu_sregs));
+                    this.configure_selectors(&mut vcpu_sregs);
+                    this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs);
+                    let mut xcrs = this.get_xcrs()?;
+                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
+                    this.set_xcrs(&xcrs)?;
+                    this.configure_syscalls(&mut vcpu_sregs);
+                }
 
-    regs.rip = 0;
-    regs.rax = 0x1234;
-    eprintln!("{:?}", regs);
+                vcpu_sregs.efer |= 0x0000_0500; // LME | LMA
+            }
+            IsaMode::Protected => {
+                unsafe {
+                    this.configure_identity_paging_32b(Some(&mut vcpu_sregs));
+                    this.configure_selectors_32b(&mut vcpu_sregs);
+                    this.configure_idt_32b(&mut vcpu_regs, &mut vcpu_sregs);
+                    let mut xcrs = this.get_xcrs()?;
+                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
+                    this.set_xcrs(&xcrs)?;
 
-    vm.set_regs(&regs).expect("can set regs");
+                }
+            }
+            IsaMode::Real => {
+                unsafe {
+                    this.configure_selectors_16b(&mut vcpu_sregs);
+                    this.configure_idt_16b(&mut vcpu_regs, &mut vcpu_sregs);
+                    let mut xcrs = this.get_xcrs()?;
+                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
+                    this.set_xcrs(&xcrs)?;
 
-    vm.set_single_step(true).expect("can set single-step");
-    let expected_rip = vm.code_addr().0 + 1;
+                    // in 16-bit mode we've set cs and ds to cover the last 4kb of memory, starting
+                    // at the same place we've written code to execute. there's not much memory to
+                    // go around, and not a ton of flexibility in the asmlinator API, so the least
+                    // annoying thing to do might be to just put the stack 0x80 bytes from the end?
+                    vcpu_regs.rsp = 0x1000 - 0x80;
+                }
+            }
+        }
 
-    let res = vm.run().expect("can run vm");
+        this.set_regs(&vcpu_regs)?;
+        this.set_sregs(&vcpu_sregs)?;
 
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            eprintln!("rip after: {:08x}", rip_after);
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
+        Ok(this)
+    }
+
+    /// back `size` bytes of guest-physical address space starting at `gpa` with a new host
+    /// mapping, registered with KVM in its own memory slot.
+    ///
+    /// fails with `VmError::InvalidMapping` if `gpa + size` overflows or the range overlaps an
+    /// existing mapping. page tables are left alone: guest code can only reach the new memory if
+    /// an earlier `configure_identity_paging` call or explicit page table edits already map it.
+    pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> {
+        let region_end = match gpa.0.checked_add(size) {
+            Some(end) => GuestAddress(end),
+            None => return Err(VmError::InvalidMapping { base: gpa, size }),
+        };
+        // the primary mapping is consulted first, then auxiliary mappings in insertion order.
+        let collides = self.memory.overlaps(gpa, region_end)
+            || self.aux_memories.iter().any(|existing| existing.overlaps(gpa, region_end));
+        if collides {
+            return Err(VmError::InvalidMapping { base: gpa, size });
         }
-    };
 
-    let regs_after = vm.get_regs().expect("can get regs");
-    assert_eq!(regs_after.rax, 0x1234);
-    assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 2));
+        let base = u64_to_usize(gpa.0);
+        let len = u64_to_usize(size);
+        let mapping = Mapping::create_shared(
+            base, len, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE
+        )?;
 
-    let mut regs = vm.get_regs().expect("can get regs");
+        // slot 0 is taken by the memory from VM creation; auxiliary mappings use slots 1 and up.
+        let in_use: Option<u32> = self.aux_memories.len().try_into().ok();
+        let slot = in_use.and_then(|count| count.checked_add(1))
+            .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?;
 
-    vm.program(&[0x66, 0x60], &mut regs);
+        let region = kvm_userspace_memory_region {
+            slot,
+            guest_phys_addr: gpa.0,
+            memory_size: mapping.size.get() as u64,
+            userspace_addr: mapping.addr.as_ptr() as u64,
+            flags: 0,
+        };
 
-    regs.rip = 0;
-    regs.rax = 0x1234;
-    regs.rsp = 0x1000 - 0x80;
-    eprintln!("{:?}", regs);
+        // SAFETY: `region` describes `mapping`, which `aux_memories` takes ownership of just below.
+        let registered = unsafe { self.vm.set_user_memory_region(region) };
+        registered.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?;
 
-    vm.set_regs(&regs).expect("can set regs");
+        self.aux_memories.push(mapping);
 
-    vm.set_single_step(true).expect("can set single-step");
-    let expected_rip = vm.code_addr().0 + 2;
-
-    let res = vm.run().expect("can run vm");
-
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            eprintln!("rip after: {:08x}", rip_after);
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
+        if self.mem_ceiling < region_end.0 {
+            self.mem_ceiling = region_end.0;
         }
-    };
-
-    let regs_after = vm.get_regs().expect("can get regs");
-    assert_eq!(regs_after.rax, 0x1234);
-    assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 4));
-}
-
-#[test]
-fn test_syscall() {
-    let mut vm = Vm::create(128 * 1024).expect("can create vm");
-    let mut regs = vm.get_regs().expect("can get regs");
 
-    vm.program(&[0x0f, 0x05], &mut regs);
-    eprintln!("rip before: {:08x}", regs.rip);
-
-    vm.set_regs(&regs).expect("can set regs");
-
-//    vm.set_single_step(true).expect("can set single-step");
+        Ok(())
+    }
 
-    let res = vm.run().expect("can run vm");
-    match res {
-        VcpuExit::Syscall => { /* expected */ }
-        VcpuExit::Debug { pc, .. } => {
-            if pc == vm.syscall_addr().0 {
-                panic!(
-                    "VM exited at syscall target. \
-                     syscall hlt stub not executed. \
-                     is the VM being single-stepped?"
-                );
-            }
-            panic!("unexpected debug exit at rip={:08x}", pc);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
-        }
-    };
+    /// fetch the vCPU's current `kvm_regs` from KVM.
+    pub fn get_regs(&self) -> Result<kvm_regs, VmError> {
+        self.vcpu.get_regs().map_err(|err| VmError::from_kvm("get_regs", err))
+    }
 
-    let regs_after = vm.get_regs().expect("can get regs");
+    /// fetch the vCPU's current `kvm_sregs` from KVM.
+    pub fn get_sregs(&self) -> Result<kvm_sregs, VmError> {
+        self.vcpu.get_sregs().map_err(|err| VmError::from_kvm("get_sregs", err))
+    }
 
-    let expected_rip = vm.syscall_addr().0 + 1;
-    assert_eq!(expected_rip, regs_after.rip);
-}
+    /// fetch the vCPU's current `kvm_xcrs` from KVM.
+    pub fn get_xcrs(&self) -> Result<kvm_xcrs, VmError> {
+        self.vcpu.get_xcrs().map_err(|err| VmError::from_kvm("get_xcrs", err))
+    }
 
-#[test]
-fn test_xorps_runs() {
-    let mut vm = Vm::create(128 * 1024).expect("can create vm");
-    let mut regs = vm.get_regs().expect("can get regs");
+    /// replace the vCPU's `kvm_regs` with `regs`.
+    pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> {
+        self.vcpu.set_regs(regs).map_err(|err| VmError::from_kvm("set_regs", err))
+    }
 
-    vm.program(&[0x0f, 0x57, 0xc0], &mut regs);
+    /// replace the vCPU's `kvm_sregs` with `sregs`.
+    pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> {
+        self.vcpu.set_sregs(sregs).map_err(|err| VmError::from_kvm("set_sregs", err))
+    }
 
-    let rip_before = regs.rip;
+    /// replace the vCPU's `kvm_xcrs` with `xcrs`.
+    pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> {
+        self.vcpu.set_xcrs(xcrs).map_err(|err| VmError::from_kvm("set_xcrs", err))
+    }
 
-    vm.set_regs(&regs).expect("can set regs");
+    /// write `msrs` to the vCPU; panics if KVM reports a different number of entries written.
+    pub fn set_msrs(&self, msrs: &Msrs) -> Result<(), VmError> {
+        let written = self.vcpu.set_msrs(msrs).map_err(|err| VmError::from_kvm("set_msrs", err))?;
+        assert_eq!(msrs.as_slice().len(), written);
+        Ok(())
+    }
 
-    vm.set_single_step(true).expect("can set single-step");
+    pub fn idt_configured(&self) -> bool {
+        self.idt_configured
+    }
 
-    let res = vm.run().expect("can run vm");
+    pub fn syscall_configured(&self) -> bool {
+        self.syscall_configured
+    }
 
-    let expected_rip = rip_before + 3;
-    eprintln!("exit: {:?}", res);
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
-        }
-    };
-}
+    // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the
+    // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step
+    // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell!
+    pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> {
+        let mut guest_debug = kvm_guest_debug::default();
 
-#[test]
-fn test_vex_vandps_runs() {
-    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+        if active {
+            guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP
+        };
 
-    if !vm.cpuid_supports(Feature::StateAVX) {
-        panic!("host CPU does not support AVX");
+        self.vcpu.set_guest_debug(&guest_debug)
+            .map_err(|e| VmError::from_kvm("set_guest_debug", e))
     }
 
-    let mut regs = vm.get_regs().expect("can get regs");
+    pub fn run<'vm>(&'vm mut self) -> Result<VcpuExit<'vm>, VmError> {
+        let exit = self.vcpu.run()
+            .map_err(|e| VmError::from_kvm("vcpu run", e))?;
 
-    vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs);
+        match exit {
+            kvm_ioctls::VcpuExit::MmioRead(addr, buf) => {
+                // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above.
+                // this means it's a shorter lifetime than `'vm`, but since the resulting lifetime
+                // is also `'vm` it *really* has the effect of disallowing any subsequent use of
+                // `self`. these transmutes decouple the lifetime of `exit` from the lifetime of
+                // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can
+                // drop `exit()` and query the vcpu.
+                //
+                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
+                // lifetime to `'vm` for the return type.
+                let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) };
+                return Ok(VcpuExit::MmioRead { buf, addr });
+            }
+            kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => {
+                // see the same transmute in `MmioRead` for why this is load-bearing.
+                //
+                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
+                // lifetime to `'vm` for the return type.
+                let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) };
+                return Ok(VcpuExit::MmioWrite { buf, addr });
+            }
+            kvm_ioctls::VcpuExit::IoIn(port, buf) => {
+                // see the same transmute in `MmioRead` for why this is load-bearing.
+                //
+                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
+                // lifetime to `'vm` for the return type.
+                let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) };
+                return Ok(VcpuExit::IoIn { port, buf });
+            }
+            kvm_ioctls::VcpuExit::IoOut(port, buf) => {
+                // see the same transmute in `MmioRead` for why this is load-bearing.
+                //
+                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
+                // lifetime to `'vm` for the return type.
+                let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) };
+                return Ok(VcpuExit::IoOut { port, buf });
+            }
+            kvm_ioctls::VcpuExit::Debug(info) => {
+                let pc = info.pc;
+                return Ok(VcpuExit::Debug { pc, info });
+            }
+            kvm_ioctls::VcpuExit::Hlt => {
+                let regs = self.get_regs()?;
 
-    regs.rbx = regs.rip;
-    let rip_before = regs.rip;
+                if self.idt_configured {
+                    let intrs_start = self.interrupt_handlers_start().0;
+                    let intrs_end = intrs_start + IDT_ENTRIES as u64;
+                    // by the time we've exited the `hlt` of the interrupt handler has completed,
+                    // so rip is advanced by one. subtract back out to convert to an exception
+                    // vector number.
+                    let intr_addr = regs.rip - 1;
 
-    vm.set_regs(&regs).expect("can set regs");
+                    if intr_addr >= intrs_start && intr_addr < intrs_end {
+                        let nr = intr_addr - intrs_start;
+                        // because IDT_ENTRIES is 256, this should always be true..
+                        assert!(nr < 256);
+                        let nr = nr as u8;
 
-    vm.set_single_step(true).expect("can set single-step");
+                        return Ok(VcpuExit::Exception { nr });
+                    }
+                }
 
-    let res = vm.run().expect("can run vm");
+                if self.syscall_configured {
+                    // the behavior of `syscall`, `hlt`, and `rip` is a little funky. similar to
+                    // interrupt handlers, we typically exit with rip pointed immediately after
+                    // `syscall_addr()` because we would syscall to `hlt`, execute the first `hlt`,
+                    // advance `rip` by one byte, and exit to userland for the HLT.
+                    if regs.rip == self.syscall_addr().0 + 1{
+                        return Ok(VcpuExit::Syscall);
+                    }
+                }
 
-    let expected_rip = rip_before + 4;
-    eprintln!("exit: {:?}", res);
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
+                Ok(VcpuExit::Hlt)
+            }
+            kvm_ioctls::VcpuExit::Shutdown => {
+                return Ok(VcpuExit::Shutdown);
+            }
+            other => {
+                panic!("unhandled VcpuExit kind: {other:?}");
+            }
         }
-    };
-}
+    }
 
-#[test]
-fn test_vex_vandps_runs_32b() {
-    let settings = VmSettings::new(128 * 1024, IsaMode::Protected);
-    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
+    /// get a pointer to host memory mapped to guest address `address`.
+    ///
+    /// panics if `address` is not a guest-physical address backed by host memory.
+    pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 {
+        let mapping = self.map_containing(address, 0)
+            .expect("mapping for address exists");
 
-    if !vm.cpuid_supports(Feature::StateAVX) {
-        panic!("host CPU does not support AVX");
+        unsafe {
+            mapping.host_ptr(address)
+        }
     }
 
-    let mut regs = vm.get_regs().expect("can get regs");
-
-    vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs);
+    pub fn gdt_addr(&self) -> GuestAddress {
+        GuestAddress(0x1000)
+    }
 
-    regs.rbx = regs.rip;
-    let rip_before = regs.rip;
+    pub fn idt_addr(&self) -> GuestAddress {
+        GuestAddress(0x2000)
+    }
 
-    vm.set_regs(&regs).expect("can set regs");
+    pub fn interrupt_handlers_start(&self) -> GuestAddress {
+        GuestAddress(0x3000)
+    }
 
-    vm.set_single_step(true).expect("can set single-step");
+    pub fn syscall_addr(&self) -> GuestAddress {
+        GuestAddress(0x4000)
+    }
 
-    let res = vm.run().expect("can run vm");
-
-    let expected_rip = rip_before + 4;
-    eprintln!("exit: {:?}", res);
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
-        }
-    };
-}
+    pub fn page_table_addr(&self) -> GuestAddress {
+        GuestAddress(0x10000)
+    }
 
-#[test]
-fn test_evex_vandps_runs() {
-    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+    pub fn code_addr(&self) -> GuestAddress {
+        GuestAddress(self.memory.size.get() as u64 - 4096)
+    }
 
-    if !vm.cpuid_supports(Feature::StateAVX512) {
-        panic!("host CPU does not support AVX512");
+    pub fn mem_ceiling(&self) -> u64 {
+        self.mem_ceiling
     }
 
-    let mut regs = vm.get_regs().expect("can get regs");
+    /// configuring the IDT implies the IDT might be used which means we want a stack pointer
+    /// that can have at least 0x18 bytes pushed to it if an interrupt happens.
+    pub fn stack_addr(&self) -> GuestAddress {
+        // it would be nice to point the stack somewhere that we could get MMIO exits and see the
+        // processor push words for the interrupt in real time, but that doesn't ... work.
+        // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of,
+        // thankfully).
+        //
+        // so this picks some guest memory lower down.
 
-    vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs);
+        // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and
+        // clobber the page tables. so leave a bit of space.
+        GuestAddress(0x19800)
+    }
 
-    regs.rbx = regs.rip;
-    let rip_before = regs.rip;
+    /// selector 0x10 is chosen arbitrarily for code.
+    pub fn selector_cs(&self) -> u16 {
+        0x10
+    }
 
-    vm.set_regs(&regs).expect("can set regs");
+    /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc).
+    pub fn selector_ds(&self) -> u16 {
+        0x18
+    }
 
-    vm.set_single_step(true).expect("can set single-step");
+    /// selector 0x20 is chosen arbitrarily for 16-bit interrupts, which are placed well away from
+    /// where selector 0x10 is pointed in real mode.
+    pub fn selector_cs_idt_16b(&self) -> u16 {
+        0x20
+    }
 
-    let res = vm.run().expect("can run vm");
+    fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> {
+        let mapping = if self.memory.contains(base) {
+            &mut self.memory
+        } else {
+            self.aux_memories.iter_mut()
+                .find(|map| map.contains(base))?
+        };
 
-    let expected_rip = rip_before + 6;
-    eprintln!("exit: {:?}", res);
-    match res {
-        VcpuExit::Debug { pc: rip_after, .. } => {
-            assert_eq!(expected_rip, rip_after);
-        }
-        other => {
-            panic!("unexpected exit: {:?}", other);
+        if !mapping.check_range(base, size) {
+            return None;
         }
-    };
-}
-
 
-// this function will sit and loop in the kernel after trying to fulfill the MMIO exit.
-//
-// not great! don't do that! it's responsive to EINTR at least.
-// #[test]
-#[allow(dead_code)]
-fn kvm_hugepage_bug() {
-    let mut vm = Vm::create(1024 * 1024).expect("can create vm");
-    vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region");
-    unsafe {
-        vm.configure_identity_paging(None);
+        Some(mapping)
     }
 
-    // `add [rsp], al; add [rcx], al; pop [rcx]; hlt`
-    // the first instruction runs fine. the second instruction runs fine.
-    // the third instruction gets a page fault at 0xf800? which worked fine for the add.
-    // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with
-    // huge pages.
-    let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4];
-    let mut regs = vm.get_regs().unwrap();
-    regs.rax = 0x00000002_00100000;
-    regs.rcx = 0x00000002_00100000;
-    vm.program(inst, &mut regs);
-    vm.set_regs(&regs).unwrap();
-    vm.set_single_step(true).expect("can enable single-step");
-    vm.run().expect("can run vm");
+    fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> {
+        let mapping = if self.memory.contains(base) {
+            &self.memory
+        } else {
+            self.aux_memories.iter()
+                .find(|map| map.contains(base))?
+        };
 
-    let vm_regs = vm.get_regs().unwrap();
-    let vm_sregs = vm.get_sregs().unwrap();
-    let mut prev_rip = [0u8; 8];
-    vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]);
-    let mut buf = [0u8; 8];
-    vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]);
-    eprintln!(
-        "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})",
-        u64::from_le_bytes(buf), vm_sregs.cr2,
-        u64::from_le_bytes(prev_rip), vm_sregs.cr3
-    );
-    if vm_regs.rip == 0x300f {
-        let mut pdpt = [0u8; 4096];
-        vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]);
-        eprintln!("pdpt: {:x?}", &pdpt[..8]);
+        if !mapping.check_range(base, size) {
+            return None;
+        }
+
+        Some(mapping)
     }
-    panic!("no");
-}
 
-/// a selector for the execution mode the VM should be initialized to.
-///
-/// different `IsaMode` will configure the VM wildly differently; generally any VM/vCPU state not
-/// directly required for the requested mode will be left untouched.
-///
-/// in all modes, CPUID leaves and xcr0 are set up to support any ISA extensions supported by the
-/// host CPU.
-///
-/// in all modes, an IDT is installed with interrupt handlers pointed to the 256 bytes from
-/// `interrupt_handlers_start()`.
-#[derive(Copy, Clone, Debug, PartialEq)]
-pub enum IsaMode {
-    /// request that the VM be configured to run x86-64 instructions, aka "AMD64", or "IA-32e" (and
-    /// specifically "IA-32e 64-bit mode") in some Intel nomenclature.
-    ///
-    /// this configures identity paging, selectors sufficient for long mode (with all vCPU
-    /// execution at CPL=0), prepares some MSRs for syscalls, and of course configures `cr0`
-    /// for long mode.
-    Long,
-    /// request that the VM be configured to run 32-bit instructions, with long mode neither
-    /// enabled nor active.
-    ///
-    /// this configures identity paging and selectors covering all 32-bit address space and with
-    /// CPL=0.
-    Protected,
-    /// request that the VM be configured to run 16-bit instructions.
+    /// write all of `data` into guest memory at guest-physical address `addr`.
     ///
-    /// this configures code/data selectors covering all 24 bits of address space and an interrupt
-    /// descriptor table, and CPUID for any host-supported ISA extensions, but that's about it.
-    Real,
-}
+    /// panics if `data` extends beyond the end of guest memory.
+    pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) {
+        let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid");
 
-/// the settings to configure a [`Vm::create_by_settings`]. see `VmSettings::new` for top-level
-/// configuration.
-pub struct VmSettings {
-    mem_size: usize,
-    isa_mode: IsaMode,
-}
+        // SAFETY: `check_range` above validates the range to copy, and... please do not
+        // provide a slice of guest memory as what the guest should be programmed for...
+        unsafe {
+            std::ptr::copy_nonoverlapping(
+                data.as_ptr(),
+                mapping.host_ptr(addr),
+                data.len()
+            );
+        }
+    }
 
-impl VmSettings {
-    /// provide the bare-minimum configuration for a VM: the size of its memory and what execution
-    /// mode the resulting VM should be set for.
+    /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`.
     ///
-    /// VM control settings (IDT, `cs`, `ds`, other selectors, syscalls, page tables, etc) vary
-    /// substantially across different `IsaMode`. in all cases code can be written into the VM with
-    /// [`Vm::program()`], then run with [`Vm::run()`].
-    pub fn new(mem_size: usize, isa_mode: IsaMode) -> Self {
-        Self { mem_size, isa_mode }
-    }
-}
+    /// panics if `addr + buf.len()` extends beyond the end of guest memory.
+    pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) {
+        let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid");
 
-impl Vm {
-    pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> {
-        Self::create_by_settings(VmSettings::new(mem_size, IsaMode::Long))
+        // SAFETY: `map_containing` above validates the range to copy, and... please do not
+        // provide a slice of guest memory as what should be read into...
+        unsafe {
+            std::ptr::copy_nonoverlapping(
+                mapping.host_ptr(addr) as *const _,
+                buf.as_mut_ptr(),
+                buf.len()
+            );
+        }
     }
 
-    pub fn create_by_settings(settings: VmSettings) -> Result<Vm, VmCreateError> {
-        let kvm = Kvm::new()
-            .map_err(|e| VmError::from_kvm("Kvm::new()", e))?;
-
-        let vm = kvm.create_vm()
-            .map_err(|e| VmError::from_kvm("craete_vm", e))?;
-
-        let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap();
+    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
+    /// `size`.
+    ///
+    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
+    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
+    pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] {
+        let mapping = self.map_containing_mut(addr, size).expect("mapping is valid");
 
-        // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do
-        if settings.mem_size < 128 * 1024 {
-            return Err(VmCreateError::TooSmall {
-                requested: settings.mem_size,
-                required: 128 * 1024,
-            });
+        // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there
+        // is no other outstanding slice of guest memory. `map_containing` has already ensured that
+        // this mapping contains the whole range `[addr, addr + size)`.
+        unsafe {
+            mapping.slice_mut(addr, size)
         }
+    }
 
-        let mapping = Mapping::create_shared(0, settings.mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?;
+    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
+    /// `size`.
+    ///
+    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
+    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
+    pub fn mem_slice<'vm>(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] {
+        let mapping = self.map_containing(addr, size).expect("mapping is valid");
 
-        let region = kvm_userspace_memory_region {
-            slot: 0,
-            guest_phys_addr: 0x0000,
-            memory_size: mapping.size.get() as u64,
-            userspace_addr: mapping.addr.as_ptr() as u64,
-            flags: 0,
-        };
+        // SAFETY: we have a shared borrow of the VM, so it is not currently running and there is
+        // no outstanding mutable slice of guest memory. `map_containing` has already ensured that
+        // this mapping contains the whole range `[addr, addr + size)`.
+        unsafe {
+            mapping.slice(addr, size)
+        }
+    }
 
-        let set_res = unsafe { vm.set_user_memory_region(region) };
-        set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?;
+    /// write `code` into guest memory and set `regs.rip` to the address of that code.
+    ///
+    /// the chosen code address is [`Self::code_addr`]; this is the guest linear address the
+    /// provided code buffer is written to.
+    ///
+    /// if the VM is configured for `IsaMode::Long` or `IsaMode::Protected`, `rip` or `eip` is set
+    /// to this address as well. otherwise, the VM is configured for `IsaMode::Real` and `ip` is
+    /// set to `code_addr() & 0x0f` - in typical cases `ip` will be 0.
+    ///
+    pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) {
+        let addr = self.code_addr();
+        self.write_mem(addr, code);
 
-        let vcpu_res = vm.create_vcpu(0);
-        let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?;
+        if self.settings.isa_mode != IsaMode::Real {
+            regs.rip = addr.0;
+        } else {
+            regs.rip = addr.0 & 0x000f;
+        }
+    }
 
-        let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap();
+    fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 {
+        // the GDT is set up at addresses 0..64k:
+        //
+        // > 3.5.1 Segment Descriptor Tables
+        // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A
+        // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte
+        // > descriptors.
 
-        let mem_ceiling = mapping.size.get().try_into().unwrap();
+        assert!(idx < 4096 / 8);
+        let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8));
+        let mapping = self.map_containing(addr, std::mem::size_of::<u64>() as u64).unwrap();
 
-        let mut this = Vm {
-            settings,
-            vm,
-            vcpu,
-            supported_cpuid,
-            current_cpuid,
-            idt_configured: false,
-            syscall_configured: false,
-            memory: mapping,
-            aux_memories: Vec::new(),
-            mem_ceiling,
-        };
+        // SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is
+        // still inside the allocation (`self.memory`).
+        unsafe {
+            mapping.host_ptr(addr) as *mut u64
+        }
+    }
 
-        let mut vcpu_regs = this.get_regs()?;
-        let mut vcpu_sregs = this.get_sregs()?;
+    // note this returns a u32, but a long-mode IDT entry is four u32. the u32 this points at is
+    // the first of the four for the entry.
+    fn idt_entry_mut(&mut self, idx: u8) -> *mut u32 {
+        let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16));
+        let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap();
 
-        assert!(this.cpuid_supports(Feature::Base));
-        this.cpuid_set(Feature::Base, true);
+        unsafe {
+            mapping.host_ptr(addr) as *mut u32
+        }
+    }
 
-        match this.settings.isa_mode {
-            IsaMode::Long => {
-                unsafe {
-                    this.configure_identity_paging(Some(&mut vcpu_sregs));
-                    this.configure_selectors(&mut vcpu_sregs);
-                    this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs);
-                    let mut xcrs = this.get_xcrs()?;
-                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
-                    this.set_xcrs(&xcrs)?;
-                    this.configure_syscalls(&mut vcpu_sregs);
-                }
+    // note this returns a u32, but a legacy IDT entry is two u32. the u32 this points at is the
+    // first of the two for the entry.
+    fn idt_entry_legacy_mut(&mut self, idx: u8) -> *mut u32 {
+        let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 8));
+        let mapping = self.map_containing(addr, std::mem::size_of::<[u32; 2]>() as u64).unwrap();
 
-                vcpu_sregs.efer |= 0x0000_0500; // LME | LMA
-            }
-            IsaMode::Protected => {
-                unsafe {
-                    this.configure_identity_paging_32b(Some(&mut vcpu_sregs));
-                    this.configure_selectors_32b(&mut vcpu_sregs);
-                    this.configure_idt_32b(&mut vcpu_regs, &mut vcpu_sregs);
-                    let mut xcrs = this.get_xcrs()?;
-                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
-                    this.set_xcrs(&xcrs)?;
+        unsafe {
+            mapping.host_ptr(addr) as *mut u32
+        }
+    }
 
-                }
-            }
-            IsaMode::Real => {
-                unsafe {
-                    this.configure_selectors_16b(&mut vcpu_sregs);
-                    this.configure_idt_16b(&mut vcpu_regs, &mut vcpu_sregs);
-                    let mut xcrs = this.get_xcrs()?;
-                    this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
-                    this.set_xcrs(&xcrs)?;
+    pub fn page_tables(&self) -> VmPageTables<'_> {
+        let base = self.page_table_addr();
 
-                    // in 16-bit mode we've set cs and ds to cover the last 4kb of memory, starting
-                    // at the same place we've written code to execute. there's not much memory to
-                    // go around, and not a ton of flexibility in the asmlinator API, so uh ... the
-                    // least annoying thing to do might be to just put the stack 0x80 bytes from
-                    // the end?
-                    vcpu_regs.rsp = 0x1000 - 0x80;
+        // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of
+        // address space.
+        assert!(self.map_containing(base, 0x2000).is_some());
+
+        VmPageTables {
+            vm: self,
+            base,
+        }
+    }
+
+    // TODO: there should be a version of this that can be used to query "does this VM support
+    // these extensions" probably, and that should take a subset of `Feature` for the ones that are
+    // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable
+    // feature..)
+    fn cpuid_supports(&self, feature: Feature) -> bool {
+        fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool {
+            for entry in cpuid.as_slice() {
+                if entry.function == leaf && entry.index == index {
+                    return f(entry);
                 }
             }
-        }
 
-        this.set_regs(&vcpu_regs)?;
-        this.set_sregs(&vcpu_sregs)?;
+            false
+        }
 
-        Ok(this)
+        match feature {
+            Feature::Base => {
+                let lm = find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
+                    leaf.edx & CPUID_80000001_EDX_LM != 0
+                });
+                let msr = find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
+                    leaf.edx & CPUID_00000001_EDX_MSR != 0
+                });
+                let clstac = find_leaf(&self.supported_cpuid, 0x0000_0007, 0, |leaf| {
+                    leaf.ebx & CPUID_00000007_EBX_CLSTAC != 0
+                });
+                lm && msr && clstac
+            }
+            Feature::Syscall => {
+                find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
+                    leaf.edx & CPUID_80000001_EDX_SYSCALL != 0
+                })
+            }
+            Feature::XSave => {
+                find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
+                    leaf.ecx & CPUID_00000001_ECX_XSAVE != 0
+                })
+            }
+            Feature::Pdpe1Gb => {
+                find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
+                    leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0
+                })
+            }
+            Feature::StateSSE => {
+                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+                    leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE
+                })
+            }
+            Feature::StateAVX => {
+                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+                    leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX
+                })
+            }
+            Feature::StateAVX512 => {
+                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+                    leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512
+                })
+            }
+            Feature::Pse => {
+                find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
+                    leaf.edx & CPUID_00000001_EDX_PSE == CPUID_00000001_EDX_PSE
+                })
+            }
+        }
     }
 
-    /// map and add a region of size `size` at guest-physical address `gpa`.
+    /// set `feature` to `wanted` in the VM's CPUID configuration.
     ///
-    /// this will not update page tables, so if the newly-added memory is not already mapped due to
-    /// a previous `configure_identity_paging` call and it is not mapped due to explicit page table
-    /// management, it will not yet be accessible by guest code.
-    pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> {
-        let new_mapping_end = gpa.0.checked_add(size)
-            .map(|addr| GuestAddress(addr))
-            .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?;
-        if self.memory.overlaps(gpa, new_mapping_end) {
-            return Err(VmError::InvalidMapping { base: gpa, size });
-        } else {
-            for mapping in self.aux_memories.iter() {
-                if mapping.overlaps(gpa, new_mapping_end) {
-                    return Err(VmError::InvalidMapping { base: gpa, size });
+    /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not
+    /// available at all). use [`cpuid_supports`] to test if the feature can be configured.
+    fn cpuid_set(&mut self, feature: Feature, wanted: bool) {
+        fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) {
+            for mut entry in cpuid.as_mut_slice() {
+                if entry.function == leaf && entry.index == index {
+                    f(&mut entry);
+                    return;
                 }
             }
-        }
-
-        let mapping = Mapping::create_shared(
-            u64_to_usize(gpa.0),
-            u64_to_usize(size),
-            ProtFlags::PROT_READ | ProtFlags::PROT_WRITE
-        )?;
-
-        let used_slots: u32 = self.aux_memories.len().try_into()
-            .map_err(|_| VmError::InvalidMapping { base: gpa, size })?;
-        let next_slot = used_slots.checked_add(1)
-            .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?;
-
-        let region = kvm_userspace_memory_region {
-            slot: next_slot,
-            guest_phys_addr: gpa.0,
-            memory_size: mapping.size.get() as u64,
-            userspace_addr: mapping.addr.as_ptr() as u64,
-            flags: 0,
-        };
-
-        let set_res = unsafe { self.vm.set_user_memory_region(region) };
-        set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?;
 
-        self.aux_memories.push(mapping);
+            // if we're here, the entry simply is not present (yet..?)
+            //
+            // so, create it.
+            let mut entry = kvm_cpuid_entry2 {
+                function: leaf,
+                index: index,
+                eax: 0,
+                ecx: 0,
+                edx: 0,
+                ebx: 0,
+                flags: 0,
+                padding: [0; 3],
+            };
+            f(&mut entry);
+            cpuid.push(entry).expect("can push");
+        }
 
-        if new_mapping_end.0 > self.mem_ceiling {
-            self.mem_ceiling = new_mapping_end.0;
+        fn bit_set(word: &mut u32, bit: u32, wanted: bool) {
+            *word &= !bit;
+            if wanted {
+                *word |= bit;
+            }
         }
 
-        Ok(())
-    }
+        let mut edited = false;
 
-    pub fn get_regs(&self) -> Result<kvm_regs, VmError> {
-        self.vcpu.get_regs()
-            .map_err(|e| VmError::from_kvm("get_regs", e))
-    }
+        match feature {
+            Feature::Base => {
+                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_LM, wanted);
+                    edited = true;
+                });
+                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.edx, CPUID_00000001_EDX_MSR, wanted);
+                    edited = true;
+                });
+                edit_leaf(&mut self.current_cpuid, 0x0000_0007, 0, |leaf| {
+                    bit_set(&mut leaf.ebx, CPUID_00000007_EBX_CLSTAC, wanted);
+                    edited = true;
+                });
+            }
+            Feature::Syscall => {
+                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_SYSCALL, wanted);
+                    edited = true;
+                });
+            }
+            Feature::XSave => {
+                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted);
+                    edited = true;
+                });
+            },
+            Feature::Pdpe1Gb => {
+                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted);
+                    edited = true;
+                });
+            },
+            Feature::StateSSE => {
+                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+                    bit_set(&mut leaf.eax, 1, wanted);
+                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted);
+                    edited = true;
+                });
+            }
+            Feature::StateAVX => {
+                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted);
+                    edited = true;
+                });
+            }
+            Feature::StateAVX512 => {
+                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted);
+                    edited = true;
+                });
+            }
+            Feature::Pse => {
+                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
+                    bit_set(&mut leaf.edx, CPUID_00000001_EDX_PSE, wanted);
+                    edited = true;
+                });
+            }
+        }
 
-    pub fn get_sregs(&self) -> Result<kvm_sregs, VmError> {
-        self.vcpu.get_sregs()
-            .map_err(|e| VmError::from_kvm("get_sregs", e))
-    }
+        assert!(edited);
 
-    pub fn get_xcrs(&self) -> Result<kvm_xcrs, VmError> {
-        self.vcpu.get_xcrs()
-            .map_err(|e| VmError::from_kvm("get_xcrs", e))
+        self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid");
     }
 
-    pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> {
-        self.vcpu.set_regs(regs)
-            .map_err(|e| VmError::from_kvm("set_regs", e))
-    }
+    /// configure page tables for identity mapping of all memory from guest address zero up to the
+    /// end of added memory regions, rounded up to the next GiB.
+    ///
+    /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or
+    /// long-mode paging. this is a fixed pattern: if control registers have not been changed since
+    /// `Vm::create` then there will be no change to these control registers and `sregs` can be
+    /// omitted.
+    ///
+    /// panics if the end of added memory regions is above 512 GiB.
+    pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) {
+        // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each.
+        assert!(self.mem_ceiling() <= 512 * GB);
 
-    pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> {
-        self.vcpu.set_sregs(sregs)
-            .map_err(|e| VmError::from_kvm("set_sregs", e))
-    }
+        assert!(self.cpuid_supports(Feature::Pdpe1Gb));
+        self.cpuid_set(Feature::Pdpe1Gb, true);
 
-    pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> {
-        self.vcpu.set_xcrs(xcrs)
-            .map_err(|e| VmError::from_kvm("set_xcrs", e))
-    }
+        let pt = self.page_tables();
 
-    pub fn set_msrs(&self, msrs: &Msrs) -> Result<(), VmError> {
-        let n_set = self.vcpu.set_msrs(msrs)
-            .map_err(|e| VmError::from_kvm("set_msrs", e))?;
-        assert_eq!(msrs.as_slice().len(), n_set);
-        Ok(())
-    }
+        let pml4_ent =
+            1 << 0 | // P
+            1 << 1 | // RW
+            1 << 2 | // user access allowed. but no user code will run so not strictly needed.
+            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 5 | // A
+            0 << 6 | // ignored
+            0 << 7 | // PS (reserved must-be-0)
+            0 << 11 | // R (for ordinary paging, ignored; for HLAT ...)
+            pt.pdpt_addr().0;
+        unsafe {
+            pt.pml4_mut().write(pml4_ent);
+        }
 
-    pub fn idt_configured(&self) -> bool {
-        self.idt_configured
-    }
+        let mut mapped: u64 = 0;
+        // we've set up the first PML4 to point to a PDPT, so we should actually set it up!
+        let pdpt = pt.pdpt_mut();
+        // PDPTEs start at the start of PDPT..
+        let mut pdpte = pdpt;
+        let entry_bits: u64 =
+            1 << 0 | // P
+            1 << 1 | // RW
+            1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
+            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 5 | // Accessed
+            0 << 6 | // Dirty
+            1 << 7 | // Page size (1 implies 1G page)
+            1 << 8 | // Global (if cr4.pge)
+            0 << 9 |
+            0 << 10 |
+            0 << 11 | // for ordinary paging, ignored. for HLAT, ...
+            0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)
 
-    pub fn syscall_configured(&self) -> bool {
-        self.syscall_configured
+        while mapped < self.mem_ceiling() {
+            let phys_num = mapped >> 30;
+            let entry = entry_bits | (phys_num << 30);
+            unsafe {
+                pdpte.write(entry);
+                pdpte = pdpte.offset(1);
+            }
+            // eprintln!("mapped 1g at {:08x}", mapped);
+            mapped += 1 << 30;
+        }
+
+        if let Some(sregs) = sregs {
+            sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
+            sregs.cr3 = pt.pml4_addr().0 as u64;
+            sregs.cr4 |= 1 << 5; // enable PAE
+        }
     }
 
-    // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the
-    // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step
-    // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell!
-    pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> {
-        let mut guest_debug = kvm_guest_debug::default();
+    /// configure page tables for identity mapping of all memory from guest address zero up to the
+    /// end of added memory regions, rounded up to the next 4MiB.
+    ///
+    /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode paging.
+    /// this is a fixed pattern: if control registers have not been changed since `Vm::create` then
+    /// there will be no change to these control registers and `sregs` can be omitted.
+    pub unsafe fn configure_identity_paging_32b(&mut self, sregs: Option<&mut kvm_sregs>) {
+        // because we'll set PDEs to map 4M pages and cr3 points at a page-aligned block of 1024
+        // 4-byte PDEs, that gives us 4KiB of memory used to map 4GiB of address space. that's all
+        // of 32-bit, so we don't need to check an upper bound.
 
-        if active {
-            guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP
-        };
+        assert!(self.cpuid_supports(Feature::Pse));
+        self.cpuid_set(Feature::Pse, true);
 
-        self.vcpu.set_guest_debug(&guest_debug)
-            .map_err(|e| VmError::from_kvm("set_guest_debug", e))
-    }
+        let pt = self.page_tables();
 
-    pub fn run<'vm>(&'vm mut self) -> Result<VcpuExit<'vm>, VmError> {
-        let exit = self.vcpu.run()
-            .map_err(|e| VmError::from_kvm("vcpu run", e))?;
+        let mut mapped: u64 = 0;
+        // "pml4_mut" is really just the start of page table memory. we'll pun this in 32-bit with
+        // the knowledge it's really a block of PDEs.
+        let pd = pt.pml4_mut() as *mut u32;
+        let mut pde = pd;
+        let entry_bits: u32 =
+            1 << 0 | // P
+            1 << 1 | // RW
+            1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
+            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
+            0 << 5 | // Accessed
+            0 << 6 | // Dirty
+            1 << 7 | // Page size (1 implies 4M page)
+            1 << 8 | // Global (if cr4.pge)
+            0 << 9 |
+            0 << 10 |
+            0 << 11 | // for ordinary paging, ignored. for HLAT, ...
+            0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)
 
-        match exit {
-            kvm_ioctls::VcpuExit::MmioRead(addr, buf) => {
-                // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above.
-                // this means it's a shorter lifetime than `'vm`, but since the resulting lifetime
-                // is also `'vm` it *really* has the effect of disallowing any subsequent use of
-                // `self`. these transmutes decouple the lifetime of `exit` from the lifetime of
-                // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can
-                // drop `exit()` and query the vcpu.
-                //
-                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
-                // lifetime to `'vm` for the return type.
-                let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) };
-                return Ok(VcpuExit::MmioRead { buf, addr });
-            }
-            kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => {
-                // see the same transmute in `MmioRead` for why this is load-bearing.
-                //
-                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
-                // lifetime to `'vm` for the return type.
-                let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) };
-                return Ok(VcpuExit::MmioWrite { buf, addr });
-            }
-            kvm_ioctls::VcpuExit::IoIn(port, buf) => {
-                // see the same transmute in `MmioRead` for why this is load-bearing.
-                //
-                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
-                // lifetime to `'vm` for the return type.
-                let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) };
-                return Ok(VcpuExit::IoIn { port, buf });
-            }
-            kvm_ioctls::VcpuExit::IoOut(port, buf) => {
-                // see the same transmute in `MmioRead` for why this is load-bearing.
-                //
-                // SAFETY: this actually extends the lifetime of `buf` from the shorter transient
-                // lifetime to `'vm` for the return type.
-                let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) };
-                return Ok(VcpuExit::IoOut { port, buf });
-            }
-            kvm_ioctls::VcpuExit::Debug(info) => {
-                let pc = info.pc;
-                return Ok(VcpuExit::Debug { pc, info });
+        while mapped < self.mem_ceiling() {
+            let phys_num = (mapped as u32) >> 22;
+            let entry = entry_bits | (phys_num << 22);
+            unsafe {
+                pde.write(entry);
+                pde = pde.offset(1);
             }
-            kvm_ioctls::VcpuExit::Hlt => {
-                let regs = self.get_regs()?;
+            mapped += 1 << 22;
+        }
 
-                if self.idt_configured {
-                    let intrs_start = self.interrupt_handlers_start().0;
-                    let intrs_end = intrs_start + IDT_ENTRIES as u64;
-                    // by the time we've exited the `hlt` of the interrupt handler has completed,
-                    // so rip is advanced by one. subtract back out to convert to an exception
-                    // vector number.
-                    let intr_addr = regs.rip - 1;
+        // page size extensions; collaborates with page tables' PS bit to make 4MiB pages in 32-bit
+        // mode. see SDM section 2.5 "CONTROL REGISTERS".
+        const PSE: u64 = 1 << 4;
 
-                    if intr_addr >= intrs_start && intr_addr < intrs_end {
-                        let nr = intr_addr - intrs_start;
-                        // because IDT_ENTRIES is 256, this should always be true..
-                        assert!(nr < 256);
-                        let nr = nr as u8;
+        if let Some(sregs) = sregs {
+            sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
+            sregs.cr3 = pt.pml4_addr().0 as u64;
+            sregs.cr4 |= PSE;
+        }
+    }
 
-                        return Ok(VcpuExit::Exception { nr });
-                    }
-                }
+    unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) {
+        // we have to set descriptor information directly. this avoids having to load selectors
+        // as the first instructions on the vCPU, which is simplifying. but if we want the
+        // information in these selectors to match with anything in a GDT (i do!) we'll have to
+        // keep this initial state lined up with GDT entries ourselves.
+        //
+        // we could avoid setting up the GDT for the most part, but anything that might
+        // legitimately load the "valid" current segment selector would instead clobber the
+        // selector with zeroes.
 
-                if self.syscall_configured {
-                    // the behavior of `syscall`, `hlt`, and `rip` is a little funky. similar to
-                    // interrupt handlers, we typically exit with rip pointed immediately after
-                    // `syscall_addr()` because we would syscall to `hlt`, execute the first `hlt`,
-                    // advance `rip` by one byte, and exit to userland for the HLT.
-                    if regs.rip == self.syscall_addr().0 + 1{
-                        return Ok(VcpuExit::Syscall);
-                    }
-                }
+        sregs.cs.base = 0;
+        sregs.cs.limit = 0;
+        sregs.cs.selector = self.selector_cs();
+        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.cs.present = 1;
+        sregs.cs.dpl = 0;
+        sregs.cs.db = 0;
+        sregs.cs.s = 1;
+        sregs.cs.l = 1;
+        sregs.cs.g = 0;
+        sregs.cs.avl = 0;
 
-                Ok(VcpuExit::Hlt)
-            }
-            kvm_ioctls::VcpuExit::Shutdown => {
-                return Ok(VcpuExit::Shutdown);
-            }
-            other => {
-                panic!("unhandled VcpuExit kind: {other:?}");
-            }
-        }
-    }
+        sregs.ds.base = 0;
+        sregs.ds.limit = 0xffffffff;
+        sregs.ds.selector = self.selector_ds();
+        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.ds.present = 1;
+        sregs.ds.dpl = 0;
+        sregs.ds.db = 0;
+        sregs.ds.s = 1;
+        sregs.ds.l = 0;
+        sregs.ds.g = 0;
+        sregs.ds.avl = 0;
 
-    /// get a pointer to host memory mapped to guest address `address`.
-    ///
-    /// panics if `address` is not a guest-physical address backed by host memory.
-    pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 {
-        let mapping = self.map_containing(address, 0)
-            .expect("mapping for address exists");
+        sregs.es = sregs.ds;
+        sregs.fs = sregs.ds;
+        sregs.gs = sregs.ds;
+        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
+        sregs.ss = sregs.ds;
+
+        sregs.gdt.base = self.gdt_addr().0;
+        sregs.gdt.limit = 256 * 8 - 1;
 
         unsafe {
-            mapping.host_ptr(address)
+            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
+            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
         }
     }
 
-    pub fn gdt_addr(&self) -> GuestAddress {
-        GuestAddress(0x1000)
-    }
-
-    pub fn idt_addr(&self) -> GuestAddress {
-        GuestAddress(0x2000)
-    }
+    /// configure selectors for 32-bit code execution. this is basically the same as 64-bit, but we
+    /// set a limit and set `cs.db` so that the default operand size is a normal 32-bit.
+    unsafe fn configure_selectors_32b(&mut self, sregs: &mut kvm_sregs) {
+        // we have to set descriptor information directly. this avoids having to load selectors
+        // as the first instructions on the vCPU, which is simplifying. but if we want the
+        // information in these selectors to match with anything in a GDT (i do!) we'll have to
+        // keep this initial state lined up with GDT entries ourselves.
+        //
+        // we could avoid setting up the GDT for the most part, but anything that might
+        // legitimately load the "valid" current segment selector would instead clobber the
+        // selector with zeroes.
 
-    pub fn interrupt_handlers_start(&self) -> GuestAddress {
-        GuestAddress(0x3000)
-    }
+        sregs.cs.base = 0;
+        sregs.cs.limit = 0xffffffff;
+        sregs.cs.selector = self.selector_cs();
+        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.cs.present = 1;
+        sregs.cs.dpl = 0;
+        sregs.cs.db = 1;
+        sregs.cs.s = 1;
+        sregs.cs.l = 0;
+        sregs.cs.g = 1;
+        sregs.cs.avl = 0;
 
-    pub fn syscall_addr(&self) -> GuestAddress {
-        GuestAddress(0x4000)
-    }
+        sregs.ds.base = 0;
+        sregs.ds.limit = 0xffffffff;
+        sregs.ds.selector = self.selector_ds();
+        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.ds.present = 1;
+        sregs.ds.dpl = 0;
+        sregs.ds.db = 1;
+        sregs.ds.s = 1;
+        sregs.ds.l = 0;
+        sregs.ds.g = 1;
+        sregs.ds.avl = 0;
 
-    pub fn page_table_addr(&self) -> GuestAddress {
-        GuestAddress(0x10000)
-    }
+        sregs.es = sregs.ds;
+        sregs.fs = sregs.ds;
+        sregs.gs = sregs.ds;
+        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
+        sregs.ss = sregs.ds;
 
-    pub fn code_addr(&self) -> GuestAddress {
-        GuestAddress(self.memory.size.get() as u64 - 4096)
-    }
+        sregs.gdt.base = self.gdt_addr().0;
+        sregs.gdt.limit = 256 * 8 - 1;
 
-    pub fn mem_ceiling(&self) -> u64 {
-        self.mem_ceiling
+        unsafe {
+            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
+            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
+        }
     }
 
-    /// configuring the IDT implies the IDT might be used which means we want a stack pointer
-    /// that can have at least 0x18 bytes pushed to it if an interrupt happens.
-    pub fn stack_addr(&self) -> GuestAddress {
-        // it would be nice to point the stack somewhere that we could get MMIO exits and see the
-        // processor push words for the interrupt in real time, but that doesn't ... work.
-        // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of,
-        // thankfully).
+    /// configure selectors for 16-bit code execution.
+    ///
+    /// unlike other modes, this sets `cs` to execute code at the linear address given by
+    /// [`Self::code_addr`]. `ds` is configured to overlap with `cs`. this way, when executing
+    /// 16-bit code the VM can simply be configured to `ip = 0`, and code addresses match data
+    /// addresses. additionally, clear `cs.db` so that the default operand size is 16-bit.
+    unsafe fn configure_selectors_16b(&mut self, sregs: &mut kvm_sregs) {
+        // we have to set descriptor information directly. this avoids having to load selectors
+        // as the first instructions on the vCPU, which is simplifying. but if we want the
+        // information in these selectors to match with anything in a GDT (i do!) we'll have to
+        // keep this initial state lined up with GDT entries ourselves.
         //
-        // so this picks some guest memory lower down.
+        // we could avoid setting up the GDT for the most part, but anything that might
+        // legitimately load the "valid" current segment selector would instead clobber the
+        // selector with zeroes.
 
-        // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and
-        // clobber the page tables. so leave a bit of space.
-        GuestAddress(0x19800)
-    }
+        sregs.cs.base = 0;
+        sregs.cs.limit = 0xfffff;
+        sregs.cs.selector = self.selector_cs();
+        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.cs.present = 1;
+        sregs.cs.dpl = 0;
+        sregs.cs.db = 0;
+        sregs.cs.s = 1;
+        sregs.cs.l = 0;
+        sregs.cs.g = 1;
+        sregs.cs.avl = 0;
 
-    /// selector 0x10 is chosen arbitrarily for code.
-    pub fn selector_cs(&self) -> u16 {
-        0x10
-    }
+        unsafe {
+            self.gdt_entry_mut(self.selector_cs_idt_16b() >> 3).write(encode_segment(&sregs.cs));
+        }
 
-    /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc).
-    pub fn selector_ds(&self) -> u16 {
-        0x18
-    }
+        // and now adjust for the real cs for code execution to happen in..
+        sregs.cs.base = self.code_addr().0;
 
-    /// selector 0x20 is chosen arbitrarily for 16-bit interrupts, which are placed well away from
-    /// where selector 0x10 is pointed in real mode.
-    pub fn selector_cs_idt_16b(&self) -> u16 {
-        0x20
-    }
+        sregs.ds.base = self.code_addr().0;
+        sregs.ds.limit = 0xfffff;
+        sregs.ds.selector = self.selector_ds();
+        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
+        sregs.ds.present = 1;
+        sregs.ds.dpl = 0;
+        sregs.ds.db = 0;
+        sregs.ds.s = 1;
+        sregs.ds.l = 0;
+        sregs.ds.g = 1;
+        sregs.ds.avl = 0;
 
-    fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> {
-        let mapping = if self.memory.contains(base) {
-            &mut self.memory
-        } else {
-            self.aux_memories.iter_mut()
-                .find(|map| map.contains(base))?
-        };
+        sregs.es = sregs.ds;
+        sregs.fs = sregs.ds;
+        sregs.gs = sregs.ds;
+        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
+        sregs.ss = sregs.ds;
 
-        if !mapping.check_range(base, size) {
-            return None;
-        }
+        sregs.gdt.base = self.gdt_addr().0;
+        sregs.gdt.limit = 256 * 8 - 1;
 
-        Some(mapping)
+        unsafe {
+            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
+            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
+        }
     }
 
-    fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> {
-        let mapping = if self.memory.contains(base) {
-            &self.memory
-        } else {
-            self.aux_memories.iter()
-                .find(|map| map.contains(base))?
-        };
-
-        if !mapping.check_range(base, size) {
-            return None;
-        }
-
-        Some(mapping)
-    }
+    fn write_idt_entry(
+        &mut self,
+        intr_nr: u8,
+        interrupt_handler_cs: u16,
+        interrupt_handler_addr: GuestAddress
+    ) {
+        let idt_ptr = self.idt_entry_mut(intr_nr);
 
-    /// write all of `data` into guest memory at guest-physical address `addr`.
-    ///
-    /// panics if `data` extends beyond the end of guest memory.
-    pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) {
-        let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid");
+        // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
+        // and "trap-gate" descriptors), are described (in the AMD APM) by
+        // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here:
+        //
+        //  3   2                 1        |          1                   0
+        //  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+        // |---------------------------------------------------------------|
+        // |                            res,ign                            | +12
+        // |                      target offset[63:32]                     | +8
+        // |     target offset[31:16]      |P|DPL|0| type  | res,ign | IST | +4
+        // |     target selector           |    target offset[15:0]        | +0
+        // |---------------------------------------------------------------|
+        //
+        // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
+        // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
+        // works for now.
+        let idt_attr_bits = 0b1_00_0_1110_00000_000;
+        let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
+        let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff);
 
-        // SAFETY: `check_range` above validates the range to copy, and... please do not
-        // provide a slice of guest memory as what the guest should be programmed for...
         unsafe {
-            std::ptr::copy_nonoverlapping(
-                data.as_ptr(),
-                mapping.host_ptr(addr),
-                data.len()
-            );
+            idt_ptr.offset(0).write(low_lo);
+            idt_ptr.offset(1).write(low_hi);
+            idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32);
+            idt_ptr.offset(3).write(0); // reserved
         }
     }
 
-    /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`.
+    /// 16-bit/32-bit IDT entries, described in the APM as
     ///
-    /// panics if `addr + buf.len()` extends beyond the end of guest memory.
-    pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) {
-        let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid");
+    /// > Interrupt-Gate and Trap-Gate Descriptors—Legacy Mode
+    ///
+    /// have a different (smaller!) format.
+    fn write_idt_entry_legacy(
+        &mut self,
+        intr_nr: u8,
+        interrupt_handler_cs: u16,
+        interrupt_handler_addr: GuestAddress
+    ) {
+        assert!(interrupt_handler_addr.0 <= u32::MAX as u64);
+        let idt_ptr = self.idt_entry_legacy_mut(intr_nr);
+
+        // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
+        // and "trap-gate" descriptors), are described (in the AMD APM) by
+        // the figure "Interrupt-Gate and Trap-Gate Descriptors—Legacy Mode". reproduced here:
+        //
+        //  3   2                 1        |          1                   0
+        //  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+        // |---------------------------------------------------------------|
+        // |     target offset[31:16]      |P|DPL|0| type  |    res,ign    | +4
+        // |     target selector           |    target offset[15:0]        | +0
+        // |---------------------------------------------------------------|
+        //
+        // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
+        // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
+        // works for now.
+        let idt_attr_bits = 0b1_00_0_1110_00000_000;
+        let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
+        let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff);
 
-        // SAFETY: `check_range` above validates the range to copy, and... please do not
-        // provide a slice of guest memory as what should be read into...
         unsafe {
-            std::ptr::copy_nonoverlapping(
-                mapping.host_ptr(addr) as *const _,
-                buf.as_mut_ptr(),
-                buf.len()
-            );
+            idt_ptr.offset(0).write(low_lo);
+            idt_ptr.offset(1).write(low_hi);
         }
     }
 
-    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
-    /// `size`.
-    ///
-    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
-    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
-    pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] {
-        let mapping = self.map_containing_mut(addr, size).expect("mapping is valid");
+    fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
+        sregs.idt.base = self.idt_addr().0;
+        sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each
 
-        // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there
-        // is no other outstanding slice of guest memory. `map_containing` has already ensured that
-        // this mapping contains the whole range `[addr, addr + size)`.
-        unsafe {
-            mapping.slice_mut(addr, size)
+        for i in 0..IDT_ENTRIES {
+            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
+            self.write_idt_entry(
+                i.try_into().expect("<u8::MAX interrupts"),
+                self.selector_cs(),
+                interrupt_handler_addr
+            );
         }
-    }
-
-    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
-    /// `size`.
-    ///
-    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
-    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
-    pub fn mem_slice<'vm>(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] {
-        let mapping = self.map_containing(addr, size).expect("mapping is valid");
 
-        // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there
-        // is no other outstanding slice of guest memory. `map_containing` has already ensured that
-        // this mapping contains the whole range `[addr, addr + size)`.
+        // all interrupt handlers are just `hlt`. their position is used to detect which
+        // exception/interrupt occurred.
         unsafe {
-            mapping.slice(addr, size)
+            std::slice::from_raw_parts_mut(
+                self.host_ptr(self.interrupt_handlers_start()),
+                IDT_ENTRIES as usize
+            ).fill(0xf4);
         }
-    }
-
-    /// write `code` into guest memory and set `regs.rip` to the address of that code.
-    ///
-    /// the chosen code address is [`Self::code_addr`]; this is the guest linear address the
-    /// provided code buffer is written to.
-    ///
-    /// if the VM is configured for `IsaMode::Long` or `IsaMode::Protected`, `rip` or `eip` is set
-    /// to this address as well. otherwise, the VM is configured for `IsaMode::Real` and `ip` is
-    /// set to `code_addr() & 0x0f` - in typical cases `ip` will be 0.
-    ///
-    pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) {
-        let addr = self.code_addr();
-        self.write_mem(addr, code);
 
-        if self.settings.isa_mode != IsaMode::Real {
-            regs.rip = addr.0;
-        } else {
-            regs.rip = addr.0 & 0x000f;
-        }
+        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
+        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
+        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
+        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
+        //
+        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
+        // descriptors could set something in IST to switch stacks outright for exception
+        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
+        // we might just have to limit possible rsp permutations so as to be able to test in
+        // 16- and 32-bit modes anyway.
+        regs.rsp = self.stack_addr().0;
+        self.idt_configured = true;
     }
 
-    fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 {
-        // the GDT is set up at addresses 0..64k:
-        //
-        // > 3.5.1 Segment Descriptor Tables
-        // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A
-        // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte
-        // > descriptors.
+    /// IDT configuration in 32-bit mode differs from long mode in the descriptor format:
+    /// entries are 8-byte legacy gates, and handlers still run with the usual `cs` selector.
+    fn configure_idt_32b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
+        sregs.idt.base = self.idt_addr().0;
+        sregs.idt.limit = IDT_ENTRIES * 8 - 1; // legacy IDT is 256 entries of 8 bytes each
 
-        assert!(idx < 4096 / 8);
-        let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8));
-        let mapping = self.map_containing(addr, std::mem::size_of::<u64>() as u64).unwrap();
+        for i in 0..IDT_ENTRIES {
+            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
+            self.write_idt_entry_legacy(
+                i.try_into().expect("<u8::MAX interrupts"),
+                self.selector_cs(),
+                interrupt_handler_addr
+            );
+        }
 
-        // SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is
-        // still inside the allocation (`self.memory`).
+        // all interrupt handlers are just `hlt`. their position is used to detect which
+        // exception/interrupt occurred.
         unsafe {
-            mapping.host_ptr(addr) as *mut u64
+            std::slice::from_raw_parts_mut(
+                self.host_ptr(self.interrupt_handlers_start()),
+                IDT_ENTRIES as usize
+            ).fill(0xf4);
         }
+
+        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
+        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
+        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
+        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
+        //
+        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
+        // descriptors could set something in IST to switch stacks outright for exception
+        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
+        // we might just have to limit possible rsp permutations so as to be able to test in
+        // 16- and 32-bit modes anyway.
+        regs.rsp = self.stack_addr().0;
+        self.idt_configured = true;
     }
 
-    // note this returns a u32, but a long-mode IDT is four u32. the u32 this points at is the
-    // first of the four for the entry.
-    fn idt_entry_mut(&mut self, idx: u8) -> *mut u32 {
-        let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16));
-        let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap();
+    /// IDT configuration in 16-bit mode is funky because the interrupt handlers live in a totally
+    /// different region of memory and need a different value in `cs`.
+    fn configure_idt_16b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
+        sregs.idt.base = self.idt_addr().0;
+        sregs.idt.limit = IDT_ENTRIES * 8 - 1; // IDT is 256 entries of 8 bytes each
 
-        unsafe {
-            mapping.host_ptr(addr) as *mut u32
+        for i in 0..IDT_ENTRIES {
+            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
+            self.write_idt_entry_legacy(
+                i.try_into().expect("<u8::MAX interrupts"),
+                self.selector_cs_idt_16b(),
+                interrupt_handler_addr
+            );
         }
-    }
-
-    // note this returns a u32, but a legacy IDT is two u32. the u32 this points at is the
-    // first of the four for the entry.
-    fn idt_entry_legacy_mut(&mut self, idx: u8) -> *mut u32 {
-        let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 8));
-        let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap();
 
+        // all interrupt handlers are just `hlt`. their position is used to detect which
+        // exception/interrupt occurred.
         unsafe {
-            mapping.host_ptr(addr) as *mut u32
+            std::slice::from_raw_parts_mut(
+                self.host_ptr(self.interrupt_handlers_start()),
+                IDT_ENTRIES as usize
+            ).fill(0xf4);
         }
-    }
 
-    pub fn page_tables(&self) -> VmPageTables<'_> {
-        let base = self.page_table_addr();
+        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
+        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
+        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
+        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
+        //
+        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
+        // descriptors could set something in IST to switch stacks outright for exception
+        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
+        // we might just have to limit possible rsp permutations so as to be able to test in
+        // 16- and 32-bit modes anyway.
+        regs.rsp = self.stack_addr().0;
+        self.idt_configured = true;
+    }
 
-        // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of
-        // address space.
-        assert!(self.map_containing(base, 0x2000).is_some());
+    /// configure the vCPU for executing instructions in the hardware-supported extensions.
+    /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed,
+    /// unless additional configuration is done (as this function does).
+    ///
+    /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point
+    /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all?
+    ///
+    /// this function configures the vCPU to be ready to execute `SSE*` instructions.
+    fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) {
+        // these bit positions in control registers, and their behaviors, are described more
+        // comprehensively in Volume 3,
+        // > `2.5 CONTROL REGISTERS`
 
-        VmPageTables {
-            vm: self,
-            base,
-        }
-    }
+        // CR0
+        const TS: u32 = 3;
+        // CR4
+        const OSFXSR: u32 = 9;
+        const OSXMMEXCPT: u32 = 10;
+        const OSXSAVE: u32 = 18;
 
-    // TODO: there should be a version of this that can be used to query "does this VM support
-    // these extensions" probably, and that should take a subset of `Feature` for the ones that are
-    // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable
-    // feature..)
-    fn cpuid_supports(&self, feature: Feature) -> bool {
-        fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool {
-            for mut entry in cpuid.as_slice() {
-                if entry.function == leaf && entry.index == index {
-                    return f(&mut entry);
-                }
-            }
+        // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)")
+        // these bits are the same as in cpuid leaf 0xd.eax
+        const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64;
+        const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64;
+        const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64;
 
-            false
-        }
+        // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be
+        // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system
+        // (which makes SSE a non-optional baseline!)
+        //
+        // the Intel SDM implies this through somewhat tortured language in the section
+        // "Checking for Intel® SSE and SSE2 Support":
+        // > If an operating system did not provide adequate system level support for Intel
+        // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD.
+        //
+        // to fully understand this statement, realize that "an operating system .. provide[s]
+        // adequate system level support" by setting CR4.OSFXSR,
+        //
+        // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating
+        // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment
+        //
+        // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to
+        // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even
+        // switch tasks on the vCPU.
+        sregs.cr4 |= 1 << OSFXSR;
 
-        match feature {
-            Feature::Base => {
-                let lm = find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_80000001_EDX_LM != 0
-                });
-                let msr = find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_00000001_EDX_MSR != 0
-                });
-                let clstac = find_leaf(&self.supported_cpuid, 0x0000_0007, 0, |leaf| {
-                    leaf.ebx & CPUID_00000007_EBX_CLSTAC != 0
-                });
-                lm && msr && clstac
-            }
-            Feature::Syscall => {
-                find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_80000001_EDX_SYSCALL != 0
-                })
-            }
-            Feature::XSave => {
-                find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_00000001_ECX_XSAVE != 0
-                })
-            }
-            Feature::Pdpe1Gb => {
-                find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0
-                })
-            }
-            Feature::StateSSE => {
-                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
-                    leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE
-                })
-            }
-            Feature::StateAVX => {
-                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
-                    leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX
-                })
-            }
-            Feature::StateAVX512 => {
-                find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
-                    leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512
-                })
-            }
-            Feature::Pse => {
-                find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
-                    leaf.edx & CPUID_00000001_EDX_PSE == CPUID_00000001_EDX_PSE
-                })
-            }
-        }
-    }
+        // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE.
+        // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS"
+        // draws a fairly direct connection:
+        //
+        // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if
+        // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state
+        // > features and their state components as if all bits in XCR0 were clear; the state
+        // > components cannot be modified and the features’ instructions cannot be executed.
+        //
+        // but the consequence is contradicted by the next paragraph,
+        //
+        // > Processors allow modification of this state, as well as execution of x87 FPU
+        // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and
+        // > XCR0.
+        //
+        // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well.
+        sregs.cr4 |= 1 << OSXSAVE;
 
-    /// set `feature` to `wanted` in the VM's CPUID configuration.
-    ///
-    /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not
-    /// available at all). use [`cpuid_supports`] to test if the feature can be configured.
-    fn cpuid_set(&mut self, feature: Feature, wanted: bool) {
-        fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) {
-            for mut entry in cpuid.as_mut_slice() {
-                if entry.function == leaf && entry.index == index {
-                    f(&mut entry);
-                    return;
-                }
-            }
+        // SSE3, SSSE3, and SSE4 involve a bit extra:
+        // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor
+        // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1
+        sregs.cr0 &= !(1 << TS);
 
-            // if we're here, the entry simply is not present (yet..?)
-            //
-            // so, create it.
-            let mut entry = kvm_cpuid_entry2 {
-                function: leaf,
-                index: index,
-                eax: 0,
-                ecx: 0,
-                edx: 0,
-                ebx: 0,
-                flags: 0,
-                padding: [0; 3],
-            };
-            f(&mut entry);
-            cpuid.push(entry).expect("can push");
-        }
+        // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating
+        // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM).
+        //
+        // this is somewhat better than just getting an uncategorized #UD.
+        sregs.cr4 |= 1 << OSXMMEXCPT;
 
-        fn bit_set(word: &mut u32, bit: u32, wanted: bool) {
-            *word &= !bit;
-            if wanted {
-                *word |= bit;
-            }
-        }
+        assert!(xcrs.nr_xcrs > 0);
+        assert_eq!(xcrs.xcrs[0].xcr, 0);
 
-        let mut edited = false;
+        let mut needs_xsave = false;
+        if self.cpuid_supports(Feature::StateSSE) {
+            self.cpuid_set(Feature::StateSSE, true);
+            xcrs.xcrs[0].value |= 1;
+            xcrs.xcrs[0].value |= XCR0_SSE;
+            needs_xsave = true;
+        }
+        if self.cpuid_supports(Feature::StateAVX) {
+            self.cpuid_set(Feature::StateAVX, true);
+            xcrs.xcrs[0].value |= XCR0_AVX;
+            needs_xsave = true;
+        }
+        if self.cpuid_supports(Feature::StateAVX512) {
+            self.cpuid_set(Feature::StateAVX512, true);
+            xcrs.xcrs[0].value |= XCR0_AVX512;
+            needs_xsave = true;
+        }
 
-        match feature {
-            Feature::Base => {
-                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_LM, wanted);
-                    edited = true;
-                });
-                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.edx, CPUID_00000001_EDX_MSR, wanted);
-                    edited = true;
-                });
-                edit_leaf(&mut self.current_cpuid, 0x0000_0007, 0, |leaf| {
-                    bit_set(&mut leaf.ebx, CPUID_00000007_EBX_CLSTAC, wanted);
-                    edited = true;
-                });
-            }
-            Feature::Syscall => {
-                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_SYSCALL, wanted);
-                    edited = true;
-                });
-            }
-            Feature::XSave => {
-                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted);
-                    edited = true;
-                });
-            },
-            Feature::Pdpe1Gb => {
-                edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted);
-                    edited = true;
-                });
-            },
-            Feature::StateSSE => {
-                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
-                    bit_set(&mut leaf.eax, 1, wanted);
-                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted);
-                    edited = true;
-                });
-            }
-            Feature::StateAVX => {
-                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
-                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted);
-                    edited = true;
-                });
-            }
-            Feature::StateAVX512 => {
-                edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
-                    bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted);
-                    edited = true;
-                });
-            }
-            Feature::Pse => {
-                edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
-                    bit_set(&mut leaf.edx, CPUID_00000001_EDX_PSE, wanted);
-                    edited = true;
-                });
+        if needs_xsave {
+            if self.cpuid_supports(Feature::XSave) {
+                self.cpuid_set(Feature::XSave, true);
+            } else {
+                panic!(
+                    "look, there's no CPU that supports SSE but not xsave. \
+                    i only checked to be thorough.");
             }
         }
-
-        assert!(edited);
-
-        self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid");
     }
 
-    /// configure page tables for identity mapping of all memory from guest address zero up to the
-    /// end of added memory regions, rounded up to the next GiB.
-    ///
-    /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or
-    /// long-mode paging. this is a fixed pattern: if control registers have not been changed since
-    /// `Vm::create` then there will be no change to these control registers and `sregs` can be
-    /// omitted.
-    ///
-    /// panics if the end of added memory regions is above 512 GiB.
-    pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) {
-        // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each.
-        assert!(self.mem_ceiling() <= 512 * GB);
-
-        assert!(self.cpuid_supports(Feature::Pdpe1Gb));
-        self.cpuid_set(Feature::Pdpe1Gb, true);
-
-        let pt = self.page_tables();
-
-        let pml4_ent =
-            1 << 0 | // P
-            1 << 1 | // RW
-            1 << 2 | // user access allowed. but no user code will run so not strictly needed.
-            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 5 | // A
-            0 << 6 | // ignored
-            0 << 7 | // PS (reserved must-be-0)
-            0 << 11 | // R (for ordinary paging, ignored; for HLAT ...)
-            pt.pdpt_addr().0;
-        unsafe {
-            pt.pml4_mut().write(pml4_ent);
-        }
-
-        let mut mapped: u64 = 0;
-        // we've set up the first PML4 to point to a PDPT, so we should actually set it up!
-        let pdpt = pt.pdpt_mut();
-        // PDPTEs start at the start of PDPT..
-        let mut pdpte = pdpt;
-        let entry_bits: u64 =
-            1 << 0 | // P
-            1 << 1 | // RW
-            1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
-            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 5 | // Accessed
-            0 << 6 | // Dirty
-            1 << 7 | // Page size (1 implies 1G page)
-            1 << 8 | // Global (if cr4.pge)
-            0 << 9 |
-            0 << 10 |
-            0 << 11 | // for ordinary paging, ignored. for HLAT, ...
-            0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)
-
-        while mapped < self.mem_ceiling() {
-            let phys_num = mapped >> 30;
-            let entry = entry_bits | (phys_num << 30);
-            unsafe {
-                pdpte.write(entry);
-                pdpte = pdpte.offset(1);
+    fn configure_syscalls(&mut self, vcpu_sregs: &mut kvm_sregs) { // enable `syscall` and aim it at a `hlt`-filled landing area
+        assert!(self.cpuid_supports(Feature::Syscall)); // panics if the syscall CPUID feature cannot be configured
+        self.cpuid_set(Feature::Syscall, true);
+
+        // efer bit 0 is the > System-Call Extension (SCE) Bit.
+        vcpu_sregs.efer |= 0x0000_0001;
+
+        let msrs = Msrs::from_entries(&[
+            kvm_msr_entry {
+                // LSTAR (C000_0082h): `syscall` target for 64-bit mode callers
+                index: 0xc000_0082,
+                data: self.syscall_addr().0,
+                reserved: 0,
+            },
+            kvm_msr_entry {
+                // CSTAR (C000_0083h): `syscall` target for compatibility mode callers; same address
+                index: 0xc000_0083,
+                data: self.syscall_addr().0,
+                reserved: 0,
             }
-            // eprintln!("mapped 1g at {:08x}", mapped);
-            mapped += 1 << 30;
-        }
+        ]).unwrap();
+        self.set_msrs(&msrs).unwrap();
 
-        if let Some(sregs) = sregs {
-            sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
-            sregs.cr3 = pt.pml4_addr().0 as u64;
-            sregs.cr4 |= 1 << 5; // enable PAE
-        }
+        // fill the 16-byte syscall landing area with hlt to trap out immediately.
+        self.mem_slice_mut(self.syscall_addr(), 16).fill(0xf4); // 0xf4 is `hlt`
+
+        self.syscall_configured = true;
     }
+}
 
-    /// configure page tables for identity mapping of all memory from guest address zero up to the
-    /// end of added memory regions, rounded up to the next 4MiB.
-    ///
-    /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode paging.
-    /// this is a fixed pattern: if control registers have not been changed since `Vm::create` then
-    /// there will be no change to these control registers and `sregs` can be omitted.
-    pub unsafe fn configure_identity_paging_32b(&mut self, sregs: Option<&mut kvm_sregs>) {
-        // because we'll set PDEs to map 4M pages and cr3 points at a page-aligned block of 1024
-        // 4-byte PDEs, that gives us 4KiB of memory used to map 4GiB of address space. that's all
-        // of 32-bit, so we don't need to check an upper bound.
 
-        assert!(self.cpuid_supports(Feature::Pse));
-        self.cpuid_set(Feature::Pse, true);
+#[test]
+fn test_xor_runs() { // single-step `xor eax, eax`; expect pc to advance by 2 and rax to become 0
+    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        let pt = self.page_tables();
+    vm.program(&[0x33, 0xc0], &mut regs); // xor eax, eax
 
-        let mut mapped: u64 = 0;
-        // "pml4_mut" is really just the start of page table memory. we'll pun this in 32-bit with
-        // the knowledge it's really a block of PDEs.
-        let pd = pt.pml4_mut() as *mut u32;
-        let mut pde = pd;
-        let entry_bits: u32 =
-            1 << 0 | // P
-            1 << 1 | // RW
-            1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
-            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
-            0 << 5 | // Accessed
-            0 << 6 | // Dirty
-            1 << 7 | // Page size (1 implies 4M page)
-            1 << 8 | // Global (if cr4.pge)
-            0 << 9 |
-            0 << 10 |
-            0 << 11 | // for ordinary paging, ignored. for HLAT, ...
-            0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)
+    regs.rax = 0x1234; // nonzero, so the final `rax == 0` check shows the xor executed
+    let rip_before = regs.rip;
 
-        while mapped < self.mem_ceiling() {
-            let phys_num = (mapped as u32) >> 22;
-            let entry = entry_bits | (phys_num << 22);
-            unsafe {
-                pde.write(entry);
-                pde = pde.offset(1);
-            }
-            mapped += 1 << 22;
-        }
+    vm.set_regs(&regs).expect("can set regs");
 
-        // page size extensions; collaborates with page tables' PS bit to make 4MiB pages in 32-bit
-        // mode. see SDM section 2.5 "CONTROL REGISTERS".
-        const PSE: u64 = 1 << 4;
+    vm.set_single_step(true).expect("can set single-step");
 
-        if let Some(sregs) = sregs {
-            sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
-            sregs.cr3 = pt.pml4_addr().0 as u64;
-            sregs.cr4 |= PSE;
+    let res = vm.run().expect("can run vm");
+
+    let expected_rip = rip_before + 2; // the program is a single two-byte instruction
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            assert_eq!(expected_rip, rip_after);
         }
-    }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
 
-    unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) {
-        // we have to set descriptor information directly. this avoids having to load selectors
-        // as the first instructions on the vCPU, which is simplifying. but if we want the
-        // information in these selectors to match with anything in a GDT (i do!) we'll have to
-        // keep this initial state lined up with GDT entries ourselves.
-        //
-        // we could avoid setting up the GDT for the most part, but anything that might
-        // legitimately load the "valid" current segment selector would instead clobber the
-        // selector with zeroes.
+    let regs_after = vm.get_regs().expect("can get regs");
+    assert_eq!(regs_after.rax, 0);
+}
 
-        sregs.cs.base = 0;
-        sregs.cs.limit = 0;
-        sregs.cs.selector = self.selector_cs();
-        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.cs.present = 1;
-        sregs.cs.dpl = 0;
-        sregs.cs.db = 0;
-        sregs.cs.s = 1;
-        sregs.cs.l = 1;
-        sregs.cs.g = 0;
-        sregs.cs.avl = 0;
+#[test]
+fn test_protected_mode_runs() { // run a short 32-bit program up to its `hlt`, then check eax and ecx
+    let settings = VmSettings::new(128 * 1024, IsaMode::Protected);
+    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        sregs.ds.base = 0;
-        sregs.ds.limit = 0xffffffff;
-        sregs.ds.selector = self.selector_ds();
-        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.ds.present = 1;
-        sregs.ds.dpl = 0;
-        sregs.ds.db = 0;
-        sregs.ds.s = 1;
-        sregs.ds.l = 0;
-        sregs.ds.g = 0;
-        sregs.ds.avl = 0;
+    let buf = &[
+        0xc5, 0xe0, 0x54, 0xc3, // vandps xmm0, xmm3, xmm3
+        0x33, 0xc0,             // xor eax, eax
+        0x8b, 0x09,             // mov ecx, [ecx]
+        0xf4                    // hlt
+    ];
+    vm.program(buf, &mut regs);
 
-        sregs.es = sregs.ds;
-        sregs.fs = sregs.ds;
-        sregs.gs = sregs.ds;
-        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
-        sregs.ss = sregs.ds;
+    regs.rax = 0x1234; // nonzero, so `rax == 0` afterwards shows the xor executed
+    regs.rcx = 0x4; // address that `mov ecx, [ecx]` will load from
 
-        sregs.gdt.base = self.gdt_addr().0;
-        sregs.gdt.limit = 256 * 8 - 1;
+    vm.set_regs(&regs).expect("can set regs");
 
-        unsafe {
-            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
-            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
+    let res = vm.run().expect("can run vm");
+
+    match res {
+        VcpuExit::Hlt => {
+            // expected exit from the `0xf4` above.
         }
-    }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
 
-    /// configure selectors for 32-bit code exceution. this is basically the same as 64-bit, but we
-    /// set a limit and set `cs.db` so that the default operand size is a normal 32-bit.
-    unsafe fn configure_selectors_32b(&mut self, sregs: &mut kvm_sregs) {
-        // we have to set descriptor information directly. this avoids having to load selectors
-        // as the first instructions on the vCPU, which is simplifying. but if we want the
-        // information in these selectors to match with anything in a GDT (i do!) we'll have to
-        // keep this initial state lined up with GDT entries ourselves.
-        //
-        // we could avoid setting up the GDT for the most part, but anything that might
-        // legitimately load the "valid" current segment selector would instead clobber the
-        // selector with zeroes.
+    let regs_after = vm.get_regs().expect("can get regs");
+    assert_eq!(regs_after.rax, 0);
+    assert_eq!(regs_after.rcx, 0); // the dword loaded from address 4 is expected to be zero
+}
 
-        sregs.cs.base = 0;
-        sregs.cs.limit = 0xffffffff;
-        sregs.cs.selector = self.selector_cs();
-        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.cs.present = 1;
-        sregs.cs.dpl = 0;
-        sregs.cs.db = 1;
-        sregs.cs.s = 1;
-        sregs.cs.l = 0;
-        sregs.cs.g = 1;
-        sregs.cs.avl = 0;
+#[test]
+fn test_pusha_runs() { // single-step `pusha` in IsaMode::Real, without and then with a 0x66 prefix
+    let settings = VmSettings::new(128 * 1024, IsaMode::Real);
+    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        sregs.ds.base = 0;
-        sregs.ds.limit = 0xffffffff;
-        sregs.ds.selector = self.selector_ds();
-        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.ds.present = 1;
-        sregs.ds.dpl = 0;
-        sregs.ds.db = 1;
-        sregs.ds.s = 1;
-        sregs.ds.l = 0;
-        sregs.ds.g = 1;
-        sregs.ds.avl = 0;
+    vm.program(&[0x60], &mut regs); // pusha
 
-        sregs.es = sregs.ds;
-        sregs.fs = sregs.ds;
-        sregs.gs = sregs.ds;
-        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
-        sregs.ss = sregs.ds;
+    regs.rip = 0; // start at offset 0; the expected pc below is computed from vm.code_addr()
+    regs.rax = 0x1234;
+    eprintln!("{:?}", regs);
 
-        sregs.gdt.base = self.gdt_addr().0;
-        sregs.gdt.limit = 256 * 8 - 1;
+    vm.set_regs(&regs).expect("can set regs");
 
-        unsafe {
-            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
-            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
+    vm.set_single_step(true).expect("can set single-step");
+    let expected_rip = vm.code_addr().0 + 1; // one-byte instruction
+
+    let res = vm.run().expect("can run vm");
+
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            eprintln!("rip after: {:08x}", rip_after);
+            assert_eq!(expected_rip, rip_after);
         }
-    }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
 
-    /// configure selectors for 16-bit code exceution.
-    ///
-    /// unlike other modes, this sets `cs` to execute code at the linear address given by
-    /// [`Self::code_addr`]. `ds` is configured to overlap with `cs`. this way, when executing
-    /// 16-bit code the VM can simply be configured to `ip = 0`, and code addresses match data
-    /// addresses. additionally, clear `cs.db` so that the default operand size is 16-bit.
-    unsafe fn configure_selectors_16b(&mut self, sregs: &mut kvm_sregs) {
-        // we have to set descriptor information directly. this avoids having to load selectors
-        // as the first instructions on the vCPU, which is simplifying. but if we want the
-        // information in these selectors to match with anything in a GDT (i do!) we'll have to
-        // keep this initial state lined up with GDT entries ourselves.
-        //
-        // we could avoid setting up the GDT for the most part, but anything that might
-        // legitimately load the "valid" current segment selector would instead clobber the
-        // selector with zeroes.
+    let regs_after = vm.get_regs().expect("can get regs");
+    assert_eq!(regs_after.rax, 0x1234); // pusha must not modify ax
+    assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 2)); // eight 2-byte pushes; assumes rsp began at 0x1000 - 0x80 (not set here) — TODO confirm
 
-        sregs.cs.base = 0;
-        sregs.cs.limit = 0xfffff;
-        sregs.cs.selector = self.selector_cs();
-        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.cs.present = 1;
-        sregs.cs.dpl = 0;
-        sregs.cs.db = 0;
-        sregs.cs.s = 1;
-        sregs.cs.l = 0;
-        sregs.cs.g = 1;
-        sregs.cs.avl = 0;
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        unsafe {
-            self.gdt_entry_mut(self.selector_cs_idt_16b() >> 3).write(encode_segment(&sregs.cs));
-        }
+    vm.program(&[0x66, 0x60], &mut regs); // operand-size prefix + pusha
 
-        // and now adjust for the real cs for code execution to happen in..
-        sregs.cs.base = self.code_addr().0;
+    regs.rip = 0;
+    regs.rax = 0x1234;
+    regs.rsp = 0x1000 - 0x80; // undo the stack movement from the first run
+    eprintln!("{:?}", regs);
 
-        sregs.ds.base = self.code_addr().0;
-        sregs.ds.limit = 0xfffff;
-        sregs.ds.selector = self.selector_ds();
-        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
-        sregs.ds.present = 1;
-        sregs.ds.dpl = 0;
-        sregs.ds.db = 0;
-        sregs.ds.s = 1;
-        sregs.ds.l = 0;
-        sregs.ds.g = 1;
-        sregs.ds.avl = 0;
+    vm.set_regs(&regs).expect("can set regs");
 
-        sregs.es = sregs.ds;
-        sregs.fs = sregs.ds;
-        sregs.gs = sregs.ds;
-        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
-        sregs.ss = sregs.ds;
+    vm.set_single_step(true).expect("can set single-step");
+    let expected_rip = vm.code_addr().0 + 2; // prefix byte plus opcode byte
 
-        sregs.gdt.base = self.gdt_addr().0;
-        sregs.gdt.limit = 256 * 8 - 1;
+    let res = vm.run().expect("can run vm");
 
-        unsafe {
-            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
-            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            eprintln!("rip after: {:08x}", rip_after);
+            assert_eq!(expected_rip, rip_after);
         }
-    }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
 
-    fn write_idt_entry(
-        &mut self,
-        intr_nr: u8,
-        interrupt_handler_cs: u16,
-        interrupt_handler_addr: GuestAddress
-    ) {
-        let idt_ptr = self.idt_entry_mut(intr_nr);
+    let regs_after = vm.get_regs().expect("can get regs");
+    assert_eq!(regs_after.rax, 0x1234);
+    assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 4)); // eight 4-byte pushes with the 0x66 prefix
+}
 
-        // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
-        // and "trap-gate" descriptors), are described (in the AMD APM) by
-        // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here:
-        //
-        //  3   2                 1        |          1                   0
-        //  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
-        // |---------------------------------------------------------------|
-        // |                            res,ign                            | +12
-        // |                      target offset[63:32]                     | +8
-        // |     target offset[31:16]      |P|DPL|0| type  | res,ign | IST | +4
-        // |     target selector           |    target offset[15:0]        | +0
-        // |---------------------------------------------------------------|
-        //
-        // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
-        // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
-        // works for now.
-        let idt_attr_bits = 0b1_00_0_1110_00000_000;
-        let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
-        let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff);
+#[test]
+fn test_syscall() { // run `syscall`; expect a Syscall exit with rip one byte past vm.syscall_addr()
+    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        unsafe {
-            idt_ptr.offset(0).write(low_lo);
-            idt_ptr.offset(1).write(low_hi);
-            idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32);
-            idt_ptr.offset(3).write(0); // reserved
-        }
-    }
+    vm.program(&[0x0f, 0x05], &mut regs); // syscall
+    eprintln!("rip before: {:08x}", regs.rip);
 
-    /// 16-bit/32-bit IDT entries, described in the APM as
-    ///
-    /// > Interrupt-Gate and Trap-Gate Descriptors—Legacy Mode
-    ///
-    /// have a different (smaller!) format.
-    fn write_idt_entry_legacy(
-        &mut self,
-        intr_nr: u8,
-        interrupt_handler_cs: u16,
-        interrupt_handler_addr: GuestAddress
-    ) {
-        assert!(interrupt_handler_addr.0 <= u32::MAX as u64);
-        let idt_ptr = self.idt_entry_legacy_mut(intr_nr);
+    vm.set_regs(&regs).expect("can set regs");
 
-        // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
-        // and "trap-gate" descriptors), are described (in the AMD APM) by
-        // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here:
-        //
-        //  3   2                 1        |          1                   0
-        //  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
-        // |---------------------------------------------------------------|
-        // |     target offset[31:16]      |P|DPL|0| type  | res,ign | IST | +4
-        // |     target selector           |    target offset[15:0]        | +0
-        // |---------------------------------------------------------------|
-        //
-        // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
-        // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
-        // works for now.
-        let idt_attr_bits = 0b1_00_0_1110_00000_000;
-        let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
-        let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff);
+//    vm.set_single_step(true).expect("can set single-step");
 
-        unsafe {
-            idt_ptr.offset(0).write(low_lo);
-            idt_ptr.offset(1).write(low_hi);
+    let res = vm.run().expect("can run vm");
+    match res {
+        VcpuExit::Syscall => { /* expected */ }
+        VcpuExit::Debug { pc, .. } => { // a debug exit is always a failure here; only the message differs
+            if pc == vm.syscall_addr().0 {
+                panic!(
+                    "VM exited at syscall target. \
+                     syscall hlt stub not executed. \
+                     is the VM being single-stepped?"
+                );
+            }
+            panic!("unexpected debug exit at rip={:08x}", pc);
         }
-    }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
 
-    fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
-        sregs.idt.base = self.idt_addr().0;
-        sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each
+    let regs_after = vm.get_regs().expect("can get regs");
 
-        for i in 0..IDT_ENTRIES {
-            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
-            self.write_idt_entry(
-                i.try_into().expect("<u8::MAX interrupts"),
-                self.selector_cs(),
-                interrupt_handler_addr
-            );
-        }
+    let expected_rip = vm.syscall_addr().0 + 1; // one byte past the syscall target, i.e. after the stub's `hlt`
+    assert_eq!(expected_rip, regs_after.rip);
+}
 
-        // all interrupt handlers are just `hlt`. their position is used to detect which
-        // exception/interrupt occurred.
-        unsafe {
-            std::slice::from_raw_parts_mut(
-                self.host_ptr(self.interrupt_handlers_start()),
-                IDT_ENTRIES as usize
-            ).fill(0xf4);
-        }
+#[test]
+fn test_xorps_runs() { // single-step `xorps xmm0, xmm0`; expect a Debug exit with pc advanced by 3
+    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        // all interrupt handlers are just `hlt`. their position is used to detect which
-        // exception/interrupt occurred.
-        unsafe {
-            std::slice::from_raw_parts_mut(
-                self.host_ptr(self.interrupt_handlers_start()),
-                IDT_ENTRIES as usize
-            ).fill(0xf4);
-        }
+    vm.program(&[0x0f, 0x57, 0xc0], &mut regs); // xorps xmm0, xmm0
 
-        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
-        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
-        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
-        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
-        //
-        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
-        // descriptors could set something in IST to switch stacks outright for exception
-        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
-        // we might just have to limit possible rsp permutations so as to be able to test in
-        // 16- and 32-bit modes anyway.
-        regs.rsp = self.stack_addr().0;
-        self.idt_configured = true;
-    }
+    let rip_before = regs.rip;
 
-    /// IDT configuration in 32-bit mode is funky because the interrupt handlers live in a totally
-    /// different region of memory and need a different value in `cs`.
-    fn configure_idt_32b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
-        sregs.idt.base = self.idt_addr().0;
-        sregs.idt.limit = IDT_ENTRIES * 8 - 1; // legacy IDT is 256 entries of 8 bytes each
+    vm.set_regs(&regs).expect("can set regs");
 
-        for i in 0..IDT_ENTRIES {
-            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
-            self.write_idt_entry_legacy(
-                i.try_into().expect("<u8::MAX interrupts"),
-                self.selector_cs(),
-                interrupt_handler_addr
-            );
-        }
+    vm.set_single_step(true).expect("can set single-step");
 
-        // all interrupt handlers are just `hlt`. their position is used to detect which
-        // exception/interrupt occurred.
-        unsafe {
-            std::slice::from_raw_parts_mut(
-                self.host_ptr(self.interrupt_handlers_start()),
-                IDT_ENTRIES as usize
-            ).fill(0xf4);
+    vm.set_single_step(true).expect("can set single-step");
+
+    let res = vm.run().expect("can run vm");
+
+    let expected_rip = rip_before + 3; // the program is a single three-byte instruction
+    eprintln!("exit: {:?}", res);
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            assert_eq!(expected_rip, rip_after);
         }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
+}
 
-        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
-        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
-        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
-        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
-        //
-        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
-        // descriptors could set something in IST to switch stacks outright for exception
-        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
-        // we might just have to limit possible rsp permutations so as to be able to test in
-        // 16- and 32-bit modes anyway.
-        regs.rsp = self.stack_addr().0;
-        self.idt_configured = true;
+#[test]
+fn test_vex_vandps_runs() { // single-step a VEX-encoded `vandps` with a memory operand; expect pc advanced by 4
+    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+
+    if !vm.cpuid_supports(Feature::StateAVX) {
+        panic!("host CPU does not support AVX"); // the test fails, rather than being skipped, without AVX
     }
 
-    /// IDT configuration in 16-bit mode is funky because the interrupt handlers live in a totally
-    /// different region of memory and need a different value in `cs`.
-    fn configure_idt_16b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
-        sregs.idt.base = self.idt_addr().0;
-        sregs.idt.limit = IDT_ENTRIES * 8 - 1; // IDT is 256 entries of 8 bytes each
+    let mut regs = vm.get_regs().expect("can get regs");
+
+    vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); // vandps xmm0, xmm3, [rbx]
+
+    regs.rbx = regs.rip; // point the [rbx] memory operand at the current rip
+    let rip_before = regs.rip;
+
+    vm.set_regs(&regs).expect("can set regs");
+
+    vm.set_single_step(true).expect("can set single-step");
 
-        for i in 0..IDT_ENTRIES {
-            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
-            self.write_idt_entry_legacy(
-                i.try_into().expect("<u8::MAX interrupts"),
-                self.selector_cs_idt_16b(),
-                interrupt_handler_addr
-            );
-        }
+    let res = vm.run().expect("can run vm");
 
-        // all interrupt handlers are just `hlt`. their position is used to detect which
-        // exception/interrupt occurred.
-        unsafe {
-            std::slice::from_raw_parts_mut(
-                self.host_ptr(self.interrupt_handlers_start()),
-                IDT_ENTRIES as usize
-            ).fill(0xf4);
+    let expected_rip = rip_before + 4; // the program is a single four-byte instruction
+    eprintln!("exit: {:?}", res);
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            assert_eq!(expected_rip, rip_after);
         }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
+}
 
-        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
-        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
-        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
-        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
-        //
-        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
-        // descriptors could set something in IST to switch stacks outright for exception
-        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
-        // we might just have to limit possible rsp permutations so as to be able to test in
-        // 16- and 32-bit modes anyway.
-        regs.rsp = self.stack_addr().0;
-        self.idt_configured = true;
-    }
-
-    /// configure the vCPU for executing instructions in the hardware-supported extensions.
-    /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed,
-    /// unless additional configuration is done (as this function does).
-    ///
-    /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point
-    /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all?
-    ///
-    /// this function configures the vCPU to be ready to execute `SSE*` instructions.
-    fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) {
-        // these bit positions in control registers, and their behaviors, are described more
-        // comprehensively in Voluem 3,
-        // > `2.5 CONTROL REGISTERS`
+#[test]
+fn test_vex_vandps_runs_32b() {
+    let settings = VmSettings::new(128 * 1024, IsaMode::Protected); // 32-bit protected mode
+    let mut vm = Vm::create_by_settings(settings).expect("can create vm");
 
-        // CR0
-        const TS: u32 = 3;
-        // CR4
-        const OSFXSR: u32 = 9;
-        const OSXMMEXCPT: u32 = 10;
-        const OSXSAVE: u32 = 18;
+    if !vm.cpuid_supports(Feature::StateAVX) {
+        panic!("host CPU does not support AVX");
+    }
 
-        // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)")
-        // these bits are the same as in cpuid leaf 0xd.eax
-        const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64;
-        const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64;
-        const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64;
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be
-        // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system
-        // (which makes SSE a non-optional baseline!)
-        //
-        // the Intel SDM implies this through somewhat tortured language in the section
-        // "Checking for Intel® SSE and SSE2 Support":
-        // > If an operating system did not provide adequate system level support for Intel
-        // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD.
-        //
-        // to fully understand this statement, realize that `an operating system .. provide[s]
-        // adequate system level support" by setting CR4.OSFXSR,
-        //
-        // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating
-        // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment
-        //
-        // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to
-        // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even
-        // switch tasks on the vCPU.
-        sregs.cr4 |= 1 << OSFXSR;
+    vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); // vandps xmm0, xmm3, [ebx]
 
-        // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE.
-        // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS"
-        // draws a fairly direct connection:
-        //
-        // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if
-        // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state
-        // > features and their state components as if all bits in XCR0 were clear; the state
-        // > components cannot be modified and the features’ instructions cannot be executed.
-        //
-        // but the consequence is contradicted by the next paragraph,
-        //
-        // > Processors allow modification of this state, as well as execution of x87 FPU
-        // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and
-        // > XCR0.
-        //
-        // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well.
-        sregs.cr4 |= 1 << OSXSAVE;
+    regs.rbx = regs.rip; // `[ebx]` then reads at rip (presumably the loaded program)
+    let rip_before = regs.rip;
 
-        // SSE3, SSSE3, and SSE4 involve a bit extra:
-        // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor
-        // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1
-        sregs.cr0 &= !(1 << TS);
+    vm.set_regs(&regs).expect("can set regs");
 
-        // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating
-        // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM).
-        //
-        // this is somewhat better than just getting an uncategorized #UD.
-        sregs.cr4 |= 1 << OSXMMEXCPT;
+    vm.set_single_step(true).expect("can set single-step");
 
-        assert!(xcrs.nr_xcrs > 0);
-        assert_eq!(xcrs.xcrs[0].xcr, 0);
+    let res = vm.run().expect("can run vm");
 
-        let mut needs_xsave = false;
-        if self.cpuid_supports(Feature::StateSSE) {
-            self.cpuid_set(Feature::StateSSE, true);
-            xcrs.xcrs[0].value |= 1;
-            xcrs.xcrs[0].value |= XCR0_SSE;
-            needs_xsave = true;
-        }
-        if self.cpuid_supports(Feature::StateAVX) {
-            self.cpuid_set(Feature::StateAVX, true);
-            xcrs.xcrs[0].value |= XCR0_AVX;
-            needs_xsave = true;
+    let expected_rip = rip_before + 4; // the single instruction is 4 bytes long
+    eprintln!("exit: {:?}", res);
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            assert_eq!(expected_rip, rip_after);
         }
-        if self.cpuid_supports(Feature::StateAVX512) {
-            self.cpuid_set(Feature::StateAVX512, true);
-            xcrs.xcrs[0].value |= XCR0_AVX512;
-            needs_xsave = true;
+        other => {
+            panic!("unexpected exit: {:?}", other);
         }
+    };
+}
 
-        if needs_xsave {
-            if self.cpuid_supports(Feature::XSave) {
-                self.cpuid_set(Feature::XSave, true);
-            } else {
-                panic!(
-                    "look, there's no CPU that supports SSE but not xsave. \
-                    i only checked to be thorough.");
-            }
-        }
+#[test]
+fn test_evex_vandps_runs() {
+    let mut vm = Vm::create(128 * 1024).expect("can create vm");
+    // this test single-steps one EVEX-encoded `vandps`, so the host must support AVX512.
+    if !vm.cpuid_supports(Feature::StateAVX512) {
+        panic!("host CPU does not support AVX512");
     }
 
-    fn configure_syscalls(&mut self, vcpu_sregs: &mut kvm_sregs) {
-        assert!(self.cpuid_supports(Feature::Syscall));
-        self.cpuid_set(Feature::Syscall, true);
+    let mut regs = vm.get_regs().expect("can get regs");
 
-        // > System-Call Extension (SCE) Bit.
-        vcpu_sregs.efer |= 0x0000_0001;
+    vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs);
 
-        let msrs = Msrs::from_entries(&[
-            kvm_msr_entry {
-                // LSTAR (C000_0082h)
-                index: 0xc000_0082,
-                data: self.syscall_addr().0,
-                reserved: 0,
-            },
-            kvm_msr_entry {
-                // CSTAR (C000_0083h)
-                index: 0xc000_0083,
-                data: self.syscall_addr().0,
-                reserved: 0,
-            }
-        ]).unwrap();
-        self.set_msrs(&msrs).unwrap();
+    regs.rdx = regs.rip; // base of the instruction's `[rdx]` memory operand (modrm 0x0a)
+    let rip_before = regs.rip;
 
-        // fill the syscall landing area with hlt to trap out immediately.
-        self.mem_slice_mut(self.syscall_addr(), 16).fill(0xf4);
+    vm.set_regs(&regs).expect("can set regs");
 
-        self.syscall_configured = true;
+    vm.set_single_step(true).expect("can set single-step");
+
+    let res = vm.run().expect("can run vm");
+    // the program is one 6-byte instruction: `vandps ymm1{k5}{z}, ymm0, dword [rdx]{1to8}`.
+    let expected_rip = rip_before + 6;
+    eprintln!("exit: {:?}", res);
+    match res {
+        VcpuExit::Debug { pc: rip_after, .. } => {
+            assert_eq!(expected_rip, rip_after);
+        }
+        other => {
+            panic!("unexpected exit: {:?}", other);
+        }
+    };
+}
+
+
+// reproducer for a Linux KVM bug; see the notes on `inst` in the body. this function will sit
+// and loop in the kernel after trying to fulfill the MMIO exit. not great! don't do that! it's
+// responsive to EINTR at least. kept out of the test suite on purpose: it never returns normally.
+// #[test]
+#[allow(dead_code)]
+fn kvm_hugepage_bug() {
+    let mut vm = Vm::create(1024 * 1024).expect("can create vm");
+    vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region");
+    unsafe {
+        vm.configure_identity_paging(None);
+    }
+
+    // `add [rsp], al; add [rcx], al; pop [rcx]; hlt`
+    // the first instruction runs fine. the second instruction runs fine.
+    // the third instruction gets a page fault at 0xf800? which worked fine for the add.
+    // this turns out to be an issue in Linux's paging64_gva_to_gpa() when the va is mapped with
+    // huge pages.
+    let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4];
+    let mut regs = vm.get_regs().unwrap();
+    regs.rax = 0x00000002_00100000;
+    regs.rcx = 0x00000002_00100000;
+    vm.program(inst, &mut regs);
+    vm.set_regs(&regs).unwrap();
+    vm.set_single_step(true).expect("can enable single-step");
+    vm.run().expect("can run vm");
+
+    let vm_regs = vm.get_regs().unwrap();
+    let vm_sregs = vm.get_sregs().unwrap();
+    let mut prev_rip = [0u8; 8];
+    vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); // printed as `rip` below
+    let mut buf = [0u8; 8];
+    vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); // printed as `error code` below
+    eprintln!(
+        "error code: {:#010x} accessing {:016x} @ rip={:#018x} (cr3={:016x})",
+        u64::from_le_bytes(buf), vm_sregs.cr2,
+        u64::from_le_bytes(prev_rip), vm_sregs.cr3
+    );
+    if vm_regs.rip == 0x300f { // presumably just past the #PF (vector 14) `hlt` handler
+        let mut pdpt = [0u8; 4096];
+        vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]);
+        eprintln!("pdpt: {:x?}", &pdpt[..8]);
+    }
+    panic!("no");
 }
-- 
cgit v1.1