From 3a006f5b596da90b876d320b8d48f278b88a5ec1 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sun, 24 May 2026 00:49:32 +0000 Subject: move tests out to the bottom --- src/x86_64.rs | 2883 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 1442 insertions(+), 1441 deletions(-) (limited to 'src') diff --git a/src/x86_64.rs b/src/x86_64.rs index ff23b34..0e08446 100644 --- a/src/x86_64.rs +++ b/src/x86_64.rs @@ -437,1722 +437,1723 @@ fn test_check_range_exact() { assert!(mapping.check_range(GuestAddress(0x4000), 0x1000)); } -#[test] -fn test_xor_runs() { - let mut vm = Vm::create(128 * 1024).expect("can create vm"); - let mut regs = vm.get_regs().expect("can get regs"); +/// a selector for the execution mode the VM should be initialized to. +/// +/// different `IsaMode` will configure the VM wildly differently; generally any VM/vCPU state not +/// directly required for the requested mode will be left untouched. +/// +/// in all modes, CPUID leaves and xcr0 are set up to support any ISA extensions supported by the +/// host CPU. +/// +/// in all modes, an IDT is installed with interrupt handlers pointed to the 256 bytes from +/// `interrupt_handlers_start()`. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum IsaMode { + /// request that the VM be configured to run x86-64 instructions, aka "AMD64", or "IA-32e" (and + /// specifically "IA-32e 64-bit mode") in some Intel nomenclature. + /// + /// this configures identity paging, selectors sufficient for long mode (with all vCPU + /// execution at CPL=0), prepares some MSRs for syscalls, and of course configures `cr0` + /// for long mode. + Long, + /// request that the VM be configured to run 32-bit instructions, with long mode neither + /// enabled nor active. + /// + /// this configures identity paging and selectors covering all 32-bit address space and with + /// CPL=0. + Protected, + /// request that the VM be configured to run 16-bit instructions. + /// + /// this configures code/data selectors covering all 24 bits of address space and an interrupt + /// descriptor table, and CPUID for any host-supported ISA extensions, but that's about it. + Real, +} - vm.program(&[0x33, 0xc0], &mut regs); +/// the settings to configure a [`Vm::create_by_settings`]. see `VmSettings::new` for top-level +/// configuration. +pub struct VmSettings { + mem_size: usize, + isa_mode: IsaMode, +} - regs.rax = 0x1234; - let rip_before = regs.rip; +impl VmSettings { + /// provide the bare-minimum configuration for a VM: the size of its memory and what execution + /// mode the resulting VM should be set for. + /// + /// VM control settings (IDT, `cs`, `ds`, other selectors, syscalls, page tables, etc) vary + /// substantially across different `IsaMode`. in all cases code can be written into the VM with + /// [`Vm::program()`], then run with [`Vm::run()`]. + pub fn new(mem_size: usize, isa_mode: IsaMode) -> Self { + Self { mem_size, isa_mode } + } +} - vm.set_regs(®s).expect("can set regs"); +impl Vm { + pub fn create(mem_size: usize) -> Result { + Self::create_by_settings(VmSettings::new(mem_size, IsaMode::Long)) + } - vm.set_single_step(true).expect("can set single-step"); + pub fn create_by_settings(settings: VmSettings) -> Result { + let kvm = Kvm::new() + .map_err(|e| VmError::from_kvm("Kvm::new()", e))?; - let res = vm.run().expect("can run vm"); + let vm = kvm.create_vm() + .map_err(|e| VmError::from_kvm("craete_vm", e))?; - let expected_rip = rip_before + 2; - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); + let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); + + // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do + if settings.mem_size < 128 * 1024 { + return Err(VmCreateError::TooSmall { + requested: settings.mem_size, + required: 128 * 1024, + }); } - }; - let regs_after = vm.get_regs().expect("can get regs"); - assert_eq!(regs_after.rax, 0); -} + let mapping = Mapping::create_shared(0, settings.mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?; -#[test] -fn test_protected_mode_runs() { - let settings = VmSettings::new(128 * 1024, IsaMode::Protected); - let mut vm = Vm::create_by_settings(settings).expect("can create vm"); - let mut regs = vm.get_regs().expect("can get regs"); + let region = kvm_userspace_memory_region { + slot: 0, + guest_phys_addr: 0x0000, + memory_size: mapping.size.get() as u64, + userspace_addr: mapping.addr.as_ptr() as u64, + flags: 0, + }; - let buf = &[ - 0xc5, 0xe0, 0x54, 0xc3, // vandps xmm0, xmm3, xmm3 - 0x33, 0xc0, // xor eax, eax - 0x8b, 0x09, // mov ecx, [ecx] - 0xf4 // hlt - ]; - vm.program(buf, &mut regs); + let set_res = unsafe { vm.set_user_memory_region(region) }; + set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; - regs.rax = 0x1234; - regs.rcx = 0x4; + let vcpu_res = vm.create_vcpu(0); + let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; - vm.set_regs(®s).expect("can set regs"); + let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap(); - let res = vm.run().expect("can run vm"); + let mem_ceiling = mapping.size.get().try_into().unwrap(); - match res { - VcpuExit::Hlt => { - // expected exit from the `0xf4` above. - } - other => { - panic!("unexpected exit: {:?}", other); - } - }; + let mut this = Vm { + settings, + vm, + vcpu, + supported_cpuid, + current_cpuid, + idt_configured: false, + syscall_configured: false, + memory: mapping, + aux_memories: Vec::new(), + mem_ceiling, + }; - let regs_after = vm.get_regs().expect("can get regs"); - assert_eq!(regs_after.rax, 0); - assert_eq!(regs_after.rcx, 0); -} + let mut vcpu_regs = this.get_regs()?; + let mut vcpu_sregs = this.get_sregs()?; -#[test] -fn test_pusha_runs() { - let settings = VmSettings::new(128 * 1024, IsaMode::Real); - let mut vm = Vm::create_by_settings(settings).expect("can create vm"); - let mut regs = vm.get_regs().expect("can get regs"); + assert!(this.cpuid_supports(Feature::Base)); + this.cpuid_set(Feature::Base, true); - vm.program(&[0x60], &mut regs); + match this.settings.isa_mode { + IsaMode::Long => { + unsafe { + this.configure_identity_paging(Some(&mut vcpu_sregs)); + this.configure_selectors(&mut vcpu_sregs); + this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); + let mut xcrs = this.get_xcrs()?; + this.configure_extensions(&mut vcpu_sregs, &mut xcrs); + this.set_xcrs(&xcrs)?; + this.configure_syscalls(&mut vcpu_sregs); + } - regs.rip = 0; - regs.rax = 0x1234; - eprintln!("{:?}", regs); + vcpu_sregs.efer |= 0x0000_0500; // LME | LMA + } + IsaMode::Protected => { + unsafe { + this.configure_identity_paging_32b(Some(&mut vcpu_sregs)); + this.configure_selectors_32b(&mut vcpu_sregs); + this.configure_idt_32b(&mut vcpu_regs, &mut vcpu_sregs); + let mut xcrs = this.get_xcrs()?; + this.configure_extensions(&mut vcpu_sregs, &mut xcrs); + this.set_xcrs(&xcrs)?; - vm.set_regs(®s).expect("can set regs"); + } + } + IsaMode::Real => { + unsafe { + this.configure_selectors_16b(&mut vcpu_sregs); + this.configure_idt_16b(&mut vcpu_regs, &mut vcpu_sregs); + let mut xcrs = this.get_xcrs()?; + this.configure_extensions(&mut vcpu_sregs, &mut xcrs); + this.set_xcrs(&xcrs)?; - vm.set_single_step(true).expect("can set single-step"); - let expected_rip = vm.code_addr().0 + 1; + // in 16-bit mode we've set cs and ds to cover the last 4kb of memory, starting + // at the same place we've written code to execute. there's not much memory to + // go around, and not a ton of flexibility in the asmlinator API, so uh ... the + // least annoying thing to do might be to just put the stack 0x80 bytes from + // the end? + vcpu_regs.rsp = 0x1000 - 0x80; + } + } + } - let res = vm.run().expect("can run vm"); + this.set_regs(&vcpu_regs)?; + this.set_sregs(&vcpu_sregs)?; - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - eprintln!("rip after: {:08x}", rip_after); - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); + Ok(this) + } + + /// map and add a region of size `size` at guest-physical address `gpa`. + /// + /// this will not update page tables, so if the newly-added memory is not already mapped due to + /// a previous `configure_identity_paging` call and it is not mapped due to explicit page table + /// management, it will not yet be accessible by guest code. + pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> { + let new_mapping_end = gpa.0.checked_add(size) + .map(|addr| GuestAddress(addr)) + .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; + if self.memory.overlaps(gpa, new_mapping_end) { + return Err(VmError::InvalidMapping { base: gpa, size }); + } else { + for mapping in self.aux_memories.iter() { + if mapping.overlaps(gpa, new_mapping_end) { + return Err(VmError::InvalidMapping { base: gpa, size }); + } + } } - }; - let regs_after = vm.get_regs().expect("can get regs"); - assert_eq!(regs_after.rax, 0x1234); - assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 2)); + let mapping = Mapping::create_shared( + u64_to_usize(gpa.0), + u64_to_usize(size), + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE + )?; - let mut regs = vm.get_regs().expect("can get regs"); + let used_slots: u32 = self.aux_memories.len().try_into() + .map_err(|_| VmError::InvalidMapping { base: gpa, size })?; + let next_slot = used_slots.checked_add(1) + .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; - vm.program(&[0x66, 0x60], &mut regs); + let region = kvm_userspace_memory_region { + slot: next_slot, + guest_phys_addr: gpa.0, + memory_size: mapping.size.get() as u64, + userspace_addr: mapping.addr.as_ptr() as u64, + flags: 0, + }; - regs.rip = 0; - regs.rax = 0x1234; - regs.rsp = 0x1000 - 0x80; - eprintln!("{:?}", regs); + let set_res = unsafe { self.vm.set_user_memory_region(region) }; + set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; - vm.set_regs(®s).expect("can set regs"); + self.aux_memories.push(mapping); - vm.set_single_step(true).expect("can set single-step"); - let expected_rip = vm.code_addr().0 + 2; - - let res = vm.run().expect("can run vm"); - - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - eprintln!("rip after: {:08x}", rip_after); - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); + if new_mapping_end.0 > self.mem_ceiling { + self.mem_ceiling = new_mapping_end.0; } - }; - - let regs_after = vm.get_regs().expect("can get regs"); - assert_eq!(regs_after.rax, 0x1234); - assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 4)); -} - -#[test] -fn test_syscall() { - let mut vm = Vm::create(128 * 1024).expect("can create vm"); - let mut regs = vm.get_regs().expect("can get regs"); - vm.program(&[0x0f, 0x05], &mut regs); - eprintln!("rip before: {:08x}", regs.rip); - - vm.set_regs(®s).expect("can set regs"); - -// vm.set_single_step(true).expect("can set single-step"); + Ok(()) + } - let res = vm.run().expect("can run vm"); - match res { - VcpuExit::Syscall => { /* expected */ } - VcpuExit::Debug { pc, .. } => { - if pc == vm.syscall_addr().0 { - panic!( - "VM exited at syscall target. \ - syscall hlt stub not executed. \ - is the VM being single-stepped?" - ); - } - panic!("unexpected debug exit at rip={:08x}", pc); - } - other => { - panic!("unexpected exit: {:?}", other); - } - }; + pub fn get_regs(&self) -> Result { + self.vcpu.get_regs() + .map_err(|e| VmError::from_kvm("get_regs", e)) + } - let regs_after = vm.get_regs().expect("can get regs"); + pub fn get_sregs(&self) -> Result { + self.vcpu.get_sregs() + .map_err(|e| VmError::from_kvm("get_sregs", e)) + } - let expected_rip = vm.syscall_addr().0 + 1; - assert_eq!(expected_rip, regs_after.rip); -} + pub fn get_xcrs(&self) -> Result { + self.vcpu.get_xcrs() + .map_err(|e| VmError::from_kvm("get_xcrs", e)) + } -#[test] -fn test_xorps_runs() { - let mut vm = Vm::create(128 * 1024).expect("can create vm"); - let mut regs = vm.get_regs().expect("can get regs"); + pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { + self.vcpu.set_regs(regs) + .map_err(|e| VmError::from_kvm("set_regs", e)) + } - vm.program(&[0x0f, 0x57, 0xc0], &mut regs); + pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> { + self.vcpu.set_sregs(sregs) + .map_err(|e| VmError::from_kvm("set_sregs", e)) + } - let rip_before = regs.rip; + pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> { + self.vcpu.set_xcrs(xcrs) + .map_err(|e| VmError::from_kvm("set_xcrs", e)) + } - vm.set_regs(®s).expect("can set regs"); + pub fn set_msrs(&self, msrs: &Msrs) -> Result<(), VmError> { + let n_set = self.vcpu.set_msrs(msrs) + .map_err(|e| VmError::from_kvm("set_msrs", e))?; + assert_eq!(msrs.as_slice().len(), n_set); + Ok(()) + } - vm.set_single_step(true).expect("can set single-step"); + pub fn idt_configured(&self) -> bool { + self.idt_configured + } - let res = vm.run().expect("can run vm"); + pub fn syscall_configured(&self) -> bool { + self.syscall_configured + } - let expected_rip = rip_before + 3; - eprintln!("exit: {:?}", res); - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); - } - }; -} + // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the + // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step + // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell! + pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> { + let mut guest_debug = kvm_guest_debug::default(); -#[test] -fn test_vex_vandps_runs() { - let mut vm = Vm::create(128 * 1024).expect("can create vm"); + if active { + guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP + }; - if !vm.cpuid_supports(Feature::StateAVX) { - panic!("host CPU does not support AVX"); + self.vcpu.set_guest_debug(&guest_debug) + .map_err(|e| VmError::from_kvm("set_guest_debug", e)) } - let mut regs = vm.get_regs().expect("can get regs"); + pub fn run<'vm>(&'vm mut self) -> Result, VmError> { + let exit = self.vcpu.run() + .map_err(|e| VmError::from_kvm("vcpu run", e))?; - vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); + match exit { + kvm_ioctls::VcpuExit::MmioRead(addr, buf) => { + // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above. + // this means it's a shorter lifetime than `'vm`, but since the resulting lifetime + // is also `'vm` it *really* has the effect of disallowing any subsequent use of + // `self`. these transmutes decouple the lifetime of `exit` from the lifetime of + // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can + // drop `exit()` and query the vcpu. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::MmioRead { buf, addr }); + } + kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::MmioWrite { buf, addr }); + } + kvm_ioctls::VcpuExit::IoIn(port, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::IoIn { port, buf }); + } + kvm_ioctls::VcpuExit::IoOut(port, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::IoOut { port, buf }); + } + kvm_ioctls::VcpuExit::Debug(info) => { + let pc = info.pc; + return Ok(VcpuExit::Debug { pc, info }); + } + kvm_ioctls::VcpuExit::Hlt => { + let regs = self.get_regs()?; - regs.rbx = regs.rip; - let rip_before = regs.rip; + if self.idt_configured { + let intrs_start = self.interrupt_handlers_start().0; + let intrs_end = intrs_start + IDT_ENTRIES as u64; + // by the time we've exited the `hlt` of the interrupt handler has completed, + // so rip is advanced by one. subtract back out to convert to an exception + // vector number. + let intr_addr = regs.rip - 1; - vm.set_regs(®s).expect("can set regs"); + if intr_addr >= intrs_start && intr_addr < intrs_end { + let nr = intr_addr - intrs_start; + // because IDT_ENTRIES is 256, this should always be true.. + assert!(nr < 256); + let nr = nr as u8; - vm.set_single_step(true).expect("can set single-step"); + return Ok(VcpuExit::Exception { nr }); + } + } - let res = vm.run().expect("can run vm"); + if self.syscall_configured { + // the behavior of `syscall`, `hlt`, and `rip` is a little funky. similar to + // interrupt handlers, we typically exit with rip pointed immediately after + // `syscall_addr()` because we would syscall to `hlt`, execute the first `hlt`, + // advance `rip` by one byte, and exit to userland for the HLT. + if regs.rip == self.syscall_addr().0 + 1{ + return Ok(VcpuExit::Syscall); + } + } - let expected_rip = rip_before + 4; - eprintln!("exit: {:?}", res); - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); + Ok(VcpuExit::Hlt) + } + kvm_ioctls::VcpuExit::Shutdown => { + return Ok(VcpuExit::Shutdown); + } + other => { + panic!("unhandled VcpuExit kind: {other:?}"); + } } - }; -} + } -#[test] -fn test_vex_vandps_runs_32b() { - let settings = VmSettings::new(128 * 1024, IsaMode::Protected); - let mut vm = Vm::create_by_settings(settings).expect("can create vm"); + /// get a pointer to host memory mapped to guest address `address`. + /// + /// panics if `address` is not a guest-physical address backed by host memory. + pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 { + let mapping = self.map_containing(address, 0) + .expect("mapping for address exists"); - if !vm.cpuid_supports(Feature::StateAVX) { - panic!("host CPU does not support AVX"); + unsafe { + mapping.host_ptr(address) + } } - let mut regs = vm.get_regs().expect("can get regs"); - - vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); + pub fn gdt_addr(&self) -> GuestAddress { + GuestAddress(0x1000) + } - regs.rbx = regs.rip; - let rip_before = regs.rip; + pub fn idt_addr(&self) -> GuestAddress { + GuestAddress(0x2000) + } - vm.set_regs(®s).expect("can set regs"); + pub fn interrupt_handlers_start(&self) -> GuestAddress { + GuestAddress(0x3000) + } - vm.set_single_step(true).expect("can set single-step"); + pub fn syscall_addr(&self) -> GuestAddress { + GuestAddress(0x4000) + } - let res = vm.run().expect("can run vm"); - - let expected_rip = rip_before + 4; - eprintln!("exit: {:?}", res); - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); - } - }; -} + pub fn page_table_addr(&self) -> GuestAddress { + GuestAddress(0x10000) + } -#[test] -fn test_evex_vandps_runs() { - let mut vm = Vm::create(128 * 1024).expect("can create vm"); + pub fn code_addr(&self) -> GuestAddress { + GuestAddress(self.memory.size.get() as u64 - 4096) + } - if !vm.cpuid_supports(Feature::StateAVX512) { - panic!("host CPU does not support AVX512"); + pub fn mem_ceiling(&self) -> u64 { + self.mem_ceiling } - let mut regs = vm.get_regs().expect("can get regs"); + /// configuring the IDT implies the IDT might be used which means we want a stack pointer + /// that can have at least 0x18 bytes pushed to it if an interrupt happens. + pub fn stack_addr(&self) -> GuestAddress { + // it would be nice to point the stack somewhere that we could get MMIO exits and see the + // processor push words for the interrupt in real time, but that doesn't ... work. + // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of, + // thankfully). + // + // so this picks some guest memory lower down. - vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs); + // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and + // clobber the page tables. so leave a bit of space. + GuestAddress(0x19800) + } - regs.rbx = regs.rip; - let rip_before = regs.rip; + /// selector 0x10 is chosen arbitrarily for code. + pub fn selector_cs(&self) -> u16 { + 0x10 + } - vm.set_regs(®s).expect("can set regs"); + /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc). + pub fn selector_ds(&self) -> u16 { + 0x18 + } - vm.set_single_step(true).expect("can set single-step"); + /// selector 0x20 is chosen arbitrarily for 16-bit interrupts, which are placed well away from + /// where selector 0x10 is pointed in real mode. + pub fn selector_cs_idt_16b(&self) -> u16 { + 0x20 + } - let res = vm.run().expect("can run vm"); + fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> { + let mapping = if self.memory.contains(base) { + &mut self.memory + } else { + self.aux_memories.iter_mut() + .find(|map| map.contains(base))? + }; - let expected_rip = rip_before + 6; - eprintln!("exit: {:?}", res); - match res { - VcpuExit::Debug { pc: rip_after, .. } => { - assert_eq!(expected_rip, rip_after); - } - other => { - panic!("unexpected exit: {:?}", other); + if !mapping.check_range(base, size) { + return None; } - }; -} - -// this function will sit and loop in the kernel after trying to fulfill the MMIO exit. -// -// not great! don't do that! it's responsive to EINTR at least. -// #[test] -#[allow(dead_code)] -fn kvm_hugepage_bug() { - let mut vm = Vm::create(1024 * 1024).expect("can create vm"); - vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region"); - unsafe { - vm.configure_identity_paging(None); + Some(mapping) } - // `add [rsp], al; add [rcx], al; pop [rcx]; hlt` - // the first instruction runs fine. the second instruction runs fine. - // the third instruction gets a page fault at 0xf800? which worked fine for the add. - // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with - // huge pages. - let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4]; - let mut regs = vm.get_regs().unwrap(); - regs.rax = 0x00000002_00100000; - regs.rcx = 0x00000002_00100000; - vm.program(inst, &mut regs); - vm.set_regs(®s).unwrap(); - vm.set_single_step(true).expect("can enable single-step"); - vm.run().expect("can run vm"); + fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> { + let mapping = if self.memory.contains(base) { + &self.memory + } else { + self.aux_memories.iter() + .find(|map| map.contains(base))? + }; - let vm_regs = vm.get_regs().unwrap(); - let vm_sregs = vm.get_sregs().unwrap(); - let mut prev_rip = [0u8; 8]; - vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); - let mut buf = [0u8; 8]; - vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); - eprintln!( - "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})", - u64::from_le_bytes(buf), vm_sregs.cr2, - u64::from_le_bytes(prev_rip), vm_sregs.cr3 - ); - if vm_regs.rip == 0x300f { - let mut pdpt = [0u8; 4096]; - vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]); - eprintln!("pdpt: {:x?}", &pdpt[..8]); + if !mapping.check_range(base, size) { + return None; + } + + Some(mapping) } - panic!("no"); -} -/// a selector for the execution mode the VM should be initialized to. -/// -/// different `IsaMode` will configure the VM wildly differently; generally any VM/vCPU state not -/// directly required for the requested mode will be left untouched. -/// -/// in all modes, CPUID leaves and xcr0 are set up to support any ISA extensions supported by the -/// host CPU. -/// -/// in all modes, an IDT is installed with interrupt handlers pointed to the 256 bytes from -/// `interrupt_handlers_start()`. -#[derive(Copy, Clone, Debug, PartialEq)] -pub enum IsaMode { - /// request that the VM be configured to run x86-64 instructions, aka "AMD64", or "IA-32e" (and - /// specifically "IA-32e 64-bit mode") in some Intel nomenclature. - /// - /// this configures identity paging, selectors sufficient for long mode (with all vCPU - /// execution at CPL=0), prepares some MSRs for syscalls, and of course configures `cr0` - /// for long mode. - Long, - /// request that the VM be configured to run 32-bit instructions, with long mode neither - /// enabled nor active. - /// - /// this configures identity paging and selectors covering all 32-bit address space and with - /// CPL=0. - Protected, - /// request that the VM be configured to run 16-bit instructions. + /// write all of `data` into guest memory at guest-physical address `addr`. /// - /// this configures code/data selectors covering all 24 bits of address space and an interrupt - /// descriptor table, and CPUID for any host-supported ISA extensions, but that's about it. - Real, -} + /// panics if `data` extends beyond the end of guest memory. + pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) { + let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid"); -/// the settings to configure a [`Vm::create_by_settings`]. see `VmSettings::new` for top-level -/// configuration. -pub struct VmSettings { - mem_size: usize, - isa_mode: IsaMode, -} + // SAFETY: `check_range` above validates the range to copy, and... please do not + // provide a slice of guest memory as what the guest should be programmed for... + unsafe { + std::ptr::copy_nonoverlapping( + data.as_ptr(), + mapping.host_ptr(addr), + data.len() + ); + } + } -impl VmSettings { - /// provide the bare-minimum configuration for a VM: the size of its memory and what execution - /// mode the resulting VM should be set for. + /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`. /// - /// VM control settings (IDT, `cs`, `ds`, other selectors, syscalls, page tables, etc) vary - /// substantially across different `IsaMode`. in all cases code can be written into the VM with - /// [`Vm::program()`], then run with [`Vm::run()`]. - pub fn new(mem_size: usize, isa_mode: IsaMode) -> Self { - Self { mem_size, isa_mode } - } -} + /// panics if `addr + buf.len()` extends beyond the end of guest memory. + pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) { + let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid"); -impl Vm { - pub fn create(mem_size: usize) -> Result { - Self::create_by_settings(VmSettings::new(mem_size, IsaMode::Long)) + // SAFETY: `check_range` above validates the range to copy, and... please do not + // provide a slice of guest memory as what should be read into... + unsafe { + std::ptr::copy_nonoverlapping( + mapping.host_ptr(addr) as *const _, + buf.as_mut_ptr(), + buf.len() + ); + } } - pub fn create_by_settings(settings: VmSettings) -> Result { - let kvm = Kvm::new() - .map_err(|e| VmError::from_kvm("Kvm::new()", e))?; - - let vm = kvm.create_vm() - .map_err(|e| VmError::from_kvm("craete_vm", e))?; - - let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); + /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size + /// `size`. + /// + /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't + /// support returning a single slice of adjacent guest memory regions (yet?), sorry. + pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] { + let mapping = self.map_containing_mut(addr, size).expect("mapping is valid"); - // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do - if settings.mem_size < 128 * 1024 { - return Err(VmCreateError::TooSmall { - requested: settings.mem_size, - required: 128 * 1024, - }); + // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there + // is no other outstanding slice of guest memory. `map_containing` has already ensured that + // this mapping contains the whole range `[addr, addr + size)`. + unsafe { + mapping.slice_mut(addr, size) } + } - let mapping = Mapping::create_shared(0, settings.mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?; + /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size + /// `size`. + /// + /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't + /// support returning a single slice of adjacent guest memory regions (yet?), sorry. + pub fn mem_slice<'vm>(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] { + let mapping = self.map_containing(addr, size).expect("mapping is valid"); - let region = kvm_userspace_memory_region { - slot: 0, - guest_phys_addr: 0x0000, - memory_size: mapping.size.get() as u64, - userspace_addr: mapping.addr.as_ptr() as u64, - flags: 0, - }; + // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there + // is no other outstanding slice of guest memory. `map_containing` has already ensured that + // this mapping contains the whole range `[addr, addr + size)`. + unsafe { + mapping.slice(addr, size) + } + } - let set_res = unsafe { vm.set_user_memory_region(region) }; - set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; + /// write `code` into guest memory and set `regs.rip` to the address of that code. + /// + /// the chosen code address is [`Self::code_addr`]; this is the guest linear address the + /// provided code buffer is written to. + /// + /// if the VM is configured for `IsaMode::Long` or `IsaMode::Protected`, `rip` or `eip` is set + /// to this address as well. otherwise, the VM is configured for `IsaMode::Real` and `ip` is + /// set to `code_addr() & 0x0f` - in typical cases `ip` will be 0. + /// + pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) { + let addr = self.code_addr(); + self.write_mem(addr, code); - let vcpu_res = vm.create_vcpu(0); - let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; + if self.settings.isa_mode != IsaMode::Real { + regs.rip = addr.0; + } else { + regs.rip = addr.0 & 0x000f; + } + } - let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap(); + fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 { + // the GDT is set up at addresses 0..64k: + // + // > 3.5.1 Segment Descriptor Tables + // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A + // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte + // > descriptors. - let mem_ceiling = mapping.size.get().try_into().unwrap(); + assert!(idx < 4096 / 8); + let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8)); + let mapping = self.map_containing(addr, std::mem::size_of::() as u64).unwrap(); - let mut this = Vm { - settings, - vm, - vcpu, - supported_cpuid, - current_cpuid, - idt_configured: false, - syscall_configured: false, - memory: mapping, - aux_memories: Vec::new(), - mem_ceiling, - }; + // SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is + // still inside the allocation (`self.memory`). + unsafe { + mapping.host_ptr(addr) as *mut u64 + } + } - let mut vcpu_regs = this.get_regs()?; - let mut vcpu_sregs = this.get_sregs()?; + // note this returns a u32, but a long-mode IDT is four u32. the u32 this points at is the + // first of the four for the entry. + fn idt_entry_mut(&mut self, idx: u8) -> *mut u32 { + let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16)); + let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap(); - assert!(this.cpuid_supports(Feature::Base)); - this.cpuid_set(Feature::Base, true); + unsafe { + mapping.host_ptr(addr) as *mut u32 + } + } - match this.settings.isa_mode { - IsaMode::Long => { - unsafe { - this.configure_identity_paging(Some(&mut vcpu_sregs)); - this.configure_selectors(&mut vcpu_sregs); - this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); - let mut xcrs = this.get_xcrs()?; - this.configure_extensions(&mut vcpu_sregs, &mut xcrs); - this.set_xcrs(&xcrs)?; - this.configure_syscalls(&mut vcpu_sregs); - } + // note this returns a u32, but a legacy IDT is two u32. the u32 this points at is the + // first of the four for the entry. + fn idt_entry_legacy_mut(&mut self, idx: u8) -> *mut u32 { + let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 8)); + let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap(); - vcpu_sregs.efer |= 0x0000_0500; // LME | LMA - } - IsaMode::Protected => { - unsafe { - this.configure_identity_paging_32b(Some(&mut vcpu_sregs)); - this.configure_selectors_32b(&mut vcpu_sregs); - this.configure_idt_32b(&mut vcpu_regs, &mut vcpu_sregs); - let mut xcrs = this.get_xcrs()?; - this.configure_extensions(&mut vcpu_sregs, &mut xcrs); - this.set_xcrs(&xcrs)?; + unsafe { + mapping.host_ptr(addr) as *mut u32 + } + } - } - } - IsaMode::Real => { - unsafe { - this.configure_selectors_16b(&mut vcpu_sregs); - this.configure_idt_16b(&mut vcpu_regs, &mut vcpu_sregs); - let mut xcrs = this.get_xcrs()?; - this.configure_extensions(&mut vcpu_sregs, &mut xcrs); - this.set_xcrs(&xcrs)?; + pub fn page_tables(&self) -> VmPageTables<'_> { + let base = self.page_table_addr(); - // in 16-bit mode we've set cs and ds to cover the last 4kb of memory, starting - // at the same place we've written code to execute. there's not much memory to - // go around, and not a ton of flexibility in the asmlinator API, so uh ... the - // least annoying thing to do might be to just put the stack 0x80 bytes from - // the end? - vcpu_regs.rsp = 0x1000 - 0x80; + // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of + // address space. + assert!(self.map_containing(base, 0x2000).is_some()); + + VmPageTables { + vm: self, + base, + } + } + + // TODO: there should be a version of this that can be used to query "does this VM support + // these extensions" probably, and that should take a subset of `Feature` for the ones that are + // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable + // feature..) + fn cpuid_supports(&self, feature: Feature) -> bool { + fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool { + for mut entry in cpuid.as_slice() { + if entry.function == leaf && entry.index == index { + return f(&mut entry); } } - } - this.set_regs(&vcpu_regs)?; - this.set_sregs(&vcpu_sregs)?; + false + } - Ok(this) + match feature { + Feature::Base => { + let lm = find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { + leaf.edx & CPUID_80000001_EDX_LM != 0 + }); + let msr = find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { + leaf.edx & CPUID_00000001_EDX_MSR != 0 + }); + let clstac = find_leaf(&self.supported_cpuid, 0x0000_0007, 0, |leaf| { + leaf.ebx & CPUID_00000007_EBX_CLSTAC != 0 + }); + lm && msr && clstac + } + Feature::Syscall => { + find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { + leaf.edx & CPUID_80000001_EDX_SYSCALL != 0 + }) + } + Feature::XSave => { + find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { + leaf.edx & CPUID_00000001_ECX_XSAVE != 0 + }) + } + Feature::Pdpe1Gb => { + find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { + leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0 + }) + } + Feature::StateSSE => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE + }) + } + Feature::StateAVX => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX + }) + } + Feature::StateAVX512 => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512 + }) + } + Feature::Pse => { + find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { + leaf.edx & CPUID_00000001_EDX_PSE == CPUID_00000001_EDX_PSE + }) + } + } } - /// map and add a region of size `size` at guest-physical address `gpa`. + /// set `feature` to `wanted` in the VM's CPUID configuration. /// - /// this will not update page tables, so if the newly-added memory is not already mapped due to - /// a previous `configure_identity_paging` call and it is not mapped due to explicit page table - /// management, it will not yet be accessible by guest code. - pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> { - let new_mapping_end = gpa.0.checked_add(size) - .map(|addr| GuestAddress(addr)) - .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; - if self.memory.overlaps(gpa, new_mapping_end) { - return Err(VmError::InvalidMapping { base: gpa, size }); - } else { - for mapping in self.aux_memories.iter() { - if mapping.overlaps(gpa, new_mapping_end) { - return Err(VmError::InvalidMapping { base: gpa, size }); + /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not + /// available at all). use [`cpuid_supports`] to test if the feature can be configured. + fn cpuid_set(&mut self, feature: Feature, wanted: bool) { + fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) { + for mut entry in cpuid.as_mut_slice() { + if entry.function == leaf && entry.index == index { + f(&mut entry); + return; } } - } - - let mapping = Mapping::create_shared( - u64_to_usize(gpa.0), - u64_to_usize(size), - ProtFlags::PROT_READ | ProtFlags::PROT_WRITE - )?; - - let used_slots: u32 = self.aux_memories.len().try_into() - .map_err(|_| VmError::InvalidMapping { base: gpa, size })?; - let next_slot = used_slots.checked_add(1) - .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; - - let region = kvm_userspace_memory_region { - slot: next_slot, - guest_phys_addr: gpa.0, - memory_size: mapping.size.get() as u64, - userspace_addr: mapping.addr.as_ptr() as u64, - flags: 0, - }; - - let set_res = unsafe { self.vm.set_user_memory_region(region) }; - set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; - self.aux_memories.push(mapping); + // if we're here, the entry simply is not present (yet..?) + // + // so, create it. + let mut entry = kvm_cpuid_entry2 { + function: leaf, + index: index, + eax: 0, + ecx: 0, + edx: 0, + ebx: 0, + flags: 0, + padding: [0; 3], + }; + f(&mut entry); + cpuid.push(entry).expect("can push"); + } - if new_mapping_end.0 > self.mem_ceiling { - self.mem_ceiling = new_mapping_end.0; + fn bit_set(word: &mut u32, bit: u32, wanted: bool) { + *word &= !bit; + if wanted { + *word |= bit; + } } - Ok(()) - } + let mut edited = false; - pub fn get_regs(&self) -> Result { - self.vcpu.get_regs() - .map_err(|e| VmError::from_kvm("get_regs", e)) - } + match feature { + Feature::Base => { + edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_80000001_EDX_LM, wanted); + edited = true; + }); + edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_00000001_EDX_MSR, wanted); + edited = true; + }); + edit_leaf(&mut self.current_cpuid, 0x0000_0007, 0, |leaf| { + bit_set(&mut leaf.ebx, CPUID_00000007_EBX_CLSTAC, wanted); + edited = true; + }); + } + Feature::Syscall => { + edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_80000001_EDX_SYSCALL, wanted); + edited = true; + }); + } + Feature::XSave => { + edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { + bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted); + edited = true; + }); + }, + Feature::Pdpe1Gb => { + edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted); + edited = true; + }); + }, + Feature::StateSSE => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, 1, wanted); + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted); + edited = true; + }); + } + Feature::StateAVX => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted); + edited = true; + }); + } + Feature::StateAVX512 => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted); + edited = true; + }); + } + Feature::Pse => { + edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_00000001_EDX_PSE, wanted); + edited = true; + }); + } + } - pub fn get_sregs(&self) -> Result { - self.vcpu.get_sregs() - .map_err(|e| VmError::from_kvm("get_sregs", e)) - } + assert!(edited); - pub fn get_xcrs(&self) -> Result { - self.vcpu.get_xcrs() - .map_err(|e| VmError::from_kvm("get_xcrs", e)) + self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid"); } - pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { - self.vcpu.set_regs(regs) - .map_err(|e| VmError::from_kvm("set_regs", e)) - } + /// configure page tables for identity mapping of all memory from guest address zero up to the + /// end of added memory regions, rounded up to the next GiB. + /// + /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or + /// long-mode paging. this is a fixed pattern: if control registers have not been changed since + /// `Vm::create` then there will be no change to these control registers and `sregs` can be + /// omitted. + /// + /// panics if the end of added memory regions is above 512 GiB. + pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) { + // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each. + assert!(self.mem_ceiling() <= 512 * GB); - pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> { - self.vcpu.set_sregs(sregs) - .map_err(|e| VmError::from_kvm("set_sregs", e)) - } + assert!(self.cpuid_supports(Feature::Pdpe1Gb)); + self.cpuid_set(Feature::Pdpe1Gb, true); - pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> { - self.vcpu.set_xcrs(xcrs) - .map_err(|e| VmError::from_kvm("set_xcrs", e)) - } + let pt = self.page_tables(); - pub fn set_msrs(&self, msrs: &Msrs) -> Result<(), VmError> { - let n_set = self.vcpu.set_msrs(msrs) - .map_err(|e| VmError::from_kvm("set_msrs", e))?; - assert_eq!(msrs.as_slice().len(), n_set); - Ok(()) - } + let pml4_ent = + 1 << 0 | // P + 1 << 1 | // RW + 1 << 2 | // user access allowed. but no user code will run so not strictly needed. + 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 5 | // A + 0 << 6 | // ignored + 0 << 7 | // PS (reserved must-be-0) + 0 << 11 | // R (for ordinary paging, ignored; for HLAT ...) + pt.pdpt_addr().0; + unsafe { + pt.pml4_mut().write(pml4_ent); + } - pub fn idt_configured(&self) -> bool { - self.idt_configured - } + let mut mapped: u64 = 0; + // we've set up the first PML4 to point to a PDPT, so we should actually set it up! + let pdpt = pt.pdpt_mut(); + // PDPTEs start at the start of PDPT.. + let mut pdpte = pdpt; + let entry_bits: u64 = + 1 << 0 | // P + 1 << 1 | // RW + 1 << 2 | // user accesses allowed (everything is under privilege level 0 tho) + 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 5 | // Accessed + 0 << 6 | // Dirty + 1 << 7 | // Page size (1 implies 1G page) + 1 << 8 | // Global (if cr4.pge) + 0 << 9 | + 0 << 10 | + 0 << 11 | // for ordinary paging, ignored. for HLAT, ... + 0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?) - pub fn syscall_configured(&self) -> bool { - self.syscall_configured + while mapped < self.mem_ceiling() { + let phys_num = mapped >> 30; + let entry = entry_bits | (phys_num << 30); + unsafe { + pdpte.write(entry); + pdpte = pdpte.offset(1); + } + // eprintln!("mapped 1g at {:08x}", mapped); + mapped += 1 << 30; + } + + if let Some(sregs) = sregs { + sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG + sregs.cr3 = pt.pml4_addr().0 as u64; + sregs.cr4 |= 1 << 5; // enable PAE + } } - // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the - // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step - // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell! - pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> { - let mut guest_debug = kvm_guest_debug::default(); + /// configure page tables for identity mapping of all memory from guest address zero up to the + /// end of added memory regions, rounded up to the next 4MiB. + /// + /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode paging. + /// this is a fixed pattern: if control registers have not been changed since `Vm::create` then + /// there will be no change to these control registers and `sregs` can be omitted. + pub unsafe fn configure_identity_paging_32b(&mut self, sregs: Option<&mut kvm_sregs>) { + // because we'll set PDEs to map 4M pages and cr3 points at a page-aligned block of 1024 + // 4-byte PDEs, that gives us 4KiB of memory used to map 4GiB of address space. that's all + // of 32-bit, so we don't need to check an upper bound. - if active { - guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP - }; + assert!(self.cpuid_supports(Feature::Pse)); + self.cpuid_set(Feature::Pse, true); - self.vcpu.set_guest_debug(&guest_debug) - .map_err(|e| VmError::from_kvm("set_guest_debug", e)) - } + let pt = self.page_tables(); - pub fn run<'vm>(&'vm mut self) -> Result, VmError> { - let exit = self.vcpu.run() - .map_err(|e| VmError::from_kvm("vcpu run", e))?; + let mut mapped: u64 = 0; + // "pml4_mut" is really just the start of page table memory. we'll pun this in 32-bit with + // the knowledge it's really a block of PDEs. + let pd = pt.pml4_mut() as *mut u32; + let mut pde = pd; + let entry_bits: u32 = + 1 << 0 | // P + 1 << 1 | // RW + 1 << 2 | // user accesses allowed (everything is under privilege level 0 tho) + 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) + 0 << 5 | // Accessed + 0 << 6 | // Dirty + 1 << 7 | // Page size (1 implies 4M page) + 1 << 8 | // Global (if cr4.pge) + 0 << 9 | + 0 << 10 | + 0 << 11 | // for ordinary paging, ignored. for HLAT, ... + 0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?) - match exit { - kvm_ioctls::VcpuExit::MmioRead(addr, buf) => { - // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above. - // this means it's a shorter lifetime than `'vm`, but since the resulting lifetime - // is also `'vm` it *really* has the effect of disallowing any subsequent use of - // `self`. these transmutes decouple the lifetime of `exit` from the lifetime of - // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can - // drop `exit()` and query the vcpu. - // - // SAFETY: this actually extends the lifetime of `buf` from the shorter transient - // lifetime to `'vm` for the return type. - let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; - return Ok(VcpuExit::MmioRead { buf, addr }); - } - kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => { - // see the same transmute in `MmioRead` for why this is load-bearing. - // - // SAFETY: this actually extends the lifetime of `buf` from the shorter transient - // lifetime to `'vm` for the return type. - let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; - return Ok(VcpuExit::MmioWrite { buf, addr }); - } - kvm_ioctls::VcpuExit::IoIn(port, buf) => { - // see the same transmute in `MmioRead` for why this is load-bearing. - // - // SAFETY: this actually extends the lifetime of `buf` from the shorter transient - // lifetime to `'vm` for the return type. - let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; - return Ok(VcpuExit::IoIn { port, buf }); - } - kvm_ioctls::VcpuExit::IoOut(port, buf) => { - // see the same transmute in `MmioRead` for why this is load-bearing. - // - // SAFETY: this actually extends the lifetime of `buf` from the shorter transient - // lifetime to `'vm` for the return type. - let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; - return Ok(VcpuExit::IoOut { port, buf }); - } - kvm_ioctls::VcpuExit::Debug(info) => { - let pc = info.pc; - return Ok(VcpuExit::Debug { pc, info }); + while mapped < self.mem_ceiling() { + let phys_num = (mapped as u32) >> 22; + let entry = entry_bits | (phys_num << 22); + unsafe { + pde.write(entry); + pde = pde.offset(1); } - kvm_ioctls::VcpuExit::Hlt => { - let regs = self.get_regs()?; + mapped += 1 << 22; + } - if self.idt_configured { - let intrs_start = self.interrupt_handlers_start().0; - let intrs_end = intrs_start + IDT_ENTRIES as u64; - // by the time we've exited the `hlt` of the interrupt handler has completed, - // so rip is advanced by one. subtract back out to convert to an exception - // vector number. - let intr_addr = regs.rip - 1; + // page size extensions; collaborates with page tables' PS bit to make 4MiB pages in 32-bit + // mode. see SDM section 2.5 "CONTROL REGISTERS". + const PSE: u64 = 1 << 4; - if intr_addr >= intrs_start && intr_addr < intrs_end { - let nr = intr_addr - intrs_start; - // because IDT_ENTRIES is 256, this should always be true.. - assert!(nr < 256); - let nr = nr as u8; + if let Some(sregs) = sregs { + sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG + sregs.cr3 = pt.pml4_addr().0 as u64; + sregs.cr4 |= PSE; + } + } - return Ok(VcpuExit::Exception { nr }); - } - } + unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) { + // we have to set descriptor information directly. this avoids having to load selectors + // as the first instructions on the vCPU, which is simplifying. but if we want the + // information in these selectors to match with anything in a GDT (i do!) we'll have to + // keep this initial state lined up with GDT entries ourselves. + // + // we could avoid setting up the GDT for the most part, but anything that might + // legitimately load the "valid" current segment selector would instead clobber the + // selector with zeroes. - if self.syscall_configured { - // the behavior of `syscall`, `hlt`, and `rip` is a little funky. similar to - // interrupt handlers, we typically exit with rip pointed immediately after - // `syscall_addr()` because we would syscall to `hlt`, execute the first `hlt`, - // advance `rip` by one byte, and exit to userland for the HLT. - if regs.rip == self.syscall_addr().0 + 1{ - return Ok(VcpuExit::Syscall); - } - } + sregs.cs.base = 0; + sregs.cs.limit = 0; + sregs.cs.selector = self.selector_cs(); + sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.cs.present = 1; + sregs.cs.dpl = 0; + sregs.cs.db = 0; + sregs.cs.s = 1; + sregs.cs.l = 1; + sregs.cs.g = 0; + sregs.cs.avl = 0; - Ok(VcpuExit::Hlt) - } - kvm_ioctls::VcpuExit::Shutdown => { - return Ok(VcpuExit::Shutdown); - } - other => { - panic!("unhandled VcpuExit kind: {other:?}"); - } - } - } + sregs.ds.base = 0; + sregs.ds.limit = 0xffffffff; + sregs.ds.selector = self.selector_ds(); + sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.ds.present = 1; + sregs.ds.dpl = 0; + sregs.ds.db = 0; + sregs.ds.s = 1; + sregs.ds.l = 0; + sregs.ds.g = 0; + sregs.ds.avl = 0; - /// get a pointer to host memory mapped to guest address `address`. - /// - /// panics if `address` is not a guest-physical address backed by host memory. - pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 { - let mapping = self.map_containing(address, 0) - .expect("mapping for address exists"); + sregs.es = sregs.ds; + sregs.fs = sregs.ds; + sregs.gs = sregs.ds; + // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? + sregs.ss = sregs.ds; + + sregs.gdt.base = self.gdt_addr().0; + sregs.gdt.limit = 256 * 8 - 1; unsafe { - mapping.host_ptr(address) + self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); + self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); } } - pub fn gdt_addr(&self) -> GuestAddress { - GuestAddress(0x1000) - } - - pub fn idt_addr(&self) -> GuestAddress { - GuestAddress(0x2000) - } + /// configure selectors for 32-bit code exceution. this is basically the same as 64-bit, but we + /// set a limit and set `cs.db` so that the default operand size is a normal 32-bit. + unsafe fn configure_selectors_32b(&mut self, sregs: &mut kvm_sregs) { + // we have to set descriptor information directly. this avoids having to load selectors + // as the first instructions on the vCPU, which is simplifying. but if we want the + // information in these selectors to match with anything in a GDT (i do!) we'll have to + // keep this initial state lined up with GDT entries ourselves. + // + // we could avoid setting up the GDT for the most part, but anything that might + // legitimately load the "valid" current segment selector would instead clobber the + // selector with zeroes. - pub fn interrupt_handlers_start(&self) -> GuestAddress { - GuestAddress(0x3000) - } + sregs.cs.base = 0; + sregs.cs.limit = 0xffffffff; + sregs.cs.selector = self.selector_cs(); + sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.cs.present = 1; + sregs.cs.dpl = 0; + sregs.cs.db = 1; + sregs.cs.s = 1; + sregs.cs.l = 0; + sregs.cs.g = 1; + sregs.cs.avl = 0; - pub fn syscall_addr(&self) -> GuestAddress { - GuestAddress(0x4000) - } + sregs.ds.base = 0; + sregs.ds.limit = 0xffffffff; + sregs.ds.selector = self.selector_ds(); + sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.ds.present = 1; + sregs.ds.dpl = 0; + sregs.ds.db = 1; + sregs.ds.s = 1; + sregs.ds.l = 0; + sregs.ds.g = 1; + sregs.ds.avl = 0; - pub fn page_table_addr(&self) -> GuestAddress { - GuestAddress(0x10000) - } + sregs.es = sregs.ds; + sregs.fs = sregs.ds; + sregs.gs = sregs.ds; + // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? + sregs.ss = sregs.ds; - pub fn code_addr(&self) -> GuestAddress { - GuestAddress(self.memory.size.get() as u64 - 4096) - } + sregs.gdt.base = self.gdt_addr().0; + sregs.gdt.limit = 256 * 8 - 1; - pub fn mem_ceiling(&self) -> u64 { - self.mem_ceiling + unsafe { + self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); + self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); + } } - /// configuring the IDT implies the IDT might be used which means we want a stack pointer - /// that can have at least 0x18 bytes pushed to it if an interrupt happens. - pub fn stack_addr(&self) -> GuestAddress { - // it would be nice to point the stack somewhere that we could get MMIO exits and see the - // processor push words for the interrupt in real time, but that doesn't ... work. - // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of, - // thankfully). + /// configure selectors for 16-bit code exceution. + /// + /// unlike other modes, this sets `cs` to execute code at the linear address given by + /// [`Self::code_addr`]. `ds` is configured to overlap with `cs`. this way, when executing + /// 16-bit code the VM can simply be configured to `ip = 0`, and code addresses match data + /// addresses. additionally, clear `cs.db` so that the default operand size is 16-bit. + unsafe fn configure_selectors_16b(&mut self, sregs: &mut kvm_sregs) { + // we have to set descriptor information directly. this avoids having to load selectors + // as the first instructions on the vCPU, which is simplifying. but if we want the + // information in these selectors to match with anything in a GDT (i do!) we'll have to + // keep this initial state lined up with GDT entries ourselves. // - // so this picks some guest memory lower down. + // we could avoid setting up the GDT for the most part, but anything that might + // legitimately load the "valid" current segment selector would instead clobber the + // selector with zeroes. - // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and - // clobber the page tables. so leave a bit of space. - GuestAddress(0x19800) - } + sregs.cs.base = 0; + sregs.cs.limit = 0xfffff; + sregs.cs.selector = self.selector_cs(); + sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.cs.present = 1; + sregs.cs.dpl = 0; + sregs.cs.db = 0; + sregs.cs.s = 1; + sregs.cs.l = 0; + sregs.cs.g = 1; + sregs.cs.avl = 0; - /// selector 0x10 is chosen arbitrarily for code. - pub fn selector_cs(&self) -> u16 { - 0x10 - } + unsafe { + self.gdt_entry_mut(self.selector_cs_idt_16b() >> 3).write(encode_segment(&sregs.cs)); + } - /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc). - pub fn selector_ds(&self) -> u16 { - 0x18 - } + // and now adjust for the real cs for code execution to happen in.. + sregs.cs.base = self.code_addr().0; - /// selector 0x20 is chosen arbitrarily for 16-bit interrupts, which are placed well away from - /// where selector 0x10 is pointed in real mode. - pub fn selector_cs_idt_16b(&self) -> u16 { - 0x20 - } + sregs.ds.base = self.code_addr().0; + sregs.ds.limit = 0xfffff; + sregs.ds.selector = self.selector_ds(); + sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types + sregs.ds.present = 1; + sregs.ds.dpl = 0; + sregs.ds.db = 0; + sregs.ds.s = 1; + sregs.ds.l = 0; + sregs.ds.g = 1; + sregs.ds.avl = 0; - fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> { - let mapping = if self.memory.contains(base) { - &mut self.memory - } else { - self.aux_memories.iter_mut() - .find(|map| map.contains(base))? - }; + sregs.es = sregs.ds; + sregs.fs = sregs.ds; + sregs.gs = sregs.ds; + // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? + sregs.ss = sregs.ds; - if !mapping.check_range(base, size) { - return None; - } + sregs.gdt.base = self.gdt_addr().0; + sregs.gdt.limit = 256 * 8 - 1; - Some(mapping) + unsafe { + self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); + self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); + } } - fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> { - let mapping = if self.memory.contains(base) { - &self.memory - } else { - self.aux_memories.iter() - .find(|map| map.contains(base))? - }; - - if !mapping.check_range(base, size) { - return None; - } - - Some(mapping) - } + fn write_idt_entry( + &mut self, + intr_nr: u8, + interrupt_handler_cs: u16, + interrupt_handler_addr: GuestAddress + ) { + let idt_ptr = self.idt_entry_mut(intr_nr); - /// write all of `data` into guest memory at guest-physical address `addr`. - /// - /// panics if `data` extends beyond the end of guest memory. - pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) { - let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid"); + // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate" + // and "trap-gate" descriptors), are described (in the AMD APM) by + // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here: + // + // 3 2 1 | 1 0 + // 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + // |---------------------------------------------------------------| + // | res,ign | +12 + // | target offset[63:32] | +8 + // | target offset[31:16] |P|DPL|0| type | res,ign | IST | +4 + // | target selector | target offset[15:0] | +0 + // |---------------------------------------------------------------| + // + // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly + // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e + // works for now. + let idt_attr_bits = 0b1_00_0_1110_00000_000; + let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits; + let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff); - // SAFETY: `check_range` above validates the range to copy, and... please do not - // provide a slice of guest memory as what the guest should be programmed for... unsafe { - std::ptr::copy_nonoverlapping( - data.as_ptr(), - mapping.host_ptr(addr), - data.len() - ); + idt_ptr.offset(0).write(low_lo); + idt_ptr.offset(1).write(low_hi); + idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32); + idt_ptr.offset(3).write(0); // reserved } } - /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`. + /// 16-bit/32-bit IDT entries, described in the APM as /// - /// panics if `addr + buf.len()` extends beyond the end of guest memory. - pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) { - let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid"); + /// > Interrupt-Gate and Trap-Gate Descriptors—Legacy Mode + /// + /// have a different (smaller!) format. + fn write_idt_entry_legacy( + &mut self, + intr_nr: u8, + interrupt_handler_cs: u16, + interrupt_handler_addr: GuestAddress + ) { + assert!(interrupt_handler_addr.0 <= u32::MAX as u64); + let idt_ptr = self.idt_entry_legacy_mut(intr_nr); + + // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate" + // and "trap-gate" descriptors), are described (in the AMD APM) by + // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here: + // + // 3 2 1 | 1 0 + // 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + // |---------------------------------------------------------------| + // | target offset[31:16] |P|DPL|0| type | res,ign | IST | +4 + // | target selector | target offset[15:0] | +0 + // |---------------------------------------------------------------| + // + // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly + // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e + // works for now. + let idt_attr_bits = 0b1_00_0_1110_00000_000; + let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits; + let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff); - // SAFETY: `check_range` above validates the range to copy, and... please do not - // provide a slice of guest memory as what should be read into... unsafe { - std::ptr::copy_nonoverlapping( - mapping.host_ptr(addr) as *const _, - buf.as_mut_ptr(), - buf.len() - ); + idt_ptr.offset(0).write(low_lo); + idt_ptr.offset(1).write(low_hi); } } - /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size - /// `size`. - /// - /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't - /// support returning a single slice of adjacent guest memory regions (yet?), sorry. - pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] { - let mapping = self.map_containing_mut(addr, size).expect("mapping is valid"); + fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) { + sregs.idt.base = self.idt_addr().0; + sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each - // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there - // is no other outstanding slice of guest memory. `map_containing` has already ensured that - // this mapping contains the whole range `[addr, addr + size)`. - unsafe { - mapping.slice_mut(addr, size) + for i in 0..IDT_ENTRIES { + let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64); + self.write_idt_entry( + i.try_into().expect("(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] { - let mapping = self.map_containing(addr, size).expect("mapping is valid"); - // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there - // is no other outstanding slice of guest memory. `map_containing` has already ensured that - // this mapping contains the whole range `[addr, addr + size)`. + // all interrupt handlers are just `hlt`. their position is used to detect which + // exception/interrupt occurred. unsafe { - mapping.slice(addr, size) + std::slice::from_raw_parts_mut( + self.host_ptr(self.interrupt_handlers_start()), + IDT_ENTRIES as usize + ).fill(0xf4); } - } - - /// write `code` into guest memory and set `regs.rip` to the address of that code. - /// - /// the chosen code address is [`Self::code_addr`]; this is the guest linear address the - /// provided code buffer is written to. - /// - /// if the VM is configured for `IsaMode::Long` or `IsaMode::Protected`, `rip` or `eip` is set - /// to this address as well. otherwise, the VM is configured for `IsaMode::Real` and `ip` is - /// set to `code_addr() & 0x0f` - in typical cases `ip` will be 0. - /// - pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) { - let addr = self.code_addr(); - self.write_mem(addr, code); - if self.settings.isa_mode != IsaMode::Real { - regs.rip = addr.0; - } else { - regs.rip = addr.0 & 0x000f; - } + // finally, set `rsp` to a valid region so that the CPU can push necessary state (see + // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt + // handler. if we didn't do this, rsp will probably be zero or something, underflow, + // page fault on push to 0xffffffff_ffffffff, and just triple fault. + // + // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt + // descriptors could set something in IST to switch stacks outright for exception + // handling. this might be nice to test rsp permutations in 64-bit code? alternatively + // we might just have to limit possible rsp permutations so as to be able to test in + // 16- and 32-bit modes anyway. + regs.rsp = self.stack_addr().0; + self.idt_configured = true; } - fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 { - // the GDT is set up at addresses 0..64k: - // - // > 3.5.1 Segment Descriptor Tables - // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A - // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte - // > descriptors. + /// IDT configuration in 32-bit mode is funky because the interrupt handlers live in a totally + /// different region of memory and need a different value in `cs`. + fn configure_idt_32b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) { + sregs.idt.base = self.idt_addr().0; + sregs.idt.limit = IDT_ENTRIES * 8 - 1; // legacy IDT is 256 entries of 8 bytes each - assert!(idx < 4096 / 8); - let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8)); - let mapping = self.map_containing(addr, std::mem::size_of::() as u64).unwrap(); + for i in 0..IDT_ENTRIES { + let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64); + self.write_idt_entry_legacy( + i.try_into().expect(" *mut u32 { - let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16)); - let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap(); + /// IDT configuration in 16-bit mode is funky because the interrupt handlers live in a totally + /// different region of memory and need a different value in `cs`. + fn configure_idt_16b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) { + sregs.idt.base = self.idt_addr().0; + sregs.idt.limit = IDT_ENTRIES * 8 - 1; // IDT is 256 entries of 8 bytes each - unsafe { - mapping.host_ptr(addr) as *mut u32 + for i in 0..IDT_ENTRIES { + let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64); + self.write_idt_entry_legacy( + i.try_into().expect(" *mut u32 { - let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 8)); - let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap(); + // all interrupt handlers are just `hlt`. their position is used to detect which + // exception/interrupt occurred. unsafe { - mapping.host_ptr(addr) as *mut u32 + std::slice::from_raw_parts_mut( + self.host_ptr(self.interrupt_handlers_start()), + IDT_ENTRIES as usize + ).fill(0xf4); } - } - pub fn page_tables(&self) -> VmPageTables<'_> { - let base = self.page_table_addr(); + // finally, set `rsp` to a valid region so that the CPU can push necessary state (see + // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt + // handler. if we didn't do this, rsp will probably be zero or something, underflow, + // page fault on push to 0xffffffff_ffffffff, and just triple fault. + // + // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt + // descriptors could set something in IST to switch stacks outright for exception + // handling. this might be nice to test rsp permutations in 64-bit code? alternatively + // we might just have to limit possible rsp permutations so as to be able to test in + // 16- and 32-bit modes anyway. + regs.rsp = self.stack_addr().0; + self.idt_configured = true; + } - // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of - // address space. - assert!(self.map_containing(base, 0x2000).is_some()); + /// configure the vCPU for executing instructions in the hardware-supported extensions. + /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed, + /// unless additional configuration is done (as this function does). + /// + /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point + /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all? + /// + /// this function configures the vCPU to be ready to execute `SSE*` instructions. + fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) { + // these bit positions in control registers, and their behaviors, are described more + // comprehensively in Voluem 3, + // > `2.5 CONTROL REGISTERS` - VmPageTables { - vm: self, - base, - } - } + // CR0 + const TS: u32 = 3; + // CR4 + const OSFXSR: u32 = 9; + const OSXMMEXCPT: u32 = 10; + const OSXSAVE: u32 = 18; - // TODO: there should be a version of this that can be used to query "does this VM support - // these extensions" probably, and that should take a subset of `Feature` for the ones that are - // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable - // feature..) - fn cpuid_supports(&self, feature: Feature) -> bool { - fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool { - for mut entry in cpuid.as_slice() { - if entry.function == leaf && entry.index == index { - return f(&mut entry); - } - } + // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)") + // these bits are the same as in cpuid leaf 0xd.eax + const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64; + const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64; + const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64; - false - } + // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be + // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system + // (which makes SSE a non-optional baseline!) + // + // the Intel SDM implies this through somewhat tortured language in the section + // "Checking for Intel® SSE and SSE2 Support": + // > If an operating system did not provide adequate system level support for Intel + // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD. + // + // to fully understand this statement, realize that `an operating system .. provide[s] + // adequate system level support" by setting CR4.OSFXSR, + // + // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating + // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment + // + // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to + // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even + // switch tasks on the vCPU. + sregs.cr4 |= 1 << OSFXSR; - match feature { - Feature::Base => { - let lm = find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { - leaf.edx & CPUID_80000001_EDX_LM != 0 - }); - let msr = find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { - leaf.edx & CPUID_00000001_EDX_MSR != 0 - }); - let clstac = find_leaf(&self.supported_cpuid, 0x0000_0007, 0, |leaf| { - leaf.ebx & CPUID_00000007_EBX_CLSTAC != 0 - }); - lm && msr && clstac - } - Feature::Syscall => { - find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { - leaf.edx & CPUID_80000001_EDX_SYSCALL != 0 - }) - } - Feature::XSave => { - find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { - leaf.edx & CPUID_00000001_ECX_XSAVE != 0 - }) - } - Feature::Pdpe1Gb => { - find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { - leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0 - }) - } - Feature::StateSSE => { - find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { - leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE - }) - } - Feature::StateAVX => { - find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { - leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX - }) - } - Feature::StateAVX512 => { - find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { - leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512 - }) - } - Feature::Pse => { - find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { - leaf.edx & CPUID_00000001_EDX_PSE == CPUID_00000001_EDX_PSE - }) - } - } - } + // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE. + // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS" + // draws a fairly direct connection: + // + // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if + // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state + // > features and their state components as if all bits in XCR0 were clear; the state + // > components cannot be modified and the features’ instructions cannot be executed. + // + // but the consequence is contradicted by the next paragraph, + // + // > Processors allow modification of this state, as well as execution of x87 FPU + // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and + // > XCR0. + // + // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well. + sregs.cr4 |= 1 << OSXSAVE; - /// set `feature` to `wanted` in the VM's CPUID configuration. - /// - /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not - /// available at all). use [`cpuid_supports`] to test if the feature can be configured. - fn cpuid_set(&mut self, feature: Feature, wanted: bool) { - fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) { - for mut entry in cpuid.as_mut_slice() { - if entry.function == leaf && entry.index == index { - f(&mut entry); - return; - } - } + // SSE3, SSSE3, and SSE4 involve a bit extra: + // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor + // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1 + sregs.cr0 &= !(1 << TS); - // if we're here, the entry simply is not present (yet..?) - // - // so, create it. - let mut entry = kvm_cpuid_entry2 { - function: leaf, - index: index, - eax: 0, - ecx: 0, - edx: 0, - ebx: 0, - flags: 0, - padding: [0; 3], - }; - f(&mut entry); - cpuid.push(entry).expect("can push"); - } + // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating + // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM). + // + // this is somewhat better than just getting an uncategorized #UD. + sregs.cr4 |= 1 << OSXMMEXCPT; - fn bit_set(word: &mut u32, bit: u32, wanted: bool) { - *word &= !bit; - if wanted { - *word |= bit; - } - } + assert!(xcrs.nr_xcrs > 0); + assert_eq!(xcrs.xcrs[0].xcr, 0); - let mut edited = false; + let mut needs_xsave = false; + if self.cpuid_supports(Feature::StateSSE) { + self.cpuid_set(Feature::StateSSE, true); + xcrs.xcrs[0].value |= 1; + xcrs.xcrs[0].value |= XCR0_SSE; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX) { + self.cpuid_set(Feature::StateAVX, true); + xcrs.xcrs[0].value |= XCR0_AVX; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX512) { + self.cpuid_set(Feature::StateAVX512, true); + xcrs.xcrs[0].value |= XCR0_AVX512; + needs_xsave = true; + } - match feature { - Feature::Base => { - edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { - bit_set(&mut leaf.edx, CPUID_80000001_EDX_LM, wanted); - edited = true; - }); - edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { - bit_set(&mut leaf.edx, CPUID_00000001_EDX_MSR, wanted); - edited = true; - }); - edit_leaf(&mut self.current_cpuid, 0x0000_0007, 0, |leaf| { - bit_set(&mut leaf.ebx, CPUID_00000007_EBX_CLSTAC, wanted); - edited = true; - }); - } - Feature::Syscall => { - edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { - bit_set(&mut leaf.edx, CPUID_80000001_EDX_SYSCALL, wanted); - edited = true; - }); - } - Feature::XSave => { - edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { - bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted); - edited = true; - }); - }, - Feature::Pdpe1Gb => { - edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { - bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted); - edited = true; - }); - }, - Feature::StateSSE => { - edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { - bit_set(&mut leaf.eax, 1, wanted); - bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted); - edited = true; - }); - } - Feature::StateAVX => { - edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { - bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted); - edited = true; - }); - } - Feature::StateAVX512 => { - edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { - bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted); - edited = true; - }); - } - Feature::Pse => { - edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { - bit_set(&mut leaf.edx, CPUID_00000001_EDX_PSE, wanted); - edited = true; - }); + if needs_xsave { + if self.cpuid_supports(Feature::XSave) { + self.cpuid_set(Feature::XSave, true); + } else { + panic!( + "look, there's no CPU that supports SSE but not xsave. \ + i only checked to be thorough."); } } - - assert!(edited); - - self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid"); } - /// configure page tables for identity mapping of all memory from guest address zero up to the - /// end of added memory regions, rounded up to the next GiB. - /// - /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or - /// long-mode paging. this is a fixed pattern: if control registers have not been changed since - /// `Vm::create` then there will be no change to these control registers and `sregs` can be - /// omitted. - /// - /// panics if the end of added memory regions is above 512 GiB. - pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) { - // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each. - assert!(self.mem_ceiling() <= 512 * GB); - - assert!(self.cpuid_supports(Feature::Pdpe1Gb)); - self.cpuid_set(Feature::Pdpe1Gb, true); - - let pt = self.page_tables(); - - let pml4_ent = - 1 << 0 | // P - 1 << 1 | // RW - 1 << 2 | // user access allowed. but no user code will run so not strictly needed. - 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 5 | // A - 0 << 6 | // ignored - 0 << 7 | // PS (reserved must-be-0) - 0 << 11 | // R (for ordinary paging, ignored; for HLAT ...) - pt.pdpt_addr().0; - unsafe { - pt.pml4_mut().write(pml4_ent); - } - - let mut mapped: u64 = 0; - // we've set up the first PML4 to point to a PDPT, so we should actually set it up! - let pdpt = pt.pdpt_mut(); - // PDPTEs start at the start of PDPT.. - let mut pdpte = pdpt; - let entry_bits: u64 = - 1 << 0 | // P - 1 << 1 | // RW - 1 << 2 | // user accesses allowed (everything is under privilege level 0 tho) - 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 5 | // Accessed - 0 << 6 | // Dirty - 1 << 7 | // Page size (1 implies 1G page) - 1 << 8 | // Global (if cr4.pge) - 0 << 9 | - 0 << 10 | - 0 << 11 | // for ordinary paging, ignored. for HLAT, ... - 0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?) - - while mapped < self.mem_ceiling() { - let phys_num = mapped >> 30; - let entry = entry_bits | (phys_num << 30); - unsafe { - pdpte.write(entry); - pdpte = pdpte.offset(1); + fn configure_syscalls(&mut self, vcpu_sregs: &mut kvm_sregs) { + assert!(self.cpuid_supports(Feature::Syscall)); + self.cpuid_set(Feature::Syscall, true); + + // > System-Call Extension (SCE) Bit. + vcpu_sregs.efer |= 0x0000_0001; + + let msrs = Msrs::from_entries(&[ + kvm_msr_entry { + // LSTAR (C000_0082h) + index: 0xc000_0082, + data: self.syscall_addr().0, + reserved: 0, + }, + kvm_msr_entry { + // CSTAR (C000_0083h) + index: 0xc000_0083, + data: self.syscall_addr().0, + reserved: 0, } - // eprintln!("mapped 1g at {:08x}", mapped); - mapped += 1 << 30; - } + ]).unwrap(); + self.set_msrs(&msrs).unwrap(); - if let Some(sregs) = sregs { - sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG - sregs.cr3 = pt.pml4_addr().0 as u64; - sregs.cr4 |= 1 << 5; // enable PAE - } + // fill the syscall landing area with hlt to trap out immediately. + self.mem_slice_mut(self.syscall_addr(), 16).fill(0xf4); + + self.syscall_configured = true; } +} - /// configure page tables for identity mapping of all memory from guest address zero up to the - /// end of added memory regions, rounded up to the next 4MiB. - /// - /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode paging. - /// this is a fixed pattern: if control registers have not been changed since `Vm::create` then - /// there will be no change to these control registers and `sregs` can be omitted. - pub unsafe fn configure_identity_paging_32b(&mut self, sregs: Option<&mut kvm_sregs>) { - // because we'll set PDEs to map 4M pages and cr3 points at a page-aligned block of 1024 - // 4-byte PDEs, that gives us 4KiB of memory used to map 4GiB of address space. that's all - // of 32-bit, so we don't need to check an upper bound. - assert!(self.cpuid_supports(Feature::Pse)); - self.cpuid_set(Feature::Pse, true); +#[test] +fn test_xor_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); - let pt = self.page_tables(); + vm.program(&[0x33, 0xc0], &mut regs); - let mut mapped: u64 = 0; - // "pml4_mut" is really just the start of page table memory. we'll pun this in 32-bit with - // the knowledge it's really a block of PDEs. - let pd = pt.pml4_mut() as *mut u32; - let mut pde = pd; - let entry_bits: u32 = - 1 << 0 | // P - 1 << 1 | // RW - 1 << 2 | // user accesses allowed (everything is under privilege level 0 tho) - 0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient) - 0 << 5 | // Accessed - 0 << 6 | // Dirty - 1 << 7 | // Page size (1 implies 4M page) - 1 << 8 | // Global (if cr4.pge) - 0 << 9 | - 0 << 10 | - 0 << 11 | // for ordinary paging, ignored. for HLAT, ... - 0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?) + regs.rax = 0x1234; + let rip_before = regs.rip; - while mapped < self.mem_ceiling() { - let phys_num = (mapped as u32) >> 22; - let entry = entry_bits | (phys_num << 22); - unsafe { - pde.write(entry); - pde = pde.offset(1); - } - mapped += 1 << 22; - } + vm.set_regs(®s).expect("can set regs"); - // page size extensions; collaborates with page tables' PS bit to make 4MiB pages in 32-bit - // mode. see SDM section 2.5 "CONTROL REGISTERS". - const PSE: u64 = 1 << 4; + vm.set_single_step(true).expect("can set single-step"); - if let Some(sregs) = sregs { - sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG - sregs.cr3 = pt.pml4_addr().0 as u64; - sregs.cr4 |= PSE; + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 2; + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); } - } + other => { + panic!("unexpected exit: {:?}", other); + } + }; - unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) { - // we have to set descriptor information directly. this avoids having to load selectors - // as the first instructions on the vCPU, which is simplifying. but if we want the - // information in these selectors to match with anything in a GDT (i do!) we'll have to - // keep this initial state lined up with GDT entries ourselves. - // - // we could avoid setting up the GDT for the most part, but anything that might - // legitimately load the "valid" current segment selector would instead clobber the - // selector with zeroes. + let regs_after = vm.get_regs().expect("can get regs"); + assert_eq!(regs_after.rax, 0); +} - sregs.cs.base = 0; - sregs.cs.limit = 0; - sregs.cs.selector = self.selector_cs(); - sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.cs.present = 1; - sregs.cs.dpl = 0; - sregs.cs.db = 0; - sregs.cs.s = 1; - sregs.cs.l = 1; - sregs.cs.g = 0; - sregs.cs.avl = 0; +#[test] +fn test_protected_mode_runs() { + let settings = VmSettings::new(128 * 1024, IsaMode::Protected); + let mut vm = Vm::create_by_settings(settings).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); - sregs.ds.base = 0; - sregs.ds.limit = 0xffffffff; - sregs.ds.selector = self.selector_ds(); - sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.ds.present = 1; - sregs.ds.dpl = 0; - sregs.ds.db = 0; - sregs.ds.s = 1; - sregs.ds.l = 0; - sregs.ds.g = 0; - sregs.ds.avl = 0; + let buf = &[ + 0xc5, 0xe0, 0x54, 0xc3, // vandps xmm0, xmm3, xmm3 + 0x33, 0xc0, // xor eax, eax + 0x8b, 0x09, // mov ecx, [ecx] + 0xf4 // hlt + ]; + vm.program(buf, &mut regs); - sregs.es = sregs.ds; - sregs.fs = sregs.ds; - sregs.gs = sregs.ds; - // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? - sregs.ss = sregs.ds; + regs.rax = 0x1234; + regs.rcx = 0x4; - sregs.gdt.base = self.gdt_addr().0; - sregs.gdt.limit = 256 * 8 - 1; + vm.set_regs(®s).expect("can set regs"); - unsafe { - self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); - self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); + let res = vm.run().expect("can run vm"); + + match res { + VcpuExit::Hlt => { + // expected exit from the `0xf4` above. } - } + other => { + panic!("unexpected exit: {:?}", other); + } + }; - /// configure selectors for 32-bit code exceution. this is basically the same as 64-bit, but we - /// set a limit and set `cs.db` so that the default operand size is a normal 32-bit. - unsafe fn configure_selectors_32b(&mut self, sregs: &mut kvm_sregs) { - // we have to set descriptor information directly. this avoids having to load selectors - // as the first instructions on the vCPU, which is simplifying. but if we want the - // information in these selectors to match with anything in a GDT (i do!) we'll have to - // keep this initial state lined up with GDT entries ourselves. - // - // we could avoid setting up the GDT for the most part, but anything that might - // legitimately load the "valid" current segment selector would instead clobber the - // selector with zeroes. + let regs_after = vm.get_regs().expect("can get regs"); + assert_eq!(regs_after.rax, 0); + assert_eq!(regs_after.rcx, 0); +} - sregs.cs.base = 0; - sregs.cs.limit = 0xffffffff; - sregs.cs.selector = self.selector_cs(); - sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.cs.present = 1; - sregs.cs.dpl = 0; - sregs.cs.db = 1; - sregs.cs.s = 1; - sregs.cs.l = 0; - sregs.cs.g = 1; - sregs.cs.avl = 0; +#[test] +fn test_pusha_runs() { + let settings = VmSettings::new(128 * 1024, IsaMode::Real); + let mut vm = Vm::create_by_settings(settings).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); - sregs.ds.base = 0; - sregs.ds.limit = 0xffffffff; - sregs.ds.selector = self.selector_ds(); - sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.ds.present = 1; - sregs.ds.dpl = 0; - sregs.ds.db = 1; - sregs.ds.s = 1; - sregs.ds.l = 0; - sregs.ds.g = 1; - sregs.ds.avl = 0; + vm.program(&[0x60], &mut regs); - sregs.es = sregs.ds; - sregs.fs = sregs.ds; - sregs.gs = sregs.ds; - // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? - sregs.ss = sregs.ds; + regs.rip = 0; + regs.rax = 0x1234; + eprintln!("{:?}", regs); - sregs.gdt.base = self.gdt_addr().0; - sregs.gdt.limit = 256 * 8 - 1; + vm.set_regs(®s).expect("can set regs"); - unsafe { - self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); - self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); + vm.set_single_step(true).expect("can set single-step"); + let expected_rip = vm.code_addr().0 + 1; + + let res = vm.run().expect("can run vm"); + + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + eprintln!("rip after: {:08x}", rip_after); + assert_eq!(expected_rip, rip_after); } - } + other => { + panic!("unexpected exit: {:?}", other); + } + }; - /// configure selectors for 16-bit code exceution. - /// - /// unlike other modes, this sets `cs` to execute code at the linear address given by - /// [`Self::code_addr`]. `ds` is configured to overlap with `cs`. this way, when executing - /// 16-bit code the VM can simply be configured to `ip = 0`, and code addresses match data - /// addresses. additionally, clear `cs.db` so that the default operand size is 16-bit. - unsafe fn configure_selectors_16b(&mut self, sregs: &mut kvm_sregs) { - // we have to set descriptor information directly. this avoids having to load selectors - // as the first instructions on the vCPU, which is simplifying. but if we want the - // information in these selectors to match with anything in a GDT (i do!) we'll have to - // keep this initial state lined up with GDT entries ourselves. - // - // we could avoid setting up the GDT for the most part, but anything that might - // legitimately load the "valid" current segment selector would instead clobber the - // selector with zeroes. + let regs_after = vm.get_regs().expect("can get regs"); + assert_eq!(regs_after.rax, 0x1234); + assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 2)); - sregs.cs.base = 0; - sregs.cs.limit = 0xfffff; - sregs.cs.selector = self.selector_cs(); - sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.cs.present = 1; - sregs.cs.dpl = 0; - sregs.cs.db = 0; - sregs.cs.s = 1; - sregs.cs.l = 0; - sregs.cs.g = 1; - sregs.cs.avl = 0; + let mut regs = vm.get_regs().expect("can get regs"); - unsafe { - self.gdt_entry_mut(self.selector_cs_idt_16b() >> 3).write(encode_segment(&sregs.cs)); - } + vm.program(&[0x66, 0x60], &mut regs); - // and now adjust for the real cs for code execution to happen in.. - sregs.cs.base = self.code_addr().0; + regs.rip = 0; + regs.rax = 0x1234; + regs.rsp = 0x1000 - 0x80; + eprintln!("{:?}", regs); - sregs.ds.base = self.code_addr().0; - sregs.ds.limit = 0xfffff; - sregs.ds.selector = self.selector_ds(); - sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types - sregs.ds.present = 1; - sregs.ds.dpl = 0; - sregs.ds.db = 0; - sregs.ds.s = 1; - sregs.ds.l = 0; - sregs.ds.g = 1; - sregs.ds.avl = 0; + vm.set_regs(®s).expect("can set regs"); - sregs.es = sregs.ds; - sregs.fs = sregs.ds; - sregs.gs = sregs.ds; - // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell??? - sregs.ss = sregs.ds; + vm.set_single_step(true).expect("can set single-step"); + let expected_rip = vm.code_addr().0 + 2; - sregs.gdt.base = self.gdt_addr().0; - sregs.gdt.limit = 256 * 8 - 1; + let res = vm.run().expect("can run vm"); - unsafe { - self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs)); - self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds)); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + eprintln!("rip after: {:08x}", rip_after); + assert_eq!(expected_rip, rip_after); } - } + other => { + panic!("unexpected exit: {:?}", other); + } + }; - fn write_idt_entry( - &mut self, - intr_nr: u8, - interrupt_handler_cs: u16, - interrupt_handler_addr: GuestAddress - ) { - let idt_ptr = self.idt_entry_mut(intr_nr); + let regs_after = vm.get_regs().expect("can get regs"); + assert_eq!(regs_after.rax, 0x1234); + assert_eq!(regs_after.rsp, 0x1000 - 0x80 - (8 * 4)); +} - // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate" - // and "trap-gate" descriptors), are described (in the AMD APM) by - // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here: - // - // 3 2 1 | 1 0 - // 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - // |---------------------------------------------------------------| - // | res,ign | +12 - // | target offset[63:32] | +8 - // | target offset[31:16] |P|DPL|0| type | res,ign | IST | +4 - // | target selector | target offset[15:0] | +0 - // |---------------------------------------------------------------| - // - // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly - // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e - // works for now. - let idt_attr_bits = 0b1_00_0_1110_00000_000; - let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits; - let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff); +#[test] +fn test_syscall() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); - unsafe { - idt_ptr.offset(0).write(low_lo); - idt_ptr.offset(1).write(low_hi); - idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32); - idt_ptr.offset(3).write(0); // reserved - } - } + vm.program(&[0x0f, 0x05], &mut regs); + eprintln!("rip before: {:08x}", regs.rip); - /// 16-bit/32-bit IDT entries, described in the APM as - /// - /// > Interrupt-Gate and Trap-Gate Descriptors—Legacy Mode - /// - /// have a different (smaller!) format. - fn write_idt_entry_legacy( - &mut self, - intr_nr: u8, - interrupt_handler_cs: u16, - interrupt_handler_addr: GuestAddress - ) { - assert!(interrupt_handler_addr.0 <= u32::MAX as u64); - let idt_ptr = self.idt_entry_legacy_mut(intr_nr); + vm.set_regs(®s).expect("can set regs"); - // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate" - // and "trap-gate" descriptors), are described (in the AMD APM) by - // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here: - // - // 3 2 1 | 1 0 - // 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - // |---------------------------------------------------------------| - // | target offset[31:16] |P|DPL|0| type | res,ign | IST | +4 - // | target selector | target offset[15:0] | +0 - // |---------------------------------------------------------------| - // - // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly - // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e - // works for now. - let idt_attr_bits = 0b1_00_0_1110_00000_000; - let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits; - let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff); +// vm.set_single_step(true).expect("can set single-step"); - unsafe { - idt_ptr.offset(0).write(low_lo); - idt_ptr.offset(1).write(low_hi); + let res = vm.run().expect("can run vm"); + match res { + VcpuExit::Syscall => { /* expected */ } + VcpuExit::Debug { pc, .. } => { + if pc == vm.syscall_addr().0 { + panic!( + "VM exited at syscall target. \ + syscall hlt stub not executed. \ + is the VM being single-stepped?" + ); + } + panic!("unexpected debug exit at rip={:08x}", pc); } - } + other => { + panic!("unexpected exit: {:?}", other); + } + }; - fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) { - sregs.idt.base = self.idt_addr().0; - sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each + let regs_after = vm.get_regs().expect("can get regs"); - for i in 0..IDT_ENTRIES { - let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64); - self.write_idt_entry( - i.try_into().expect(" { + assert_eq!(expected_rip, rip_after); } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} - // finally, set `rsp` to a valid region so that the CPU can push necessary state (see - // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt - // handler. if we didn't do this, rsp will probably be zero or something, underflow, - // page fault on push to 0xffffffff_ffffffff, and just triple fault. - // - // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt - // descriptors could set something in IST to switch stacks outright for exception - // handling. this might be nice to test rsp permutations in 64-bit code? alternatively - // we might just have to limit possible rsp permutations so as to be able to test in - // 16- and 32-bit modes anyway. - regs.rsp = self.stack_addr().0; - self.idt_configured = true; +#[test] +fn test_vex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX) { + panic!("host CPU does not support AVX"); } - /// IDT configuration in 16-bit mode is funky because the interrupt handlers live in a totally - /// different region of memory and need a different value in `cs`. - fn configure_idt_16b(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) { - sregs.idt.base = self.idt_addr().0; - sregs.idt.limit = IDT_ENTRIES * 8 - 1; // IDT is 256 entries of 8 bytes each + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); + + regs.rbx = regs.rip; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); - for i in 0..IDT_ENTRIES { - let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64); - self.write_idt_entry_legacy( - i.try_into().expect(" { + assert_eq!(expected_rip, rip_after); } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} - // finally, set `rsp` to a valid region so that the CPU can push necessary state (see - // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt - // handler. if we didn't do this, rsp will probably be zero or something, underflow, - // page fault on push to 0xffffffff_ffffffff, and just triple fault. - // - // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt - // descriptors could set something in IST to switch stacks outright for exception - // handling. this might be nice to test rsp permutations in 64-bit code? alternatively - // we might just have to limit possible rsp permutations so as to be able to test in - // 16- and 32-bit modes anyway. - regs.rsp = self.stack_addr().0; - self.idt_configured = true; - } - - /// configure the vCPU for executing instructions in the hardware-supported extensions. - /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed, - /// unless additional configuration is done (as this function does). - /// - /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point - /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all? - /// - /// this function configures the vCPU to be ready to execute `SSE*` instructions. - fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) { - // these bit positions in control registers, and their behaviors, are described more - // comprehensively in Voluem 3, - // > `2.5 CONTROL REGISTERS` +#[test] +fn test_vex_vandps_runs_32b() { + let settings = VmSettings::new(128 * 1024, IsaMode::Protected); + let mut vm = Vm::create_by_settings(settings).expect("can create vm"); - // CR0 - const TS: u32 = 3; - // CR4 - const OSFXSR: u32 = 9; - const OSXMMEXCPT: u32 = 10; - const OSXSAVE: u32 = 18; + if !vm.cpuid_supports(Feature::StateAVX) { + panic!("host CPU does not support AVX"); + } - // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)") - // these bits are the same as in cpuid leaf 0xd.eax - const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64; - const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64; - const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64; + let mut regs = vm.get_regs().expect("can get regs"); - // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be - // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system - // (which makes SSE a non-optional baseline!) - // - // the Intel SDM implies this through somewhat tortured language in the section - // "Checking for Intel® SSE and SSE2 Support": - // > If an operating system did not provide adequate system level support for Intel - // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD. - // - // to fully understand this statement, realize that `an operating system .. provide[s] - // adequate system level support" by setting CR4.OSFXSR, - // - // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating - // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment - // - // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to - // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even - // switch tasks on the vCPU. - sregs.cr4 |= 1 << OSFXSR; + vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); - // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE. - // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS" - // draws a fairly direct connection: - // - // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if - // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state - // > features and their state components as if all bits in XCR0 were clear; the state - // > components cannot be modified and the features’ instructions cannot be executed. - // - // but the consequence is contradicted by the next paragraph, - // - // > Processors allow modification of this state, as well as execution of x87 FPU - // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and - // > XCR0. - // - // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well. - sregs.cr4 |= 1 << OSXSAVE; + regs.rbx = regs.rip; + let rip_before = regs.rip; - // SSE3, SSSE3, and SSE4 involve a bit extra: - // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor - // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1 - sregs.cr0 &= !(1 << TS); + vm.set_regs(®s).expect("can set regs"); - // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating - // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM). - // - // this is somewhat better than just getting an uncategorized #UD. - sregs.cr4 |= 1 << OSXMMEXCPT; + vm.set_single_step(true).expect("can set single-step"); - assert!(xcrs.nr_xcrs > 0); - assert_eq!(xcrs.xcrs[0].xcr, 0); + let res = vm.run().expect("can run vm"); - let mut needs_xsave = false; - if self.cpuid_supports(Feature::StateSSE) { - self.cpuid_set(Feature::StateSSE, true); - xcrs.xcrs[0].value |= 1; - xcrs.xcrs[0].value |= XCR0_SSE; - needs_xsave = true; - } - if self.cpuid_supports(Feature::StateAVX) { - self.cpuid_set(Feature::StateAVX, true); - xcrs.xcrs[0].value |= XCR0_AVX; - needs_xsave = true; + let expected_rip = rip_before + 4; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); } - if self.cpuid_supports(Feature::StateAVX512) { - self.cpuid_set(Feature::StateAVX512, true); - xcrs.xcrs[0].value |= XCR0_AVX512; - needs_xsave = true; + other => { + panic!("unexpected exit: {:?}", other); } + }; +} - if needs_xsave { - if self.cpuid_supports(Feature::XSave) { - self.cpuid_set(Feature::XSave, true); - } else { - panic!( - "look, there's no CPU that supports SSE but not xsave. \ - i only checked to be thorough."); - } - } +#[test] +fn test_evex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX512) { + panic!("host CPU does not support AVX512"); } - fn configure_syscalls(&mut self, vcpu_sregs: &mut kvm_sregs) { - assert!(self.cpuid_supports(Feature::Syscall)); - self.cpuid_set(Feature::Syscall, true); + let mut regs = vm.get_regs().expect("can get regs"); - // > System-Call Extension (SCE) Bit. - vcpu_sregs.efer |= 0x0000_0001; + vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs); - let msrs = Msrs::from_entries(&[ - kvm_msr_entry { - // LSTAR (C000_0082h) - index: 0xc000_0082, - data: self.syscall_addr().0, - reserved: 0, - }, - kvm_msr_entry { - // CSTAR (C000_0083h) - index: 0xc000_0083, - data: self.syscall_addr().0, - reserved: 0, - } - ]).unwrap(); - self.set_msrs(&msrs).unwrap(); + regs.rbx = regs.rip; + let rip_before = regs.rip; - // fill the syscall landing area with hlt to trap out immediately. - self.mem_slice_mut(self.syscall_addr(), 16).fill(0xf4); + vm.set_regs(®s).expect("can set regs"); - self.syscall_configured = true; + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 6; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + + +// this function will sit and loop in the kernel after trying to fulfill the MMIO exit. +// +// not great! don't do that! it's responsive to EINTR at least. +// #[test] +#[allow(dead_code)] +fn kvm_hugepage_bug() { + let mut vm = Vm::create(1024 * 1024).expect("can create vm"); + vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region"); + unsafe { + vm.configure_identity_paging(None); } + + // `add [rsp], al; add [rcx], al; pop [rcx]; hlt` + // the first instruction runs fine. the second instruction runs fine. + // the third instruction gets a page fault at 0xf800? which worked fine for the add. + // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with + // huge pages. + let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4]; + let mut regs = vm.get_regs().unwrap(); + regs.rax = 0x00000002_00100000; + regs.rcx = 0x00000002_00100000; + vm.program(inst, &mut regs); + vm.set_regs(®s).unwrap(); + vm.set_single_step(true).expect("can enable single-step"); + vm.run().expect("can run vm"); + + let vm_regs = vm.get_regs().unwrap(); + let vm_sregs = vm.get_sregs().unwrap(); + let mut prev_rip = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); + let mut buf = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); + eprintln!( + "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})", + u64::from_le_bytes(buf), vm_sregs.cr2, + u64::from_le_bytes(prev_rip), vm_sregs.cr3 + ); + if vm_regs.rip == 0x300f { + let mut pdpt = [0u8; 4096]; + vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]); + eprintln!("pdpt: {:x?}", &pdpt[..8]); + } + panic!("no"); } -- cgit v1.1