use core::fmt; use core::num::NonZero; use core::ptr::NonNull; use nix::sys::mman::{MapFlags, ProtFlags}; use kvm_ioctls::{Kvm, VcpuFd, VmFd}; use kvm_bindings::{ kvm_cpuid_entry2, kvm_guest_debug, kvm_userspace_memory_region, kvm_segment, CpuId, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_MAX_CPUID_ENTRIES, }; pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_xcrs, kvm_debug_exit_arch}; const _TARGET_IS_64BIT: () = { assert!(core::mem::size_of::() == core::mem::size_of::(), "asmlinator only supports 64-bit targets"); }; // the wanton casting between usize and u64 is justifiable here because TARGET_IS_64BIT above: fn usize_to_u64(x: usize) -> u64 { let _ = _TARGET_IS_64BIT; x as u64 } fn u64_to_usize(x: u64) -> usize { let _ = _TARGET_IS_64BIT; x as usize } /// a test VM for running arbitrary instructions. /// /// there is one CPU which is configured for long-mode execution. all memory is /// identity-mapped with 1GiB pages. page tables are configured to cover 512 GiB of memory, but /// much much less than that is actually allocated and usable through `memory.` /// /// it is configured with `mem_size` bytes of memory at guest address 0, accessible through /// host pointer `memory`. this region is used for "control structures"; page tables, GDT, IDT, /// and stack. it is also the region where code to be executed is placed. pub struct Vm { vm: VmFd, vcpu: VcpuFd, supported_cpuid: CpuId, current_cpuid: CpuId, idt_configured: bool, mem_ceiling: u64, memory: Mapping, aux_memories: Vec, } /// broad categories of cpuid/cpu features that should be detectable or configurable as part of /// setting up a VM. this is split out for legibility, but also because in theory these (especially /// ISA extensions) features probably should be configurable by library users somehow.. /// /// not yet sure, so this is not pub. #[derive(Copy, Clone, Debug)] enum Feature { /// support for the xsave/xrstor instructions and at least xcr0. 
    ///
    /// cpuid leaf eax=0x0000_0001 bit ecx[26], see APM
    /// chapter "Obtaining Processor Information Via the CPUID Instruction",
    /// section "Standard Feature Function Numbers".
    XSave,
    /// support for 1GB page mappings. cpuid leaf eax=0x8000_0001 bit edx[26].
    Pdpe1Gb,
    /// support for the XSAVE SSE region. this corresponds to the bit in CPUID leaf D and
    /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with xmm
    /// state will #UD.
    StateSSE,
    /// support for the XSAVE AVX region. this corresponds to the bit in CPUID leaf D and
    /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with ymm
    /// state will #UD.
    StateAVX,
    /// support for the XSAVE AVX512 regions. this corresponds to the bits for K, ZMM_Hi256, and
    /// Hi16_ZMM in CPUID leaf D and corresponding bits in xcr0. if these bits are not set,
    /// attempts to use instructions with zmm state may #UD.
    StateAVX512,
}

// bit masks matching the cpuid leaves documented on `Feature` above.
const CPUID_00000001_ECX_XSAVE: u32 = 1 << 26;
const CPUID_0000000D_EAX_SSE: u32 = 1 << 1;
const CPUID_0000000D_EAX_AVX: u32 = 1 << 2;
// three bits: opmask (K), ZMM_Hi256, and Hi16_ZMM state components.
const CPUID_0000000D_EAX_AVX512: u32 = (1 << 5) | (1 << 6) | (1 << 7);
const CPUID_80000001_EDX_PDPE1GB: u32 = 1 << 26;

/// the ways a `Vm::run` call can return to the caller. largely mirrors
/// `kvm_ioctls::VcpuExit`, with `Exception`/`Hlt` synthesized by this library.
#[derive(PartialEq)]
pub enum VcpuExit<'buf> {
    MmioRead { addr: u64, buf: &'buf mut [u8] },
    MmioWrite { addr: u64, buf: &'buf [u8] },
    IoIn { port: u16, buf: &'buf mut [u8] },
    IoOut { port: u16, buf: &'buf [u8] },
    Debug { pc: u64, info: kvm_debug_exit_arch },
    Exception { nr: u8 },
    Shutdown,
    Hlt,
}

// manual Debug impl: summarize buffers by length rather than dumping contents.
impl<'buf> fmt::Debug for VcpuExit<'buf> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use VcpuExit::*;
        match self {
            MmioRead { addr, buf } => {
                let size = buf.len();
                write!(f, "VcpuExit::MmioRead {{ addr: {addr:#08x}, size: {size} }}")
            },
            MmioWrite { addr, buf } => {
                let size = buf.len();
                write!(f, "VcpuExit::MmioWrite {{ addr: {addr:#08x}, size: {size} }}")
            },
            IoIn { port, buf } => {
                let size = buf.len();
                write!(f, "VcpuExit::IoIn {{ port: {port:#04x}, size: {size} }}")
            },
            IoOut { port, buf } =>
            {
                let size = buf.len();
                write!(f, "VcpuExit::IoOut {{ port: {port:#04x}, size: {size} }}")
            },
            Debug { pc, info: _ } => {
                write!(f, "VcpuExit::Debug {{ pc: {pc:#016x}, _ }}")
            },
            Exception { nr } => {
                write!(f, "VcpuExit::Exception {{ nr: {nr} }}")
            },
            Shutdown => {
                write!(f, "VcpuExit::Shutdown")
            },
            Hlt => {
                write!(f, "VcpuExit::Hlt")
            }
        }
    }
}

const GB: u64 = 1 << 30;

// TODO: cite APM/SDM
const IDT_ENTRIES: u16 = 256;

/// a guest-physical address. newtype so gpa and host addresses can't be mixed up silently.
#[derive(Copy, Clone)]
pub struct GuestAddress(pub u64);

/// a view of a `Vm`'s page tables: one PML4 page followed immediately by one PDPT page.
pub struct VmPageTables<'vm> {
    vm: &'vm Vm,
    base: GuestAddress,
}

impl<'vm> VmPageTables<'vm> {
    pub fn pml4_addr(&self) -> GuestAddress {
        self.base
    }

    // the PDPT is the page directly after the PML4.
    pub fn pdpt_addr(&self) -> GuestAddress {
        GuestAddress(self.base.0 + 0x1000)
    }

    pub fn pml4_mut(&self) -> *mut u64 {
        // SAFETY: creating VmPageTables implies we've asserted that we can form host pointers
        // for all addresses in the page tables.
        unsafe { self.vm.host_ptr(self.pml4_addr()) as *mut u64 }
    }

    pub fn pdpt_mut(&self) -> *mut u64 {
        // SAFETY: creating VmPageTables implies we've asserted that we can form host pointers
        // for all addresses in the page tables.
        unsafe { self.vm.host_ptr(self.pdpt_addr()) as *mut u64 }
    }
}

// pack a kvm_segment into the 8-byte GDT descriptor format: limit[15:0] | base[15:0] in the
// low dword; base[23:16], access byte, flags|limit[19:16], base[31:24] in the high dword.
fn encode_segment(seg: &kvm_segment) -> u64 {
    let base = seg.base as u64;
    let limit = seg.limit as u64;
    let lim_low = limit & 0xffff;
    let lim_high = (limit >> 16) & 0xf;
    let addr_low = base & 0xffff;
    let desc_low = lim_low | (addr_low << 16);
    let base_mid = (base >> 16) & 0xff;
    let base_high = (base >> 24) & 0xff;
    // access byte: type (bits 0-3), S (bit 4), DPL (bits 5-6), P (bit 7).
    let access_byte = (seg.type_ as u64) |
        (seg.s as u64) << 4 |
        (seg.dpl as u64) << 5 |
        (seg.present as u64) << 7;
    // flags nibble over limit[19:16]: AVL, L, D/B, G.
    let flaglim_byte = lim_high |
        (seg.avl as u64) << 4 |
        (seg.l as u64) << 5 |
        (seg.db as u64) << 6 |
        (seg.g as u64) << 7;
    let desc_high = base_mid | access_byte << 8 | flaglim_byte << 16 | base_high << 24;
    desc_low | (desc_high << 32)
}

pub enum VmCreateError {
    /// the requested VM was smaller than `asmlinator`'s minimum allowable size.
    TooSmall { requested: usize, required: usize },
    /// the requested VM's memory size was not an even number of pages.
    BadSize { requested: usize, unit: usize },
    /// one of the several syscalls in setting up a new VM failed.
    ///
    /// this is most likely a permissions error, or `/dev/kvm` doesn't exist. otherwise, something
    /// interesting happened!
    ///
    /// this deserves better documentation, but i'm not aware of documentation for KVM ioctl
    /// failure modes.
    SyscallError { op: &'static str, err: nix::errno::Errno },
    /// `base` and `size` are not valid for mapping; either because of over/underflow, or overlaps
    /// with an existing mapping.
    InvalidMapping { base: GuestAddress, size: u64 }
}

// manual Debug impl so `expect`/`unwrap` failures print a human-readable message.
impl fmt::Debug for VmCreateError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            VmCreateError::TooSmall { requested, required } => {
                write!(f, "requested memory size ({requested}) is too small, must be at least {required}")
            }
            VmCreateError::BadSize { requested, unit } => {
                write!(f, "requested memory size ({requested}) is not a multiple of ({unit})")
            }
            VmCreateError::SyscallError { op, err } => {
                write!(f, "error at {op}: {err}")
            }
            VmCreateError::InvalidMapping { base, size } => {
                write!(f, "invalid mapping (gpa={:#08x}/size={:08x})", base.0, size)
            }
        }
    }
}

// same formatting as the corresponding `VmCreateError` variants, kept in sync by hand.
impl fmt::Debug for VmError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            VmError::BadSize { requested, unit } => {
                write!(f, "requested memory size ({requested}) is not a multiple of ({unit})")
            }
            VmError::SyscallError { op, err } => {
                write!(f, "error at {op}: {err}")
            }
            VmError::InvalidMapping { base, size } => {
                write!(f, "invalid mapping (gpa={:#08x}/size={:08x})", base.0, size)
            }
        }
    }
}

pub enum VmError {
    /// the requested VM's memory size was not an even number of pages.
    BadSize { requested: usize, unit: usize },
    /// one of the several syscalls in operating a VM failed.
    ///
    /// this deserves better documentation, but i'm not aware of documentation for KVM ioctl
    /// failure modes.
SyscallError { op: &'static str, err: nix::errno::Errno }, /// `base` and `size` are not valid for mapping; either because of over/underflow, or overlaps /// with an existing mapping. InvalidMapping { base: GuestAddress, size: u64 } } impl VmError { fn from_kvm(op: &'static str, err: kvm_ioctls::Error) -> Self { Self::SyscallError { op, err: nix::errno::Errno::from_raw(err.errno()) } } } impl From for VmCreateError { fn from(other: VmError) -> Self { match other { VmError::BadSize { requested, unit } => VmCreateError::BadSize { requested, unit }, VmError::SyscallError { op, err } => VmCreateError::SyscallError { op, err }, VmError::InvalidMapping { base, size } => VmCreateError::InvalidMapping { base, size }, } } } /// a `mmap`'d region, `munmap`'d on drop. struct Mapping { guest_addr: usize, addr: NonNull, size: NonZero, } impl Drop for Mapping { fn drop(&mut self) { let res = unsafe { nix::sys::mman::munmap(self.addr, self.size.get()) }; res.expect("can unmap a region we mapped"); } } impl Mapping { fn create_shared(guest_addr: usize, size: usize, prot: ProtFlags) -> Result { if size % 4096 != 0 { return Err(VmError::BadSize { requested: size, unit: 4096, }); } let size = NonZero::new(size) .ok_or(VmError::BadSize { requested: 0, unit: 0, })?; let map_res = unsafe { nix::sys::mman::mmap_anonymous( None, size, prot, MapFlags::MAP_ANONYMOUS | MapFlags::MAP_SHARED, ) }; let map_addr = map_res .map_err(|e| VmError::SyscallError { op: "mmap", err: e })?; // look, mmap should only be in the business of returning page-aligned addresses but i // just wanna see it, you know... assert!(map_addr.as_ptr() as usize % 4096 == 0); Ok(Self { guest_addr, addr: map_addr, size, }) } /// SAFETY: the caller must not use the returned pointer to violate reference safety of the VM. /// the pointer must not be turned into a reference while running the VM, etc. /// /// panics if `address` is not contained in this mapping. 
unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 { let guest_addr: u64 = usize_to_u64(self.guest_addr); let offset = address.0.checked_sub(guest_addr) .expect("guest address is above mapping base"); let base = self.addr.as_ptr() as *mut u8; unsafe { base.offset(offset as isize) } } /// SAFETY: the caller must ensure that this mapping covers `base` and that there are at least /// `size` bytes at `base` before the end of this mapping. unsafe fn slice_mut(&mut self, base: GuestAddress, size: u64) -> &mut [u8] { let ptr = unsafe { self.host_ptr(base) }; unsafe { core::slice::from_raw_parts_mut(ptr, u64_to_usize(size)) } } /// SAFETY: the caller must ensure that this mapping covers `base` and that there are at least /// `size` bytes at `base` before the end of this mapping. unsafe fn slice(&self, base: GuestAddress, size: u64) -> &[u8] { let ptr = unsafe { self.host_ptr(base) }; unsafe { core::slice::from_raw_parts(ptr, u64_to_usize(size)) } } fn overlaps(&self, base: GuestAddress, index_end: GuestAddress) -> bool { let map_base: u64 = usize_to_u64(self.guest_addr); let map_end = map_base.checked_add(usize_to_u64(self.size.get())).unwrap(); let enclosed_by = base.0 <= map_base && index_end.0 >= map_end; let contains_base = base.0 >= map_base && base.0 < map_end; let contains_end = index_end.0 >= map_base && index_end.0 <= map_end; enclosed_by || contains_base || contains_end } fn contains(&self, base: GuestAddress) -> bool { let end = self.guest_addr.checked_add(self.size.get()).unwrap(); base.0 >= self.guest_addr as u64 && base.0 < end as u64 } fn check_range(&self, base: GuestAddress, size: u64) -> bool { let map_base: u64 = self.guest_addr.try_into().unwrap(); let Some(offset) = base.0.checked_sub(map_base) else { return false; }; let Some(end) = offset.checked_add(size) else { return false; }; end <= self.size.get().try_into().unwrap() } } #[test] fn test_check_range_exact() { let mapping = Mapping::create_shared(0x4000, 0x1000, 
ProtFlags::PROT_READ).expect("can create mapping"); assert!(mapping.check_range(GuestAddress(0x4000), 0x1000)); } #[test] fn test_xor_runs() { let mut vm = Vm::create(128 * 1024).expect("can create vm"); let mut regs = vm.get_regs().expect("can get regs"); vm.program(&[0x33, 0xc0], &mut regs); regs.rax = 0x1234; let rip_before = regs.rip; vm.set_regs(®s).expect("can set regs"); vm.set_single_step(true).expect("can set single-step"); let res = vm.run().expect("can run vm"); let expected_rip = rip_before + 2; match res { VcpuExit::Debug { pc: rip_after, .. } => { assert_eq!(expected_rip, rip_after); } other => { panic!("unexpected exit: {:?}", other); } }; let regs_after = vm.get_regs().expect("can get regs"); assert_eq!(regs_after.rax, 0); } #[test] fn test_xorps_runs() { let mut vm = Vm::create(128 * 1024).expect("can create vm"); let mut regs = vm.get_regs().expect("can get regs"); vm.program(&[0x0f, 0x57, 0xc0], &mut regs); let rip_before = regs.rip; vm.set_regs(®s).expect("can set regs"); vm.set_single_step(true).expect("can set single-step"); let res = vm.run().expect("can run vm"); let expected_rip = rip_before + 3; eprintln!("exit: {:?}", res); match res { VcpuExit::Debug { pc: rip_after, .. } => { assert_eq!(expected_rip, rip_after); } other => { panic!("unexpected exit: {:?}", other); } }; } #[test] fn test_vex_vandps_runs() { let mut vm = Vm::create(128 * 1024).expect("can create vm"); if !vm.cpuid_supports(Feature::StateAVX) { panic!("host CPU does not support AVX"); } let mut regs = vm.get_regs().expect("can get regs"); vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); regs.rbx = regs.rip; let rip_before = regs.rip; vm.set_regs(®s).expect("can set regs"); vm.set_single_step(true).expect("can set single-step"); let res = vm.run().expect("can run vm"); let expected_rip = rip_before + 4; eprintln!("exit: {:?}", res); match res { VcpuExit::Debug { pc: rip_after, .. 
} => { assert_eq!(expected_rip, rip_after); } other => { panic!("unexpected exit: {:?}", other); } }; } #[test] fn test_evex_vandps_runs() { let mut vm = Vm::create(128 * 1024).expect("can create vm"); if !vm.cpuid_supports(Feature::StateAVX512) { panic!("host CPU does not support AVX512"); } let mut regs = vm.get_regs().expect("can get regs"); vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs); regs.rbx = regs.rip; let rip_before = regs.rip; vm.set_regs(®s).expect("can set regs"); vm.set_single_step(true).expect("can set single-step"); let res = vm.run().expect("can run vm"); let expected_rip = rip_before + 6; eprintln!("exit: {:?}", res); match res { VcpuExit::Debug { pc: rip_after, .. } => { assert_eq!(expected_rip, rip_after); } other => { panic!("unexpected exit: {:?}", other); } }; } // this function will sit and loop in the kernel after trying to fulfill the MMIO exit. // // not great! don't do that! it's responsive to EINTR at least. // #[test] #[allow(dead_code)] fn kvm_hugepage_bug() { let mut vm = Vm::create(1024 * 1024).expect("can create vm"); vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region"); unsafe { vm.configure_identity_paging(None); } // `add [rsp], al; add [rcx], al; pop [rcx]; hlt` // the first instruction runs fine. the second instruction runs fine. // the third instruction gets a page fault at 0xf800? which worked fine for the add. // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with // huge pages. 
let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4]; let mut regs = vm.get_regs().unwrap(); regs.rax = 0x00000002_00100000; regs.rcx = 0x00000002_00100000; vm.program(inst, &mut regs); vm.set_regs(®s).unwrap(); vm.set_single_step(true).expect("can enable single-step"); vm.run().expect("can run vm"); let vm_regs = vm.get_regs().unwrap(); let vm_sregs = vm.get_sregs().unwrap(); let mut prev_rip = [0u8; 8]; vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); let mut buf = [0u8; 8]; vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); eprintln!( "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})", u64::from_le_bytes(buf), vm_sregs.cr2, u64::from_le_bytes(prev_rip), vm_sregs.cr3 ); if vm_regs.rip == 0x300f { let mut pdpt = [0u8; 4096]; vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]); eprintln!("pdpt: {:x?}", &pdpt[..8]); } panic!("no"); } impl Vm { pub fn create(mem_size: usize) -> Result { let kvm = Kvm::new() .map_err(|e| VmError::from_kvm("Kvm::new()", e))?; let vm = kvm.create_vm() .map_err(|e| VmError::from_kvm("craete_vm", e))?; let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do if mem_size < 128 * 1024 { return Err(VmCreateError::TooSmall { requested: mem_size, required: 128 * 1024, }); } let mapping = Mapping::create_shared(0, mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?; let region = kvm_userspace_memory_region { slot: 0, guest_phys_addr: 0x0000, memory_size: mapping.size.get() as u64, userspace_addr: mapping.addr.as_ptr() as u64, flags: 0, }; let set_res = unsafe { vm.set_user_memory_region(region) }; set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; let vcpu_res = vm.create_vcpu(0); let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap(); let mem_ceiling = 
mapping.size.get().try_into().unwrap(); let mut this = Vm { vm, vcpu, supported_cpuid, current_cpuid, idt_configured: false, memory: mapping, aux_memories: Vec::new(), mem_ceiling, }; let mut vcpu_regs = this.get_regs()?; let mut vcpu_sregs = this.get_sregs()?; unsafe { this.configure_identity_paging(Some(&mut vcpu_sregs)); this.configure_selectors(&mut vcpu_sregs); this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); let mut xcrs = this.get_xcrs()?; this.configure_extensions(&mut vcpu_sregs, &mut xcrs); this.set_xcrs(&xcrs)?; } vcpu_sregs.efer = 0x0000_0500; // LME | LMA this.set_regs(&vcpu_regs)?; this.set_sregs(&vcpu_sregs)?; Ok(this) } /// map and add a region of size `size` at guest-physical address `gpa`. /// /// this will not update page tables, so if the newly-added memory is not already mapped due to /// a previous `configure_identity_paging` call and it is not mapped due to explicit page table /// management, it will not yet be accessible by guest code. pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> { let new_mapping_end = gpa.0.checked_add(size) .map(|addr| GuestAddress(addr)) .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; if self.memory.overlaps(gpa, new_mapping_end) { return Err(VmError::InvalidMapping { base: gpa, size }); } else { for mapping in self.aux_memories.iter() { if mapping.overlaps(gpa, new_mapping_end) { return Err(VmError::InvalidMapping { base: gpa, size }); } } } let mapping = Mapping::create_shared( u64_to_usize(gpa.0), u64_to_usize(size), ProtFlags::PROT_READ | ProtFlags::PROT_WRITE )?; let used_slots: u32 = self.aux_memories.len().try_into() .map_err(|_| VmError::InvalidMapping { base: gpa, size })?; let next_slot = used_slots.checked_add(1) .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; let region = kvm_userspace_memory_region { slot: next_slot, guest_phys_addr: gpa.0, memory_size: mapping.size.get() as u64, userspace_addr: mapping.addr.as_ptr() as u64, flags: 0, }; 
let set_res = unsafe { self.vm.set_user_memory_region(region) }; set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; self.aux_memories.push(mapping); if new_mapping_end.0 > self.mem_ceiling { self.mem_ceiling = new_mapping_end.0; } Ok(()) } pub fn get_regs(&self) -> Result { self.vcpu.get_regs() .map_err(|e| VmError::from_kvm("get_regs", e)) } pub fn get_sregs(&self) -> Result { self.vcpu.get_sregs() .map_err(|e| VmError::from_kvm("get_sregs", e)) } pub fn get_xcrs(&self) -> Result { self.vcpu.get_xcrs() .map_err(|e| VmError::from_kvm("get_xcrs", e)) } pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { self.vcpu.set_regs(regs) .map_err(|e| VmError::from_kvm("set_regs", e)) } pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> { self.vcpu.set_sregs(sregs) .map_err(|e| VmError::from_kvm("set_sregs", e)) } pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> { self.vcpu.set_xcrs(xcrs) .map_err(|e| VmError::from_kvm("set_xcrs", e)) } pub fn idt_configured(&self) -> bool { self.idt_configured } // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell! pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> { let mut guest_debug = kvm_guest_debug::default(); if active { guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP }; self.vcpu.set_guest_debug(&guest_debug) .map_err(|e| VmError::from_kvm("set_guest_debug", e)) } pub fn run<'vm>(&'vm mut self) -> Result, VmError> { let exit = self.vcpu.run() .map_err(|e| VmError::from_kvm("vcpu run", e))?; match exit { kvm_ioctls::VcpuExit::MmioRead(addr, buf) => { // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above. 
// this means it's a shorter lifetime than `'vm`, but since the resulting lifetime // is also `'vm` it *really* has the effect of disallowing any subsequent use of // `self`. these transmutes decouple the lifetime of `exit` from the lifetime of // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can // drop `exit()` and query the vcpu. // // SAFETY: this actually extends the lifetime of `buf` from the shorter transient // lifetime to `'vm` for the return type. let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; return Ok(VcpuExit::MmioRead { buf, addr }); } kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => { // see the same transmute in `MmioRead` for why this is load-bearing. // // SAFETY: this actually extends the lifetime of `buf` from the shorter transient // lifetime to `'vm` for the return type. let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; return Ok(VcpuExit::MmioWrite { buf, addr }); } kvm_ioctls::VcpuExit::IoIn(port, buf) => { // see the same transmute in `MmioRead` for why this is load-bearing. // // SAFETY: this actually extends the lifetime of `buf` from the shorter transient // lifetime to `'vm` for the return type. let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; return Ok(VcpuExit::IoIn { port, buf }); } kvm_ioctls::VcpuExit::IoOut(port, buf) => { // see the same transmute in `MmioRead` for why this is load-bearing. // // SAFETY: this actually extends the lifetime of `buf` from the shorter transient // lifetime to `'vm` for the return type. 
                let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) };
                return Ok(VcpuExit::IoOut { port, buf });
            }
            kvm_ioctls::VcpuExit::Debug(info) => {
                let pc = info.pc;
                return Ok(VcpuExit::Debug { pc, info });
            }
            kvm_ioctls::VcpuExit::Hlt => {
                let regs = self.get_regs()?;
                // each interrupt handler is a one-byte `hlt` at `intrs_start + vector`, so a
                // hlt inside that window identifies which exception fired.
                if self.idt_configured {
                    let intrs_start = self.interrupt_handlers_start().0;
                    let intrs_end = intrs_start + IDT_ENTRIES as u64;
                    // by the time we've exited the `hlt` of the interrupt handler has completed,
                    // so rip is advanced by one. subtract back out to convert to an exception
                    // vector number.
                    let intr_addr = regs.rip - 1;
                    if intr_addr >= intrs_start && intr_addr < intrs_end {
                        let nr = intr_addr - intrs_start;
                        // because IDT_ENTRIES is 256, this should always be true..
                        assert!(nr < 256);
                        let nr = nr as u8;
                        return Ok(VcpuExit::Exception { nr });
                    }
                }
                Ok(VcpuExit::Hlt)
            }
            kvm_ioctls::VcpuExit::Shutdown => {
                return Ok(VcpuExit::Shutdown);
            }
            other => {
                panic!("unhandled VcpuExit kind: {other:?}");
            }
        }
    }

    /// get a pointer to host memory mapped to guest address `address`.
    ///
    /// panics if `address` is not a guest-physical address backed by host memory.
    pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 {
        let mapping = self.map_containing(address, 0)
            .expect("mapping for address exists");
        unsafe { mapping.host_ptr(address) }
    }

    // fixed layout of the control-structure region (guest-physical addresses):
    //   0x1000  GDT
    //   0x2000  IDT
    //   0x3000  interrupt handler stubs (one byte per vector)
    //   0x10000 page tables (PML4, then PDPT)
    //   end-4k  code placed by `program`
    pub fn gdt_addr(&self) -> GuestAddress {
        GuestAddress(0x1000)
    }

    pub fn idt_addr(&self) -> GuestAddress {
        GuestAddress(0x2000)
    }

    pub fn interrupt_handlers_start(&self) -> GuestAddress {
        GuestAddress(0x3000)
    }

    pub fn page_table_addr(&self) -> GuestAddress {
        GuestAddress(0x10000)
    }

    pub fn code_addr(&self) -> GuestAddress {
        GuestAddress(self.memory.size.get() as u64 - 4096)
    }

    pub fn mem_ceiling(&self) -> u64 {
        self.mem_ceiling
    }

    /// configuring the IDT implies the IDT might be used which means we want a stack pointer
    /// that can have at least 0x18 bytes pushed to it if an interrupt happens.
    pub fn stack_addr(&self) -> GuestAddress {
        // it would be nice to point the stack somewhere that we could get MMIO exits and see the
        // processor push words for the interrupt in real time, but that doesn't ... work.
        // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of,
        // thankfully).
        //
        // so this picks some guest memory lower down.
        // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and
        // clobber the page tables. so leave a bit of space.
        GuestAddress(0x19800)
    }

    /// selector 0x10 is chosen arbitrarily for code.
    pub fn selector_cs(&self) -> u16 {
        0x10
    }

    /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc).
    pub fn selector_ds(&self) -> u16 {
        0x18
    }

    // find the mapping containing `base` (the control region, else an aux region), then check
    // that `size` bytes fit inside it.
    fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> {
        let mapping = if self.memory.contains(base) {
            &mut self.memory
        } else {
            self.aux_memories.iter_mut()
                .find(|map| map.contains(base))?
        };
        if !mapping.check_range(base, size) {
            return None;
        }
        Some(mapping)
    }

    // shared-borrow twin of `map_containing_mut`.
    fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> {
        let mapping = if self.memory.contains(base) {
            &self.memory
        } else {
            self.aux_memories.iter()
                .find(|map| map.contains(base))?
        };
        if !mapping.check_range(base, size) {
            return None;
        }
        Some(mapping)
    }

    /// write all of `data` into guest memory at guest-physical address `addr`.
    ///
    /// panics if `data` extends beyond the end of guest memory.
    pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) {
        let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid");
        // SAFETY: `check_range` above validates the range to copy, and... please do not
        // provide a slice of guest memory as what the guest should be programmed for...
        unsafe {
            std::ptr::copy_nonoverlapping(
                data.as_ptr(),
                mapping.host_ptr(addr),
                data.len()
            );
        }
    }

    /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`.
    ///
    /// panics if `addr + buf.len()` extends beyond the end of guest memory.
    pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) {
        let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid");
        // SAFETY: `check_range` above validates the range to copy, and... please do not
        // provide a slice of guest memory as what should be read into...
        unsafe {
            std::ptr::copy_nonoverlapping(
                mapping.host_ptr(addr) as *const _,
                buf.as_mut_ptr(),
                buf.len()
            );
        }
    }

    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
    /// `size`.
    ///
    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
    pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] {
        let mapping = self.map_containing_mut(addr, size).expect("mapping is valid");
        // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there
        // is no other outstanding slice of guest memory. `map_containing` has already ensured that
        // this mapping contains the whole range `[addr, addr + size)`.
        unsafe { mapping.slice_mut(addr, size) }
    }

    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
    /// `size`.
    ///
    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
    pub fn mem_slice<'vm>(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] {
        let mapping = self.map_containing(addr, size).expect("mapping is valid");
        // SAFETY: we hold a (shared) borrow of the VM, so `run` (which needs `&mut self`)
        // cannot be entered while this slice is live, and any other outstanding slices are
        // also shared, so aliasing them is fine. `map_containing` has already ensured that
        // this mapping contains the whole range `[addr, addr + size)`.
unsafe { mapping.slice(addr, size) } } /// write `code` into guest memory and set `regs.rip` to the address of that code. /// /// the chosen code address is [`Self::code_addr`]. pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) { let addr = self.code_addr(); self.write_mem(addr, code); regs.rip = addr.0; } fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 { // the GDT is set up at addresses 0..64k: // // > 3.5.1 Segment Descriptor Tables // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte // > descriptors. assert!(idx < 4096 / 8); let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8)); let mapping = self.map_containing(addr, std::mem::size_of::() as u64).unwrap(); // SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is // still inside the allocation (`self.memory`). unsafe { mapping.host_ptr(addr) as *mut u64 } } // note this returns a u32, but an IDT is four u32. the u32 this points at is the first of // the four for the entry. fn idt_entry_mut(&mut self, idx: u8) -> *mut u32 { let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16)); let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap(); unsafe { mapping.host_ptr(addr) as *mut u32 } } pub fn page_tables(&self) -> VmPageTables<'_> { let base = self.page_table_addr(); // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of // address space. assert!(self.map_containing(base, 0x2000).is_some()); VmPageTables { vm: self, base, } } // TODO: there should be a version of this that can be used to query "does this VM support // these extensions" probably, and that should take a subset of `Feature` for the ones that are // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable // feature..) 
fn cpuid_supports(&self, feature: Feature) -> bool { fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool { for mut entry in cpuid.as_slice() { if entry.function == leaf && entry.index == index { return f(&mut entry); } } false } match feature { Feature::XSave => { find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { leaf.edx & CPUID_00000001_ECX_XSAVE != 0 }) } Feature::Pdpe1Gb => { find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0 }) } Feature::StateSSE => { find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE }) } Feature::StateAVX => { find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX }) } Feature::StateAVX512 => { find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512 }) } } } /// set `feature` to `wanted` in the VM's CPUID configuration. /// /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not /// available at all). use [`cpuid_supports`] to test if the feature can be configured. fn cpuid_set(&mut self, feature: Feature, wanted: bool) { fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) { for mut entry in cpuid.as_mut_slice() { if entry.function == leaf && entry.index == index { f(&mut entry); return; } } // if we're here, the entry simply is not present (yet..?) // // so, create it. 
let mut entry = kvm_cpuid_entry2 { function: leaf, index: index, eax: 0, ecx: 0, edx: 0, ebx: 0, flags: 0, padding: [0; 3], }; f(&mut entry); cpuid.push(entry).expect("can push"); } fn bit_set(word: &mut u32, bit: u32, wanted: bool) { *word &= !bit; if wanted { *word |= bit; } } let mut edited = false; match feature { Feature::XSave => { edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted); edited = true; }); }, Feature::Pdpe1Gb => { edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted); edited = true; }); }, Feature::StateSSE => { edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { bit_set(&mut leaf.eax, 1, wanted); bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted); edited = true; }); } Feature::StateAVX => { edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted); edited = true; }); } Feature::StateAVX512 => { edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted); edited = true; }); } } assert!(edited); self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid"); } /// configure page tables for identity mapping of all memory from guest address zero up to the /// end of added memory regions, rounded up to the next GiB. /// /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or /// long-mode paging. this is a fixed pattern: if control registers have not been changed since /// `Vm::create` then there will be no change to these control registers and `sregs` can be /// omitted. /// /// panics if the end of added memory regions is above 512 GiB. pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) { // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each. 
assert!(self.mem_ceiling() <= 512 * GB);
// 1GiB mappings require hardware support; make sure the VM advertises it too.
assert!(self.cpuid_supports(Feature::Pdpe1Gb));
self.cpuid_set(Feature::Pdpe1Gb, true);

let pt = self.page_tables();

// single PML4 entry pointing at the lone PDPT; the low flag bits are annotated below.
let pml4_ent =
    1 << 0 |  // P
    1 << 1 |  // RW
    1 << 2 |  // user access allowed. but no user code will run so not strictly needed.
    0 << 3 |  // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
    0 << 4 |  // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
    0 << 5 |  // A
    0 << 6 |  // ignored
    0 << 7 |  // PS (reserved must-be-0)
    0 << 11 | // R (for ordinary paging, ignored; for HLAT ...)
    pt.pdpt_addr().0;
unsafe {
    pt.pml4_mut().write(pml4_ent);
}

let mut mapped: u64 = 0;

// we've set up the first PML4 to point to a PDPT, so we should actually set it up!
let pdpt = pt.pdpt_mut();
// PDPTEs start at the start of PDPT..
let mut pdpte = pdpt;

// flags shared by every 1GiB identity-mapping PDPTE; only the physical address differs.
let entry_bits: u64 =
    1 << 0 |  // P
    1 << 1 |  // RW
    1 << 2 |  // user accesses allowed (everything is under privilege level 0 tho)
    0 << 3 |  // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
    0 << 4 |  // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
    0 << 5 |  // Accessed
    0 << 6 |  // Dirty
    1 << 7 |  // Page size (1 implies 1G page)
    1 << 8 |  // Global (if cr4.pge)
    0 << 9 |
    0 << 10 |
    0 << 11 | // for ordinary paging, ignored. for HLAT, ...
    0 << 12;  // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)

// one PDPTE per GiB: guest-physical `mapped` maps to host offset `mapped` (identity).
while mapped < self.mem_ceiling() {
    let phys_num = mapped >> 30;
    let entry = entry_bits | (phys_num << 30);
    unsafe {
        pdpte.write(entry);
        pdpte = pdpte.offset(1);
    }
    // eprintln!("mapped 1g at {:08x}", mapped);
    mapped += 1 << 30;
}

// optionally flip the control registers into paged long-mode configuration.
if let Some(sregs) = sregs {
    sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
    sregs.cr3 = pt.pml4_addr().0 as u64;
    sregs.cr4 |= 1 << 5; // enable PAE
}
}

unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) {
// we have to set descriptor information directly. this avoids having to load selectors
// as the first instructions on the vCPU, which is simplifying.
// but if we want the
// information in these selectors to match with anything in a GDT (i do!) we'll have to
// keep this initial state lined up with GDT entries ourselves.
//
// we could avoid setting up the GDT for the most part, but anything that might
// legitimately load the "valid" current segment selector would instead clobber the
// selector with zeroes.

// 64-bit code segment: l=1 with db=0; base/limit largely ignored in long mode.
sregs.cs.base = 0;
sregs.cs.limit = 0;
sregs.cs.selector = self.selector_cs();
sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
sregs.cs.present = 1;
sregs.cs.dpl = 0;
sregs.cs.db = 0;
sregs.cs.s = 1;
sregs.cs.l = 1;
sregs.cs.g = 0;
sregs.cs.avl = 0;

// flat writable data segment.
sregs.ds.base = 0;
sregs.ds.limit = 0xffffffff;
sregs.ds.selector = self.selector_ds();
sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
sregs.ds.present = 1;
sregs.ds.dpl = 0;
sregs.ds.db = 0;
sregs.ds.s = 1;
sregs.ds.l = 0;
sregs.ds.g = 0;
sregs.ds.avl = 0;

// all remaining data selectors mirror ds.
sregs.es = sregs.ds;
sregs.fs = sregs.ds;
sregs.gs = sregs.ds;
// linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
sregs.ss = sregs.ds;

sregs.gdt.base = self.gdt_addr().0;
sregs.gdt.limit = 256 * 8 - 1;

// mirror the hidden segment state into real GDT descriptors so later selector loads
// reload the same configuration rather than garbage.
unsafe {
    self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
    self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
}
}

/// write IDT entry `intr_nr` as a long-mode gate targeting `interrupt_handler_cs`:
/// `interrupt_handler_addr`.
fn write_idt_entry(
    &mut self,
    intr_nr: u8,
    interrupt_handler_cs: u16,
    interrupt_handler_addr: GuestAddress
) {
    let idt_ptr = self.idt_entry_mut(intr_nr);

    // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
    // and "trap-gate" descriptors), are described (in the AMD APM) by
    // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode".
    // reproduced here:
    //
    // 3                               2         1 |  1                  0
    // 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
    // |---------------------------------------------------------------|
    // |                           res,ign                             | +12
    // |                     target offset[63:32]                      | +8
    // | target offset[31:16] |P|DPL|0| type | res,ign |      IST      | +4
    // |       target selector        |      target offset[15:0]       | +0
    // |---------------------------------------------------------------|
    //
    // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
    // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
    // works for now.
    let idt_attr_bits = 0b1_00_0_1110_00000_000;
    // attribute bits live in the upper half of the +4 word, next to offset[31:16].
    let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
    let low_lo = (interrupt_handler_cs as u32) << 16 |
        (interrupt_handler_addr.0 as u32 & 0x0000_ffff);
    unsafe {
        idt_ptr.offset(0).write(low_lo);
        idt_ptr.offset(1).write(low_hi);
        idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32);
        idt_ptr.offset(3).write(0); // reserved
    }
}

fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
    sregs.idt.base = self.idt_addr().0;
    sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each
    for i in 0..IDT_ENTRIES {
        let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
        // NOTE(review): the source text appears truncated starting inside this `expect`
        // message: the remainder of `configure_idt` and the signature/doc of the following
        // function (which uses `sregs` and `xcrs`) are missing from this chunk. the text
        // below resumes mid-doc-comment in that later function; recover from version control.
        self.write_idt_entry( i.try_into().expect(" `2.5 CONTROL REGISTERS`
// CR0
const TS: u32 = 3;
// CR4
const OSFXSR: u32 = 9;
const OSXMMEXCPT: u32 = 10;
const OSXSAVE: u32 = 18;
// XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)")
// these bits are the same as in cpuid leaf 0xd.eax
const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64;
const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64;
const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64;

// operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be
// quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system
// (which makes SSE a non-optional baseline!)
// // the Intel SDM implies this through somewhat tortured language in the section // "Checking for Intel® SSE and SSE2 Support": // > If an operating system did not provide adequate system level support for Intel // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD. // // to fully understand this statement, realize that `an operating system .. provide[s] // adequate system level support" by setting CR4.OSFXSR, // // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment // // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even // switch tasks on the vCPU. sregs.cr4 |= 1 << OSFXSR; // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE. // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS" // draws a fairly direct connection: // // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state // > features and their state components as if all bits in XCR0 were clear; the state // > components cannot be modified and the features’ instructions cannot be executed. // // but the consequence is contradicted by the next paragraph, // // > Processors allow modification of this state, as well as execution of x87 FPU // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and // > XCR0. // // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well. 
sregs.cr4 |= 1 << OSXSAVE;

// SSE3, SSSE3, and SSE4 involve a bit extra:
// > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor
// > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1
sregs.cr0 &= !(1 << TS);

// > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating
// > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM).
//
// this is somewhat better than just getting an uncategorized #UD.
sregs.cr4 |= 1 << OSXMMEXCPT;

// slot 0 of the provided xcrs must describe xcr0 itself; we edit it in place below.
assert!(xcrs.nr_xcrs > 0);
assert_eq!(xcrs.xcrs[0].xcr, 0);

// enable each supported state component in both CPUID and xcr0; any of them being enabled
// requires xsave support overall.
let mut needs_xsave = false;
if self.cpuid_supports(Feature::StateSSE) {
    self.cpuid_set(Feature::StateSSE, true);
    // NOTE(review): the bare `|= 1` sets xcr0 bit 0 (x87 state) — presumably because that
    // bit is required to be set whenever xcr0 is written; confirm against the SDM.
    xcrs.xcrs[0].value |= 1;
    xcrs.xcrs[0].value |= XCR0_SSE;
    needs_xsave = true;
}
if self.cpuid_supports(Feature::StateAVX) {
    self.cpuid_set(Feature::StateAVX, true);
    xcrs.xcrs[0].value |= XCR0_AVX;
    needs_xsave = true;
}
if self.cpuid_supports(Feature::StateAVX512) {
    self.cpuid_set(Feature::StateAVX512, true);
    xcrs.xcrs[0].value |= XCR0_AVX512;
    needs_xsave = true;
}
if needs_xsave {
    if self.cpuid_supports(Feature::XSave) {
        self.cpuid_set(Feature::XSave, true);
    } else {
        panic!(
            "look, there's no CPU that supports SSE but not xsave. \
            i only checked to be thorough.");
    }
}
}
}