diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/x86_64.rs | 451 |
1 file changed, 443 insertions, 8 deletions
diff --git a/src/x86_64.rs b/src/x86_64.rs index da46380..d02fcf0 100644 --- a/src/x86_64.rs +++ b/src/x86_64.rs @@ -5,11 +5,12 @@ use nix::sys::mman::{MapFlags, ProtFlags}; use kvm_ioctls::{Kvm, VcpuFd, VmFd}; use kvm_bindings::{ - kvm_guest_debug, kvm_userspace_memory_region, kvm_segment, - KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, + kvm_cpuid_entry2, kvm_guest_debug, + kvm_userspace_memory_region, kvm_segment, CpuId, + KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_MAX_CPUID_ENTRIES, }; -pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_debug_exit_arch}; +pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_xcrs, kvm_debug_exit_arch}; const _TARGET_IS_64BIT: () = { assert!(core::mem::size_of::<u64>() == core::mem::size_of::<usize>(), "asmlinator only supports 64-bit targets"); @@ -40,12 +41,51 @@ fn u64_to_usize(x: u64) -> usize { pub struct Vm { vm: VmFd, vcpu: VcpuFd, + supported_cpuid: CpuId, + current_cpuid: CpuId, idt_configured: bool, mem_ceiling: u64, memory: Mapping, aux_memories: Vec<Mapping>, } +/// broad categories of cpuid/cpu features that should be detectable or configurable as part of +/// setting up a VM. this is split out for legibility, but also because in theory these (especially +/// ISA extensions) features probably should be configurable by library users somehow.. +/// +/// not yet sure, so this is not pub. +#[derive(Copy, Clone, Debug)] +enum Feature { + /// support for the xsave/xrstor instructions and at least xcr0. + /// + /// cpuid leaf eax=0x0000_0001 bit ecx[26], see APM + /// chapter "Obtaining Processor Information Via the CPUID Instruction", + /// section "Standard Feature Function Numbers". + XSave, + /// support for 1GB page mappings. cpuid leaf eax=0x8000_0001 bit edx[26]. + Pdpe1Gb, + /// support for the XSAVE SSE region. this correponds to the bit in CPUID leaf D and + /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with xmm + /// state will #UD. 
+ StateSSE, + /// support for the XSAVE AVX region. this correponds to the bit in CPUID leaf D and + /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with ymm + /// state will #UD. + StateAVX, + /// support for the XSAVE AVX512 regions. this correponds to the bits for K, ZMM_Hi256, and + /// Hi16_ZMM in CPUID leaf D and corresponding bits in xcr0. if these bits are not set, + /// attempts to use instructions with zmm state may #UD. + StateAVX512, +} + +const CPUID_00000001_ECX_XSAVE: u32 = 1 << 26; + +const CPUID_0000000D_EAX_SSE: u32 = 1 << 1; +const CPUID_0000000D_EAX_AVX: u32 = 1 << 2; +const CPUID_0000000D_EAX_AVX512: u32 = (1 << 5) | (1 << 6) | (1 << 7); + +const CPUID_80000001_EDX_PDPE1GB: u32 = 1 << 26; + #[derive(PartialEq)] pub enum VcpuExit<'buf> { MmioRead { addr: u64, buf: &'buf mut [u8] }, @@ -387,13 +427,159 @@ fn test_xor_runs() { let res = vm.run().expect("can run vm"); - let rip_after = rip_before + 2; - assert!(matches!(res, VcpuExit::Debug { pc: rip_after, .. })); + let expected_rip = rip_before + 2; + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; let regs_after = vm.get_regs().expect("can get regs"); assert_eq!(regs_after.rax, 0); } +#[test] +fn test_xorps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0x0f, 0x57, 0xc0], &mut regs); + + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 3; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. 
} => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + +#[test] +fn test_vex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX) { + panic!("host CPU does not support AVX"); + } + + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); + + regs.rbx = regs.rip; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 4; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + +#[test] +fn test_evex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX512) { + panic!("host CPU does not support AVX512"); + } + + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs); + + regs.rbx = regs.rip; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 6; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + + +// this function will sit and loop in the kernel after trying to fulfill the MMIO exit. +// +// not great! don't do that! it's responsive to EINTR at least. 
+// #[test] +#[allow(dead_code)] +fn kvm_hugepage_bug() { + let mut vm = Vm::create(1024 * 1024).expect("can create vm"); + vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region"); + unsafe { + vm.configure_identity_paging(None); + } + + // `add [rsp], al; add [rcx], al; pop [rcx]; hlt` + // the first instruction runs fine. the second instruction runs fine. + // the third instruction gets a page fault at 0xf800? which worked fine for the add. + // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with + // huge pages. + let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4]; + let mut regs = vm.get_regs().unwrap(); + regs.rax = 0x00000002_00100000; + regs.rcx = 0x00000002_00100000; + vm.program(inst, &mut regs); + vm.set_regs(®s).unwrap(); + vm.set_single_step(true).expect("can enable single-step"); + vm.run().expect("can run vm"); + + let vm_regs = vm.get_regs().unwrap(); + let vm_sregs = vm.get_sregs().unwrap(); + let mut prev_rip = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); + let mut buf = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); + eprintln!( + "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})", + u64::from_le_bytes(buf), vm_sregs.cr2, + u64::from_le_bytes(prev_rip), vm_sregs.cr3 + ); + if vm_regs.rip == 0x300f { + let mut pdpt = [0u8; 4096]; + vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]); + eprintln!("pdpt: {:x?}", &pdpt[..8]); + } + panic!("no"); +} + impl Vm { pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> { let kvm = Kvm::new() @@ -402,6 +588,8 @@ impl Vm { let vm = kvm.create_vm() .map_err(|e| VmError::from_kvm("craete_vm", e))?; + let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); + // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do if mem_size < 128 * 1024 { return Err(VmCreateError::TooSmall { @@ -426,11 
+614,15 @@ impl Vm { let vcpu_res = vm.create_vcpu(0); let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; + let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap(); + let mem_ceiling = mapping.size.get().try_into().unwrap(); let mut this = Vm { vm, vcpu, + supported_cpuid, + current_cpuid, idt_configured: false, memory: mapping, aux_memories: Vec::new(), @@ -444,6 +636,9 @@ impl Vm { this.configure_identity_paging(Some(&mut vcpu_sregs)); this.configure_selectors(&mut vcpu_sregs); this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); + let mut xcrs = this.get_xcrs()?; + this.configure_extensions(&mut vcpu_sregs, &mut xcrs); + this.set_xcrs(&xcrs)?; } vcpu_sregs.efer = 0x0000_0500; // LME | LMA @@ -514,6 +709,11 @@ impl Vm { .map_err(|e| VmError::from_kvm("get_sregs", e)) } + pub fn get_xcrs(&self) -> Result<kvm_xcrs, VmError> { + self.vcpu.get_xcrs() + .map_err(|e| VmError::from_kvm("get_xcrs", e)) + } + pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { self.vcpu.set_regs(regs) .map_err(|e| VmError::from_kvm("set_regs", e)) @@ -524,6 +724,11 @@ impl Vm { .map_err(|e| VmError::from_kvm("set_sregs", e)) } + pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> { + self.vcpu.set_xcrs(xcrs) + .map_err(|e| VmError::from_kvm("set_xcrs", e)) + } + pub fn idt_configured(&self) -> bool { self.idt_configured } @@ -830,6 +1035,128 @@ impl Vm { } } + // TODO: there should be a version of this that can be used to query "does this VM support + // these extensions" probably, and that should take a subset of `Feature` for the ones that are + // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable + // feature..) 
+ fn cpuid_supports(&self, feature: Feature) -> bool { + fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool { + for mut entry in cpuid.as_slice() { + if entry.function == leaf && entry.index == index { + return f(&mut entry); + } + } + + false + } + + match feature { + Feature::XSave => { + find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { + leaf.edx & CPUID_00000001_ECX_XSAVE != 0 + }) + } + Feature::Pdpe1Gb => { + find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { + leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0 + }) + } + Feature::StateSSE => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE + }) + } + Feature::StateAVX => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX + }) + } + Feature::StateAVX512 => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512 + }) + } + } + } + + /// set `feature` to `wanted` in the VM's CPUID configuration. + /// + /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not + /// available at all). use [`cpuid_supports`] to test if the feature can be configured. + fn cpuid_set(&mut self, feature: Feature, wanted: bool) { + fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) { + for mut entry in cpuid.as_mut_slice() { + if entry.function == leaf && entry.index == index { + f(&mut entry); + return; + } + } + + // if we're here, the entry simply is not present (yet..?) + // + // so, create it. 
+ let mut entry = kvm_cpuid_entry2 { + function: leaf, + index: index, + eax: 0, + ecx: 0, + edx: 0, + ebx: 0, + flags: 0, + padding: [0; 3], + }; + f(&mut entry); + cpuid.push(entry).expect("can push"); + } + + fn bit_set(word: &mut u32, bit: u32, wanted: bool) { + *word &= !bit; + if wanted { + *word |= bit; + } + } + + let mut edited = false; + + match feature { + Feature::XSave => { + edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { + bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted); + edited = true; + }); + }, + Feature::Pdpe1Gb => { + edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted); + edited = true; + }); + }, + Feature::StateSSE => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, 1, wanted); + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted); + edited = true; + }); + } + Feature::StateAVX => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted); + edited = true; + }); + } + Feature::StateAVX512 => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted); + edited = true; + }); + } + } + + assert!(edited); + + self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid"); + } + /// configure page tables for identity mapping of all memory from guest address zero up to the /// end of added memory regions, rounded up to the next GiB. /// @@ -840,12 +1167,13 @@ impl Vm { /// /// panics if the end of added memory regions is above 512 GiB. pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) { - let pt = self.page_tables(); - // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each. 
assert!(self.mem_ceiling() <= 512 * GB); - // TODO: expects 1G page support + assert!(self.cpuid_supports(Feature::Pdpe1Gb)); + self.cpuid_set(Feature::Pdpe1Gb, true); + + let pt = self.page_tables(); let pml4_ent = 1 << 0 | // P @@ -1020,4 +1348,111 @@ impl Vm { regs.rsp = self.stack_addr().0; self.idt_configured = true; } + + /// configure the vCPU for executing instructions in the hardware-supported extensions. + /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed, + /// unless additional configuration is done (as this function does). + /// + /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point + /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all? + /// + /// this function configures the vCPU to be ready to execute `SSE*` instructions. + fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) { + // these bit positions in control registers, and their behaviors, are described more + // comprehensively in Voluem 3, + // > `2.5 CONTROL REGISTERS` + + // CR0 + const TS: u32 = 3; + // CR4 + const OSFXSR: u32 = 9; + const OSXMMEXCPT: u32 = 10; + const OSXSAVE: u32 = 18; + + // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)") + // these bits are the same as in cpuid leaf 0xd.eax + const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64; + const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64; + const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64; + + // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be + // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system + // (which makes SSE a non-optional baseline!) 
+ // + // the Intel SDM implies this through somewhat tortured language in the section + // "Checking for Intel® SSE and SSE2 Support": + // > If an operating system did not provide adequate system level support for Intel + // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD. + // + // to fully understand this statement, realize that `an operating system .. provide[s] + // adequate system level support" by setting CR4.OSFXSR, + // + // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating + // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment + // + // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to + // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even + // switch tasks on the vCPU. + sregs.cr4 |= 1 << OSFXSR; + + // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE. + // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS" + // draws a fairly direct connection: + // + // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if + // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state + // > features and their state components as if all bits in XCR0 were clear; the state + // > components cannot be modified and the features’ instructions cannot be executed. + // + // but the consequence is contradicted by the next paragraph, + // + // > Processors allow modification of this state, as well as execution of x87 FPU + // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and + // > XCR0. + // + // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well. 
+ sregs.cr4 |= 1 << OSXSAVE; + + // SSE3, SSSE3, and SSE4 involve a bit extra: + // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor + // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1 + sregs.cr0 &= !(1 << TS); + + // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating + // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM). + // + // this is somewhat better than just getting an uncategorized #UD. + sregs.cr4 |= 1 << OSXMMEXCPT; + + assert!(xcrs.nr_xcrs > 0); + assert_eq!(xcrs.xcrs[0].xcr, 0); + + let mut needs_xsave = false; + if self.cpuid_supports(Feature::StateSSE) { + self.cpuid_set(Feature::StateSSE, true); + xcrs.xcrs[0].value |= 1; + xcrs.xcrs[0].value |= XCR0_SSE; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX) { + self.cpuid_set(Feature::StateAVX, true); + xcrs.xcrs[0].value |= XCR0_AVX; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX512) { + self.cpuid_set(Feature::StateAVX512, true); + xcrs.xcrs[0].value |= XCR0_AVX512; + needs_xsave = true; + } + + if needs_xsave { + if self.cpuid_supports(Feature::XSave) { + self.cpuid_set(Feature::XSave, true); + } else { + panic!( + "look, there's no CPU that supports SSE but not xsave. \ + i only checked to be thorough."); + } + } + } } |
