diff options
| author | iximeow <me@iximeow.net> | 2026-04-21 17:22:40 +0000 |
|---|---|---|
| committer | iximeow <me@iximeow.net> | 2026-04-21 17:22:44 +0000 |
| commit | 254ef83f988265546efd16d584ab8de162fcb045 (patch) | |
| tree | 3b73320f5b798fd20896d0e82b8d82cad510fedc /src/x86_64.rs | |
| parent | 9bd5202201302bc20dd671d515e74b090eda95cc (diff) | |
configure minimal guest CPUID, xcr0
this checks for 1GB page support, and enables the SSE, AVX, and AVX512 bits
as well as the corresponding xcr0 bits. it does not set up AMX, MPK, etc.
Diffstat (limited to 'src/x86_64.rs')
| -rw-r--r-- | src/x86_64.rs | 451 |
1 file changed, 443 insertions, 8 deletions
diff --git a/src/x86_64.rs b/src/x86_64.rs index da46380..d02fcf0 100644 --- a/src/x86_64.rs +++ b/src/x86_64.rs @@ -5,11 +5,12 @@ use nix::sys::mman::{MapFlags, ProtFlags}; use kvm_ioctls::{Kvm, VcpuFd, VmFd}; use kvm_bindings::{ - kvm_guest_debug, kvm_userspace_memory_region, kvm_segment, - KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, + kvm_cpuid_entry2, kvm_guest_debug, + kvm_userspace_memory_region, kvm_segment, CpuId, + KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_MAX_CPUID_ENTRIES, }; -pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_debug_exit_arch}; +pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_xcrs, kvm_debug_exit_arch}; const _TARGET_IS_64BIT: () = { assert!(core::mem::size_of::<u64>() == core::mem::size_of::<usize>(), "asmlinator only supports 64-bit targets"); @@ -40,12 +41,51 @@ fn u64_to_usize(x: u64) -> usize { pub struct Vm { vm: VmFd, vcpu: VcpuFd, + supported_cpuid: CpuId, + current_cpuid: CpuId, idt_configured: bool, mem_ceiling: u64, memory: Mapping, aux_memories: Vec<Mapping>, } +/// broad categories of cpuid/cpu features that should be detectable or configurable as part of +/// setting up a VM. this is split out for legibility, but also because in theory these (especially +/// ISA extensions) features probably should be configurable by library users somehow.. +/// +/// not yet sure, so this is not pub. +#[derive(Copy, Clone, Debug)] +enum Feature { + /// support for the xsave/xrstor instructions and at least xcr0. + /// + /// cpuid leaf eax=0x0000_0001 bit ecx[26], see APM + /// chapter "Obtaining Processor Information Via the CPUID Instruction", + /// section "Standard Feature Function Numbers". + XSave, + /// support for 1GB page mappings. cpuid leaf eax=0x8000_0001 bit edx[26]. + Pdpe1Gb, + /// support for the XSAVE SSE region. this correponds to the bit in CPUID leaf D and + /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with xmm + /// state will #UD. 
+ StateSSE, + /// support for the XSAVE AVX region. this correponds to the bit in CPUID leaf D and + /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with ymm + /// state will #UD. + StateAVX, + /// support for the XSAVE AVX512 regions. this correponds to the bits for K, ZMM_Hi256, and + /// Hi16_ZMM in CPUID leaf D and corresponding bits in xcr0. if these bits are not set, + /// attempts to use instructions with zmm state may #UD. + StateAVX512, +} + +const CPUID_00000001_ECX_XSAVE: u32 = 1 << 26; + +const CPUID_0000000D_EAX_SSE: u32 = 1 << 1; +const CPUID_0000000D_EAX_AVX: u32 = 1 << 2; +const CPUID_0000000D_EAX_AVX512: u32 = (1 << 5) | (1 << 6) | (1 << 7); + +const CPUID_80000001_EDX_PDPE1GB: u32 = 1 << 26; + #[derive(PartialEq)] pub enum VcpuExit<'buf> { MmioRead { addr: u64, buf: &'buf mut [u8] }, @@ -387,13 +427,159 @@ fn test_xor_runs() { let res = vm.run().expect("can run vm"); - let rip_after = rip_before + 2; - assert!(matches!(res, VcpuExit::Debug { pc: rip_after, .. })); + let expected_rip = rip_before + 2; + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; let regs_after = vm.get_regs().expect("can get regs"); assert_eq!(regs_after.rax, 0); } +#[test] +fn test_xorps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0x0f, 0x57, 0xc0], &mut regs); + + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 3; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. 
} => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + +#[test] +fn test_vex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX) { + panic!("host CPU does not support AVX"); + } + + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs); + + regs.rbx = regs.rip; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 4; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + +#[test] +fn test_evex_vandps_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + + if !vm.cpuid_supports(Feature::StateAVX512) { + panic!("host CPU does not support AVX512"); + } + + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs); + + regs.rbx = regs.rip; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let expected_rip = rip_before + 6; + eprintln!("exit: {:?}", res); + match res { + VcpuExit::Debug { pc: rip_after, .. } => { + assert_eq!(expected_rip, rip_after); + } + other => { + panic!("unexpected exit: {:?}", other); + } + }; +} + + +// this function will sit and loop in the kernel after trying to fulfill the MMIO exit. +// +// not great! don't do that! it's responsive to EINTR at least. 
+// #[test] +#[allow(dead_code)] +fn kvm_hugepage_bug() { + let mut vm = Vm::create(1024 * 1024).expect("can create vm"); + vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region"); + unsafe { + vm.configure_identity_paging(None); + } + + // `add [rsp], al; add [rcx], al; pop [rcx]; hlt` + // the first instruction runs fine. the second instruction runs fine. + // the third instruction gets a page fault at 0xf800? which worked fine for the add. + // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with + // huge pages. + let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4]; + let mut regs = vm.get_regs().unwrap(); + regs.rax = 0x00000002_00100000; + regs.rcx = 0x00000002_00100000; + vm.program(inst, &mut regs); + vm.set_regs(®s).unwrap(); + vm.set_single_step(true).expect("can enable single-step"); + vm.run().expect("can run vm"); + + let vm_regs = vm.get_regs().unwrap(); + let vm_sregs = vm.get_sregs().unwrap(); + let mut prev_rip = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]); + let mut buf = [0u8; 8]; + vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]); + eprintln!( + "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})", + u64::from_le_bytes(buf), vm_sregs.cr2, + u64::from_le_bytes(prev_rip), vm_sregs.cr3 + ); + if vm_regs.rip == 0x300f { + let mut pdpt = [0u8; 4096]; + vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]); + eprintln!("pdpt: {:x?}", &pdpt[..8]); + } + panic!("no"); +} + impl Vm { pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> { let kvm = Kvm::new() @@ -402,6 +588,8 @@ impl Vm { let vm = kvm.create_vm() .map_err(|e| VmError::from_kvm("craete_vm", e))?; + let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); + // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do if mem_size < 128 * 1024 { return Err(VmCreateError::TooSmall { @@ -426,11 
+614,15 @@ impl Vm { let vcpu_res = vm.create_vcpu(0); let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; + let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap(); + let mem_ceiling = mapping.size.get().try_into().unwrap(); let mut this = Vm { vm, vcpu, + supported_cpuid, + current_cpuid, idt_configured: false, memory: mapping, aux_memories: Vec::new(), @@ -444,6 +636,9 @@ impl Vm { this.configure_identity_paging(Some(&mut vcpu_sregs)); this.configure_selectors(&mut vcpu_sregs); this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); + let mut xcrs = this.get_xcrs()?; + this.configure_extensions(&mut vcpu_sregs, &mut xcrs); + this.set_xcrs(&xcrs)?; } vcpu_sregs.efer = 0x0000_0500; // LME | LMA @@ -514,6 +709,11 @@ impl Vm { .map_err(|e| VmError::from_kvm("get_sregs", e)) } + pub fn get_xcrs(&self) -> Result<kvm_xcrs, VmError> { + self.vcpu.get_xcrs() + .map_err(|e| VmError::from_kvm("get_xcrs", e)) + } + pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { self.vcpu.set_regs(regs) .map_err(|e| VmError::from_kvm("set_regs", e)) @@ -524,6 +724,11 @@ impl Vm { .map_err(|e| VmError::from_kvm("set_sregs", e)) } + pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> { + self.vcpu.set_xcrs(xcrs) + .map_err(|e| VmError::from_kvm("set_xcrs", e)) + } + pub fn idt_configured(&self) -> bool { self.idt_configured } @@ -830,6 +1035,128 @@ impl Vm { } } + // TODO: there should be a version of this that can be used to query "does this VM support + // these extensions" probably, and that should take a subset of `Feature` for the ones that are + // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable + // feature..) 
+ fn cpuid_supports(&self, feature: Feature) -> bool { + fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool { + for mut entry in cpuid.as_slice() { + if entry.function == leaf && entry.index == index { + return f(&mut entry); + } + } + + false + } + + match feature { + Feature::XSave => { + find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| { + leaf.edx & CPUID_00000001_ECX_XSAVE != 0 + }) + } + Feature::Pdpe1Gb => { + find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| { + leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0 + }) + } + Feature::StateSSE => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE + }) + } + Feature::StateAVX => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX + }) + } + Feature::StateAVX512 => { + find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| { + leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512 + }) + } + } + } + + /// set `feature` to `wanted` in the VM's CPUID configuration. + /// + /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not + /// available at all). use [`cpuid_supports`] to test if the feature can be configured. + fn cpuid_set(&mut self, feature: Feature, wanted: bool) { + fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) { + for mut entry in cpuid.as_mut_slice() { + if entry.function == leaf && entry.index == index { + f(&mut entry); + return; + } + } + + // if we're here, the entry simply is not present (yet..?) + // + // so, create it. 
+ let mut entry = kvm_cpuid_entry2 { + function: leaf, + index: index, + eax: 0, + ecx: 0, + edx: 0, + ebx: 0, + flags: 0, + padding: [0; 3], + }; + f(&mut entry); + cpuid.push(entry).expect("can push"); + } + + fn bit_set(word: &mut u32, bit: u32, wanted: bool) { + *word &= !bit; + if wanted { + *word |= bit; + } + } + + let mut edited = false; + + match feature { + Feature::XSave => { + edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| { + bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted); + edited = true; + }); + }, + Feature::Pdpe1Gb => { + edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| { + bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted); + edited = true; + }); + }, + Feature::StateSSE => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, 1, wanted); + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted); + edited = true; + }); + } + Feature::StateAVX => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted); + edited = true; + }); + } + Feature::StateAVX512 => { + edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| { + bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted); + edited = true; + }); + } + } + + assert!(edited); + + self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid"); + } + /// configure page tables for identity mapping of all memory from guest address zero up to the /// end of added memory regions, rounded up to the next GiB. /// @@ -840,12 +1167,13 @@ impl Vm { /// /// panics if the end of added memory regions is above 512 GiB. pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) { - let pt = self.page_tables(); - // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each. 
assert!(self.mem_ceiling() <= 512 * GB); - // TODO: expects 1G page support + assert!(self.cpuid_supports(Feature::Pdpe1Gb)); + self.cpuid_set(Feature::Pdpe1Gb, true); + + let pt = self.page_tables(); let pml4_ent = 1 << 0 | // P @@ -1020,4 +1348,111 @@ impl Vm { regs.rsp = self.stack_addr().0; self.idt_configured = true; } + + /// configure the vCPU for executing instructions in the hardware-supported extensions. + /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed, + /// unless additional configuration is done (as this function does). + /// + /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point + /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all? + /// + /// this function configures the vCPU to be ready to execute `SSE*` instructions. + fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) { + // these bit positions in control registers, and their behaviors, are described more + // comprehensively in Voluem 3, + // > `2.5 CONTROL REGISTERS` + + // CR0 + const TS: u32 = 3; + // CR4 + const OSFXSR: u32 = 9; + const OSXMMEXCPT: u32 = 10; + const OSXSAVE: u32 = 18; + + // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)") + // these bits are the same as in cpuid leaf 0xd.eax + const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64; + const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64; + const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64; + + // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be + // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system + // (which makes SSE a non-optional baseline!) 
+ // + // the Intel SDM implies this through somewhat tortured language in the section + // "Checking for Intel® SSE and SSE2 Support": + // > If an operating system did not provide adequate system level support for Intel + // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD. + // + // to fully understand this statement, realize that `an operating system .. provide[s] + // adequate system level support" by setting CR4.OSFXSR, + // + // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating + // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment + // + // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to + // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even + // switch tasks on the vCPU. + sregs.cr4 |= 1 << OSFXSR; + + // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE. + // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS" + // draws a fairly direct connection: + // + // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if + // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state + // > features and their state components as if all bits in XCR0 were clear; the state + // > components cannot be modified and the features’ instructions cannot be executed. + // + // but the consequence is contradicted by the next paragraph, + // + // > Processors allow modification of this state, as well as execution of x87 FPU + // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and + // > XCR0. + // + // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well. 
+ sregs.cr4 |= 1 << OSXSAVE; + + // SSE3, SSSE3, and SSE4 involve a bit extra: + // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor + // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1 + sregs.cr0 &= !(1 << TS); + + // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating + // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM). + // + // this is somewhat better than just getting an uncategorized #UD. + sregs.cr4 |= 1 << OSXMMEXCPT; + + assert!(xcrs.nr_xcrs > 0); + assert_eq!(xcrs.xcrs[0].xcr, 0); + + let mut needs_xsave = false; + if self.cpuid_supports(Feature::StateSSE) { + self.cpuid_set(Feature::StateSSE, true); + xcrs.xcrs[0].value |= 1; + xcrs.xcrs[0].value |= XCR0_SSE; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX) { + self.cpuid_set(Feature::StateAVX, true); + xcrs.xcrs[0].value |= XCR0_AVX; + needs_xsave = true; + } + if self.cpuid_supports(Feature::StateAVX512) { + self.cpuid_set(Feature::StateAVX512, true); + xcrs.xcrs[0].value |= XCR0_AVX512; + needs_xsave = true; + } + + if needs_xsave { + if self.cpuid_supports(Feature::XSave) { + self.cpuid_set(Feature::XSave, true); + } else { + panic!( + "look, there's no CPU that supports SSE but not xsave. \ + i only checked to be thorough."); + } + } + } } |
