aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/x86_64.rs451
1 files changed, 443 insertions, 8 deletions
diff --git a/src/x86_64.rs b/src/x86_64.rs
index da46380..d02fcf0 100644
--- a/src/x86_64.rs
+++ b/src/x86_64.rs
@@ -5,11 +5,12 @@ use nix::sys::mman::{MapFlags, ProtFlags};
use kvm_ioctls::{Kvm, VcpuFd, VmFd};
use kvm_bindings::{
- kvm_guest_debug, kvm_userspace_memory_region, kvm_segment,
- KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP,
+ kvm_cpuid_entry2, kvm_guest_debug,
+ kvm_userspace_memory_region, kvm_segment, CpuId,
+ KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_MAX_CPUID_ENTRIES,
};
-pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_debug_exit_arch};
+pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_xcrs, kvm_debug_exit_arch};
const _TARGET_IS_64BIT: () = {
assert!(core::mem::size_of::<u64>() == core::mem::size_of::<usize>(), "asmlinator only supports 64-bit targets");
@@ -40,12 +41,51 @@ fn u64_to_usize(x: u64) -> usize {
pub struct Vm {
vm: VmFd,
vcpu: VcpuFd,
+ supported_cpuid: CpuId,
+ current_cpuid: CpuId,
idt_configured: bool,
mem_ceiling: u64,
memory: Mapping,
aux_memories: Vec<Mapping>,
}
+/// broad categories of cpuid/cpu features that should be detectable or configurable as part of
+/// setting up a VM. this is split out for legibility, but also because in theory these features
+/// (especially ISA extensions) probably should be configurable by library users somehow..
+///
+/// not yet sure, so this is not pub.
+#[derive(Copy, Clone, Debug)]
+enum Feature {
+ /// support for the xsave/xrstor instructions and at least xcr0.
+ ///
+ /// cpuid leaf eax=0x0000_0001 bit ecx[26], see APM
+ /// chapter "Obtaining Processor Information Via the CPUID Instruction",
+ /// section "Standard Feature Function Numbers".
+ XSave,
+ /// support for 1GB page mappings. cpuid leaf eax=0x8000_0001 bit edx[26].
+ Pdpe1Gb,
+ /// support for the XSAVE SSE region. this corresponds to the bit in CPUID leaf D and
+ /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with xmm
+ /// state will #UD.
+ StateSSE,
+ /// support for the XSAVE AVX region. this corresponds to the bit in CPUID leaf D and
+ /// corresponding bit in xcr0. if this bit is unset, attempts to use instructions with ymm
+ /// state will #UD.
+ StateAVX,
+ /// support for the XSAVE AVX512 regions. this corresponds to the bits for K, ZMM_Hi256, and
+ /// Hi16_ZMM in CPUID leaf D and corresponding bits in xcr0. if these bits are not set,
+ /// attempts to use instructions with zmm state may #UD.
+ StateAVX512,
+}
+
+const CPUID_00000001_ECX_XSAVE: u32 = 1 << 26;
+
+const CPUID_0000000D_EAX_SSE: u32 = 1 << 1;
+const CPUID_0000000D_EAX_AVX: u32 = 1 << 2;
+const CPUID_0000000D_EAX_AVX512: u32 = (1 << 5) | (1 << 6) | (1 << 7);
+
+const CPUID_80000001_EDX_PDPE1GB: u32 = 1 << 26;
+
#[derive(PartialEq)]
pub enum VcpuExit<'buf> {
MmioRead { addr: u64, buf: &'buf mut [u8] },
@@ -387,13 +427,159 @@ fn test_xor_runs() {
let res = vm.run().expect("can run vm");
- let rip_after = rip_before + 2;
- assert!(matches!(res, VcpuExit::Debug { pc: rip_after, .. }));
+ let expected_rip = rip_before + 2;
+ match res {
+ VcpuExit::Debug { pc: rip_after, .. } => {
+ assert_eq!(expected_rip, rip_after);
+ }
+ other => {
+ panic!("unexpected exit: {:?}", other);
+ }
+ };
let regs_after = vm.get_regs().expect("can get regs");
assert_eq!(regs_after.rax, 0);
}
+#[test]
+fn test_xorps_runs() {
+ let mut vm = Vm::create(128 * 1024).expect("can create vm");
+ let mut regs = vm.get_regs().expect("can get regs");
+
+ vm.program(&[0x0f, 0x57, 0xc0], &mut regs);
+
+ let rip_before = regs.rip;
+
+ vm.set_regs(&regs).expect("can set regs");
+
+ vm.set_single_step(true).expect("can set single-step");
+
+ let res = vm.run().expect("can run vm");
+
+ let expected_rip = rip_before + 3;
+ eprintln!("exit: {:?}", res);
+ match res {
+ VcpuExit::Debug { pc: rip_after, .. } => {
+ assert_eq!(expected_rip, rip_after);
+ }
+ other => {
+ panic!("unexpected exit: {:?}", other);
+ }
+ };
+}
+
+#[test]
+fn test_vex_vandps_runs() {
+ let mut vm = Vm::create(128 * 1024).expect("can create vm");
+
+ if !vm.cpuid_supports(Feature::StateAVX) {
+ panic!("host CPU does not support AVX");
+ }
+
+ let mut regs = vm.get_regs().expect("can get regs");
+
+ vm.program(&[0xc5, 0xe0, 0x54, 0x03], &mut regs);
+
+ regs.rbx = regs.rip;
+ let rip_before = regs.rip;
+
+ vm.set_regs(&regs).expect("can set regs");
+
+ vm.set_single_step(true).expect("can set single-step");
+
+ let res = vm.run().expect("can run vm");
+
+ let expected_rip = rip_before + 4;
+ eprintln!("exit: {:?}", res);
+ match res {
+ VcpuExit::Debug { pc: rip_after, .. } => {
+ assert_eq!(expected_rip, rip_after);
+ }
+ other => {
+ panic!("unexpected exit: {:?}", other);
+ }
+ };
+}
+
+#[test]
+fn test_evex_vandps_runs() {
+ let mut vm = Vm::create(128 * 1024).expect("can create vm");
+
+ if !vm.cpuid_supports(Feature::StateAVX512) {
+ panic!("host CPU does not support AVX512");
+ }
+
+ let mut regs = vm.get_regs().expect("can get regs");
+
+ vm.program(&[0x62, 0xf1, 0x7c, 0xbd, 0x54, 0x0a], &mut regs);
+
+ regs.rbx = regs.rip;
+ let rip_before = regs.rip;
+
+ vm.set_regs(&regs).expect("can set regs");
+
+ vm.set_single_step(true).expect("can set single-step");
+
+ let res = vm.run().expect("can run vm");
+
+ let expected_rip = rip_before + 6;
+ eprintln!("exit: {:?}", res);
+ match res {
+ VcpuExit::Debug { pc: rip_after, .. } => {
+ assert_eq!(expected_rip, rip_after);
+ }
+ other => {
+ panic!("unexpected exit: {:?}", other);
+ }
+ };
+}
+
+
+// this function will sit and loop in the kernel after trying to fulfill the MMIO exit.
+//
+// not great! don't do that! it's responsive to EINTR at least.
+// #[test]
+#[allow(dead_code)]
+fn kvm_hugepage_bug() {
+ let mut vm = Vm::create(1024 * 1024).expect("can create vm");
+ vm.add_memory(GuestAddress(0x1_0000_0000), 128 * 1024).expect("can add test mem region");
+ unsafe {
+ vm.configure_identity_paging(None);
+ }
+
+ // `add [rsp], al; add [rcx], al; pop [rcx]; hlt`
+ // the first instruction runs fine. the second instruction runs fine.
+ // the third instruction gets a page fault at 0xf800? which worked fine for the add.
+ // this turns out to be an issue in linux' paging64_gva_to_gpa() when the va is mapped with
+ // huge pages.
+ let inst: &'static [u8] = &[0x00, 0x04, 0x24, 0x00, 0x01, 0x8f, 0x01, 0xf4];
+ let mut regs = vm.get_regs().unwrap();
+ regs.rax = 0x00000002_00100000;
+ regs.rcx = 0x00000002_00100000;
+ vm.program(inst, &mut regs);
+ vm.set_regs(&regs).unwrap();
+ vm.set_single_step(true).expect("can enable single-step");
+ vm.run().expect("can run vm");
+
+ let vm_regs = vm.get_regs().unwrap();
+ let vm_sregs = vm.get_sregs().unwrap();
+ let mut prev_rip = [0u8; 8];
+ vm.read_mem(GuestAddress(vm_regs.rsp + 8), &mut prev_rip[..]);
+ let mut buf = [0u8; 8];
+ vm.read_mem(GuestAddress(vm_regs.rsp), &mut buf[..]);
+ eprintln!(
+ "error code: {:#08x} accessing {:016x} @ rip={:#016x} (cr3={:016x})",
+ u64::from_le_bytes(buf), vm_sregs.cr2,
+ u64::from_le_bytes(prev_rip), vm_sregs.cr3
+ );
+ if vm_regs.rip == 0x300f {
+ let mut pdpt = [0u8; 4096];
+ vm.read_mem(vm.page_tables().pdpt_addr(), &mut pdpt[..]);
+ eprintln!("pdpt: {:x?}", &pdpt[..8]);
+ }
+ panic!("no");
+}
+
impl Vm {
pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> {
let kvm = Kvm::new()
@@ -402,6 +588,8 @@ impl Vm {
let vm = kvm.create_vm()
.map_err(|e| VmError::from_kvm("craete_vm", e))?;
+ let supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap();
+
// actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do
if mem_size < 128 * 1024 {
return Err(VmCreateError::TooSmall {
@@ -426,11 +614,15 @@ impl Vm {
let vcpu_res = vm.create_vcpu(0);
let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?;
+ let current_cpuid = vcpu.get_cpuid2(KVM_MAX_CPUID_ENTRIES).unwrap();
+
let mem_ceiling = mapping.size.get().try_into().unwrap();
let mut this = Vm {
vm,
vcpu,
+ supported_cpuid,
+ current_cpuid,
idt_configured: false,
memory: mapping,
aux_memories: Vec::new(),
@@ -444,6 +636,9 @@ impl Vm {
this.configure_identity_paging(Some(&mut vcpu_sregs));
this.configure_selectors(&mut vcpu_sregs);
this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs);
+ let mut xcrs = this.get_xcrs()?;
+ this.configure_extensions(&mut vcpu_sregs, &mut xcrs);
+ this.set_xcrs(&xcrs)?;
}
vcpu_sregs.efer = 0x0000_0500; // LME | LMA
@@ -514,6 +709,11 @@ impl Vm {
.map_err(|e| VmError::from_kvm("get_sregs", e))
}
+ pub fn get_xcrs(&self) -> Result<kvm_xcrs, VmError> {
+ self.vcpu.get_xcrs()
+ .map_err(|e| VmError::from_kvm("get_xcrs", e))
+ }
+
pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> {
self.vcpu.set_regs(regs)
.map_err(|e| VmError::from_kvm("set_regs", e))
@@ -524,6 +724,11 @@ impl Vm {
.map_err(|e| VmError::from_kvm("set_sregs", e))
}
+ pub fn set_xcrs(&self, xcrs: &kvm_xcrs) -> Result<(), VmError> {
+ self.vcpu.set_xcrs(xcrs)
+ .map_err(|e| VmError::from_kvm("set_xcrs", e))
+ }
+
pub fn idt_configured(&self) -> bool {
self.idt_configured
}
@@ -830,6 +1035,128 @@ impl Vm {
}
}
+ // TODO: there should be a version of this that can be used to query "does this VM support
+ // these extensions" probably, and that should take a subset of `Feature` for the ones that are
+ // actually related to ISA support (e.g. Pdpe1Gb isn't really useful as a public queryable
+ // feature..)
+ fn cpuid_supports(&self, feature: Feature) -> bool {
+ fn find_leaf(cpuid: &CpuId, leaf: u32, index: u32, f: impl Fn(&kvm_cpuid_entry2) -> bool) -> bool {
+ for mut entry in cpuid.as_slice() {
+ if entry.function == leaf && entry.index == index {
+ return f(&mut entry);
+ }
+ }
+
+ false
+ }
+
+ match feature {
+ Feature::XSave => {
+ find_leaf(&self.supported_cpuid, 0x0000_0001, 0, |leaf| {
+ leaf.ecx & CPUID_00000001_ECX_XSAVE != 0
+ })
+ }
+ Feature::Pdpe1Gb => {
+ find_leaf(&self.supported_cpuid, 0x8000_0001, 0, |leaf| {
+ leaf.edx & CPUID_80000001_EDX_PDPE1GB != 0
+ })
+ }
+ Feature::StateSSE => {
+ find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+ leaf.eax & CPUID_0000000D_EAX_SSE == CPUID_0000000D_EAX_SSE
+ })
+ }
+ Feature::StateAVX => {
+ find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+ leaf.eax & CPUID_0000000D_EAX_AVX == CPUID_0000000D_EAX_AVX
+ })
+ }
+ Feature::StateAVX512 => {
+ find_leaf(&self.supported_cpuid, 0x0000_000d, 0, |leaf| {
+ leaf.eax & CPUID_0000000D_EAX_AVX512 == CPUID_0000000D_EAX_AVX512
+ })
+ }
+ }
+ }
+
+ /// set `feature` to `wanted` in the VM's CPUID configuration.
+ ///
+ /// panics if the feature cannot be configured (such as if the corresponding CPUID leaf is not
+ /// available at all). use [`cpuid_supports`] to test if the feature can be configured.
+ fn cpuid_set(&mut self, feature: Feature, wanted: bool) {
+ fn edit_leaf(cpuid: &mut CpuId, leaf: u32, index: u32, mut f: impl FnMut(&mut kvm_cpuid_entry2)) {
+ for mut entry in cpuid.as_mut_slice() {
+ if entry.function == leaf && entry.index == index {
+ f(&mut entry);
+ return;
+ }
+ }
+
+ // if we're here, the entry simply is not present (yet..?)
+ //
+ // so, create it.
+ let mut entry = kvm_cpuid_entry2 {
+ function: leaf,
+ index: index,
+ eax: 0,
+ ecx: 0,
+ edx: 0,
+ ebx: 0,
+ flags: 0,
+ padding: [0; 3],
+ };
+ f(&mut entry);
+ cpuid.push(entry).expect("can push");
+ }
+
+ fn bit_set(word: &mut u32, bit: u32, wanted: bool) {
+ *word &= !bit;
+ if wanted {
+ *word |= bit;
+ }
+ }
+
+ let mut edited = false;
+
+ match feature {
+ Feature::XSave => {
+ edit_leaf(&mut self.current_cpuid, 0x0000_0001, 0, |leaf| {
+ bit_set(&mut leaf.ecx, CPUID_00000001_ECX_XSAVE, wanted);
+ edited = true;
+ });
+ },
+ Feature::Pdpe1Gb => {
+ edit_leaf(&mut self.current_cpuid, 0x8000_0001, 0, |leaf| {
+ bit_set(&mut leaf.edx, CPUID_80000001_EDX_PDPE1GB, wanted);
+ edited = true;
+ });
+ },
+ Feature::StateSSE => {
+ edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+ bit_set(&mut leaf.eax, 1, wanted); // NOTE(review): bit 0 is x87 state — presumably advertised alongside SSE; confirm this is intentional
+ bit_set(&mut leaf.eax, CPUID_0000000D_EAX_SSE, wanted);
+ edited = true;
+ });
+ }
+ Feature::StateAVX => {
+ edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+ bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX, wanted);
+ edited = true;
+ });
+ }
+ Feature::StateAVX512 => {
+ edit_leaf(&mut self.current_cpuid, 0x0000_000d, 0, |leaf| {
+ bit_set(&mut leaf.eax, CPUID_0000000D_EAX_AVX512, wanted);
+ edited = true;
+ });
+ }
+ }
+
+ assert!(edited);
+
+ self.vcpu.set_cpuid2(&self.current_cpuid).expect("can set cpuid");
+ }
+
/// configure page tables for identity mapping of all memory from guest address zero up to the
/// end of added memory regions, rounded up to the next GiB.
///
@@ -840,12 +1167,13 @@ impl Vm {
///
/// panics if the end of added memory regions is above 512 GiB.
pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) {
- let pt = self.page_tables();
-
// we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each.
assert!(self.mem_ceiling() <= 512 * GB);
- // TODO: expects 1G page support
+ assert!(self.cpuid_supports(Feature::Pdpe1Gb));
+ self.cpuid_set(Feature::Pdpe1Gb, true);
+
+ let pt = self.page_tables();
let pml4_ent =
1 << 0 | // P
@@ -1020,4 +1348,111 @@ impl Vm {
regs.rsp = self.stack_addr().0;
self.idt_configured = true;
}
+
+ /// configure the vCPU for executing instructions in the hardware-supported extensions.
+ /// on a fresh vCPU, various extension may be "supported" but result in `#UD` when executed,
+ /// unless additional configuration is done (as this function does).
+ ///
+ /// the Intel SDM describes `INITIALIZING SSE/SSE2/SSE3/SSSE3 EXTENSIONS` but does not point
+ /// out this `#UD` behavior so directly. the AMD APM does not seem to discuss it at all?
+ ///
+ /// this function configures the vCPU to be ready to execute `SSE*` instructions.
+ fn configure_extensions(&mut self, sregs: &mut kvm_sregs, xcrs: &mut kvm_xcrs) {
+ // these bit positions in control registers, and their behaviors, are described more
+ // comprehensively in Volume 3,
+ // > `2.5 CONTROL REGISTERS`
+
+ // CR0
+ const TS: u32 = 3;
+ // CR4
+ const OSFXSR: u32 = 9;
+ const OSXMMEXCPT: u32 = 10;
+ const OSXSAVE: u32 = 18;
+
+ // XCR0 (see "EXTENDED CONTROL REGISTERS (INCLUDING XCR0)")
+ // these bits are the same as in cpuid leaf 0xd.eax
+ const XCR0_SSE: u64 = CPUID_0000000D_EAX_SSE as u64;
+ const XCR0_AVX: u64 = CPUID_0000000D_EAX_AVX as u64;
+ const XCR0_AVX512: u64 = CPUID_0000000D_EAX_AVX512 as u64;
+
+ // operations on `xmm` registers result in `#UD` even if CPUID says that SSE should be
+ // quite functional. this is true even for SSE or SSE2 instructions on an `x86_64` system
+ // (which makes SSE a non-optional baseline!)
+ //
+ // the Intel SDM implies this through somewhat tortured language in the section
+ // "Checking for Intel® SSE and SSE2 Support":
+ // > If an operating system did not provide adequate system level support for Intel
+ // > SSE, executing an Intel SSE or SSE2 instructions can also generate #UD.
+ //
+ // to fully understand this statement, realize that `an operating system .. provide[s]
+ // adequate system level support" by setting CR4.OSFXSR,
+ //
+ // > Set the OSFXSR flag (bit 9 in control register CR4) to indicate that the operating
+ // > system supports saving and restoring the SSE/SSE2/SSE3/SSSE3 execution environment
+ //
+ // so OSFXSR is how "the operating system" indicates save/restore state, and must be set to
+ // execute SSE (and later) SIMD instructions even if we never will use `fxsave` or even
+ // switch tasks on the vCPU.
+ sregs.cr4 |= 1 << OSFXSR;
+
+ // there is a similar relationship between SIMD extension functionality and CR4.OSXSAVE.
+ // this passage in the SDM under "XSAVE-SUPPORTED FEATURES AND STATE-COMPONENT BITMAPS"
+ // draws a fairly direct connection:
+ //
+ // > As will be explained in Section 13.3, the XSAVE feature set is enabled only if
+ // > CR4.OSXSAVE[bit 18] = 1. If CR4.OSXSAVE = 0, the processor treats XSAVE-enabled state
+ // > features and their state components as if all bits in XCR0 were clear; the state
+ // > components cannot be modified and the features’ instructions cannot be executed.
+ //
+ // but the consequence is contradicted by the next paragraph,
+ //
+ // > Processors allow modification of this state, as well as execution of x87 FPU
+ // > instructions and SSE instructions [...] , regardless of the value of CR4.OSXSAVE and
+ // > XCR0.
+ //
+ // we will see that CR4.OSXSAVE must be set for other SIMD extensions below, as well.
+ sregs.cr4 |= 1 << OSXSAVE;
+
+ // SSE3, SSSE3, and SSE4 involve a bit extra:
+ // > Intel SSE3, SSSE3, and Intel SSE4 will cause a DNA Exception (#NM) if the processor
+ // > attempts to execute an Intel SSE3 instruction while CR0.TS[bit 3] = 1
+ sregs.cr0 &= !(1 << TS);
+
+ // > Set the OSXMMEXCPT flag (bit 10 in control register CR4) to indicate that the operating
+ // > system supports the handling of SSE/SSE2/SSE3 SIMD floating-point exceptions (#XM).
+ //
+ // this is somewhat better than just getting an uncategorized #UD.
+ sregs.cr4 |= 1 << OSXMMEXCPT;
+
+ assert!(xcrs.nr_xcrs > 0);
+ assert_eq!(xcrs.xcrs[0].xcr, 0);
+
+ let mut needs_xsave = false;
+ if self.cpuid_supports(Feature::StateSSE) {
+ self.cpuid_set(Feature::StateSSE, true);
+ xcrs.xcrs[0].value |= 1;
+ xcrs.xcrs[0].value |= XCR0_SSE;
+ needs_xsave = true;
+ }
+ if self.cpuid_supports(Feature::StateAVX) {
+ self.cpuid_set(Feature::StateAVX, true);
+ xcrs.xcrs[0].value |= XCR0_AVX;
+ needs_xsave = true;
+ }
+ if self.cpuid_supports(Feature::StateAVX512) {
+ self.cpuid_set(Feature::StateAVX512, true);
+ xcrs.xcrs[0].value |= XCR0_AVX512;
+ needs_xsave = true;
+ }
+
+ if needs_xsave {
+ if self.cpuid_supports(Feature::XSave) {
+ self.cpuid_set(Feature::XSave, true);
+ } else {
+ panic!(
+ "look, there's no CPU that supports SSE but not xsave. \
+ i only checked to be thorough.");
+ }
+ }
+ }
}