diff options
| author | iximeow <me@iximeow.net> | 2026-03-29 03:30:02 +0000 |
|---|---|---|
| committer | iximeow <me@iximeow.net> | 2026-03-29 03:30:02 +0000 |
| commit | 06ca764412cfa6dca028eaee989412bb0422684b (patch) | |
| tree | 156c1ee5a0a0a2fd348d3030b0ac9256e6b3eb49 | |
lmao
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | CHANGELOG | 6 | ||||
| -rw-r--r-- | Cargo.lock | 86 | ||||
| -rw-r--r-- | Cargo.toml | 16 | ||||
| -rw-r--r-- | README.md | 74 | ||||
| -rw-r--r-- | src/lib.rs | 4 | ||||
| -rw-r--r-- | src/x86_64.rs | 1023 |
7 files changed, 1210 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..9695bb4 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,6 @@ +## 1.0.0 + +it exists. easy path to a VM that effectively boots directly to x86_64. + +immediately 1.0.0 because this is already incredibly useful to me elsewhere and +an interface i think will hold up to modest usage. diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..3a10c04 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,86 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "asmlinator" +version = "1.0.0" +dependencies = [ + "kvm-bindings", + "kvm-ioctls", + "libc", + "nix", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "kvm-bindings" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b3c06ff73c7ce03e780887ec2389d62d2a2a9ddf471ab05c2ff69207cd3f3b4" +dependencies = [ + "vmm-sys-util", +] + +[[package]] +name = "kvm-ioctls" +version = "0.24.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "333f77a20344a448f3f70664918135fddeb804e938f28a99d685bd92926e0b19" +dependencies = [ + "bitflags 2.11.0", + "kvm-bindings", + "libc", + "vmm-sys-util", +] + +[[package]] +name = "libc" +version = "0.2.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "nix" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "cfg_aliases", + "libc", +] + +[[package]] +name = "vmm-sys-util" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f" +dependencies = [ + "bitflags 1.3.2", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f3427bc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,16 @@ +[package] + +name = "asmlinator" +version = "1.0.0" +authors = [ "iximeow <me@iximeow.net>" ] +license = "0BSD" +repository = "https://git.iximeow.net/asmlinator/" +description = "VMs preconfigured to directly execute code" +readme = "README.md" +edition = "2024" + +[dependencies] +kvm-bindings = "0.14.0" +kvm-ioctls = "0.24.0" +libc = { version = "0.2.183", default-features = false } +nix = { version = "0.31.2", features = ["mman"] } diff --git a/README.md b/README.md new file mode 100644 index 0000000..3124647 --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ +## asmlinator + +[](https://crates.io/crates/asmlinator) +[](https://docs.rs/asmlinator) + +just enough glue on top of KVM to get a VM with one CPU set up to execute `x86_64` instructions. 
+ +### usage + +```rust +use asmlinator::x86_64::VcpuExit; + +let mem_size = 1024 * 1024; +let mut vm = asmlinator::x86_64::Vm::create(mem_size) + .expect("can create the VM"); + +let mut regs = vm.get_regs().unwrap(); + +// program VM with "xor eax, eax; hlt" +vm.program(&[0x33, 0xc0, 0xf4], &mut regs); +vm.set_regs(&regs).unwrap(); + +let res = vm.run().expect("can run cpu"); +assert_eq!(res, VcpuExit::Hlt); + +let regs = vm.get_regs().unwrap(); +eprintln!("ending rip: {:016x}", regs.rip); +eprintln!("ending rax: {:016x}", regs.rax); +``` + +### design + +it's just a glorified virtual CPU wrapper. there is no device emulation, there +is no interrupt controller, there's only one CPU, etc. the plan is to support +ad-hoc questions about x86 behavior rather than a general VMM. more "run this +function in that address space" and "single-step these instructions" than "call +into a library". + +you could imagine this as a more opinionated and much smaller `hyperlight`, +without any support for IPC into or out of the VM. i don't. + +i consider this closer to a missing OS primitive. the OS knows how to boot +itself on native hardware, it knows how to create a virtual machine, it should +be able to create exactly this kind of partially-initialized VM that does not +require setting up an IDT, GDT, paging, ... + +### future + +it'd be nice to set up aarch64 processors for code execution too. and +32-bit/16-bit x86. and to do all this on other OSes with other VM APIs. + +there should be an option to set up `syscall`/`sysenter` and handle such +instructions as a `VcpuExit::Syscall`, but i won't need that for a bit. + +it would probably be nice to expose a C ffi to embed this into other programs! +such an ffi interface should be straightforward. i haven't needed one yet. + +### mirrors + +the canonical copy of `asmlinator` is at [https://git.iximeow.net/asmlinator/](https://git.iximeow.net/asmlinator). 
+ +`asmlinator` is also mirrored on Codeberg at [https://codeberg.org/iximeow/asmlinator](https://codeberg.org/iximeow/asmlinator). + +### changelog + +a changelog across crate versions is maintained in the `CHANGELOG` file located in the repo. + +### contributing + +unfortunately, pushing commits to the canonical repo at `git.iximeow.net` is +impossible. if you'd like to contribute - thank you! - please send patches to +emails iximeow has committed under or by opening PRs against the [Codeberg +mirror](https://codeberg.org/iximeow/asmlinator). both remotes are kept in +sync. diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..5efe9b5 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,4 @@ +#![doc = include_str!("../README.md")] + +/// `x86_64` VMs and supporting types. +pub mod x86_64; diff --git a/src/x86_64.rs b/src/x86_64.rs new file mode 100644 index 0000000..da46380 --- /dev/null +++ b/src/x86_64.rs @@ -0,0 +1,1023 @@ +use core::fmt; +use core::num::NonZero; +use core::ptr::NonNull; +use nix::sys::mman::{MapFlags, ProtFlags}; + +use kvm_ioctls::{Kvm, VcpuFd, VmFd}; +use kvm_bindings::{ + kvm_guest_debug, kvm_userspace_memory_region, kvm_segment, + KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, +}; + +pub use kvm_bindings::{kvm_regs, kvm_sregs, kvm_debug_exit_arch}; + +const _TARGET_IS_64BIT: () = { + assert!(core::mem::size_of::<u64>() == core::mem::size_of::<usize>(), "asmlinator only supports 64-bit targets"); +}; + +// the wanton casting between usize and u64 is justifiable here because TARGET_IS_64BIT above: +fn usize_to_u64(x: usize) -> u64 { + let _ = _TARGET_IS_64BIT; + + x as u64 +} + +fn u64_to_usize(x: u64) -> usize { + let _ = _TARGET_IS_64BIT; + + x as usize +} + +/// a test VM for running arbitrary instructions. +/// +/// there is one CPU which is configured for long-mode execution. all memory is +/// identity-mapped with 1GiB pages. 
page tables are configured to cover 512 GiB of memory, but +/// much much less than that is actually allocated and usable through `memory.` +/// +/// it is configured with `mem_size` bytes of memory at guest address 0, accessible through +/// host pointer `memory`. this region is used for "control structures"; page tables, GDT, IDT, +/// and stack. it is also the region where code to be executed is placed. +pub struct Vm { + vm: VmFd, + vcpu: VcpuFd, + idt_configured: bool, + mem_ceiling: u64, + memory: Mapping, + aux_memories: Vec<Mapping>, +} + +#[derive(PartialEq)] +pub enum VcpuExit<'buf> { + MmioRead { addr: u64, buf: &'buf mut [u8] }, + MmioWrite { addr: u64, buf: &'buf [u8] }, + IoIn { port: u16, buf: &'buf mut [u8] }, + IoOut { port: u16, buf: &'buf [u8] }, + Debug { pc: u64, info: kvm_debug_exit_arch }, + Exception { nr: u8 }, + Shutdown, + Hlt, +} + +impl<'buf> fmt::Debug for VcpuExit<'buf> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use VcpuExit::*; + match self { + MmioRead { addr, buf } => { + let size = buf.len(); + write!(f, "VcpuExit::MmioRead {{ addr: {addr:#08x}, size: {size} }}") + }, + MmioWrite { addr, buf } => { + let size = buf.len(); + write!(f, "VcpuExit::MmioWrite {{ addr: {addr:#08x}, size: {size} }}") + }, + IoIn { port, buf } => { + let size = buf.len(); + write!(f, "VcpuExit::IoIn {{ port: {port:#04x}, size: {size} }}") + }, + IoOut { port, buf } => { + let size = buf.len(); + write!(f, "VcpuExit::IoOut {{ port: {port:#04x}, size: {size} }}") + }, + Debug { pc, info: _ } => { + write!(f, "VcpuExit::Debug {{ pc: {pc:#016x}, _ }}") + }, + Exception { nr } => { + write!(f, "VcpuExit::Exception {{ nr: {nr} }}") + }, + Shutdown => { + write!(f, "VcpuExit::Shutdown") + }, + Hlt => { + write!(f, "VcpuExit::Hlt") + } + } + } +} + + +const GB: u64 = 1 << 30; + +// TODO: cite APM/SDM +const IDT_ENTRIES: u16 = 256; + +#[derive(Copy, Clone)] +pub struct GuestAddress(pub u64); + +pub struct VmPageTables<'vm> { + vm: &'vm Vm, + 
base: GuestAddress, +} + +impl<'vm> VmPageTables<'vm> { + pub fn pml4_addr(&self) -> GuestAddress { + self.base + } + + pub fn pdpt_addr(&self) -> GuestAddress { + GuestAddress(self.base.0 + 0x1000) + } + + pub fn pml4_mut(&self) -> *mut u64 { + // SAFETY: creating VmPageTables implies we've asserted that we can form host pointers + // for all addresses in the page tables. + unsafe { + self.vm.host_ptr(self.pml4_addr()) as *mut u64 + } + } + + pub fn pdpt_mut(&self) -> *mut u64 { + // SAFETY: creating VmPageTables implies we've asserted that we can form host pointers + // for all addresses in the page tables. + unsafe { + self.vm.host_ptr(self.pdpt_addr()) as *mut u64 + } + } +} + +fn encode_segment(seg: &kvm_segment) -> u64 { + let base = seg.base as u64; + let limit = seg.limit as u64; + + let lim_low = limit & 0xffff; + let lim_high = (limit >> 16) & 0xf; + let addr_low = base & 0xffff; + let desc_low = lim_low | (addr_low << 16); + + let base_mid = (base >> 16) & 0xff; + let base_high = (base >> 24) & 0xff; + let access_byte = (seg.type_ as u64) + | (seg.s as u64) << 4 + | (seg.dpl as u64) << 5 + | (seg.present as u64) << 7; + let flaglim_byte = lim_high + | (seg.avl as u64) << 4 + | (seg.l as u64) << 5 + | (seg.db as u64) << 6 + | (seg.g as u64) << 7; + let desc_high = base_mid + | access_byte << 8 + | flaglim_byte << 16 + | base_high << 24; + + desc_low | (desc_high << 32) +} + +pub enum VmCreateError { + /// the requested VM was smaller than `asmlinator`'s minimum allowable size. + TooSmall { requested: usize, required: usize }, + /// the requested VM's memory size was not an even number of pages. + BadSize { requested: usize, unit: usize }, + /// one of the several syscalls in setting up a new VM failed. + /// + /// this is most likely a permissions error, or `/dev/kvm` doesn't exist. otherwise, something + /// interesting happened! + /// + /// this deserves better documentation, but i'm not aware of documentation for KVM ioctl + /// failure modes. 
+ SyscallError { op: &'static str, err: nix::errno::Errno }, + /// `base` and `size` are not valid for mapping; either because of over/underflow, or overlaps + /// with an existing mapping. + InvalidMapping { base: GuestAddress, size: u64 } +} + +impl fmt::Debug for VmCreateError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VmCreateError::TooSmall { requested, required } => { + write!(f, "requested memory size ({requested}) is too small, must be at least {required}") + } + VmCreateError::BadSize { requested, unit } => { + write!(f, "requested memory size ({requested}) is not a multiple of ({unit})") + } + VmCreateError::SyscallError { op, err } => { + write!(f, "error at {op}: {err}") + } + VmCreateError::InvalidMapping { base, size } => { + write!(f, "invalid mapping (gpa={:#08x}/size={:08x})", base.0, size) + } + } + } +} + +impl fmt::Debug for VmError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VmError::BadSize { requested, unit } => { + write!(f, "requested memory size ({requested}) is not a multiple of ({unit})") + } + VmError::SyscallError { op, err } => { + write!(f, "error at {op}: {err}") + } + VmError::InvalidMapping { base, size } => { + write!(f, "invalid mapping (gpa={:#08x}/size={:08x})", base.0, size) + } + } + } +} + + +pub enum VmError { + /// the requested VM's memory size was not an even number of pages. + BadSize { requested: usize, unit: usize }, + /// one of the several syscalls in operating a VM failed. + /// + /// this deserves better documentation, but i'm not aware of documentation for KVM ioctl + /// failure modes. + SyscallError { op: &'static str, err: nix::errno::Errno }, + /// `base` and `size` are not valid for mapping; either because of over/underflow, or overlaps + /// with an existing mapping. 
+ InvalidMapping { base: GuestAddress, size: u64 } +} + +impl VmError { + fn from_kvm(op: &'static str, err: kvm_ioctls::Error) -> Self { + Self::SyscallError { op, err: nix::errno::Errno::from_raw(err.errno()) } + } +} + +impl From<VmError> for VmCreateError { + fn from(other: VmError) -> Self { + match other { + VmError::BadSize { requested, unit } => VmCreateError::BadSize { requested, unit }, + VmError::SyscallError { op, err } => VmCreateError::SyscallError { op, err }, + VmError::InvalidMapping { base, size } => VmCreateError::InvalidMapping { base, size }, + } + } +} + +/// a `mmap`'d region, `munmap`'d on drop. +struct Mapping { + guest_addr: usize, + addr: NonNull<core::ffi::c_void>, + size: NonZero<usize>, +} + +impl Drop for Mapping { + fn drop(&mut self) { + let res = unsafe { + nix::sys::mman::munmap(self.addr, self.size.get()) + }; + res.expect("can unmap a region we mapped"); + } +} + +impl Mapping { + fn create_shared(guest_addr: usize, size: usize, prot: ProtFlags) -> Result<Self, VmError> { + if size % 4096 != 0 { + return Err(VmError::BadSize { + requested: size, + unit: 4096, + }); + } + + let size = NonZero::new(size) + .ok_or(VmError::BadSize { + requested: 0, + unit: 0, + })?; + + let map_res = unsafe { + nix::sys::mman::mmap_anonymous( + None, + size, + prot, + MapFlags::MAP_ANONYMOUS | MapFlags::MAP_SHARED, + ) + }; + + let map_addr = map_res + .map_err(|e| VmError::SyscallError { op: "mmap", err: e })?; + + // look, mmap should only be in the business of returning page-aligned addresses but i + // just wanna see it, you know... + assert!(map_addr.as_ptr() as usize % 4096 == 0); + + Ok(Self { + guest_addr, + addr: map_addr, + size, + }) + } + + /// SAFETY: the caller must not use the returned pointer to violate reference safety of the VM. + /// the pointer must not be turned into a reference while running the VM, etc. + /// + /// panics if `address` is not contained in this mapping. 
+ unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 { + let guest_addr: u64 = usize_to_u64(self.guest_addr); + let offset = address.0.checked_sub(guest_addr) + .expect("guest address is above mapping base"); + + let base = self.addr.as_ptr() as *mut u8; + + unsafe { + base.offset(offset as isize) + } + } + + /// SAFETY: the caller must ensure that this mapping covers `base` and that there are at least + /// `size` bytes at `base` before the end of this mapping. + unsafe fn slice_mut(&mut self, base: GuestAddress, size: u64) -> &mut [u8] { + let ptr = unsafe { self.host_ptr(base) }; + + unsafe { + core::slice::from_raw_parts_mut(ptr, u64_to_usize(size)) + } + } + + /// SAFETY: the caller must ensure that this mapping covers `base` and that there are at least + /// `size` bytes at `base` before the end of this mapping. + unsafe fn slice(&self, base: GuestAddress, size: u64) -> &[u8] { + let ptr = unsafe { self.host_ptr(base) }; + + unsafe { + core::slice::from_raw_parts(ptr, u64_to_usize(size)) + } + } + + fn overlaps(&self, base: GuestAddress, index_end: GuestAddress) -> bool { + let map_base: u64 = usize_to_u64(self.guest_addr); + let map_end = map_base.checked_add(usize_to_u64(self.size.get())).unwrap(); + + let enclosed_by = base.0 <= map_base && index_end.0 >= map_end; + let contains_base = base.0 >= map_base && base.0 < map_end; + let contains_end = index_end.0 >= map_base && index_end.0 <= map_end; + + enclosed_by || contains_base || contains_end + } + + fn contains(&self, base: GuestAddress) -> bool { + let end = self.guest_addr.checked_add(self.size.get()).unwrap(); + + base.0 >= self.guest_addr as u64 && base.0 < end as u64 + } + + fn check_range(&self, base: GuestAddress, size: u64) -> bool { + let map_base: u64 = self.guest_addr.try_into().unwrap(); + let Some(offset) = base.0.checked_sub(map_base) else { + return false; + }; + let Some(end) = offset.checked_add(size) else { + return false; + }; + + end <= self.size.get().try_into().unwrap() + } 
+} + +#[test] +fn test_check_range_exact() { + let mapping = Mapping::create_shared(0x4000, 0x1000, ProtFlags::PROT_READ).expect("can create mapping"); + assert!(mapping.check_range(GuestAddress(0x4000), 0x1000)); +} + +#[test] +fn test_xor_runs() { + let mut vm = Vm::create(128 * 1024).expect("can create vm"); + let mut regs = vm.get_regs().expect("can get regs"); + + vm.program(&[0x33, 0xc0], &mut regs); + + regs.rax = 0x1234; + let rip_before = regs.rip; + + vm.set_regs(®s).expect("can set regs"); + + vm.set_single_step(true).expect("can set single-step"); + + let res = vm.run().expect("can run vm"); + + let rip_after = rip_before + 2; + assert!(matches!(res, VcpuExit::Debug { pc: rip_after, .. })); + + let regs_after = vm.get_regs().expect("can get regs"); + assert_eq!(regs_after.rax, 0); +} + +impl Vm { + pub fn create(mem_size: usize) -> Result<Vm, VmCreateError> { + let kvm = Kvm::new() + .map_err(|e| VmError::from_kvm("Kvm::new()", e))?; + + let vm = kvm.create_vm() + .map_err(|e| VmError::from_kvm("craete_vm", e))?; + + // actual minimum is somewhere around 0x1a000 bytes, but 0x20_000 aka 128k will do + if mem_size < 128 * 1024 { + return Err(VmCreateError::TooSmall { + requested: mem_size, + required: 128 * 1024, + }); + } + + let mapping = Mapping::create_shared(0, mem_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE)?; + + let region = kvm_userspace_memory_region { + slot: 0, + guest_phys_addr: 0x0000, + memory_size: mapping.size.get() as u64, + userspace_addr: mapping.addr.as_ptr() as u64, + flags: 0, + }; + + let set_res = unsafe { vm.set_user_memory_region(region) }; + set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; + + let vcpu_res = vm.create_vcpu(0); + let vcpu = vcpu_res.map_err(|e| VmError::from_kvm("create_vcpu(0)", e))?; + + let mem_ceiling = mapping.size.get().try_into().unwrap(); + + let mut this = Vm { + vm, + vcpu, + idt_configured: false, + memory: mapping, + aux_memories: Vec::new(), + mem_ceiling, + }; + + let 
mut vcpu_regs = this.get_regs()?; + let mut vcpu_sregs = this.get_sregs()?; + + unsafe { + this.configure_identity_paging(Some(&mut vcpu_sregs)); + this.configure_selectors(&mut vcpu_sregs); + this.configure_idt(&mut vcpu_regs, &mut vcpu_sregs); + } + + vcpu_sregs.efer = 0x0000_0500; // LME | LMA + + this.set_regs(&vcpu_regs)?; + this.set_sregs(&vcpu_sregs)?; + + Ok(this) + } + + /// map and add a region of size `size` at guest-physical address `gpa`. + /// + /// this will not update page tables, so if the newly-added memory is not already mapped due to + /// a previous `configure_identity_paging` call and it is not mapped due to explicit page table + /// management, it will not yet be accessible by guest code. + pub fn add_memory(&mut self, gpa: GuestAddress, size: u64) -> Result<(), VmError> { + let new_mapping_end = gpa.0.checked_add(size) + .map(|addr| GuestAddress(addr)) + .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; + if self.memory.overlaps(gpa, new_mapping_end) { + return Err(VmError::InvalidMapping { base: gpa, size }); + } else { + for mapping in self.aux_memories.iter() { + if mapping.overlaps(gpa, new_mapping_end) { + return Err(VmError::InvalidMapping { base: gpa, size }); + } + } + } + + let mapping = Mapping::create_shared( + u64_to_usize(gpa.0), + u64_to_usize(size), + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE + )?; + + let used_slots: u32 = self.aux_memories.len().try_into() + .map_err(|_| VmError::InvalidMapping { base: gpa, size })?; + let next_slot = used_slots.checked_add(1) + .ok_or_else(|| VmError::InvalidMapping { base: gpa, size })?; + + let region = kvm_userspace_memory_region { + slot: next_slot, + guest_phys_addr: gpa.0, + memory_size: mapping.size.get() as u64, + userspace_addr: mapping.addr.as_ptr() as u64, + flags: 0, + }; + + let set_res = unsafe { self.vm.set_user_memory_region(region) }; + set_res.map_err(|e| VmError::from_kvm("set_user_memory_region", e))?; + + self.aux_memories.push(mapping); + + if 
new_mapping_end.0 > self.mem_ceiling { + self.mem_ceiling = new_mapping_end.0; + } + + Ok(()) + } + + pub fn get_regs(&self) -> Result<kvm_regs, VmError> { + self.vcpu.get_regs() + .map_err(|e| VmError::from_kvm("get_regs", e)) + } + + pub fn get_sregs(&self) -> Result<kvm_sregs, VmError> { + self.vcpu.get_sregs() + .map_err(|e| VmError::from_kvm("get_sregs", e)) + } + + pub fn set_regs(&self, regs: &kvm_regs) -> Result<(), VmError> { + self.vcpu.set_regs(regs) + .map_err(|e| VmError::from_kvm("set_regs", e)) + } + + pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<(), VmError> { + self.vcpu.set_sregs(sregs) + .map_err(|e| VmError::from_kvm("set_sregs", e)) + } + + pub fn idt_configured(&self) -> bool { + self.idt_configured + } + + // TODO: seems like there's a KVM bug where if the VM is configured for single-step and the + // single-stepped instruction is a rmw to MMIO memory (or MMIO hugepages?), the single-step + // doesn't actually take effect. compare `0x33 0x00` and `0x31 0x00`. what the hell! + pub fn set_single_step(&mut self, active: bool) -> Result<(), VmError> { + let mut guest_debug = kvm_guest_debug::default(); + + if active { + guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP + }; + + self.vcpu.set_guest_debug(&guest_debug) + .map_err(|e| VmError::from_kvm("set_guest_debug", e)) + } + + pub fn run<'vm>(&'vm mut self) -> Result<VcpuExit<'vm>, VmError> { + let exit = self.vcpu.run() + .map_err(|e| VmError::from_kvm("vcpu run", e))?; + + match exit { + kvm_ioctls::VcpuExit::MmioRead(addr, buf) => { + // `buf` is typed with a lifetime from the reborrow of self.vcpu for run() above. + // this means it's a shorter lifetime than `'vm`, but since the resulting lifetime + // is also `'vm` it *really* has the effect of disallowing any subsequent use of + // `self`. 
these transmutes decouple the lifetime of `exit` from the lifetime of + // `self` and returned `VcpuExit`, so other arms that don't involve lifetimes can + // drop `exit()` and query the vcpu. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::MmioRead { buf, addr }); + } + kvm_ioctls::VcpuExit::MmioWrite(addr, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::MmioWrite { buf, addr }); + } + kvm_ioctls::VcpuExit::IoIn(port, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm mut [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::IoIn { port, buf }); + } + kvm_ioctls::VcpuExit::IoOut(port, buf) => { + // see the same transmute in `MmioRead` for why this is load-bearing. + // + // SAFETY: this actually extends the lifetime of `buf` from the shorter transient + // lifetime to `'vm` for the return type. + let buf: &'vm [u8] = unsafe { core::mem::transmute(buf) }; + return Ok(VcpuExit::IoOut { port, buf }); + } + kvm_ioctls::VcpuExit::Debug(info) => { + let pc = info.pc; + return Ok(VcpuExit::Debug { pc, info }); + } + kvm_ioctls::VcpuExit::Hlt => { + let regs = self.get_regs()?; + + if self.idt_configured { + let intrs_start = self.interrupt_handlers_start().0; + let intrs_end = intrs_start + IDT_ENTRIES as u64; + // by the time we've exited the `hlt` of the interrupt handler has completed, + // so rip is advanced by one. 
subtract back out to convert to an exception + // vector number. + let intr_addr = regs.rip - 1; + + if intr_addr >= intrs_start && intr_addr < intrs_end { + let nr = intr_addr - intrs_start; + // because IDT_ENTRIES is 256, this should always be true.. + assert!(nr < 256); + let nr = nr as u8; + + return Ok(VcpuExit::Exception { nr }); + } + } + + Ok(VcpuExit::Hlt) + } + kvm_ioctls::VcpuExit::Shutdown => { + return Ok(VcpuExit::Shutdown); + } + other => { + panic!("unhandled VcpuExit kind: {other:?}"); + } + } + } + + /// get a pointer to host memory mapped to guest address `address`. + /// + /// panics if `address` is not a guest-physical address backed by host memory. + pub unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 { + let mapping = self.map_containing(address, 0) + .expect("mapping for address exists"); + + unsafe { + mapping.host_ptr(address) + } + } + + pub fn gdt_addr(&self) -> GuestAddress { + GuestAddress(0x1000) + } + + pub fn idt_addr(&self) -> GuestAddress { + GuestAddress(0x2000) + } + + pub fn interrupt_handlers_start(&self) -> GuestAddress { + GuestAddress(0x3000) + } + + pub fn page_table_addr(&self) -> GuestAddress { + GuestAddress(0x10000) + } + + pub fn code_addr(&self) -> GuestAddress { + GuestAddress(self.memory.size.get() as u64 - 4096) + } + + pub fn mem_ceiling(&self) -> u64 { + self.mem_ceiling + } + + /// configuring the IDT implies the IDT might be used which means we want a stack pointer + /// that can have at least 0x18 bytes pushed to it if an interrupt happens. + pub fn stack_addr(&self) -> GuestAddress { + // it would be nice to point the stack somewhere that we could get MMIO exits and see the + // processor push words for the interrupt in real time, but that doesn't ... work. + // instead, you end up in a loop somewhere around svm_vcpu_run (which you can ^C out of, + // thankfully). + // + // so this picks some guest memory lower down. 
+ + // stack grows *down* but if someone pops a lot of bytes from rsp we'd go up and + // clobber the page tables. so leave a bit of space. + GuestAddress(0x19800) + } + + /// selector 0x10 is chosen arbitrarily for code. + pub fn selector_cs(&self) -> u16 { + 0x10 + } + + /// selector 0x18 is chosen arbitrarily for data (all segments; ss, ds, es, etc). + pub fn selector_ds(&self) -> u16 { + 0x18 + } + + fn map_containing_mut(&mut self, base: GuestAddress, size: u64) -> Option<&mut Mapping> { + let mapping = if self.memory.contains(base) { + &mut self.memory + } else { + self.aux_memories.iter_mut() + .find(|map| map.contains(base))? + }; + + if !mapping.check_range(base, size) { + return None; + } + + Some(mapping) + } + + fn map_containing(&self, base: GuestAddress, size: u64) -> Option<&Mapping> { + let mapping = if self.memory.contains(base) { + &self.memory + } else { + self.aux_memories.iter() + .find(|map| map.contains(base))? + }; + + if !mapping.check_range(base, size) { + return None; + } + + Some(mapping) + } + + /// write all of `data` into guest memory at guest-physical address `addr`. + /// + /// panics if `data` extends beyond the end of guest memory. + pub fn write_mem(&mut self, addr: GuestAddress, data: &[u8]) { + let mapping = self.map_containing(addr, data.len() as u64).expect("mapping is valid"); + + // SAFETY: `check_range` above validates the range to copy, and... please do not + // provide a slice of guest memory as what the guest should be programmed for... + unsafe { + std::ptr::copy_nonoverlapping( + data.as_ptr(), + mapping.host_ptr(addr), + data.len() + ); + } + } + + /// read guest-physical memory at `addr` to `addr + buf.len()` into `buf`. + /// + /// panics if `addr + buf.len()` extends beyond the end of guest memory. 
    pub fn read_mem(&mut self, addr: GuestAddress, buf: &mut [u8]) {
        let mapping = self.map_containing(addr, buf.len() as u64).expect("mapping is valid");

        // SAFETY: `map_containing` above validates that the whole range
        // `[addr, addr + buf.len())` lies inside a single guest mapping, and... please do
        // not provide a slice of guest memory as what should be read into... (the copy
        // below requires the source and destination not to overlap)
        unsafe {
            std::ptr::copy_nonoverlapping(
                mapping.host_ptr(addr) as *const _,
                buf.as_mut_ptr(),
                buf.len()
            );
        }
    }

    /// returns a mutable slice of guest memory pointed to by guest-physical address `addr`,
    /// of size `size`.
    ///
    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
    pub fn mem_slice_mut<'vm>(&'vm mut self, addr: GuestAddress, size: u64) -> &'vm mut [u8] {
        let mapping = self.map_containing_mut(addr, size).expect("mapping is valid");

        // SAFETY: we have an exclusive borrow of the VM, so it is not currently running, and there
        // is no other outstanding slice of guest memory. `map_containing_mut` has already ensured
        // that this mapping contains the whole range `[addr, addr + size)`.
        unsafe {
            mapping.slice_mut(addr, size)
        }
    }

    /// returns a slice of guest memory pointed to by guest-physical address `addr`, of size
    /// `size`.
    ///
    /// panics if `addr + size` is not enclosed in a single guest mapping. this crate doesn't
    /// support returning a single slice of adjacent guest memory regions (yet?), sorry.
    pub fn mem_slice<'vm>(&'vm self, addr: GuestAddress, size: u64) -> &'vm [u8] {
        let mapping = self.map_containing(addr, size).expect("mapping is valid");

        // SAFETY: we hold a shared borrow of the VM, so no `&mut self` method can hand out an
        // overlapping `&mut [u8]` while this slice is alive. `map_containing` has already
        // ensured that this mapping contains the whole range `[addr, addr + size)`.
        //
        // NOTE(review): the previous comment here claimed an *exclusive* borrow, copy-pasted
        // from `mem_slice_mut`; this method takes `&self`, so presumably callers must ensure
        // the vCPU is not concurrently writing this memory — confirm against call sites.
        unsafe {
            mapping.slice(addr, size)
        }
    }

    /// write `code` into guest memory and set `regs.rip` to the address of that code.
    ///
    /// the chosen code address is [`Self::code_addr`].
    pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) {
        let addr = self.code_addr();
        self.write_mem(addr, code);

        regs.rip = addr.0;
    }

    /// returns a host pointer to GDT entry `idx` in guest memory, for direct initialization.
    ///
    /// panics if `idx` is out of range or the entry is not inside a guest mapping.
    fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 {
        // the GDT is set up at addresses 0..64k:
        //
        // > 3.5.1 Segment Descriptor Tables
        // > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A
        // > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte
        // > descriptors.
        //
        // NOTE(review): the assert below only admits the first 512 descriptors (4KiB of the
        // architectural 64KiB maximum), and `configure_selectors` advertises an even smaller
        // 256-entry table via `gdt.limit` — presumably intentional, but worth confirming the
        // three sizes are meant to disagree.
        assert!(idx < 4096 / 8);
        let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8));
        let mapping = self.map_containing(addr, std::mem::size_of::<u64>() as u64).unwrap();

        // SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is
        // still inside the allocation (`self.memory`) — `map_containing` above checked that
        // the full 8-byte entry is contained in `mapping`.
        unsafe {
            mapping.host_ptr(addr) as *mut u64
        }
    }

    /// returns a host pointer to IDT entry `idx` in guest memory, for direct initialization.
    ///
    /// note this returns a `*mut u32`, but an IDT *entry* is four u32s (16 bytes, long-mode
    /// format). the u32 this points at is the first of the four for the entry.
    fn idt_entry_mut(&mut self, idx: u8) -> *mut u32 {
        let addr = GuestAddress(self.idt_addr().0 + (idx as u64 * 16));
        let mapping = self.map_containing(addr, std::mem::size_of::<[u64; 2]>() as u64).unwrap();

        // SAFETY: `map_containing` above validated that the entire 16-byte entry at `addr`
        // lies within `mapping`, so the returned pointer (and offsets 0..=3 from it) are
        // in-bounds.
        unsafe {
            mapping.host_ptr(addr) as *mut u32
        }
    }

    /// returns an accessor for the guest's boot page tables at [`Self::page_table_addr`].
    pub fn page_tables(&self) -> VmPageTables<'_> {
        let base = self.page_table_addr();

        // the page tables are really just two pages: a PML4 and a PDPT for its first 512G of
        // address space. assert both pages are backed by one guest mapping before handing
        // out an accessor that writes through raw pointers into them.
        assert!(self.map_containing(base, 0x2000).is_some());

        VmPageTables {
            vm: self,
            base,
        }
    }

    /// configure page tables for identity mapping of all memory from guest address zero up to the
    /// end of added memory regions, rounded up to the next GiB.
    ///
    /// if `sregs` is provided, update `cr0`, `cr3`, and `cr4` in support of protected-mode or
    /// long-mode paging. this is a fixed pattern: if control registers have not been changed since
    /// `Vm::create` then there will be no change to these control registers and `sregs` can be
    /// omitted.
    ///
    /// panics if the end of added memory regions is above 512 GiB.
    ///
    /// # Safety
    ///
    /// writes page-table entries through raw pointers into the guest pages reserved for the
    /// PML4/PDPT (see [`Self::page_tables`]). NOTE(review): the original code documents no
    /// explicit caller contract here — presumably callers must not hold other references into
    /// that guest memory while this runs; confirm and document precisely.
    pub unsafe fn configure_identity_paging(&mut self, sregs: Option<&mut kvm_sregs>) {
        let pt = self.page_tables();

        // we're only setting up one PDPT, which can have up to 512 PDPTE covering 1G each.
        assert!(self.mem_ceiling() <= 512 * GB);

        // TODO: expects 1G page support

        // single PML4 entry pointing at the PDPT; the PDPT's (page-aligned) guest-physical
        // address occupies the high bits of the entry, flags the low 12.
        let pml4_ent =
            1 << 0 | // P
            1 << 1 | // RW
            1 << 2 | // user access allowed. but no user code will run so not strictly needed.
            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
            0 << 5 | // A
            0 << 6 | // ignored
            0 << 7 | // PS (reserved must-be-0)
            0 << 11 | // R (for ordinary paging, ignored; for HLAT ...)
            pt.pdpt_addr().0;
        unsafe {
            pt.pml4_mut().write(pml4_ent);
        }

        let mut mapped: u64 = 0;
        // we've set up the first PML4 to point to a PDPT, so we should actually set it up!
        let pdpt = pt.pdpt_mut();
        // PDPTEs start at the start of PDPT..
        let mut pdpte = pdpt;
        // flag bits shared by every 1GiB PDPTE; each entry below ORs in its physical frame.
        let entry_bits: u64 =
            1 << 0 | // P
            1 << 1 | // RW
            1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
            0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
            0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
            0 << 5 | // Accessed
            0 << 6 | // Dirty
            1 << 7 | // Page size (1 implies 1G page)
            1 << 8 | // Global (if cr4.pge)
            0 << 9 |
            0 << 10 |
            0 << 11 | // for ordinary paging, ignored. for HLAT, ...
            0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)

        // one PDPTE per GiB: identity map `mapped -> mapped` until all guest memory (rounded
        // up to the next GiB) is covered.
        while mapped < self.mem_ceiling() {
            let phys_num = mapped >> 30;
            let entry = entry_bits | (phys_num << 30);
            unsafe {
                pdpte.write(entry);
                pdpte = pdpte.offset(1);
            }
            // eprintln!("mapped 1g at {:08x}", mapped);
            mapped += 1 << 30;
        }

        if let Some(sregs) = sregs {
            sregs.cr0 |= 0x8000_0001; // cr0.PE | cr0.PG
            sregs.cr3 = pt.pml4_addr().0 as u64;
            sregs.cr4 |= 1 << 5; // enable PAE
        }
    }

    /// initialize segment registers (and a matching two-entry GDT) for long-mode execution
    /// at privilege level 0: a 64-bit code segment in `cs`, flat writable data in
    /// `ds`/`es`/`fs`/`gs`/`ss`.
    unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) {
        // we have to set descriptor information directly. this avoids having to load selectors
        // as the first instructions on the vCPU, which is simplifying. but if we want the
        // information in these selectors to match with anything in a GDT (i do!) we'll have to
        // keep this initial state lined up with GDT entries ourselves.
        //
        // we could avoid setting up the GDT for the most part, but anything that might
        // legitimately load the "valid" current segment selector would instead clobber the
        // selector with zeroes.

        sregs.cs.base = 0;
        sregs.cs.limit = 0;
        sregs.cs.selector = self.selector_cs();
        sregs.cs.type_ = 0b1011; // see SDM table 3-1 Code- and Data-Segment Types
        sregs.cs.present = 1;
        sregs.cs.dpl = 0;
        sregs.cs.db = 0;
        sregs.cs.s = 1;
        sregs.cs.l = 1; // long-mode (64-bit) code segment
        sregs.cs.g = 0;
        sregs.cs.avl = 0;

        sregs.ds.base = 0;
        sregs.ds.limit = 0xffffffff;
        sregs.ds.selector = self.selector_ds();
        sregs.ds.type_ = 0b0011; // see SDM table 3-1 Code- and Data-Segment Types
        sregs.ds.present = 1;
        sregs.ds.dpl = 0;
        sregs.ds.db = 0;
        sregs.ds.s = 1;
        sregs.ds.l = 0;
        sregs.ds.g = 0;
        sregs.ds.avl = 0;

        sregs.es = sregs.ds;
        sregs.fs = sregs.ds;
        sregs.gs = sregs.ds;
        // linux populates the vmcb cpl field with whatever's in ss.dpl. what the hell???
        sregs.ss = sregs.ds;

        sregs.gdt.base = self.gdt_addr().0;
        sregs.gdt.limit = 256 * 8 - 1;

        // mirror the cached descriptors into the in-memory GDT so a reload of cs/ds (via the
        // selector values set above) lands on matching entries. selector >> 3 strips the
        // RPL/TI bits to recover the descriptor-table index.
        unsafe {
            self.gdt_entry_mut(self.selector_cs() >> 3).write(encode_segment(&sregs.cs));
            self.gdt_entry_mut(self.selector_ds() >> 3).write(encode_segment(&sregs.ds));
        }
    }

    /// encode and store a long-mode interrupt-gate descriptor for vector `intr_nr`,
    /// targeting `interrupt_handler_cs:interrupt_handler_addr`.
    fn write_idt_entry(
        &mut self,
        intr_nr: u8,
        interrupt_handler_cs: u16,
        interrupt_handler_addr: GuestAddress
    ) {
        let idt_ptr = self.idt_entry_mut(intr_nr);

        // entries in the IDT, interrupt and trap descriptors (in the AMD APM, "interrupt-gate"
        // and "trap-gate" descriptors), are described (in the AMD APM) by
        // "Figure 4-24. Interrupt-Gate and Trap-Gate Descriptors—Long Mode". reproduced here:
        //
        //  3                             2       |       1                   0
        //  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
        // |---------------------------------------------------------------|
        // |                           res,ign                             | +12
        // |                     target offset[63:32]                      | +8
        // |   target offset[31:16]   |P|DPL|0| type |  res,ign  |  IST    | +4
        // |     target selector      |        target offset[15:0]         | +0
        // |---------------------------------------------------------------|
        //
        // descriptors are encoded with P set, DPL at 0, and type set to 0b1110. TODO: frankly
        // i don't know the mechanical difference between type 0x0e and type 0x0f, but 0x0e
        // works for now.
        let idt_attr_bits = 0b1_00_0_1110_00000_000;
        // dword at +4: offset[31:16] in the high half, P/DPL/type (and IST=0) in the low half.
        let low_hi = (interrupt_handler_addr.0 as u32 & 0xffff_0000) | idt_attr_bits;
        // dword at +0: selector in the high half, offset[15:0] in the low half.
        let low_lo = (interrupt_handler_cs as u32) << 16 | (interrupt_handler_addr.0 as u32 & 0x0000_ffff);

        // SAFETY: `idt_entry_mut` validated that the whole 16-byte entry is inside one guest
        // mapping, so offsets 0..=3 from `idt_ptr` are in-bounds.
        unsafe {
            idt_ptr.offset(0).write(low_lo);
            idt_ptr.offset(1).write(low_hi);
            idt_ptr.offset(2).write((interrupt_handler_addr.0 >> 32) as u32);
            idt_ptr.offset(3).write(0); // reserved
        }
    }

    /// install a full IDT whose handlers are each a single one-byte `hlt`, point `sregs.idt`
    /// at it, and give the guest a stack (`regs.rsp`) so interrupt delivery can push its
    /// frame. a vector's handler sits at `interrupt_handlers_start() + vector`, so the halt
    /// address identifies which exception/interrupt fired.
    fn configure_idt(&mut self, regs: &mut kvm_regs, sregs: &mut kvm_sregs) {
        sregs.idt.base = self.idt_addr().0;
        sregs.idt.limit = IDT_ENTRIES * 16 - 1; // IDT is 256 entries of 16 bytes each

        for i in 0..IDT_ENTRIES {
            let interrupt_handler_addr = GuestAddress(self.interrupt_handlers_start().0 + i as u64);
            self.write_idt_entry(
                i.try_into().expect("<u8::MAX interrupts"),
                self.selector_cs(),
                interrupt_handler_addr
            );
        }

        // all interrupt handlers are just `hlt`. their position is used to detect which
        // exception/interrupt occurred.
        //
        // NOTE(review): unlike the other accessors here, `self.host_ptr` is used directly
        // rather than going through `map_containing` — presumably the `IDT_ENTRIES` handler
        // bytes are guaranteed to sit inside one mapping; confirm that invariant holds.
        unsafe {
            std::slice::from_raw_parts_mut(
                self.host_ptr(self.interrupt_handlers_start()),
                IDT_ENTRIES as usize
            ).fill(0xf4);
        }

        // finally, set `rsp` to a valid region so that the CPU can push necessary state (see
        // AMD APM section "8.9.3 Interrupt Stack Frame") to actually enter the interrupt
        // handler. if we didn't do this, rsp will probably be zero or something, underflow,
        // page fault on push to 0xffffffff_ffffffff, and just triple fault.
        //
        // TODO: this is our option in 16- and 32-bit modes, but in long mode all the interrupt
        // descriptors could set something in IST to switch stacks outright for exception
        // handling. this might be nice to test rsp permutations in 64-bit code? alternatively
        // we might just have to limit possible rsp permutations so as to be able to test in
        // 16- and 32-bit modes anyway.
        regs.rsp = self.stack_addr().0;
        self.idt_configured = true;
    }
}
