#[cfg(target_arch = "x86_64")]
mod kvm {
use kvm_ioctls::{Kvm, VcpuFd, VmFd, VcpuExit};
use kvm_bindings::{
kvm_guest_debug, kvm_userspace_memory_region, kvm_segment, kvm_regs, kvm_sregs,
KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP,
};
/// a test VM for running arbitrary instructions.
///
/// there is one CPU which is configured for long-mode execution. all memory is
/// identity-mapped with 1GiB pages.
///
/// it is configured with `mem_size` bytes of memory at guest address 0, accessible through
/// host pointer `memory`.
#[allow(unused)]
struct TestVm {
vm: VmFd,
vcpu: VcpuFd,
memory: *mut u8,
mem_size: usize,
}
#[derive(Copy, Clone)]
struct GuestAddress(u64);
struct VmPageTables<'vm> {
vm: &'vm TestVm,
base: GuestAddress,
}
impl<'vm> VmPageTables<'vm> {
fn pml4_addr(&self) -> GuestAddress {
self.base
}
fn pdpt_addr(&self) -> GuestAddress {
GuestAddress(self.base.0 + 0x1000)
}
fn pml4_mut(&self) -> *mut u64 {
// SAFETY: creating VmPageTables implies we've asserted that we can form host pointers
// for all addresses in the page tables.
unsafe {
self.vm.host_ptr(self.pml4_addr()) as *mut u64
}
}
fn pdpt_mut(&self) -> *mut u64 {
// SAFETY: creating VmPageTables implies we've asserted that we can form host pointers
// for all addresses in the page tables.
unsafe {
self.vm.host_ptr(self.pdpt_addr()) as *mut u64
}
}
}
impl TestVm {
fn create() -> TestVm {
let kvm = Kvm::new().unwrap();
let vm = kvm.create_vm().unwrap();
let mem_size = 1024 * 1024;
let mem_addr: *mut u8 = unsafe {
libc::mmap(
core::ptr::null_mut(),
mem_size,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_ANONYMOUS | libc::MAP_SHARED | libc::MAP_NORESERVE,
-1,
0,
) as *mut u8
};
// mmap reports failure with MAP_FAILED rather than a null pointer.
assert!(mem_addr != libc::MAP_FAILED as *mut u8);
// look, mmap should only be in the business of returning page-aligned addresses but i
// just wanna see it, you know...
assert!(mem_addr as usize % 4096 == 0);
let region = kvm_userspace_memory_region {
slot: 0,
guest_phys_addr: 0x0000,
memory_size: mem_size as u64,
userspace_addr: mem_addr as u64,
flags: 0,
};
unsafe { vm.set_user_memory_region(region).unwrap() };
let vcpu = vm.create_vcpu(0).unwrap();
let mut this = TestVm {
vm,
vcpu,
memory: mem_addr,
mem_size,
};
let mut vcpu_sregs = this.vcpu.get_sregs().unwrap();
unsafe {
this.configure_identity_paging(&mut vcpu_sregs);
this.configure_selectors(&mut vcpu_sregs);
}
vcpu_sregs.efer = 0x0000_0500; // LME | LMA
this.vcpu.set_sregs(&vcpu_sregs).unwrap();
this
}
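/// enable or disable single-stepping through KVM's guest debug interface. while enabled,
/// each `run` returns to the host after the guest executes one instruction.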
fn set_single_step(&mut self, active: bool) {
let mut guest_debug = kvm_guest_debug::default();
if active {
guest_debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP
};
self.vcpu.set_guest_debug(&guest_debug).unwrap();
}
fn run(&mut self) -> VcpuExit<'_> {
self.vcpu.run().unwrap()
}
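/// translate a guest-physical address into a pointer into the host mapping backing guest
/// memory. callers must ensure `address` (plus whatever they access through the pointer)
/// stays within the VM's `mem_size` bytes.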
unsafe fn host_ptr(&self, address: GuestAddress) -> *mut u8 {
self.memory.offset(address.0 as isize)
}
fn gdt_addr(&self) -> GuestAddress {
GuestAddress(0)
}
fn page_table_addr(&self) -> GuestAddress {
GuestAddress(0x10000)
}
fn code_addr(&self) -> GuestAddress {
GuestAddress(self.mem_size as u64 - 4096)
}
fn check_range(&self, base: GuestAddress, size: u64) {
let base = base.0;
let end = base.checked_add(size).expect("no overflow");
assert!(base < self.mem_size as u64);
assert!(self.mem_size as u64 >= end);
}
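/// copy `code` into the VM's code page and point `regs.rip` at it.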
pub fn program(&mut self, code: &[u8], regs: &mut kvm_regs) {
let addr = self.code_addr();
self.check_range(addr, code.len() as u64);
// SAFETY: `check_range` above validates the range to copy, and... please do not
// provide a slice of guest memory as what the guest should be programmed for...
unsafe {
std::ptr::copy_nonoverlapping(code.as_ptr(), self.host_ptr(addr), code.len());
}
regs.rip = self.code_addr().0;
}
fn gdt_entry_mut(&mut self, idx: u16) -> *mut u64 {
// the GDT is set up at addresses 0..64k:
//
// > 3.5.1 Segment Descriptor Tables
// > A segment descriptor table is an array of segment descriptors (see Figure 3-10). A
// > descriptor table is variable in length and can contain up to 8192 (2^13) 8-byte
// > descriptors.
let addr = GuestAddress(self.gdt_addr().0 + (idx as u64 * 8));
assert!(idx < 8192);
self.check_range(addr, std::mem::size_of::<u64>() as u64);
// SAFETY: idx * 8 can't overflow isize, and we've asserted the end of the pointer is
// still inside the allocation (`self.memory`).
unsafe {
self.host_ptr(addr) as *mut u64
}
}
fn page_tables(&self) -> VmPageTables<'_> {
let base = self.page_table_addr();
// the page tables are really just two pages: a PML4 and a PDPT for its first 512G of
// address space.
self.check_range(base, 0x2000);
VmPageTables {
vm: self,
base,
}
}
unsafe fn configure_identity_paging(&mut self, sregs: &mut kvm_sregs) {
let pt = self.page_tables();
const GB: usize = 1 << 30;
// we're only setting up one PDPT, which can hold up to 512 PDPTEs covering 1GiB each.
assert!(self.mem_size <= 512 * GB);
// TODO: expects 1G page support
pt.pml4_mut().write(
1 << 0 | // P
1 << 1 | // RW
1 << 2 | // user access allowed. but no user code will run so not strictly needed.
0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
0 << 5 | // A
0 << 6 | // ignored
0 << 7 | // PS (reserved must-be-0)
0 << 11 | // R (for ordinary paging, ignored; for HLAT ...)
pt.pdpt_addr().0
);
let mut mapped: u64 = 0;
// we've pointed the first PML4 entry at a PDPT, so we should actually set that PDPT up!
let pdpt = pt.pdpt_mut();
// PDPTEs start at the start of PDPT..
let mut pdpte = pdpt;
let entry_bits: u64 =
1 << 0 | // P
1 << 1 | // RW
1 << 2 | // user accesses allowed (everything is under privilege level 0 tho)
0 << 3 | // PWT (TODO: configure PAT explicitly, but PAT0 is sufficient)
0 << 4 | // PCD (TODO: configure PAT explicitly, but PAT0 is sufficient)
0 << 5 | // Accessed
0 << 6 | // Dirty
1 << 7 | // Page size (1 implies 1G page)
1 << 8 | // Global (if cr4.pge)
0 << 9 |
0 << 10 |
0 << 11 | // for ordinary paging, ignored. for HLAT, ...
0 << 12; // PAT (TODO: configure explicitly, but PAT0 is sufficient. verify MTRR sets PAT0 to WB?)
while mapped < self.mem_size as u64 {
let phys_num = mapped >> 30;
let entry = entry_bits | (phys_num << 30);
pdpte.write(entry);
pdpte = pdpte.offset(1);
// eprintln!("mapped 1g at {:08x}", mapped);
mapped += 1 << 30;
}
sregs.cr0 = 0x8000_0001; // cr0.PE | cr0.PG
sregs.cr3 = pt.pml4_addr().0 as u64;
sregs.cr4 = 1 << 5; // enable PAE
}
unsafe fn configure_selectors(&mut self, sregs: &mut kvm_sregs) {
// we have to set descriptor information directly. this avoids having to load selectors
// as the first instructions on the vCPU, which is simplifying. but if we want the
// information in these selectors to match with anything in a GDT (i do!) we'll have to
// keep this initial state lined up with GDT entries ourselves.
//
// we could avoid setting up the GDT for the most part, but anything that might
// legitimately load the "valid" current segment selector would instead clobber the
// selector with zeroes.
sregs.cs.base = 0;
sregs.cs.limit = 0;
sregs.cs.selector = 4 * 8;
sregs.cs.type_ = 0b1010; // see SDM table 3-1 Code- and Data-Segment Types
sregs.cs.present = 1;
sregs.cs.dpl = 0;
sregs.cs.db = 0;
sregs.cs.s = 1;
sregs.cs.l = 1;
sregs.cs.g = 0;
sregs.cs.avl = 1;
sregs.ds.base = 0;
sregs.ds.limit = 0;
sregs.ds.selector = 5 * 8;
sregs.ds.type_ = 0b0010; // see SDM table 3-1 Code- and Data-Segment Types
sregs.ds.present = 1;
sregs.ds.dpl = 0;
sregs.ds.db = 1;
sregs.ds.s = 1;
sregs.ds.l = 0;
sregs.ds.g = 1;
sregs.ds.avl = 0;
sregs.es = sregs.ds;
sregs.fs = sregs.ds;
sregs.gs = sregs.ds;
sregs.ss = sregs.ds;
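// pack a `kvm_segment` into the 8-byte descriptor layout used for in-memory GDT entries.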
fn encode_segment(seg: &kvm_segment) -> u64 {
let base = seg.base as u64;
let limit = seg.limit as u64;
let lim_low = limit & 0xffff;
let lim_high = (limit >> 16) & 0xf;
let addr_low = base & 0xffff;
let desc_low = lim_low | (addr_low << 16);
let base_mid = (base >> 16) & 0xff;
let base_high = (base >> 24) & 0xff;
let desc_high = base_mid
| (seg.type_ as u64) << 8
| (seg.s as u64) << 12
| (seg.dpl as u64) << 13
| (seg.present as u64) << 15
| lim_high << 16
| (seg.avl as u64) << 20
| (seg.l as u64) << 21
| (seg.db as u64) << 22
| (seg.g as u64) << 23
| base_high << 24;
desc_low | (desc_high << 32)
}
sregs.gdt.base = self.gdt_addr().0;
sregs.gdt.limit = 0xffff;
self.gdt_entry_mut(4).write(encode_segment(&sregs.cs));
self.gdt_entry_mut(5).write(encode_segment(&sregs.ds));
}
}
#[test]
fn behavior_verify_kvm() {
let mut vm = TestVm::create();
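// `33 01` decodes to `xor eax, dword [rcx]` and `f4` to `hlt`; rcx is pointed at the code
// page below, so the xor reads the instruction's own bytes into eax.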
let inst: &'static [u8] = &[0x33, 0x01, 0xf4];
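// matches `TestVm::code_addr()`: the last 4KiB page of the 1MiB of guest memory.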
let code_addr = 1024 * 1024 - 4096;
{
// safety: VM is not running so we are the only writers to this memory
let slice = unsafe {
std::slice::from_raw_parts_mut(
vm.memory.offset(code_addr),
0x1000
)
};
slice[..inst.len()].copy_from_slice(inst);
}
let before_sregs = vm.vcpu.get_sregs().unwrap();
let mut regs = vm.vcpu.get_regs().unwrap();
vm.program(inst, &mut regs);
regs.rax = 0;
regs.rcx = code_addr as u64;
vm.vcpu.set_regs(&regs).unwrap();
vm.set_single_step(true);
let res = vm.run();
eprintln!("exit: {:?}", res);
let after_regs = vm.vcpu.get_regs().unwrap();
eprintln!("rip is now {:08x}", after_regs.rip);
eprintln!("rax is now {:08x}", after_regs.rax);
eprintln!("rcx is now {:08x}", after_regs.rcx);
let after_sregs = vm.vcpu.get_sregs().unwrap();
fn get_reg_changes(
before_regs: kvm_regs, after_regs: kvm_regs,
before_sregs: kvm_sregs, after_sregs: kvm_sregs
) {
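// note: left empty in this listing; no register comparison is actually performed yet.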
}
let changed_regs = get_reg_changes(regs, after_regs, before_sregs, after_sregs);
}
}