From b29167eeee880c9f7f31194c94aadf715733bf99 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sat, 23 Dec 2023 19:29:20 -0800 Subject: lots of stuff. yax rx notes --- .../blog/computer_misuse_rust_vtable_patching.md | 197 +++++++++++++++++++ source/blog/index.md | 10 + source/blog/now_what.md | 66 +++++++ source/blog/now_what/ntdll.dll | Bin 0 -> 2024736 bytes source/blog/now_what/rtlqpc_disassembly_start | 0 source/blog/regmap.md | 46 +++++ source/blog/the_ci_situation.md | 7 + source/blog/working_title.md | 61 ++++++ source/blog/yax/arch/rx.md | 215 +++++++++++++++++++++ source/blog/yaxgbc_dev_notes.md | 39 ++++ source/blog/yaxpeax.md | 36 ++++ 11 files changed, 677 insertions(+) create mode 100644 source/blog/computer_misuse_rust_vtable_patching.md create mode 100644 source/blog/index.md create mode 100644 source/blog/now_what.md create mode 100644 source/blog/now_what/ntdll.dll create mode 100644 source/blog/now_what/rtlqpc_disassembly_start create mode 100644 source/blog/regmap.md create mode 100644 source/blog/the_ci_situation.md create mode 100644 source/blog/working_title.md create mode 100644 source/blog/yax/arch/rx.md create mode 100644 source/blog/yaxgbc_dev_notes.md create mode 100644 source/blog/yaxpeax.md (limited to 'source/blog') diff --git a/source/blog/computer_misuse_rust_vtable_patching.md b/source/blog/computer_misuse_rust_vtable_patching.md new file mode 100644 index 0000000..ddc27e2 --- /dev/null +++ b/source/blog/computer_misuse_rust_vtable_patching.md @@ -0,0 +1,197 @@ +# computer misuse: rust vtable patching + +this is the first in a loosely-affiliated series of posts about wildly misusing what is available for us on the computer. setting the tone some, this first post is about an old bit of rust vtable patching i hacked together years ago. + +the claim at the time was that rust has no affordances for monkey-patching, and i decided to prove otherwise\* as much as possible\*\*. 
in case you're fluent in rust and x86, just posting the code might be fine. afterward, i'll talk about it a bit. + +``` +trait Person { + fn greeting(&self) -> StateOfMind; + fn name(&self) -> &'static str; +} + +struct Ixi { } +struct Katie { } + +#[derive(Debug)] +enum StateOfMind { + GoodMorning, + HappyBirthday, +} + +impl Person for Ixi { + fn greeting(&self) -> StateOfMind { + StateOfMind::GoodMorning + } + fn name(&self) -> &'static str { "ixi" } +} + +impl Person for Katie { + fn greeting(&self) -> StateOfMind { + StateOfMind::HappyBirthday + } + fn name(&self) -> &'static str { "katie" } +} + +#[inline(never)] +fn greet(you: &dyn Person) { + println!("{} says {:?} to you!", you.name(), you.greeting()); +} + +#[inline(never)] +fn correct_ixi() { + unsafe { + // gotta reprogram ixi real quick + let ixi_in_a_box: Box = Box::new(Ixi {}); + #[inline(never)] + unsafe fn inner_takes_dyn(v: Box) { + if let Some(vtable_offset) = get_greeting_offset() { + // ok we know where in the vtable to patch, just not where the vtable itself is yet. + + // v is a pointer to (data, vtable) + let dyn_thingie: [*const usize; 2] = std::mem::transmute(v); + let vtable = dyn_thingie[1]; + let table_entry = vtable_offset as usize / std::mem::size_of::(); + // get a pointer to the entry in the vtable we want to fix + let vtable_ptr = (vtable as *mut usize).offset(table_entry as isize); + let katie_greet = get_katie_greet_address() + .expect("if we got here we can read ixi's vtable, and the same should work for katie"); + + // memory mapping will have this as read-only. gotta fix that... 
+ libc::mprotect( + (vtable_ptr as usize & !(4095)) as *mut _, + 4096 * 2, + 7, // READ | WRITE | EXEC + ); + + // and commit a crime + std::ptr::write_volatile(vtable_ptr, katie_greet); + + // clean up after though + libc::mprotect( + (vtable_ptr as usize & !(4095)) as *mut _, + 4096 * 2, + 1, // READ + ); + } else { + panic!("well we need a vtable offset"); + } + } + + inner_takes_dyn(ixi_in_a_box); + } +} + +// just get the offset of `greeting` in the `Person` vtable +#[inline(never)] +unsafe fn get_greeting_offset() -> Option { + // the function is read, so this will be compiled. + // because `greeting` does not return a type with a destructor, it + // will end with a tail call to the function we want. the offset + // of that tail call is the magic number for the offset of the + // function to replace in the vtable. + // + // in debug builds, this ends with `call; pop; ret`. look for both. + // in both cases these are preceded by `mov rdi, rax`, spelled + // `4889c7`. + fn do_greeting(b: &Box) { + b.greeting(); + } + let f = do_greeting as *const u8; + std::ptr::read_volatile(f); + + // find the magic sigil. `None` indicates we couldn't find it! + let mut i = 1; + let mut vtable_offset = None; + while i < 100 { + let curr = f.offset(i); + if *curr == 0x48 && *(curr.offset(1)) == 0x89 && *(curr.offset(2)) == 0xc7 { + // now the offset is encoded in one of two ways: either an 8-bit + // offset if the vtable is tiny, or a 32-bit offset if it's larger. 
+ // + // 8-bit offset has modrm bits of 01_rrr_mmm + // 32-bit offset has modrm bits of 10_rrr_mmm + // for example, "jmp [rcx + 0x18]" is spelled + // ff 61 18 + // ^ ^ ^-- offset we want + // | ------ modrm 01_100_001, 8-bit displacement off reg 001 (rcx), r=4 (jmp; call is r=2) + // --------- opcode for call/jmp [mem] + let modrm: u8 = *curr.offset(4); + let offset = curr.offset(5); // we might not actually read this, depending on mod bits + match modrm >> 6 { + 0b00 => { + // no offset - this is just `call [reg]` + vtable_offset = Some(0); + } + 0b01 => { + // 8-bit offset + vtable_offset = Some(std::ptr::read_unaligned(offset) as u32); + } + 0b10 => { + // 32-bit offset + vtable_offset = Some(std::ptr::read_unaligned(offset as *const u32)); + } + 0b11 => { + // this is actually `{call,jmp} reg`. we'd be in a bad spot here. + } + _ => { + // these are just unreachable. if everything is well-formed, anyway. + unreachable!(); + } + } + // anyway we found the instruction so lets break and move on + break; + } + i += 1; + } + + vtable_offset +} + +fn get_katie_greet_address() -> Option { + // this is for the most part the same logic as correcting ixi, but instead of + // writing to ixi's vtable, we read katie's vtable + unsafe { + let katie_in_a_box: Box = Box::new(Katie {}); + #[inline(never)] + unsafe fn inner_takes_dyn(v: Box) -> Option{ + if let Some(vtable_offset) = get_greeting_offset() { + // ok we know where in the vtable to read, just not where the vtable itself is yet. 
+ + // v is a pointer to (data, vtable) + let dyn_thingie: [*const usize; 2] = std::mem::transmute(v); + let vtable = dyn_thingie[1]; + let table_entry = vtable_offset as usize / std::mem::size_of::(); + // get a pointer to the entry in the vtable we want to read + let vtable_ptr = (vtable as *mut usize).offset(table_entry as isize); + // and commit a crime + Some(*vtable_ptr) + } else { + None + } + } + + inner_takes_dyn(katie_in_a_box) + } +} + +fn main() { + greet(&Ixi {}); + greet(&Katie {}); + println!("wait that's not right, hold on a sec..."); + correct_ixi(); + greet(&Ixi {}); +} +``` + +... and when you run it, you'll get ... the wrong thing. gotta figure it out and fix. + +

tl;dr? is rust bad?

+ +index + +

ps: some windows stuff

+ +
+#eval radare2 -q -c 'pd 43 @ 0x180040150' ./now_what/ntdll.dll | aha --no-header --stylesheet +
diff --git a/source/blog/index.md b/source/blog/index.md new file mode 100644 index 0000000..6eca535 --- /dev/null +++ b/source/blog/index.md @@ -0,0 +1,10 @@ +welcome to my blog + +*
  ✨timeless✨ 
[yaxpeax](./yaxpeax.html) +* ... +*
2021-05-17     
[regmap](./regmap.html) +*
2022-04-08     
[`Instant::now()` what?](./now_what.html) +*
2023-10-14     
[annular eclipse](./astro/2023/oct14_annular.html) +*
2023-12-13 ... 
[gameboy (color) emulation notes](./yaxgbc_dev_notes.html) +*
2023-12-19     
[Renesas RX disassembler notes](./yax/arch/rx.html) +*
2023-12-?      
[ci.butactuallyin.space, or, "why i don't just use github actions"](./the_ci_situation.html) diff --git a/source/blog/now_what.md b/source/blog/now_what.md new file mode 100644 index 0000000..ae76062 --- /dev/null +++ b/source/blog/now_what.md @@ -0,0 +1,66 @@ +# `Instant::now()` what? + +until Rust 1.60, `Instant::now()` included a [heavyweight hammer](https://github.com/rust-lang/rust/blob/bb1e42599d0062b9a43e83b5486d61eb1fcf0771/src/libstd/time.rs#L153-L194) to enforce the standard library's guarantee that `Instant::now()` monotonically increases. that is, it does not go backwards. some hardware has clocks that go backwards, and Rust sees fit to guard against such ridiculousness. + +[tl;dr at the bottom](./now_what.html#tldr) + +so, which hardware/software pairs are `actually_monotonic()`? according to [this comment](https://github.com/rust-lang/rust/blob/bb1e42599d0062b9a43e83b5486d61eb1fcf0771/src/libstd/time.rs#L153-L194): + +* (`OpenBSD`, `x86_64`) is [not monotonic](https://github.com/rust-lang/rust/issues/48514) +* (`linux`, `arm64`) is [not monotonic](https://github.com/rust-lang/rust/issues/49281) ([and again](https://github.com/rust-lang/rust/issues/56940)) +* (`linux`, `s390x`) is [not monotonic](https://github.com/rust-lang/rust/issues/49281#issuecomment-375469099) +* (`windows`, `x86`) is [not monotonic](https://github.com/rust-lang/rust/issues/51648) + - hardware here might be a haswell chip, but under xen (details at the bottom of OP: `Intel64 Family 6 Model 63 Stepping 2 GenuineIntel ~2400 Mhz`, [lookup](https://en.wikichip.org/wiki/intel/cpuid#Big_Cores_.28Server.29)) +* (`windows`, `x86_64`) is [not monotonic](https://github.com/rust-lang/rust/issues/56560) + - unknown hardware, also aws +* (`windows`, `x86`) is [not monotonic](https://github.com/rust-lang/rust/issues/56612) + +and Firefox has a [similar hammer](https://bugzilla.mozilla.org/show_bug.cgi?id=1487778) to force apparent monotonicity of "now". 
+ +i've seen people talk about this before, with shock and awe and horror. i've [tweeted about this before](https://twitter.com/iximeow/status/1114677717897580544). there have been [sharp words](https://lwn.net/Articles/388286/) about x86 TSCs in linux discussions. + +what's interesting to me today is that Rust concludes that on the same inconsistent hardware, windows and openbsd get clocks wrong in a way linux does not. + +so: on windows, Rust uses [`QueryPerformanceCounter`](https://github.com/rust-lang/rust/blob/bb1e42599d0062b9a43e83b5486d61eb1fcf0771/src/libstd/sys/windows/time.rs#L35-L43). for macos, [`mach_absolute_time`](https://github.com/rust-lang/rust/blob/bb1e42599d0062b9a43e83b5486d61eb1fcf0771/src/libstd/sys/unix/time.rs#L150-L155), and linux, [`clock_gettime(CLOCK_MONOTONIC)`](https://github.com/rust-lang/rust/blob/bb1e42599d0062b9a43e83b5486d61eb1fcf0771/src/libstd/sys/unix/time.rs#L301-L303). these all seem like the reasonable hardware-abstracting ways to get a monotonic clock, letting the OS paper over broken hardware when possible. + +and they do ([linux](https://github.com/torvalds/linux/blob/b91c8c42ffdd5c983923edb38b3c3e112bfe6263/lib/vdso/gettimeofday.c#L105-L107), [windows](./now_what.html#ntdll_RtlQueryPerformanceCounter)). i didn't care to figure out what openbsd does, but it certainly also tries to fast-path time checks on reasonable hardware. + +so how does linux get steady monotonically increasing times on hardware that windows can't make consistent? [a random comment on stackoverflow](https://stackoverflow.com/questions/28921328/why-does-windows-switch-processes-between-processors#comment101077103_28921779) believes that windows aggressively moves processes between cores, where linux tightly couples processes and cores, which might mean that windows happens to expose inconsistency more often. it also proceeds right into claims about product claims for no good reason. 
linux just [does what Rust also now does](https://github.com/torvalds/linux/blob/1831fed559732b132aef0ea8261ac77e73f7eadf/arch/x86/include/asm/vdso/gettimeofday.h#L294-L318) - "if the clock looks wrong just saturate and say [it didn't change](https://github.com/torvalds/linux/blob/65c61de9d090edb8a3cfb3f45541e268eb2cdb13/lib/vdso/gettimeofday.c#L78)" (as of [1.60 anyway](https://github.com/rust-lang/rust/commit/9d8ef1160747a4d033f21803770641f2deb32b25)), generally. on x86 specifically, linux [falls back to the kernel](https://github.com/torvalds/linux/blob/65c61de9d090edb8a3cfb3f45541e268eb2cdb13/lib/vdso/gettimeofday.c#L258-L261) if it decided a clock is no longer trustworthy, as determined by the last vdso data update. + +`clock_gettime(CLOCK_MONOTONIC)` also makes stronger claims than `QueryPerformanceCounter`, asserting that the returned time is with respect to the system's startup, where QPC says that it's independent of any external time source (so, not comparable to wallclock times). according to microsoft's documentation for QPC, windows XP may be tripped up by hardware incorrectly reporting TSC variance, Vista chose to use HPET instead of a TSC, windows 7 was back to using a TSC if available (modulo incorrect hardware reporting), and windows 8+ use TSCs. i didn't bother looking to see what windows 10 does in the kernel, but in `ntdll.dll!RtlQueryPerformanceCounter`, on x86, it certainly does rely on `rdtsc` with appropriate barriers for serialization. + +but then linux developers report that some hardware will change TSCs and lie about the current time, which may lead to incorrect time reports from `clock_gettime` in the fallback kernel code anyway. + +so why did Rust decide that windows is untrustworthy due to the presence of broken hardware, while linux is trusted to not give totally bogus times? idk, probably because there were reports of broken windows times on x86, and not reports of broken linux times on x86. 
maybe linux' attempt at monotonization is sufficient for the worst cases of whacky hardware. maybe windows has a particularly bad time migrating between VMs, might be hinted by a section from this [high-resolution time stamps](https://docs.microsoft.com/en-us/windows/win32/sysinfo/acquiring-high-resolution-time-stamps) document: `; and on Hyper-V, the performance counter frequency is always 10 MHz when the guest virtual machine runs under a hypervisor that implements the hypervisor version 1.0 interface`. the windows issues all have some evidence of being related to times gathered in a VM (maybe even AWS specifically). the Firefox issue seems to relate to older hardware, but [some comments](https://bugzilla.mozilla.org/show_bug.cgi?id=1487778#c5) suggest they actually saw instability on linux as well. i can't see the old crash reports, so i don't have any hope of seeing implicated hardware. + +even if windows was penalized for what might be a primarily-in-VMs time issue, the hammer fixes what was an uncontrolled, unpredictable crash due to hardware-level behavior into just a performance issue. that's a good improvement. + +

tl;dr? is rust bad?

+ +given that this was a fix for crashes with murky circumstances where the only clear information - especially easily available - is that the circumstance should be impossible and that buggy hardware is prevalent, the technical decisions made here were reasonable given what the parties knew at the time and the constraints they were subject to. it's fine. + +index + +

ps: some windows stuff

+ +windows is closed source. so to know how it handles hardware differences in tsc consistency we get to read compiled code. + +so here's `ntdll.dll!RtlQueryPerformanceCounter`. + +
+#eval radare2 -q -c 'pd 43 @ 0x180040150' ./now_what/ntdll.dll | aha --no-header --stylesheet +
+ +first, `mov r8b, byte [0x7ffe03c6]` loads a byte that will be used to check which way we should read time counters. `r8b` will be reused several times in this function. + +all early `je` checks are to branch off to some cold code far away from this function. the happy path is to fall through to `0x18004019d` where either we believe `rdtscp` is sufficient to read timers, or we should `lfence; rdtsc` and come back. either way this loads the TSC into `edx:eax`, which is reassembled into a 64-bit number before being offset and scaled (?) by some core-local (?) information in `r9`. and if this compares less than something (?), branch back and see if we should use the cold path anyway. the cold path code returns here, where we eventually write to the out-pointer parameter in the `mov` at `0x1800401cf`. + +the cold path is interesting and worth looking at too: + +
+#eval radare2 -q -c 'pd 21 @ 0x1800b6a3e' ./now_what/ntdll.dll | aha --no-header --stylesheet +
+ +again we're consulting `r8b` for which mechanism we can safely use. down at `0x1800b6a6a` is the worst case, calling into `NtQueryPerformanceCounter` - a wrapper to make the syscall into the kernel for whatever fallback mechanism it has available. this is how windows eventually falls back to HPET if something is seriously wrong. + +all in all, not dissimilar from linux's implementation of `tsc`-based timers. diff --git a/source/blog/now_what/ntdll.dll b/source/blog/now_what/ntdll.dll new file mode 100644 index 0000000..a0dea6b Binary files /dev/null and b/source/blog/now_what/ntdll.dll differ diff --git a/source/blog/now_what/rtlqpc_disassembly_start b/source/blog/now_what/rtlqpc_disassembly_start new file mode 100644 index 0000000..e69de29 diff --git a/source/blog/regmap.md b/source/blog/regmap.md new file mode 100644 index 0000000..e792398 --- /dev/null +++ b/source/blog/regmap.md @@ -0,0 +1,46 @@ +# regmap + +an april fools prank but make it March. [the README says it best](https://github.com/iximeow/regmap#regmap): + +> `regmap` handles this design oversight by allowing users to memory-map the processor's general-purpose registers (GPR). + +and + +> # why +> because it's funny + +but more seriously, [`regmap`'s emulation](https://github.com/iximeow/regmap/blob/no-gods-no-/src/regmap.rs#L442-L746) of x86 was a first draft of how i might want to describe instruction semantics like i eventually did in [`yaxpeax-core`](https://github.com/iximeow/yaxpeax-core/blob/no-gods-no-/src/arch/x86_64/semantic.rs#L266). this is where i discovered that `do_binop` and its approach, as well as `conditional_loc_write` as it eventually was written in `yaxpeax-core`, don't work as well with dead code elimination (circa 2021 Rust) as i'd wanted. 
+ +and anyway, sometimes i forget that `regmap` made it possible to write this [disorienting mess of a program](https://github.com/iximeow/regmap/blob/no-gods-no-/examples/hello_world.rs): +``` +use regmap::registers::*; + +fn main() { + unsafe { regmap::map_registers(); } + + println!("rsp: {:x}", RSP.load()); + RSP.sub_assign(8); + RIP.store(lol as u64); +} + +fn lol() { + println!("very good"); + RSP.sub_assign(8); + RDI.store(1); + RSI.store(1234); + RIP.store(lol_args as u64); +} + +fn lol_args(a1: u64, a2: u64) { + println!("a1: {}, a2: {}", a1, a2); + println!("goodbye!"); + std::process::exit(0); +} +``` +which, you guessed it (?), prints out +``` +rsp: 0xwhatever +very good +a1: 1, a2: 1234 +goodbye! +``` diff --git a/source/blog/the_ci_situation.md b/source/blog/the_ci_situation.md new file mode 100644 index 0000000..d7929ae --- /dev/null +++ b/source/blog/the_ci_situation.md @@ -0,0 +1,7 @@ +# build-o-tron + +i don't particularly trust that GitHub will be where i want my authoritative repos to exist, and i trust free CI on someone else's infrastructure even less. sr.ht exists, and [forgeperf.org](https://forgeperf.org/) is compelling! but i have infrastructure at home and i'd like to get precise (ish) perf measurements when validating changes to sensitive code like `yaxpeax-x86`. "How Hard Is Writing A CI System Anyway?" + +it's a little annoying i guess, but not that bad. + + diff --git a/source/blog/working_title.md b/source/blog/working_title.md new file mode 100644 index 0000000..88ef25f --- /dev/null +++ b/source/blog/working_title.md @@ -0,0 +1,61 @@ +references: + * ~/linux/include/linux/mm.h + * man 2 mmap + +# mm thingy + +i told some people that i wanted to write a memory management post and they all thought i was going to talk about malloc. but no, it'll mention malloc maybe once and zoom right along. 
i've been finally connecting a lot of details in how linux implements some kinds of memory tricks so i'm going to write it all down and you get to learn it too. so here's a small chapter on how computers use memory, from a book i'm not writing. + +if you're already familiar with how hardware addresses memory, and how you'd configure an MMU and you're just here for the interesting Linux bits: i'll get there eventually, you might want to [skip over the buildup](link to the linux parts). + +## primitives + +### electricals and computer architecture (like, physical discrete chips on a breadboard) +i want to start from (approximately) scratch here: the juicy nuggets that i set out to write about are at the end, but i think there's something difficult to appreciate without also holding "how we got here" in your head alongside. so, let's start with the simplest machines and see how we get to the systems we have... today. + +the simplest computer might be something that just has load, store, and arithmetic instructions. maybe even just "load" for all data transfer - "load" to memory, "load" from memory, "load" from one register to another. the processor has a few registers it can directly operate on. so, to do complex workloads you attach some read-write memory (RAM) to store data into with your load and store instructions. volatile memory is expensive and requires power to maintain bits, so you also attach read-only storage (ROM) to store some bootstrap programs like a BASIC interpreter. + +a simple processor might do 8-bit operations, and use up to 16 bits to address storage. that gives it 2^16 addresses it can access, but your largest (and very expensive) storage parts might only be as large as 4kb - 2^12 needing only 12 bits to address any byte in the part. the processor itself just knows about addresses, and might not particularly care what it's reading or writing to - an access is an access, a load is a load. 
it just executes a `ld A, (4567h)`, and that execution is just setting an address on a memory bus and reading whatever is on the data lines a few cycles later. + +this is very flexible! say there aren't monolithic storage devices large enough to span the whole 16-bit-addressable range. if you happen to know none of your storage parts need more than 12 bits to select a byte, that leaves four address-select lines that the processor _will_ signal, if an instruction says to do so, but you might use to select _which part_ you're addressing. to the program running on the processor, exactly what address selection does isn't really important, just that it selects an address and accesses it. + +so with a single kind of load instruction, you can have a computer that addresses volatile and non-volatile memory. great! you can go even further, and have some address-select lines actually select things that aren't "memory" - a keyboard, some latches driving LEDs, who knows. even a video buffer, if you're getting wild. all at the low cost of deciding to use a few bits of address selection to select where you're actually selecting an address from. at this point you're most of the way to a [TRS-80](trs-80-schematic-diagram-goes-here). + +processors might have a mechanism to auto-increment after an access, so it makes sense to ensure sequential addresses are to the same storage. if you've got 4kb of RAM and 4kb of ROM you can use the low twelve bits to select a byte in each of those, and pick some other higher bit to select _which_ of those you're accessing. say you decide that bit 15, `0b1000_0000_0000_0000` is how you do that device selection. a RAM address would be something low, between `0b0000_0000_0000_0000` and `0b0000_1111_1111_1111`. then a ROM address would be something higher, between `0b1000_0000_0000_0000` and `0b1000_1111_1111_1111`. as a diagram: +| DIAGRAM GOES HERE | + +now you've also got enough of a computer to have problems. 
it's well and good to say "0x0000 to 0x0fff is RAM, 0x8000 to 0x8fff is ROM", but the processor doesn't know or care about this. the processor will still happily try executing an `ld A, (4567h)`, even if your manual says that's a nonsense address. it will still try to set the address `0100_0101_0110_0111` on the address bus, and still read a byte from the data bus a few cycles later. so what happens? in programming languages people talk about "undefined behavior", and hardware can be undefined just the same. + +_probably_ what will happen is the address lines that weren't going to select something useful get ignored. so the high bit might still select RAM or ROM, and only the low twelve bits would get used for addressing. the bits in between could easily just be pins of the processor that aren't even electrically connected to anything interesting. + +you could also have some separate circuitry that detects an invalid address on the address bus and signals an interrupt to handle the condition. but that's a lot of circuitry to handle an error condition that you shouldn't be encountering anyway - just don't make an incorrect program that does stray memory accesses and we can all avoid the hassle. + +if you decided to write out what the hardware _does_ with this "undefined behavior", you might end up with a machine that has a strange mapping of addresses to memory: bit 15 is useful, bits 12, 13, and 14 are not, then bits 0 through 11 are functional again. the earlier diagram skipped over the undefined regions, but if you wrote out how this addressing would work in practice, when you _execute_ these "nonsense" accesses, you'd have a diagram more like this: + | LONGER AND MORE DEPRESSING DIAGRAM GOES HERE | + +note that the processor _really doesn't care what an address is selecting_. multiple addresses might select the same physical byte of memory in the machine! that's fine. the processor won't complain, anyway. 
+ +if you designed a program in tandem with the storage it would be residing in, you might even take advantage of this: _an address now does not have to select the same byte as an address later_. as long as the program using a so-afflicted storage system knows how it will behave, things could still work! _this_ gives you a very interesting option in designing a computer: say you want to store downright luxurious amounts of data - 256kb in total. this would need 18 bits of addressing to pick out individual bytes, but your processor is still a little tiny machine that only has 16 address bus pins. if you've designed your program along with the storage for this machine, you could work around this by arranging your storage into _banks_ of memory and reserving a specific byte in memory to select _which bank_ an address should select! + +now, you'd always want to be able to access the bank select byte. you probably want some RAM that's easily accessed regardless of what bank is selected. so, say you reserve 32kb for bank-select-and-other-misc-purposes, with the banked memory being the other 32kb you can address at a given time. with this scheme, 8 bits for bank selection could let you have a program that spans `32kb (non-banked) + 8 * 32kb (banked)` or `288kb` (!!) of memory, while still using a processor that can only address 16 bits - 64kb - of memory! (we've now invented the [NES's Memory Management Controller/Multi-Memory Controller/MMC](https://www.nesdev.org/wiki/Mapper).) 
+ +this all is to reinforce some important points about addresses as a simple processor sees them: +* addresses don't have to go to a single block of storage +* addressing causes a computer to perform actions, electrically or otherwise +* addresses can select whatever they're physically wired up to select - maybe not even storage +* one address does not have to always select the same word in memory +* "addressing" can be a larger operation than just the address indicated on an address bus when a processor is executing an instruction +* the processor might trod along even when working with unintended addresses + +one last thought before getting really into it: i talked at you about a computer design where some addresses select read-only memory, and some addresses select read-write memory. and, a bit about what might happen if you try to do an access to an address that isn't what the hardware was designed to support. but the same line of "what if" might also have you ask, what happens if you try to store a byte to read-only memory? + +the simplest computer might faithfully select ROM at the address an instruction indicated, and assert that a write is occurring on the data bus. and put the word on its data lines. and wait the agreed-upon number of cycles for the addressed device to complete its storage. and the memory device on the other end of the lines may just have entirely ignored it. the write would be lost. the hardware doesn't much care if it's driven incorrectly, unless it's made to care. + +so, one last point about addresses: +* an address doesn't have to tell you or the processor what it's usable for + +OK! hopefully that's enough about addresses electrically selecting bytes in storage somewhere. over in processors themselves, things get off the rails. + +### memory management units + +say you've gone forward a few years. 
it's diff --git a/source/blog/yax/arch/rx.md b/source/blog/yax/arch/rx.md new file mode 100644 index 0000000..237d4de --- /dev/null +++ b/source/blog/yax/arch/rx.md @@ -0,0 +1,215 @@ +# Renesas RX + +notes from writing `yaxpeax-rx`, largely from reading the rx v1/v2/v3 manuals: + +* `rxv1`: RX Family RXv1 Instruction Set Architecture (User's Manual: Software), Rev. 1.30 (Dec 2019) + * retrieved 2023-12-16 from https://www.renesas.com/us/en/document/mas/rx-family-rxv1-instruction-set-architecture-users-manual-software-rev130 + * sha256: `e659dd509141da6bb1cfabf26c9f9ab5996d02060acaad2b5702963116834415` +* `rxv2`: RX Family RXv2 Instruction Set Architecture (User's Manual: Software), Rev. 1.00 (Nov 2013) + * retrieved 2023-12-16 from https://www.renesas.com/us/en/document/mas/rx-family-rxv2-instruction-set-architecture-users-manual-software + * sha256: `c12fc8d16adf1530f2cad3f75974d2a29062580a984a71fd9461417b66bba18a` +* `rxv3`: RX Family RXv3 Instruction Set Architecture (User's Manual: Software), Rev. 1.00 (Nov 2018) + * retrieved 2023-12-16 from https://www.renesas.com/us/en/document/mas/rx-family-rxv3-instruction-set-architecture-users-manual-software-rev100 + * sha256: `829815515a57d077bdfa418e0e167b512f2a04b3db3613329a4d8980399cf74c` + +broadly: of all the instruction sets, this is definitely one of them. 16 +general-purpose registers. some instructions have shorter-form encodings that +use only three bits for register selection, rather than four. so i imagine a +preference to use the low eight registers for code density reasons. i'm curious +how that works out for real programs and compilers weighing register choice +like that. + +`BMCnd` stands out as an interesting instruction; `Conditional bit transfer` +undersells it. it moves the state of a condition, `0` or `1`, to the specified +bit in a destination. the destination can either be a register or memory, and +otherwise leaves the destination value unmodified. 
`SCCnd` is similar but +behaves more like x86's `setcc` instructions: set the entire destination +byte/register to `0` or `1` depending on the condition. + +## rx v2 + +v2 adds a smattering of new instructions, and architectural extensions - see section `3.2 List of RXv2 Extended Instruction Set`. + +* a second accumulator register was added, bringing the set to `a0` and `a1`. +* many instructions were extended to operate on either `a0` or `a1`, in place of prior `a0`-only forms. +* `fsqrt`! new! and 3-operand forms of `fadd`, `fmul`, and `fsub`. +* and, accumulators are 72-bit now. + +## rx v3 + +v3 adds less, but also more. again, section `3.2 List of RXv3 Extended Instructions` for exact info. + +* `bfmov`/`bfmovz`, which i talk a bit more about below, for bulk bit transfers between words +* a 3-operand form of `xor`, giving it parity with other instructions like `add`, `sub`, etc +* AND AN ENTIRE SET OF DOUBLE-PRECISION INSTRUCTIONS AND 16 NEW DOUBLE-PRECISION REGISTERS. + +practically speaking, the summaries here are accurate with what i found when reading +through the manuals' contents. why did i have to read through the manuals +meticulously? + +# decode table, or lack thereof + +instruction encodings are listed in alphabetic order of instruction mnemonics. this is not amenable to writing a disassembler.. so i went through all three versions of the manual and transcribed encodings *from* the manual into a text file i could easily reorder. and so [notes/encoding_table](https://github.com/iximeow/yaxpeax-rx/blob/no-gods-no-/notes/encoding_table) was born. reorder that to be approximately by bits, and [notes/reordered_encodings](https://github.com/iximeow/yaxpeax-rx/blob/no-gods-no-/notes/reordered_encodings). finally, i tried finding patterns across encodings and simplifying the total number of encodings across all instructions, and that left me with [notes/grouped_encodings](https://github.com/iximeow/yaxpeax-rx/blob/no-gods-no-/notes/grouped_encodings). 
+ +the disassembler itself is largely transcription of this table into source code. including, unfortunately, a massive chain of if/else from `0b00000000` stopping at dozens of points on the way to `0b11111111`. :') + +# encoding notes + +## operands... + +instructions with `ld` or `ls` fields encode an operand that is either `[Reg]`, +`disp[Reg]`, or `Reg` (just the register, no memory access). some of these +instructions, like the `06` encodings of `sub`, `cmp`, `add`, ... also have a +`mi` field that indicates how the memory operand is extended for use with the +second operand - which may be used only as a second source, or sometimes used +as a source+destination. + +so, if `ld` is `0b11` indicating a `Reg`, and `mi` indicates, for example, `.B` +meaning sign extension of a byte. but there is no indication in the manual +that, for example, `sub` would have an encoding that would mean `sub.b r1, r5`. +so what does `mi = 0b00 = b` mean for these instructions? no idea! `yaxpeax-rx` +assumes the bits are ignored for direct register operands. someone please prove +this wrong! or right. either is fine. + +## stnz/stz v2+ encoding typo + +encoding `(2)` of both of these instructions is a new extension in `RXv2`. unfortunately the manual has a typo: it says that `stnz` encoding 2 looks like... + +``` +(2) STNZ src, dest + +b7 b0 | b7 b0 | b7 b0a +1 1 1 1 1 1 0 0 | 0 1 0 0 1 0 1 1 | [ rs ] [ rd ] + ^^^^^^^ relevant +``` + +while encoding 2 of `stz`... +``` +(2) STZ src, dest + +b7 b0 | b7 b0 | b7 b0a +1 1 1 1 1 1 0 0 | 0 1 0 0 1 0 1 1 | [ rs ] [ rd ] + ^^^^^^^ same as above! +``` + +are `stz` and `stnz` somehow encoded the same? confusion abounds. internet dog the6p4c had the good idea to check binutils to cross check with what Renesas themselves might have said on the matter. 
they found: + +[`[PATCH v2][RX] Add RXv2 Instructions`](https://sourceware.org/legacy-ml/binutils/2015-04/msg00081.html) +``` ++ ++/** 1111 1100 0100 1011 rsrc rdst stz %1, %0 */ ++ ID(stcc); SR(rsrc); DR(rdst); S2cc(RXC_z); ++ ++/** 1111 1100 0100 1111 rsrc rdst stnz %1, %0 */ ++ ID(stcc); SR(rsrc); DR(rdst); S2cc(RXC_z); +``` + +which pretty clearly says "`stz` has the low bits of `1011`", "`stnz` has the low bits of `1111`". confusion resolved. EXCEPT: this includes a *different* copy/paste error! both instructions here have `S2cc(RXC_z)`. there's a followup commit for this, + +``` +commit 239efab16429cad466591ccd1c57bba786171765 +Author: Yoshinori Sato +Date: Thu Dec 17 01:42:34 2015 +0900 + + RXv2 support update + + 2015-12-22 Yoshinori Sato + + opcodes/ + * rx-decode.opc (movco): Use uniqe id. + (movli): Likewise. + (stnz): Condition fix. + +[...snip...] + + /** 1111 1100 0100 1111 rsrc rdst stnz %1, %0 */ +- ID(stcc); SR(rsrc); DR(rdst); S2cc(RXC_z); ++ ID(stcc); SR(rsrc); DR(rdst); S2cc(RXC_nz); + +[...snip...] + +``` + +so eventually everything ended up in the right state. but it's *very* funny to +look through the history and realize there were two copy-paste errors in +different directions about these two instructions. cursed additions! + +## cmp... + +cmp encoding (2), for `cmp #uimm:8` could be read as the bit pattern +``` +0 1 1 1 0 1 li | [ opc ] [ rs2 ] +``` +like `cmp` encoding `(3)`, or similar encodings of `mul`, `and`, `or`, but with `opc=0b101`. it has the additional constraint of `li=0b01` in such a reading, but this raises a question.. if `opc=0b000` allows four immediate operand lengths - 8, 16, 24, and 32 bits, sign-extended to 32 bits - why not allow all operand lengths with zero-extension for `opc=0b101`?? alas. + +## double-precision instructions... + +also in the area of +``` +0 1 1 1 0 1 li ... +``` +instructions, in RXv3 a new set of double-precision and related instructions were added. 
this makes another pattern with this encoding clearer: `li` picks the number of bytes to be read for operands, even though none of the operands are necessarily interpreted as an immediate. + +`li=0b01` usually represents a 32-bit immediate encoded as a sign-extended 8-bit value. so, read `0x7a`, read a byte for the opcode and destination register, then read one byte for the immediate. but for instructions like `int`, the encoding works out as +``` +0 1 1 1 0 1 0 1 | 0 1 1 0 0 0 0 0 | [ uimm:8 ] + li=01 opc=0110 rd=0000 ^ and read the 1-byte immediate of li=01 +``` + +RXv3 extends this - where a 2-byte immediate might be involved in an instruction like +``` +0 1 1 1 0 1 1 0 | 0 0 0 1 0 1 1 0 | 0 1 0 1 0 1 0 1 | 1 0 1 0 1 0 1 0 + li=10 opc=0001 rs2=0110 imm=0x55AAi16 +``` +other new instructions, like `dadd r6, r5, r4`, are encoded.... *similarly* +``` +0 1 1 1 0 1 1 0 | 1 0 0 1 0 0 0 0 | 0 1 0 1 0 0 0 0 | 0 1 1 0 0 1 0 0 + "li=10" reserved? rs2=0101 opc=0000 rd=0110 rs=0100 +``` +`li` still means "read two bytes"! they're just not an immediate anymore. wild. + +## opcode selectors move around! + +in RXv3, with the new double-precision instructions, there is an interesting consistency decision to note... + +consider the `{dadd,dsub,dmul}` encoding pattern of +``` +0 1 1 1 0 1 1 0 | 1 0 0 1 0 0 0 0 | [ rs2 ] [ opc ] | [ rd ] [ rs ] +``` +for these instructions, the exact opcode is chosen by the four `opc` bits in the low nibble of the third byte. sure, that's fine! one of the possible opcodes here is `dcmp`, whose condition is indicated by the value of `rd`. this means that `dcmp` is encoded like: + +``` +0 1 1 1 0 1 1 0 | 1 0 0 1 0 0 0 0 | [ rs2 ] [ opc ] | [ rd ] [ rs ] + opc=0111 rd=cm={.., UN, EQ, ..} +``` +or, an instruction like `double-OP src, src2` and `dest` repurposed otherwise. 
+ +this is in contrast to other two-operand instructions like `dabs`, encoded like: +``` +0 1 1 1 0 1 1 0 | 1 0 0 1 0 0 0 0 | [ rs ] [ opc ] | [ rd ] [ opc2] + opc=1100 opc2=0001 +``` +where the instruction has a skeleton more like `double-OP src, dest`, with `rs` being the repurposed field. this follows! the instruction no longer has two source operands, but does have a destination operand. + +i'm deeply curious why `rs` is the repurposed field here, rather than `rs2`. in that case, the "opcode" would be the third byte in its entirety, which seems like a nice property on its own. alternatively, maybe keeping the semantics of register selector bits the same simplifies decoder hardware... + +## float instruction encodings + +the three-operand forms of float instructions have similar mappings from bits to opcodes, compared to scalar operations. + +|bits|scalar|float| +|----|----|----| +|`0000`|`sub`|`fsub`| +|`0001`|`cmp`|`undef`| +|`0010`|`add`|`fadd`| +|`0011`|`mul`|`fmul`| + +this does not continue to be the case for double-precision instructions, unfortunately. for those instructions, `0001` tends to select `dadd`, rather than leave space for a future `fcmp`. + +## bitfields + +`bfmov` and `bfmovz` include a triplet of immediates to describe "move N bits starting from bit A out of source and into dest at bit B". the manual then goes on to say, + +> If (slsb + width) > 32 and (dlsb + width) > 32, then dest becomes undefined. + +... but that implies that if only one of the two overflows, dest is well-defined somehow? i think the manual *means* `or` in that sentence, alas. diff --git a/source/blog/yaxgbc_dev_notes.md b/source/blog/yaxgbc_dev_notes.md new file mode 100644 index 0000000..2a14567 --- /dev/null +++ b/source/blog/yaxgbc_dev_notes.md @@ -0,0 +1,39 @@ +# yaxgbc + +i hack on a gameboy color emulator from time to time. it started as... 
[a disassembler](https://git.iximeow.net/yaxpeax-sm83/?h=no-gods-no-) for the kind-of-but-not-Z80 processor in the Gameboy and Gameboy Color. in December or so, i looked at that library, remarked to myself that the whole disassembler is maybe 400 lines of Rust, and wondered how it would fare seeing real use. + +so i started writing enough of an emulator to see that the disassembler worked (it did! yay!), and started measuring how it affected the dependent code: +* actual runtime overhead? +* how large is the compiled decoder? + +TODO: fish up old numbers and screenshots. because there was substantial runtime impact, and i spotted a [missed optimization](https://github.com/rust-lang/rust/issues/107208) in rustc. i entirely rewrote the [`sm83` disassembler](https://github.com/iximeow/yaxpeax-sm83/commit/819e8a30d20c28398a00976a9925e9e741950bee) to be easier to merge the disassembler directly into code using it. as a [length-only decoder](https://github.com/iximeow/yaxpeax-sm83/commit/819e8a30d20c28398a00976a9925e9e741950bee#diff-9176af78feab5192ec447e97a2b52a0ba00b22c9e9b7ac3fc8c9763a5e9bd5caR50-R55), the decoder is on the order of 85 bytes without rustc being improved. + +somewhere between discovering the issue and coming up with a redesigned decoder interface, i had an existential panic about code reuse being fundamentally incompatible with high performance/tightly-integrated libraries. i no longer have any such panic :) + +## bugs aka dev log + +### video issues ("ascii `video` to the terminal is good right") + +### video issues ("why is the gameboy logo rendered wrong") + +### interrupt issues (are they level-triggered or edge-triggered? what clears interrupt bits, if anything) + +### input issues (edge/level trigger on inputs too) + +### clock sync (sleep() accuracy on windows and timeBeginPeriod) + +### audio issues ("there isn't any") + +### audio issues ("sounds are.. backwards?? 
what gives" [cycles/sample vs samples/sec]) + +### audio issues ("noise noises too long. why." [channel 4 length docs]) + +### audio issues ("frequencies are all off?" [timer multiplier issue]) + +### video issues ("why is parallax in link's awakening broken" [not handling lyc=0]) + +### video issues ("why is oracle of seasons/oracle of ages corrupted" [dma transfers to odd addresses]) + +### video issues ("why is wario land 3 corrupted" [dma transfers to address 0, expecting high bits set]) + +... diff --git a/source/blog/yaxpeax.md b/source/blog/yaxpeax.md new file mode 100644 index 0000000..9f8cc88 --- /dev/null +++ b/source/blog/yaxpeax.md @@ -0,0 +1,36 @@ +# yaxpeax + +a few things i talk about and hack on refer to _The Yaxpeax Project_. or, just "yaxpeax". [yaxpeax-arch](https://git.iximeow.net/yaxpeax-arch/about/) talks about "shared traits ... from the yaxpeax project". it's worth saying explicitly what the thing is or isn't. + +my thesis is that most programs are not inherently more difficult to work with (e.g. read, write, modify) as machine code than as the source code they were compiled from. where machine code seems dense, this is a consequence of decades of neglect and missing tooling. aspirationally, "yaxpeax" is what i think could support high-quality tooling for that category of problem. + +realistically, "yaxpeax" is a [pile of disassemblers](https://git.iximeow.net/yaxpeax-arch/about/#implementations) and their partial integration into a [library for control-flow and data-flow analysis](https://github.com/iximeow/yaxpeax-core). + +even this, it seems, is enough to have a twinkle of promise!! + +* [Довер´яй, но провер´яй: SFI safety for native-compiled Wasm](https://www.ndss-symposium.org/wp-content/uploads/ndss2021_5B-3_24078_paper.pdf) was a paper accompanying [VeriWasm](https://github.com/PLSysSec/veriwasm), a tool to verify the isolation properties of native-compiled WebAssembly modules by analyzing produced machine code as a black box. 
this requires both disassembling the native-compiled code and reasoning about the relationships of values - addresses and otherwise. [yaxpeax-core provided enough control flow analysis they could (mostly) directly use it](https://github.com/PLSysSec/veriwasm/blob/b70e92b/src/ir/x64.rs#L1029-L1044). + +... though that's the best and only example of code analysis being as useful as i'd hope, so far. this is why "yaxpeax" as a project is fuzzy, and i primarily talk about it as a pile of disassemblers; those are neatly-scoped with a simple enough interface, and *are* reusable. + +* [for disassembly](https://github.com/mstange/linux-perf-data/blob/0bf074f/examples/jitdumpdump.rs#L82-L206) in an example of annotating perf data +* [for disassembly](https://github.com/rust-minidump/rust-minidump/blob/019c848/minidump-processor/src/op_analysis.rs#L137-L445) in analyzing minidump crash files' code for hints of crash causes + +* [for disassembly](https://github.com/novafacing/tsffs/blob/10697a078ab5510127bd7c067760e2f8cb593675/tsffs_module/src/processor/mod.rs#L743-L751) in part of a coverage-guided fuzzer + +and in some uses of my own - of course i find nails for my hammer: +* [yaxpeax-dis](https://github.com/iximeow/yaxpeax-dis), as a "please try to disassemble this" tool, including all\* yaxpeax-supporting disassemblers + \* "all" meaning best effort :) +* [dis.yaxpeax.net](https://dis.yaxpeax.net), same as the above, but as a website +* [yaxpeax-eval](https://github.com/iximeow/yaxpeax-eval), for disassembly and debug viewing of executed machine code +* [yaxpeax-demo](https://github.com/iximeow/yaxdemo/blob/no-gods-no-/src/main.rs), a one-off example of disassembly and code analysis with `yaxpeax-core`. 
+* [zvm](https://github.com/iximeow/zvm), for disassembly of jitted JVM bytecode + +but the real place i hope to find yaxpeax one day is to be used for analysis tasks like + +![register numbering](register_numbering.png)\ + +constructing an SSA-style representation of machine code, in turn letting me (or you!!!) get value analyses, + +![range inference](range_inference.png)\ + +anyway, between Then and Now.. Ghidra has become an entire *thing*. Binary Ninja still exists and continues improving. maybe yaxpeax ends up just a pile of neat disassemblers and toy demos in my (ha ha) spare time. -- cgit v1.1