From 507a1c14b335a273304070289cb35a4bef7d1de3 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sat, 16 Mar 2024 12:09:15 -0700 Subject: multithread differential disassembly and support pc-relative operands, remove a few more exceptions --- differential-tests/tests/capstone-differential.rs | 558 ++++++++++++---------- 1 file changed, 297 insertions(+), 261 deletions(-) diff --git a/differential-tests/tests/capstone-differential.rs b/differential-tests/tests/capstone-differential.rs index 8457bce..cb97fef 100644 --- a/differential-tests/tests/capstone-differential.rs +++ b/differential-tests/tests/capstone-differential.rs @@ -4,17 +4,21 @@ use capstone::prelude::*; use yaxpeax_arch::{Arch, Decoder}; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::num::ParseIntError; #[derive(Debug)] enum ParsedOperand { Register { size: char, num: u8 }, Memory(String), + MemoryWithOffset { base: String, offset: u32, writeback: bool }, SIMDRegister { size: char, num: u8 }, // SIMDRegisterElements { num: u8, elems: u8, elem_size: char }, // SIMDRegisterElement { num: u8, elem_size: char, elem: u8 }, SIMDElementLane { elem: String, lane_selector: u8 }, Immediate(i64), + PCRel(i64), Float(f64), Other(String), RegisterFamily(String), @@ -31,9 +35,28 @@ impl PartialEq for ParsedOperand { (Memory(l), Memory(r)) => { l == r }, + ( + MemoryWithOffset { base: base_l, offset: offset_l, writeback: writeback_l }, + MemoryWithOffset { base: base_r, offset: offset_r, writeback: writeback_r }, + ) => { + base_l == base_r && + offset_l == offset_r && + writeback_l == writeback_r + }, (Immediate(l), Immediate(r)) => { l == r }, + (PCRel(l), PCRel(r)) => { + l == r + }, + (Immediate(l), PCRel(r)) => { + // assume pc=0 as capstone does by default + *l == 0 + r + }, + (PCRel(l), Immediate(r)) => { + // assume pc=0 as capstone does by default + 0 + l == *r + }, (Float(l), Float(r)) => { l.to_ne_bytes() == r.to_ne_bytes() }, @@ -121,6 +144,16 @@ impl ParsedOperand { let imm = 
parse_hex_or_dec(imm_str); (ParsedOperand::Immediate(imm), end) } + } else if s.as_bytes()[0] == b'$' { + let end = s.find(',').unwrap_or(s.len()); + let imm_str = &s[1..end]; + let imm_str = if imm_str.starts_with("+") { + &imm_str[1..] + } else { + imm_str + }; + let imm = parse_hex_or_dec(imm_str); + (ParsedOperand::PCRel(imm), end) } else if s.as_bytes()[0] == b'[' { let mut end = s.find(']').map(|x| x + 1).unwrap_or(s.len()); if s.as_bytes().get(end) == Some(&b'!') { @@ -244,324 +277,327 @@ impl ParsedDisassembly { #[test] fn capstone_differential() { - let cs = Capstone::new() - .arm64() - .mode(capstone::arch::arm64::ArchMode::Arm) - .build() - .expect("can create capstone"); - - let yax = <yaxpeax_arm::armv8::a64::ARMv8 as Arch>::Decoder::default(); - - let mut mismatch = 0; - let mut good = 0; - let mut yax_reject = 0; - let mut missed_incomplete = 0; - - for i in 0x00_00_00_00..u32::MAX { - let bytes = &i.to_le_bytes(); - if i % 0x00_10_00_00 == 0 { - eprintln!("case {:08x}", i); - } + struct Stats { + mismatch: AtomicUsize, + good: AtomicUsize, + yax_reject: AtomicUsize, + missed_incomplete: AtomicUsize, + } + let mut yax_reject = AtomicUsize::new(0); + let mut missed_incomplete = AtomicUsize::new(0); + + let stats = Stats { + mismatch: AtomicUsize::new(0), + good: AtomicUsize::new(0), + yax_reject: AtomicUsize::new(0), + missed_incomplete: AtomicUsize::new(0), + }; + + fn test_range(start: u64, end: u64, stats: Arc<Stats>) { + let cs = Capstone::new() + .arm64() + .mode(capstone::arch::arm64::ArchMode::Arm) + .build() + .expect("can create capstone"); + + let yax = <yaxpeax_arm::armv8::a64::ARMv8 as Arch>::Decoder::default(); + + for i in start..=end { + let i = i as u32; + let bytes = &i.to_le_bytes(); + if i % 0x00_10_00_00 == 0 { + eprintln!("case {:08x}", i); + } - let res = cs.disasm_all(bytes, 0); - if let Ok(insts) = &res { - let insts_slice = insts.as_ref(); - if insts_slice.len() == 1 { - // then yax should also succeed.. 
- // and it should only be one instruction - let cs_text = format!("{}", insts_slice[0]); - let cs_text = &cs_text[5..]; - - let yax_res = yax.decode(&mut yaxpeax_arch::U8Reader::new(bytes)); - let yax_text = if let Ok(inst) = yax_res { - format!("{}", inst) - } else if let Err(yaxpeax_arm::armv8::a64::DecodeError::IncompleteDecoder) = yax_res { - missed_incomplete += 1; - continue; - } else { - panic!("yax errored where capstone succeeded. cs text: '{}', bytes: {:x?}", cs_text, bytes); - }; + let res = cs.disasm_all(bytes, 0); + if let Ok(insts) = &res { + let insts_slice = insts.as_ref(); + if insts_slice.len() == 1 { + // then yax should also succeed.. + // and it should only be one instruction + let cs_text = format!("{}", insts_slice[0]); + let cs_text = &cs_text[5..]; + + let yax_res = yax.decode(&mut yaxpeax_arch::U8Reader::new(bytes)); + let yax_text = if let Ok(inst) = yax_res { + format!("{}", inst) + } else if let Err(yaxpeax_arm::armv8::a64::DecodeError::IncompleteDecoder) = yax_res { + stats.missed_incomplete.fetch_add(1, Ordering::SeqCst); + continue; + } else { + panic!("yax errored where capstone succeeded. 
cs text: '{}', bytes: {:x?}", cs_text, bytes); + }; - fn acceptable_match(yax_text: &str, cs_text: &str) -> bool { - if yax_text == cs_text { - return true; - } + fn acceptable_match(yax_text: &str, cs_text: &str) -> bool { + if yax_text == cs_text { + return true; + } - let parsed_yax = ParsedDisassembly::parse(yax_text); - let parsed_cs = ParsedDisassembly::parse(cs_text); + let parsed_yax = ParsedDisassembly::parse(yax_text); + let parsed_cs = ParsedDisassembly::parse(cs_text); - if parsed_yax == parsed_cs { - return true; - } - -// eprintln!("yax: {} -> {:?}", yax_text, parsed_yax); -// eprintln!("cs: {} -> {:?}", cs_text, parsed_cs); + if parsed_yax == parsed_cs { + return true; + } - if cs_text - .replace("uxtw #0", "uxtw") - .replace("uxtx #0", "uxtx") == yax_text { + if false { + eprintln!("yax: {} -> {:?}", yax_text, parsed_yax); + eprintln!("cs: {} -> {:?}", cs_text, parsed_cs); + } - return true; - } + if cs_text + .replace("uxtw #0", "uxtw") + .replace("uxtx #0", "uxtx") == yax_text { - // capstone discards uxtw in some circumstances for reasons i don't yet - // know - if let Some(yax_text) = yax_text.strip_suffix(", uxtw") { - if yax_text == cs_text { return true; } - } - if let Some(cs_text) = cs_text.strip_suffix(", uxtw") { - if yax_text == cs_text { - return true; + + // capstone discards uxtw in some circumstances for reasons i don't yet + // know + if let Some(yax_text) = yax_text.strip_suffix(", uxtw") { + if yax_text == cs_text { + return true; + } + } + if let Some(cs_text) = cs_text.strip_suffix(", uxtw") { + if yax_text == cs_text { + return true; + } } - } - if yax_text.replace("lsl", "uxtw") == cs_text { - return true; - } - if let Some(yax_text) = yax_text.strip_suffix(" #0") { - if yax_text == cs_text { + if yax_text.replace("lsl", "uxtw") == cs_text { return true; } - } - if let Some(cs_text) = cs_text.strip_suffix(" #0") { - if yax_text == cs_text { + + if cs_text.starts_with("ubfx ") { return true; } - } - // TODO: what kind of cases 
is this for? - if cs_text.starts_with(yax_text) && cs_text.ends_with("000") { - return true; - }; - if cs_text.starts_with("ubfx ") { - return true; - } + if yax_text.starts_with("adrp ") { + return true; + } - if yax_text.starts_with("adrp ") { - return true; - } + if yax_text.starts_with("adr ") { + return true; + } - if yax_text.starts_with("adr ") { - return true; - } + // some instructions like `11400000` have an immeidate lsl #12 as their + // last operand. yax normalizes this to an unshifted `imm << 12`, capstone + // just prints lsl #12. + if cs_text.starts_with(yax_text) && cs_text.ends_with(", lsl #12") { + return true; + } - if yax_text.starts_with("b ") { - return true; - } + // yax and capstone deal with immediates in `mov reg, imm` a little + // differently. they're correct, but displayed differently (0xffffffff + // instead of -1) + if cs_text.starts_with("mov ") && yax_text.starts_with("mov ") { + return true; + } - if yax_text.starts_with("bl ") { - return true; - } + // capstone just shows empty string for unrecognized prf{,u}m immediates, + // leaving broken text + if cs_text.starts_with("prfum ") && yax_text.starts_with("prfum ") { + return true; + } + if cs_text.starts_with("prfm ") && yax_text.starts_with("prfm ") { + return true; + } - // some instructions like `11400000` have an immeidate lsl #12 as their - // last operand. yax normalizes this to an unshifted `imm << 12`, capstone - // just prints lsl #12. - if cs_text.starts_with(yax_text) && cs_text.ends_with(", lsl #12") { - return true; - } + // don't totally understand aliasing rules for `ORR (immediate)` and mov.. + if cs_text.starts_with("mov ") && yax_text.starts_with("orr ") || + cs_text.starts_with("orr ") && yax_text.starts_with("mov ") + { + return true; + } - // yax and capstone deal with immediates in `mov reg, imm` a little - // differently. 
they're correct, but displayed differently (0xffffffff - // instead of -1) - if cs_text.starts_with("mov ") && yax_text.starts_with("mov ") { - return true; - } + // yax notmalizes movn to mov + if cs_text.starts_with("movn ") && yax_text.starts_with("mov ") { + return true; + } - // capstone just shows empty string for unrecognized prf{,u}m immediates, - // leaving broken text - if cs_text.starts_with("prfum ") && yax_text.starts_with("prfum ") { - return true; - } - if cs_text.starts_with("prfm ") && yax_text.starts_with("prfm ") { - return true; - } + // yax notmalizes movz to mov + if cs_text.starts_with("movz ") && yax_text.starts_with("mov ") { + return true; + } - // don't totally understand aliasing rules for `ORR (immediate)` and mov.. - if cs_text.starts_with("mov ") && yax_text.starts_with("orr ") || - cs_text.starts_with("orr ") && yax_text.starts_with("mov ") - { - return true; - } + // differences on displaying immediates.. + let new_cs_text = cs_text + .replace("#0x", "") + .replace("#-0x", "") + .replace("#-", "") + .replace("#", ""); + let new_yax_text = yax_text + .replace("#0x", "") + .replace("#-0x", "") + .replace("#-", "") + .replace("#", "") + .replace("$+0x", ""); + if new_cs_text == new_yax_text { + return true; + } - // yax notmalizes movn to mov - if cs_text.starts_with("movn ") && yax_text.starts_with("mov ") { - return true; - } + if cs_text.len() > 7 && yax_text.len() > 7 { + if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("#-") || yax_text.contains("#-")) { + return true; + } + if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("shll") || yax_text.contains("shll")) { + return true; + } + } - // yax notmalizes movz to mov - if cs_text.starts_with("movz ") && yax_text.starts_with("mov ") { - return true; - } + if parsed_yax.opcode == "mov" && parsed_cs.opcode == "dup" { + if parsed_yax.operands == parsed_cs.operands { + return true; + } + } + // if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && 
cs_text.replace("dup ", "mov ") == yax_text { + // return true; + // } + // capstone bug! e0030033 is `bfxil w0, wzr, #0, #1`, but capstone picks + // the bfc alias instead. skip these, generally. + if yax_text.starts_with("bfxil") && (cs_text.starts_with("bfc") || cs_text.starts_with("bfi")) { + return true; + } - // differences on displaying immediates.. - let new_cs_text = cs_text - .replace("#0x", "") - .replace("#-0x", "") - .replace("#-", "") - .replace("#", ""); - let new_yax_text = yax_text - .replace("#0x", "") - .replace("#-0x", "") - .replace("#-", "") - .replace("#", "") - .replace("$+0x", ""); - if new_cs_text == new_yax_text { - return true; - } + if cs_text.len() > 10 && yax_text.len() > 10 { + // eh they're probably the same but yax has a signed hex and capstone has + // unsigned + if &cs_text[..10] == &yax_text[..10] && cs_text.contains("ffffffff") && yax_text.contains("#-0x") { + return true; + } + // yax, for reg + shifted-reg operands, does not omit shift amount + if &cs_text[..10] == &yax_text[..10] && yax_text.contains(" #0x0]") { + return true; + } + + // postindex offsets are base 10 in capstone sometimes? 
+ if yax_text.contains("], #0x") && cs_text.contains("], #") && + &cs_text[..20] == &yax_text[..20] { + return true; + } + } - if cs_text.len() > 7 && yax_text.len() > 7 { - if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("#-") || yax_text.contains("#-")) { + // yax omits `uxt{w,x}` for extended reg where extension matches the + // register size + if cs_text.starts_with(yax_text) && (cs_text.ends_with("uxtx") || cs_text.ends_with("uxtw")) { return true; } - if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("shll") || yax_text.contains("shll")) { + + if cs_text.starts_with(yax_text) && cs_text.ends_with("0") { return true; } - } - // capstone doesn't show relative offsets, always makes absolute for some - // ip - if yax_text.contains("$-0x") || yax_text.contains("$+0x") { - return true; - } - - if yax_text.contains("esb") { - return true; - } - if yax_text.contains("movi") { - return true; - } + // S being present or not has no bearing on the shift amount, #0 either + // way. + // yax will not print shift because of its ineffectual nature. + if (cs_text.starts_with("strb") || cs_text.starts_with("ldrb") || cs_text.starts_with("ldrsb") || cs_text.starts_with("ldr b") || cs_text.starts_with("str b")) && cs_text.contains(" lsl #0]") { + return true; + } - if parsed_yax.opcode == "mov" && parsed_cs.opcode == "dup" { - if parsed_yax.operands == parsed_cs.operands { + if cs_text == yax_text.replace(" #0", "") { return true; } - } -// if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && cs_text.replace("dup ", "mov ") == yax_text { -// return true; -// } - // capstone bug! e0030033 is `bfxil w0, wzr, #0, #1`, but capstone picks - // the bfc alias instead. skip these, generally. 
- if yax_text.starts_with("bfxil") && (cs_text.starts_with("bfc") || cs_text.starts_with("bfi")) { - return true; - } - if cs_text.len() > 10 && yax_text.len() > 10 { - // eh they're probably the same but yax has a signed hex and capstone has - // unsigned - if &cs_text[..10] == &yax_text[..10] && cs_text.contains("ffffffff") && yax_text.contains("#-0x") { + // yax uses lsl instead of uxtx when the reg size is uxtx. same for + // uxtw/w-regs + if cs_text.replace("uxtx", "lsl") == yax_text || + cs_text.replace("uxtw", "lsl") == yax_text { return true; } - // yax, for reg + shifted-reg operands, does not omit shift amount - if &cs_text[..10] == &yax_text[..10] && yax_text.contains(" #0x0]") { + + // yax shows dcps{1,2} operand, capstone does not? + if yax_text.starts_with("dcps") { return true; } - // postindex offsets are base 10 in capstone sometimes? - if yax_text.contains("], #0x") && cs_text.contains("], #") && - &cs_text[..20] == &yax_text[..20] { + if cs_text.starts_with("msr ") { return true; } - } - // yax omits `uxt{w,x}` for extended reg where extension matches the - // register size - if cs_text.starts_with(yax_text) && (cs_text.ends_with("uxtx") || cs_text.ends_with("uxtw")) { - return true; - } + // yax does not handle aliases for msr instructions yet + if yax_text.starts_with("msr ") { + return true; + } - if cs_text.starts_with(yax_text) && cs_text.ends_with("0") { - return true; - } + // some kinda bug to deal with hint value width + if cs_text.starts_with("hint ") { + return true; + } + if cs_text.starts_with("dsb ") { + return true; + } + if cs_text.starts_with("clrex ") { + return true; + } + if yax_text.starts_with("sys ") { + return true; + } + if cs_text.starts_with("yield ") { + return true; + } + if cs_text.starts_with("wfe ") { + return true; + } + if cs_text.starts_with("wfi ") { + return true; + } + if cs_text.starts_with("sev ") { + return true; + } + if cs_text.starts_with("mrs ") { + return true; + } + if cs_text.starts_with("sysl ") { 
+ return true; + } + if yax_text.starts_with("hint ") { + return true; + } - // S being present or not has no bearing on the shift amount, #0 either - // way. - // yax will not print shift because of its ineffectual nature. - if (cs_text.starts_with("strb") || cs_text.starts_with("ldrb") || cs_text.starts_with("ldrsb") || cs_text.starts_with("ldr b") || cs_text.starts_with("str b")) && cs_text.contains(" lsl #0]") { - return true; - } + if yax_text == &cs_text[..cs_text.len() - 1] && cs_text.ends_with(" ") { + return true; + } - if cs_text == yax_text.replace(" #0", "") { - return true; + return false; } - // yax uses lsl instead of uxtx when the reg size is uxtx. same for - // uxtw/w-regs - if cs_text.replace("uxtx", "lsl") == yax_text || - cs_text.replace("uxtw", "lsl") == yax_text { - return true; + // eprintln!("{}", yax_text); + if !acceptable_match(&yax_text, cs_text) { + eprintln!("disassembly mismatch: {} != {}. bytes: {:x?}", yax_text, cs_text, bytes); + std::process::abort(); + } else { + stats.good.fetch_add(1, Ordering::SeqCst); } + } else { + // yax should also fail? + } + } + } + } - // yax shows dcps{1,2} operand, capstone does not? 
- if yax_text.starts_with("dcps") { - return true; - } + const NR_THREADS: u64 = 64; - if cs_text.starts_with("msr ") { - return true; - } + let range_size = (u32::MAX as u64 + 1) / NR_THREADS; - // yax does not handle aliases for msr instructions yet - if yax_text.starts_with("msr ") { - return true; - } + let mut handles = Vec::new(); - // some kinda bug to deal with hint value width - if cs_text.starts_with("hint ") { - return true; - } - if cs_text.starts_with("dsb ") { - return true; - } - if cs_text.starts_with("clrex ") { - return true; - } - if yax_text.starts_with("sys ") { - return true; - } - if cs_text.starts_with("yield ") { - return true; - } - if cs_text.starts_with("wfe ") { - return true; - } - if cs_text.starts_with("wfi ") { - return true; - } - if cs_text.starts_with("sev ") { - return true; - } - if cs_text.starts_with("mrs ") { - return true; - } - if cs_text.starts_with("sysl ") { - return true; - } - if yax_text.starts_with("hint ") { - return true; - } + let stats = Arc::new(stats); - if yax_text == &cs_text[..cs_text.len() - 1] && cs_text.ends_with(" ") { - return true; - } + test_range(0x54_80_00_00, 0x54_80_00_10, Arc::clone(&stats)); - return false; - } + for i in 0..NR_THREADS { + let stats = Arc::clone(&stats); + let handle = std::thread::spawn(move || test_range(i * range_size, i * range_size + range_size, stats)); + handles.push(handle); + } -// eprintln!("{}", yax_text); - if !acceptable_match(&yax_text, cs_text) { - panic!("disassembly mismatch: {} != {}. bytes: {:x?}", yax_text, cs_text, bytes); - } else { - good += 1; - } - } else { - // yax should also fail? 
- } - } + while let Some(handle) = handles.pop() { + handle.join().unwrap(); } - eprintln!("match: {}", good); - eprintln!("mismatch: {}", mismatch); - eprintln!("bad reject: {}", yax_reject); - eprintln!("incomplete: {}", missed_incomplete); + + eprintln!("match: {}", stats.good.load(Ordering::SeqCst)); + eprintln!("mismatch: {}", stats.mismatch.load(Ordering::SeqCst)); + eprintln!("bad reject: {}", stats.yax_reject.load(Ordering::SeqCst)); + eprintln!("incomplete: {}", stats.missed_incomplete.load(Ordering::SeqCst)); } -- cgit v1.1