aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoriximeow <me@iximeow.net>2024-03-16 12:09:15 -0700
committeriximeow <me@iximeow.net>2024-03-16 12:09:15 -0700
commit507a1c14b335a273304070289cb35a4bef7d1de3 (patch)
tree72b46bd1647d978046e2f0df98497c02f8794dda
parent97c724d483c309b95cba75dae3445b069e8b7915 (diff)
multithread differential disassembly and support pc-relative operands, remove a few more exceptions
-rw-r--r--differential-tests/tests/capstone-differential.rs558
1 files changed, 297 insertions, 261 deletions
diff --git a/differential-tests/tests/capstone-differential.rs b/differential-tests/tests/capstone-differential.rs
index 8457bce..cb97fef 100644
--- a/differential-tests/tests/capstone-differential.rs
+++ b/differential-tests/tests/capstone-differential.rs
@@ -4,17 +4,21 @@
use capstone::prelude::*;
use yaxpeax_arch::{Arch, Decoder};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
use std::num::ParseIntError;
#[derive(Debug)]
enum ParsedOperand {
Register { size: char, num: u8 },
Memory(String),
+ MemoryWithOffset { base: String, offset: u32, writeback: bool },
SIMDRegister { size: char, num: u8 },
// SIMDRegisterElements { num: u8, elems: u8, elem_size: char },
// SIMDRegisterElement { num: u8, elem_size: char, elem: u8 },
SIMDElementLane { elem: String, lane_selector: u8 },
Immediate(i64),
+ PCRel(i64),
Float(f64),
Other(String),
RegisterFamily(String),
@@ -31,9 +35,28 @@ impl PartialEq for ParsedOperand {
(Memory(l), Memory(r)) => {
l == r
},
+ (
+ MemoryWithOffset { base: base_l, offset: offset_l, writeback: writeback_l },
+ MemoryWithOffset { base: base_r, offset: offset_r, writeback: writeback_r },
+ ) => {
+ base_l == base_r &&
+ offset_l == offset_r &&
+ writeback_l == writeback_r
+ },
(Immediate(l), Immediate(r)) => {
l == r
},
+ (PCRel(l), PCRel(r)) => {
+ l == r
+ },
+ (Immediate(l), PCRel(r)) => {
+ // assume pc=0 as capstone does by default
+ *l == 0 + r
+ },
+ (PCRel(l), Immediate(r)) => {
+ // assume pc=0 as capstone does by default
+ 0 + l == *r
+ },
(Float(l), Float(r)) => {
l.to_ne_bytes() == r.to_ne_bytes()
},
@@ -121,6 +144,16 @@ impl ParsedOperand {
let imm = parse_hex_or_dec(imm_str);
(ParsedOperand::Immediate(imm), end)
}
+ } else if s.as_bytes()[0] == b'$' {
+ let end = s.find(',').unwrap_or(s.len());
+ let imm_str = &s[1..end];
+ let imm_str = if imm_str.starts_with("+") {
+ &imm_str[1..]
+ } else {
+ imm_str
+ };
+ let imm = parse_hex_or_dec(imm_str);
+ (ParsedOperand::PCRel(imm), end)
} else if s.as_bytes()[0] == b'[' {
let mut end = s.find(']').map(|x| x + 1).unwrap_or(s.len());
if s.as_bytes().get(end) == Some(&b'!') {
@@ -244,324 +277,327 @@ impl ParsedDisassembly {
#[test]
fn capstone_differential() {
- let cs = Capstone::new()
- .arm64()
- .mode(capstone::arch::arm64::ArchMode::Arm)
- .build()
- .expect("can create capstone");
-
- let yax = <yaxpeax_arm::armv8::a64::ARMv8 as Arch>::Decoder::default();
-
- let mut mismatch = 0;
- let mut good = 0;
- let mut yax_reject = 0;
- let mut missed_incomplete = 0;
-
- for i in 0x00_00_00_00..u32::MAX {
- let bytes = &i.to_le_bytes();
- if i % 0x00_10_00_00 == 0 {
- eprintln!("case {:08x}", i);
- }
+ struct Stats {
+ mismatch: AtomicUsize,
+ good: AtomicUsize,
+ yax_reject: AtomicUsize,
+ missed_incomplete: AtomicUsize,
+ }
+ let mut yax_reject = AtomicUsize::new(0);
+ let mut missed_incomplete = AtomicUsize::new(0);
+
+ let stats = Stats {
+ mismatch: AtomicUsize::new(0),
+ good: AtomicUsize::new(0),
+ yax_reject: AtomicUsize::new(0),
+ missed_incomplete: AtomicUsize::new(0),
+ };
+
+ fn test_range(start: u64, end: u64, stats: Arc<Stats>) {
+ let cs = Capstone::new()
+ .arm64()
+ .mode(capstone::arch::arm64::ArchMode::Arm)
+ .build()
+ .expect("can create capstone");
+
+ let yax = <yaxpeax_arm::armv8::a64::ARMv8 as Arch>::Decoder::default();
+
+ for i in start..=end {
+ let i = i as u32;
+ let bytes = &i.to_le_bytes();
+ if i % 0x00_10_00_00 == 0 {
+ eprintln!("case {:08x}", i);
+ }
- let res = cs.disasm_all(bytes, 0);
- if let Ok(insts) = &res {
- let insts_slice = insts.as_ref();
- if insts_slice.len() == 1 {
- // then yax should also succeed..
- // and it should only be one instruction
- let cs_text = format!("{}", insts_slice[0]);
- let cs_text = &cs_text[5..];
-
- let yax_res = yax.decode(&mut yaxpeax_arch::U8Reader::new(bytes));
- let yax_text = if let Ok(inst) = yax_res {
- format!("{}", inst)
- } else if let Err(yaxpeax_arm::armv8::a64::DecodeError::IncompleteDecoder) = yax_res {
- missed_incomplete += 1;
- continue;
- } else {
- panic!("yax errored where capstone succeeded. cs text: '{}', bytes: {:x?}", cs_text, bytes);
- };
+ let res = cs.disasm_all(bytes, 0);
+ if let Ok(insts) = &res {
+ let insts_slice = insts.as_ref();
+ if insts_slice.len() == 1 {
+ // then yax should also succeed..
+ // and it should only be one instruction
+ let cs_text = format!("{}", insts_slice[0]);
+ let cs_text = &cs_text[5..];
+
+ let yax_res = yax.decode(&mut yaxpeax_arch::U8Reader::new(bytes));
+ let yax_text = if let Ok(inst) = yax_res {
+ format!("{}", inst)
+ } else if let Err(yaxpeax_arm::armv8::a64::DecodeError::IncompleteDecoder) = yax_res {
+ stats.missed_incomplete.fetch_add(1, Ordering::SeqCst);
+ continue;
+ } else {
+ panic!("yax errored where capstone succeeded. cs text: '{}', bytes: {:x?}", cs_text, bytes);
+ };
- fn acceptable_match(yax_text: &str, cs_text: &str) -> bool {
- if yax_text == cs_text {
- return true;
- }
+ fn acceptable_match(yax_text: &str, cs_text: &str) -> bool {
+ if yax_text == cs_text {
+ return true;
+ }
- let parsed_yax = ParsedDisassembly::parse(yax_text);
- let parsed_cs = ParsedDisassembly::parse(cs_text);
+ let parsed_yax = ParsedDisassembly::parse(yax_text);
+ let parsed_cs = ParsedDisassembly::parse(cs_text);
- if parsed_yax == parsed_cs {
- return true;
- }
-
-// eprintln!("yax: {} -> {:?}", yax_text, parsed_yax);
-// eprintln!("cs: {} -> {:?}", cs_text, parsed_cs);
+ if parsed_yax == parsed_cs {
+ return true;
+ }
- if cs_text
- .replace("uxtw #0", "uxtw")
- .replace("uxtx #0", "uxtx") == yax_text {
+ if false {
+ eprintln!("yax: {} -> {:?}", yax_text, parsed_yax);
+ eprintln!("cs: {} -> {:?}", cs_text, parsed_cs);
+ }
- return true;
- }
+ if cs_text
+ .replace("uxtw #0", "uxtw")
+ .replace("uxtx #0", "uxtx") == yax_text {
- // capstone discards uxtw in some circumstances for reasons i don't yet
- // know
- if let Some(yax_text) = yax_text.strip_suffix(", uxtw") {
- if yax_text == cs_text {
return true;
}
- }
- if let Some(cs_text) = cs_text.strip_suffix(", uxtw") {
- if yax_text == cs_text {
- return true;
+
+ // capstone discards uxtw in some circumstances for reasons i don't yet
+ // know
+ if let Some(yax_text) = yax_text.strip_suffix(", uxtw") {
+ if yax_text == cs_text {
+ return true;
+ }
+ }
+ if let Some(cs_text) = cs_text.strip_suffix(", uxtw") {
+ if yax_text == cs_text {
+ return true;
+ }
}
- }
- if yax_text.replace("lsl", "uxtw") == cs_text {
- return true;
- }
- if let Some(yax_text) = yax_text.strip_suffix(" #0") {
- if yax_text == cs_text {
+ if yax_text.replace("lsl", "uxtw") == cs_text {
return true;
}
- }
- if let Some(cs_text) = cs_text.strip_suffix(" #0") {
- if yax_text == cs_text {
+
+ if cs_text.starts_with("ubfx ") {
return true;
}
- }
- // TODO: what kind of cases is this for?
- if cs_text.starts_with(yax_text) && cs_text.ends_with("000") {
- return true;
- };
- if cs_text.starts_with("ubfx ") {
- return true;
- }
+ if yax_text.starts_with("adrp ") {
+ return true;
+ }
- if yax_text.starts_with("adrp ") {
- return true;
- }
+ if yax_text.starts_with("adr ") {
+ return true;
+ }
- if yax_text.starts_with("adr ") {
- return true;
- }
+ // some instructions like `11400000` have an immediate lsl #12 as their
+ // last operand. yax normalizes this to an unshifted `imm << 12`, capstone
+ // just prints lsl #12.
+ if cs_text.starts_with(yax_text) && cs_text.ends_with(", lsl #12") {
+ return true;
+ }
- if yax_text.starts_with("b ") {
- return true;
- }
+ // yax and capstone deal with immediates in `mov reg, imm` a little
+ // differently. they're correct, but displayed differently (0xffffffff
+ // instead of -1)
+ if cs_text.starts_with("mov ") && yax_text.starts_with("mov ") {
+ return true;
+ }
- if yax_text.starts_with("bl ") {
- return true;
- }
+ // capstone just shows empty string for unrecognized prf{,u}m immediates,
+ // leaving broken text
+ if cs_text.starts_with("prfum ") && yax_text.starts_with("prfum ") {
+ return true;
+ }
+ if cs_text.starts_with("prfm ") && yax_text.starts_with("prfm ") {
+ return true;
+ }
- // some instructions like `11400000` have an immeidate lsl #12 as their
- // last operand. yax normalizes this to an unshifted `imm << 12`, capstone
- // just prints lsl #12.
- if cs_text.starts_with(yax_text) && cs_text.ends_with(", lsl #12") {
- return true;
- }
+ // don't totally understand aliasing rules for `ORR (immediate)` and mov..
+ if cs_text.starts_with("mov ") && yax_text.starts_with("orr ") ||
+ cs_text.starts_with("orr ") && yax_text.starts_with("mov ")
+ {
+ return true;
+ }
- // yax and capstone deal with immediates in `mov reg, imm` a little
- // differently. they're correct, but displayed differently (0xffffffff
- // instead of -1)
- if cs_text.starts_with("mov ") && yax_text.starts_with("mov ") {
- return true;
- }
+ // yax normalizes movn to mov
+ if cs_text.starts_with("movn ") && yax_text.starts_with("mov ") {
+ return true;
+ }
- // capstone just shows empty string for unrecognized prf{,u}m immediates,
- // leaving broken text
- if cs_text.starts_with("prfum ") && yax_text.starts_with("prfum ") {
- return true;
- }
- if cs_text.starts_with("prfm ") && yax_text.starts_with("prfm ") {
- return true;
- }
+ // yax normalizes movz to mov
+ if cs_text.starts_with("movz ") && yax_text.starts_with("mov ") {
+ return true;
+ }
- // don't totally understand aliasing rules for `ORR (immediate)` and mov..
- if cs_text.starts_with("mov ") && yax_text.starts_with("orr ") ||
- cs_text.starts_with("orr ") && yax_text.starts_with("mov ")
- {
- return true;
- }
+ // differences on displaying immediates..
+ let new_cs_text = cs_text
+ .replace("#0x", "")
+ .replace("#-0x", "")
+ .replace("#-", "")
+ .replace("#", "");
+ let new_yax_text = yax_text
+ .replace("#0x", "")
+ .replace("#-0x", "")
+ .replace("#-", "")
+ .replace("#", "")
+ .replace("$+0x", "");
+ if new_cs_text == new_yax_text {
+ return true;
+ }
- // yax notmalizes movn to mov
- if cs_text.starts_with("movn ") && yax_text.starts_with("mov ") {
- return true;
- }
+ if cs_text.len() > 7 && yax_text.len() > 7 {
+ if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("#-") || yax_text.contains("#-")) {
+ return true;
+ }
+ if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("shll") || yax_text.contains("shll")) {
+ return true;
+ }
+ }
- // yax notmalizes movz to mov
- if cs_text.starts_with("movz ") && yax_text.starts_with("mov ") {
- return true;
- }
+ if parsed_yax.opcode == "mov" && parsed_cs.opcode == "dup" {
+ if parsed_yax.operands == parsed_cs.operands {
+ return true;
+ }
+ }
+ // if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && cs_text.replace("dup ", "mov ") == yax_text {
+ // return true;
+ // }
+ // capstone bug! e0030033 is `bfxil w0, wzr, #0, #1`, but capstone picks
+ // the bfc alias instead. skip these, generally.
+ if yax_text.starts_with("bfxil") && (cs_text.starts_with("bfc") || cs_text.starts_with("bfi")) {
+ return true;
+ }
- // differences on displaying immediates..
- let new_cs_text = cs_text
- .replace("#0x", "")
- .replace("#-0x", "")
- .replace("#-", "")
- .replace("#", "");
- let new_yax_text = yax_text
- .replace("#0x", "")
- .replace("#-0x", "")
- .replace("#-", "")
- .replace("#", "")
- .replace("$+0x", "");
- if new_cs_text == new_yax_text {
- return true;
- }
+ if cs_text.len() > 10 && yax_text.len() > 10 {
+ // eh they're probably the same but yax has a signed hex and capstone has
+ // unsigned
+ if &cs_text[..10] == &yax_text[..10] && cs_text.contains("ffffffff") && yax_text.contains("#-0x") {
+ return true;
+ }
+ // yax, for reg + shifted-reg operands, does not omit shift amount
+ if &cs_text[..10] == &yax_text[..10] && yax_text.contains(" #0x0]") {
+ return true;
+ }
+
+ // postindex offsets are base 10 in capstone sometimes?
+ if yax_text.contains("], #0x") && cs_text.contains("], #") &&
+ &cs_text[..20] == &yax_text[..20] {
+ return true;
+ }
+ }
- if cs_text.len() > 7 && yax_text.len() > 7 {
- if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("#-") || yax_text.contains("#-")) {
+ // yax omits `uxt{w,x}` for extended reg where extension matches the
+ // register size
+ if cs_text.starts_with(yax_text) && (cs_text.ends_with("uxtx") || cs_text.ends_with("uxtw")) {
return true;
}
- if &cs_text[..7] == &yax_text[..7] && (cs_text.contains("shll") || yax_text.contains("shll")) {
+
+ if cs_text.starts_with(yax_text) && cs_text.ends_with("0") {
return true;
}
- }
- // capstone doesn't show relative offsets, always makes absolute for some
- // ip
- if yax_text.contains("$-0x") || yax_text.contains("$+0x") {
- return true;
- }
-
- if yax_text.contains("esb") {
- return true;
- }
- if yax_text.contains("movi") {
- return true;
- }
+ // S being present or not has no bearing on the shift amount, #0 either
+ // way.
+ // yax will not print shift because of its ineffectual nature.
+ if (cs_text.starts_with("strb") || cs_text.starts_with("ldrb") || cs_text.starts_with("ldrsb") || cs_text.starts_with("ldr b") || cs_text.starts_with("str b")) && cs_text.contains(" lsl #0]") {
+ return true;
+ }
- if parsed_yax.opcode == "mov" && parsed_cs.opcode == "dup" {
- if parsed_yax.operands == parsed_cs.operands {
+ if cs_text == yax_text.replace(" #0", "") {
return true;
}
- }
-// if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && cs_text.replace("dup ", "mov ") == yax_text {
-// return true;
-// }
- // capstone bug! e0030033 is `bfxil w0, wzr, #0, #1`, but capstone picks
- // the bfc alias instead. skip these, generally.
- if yax_text.starts_with("bfxil") && (cs_text.starts_with("bfc") || cs_text.starts_with("bfi")) {
- return true;
- }
- if cs_text.len() > 10 && yax_text.len() > 10 {
- // eh they're probably the same but yax has a signed hex and capstone has
- // unsigned
- if &cs_text[..10] == &yax_text[..10] && cs_text.contains("ffffffff") && yax_text.contains("#-0x") {
+ // yax uses lsl instead of uxtx when the reg size is uxtx. same for
+ // uxtw/w-regs
+ if cs_text.replace("uxtx", "lsl") == yax_text ||
+ cs_text.replace("uxtw", "lsl") == yax_text {
return true;
}
- // yax, for reg + shifted-reg operands, does not omit shift amount
- if &cs_text[..10] == &yax_text[..10] && yax_text.contains(" #0x0]") {
+
+ // yax shows dcps{1,2} operand, capstone does not?
+ if yax_text.starts_with("dcps") {
return true;
}
- // postindex offsets are base 10 in capstone sometimes?
- if yax_text.contains("], #0x") && cs_text.contains("], #") &&
- &cs_text[..20] == &yax_text[..20] {
+ if cs_text.starts_with("msr ") {
return true;
}
- }
- // yax omits `uxt{w,x}` for extended reg where extension matches the
- // register size
- if cs_text.starts_with(yax_text) && (cs_text.ends_with("uxtx") || cs_text.ends_with("uxtw")) {
- return true;
- }
+ // yax does not handle aliases for msr instructions yet
+ if yax_text.starts_with("msr ") {
+ return true;
+ }
- if cs_text.starts_with(yax_text) && cs_text.ends_with("0") {
- return true;
- }
+ // some kinda bug to deal with hint value width
+ if cs_text.starts_with("hint ") {
+ return true;
+ }
+ if cs_text.starts_with("dsb ") {
+ return true;
+ }
+ if cs_text.starts_with("clrex ") {
+ return true;
+ }
+ if yax_text.starts_with("sys ") {
+ return true;
+ }
+ if cs_text.starts_with("yield ") {
+ return true;
+ }
+ if cs_text.starts_with("wfe ") {
+ return true;
+ }
+ if cs_text.starts_with("wfi ") {
+ return true;
+ }
+ if cs_text.starts_with("sev ") {
+ return true;
+ }
+ if cs_text.starts_with("mrs ") {
+ return true;
+ }
+ if cs_text.starts_with("sysl ") {
+ return true;
+ }
+ if yax_text.starts_with("hint ") {
+ return true;
+ }
- // S being present or not has no bearing on the shift amount, #0 either
- // way.
- // yax will not print shift because of its ineffectual nature.
- if (cs_text.starts_with("strb") || cs_text.starts_with("ldrb") || cs_text.starts_with("ldrsb") || cs_text.starts_with("ldr b") || cs_text.starts_with("str b")) && cs_text.contains(" lsl #0]") {
- return true;
- }
+ if yax_text == &cs_text[..cs_text.len() - 1] && cs_text.ends_with(" ") {
+ return true;
+ }
- if cs_text == yax_text.replace(" #0", "") {
- return true;
+ return false;
}
- // yax uses lsl instead of uxtx when the reg size is uxtx. same for
- // uxtw/w-regs
- if cs_text.replace("uxtx", "lsl") == yax_text ||
- cs_text.replace("uxtw", "lsl") == yax_text {
- return true;
+ // eprintln!("{}", yax_text);
+ if !acceptable_match(&yax_text, cs_text) {
+ eprintln!("disassembly mismatch: {} != {}. bytes: {:x?}", yax_text, cs_text, bytes);
+ std::process::abort();
+ } else {
+ stats.good.fetch_add(1, Ordering::SeqCst);
}
+ } else {
+ // yax should also fail?
+ }
+ }
+ }
+ }
- // yax shows dcps{1,2} operand, capstone does not?
- if yax_text.starts_with("dcps") {
- return true;
- }
+ const NR_THREADS: u64 = 64;
- if cs_text.starts_with("msr ") {
- return true;
- }
+ let range_size = (u32::MAX as u64 + 1) / NR_THREADS;
- // yax does not handle aliases for msr instructions yet
- if yax_text.starts_with("msr ") {
- return true;
- }
+ let mut handles = Vec::new();
- // some kinda bug to deal with hint value width
- if cs_text.starts_with("hint ") {
- return true;
- }
- if cs_text.starts_with("dsb ") {
- return true;
- }
- if cs_text.starts_with("clrex ") {
- return true;
- }
- if yax_text.starts_with("sys ") {
- return true;
- }
- if cs_text.starts_with("yield ") {
- return true;
- }
- if cs_text.starts_with("wfe ") {
- return true;
- }
- if cs_text.starts_with("wfi ") {
- return true;
- }
- if cs_text.starts_with("sev ") {
- return true;
- }
- if cs_text.starts_with("mrs ") {
- return true;
- }
- if cs_text.starts_with("sysl ") {
- return true;
- }
- if yax_text.starts_with("hint ") {
- return true;
- }
+ let stats = Arc::new(stats);
- if yax_text == &cs_text[..cs_text.len() - 1] && cs_text.ends_with(" ") {
- return true;
- }
+ test_range(0x54_80_00_00, 0x54_80_00_10, Arc::clone(&stats));
- return false;
- }
+ for i in 0..NR_THREADS {
+ let stats = Arc::clone(&stats);
+ let handle = std::thread::spawn(move || test_range(i * range_size, i * range_size + range_size, stats));
+ handles.push(handle);
+ }
-// eprintln!("{}", yax_text);
- if !acceptable_match(&yax_text, cs_text) {
- panic!("disassembly mismatch: {} != {}. bytes: {:x?}", yax_text, cs_text, bytes);
- } else {
- good += 1;
- }
- } else {
- // yax should also fail?
- }
- }
+ while let Some(handle) = handles.pop() {
+ handle.join().unwrap();
}
- eprintln!("match: {}", good);
- eprintln!("mismatch: {}", mismatch);
- eprintln!("bad reject: {}", yax_reject);
- eprintln!("incomplete: {}", missed_incomplete);
+
+ eprintln!("match: {}", stats.good.load(Ordering::SeqCst));
+ eprintln!("mismatch: {}", stats.mismatch.load(Ordering::SeqCst));
+ eprintln!("bad reject: {}", stats.yax_reject.load(Ordering::SeqCst));
+ eprintln!("incomplete: {}", stats.missed_incomplete.load(Ordering::SeqCst));
}