From 17f9330abed70364edc404f95471298e2b9bd29a Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 14 Mar 2024 21:31:05 -0700 Subject: fix up differential fuzzer to rectify hex vs decimal literals in different disassemblers sure does include a quasi aarch64-asm-parser --- differential-tests/tests/capstone-differential.rs | 288 ++++++++++++++++++++-- 1 file changed, 274 insertions(+), 14 deletions(-) diff --git a/differential-tests/tests/capstone-differential.rs b/differential-tests/tests/capstone-differential.rs index 73b0f2b..8457bce 100644 --- a/differential-tests/tests/capstone-differential.rs +++ b/differential-tests/tests/capstone-differential.rs @@ -4,6 +4,244 @@ use capstone::prelude::*; use yaxpeax_arch::{Arch, Decoder}; +use std::num::ParseIntError; + +#[derive(Debug)] +enum ParsedOperand { + Register { size: char, num: u8 }, + Memory(String), + SIMDRegister { size: char, num: u8 }, +// SIMDRegisterElements { num: u8, elems: u8, elem_size: char }, +// SIMDRegisterElement { num: u8, elem_size: char, elem: u8 }, + SIMDElementLane { elem: String, lane_selector: u8 }, + Immediate(i64), + Float(f64), + Other(String), + RegisterFamily(String), +} + +impl PartialEq for ParsedOperand { + fn eq(&self, other: &Self) -> bool { + use ParsedOperand::*; + + match (self, other) { + (Register { size: size_l, num: num_l }, Register { size: size_r, num: num_r }) => { + size_l == size_r && num_l == num_r + }, + (Memory(l), Memory(r)) => { + l == r + }, + (Immediate(l), Immediate(r)) => { + l == r + }, + (Float(l), Float(r)) => { + l.to_ne_bytes() == r.to_ne_bytes() + }, + (RegisterFamily(l), RegisterFamily(r)) => { + l == r + }, + (SIMDRegister { size: size_l, num: num_l }, SIMDRegister { size: size_r, num: num_r }) => { + size_l == size_r && num_l == num_r + }, + (SIMDElementLane { elem: elem_l, lane_selector: lane_l }, SIMDElementLane { elem: elem_r, lane_selector: lane_r }) => { + elem_l == elem_r && lane_l == lane_r + } + (Other(l), Other(r)) => { + // yax prints `asr #0` as just `asr`. is this actually a no-op? + if (l == "asr" && r == "asr #0") || (l == "asr #0" && r == "asr") { + true + } else if (l == "lsr" && r == "lsr #0") || (l == "lsr #0" && r == "lsr") { + true + } else if (l == "ror" && r == "ror #0") || (l == "ror #0" && r == "ror") { + true + } else { + l == r + } + } + (_, _) => { + false + } + } + } +} + +#[test] +fn test_operand_parsing() { + assert_eq!(ParsedOperand::parse("xzr"), (ParsedOperand::Register { size: 'x', num: 0 }, 3)); + assert_eq!(ParsedOperand::parse("wzr"), (ParsedOperand::Register { size: 'w', num: 0 }, 3)); + assert_eq!(ParsedOperand::parse("w1"), (ParsedOperand::Register { size: 'w', num: 1 }, 2)); + assert_eq!(ParsedOperand::parse("x1"), (ParsedOperand::Register { size: 'x', num: 1 }, 2)); +} + +#[test] +fn test_instruction_parsing() { + let inst = ParsedDisassembly::parse("msub w17, w8, w15, w0"); + assert_eq!(inst, ParsedDisassembly { + opcode: "msub".to_string(), + operands: [ + Some(ParsedOperand::Register { size: 'w', num: 17 }), + Some(ParsedOperand::Register { size: 'w', num: 8 }), + Some(ParsedOperand::Register { size: 'w', num: 15 }), + Some(ParsedOperand::Register { size: 'w', num: 0 }), + None, + None, + ] + }); +} + +impl ParsedOperand { + fn parse(s: &str) -> (Self, usize) { + let parse_hex_or_dec = |mut s: &str| { + let mut negate = false; + if s.as_bytes()[0] == b'-' { + negate = true; + s = &s[1..]; + } + + let v = if !s.starts_with("0x") { + i64::from_str_radix(s, 10).expect("can parse string") + } else { + (u64::from_str_radix(&s[2..], 16).expect("can parse string") as i64) + }; + if negate { + -v + } else { + v + } + }; + + let mut consumed = 0; + if s.as_bytes()[0] == b'#' { + let end = s.find(',').unwrap_or(s.len()); + let imm_str = &s[1..end]; + if imm_str.contains('.') { + use std::str::FromStr; + (ParsedOperand::Float(f64::from_str(imm_str).expect("can parse string")), end) + } else { + let imm = parse_hex_or_dec(imm_str); + (ParsedOperand::Immediate(imm), end) + } + } else if s.as_bytes()[0] == b'[' { + let mut end = s.find(']').map(|x| x + 1).unwrap_or(s.len()); + if s.as_bytes().get(end) == Some(&b'!') { + end += 1; + } + + (ParsedOperand::Memory(s[0..end].to_string()), end) + } else if s.as_bytes()[0] == b'{' { + let mut brace_end = s.find('}'); + if let Some(brace_end) = brace_end { + if s.as_bytes().get(brace_end + 1) == Some(&b'[') { + if let Some(end) = s.find(']') { + let group = &s[0..brace_end]; + let lane = &s[brace_end + 2..end]; + let lane = parse_hex_or_dec(lane); + + return (ParsedOperand::SIMDElementLane { + elem: group.to_string(), + lane_selector: lane as u8, + }, end); + } + } + + let end = s.find(',').unwrap_or(s.len()); + (ParsedOperand::RegisterFamily(s[0..end].to_string()), end) + } else { + let end = s.find(',').unwrap_or(s.len()); + (ParsedOperand::Other(s[0..end].to_string()), end) + } + } else { + let mut end = s.find(',').unwrap_or(s.len()); + let substr = &s[..end]; + match (s.as_bytes()[0] as char) { + sz @ 'w' | sz @ 'x' => { + if &s[1..end] == "zr" { + return (ParsedOperand::Register { size: sz, num: 0 }, 3); + } + let num: Result = s[1..end].parse(); + match num { + Ok(num) => { + (ParsedOperand::Register { size: sz, num }, end) + } + Err(e) => { + (ParsedOperand::Other(s[..end].to_string()), end) + } + } + } + sz @ 'b' | sz @ 'h' | sz @ 's' | sz @ 'd' | sz @ 'q' => { + let num: Result = s[1..end].parse(); + match num { + Ok(num) => { + (ParsedOperand::SIMDRegister { size: sz, num }, end) + } + Err(e) => { + (ParsedOperand::Other(s[..end].to_string()), end) + } + } + } + 'v' => { + match substr.find('[') { + Some(lane_selector_start) => { + let lane_selector_end = substr.find(']').unwrap(); + let elem = substr[..lane_selector_start].to_string(); + let lane_selector = parse_hex_or_dec(&substr[lane_selector_start + 1..lane_selector_end]) as u8; + (ParsedOperand::SIMDElementLane { elem, lane_selector }, end) + } + None => { + // some kind of simd element that does not include a trailing `[]`. + // treat it as an opaque string for now. + (ParsedOperand::Other(substr.to_string()), end) + } + } + } + other => { + (ParsedOperand::Other(s[..end].to_string()), end) + } + } + } + } +} + +#[derive(Debug, PartialEq)] +struct ParsedDisassembly { + opcode: String, + // arm instructions do not have six operands, but due to parse ambiguity and the rather hackjob + // parser here, pretend they might. + operands: [Option; 6] +} + +impl ParsedDisassembly { + fn parse(s: &str) -> Self { + let mut operands = [None, None, None, None, None, None]; + if let Some((opcode, mut operands_text)) = s.split_once(' ') { + let opcode = opcode.to_string(); + + let mut i = 0; + + while operands_text.len() > 0 { + if operands_text.as_bytes()[0] == b',' { + operands_text = &operands_text[1..]; + } + operands_text = operands_text.trim(); + let (parsed, amount) = ParsedOperand::parse(&operands_text); + operands[i] = Some(parsed); + operands_text = &operands_text[amount..]; + i += 1; + } + + ParsedDisassembly { + opcode, + operands, + } + } else { + ParsedDisassembly { + opcode: s.to_string(), + operands, + } + } + } +} + #[test] fn capstone_differential() { let cs = Capstone::new() @@ -49,6 +287,16 @@ fn capstone_differential() { return true; } + let parsed_yax = ParsedDisassembly::parse(yax_text); + let parsed_cs = ParsedDisassembly::parse(cs_text); + + if parsed_yax == parsed_cs { + return true; + } + +// eprintln!("yax: {} -> {:?}", yax_text, parsed_yax); +// eprintln!("cs: {} -> {:?}", cs_text, parsed_cs); + if cs_text .replace("uxtw #0", "uxtw") .replace("uxtx #0", "uxtx") == yax_text { @@ -58,25 +306,31 @@ fn capstone_differential() { // capstone discards uxtw in some circumstances for reasons i don't yet // know - if yax_text.ends_with("uxtw") && - &yax_text[..yax_text.len() - 6] == cs_text { - return true; + if let Some(yax_text) = yax_text.strip_suffix(", uxtw") { + if yax_text == cs_text { + return true; + } } - if cs_text.ends_with("uxtw") && - &cs_text[..cs_text.len() - 6] == yax_text { - return true; + if let Some(cs_text) = cs_text.strip_suffix(", uxtw") { + if yax_text == cs_text { + return true; + } } + if yax_text.replace("lsl", "uxtw") == cs_text { return true; } - if yax_text.ends_with("#0") && - &yax_text[..yax_text.len() - 3] == cs_text { - return true; + if let Some(yax_text) = yax_text.strip_suffix(" #0") { + if yax_text == cs_text { + return true; + } } - if cs_text.ends_with("#0") && - &cs_text[..cs_text.len() - 3] == yax_text { - return true; + if let Some(cs_text) = cs_text.strip_suffix(" #0") { + if yax_text == cs_text { + return true; + } } + // TODO: what kind of cases is this for? if cs_text.starts_with(yax_text) && cs_text.ends_with("000") { return true; }; @@ -179,9 +433,14 @@ fn capstone_differential() { return true; } - if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && cs_text.replace("dup ", "mov ") == yax_text { - return true; + if parsed_yax.opcode == "mov" && parsed_cs.opcode == "dup" { + if parsed_yax.operands == parsed_cs.operands { + return true; + } } +// if cs_text.starts_with("dup") && yax_text.starts_with("mov ") && cs_text.replace("dup ", "mov ") == yax_text { +// return true; +// } // capstone bug! e0030033 is `bfxil w0, wzr, #0, #1`, but capstone picks // the bfc alias instead. skip these, generally. if yax_text.starts_with("bfxil") && (cs_text.starts_with("bfc") || cs_text.starts_with("bfi")) { @@ -290,6 +549,7 @@ fn capstone_differential() { return false; } +// eprintln!("{}", yax_text); if !acceptable_match(&yax_text, cs_text) { panic!("disassembly mismatch: {} != {}. bytes: {:x?}", yax_text, cs_text, bytes); } else { -- cgit v1.1