From b92bd2d1c03ec9a65a947b3bb6f24f4529905815 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sat, 30 Oct 2021 18:34:02 -0700 Subject: support simd load/store (single structure) --- src/armv8/a64.rs | 365 +++++++++++++++++++++++++++++++++++++++++++++++++++++- test/armv8/a64.rs | 28 ++--- 2 files changed, 377 insertions(+), 16 deletions(-) diff --git a/src/armv8/a64.rs b/src/armv8/a64.rs index 9f51a80..d46c46e 100644 --- a/src/armv8/a64.rs +++ b/src/armv8/a64.rs @@ -216,6 +216,28 @@ pub enum SizeCode { X, W } #[repr(u8)] pub enum SIMDSizeCode { B, H, S, D, Q } +impl SIMDSizeCode { + fn width(&self) -> u16 { + match self { + SIMDSizeCode::B => 1, + SIMDSizeCode::H => 2, + SIMDSizeCode::S => 4, + SIMDSizeCode::D => 8, + SIMDSizeCode::Q => 16, + } + } + + fn name(&self) -> &'static str { + match self { + SIMDSizeCode::B => "b", + SIMDSizeCode::H => "h", + SIMDSizeCode::S => "s", + SIMDSizeCode::D => "d", + SIMDSizeCode::Q => "q", + } + } +} + #[derive(Copy, Clone, Debug, PartialEq)] #[repr(C)] pub struct Instruction { @@ -920,6 +942,42 @@ impl Display for Instruction { Opcode::LDNP => { write!(fmt, "ldnp")?; } + Opcode::ST1 => { + write!(fmt, "st1")?; + } + Opcode::ST2 => { + write!(fmt, "st2")?; + } + Opcode::ST3 => { + write!(fmt, "st3")?; + } + Opcode::ST4 => { + write!(fmt, "st4")?; + } + Opcode::LD1R => { + write!(fmt, "ld1r")?; + } + Opcode::LD2R => { + write!(fmt, "ld2r")?; + } + Opcode::LD3R => { + write!(fmt, "ld3r")?; + } + Opcode::LD4R => { + write!(fmt, "ld4r")?; + } + Opcode::LD1 => { + write!(fmt, "ld1")?; + } + Opcode::LD2 => { + write!(fmt, "ld2")?; + } + Opcode::LD3 => { + write!(fmt, "ld3")?; + } + Opcode::LD4 => { + write!(fmt, "ld4")?; + } }; if self.operands[0] != Operand::Nothing { @@ -1119,6 +1177,18 @@ pub enum Opcode { CRC32CX, STNP, LDNP, + ST1, + ST2, + ST3, + ST4, + LD1, + LD2, + LD3, + LD4, + LD1R, + LD2R, + LD3R, + LD4R, } #[derive(Copy, Clone, Debug, PartialEq)] @@ -1162,6 +1232,8 @@ pub enum Operand { Nothing, Register(SizeCode, u16), SIMDRegister(SIMDSizeCode, u16), + SIMDRegisterGroup(SIMDSizeCode, u16, SIMDSizeCode, u8), + SIMDRegisterGroupLane(u16, SIMDSizeCode, u8, u8), RegisterOrSP(SizeCode, u16), ConditionCode(u8), Offset(i64), @@ -1213,6 +1285,38 @@ impl Display for Operand { SIMDSizeCode::Q => { write!(fmt, "q{}", reg) } } } + Operand::SIMDRegisterGroup(vector_width, reg, lane_width, group_size) => { + let num_items = vector_width.width() / lane_width.width(); + let format_reg = |f: &mut fmt::Formatter, reg, elems, lane_size: SIMDSizeCode| { + write!(f, "v{}.{}{}", reg, elems, lane_size.name()) + }; + + fmt.write_str("{")?; + format_reg(fmt, *reg, num_items, *lane_width)?; + for i in 1..*group_size { + fmt.write_str(", ")?; + format_reg(fmt, (*reg + i as u16) % 32, num_items, *lane_width)?; + } + fmt.write_str("}")?; + + Ok(()) + } + Operand::SIMDRegisterGroupLane(reg, lane_width, group_size, lane) => { + let format_reg = |f: &mut fmt::Formatter, reg, lane_size: SIMDSizeCode| { + write!(f, "v{}.{}", reg, lane_size.name()) + }; + + fmt.write_str("{")?; + format_reg(fmt, *reg, *lane_width)?; + for i in 1..*group_size { + fmt.write_str(", ")?; + format_reg(fmt, (*reg + i as u16) % 32, *lane_width)?; + } + fmt.write_str("}")?; + write!(fmt, "[{}]", lane)?; + + Ok(()) + } Operand::RegisterOrSP(size, reg) => { if *reg == 31 { match size { @@ -3316,11 +3420,268 @@ impl Decoder for InstDecoder { }, 0b00110 => { // AdvSIMD load/store single structure - return Err(DecodeError::IncompleteDecoder); + let Rt = word & 0x1f; + let Rn = (word >> 5) & 0x1f; + let size = (word >> 10) & 0x03; + let S = (word >> 12) & 1; + let opcode_bits = (word >> 13) & 0x07; + let Rm = (word >> 16) & 0x1f; + if Rm != 0 { + return Err(DecodeError::InvalidOperand); + } + let R = (word >> 21) & 0x01; + let L = (word >> 22) & 0x01; + let Q = (word >> 30) & 0x01; + let datasize = if Q == 1 { SIMDSizeCode::Q } else { SIMDSizeCode::D }; + + // interleave R==0, R==1 + const OPCODES: &[Result<(Opcode, u8, SIMDSizeCode), DecodeError>] = &[ + Ok((Opcode::ST1, 1, SIMDSizeCode::B)), + Ok((Opcode::ST2, 2, SIMDSizeCode::B)), + Ok((Opcode::ST3, 3, SIMDSizeCode::B)), + Ok((Opcode::ST4, 4, SIMDSizeCode::B)), + // opcode = 0b010 + Ok((Opcode::ST1, 1, SIMDSizeCode::H)), + Ok((Opcode::ST2, 2, SIMDSizeCode::H)), + Ok((Opcode::ST3, 3, SIMDSizeCode::H)), + Ok((Opcode::ST4, 4, SIMDSizeCode::H)), + // opcode = 0b100 + Ok((Opcode::ST1, 1, SIMDSizeCode::S)), // note these can be 64-bit if `size` says so. + Ok((Opcode::ST2, 2, SIMDSizeCode::S)), + Ok((Opcode::ST3, 3, SIMDSizeCode::S)), + Ok((Opcode::ST4, 4, SIMDSizeCode::S)), + // opcode = 0b110 + // unallocated, is. if L==1, these are LD*R + ]; + + if opcode_bits > 0b101 { + if S != 0 { + return Err(DecodeError::InvalidOpcode); + } + + if L == 0 { + return Err(DecodeError::InvalidOpcode); + } + + const OPCODES: [Opcode; 4] = [ + Opcode::LD1R, + Opcode::LD2R, + Opcode::LD3R, + Opcode::LD4R, + ]; + let opc_idx = (opcode_bits & 0x01) * 2 + S; + inst.opcode = OPCODES[opc_idx as usize]; + const SIZES: [SIMDSizeCode; 4] = [ + SIMDSizeCode::B, + SIMDSizeCode::H, + SIMDSizeCode::S, + SIMDSizeCode::D, + ]; + inst.operands = [ + Operand::SIMDRegisterGroup(datasize, Rt as u16, SIZES[size as usize], opc_idx as u8), + Operand::RegPostIndex(Rn as u16, 0), + Operand::Nothing, + Operand::Nothing, + ]; + + return Ok(()); + } + + let mut scale = opcode_bits >> 1; + // let selem = (((opcode_bits & 1) << 1) | R) + 1; + // let mut replicate = false; + let opc_idx = (opcode_bits << 1) | R; + + let (opcode, group_size, item_size) = OPCODES[opc_idx as usize]?; + + let item_size = match item_size { + SIMDSizeCode::B => SIMDSizeCode::B, + SIMDSizeCode::H => { + if (size & 1) == 1 { + return Err(DecodeError::InvalidOperand); + } + SIMDSizeCode::H + } + SIMDSizeCode::S => { + if size >= 0b10 { + return Err(DecodeError::InvalidOperand); + } + if size == 0b01 { + if S == 1 { + return Err(DecodeError::InvalidOperand); + } + scale = 3; + SIMDSizeCode::D + } else { + SIMDSizeCode::S + } + } + SIMDSizeCode::D => { + if L == 0 || S == 1 { + return Err(DecodeError::InvalidOperand); + } + // replicate = true; + SIMDSizeCode::D + } + other => other + }; + + let index = ((Q << 3) | (S << 2) | size) >> scale; + + inst.opcode = if L == 0 { + opcode + } else { + if opcode == Opcode::ST1 { + Opcode::LD1 + } else if opcode == Opcode::ST2 { + Opcode::LD2 + } else if opcode == Opcode::ST3 { + Opcode::LD3 + } else { + Opcode::LD4 + } + }; + inst.operands = [ + Operand::SIMDRegisterGroupLane(Rt as u16, item_size, group_size, index as u8), + Operand::RegPostIndex(Rn as u16, 0), + Operand::Nothing, + Operand::Nothing, + ]; }, 0b00111 => { // AdvSIMD load/store single structure (post-indexed) - return Err(DecodeError::IncompleteDecoder); + let Rt = word & 0x1f; + let Rn = (word >> 5) & 0x1f; + let size = (word >> 10) & 0x03; + let S = (word >> 12) & 1; + let opcode_bits = (word >> 13) & 0x07; + let Rm = (word >> 16) & 0x1f; + let R = (word >> 21) & 0x01; + let L = (word >> 22) & 0x01; + let Q = (word >> 30) & 0x01; + let datasize = if Q == 1 { SIMDSizeCode::Q } else { SIMDSizeCode::D }; + + // interleave R==0, R==1 + const OPCODES: &[Result<(Opcode, u8, SIMDSizeCode), DecodeError>] = &[ + Ok((Opcode::ST1, 1, SIMDSizeCode::B)), + Ok((Opcode::ST2, 2, SIMDSizeCode::B)), + Ok((Opcode::ST3, 3, SIMDSizeCode::B)), + Ok((Opcode::ST4, 4, SIMDSizeCode::B)), + // opcode = 0b010 + Ok((Opcode::ST1, 1, SIMDSizeCode::H)), + Ok((Opcode::ST2, 2, SIMDSizeCode::H)), + Ok((Opcode::ST3, 3, SIMDSizeCode::H)), + Ok((Opcode::ST4, 4, SIMDSizeCode::H)), + // opcode = 0b100 + Ok((Opcode::ST1, 1, SIMDSizeCode::S)), // note these can be 64-bit if `size` says so. + Ok((Opcode::ST2, 2, SIMDSizeCode::S)), + Ok((Opcode::ST3, 3, SIMDSizeCode::S)), + Ok((Opcode::ST4, 4, SIMDSizeCode::S)), + // opcode = 0b110 + // unallocated, is. if L==1, these are LD*R + ]; + + if opcode_bits >= 0b110 { + if S != 0 { + return Err(DecodeError::InvalidOpcode); + } + + if L == 0 { + return Err(DecodeError::InvalidOpcode); + } + + const OPCODES: [Opcode; 4] = [ + Opcode::LD1R, + Opcode::LD2R, + Opcode::LD3R, + Opcode::LD4R, + ]; + let opc_idx = (opcode_bits & 0x01) * 2 + S; + inst.opcode = OPCODES[opc_idx as usize]; + const SIZES: [SIMDSizeCode; 4] = [ + SIMDSizeCode::B, + SIMDSizeCode::H, + SIMDSizeCode::S, + SIMDSizeCode::D, + ]; + inst.operands = [ + Operand::SIMDRegisterGroup(datasize, Rt as u16, SIZES[size as usize], opc_idx as u8), + if Rm == 31 { + Operand::RegPostIndex(Rn as u16, ((opc_idx + 1) * (1 << size)) as i32) + } else { + Operand::RegPostIndexReg(Rn as u16, Rm as u16) + }, + Operand::Nothing, + Operand::Nothing, + ]; + + return Ok(()); + } + + let mut scale = opcode_bits >> 1; + // let selem = (((opcode_bits & 1) << 1) | R) + 1; + // let mut replicate = false; + let opc_idx = (opcode_bits << 1) | R; + + let (opcode, group_size, item_size) = OPCODES[opc_idx as usize]?; + + let item_size = match item_size { + SIMDSizeCode::B => SIMDSizeCode::B, + SIMDSizeCode::H => { + if (size & 1) == 1 { + return Err(DecodeError::InvalidOperand); + } + SIMDSizeCode::H + } + SIMDSizeCode::S => { + if size >= 0b10 { + return Err(DecodeError::InvalidOperand); + } + if size == 0b01 { + if S == 1 { + return Err(DecodeError::InvalidOperand); + } + scale = 3; + SIMDSizeCode::D + } else { + SIMDSizeCode::S + } + } + SIMDSizeCode::D => { + if L == 0 || S == 1 { + return Err(DecodeError::InvalidOperand); + } + // replicate = true; + SIMDSizeCode::D + } + other => other + }; + + let index = ((Q << 3) | (S << 2) | size) >> scale; + + inst.opcode = if L == 0 { + opcode + } else { + if opcode == Opcode::ST1 { + Opcode::LD1 + } else if opcode == Opcode::ST2 { + Opcode::LD2 + } else if opcode == Opcode::ST3 { + Opcode::LD3 + } else { + Opcode::LD4 + } + }; + inst.operands = [ + Operand::SIMDRegisterGroupLane(Rt as u16, item_size, group_size, index as u8), + if Rm == 31 { + Operand::RegPostIndex(Rn as u16, (group_size as u16 * item_size.width()) as i32) + } else { + Operand::RegPostIndexReg(Rn as u16, Rm as u16) + }, + Operand::Nothing, + Operand::Nothing, + ]; } _ => { inst.opcode = Opcode::Invalid; diff --git a/test/armv8/a64.rs b/test/armv8/a64.rs index 69039e7..8de1f99 100644 --- a/test/armv8/a64.rs +++ b/test/armv8/a64.rs @@ -2749,9 +2749,9 @@ fn test_openblas_simd_loadstore() { ([0xa4, 0x89, 0x40, 0x0c], "ld2 {v4.2s, v5.2s}, [x13]"), ([0xa2, 0xa9, 0x40, 0x0c], "ld1 {v2.2s, v3.2s}, [x13]"), ([0xac, 0xa9, 0x40, 0x0c], "ld1 {v12.2s, v13.2s}, [x13]"), - ([0xa5, 0x79, 0x9f, 0x0c], "st1 {v5.2s}, [x13], 8"), + ([0xa5, 0x79, 0x9f, 0x0c], "st1 {v5.2s}, [x13], 0x8"), ([0x45, 0x79, 0xc2, 0x0c], "ld1 {v5.2s}, [x10], x2"), - ([0x20, 0x78, 0xdf, 0x0c], "ld1 {v0.2s}, [x1], 8"), + ([0x20, 0x78, 0xdf, 0x0c], "ld1 {v0.2s}, [x1], 0x8"), ([0xcc, 0x85, 0x00, 0x0d], "st1 {v12.d}[0], [x14]"), ([0xa8, 0x91, 0x00, 0x0d], "st1 {v8.s}[1], [x13]"), ([0xa0, 0x81, 0x20, 0x0d], "st2 {v0.s, v1.s}[0], [x13]"), @@ -2762,7 +2762,7 @@ fn test_openblas_simd_loadstore() { ([0x64, 0x90, 0x9f, 0x0d], "st1 {v4.s}[1], [x3], 0x4"), ([0x22, 0x84, 0xc2, 0x0d], "ld1 {v2.d}[0], [x1], x2"), ([0x61, 0x80, 0xc4, 0x0d], "ld1 {v1.s}[0], [x3], x4"), - ([0x24, 0xc9, 0xdf, 0x0d], "ld1r {v4.2s}, [x9], 0x4"), + ([0x24, 0xc9, 0xdf, 0x0d], "ld1r {v4.2s}, [x9], 0x4"), // TODO: could use a test for "ld1r {v4.2s}, [x9]" ([0x88, 0x28, 0x00, 0x4c], "st1 {v8.4s-v11.4s}, [x4]"), ([0x60, 0x2d, 0x00, 0x4c], "st1 {v0.2d-v3.2d}, [x11]"), ([0x9c, 0x2e, 0x00, 0x4c], "st1 {v28.2d-v31.2d}, [x20]"), @@ -2801,22 +2801,22 @@ fn test_openblas_simd_loadstore() { ([0xa4, 0x8d, 0x40, 0x4c], "ld2 {v4.2d, v5.2d}, [x13]"), ([0xa6, 0x8d, 0x40, 0x4c], "ld2 {v6.2d, v7.2d}, [x13]"), ([0xa3, 0x7c, 0x86, 0x4c], "st1 {v3.2d}, [x5], x6"), - ([0x61, 0x2c, 0x9f, 0x4c], "st1 {v1.2d-v4.2d}, [x3], 64"), - ([0xb0, 0x2c, 0x9f, 0x4c], "st1 {v16.2d-v19.2d}, [x5], 64"), + ([0x61, 0x2c, 0x9f, 0x4c], "st1 {v1.2d-v4.2d}, [x3], 0x40"), + ([0xb0, 0x2c, 0x9f, 0x4c], "st1 {v16.2d-v19.2d}, [x5], 0x40"), ([0x24, 0x78, 0x9f, 0x4c], "st1 {v4.4s}, [x1], 0x10"), ([0xa5, 0x7d, 0x9f, 0x4c], "st1 {v5.2d}, [x13], 0x10"), - ([0xa4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x5], 32"), - ([0xc4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x6], 32"), - ([0xb0, 0xad, 0x9f, 0x4c], "st1 {v16.2d, v17.2d}, [x13], 32"), + ([0xa4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x5], 0x20"), + ([0xc4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x6], 0x20"), + ([0xb0, 0xad, 0x9f, 0x4c], "st1 {v16.2d, v17.2d}, [x13], 0x20"), ([0x20, 0x7c, 0xc2, 0x4c], "ld1 {v0.2d}, [x1], x2"), ([0x46, 0x7d, 0xc6, 0x4c], "ld1 {v6.2d}, [x10], x6"), - ([0x20, 0x0c, 0xdf, 0x4c], "ld4 {v0.2d-v3.2d}, [x1], 64"), - ([0x51, 0x2d, 0xdf, 0x4c], "ld1 {v17.2d-v20.2d}, [x10], 64"), + ([0x20, 0x0c, 0xdf, 0x4c], "ld4 {v0.2d-v3.2d}, [x1], 0x40"), + ([0x51, 0x2d, 0xdf, 0x4c], "ld1 {v17.2d-v20.2d}, [x10], 0x40"), ([0x20, 0x78, 0xdf, 0x4c], "ld1 {v0.4s}, [x1], 0x10"), ([0x21, 0x78, 0xdf, 0x4c], "ld1 {v1.4s}, [x1], 0x10"), ([0x46, 0x7d, 0xdf, 0x4c], "ld1 {v6.2d}, [x10], 0x10"), - ([0x20, 0x88, 0xdf, 0x4c], "ld2 {v0.4s, v1.4s}, [x1], 32"), - ([0x50, 0xad, 0xdf, 0x4c], "ld1 {v16.2d, v17.2d}, [x10], 32"), + ([0x20, 0x88, 0xdf, 0x4c], "ld2 {v0.4s, v1.4s}, [x1], 0x20"), + ([0x50, 0xad, 0xdf, 0x4c], "ld1 {v16.2d, v17.2d}, [x10], 0x20"), ([0xa8, 0x85, 0x00, 0x4d], "st1 {v8.d}[1], [x13]"), ([0xac, 0x85, 0x00, 0x4d], "st1 {v12.d}[1], [x13]"), ([0xec, 0x85, 0x00, 0x4d], "st1 {v12.d}[1], [x15]"), @@ -2824,8 +2824,8 @@ fn test_openblas_simd_loadstore() { ([0xa8, 0x85, 0x40, 0x4d], "ld1 {v8.d}[1], [x13]"), ([0xec, 0x85, 0x40, 0x4d], "ld1 {v12.d}[1], [x15]"), ([0x64, 0x84, 0x84, 0x4d], "st1 {v4.d}[1], [x3], x4"), - ([0x64, 0x84, 0x9f, 0x4d], "st1 {v4.d}[1], [x3], 8"), - ([0x24, 0xcd, 0xdf, 0x4d], "ld1r {v4.2d}, [x9], 8"), + ([0x64, 0x84, 0x9f, 0x4d], "st1 {v4.d}[1], [x3], 0x8"), + ([0x24, 0xcd, 0xdf, 0x4d], "ld1r {v4.2d}, [x9], 0x8"), ([0x60, 0x04, 0x81, 0x3c], "str q0, [x3], 0x10"), ([0x61, 0x00, 0x9f, 0x3c], "stur q1, [x3, -0x10]"), ([0xa0, 0x00, 0x9f, 0x3c], "stur q0, [x5, -0x10]"), -- cgit v1.1