aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoriximeow <me@iximeow.net>2021-10-30 18:34:02 -0700
committeriximeow <me@iximeow.net>2021-10-30 18:34:02 -0700
commitb92bd2d1c03ec9a65a947b3bb6f24f4529905815 (patch)
treec7d188c1e1e9afd7ac8cb4c144af84841efc5aa7
parent9b3d3d5c52a619e8e090e676033ee99abe33553d (diff)
support simd load/store (single structure)
-rw-r--r--src/armv8/a64.rs365
-rw-r--r--test/armv8/a64.rs28
2 files changed, 377 insertions, 16 deletions
diff --git a/src/armv8/a64.rs b/src/armv8/a64.rs
index 9f51a80..d46c46e 100644
--- a/src/armv8/a64.rs
+++ b/src/armv8/a64.rs
@@ -216,6 +216,28 @@ pub enum SizeCode { X, W }
#[repr(u8)]
pub enum SIMDSizeCode { B, H, S, D, Q }
+impl SIMDSizeCode {
+    /// Number of bytes covered by one element of this size code.
+    fn width(&self) -> u16 {
+        match *self {
+            Self::B => 1,
+            Self::H => 2,
+            Self::S => 4,
+            Self::D => 8,
+            Self::Q => 16,
+        }
+    }
+
+    /// Lower-case, single-letter suffix used when printing this size code.
+    fn name(&self) -> &'static str {
+        match *self {
+            Self::B => "b",
+            Self::H => "h",
+            Self::S => "s",
+            Self::D => "d",
+            Self::Q => "q",
+        }
+    }
+}
+
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(C)]
pub struct Instruction {
@@ -920,6 +942,42 @@ impl Display for Instruction {
Opcode::LDNP => {
write!(fmt, "ldnp")?;
}
+ Opcode::ST1 => {
+ write!(fmt, "st1")?;
+ }
+ Opcode::ST2 => {
+ write!(fmt, "st2")?;
+ }
+ Opcode::ST3 => {
+ write!(fmt, "st3")?;
+ }
+ Opcode::ST4 => {
+ write!(fmt, "st4")?;
+ }
+ Opcode::LD1R => {
+ write!(fmt, "ld1r")?;
+ }
+ Opcode::LD2R => {
+ write!(fmt, "ld2r")?;
+ }
+ Opcode::LD3R => {
+ write!(fmt, "ld3r")?;
+ }
+ Opcode::LD4R => {
+ write!(fmt, "ld4r")?;
+ }
+ Opcode::LD1 => {
+ write!(fmt, "ld1")?;
+ }
+ Opcode::LD2 => {
+ write!(fmt, "ld2")?;
+ }
+ Opcode::LD3 => {
+ write!(fmt, "ld3")?;
+ }
+ Opcode::LD4 => {
+ write!(fmt, "ld4")?;
+ }
};
if self.operands[0] != Operand::Nothing {
@@ -1119,6 +1177,18 @@ pub enum Opcode {
CRC32CX,
STNP,
LDNP,
+ ST1,
+ ST2,
+ ST3,
+ ST4,
+ LD1,
+ LD2,
+ LD3,
+ LD4,
+ LD1R,
+ LD2R,
+ LD3R,
+ LD4R,
}
#[derive(Copy, Clone, Debug, PartialEq)]
@@ -1162,6 +1232,8 @@ pub enum Operand {
Nothing,
Register(SizeCode, u16),
SIMDRegister(SIMDSizeCode, u16),
+ SIMDRegisterGroup(SIMDSizeCode, u16, SIMDSizeCode, u8),
+ SIMDRegisterGroupLane(u16, SIMDSizeCode, u8, u8),
RegisterOrSP(SizeCode, u16),
ConditionCode(u8),
Offset(i64),
@@ -1213,6 +1285,38 @@ impl Display for Operand {
SIMDSizeCode::Q => { write!(fmt, "q{}", reg) }
}
}
+ Operand::SIMDRegisterGroup(vector_width, reg, lane_width, group_size) => {
+ let num_items = vector_width.width() / lane_width.width();
+ let format_reg = |f: &mut fmt::Formatter, reg, elems, lane_size: SIMDSizeCode| {
+ write!(f, "v{}.{}{}", reg, elems, lane_size.name())
+ };
+
+ fmt.write_str("{")?;
+ format_reg(fmt, *reg, num_items, *lane_width)?;
+ for i in 1..*group_size {
+ fmt.write_str(", ")?;
+ format_reg(fmt, (*reg + i as u16) % 32, num_items, *lane_width)?;
+ }
+ fmt.write_str("}")?;
+
+ Ok(())
+ }
+ Operand::SIMDRegisterGroupLane(reg, lane_width, group_size, lane) => {
+ let format_reg = |f: &mut fmt::Formatter, reg, lane_size: SIMDSizeCode| {
+ write!(f, "v{}.{}", reg, lane_size.name())
+ };
+
+ fmt.write_str("{")?;
+ format_reg(fmt, *reg, *lane_width)?;
+ for i in 1..*group_size {
+ fmt.write_str(", ")?;
+ format_reg(fmt, (*reg + i as u16) % 32, *lane_width)?;
+ }
+ fmt.write_str("}")?;
+ write!(fmt, "[{}]", lane)?;
+
+ Ok(())
+ }
Operand::RegisterOrSP(size, reg) => {
if *reg == 31 {
match size {
@@ -3316,11 +3420,268 @@ impl Decoder<ARMv8> for InstDecoder {
},
0b00110 => {
// AdvSIMD load/store single structure
- return Err(DecodeError::IncompleteDecoder);
+ let Rt = word & 0x1f;
+ let Rn = (word >> 5) & 0x1f;
+ let size = (word >> 10) & 0x03;
+ let S = (word >> 12) & 1;
+ let opcode_bits = (word >> 13) & 0x07;
+ let Rm = (word >> 16) & 0x1f;
+ if Rm != 0 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ let R = (word >> 21) & 0x01;
+ let L = (word >> 22) & 0x01;
+ let Q = (word >> 30) & 0x01;
+ let datasize = if Q == 1 { SIMDSizeCode::Q } else { SIMDSizeCode::D };
+
+ // interleave R==0, R==1
+ const OPCODES: &[Result<(Opcode, u8, SIMDSizeCode), DecodeError>] = &[
+ Ok((Opcode::ST1, 1, SIMDSizeCode::B)),
+ Ok((Opcode::ST2, 2, SIMDSizeCode::B)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::B)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::B)),
+ // opcode = 0b010
+ Ok((Opcode::ST1, 1, SIMDSizeCode::H)),
+ Ok((Opcode::ST2, 2, SIMDSizeCode::H)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::H)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::H)),
+ // opcode = 0b100
+ Ok((Opcode::ST1, 1, SIMDSizeCode::S)), // note these can be 64-bit if `size` says so.
+ Ok((Opcode::ST2, 2, SIMDSizeCode::S)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::S)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::S)),
+ // opcode = 0b110
+ // unallocated when L == 0; when L == 1 these encodings are LD*R
+ ];
+
+ // opcode 0b110 and 0b111 hold only the load-and-replicate forms (LD1R..LD4R).
+ // They require L == 1 and S == 0; anything else in this range is rejected.
+ if opcode_bits >= 0b110 {
+     if S != 0 || L == 0 {
+         return Err(DecodeError::InvalidOpcode);
+     }
+
+     const REPLICATE_OPCODES: [Opcode; 4] = [
+         Opcode::LD1R,
+         Opcode::LD2R,
+         Opcode::LD3R,
+         Opcode::LD4R,
+     ];
+     const SIZES: [SIMDSizeCode; 4] = [
+         SIMDSizeCode::B,
+         SIMDSizeCode::H,
+         SIMDSizeCode::S,
+         SIMDSizeCode::D,
+     ];
+
+     // The variant is selected by `opcode<0>:R`. `S` is known to be zero at this
+     // point, so it cannot be used to tell LD1R/LD2R (or LD3R/LD4R) apart.
+     let opc_idx = ((opcode_bits & 0x01) << 1) | R;
+     inst.opcode = REPLICATE_OPCODES[opc_idx as usize];
+     inst.operands = [
+         // `opc_idx` is zero-based; the group holds `opc_idx + 1` registers.
+         Operand::SIMDRegisterGroup(datasize, Rt as u16, SIZES[size as usize], opc_idx as u8 + 1),
+         Operand::RegPostIndex(Rn as u16, 0),
+         Operand::Nothing,
+         Operand::Nothing,
+     ];
+
+     return Ok(());
+ }
+
+ let mut scale = opcode_bits >> 1;
+ // let selem = (((opcode_bits & 1) << 1) | R) + 1;
+ // let mut replicate = false;
+ let opc_idx = (opcode_bits << 1) | R;
+
+ let (opcode, group_size, item_size) = OPCODES[opc_idx as usize]?;
+
+ let item_size = match item_size {
+ SIMDSizeCode::B => SIMDSizeCode::B,
+ SIMDSizeCode::H => {
+ if (size & 1) == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ SIMDSizeCode::H
+ }
+ SIMDSizeCode::S => {
+ if size >= 0b10 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ if size == 0b01 {
+ if S == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ scale = 3;
+ SIMDSizeCode::D
+ } else {
+ SIMDSizeCode::S
+ }
+ }
+ SIMDSizeCode::D => {
+ if L == 0 || S == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ // replicate = true;
+ SIMDSizeCode::D
+ }
+ other => other
+ };
+
+ let index = ((Q << 3) | (S << 2) | size) >> scale;
+
+ inst.opcode = if L == 0 {
+ opcode
+ } else {
+ if opcode == Opcode::ST1 {
+ Opcode::LD1
+ } else if opcode == Opcode::ST2 {
+ Opcode::LD2
+ } else if opcode == Opcode::ST3 {
+ Opcode::LD3
+ } else {
+ Opcode::LD4
+ }
+ };
+ inst.operands = [
+ Operand::SIMDRegisterGroupLane(Rt as u16, item_size, group_size, index as u8),
+ Operand::RegPostIndex(Rn as u16, 0),
+ Operand::Nothing,
+ Operand::Nothing,
+ ];
},
0b00111 => {
// AdvSIMD load/store single structure (post-indexed)
- return Err(DecodeError::IncompleteDecoder);
+ let Rt = word & 0x1f;
+ let Rn = (word >> 5) & 0x1f;
+ let size = (word >> 10) & 0x03;
+ let S = (word >> 12) & 1;
+ let opcode_bits = (word >> 13) & 0x07;
+ let Rm = (word >> 16) & 0x1f;
+ let R = (word >> 21) & 0x01;
+ let L = (word >> 22) & 0x01;
+ let Q = (word >> 30) & 0x01;
+ let datasize = if Q == 1 { SIMDSizeCode::Q } else { SIMDSizeCode::D };
+
+ // interleave R==0, R==1
+ const OPCODES: &[Result<(Opcode, u8, SIMDSizeCode), DecodeError>] = &[
+ Ok((Opcode::ST1, 1, SIMDSizeCode::B)),
+ Ok((Opcode::ST2, 2, SIMDSizeCode::B)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::B)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::B)),
+ // opcode = 0b010
+ Ok((Opcode::ST1, 1, SIMDSizeCode::H)),
+ Ok((Opcode::ST2, 2, SIMDSizeCode::H)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::H)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::H)),
+ // opcode = 0b100
+ Ok((Opcode::ST1, 1, SIMDSizeCode::S)), // note these can be 64-bit if `size` says so.
+ Ok((Opcode::ST2, 2, SIMDSizeCode::S)),
+ Ok((Opcode::ST3, 3, SIMDSizeCode::S)),
+ Ok((Opcode::ST4, 4, SIMDSizeCode::S)),
+ // opcode = 0b110
+ // unallocated when L == 0; when L == 1 these encodings are LD*R
+ ];
+
+ // opcode 0b110 and 0b111 hold only the load-and-replicate forms (LD1R..LD4R).
+ // They require L == 1 and S == 0; anything else in this range is rejected.
+ if opcode_bits >= 0b110 {
+     if S != 0 || L == 0 {
+         return Err(DecodeError::InvalidOpcode);
+     }
+
+     const REPLICATE_OPCODES: [Opcode; 4] = [
+         Opcode::LD1R,
+         Opcode::LD2R,
+         Opcode::LD3R,
+         Opcode::LD4R,
+     ];
+     const SIZES: [SIMDSizeCode; 4] = [
+         SIMDSizeCode::B,
+         SIMDSizeCode::H,
+         SIMDSizeCode::S,
+         SIMDSizeCode::D,
+     ];
+
+     // The variant is selected by `opcode<0>:R`. `S` is known to be zero at this
+     // point, so it cannot be used to tell LD1R/LD2R (or LD3R/LD4R) apart.
+     let opc_idx = ((opcode_bits & 0x01) << 1) | R;
+     inst.opcode = REPLICATE_OPCODES[opc_idx as usize];
+     inst.operands = [
+         // `opc_idx` is zero-based; the group holds `opc_idx + 1` registers.
+         Operand::SIMDRegisterGroup(datasize, Rt as u16, SIZES[size as usize], opc_idx as u8 + 1),
+         if Rm == 31 {
+             // Rm == 31 selects the immediate form: advance by the bytes loaded,
+             // i.e. (number of registers) * (element size in bytes).
+             Operand::RegPostIndex(Rn as u16, ((opc_idx + 1) * (1 << size)) as i32)
+         } else {
+             Operand::RegPostIndexReg(Rn as u16, Rm as u16)
+         },
+         Operand::Nothing,
+         Operand::Nothing,
+     ];
+
+     return Ok(());
+ }
+
+ let mut scale = opcode_bits >> 1;
+ // let selem = (((opcode_bits & 1) << 1) | R) + 1;
+ // let mut replicate = false;
+ let opc_idx = (opcode_bits << 1) | R;
+
+ let (opcode, group_size, item_size) = OPCODES[opc_idx as usize]?;
+
+ let item_size = match item_size {
+ SIMDSizeCode::B => SIMDSizeCode::B,
+ SIMDSizeCode::H => {
+ if (size & 1) == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ SIMDSizeCode::H
+ }
+ SIMDSizeCode::S => {
+ if size >= 0b10 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ if size == 0b01 {
+ if S == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ scale = 3;
+ SIMDSizeCode::D
+ } else {
+ SIMDSizeCode::S
+ }
+ }
+ SIMDSizeCode::D => {
+ if L == 0 || S == 1 {
+ return Err(DecodeError::InvalidOperand);
+ }
+ // replicate = true;
+ SIMDSizeCode::D
+ }
+ other => other
+ };
+
+ let index = ((Q << 3) | (S << 2) | size) >> scale;
+
+ inst.opcode = if L == 0 {
+ opcode
+ } else {
+ if opcode == Opcode::ST1 {
+ Opcode::LD1
+ } else if opcode == Opcode::ST2 {
+ Opcode::LD2
+ } else if opcode == Opcode::ST3 {
+ Opcode::LD3
+ } else {
+ Opcode::LD4
+ }
+ };
+ inst.operands = [
+ Operand::SIMDRegisterGroupLane(Rt as u16, item_size, group_size, index as u8),
+ if Rm == 31 {
+ Operand::RegPostIndex(Rn as u16, (group_size as u16 * item_size.width()) as i32)
+ } else {
+ Operand::RegPostIndexReg(Rn as u16, Rm as u16)
+ },
+ Operand::Nothing,
+ Operand::Nothing,
+ ];
}
_ => {
inst.opcode = Opcode::Invalid;
diff --git a/test/armv8/a64.rs b/test/armv8/a64.rs
index 69039e7..8de1f99 100644
--- a/test/armv8/a64.rs
+++ b/test/armv8/a64.rs
@@ -2749,9 +2749,9 @@ fn test_openblas_simd_loadstore() {
([0xa4, 0x89, 0x40, 0x0c], "ld2 {v4.2s, v5.2s}, [x13]"),
([0xa2, 0xa9, 0x40, 0x0c], "ld1 {v2.2s, v3.2s}, [x13]"),
([0xac, 0xa9, 0x40, 0x0c], "ld1 {v12.2s, v13.2s}, [x13]"),
- ([0xa5, 0x79, 0x9f, 0x0c], "st1 {v5.2s}, [x13], 8"),
+ ([0xa5, 0x79, 0x9f, 0x0c], "st1 {v5.2s}, [x13], 0x8"),
([0x45, 0x79, 0xc2, 0x0c], "ld1 {v5.2s}, [x10], x2"),
- ([0x20, 0x78, 0xdf, 0x0c], "ld1 {v0.2s}, [x1], 8"),
+ ([0x20, 0x78, 0xdf, 0x0c], "ld1 {v0.2s}, [x1], 0x8"),
([0xcc, 0x85, 0x00, 0x0d], "st1 {v12.d}[0], [x14]"),
([0xa8, 0x91, 0x00, 0x0d], "st1 {v8.s}[1], [x13]"),
([0xa0, 0x81, 0x20, 0x0d], "st2 {v0.s, v1.s}[0], [x13]"),
@@ -2762,7 +2762,7 @@ fn test_openblas_simd_loadstore() {
([0x64, 0x90, 0x9f, 0x0d], "st1 {v4.s}[1], [x3], 0x4"),
([0x22, 0x84, 0xc2, 0x0d], "ld1 {v2.d}[0], [x1], x2"),
([0x61, 0x80, 0xc4, 0x0d], "ld1 {v1.s}[0], [x3], x4"),
- ([0x24, 0xc9, 0xdf, 0x0d], "ld1r {v4.2s}, [x9], 0x4"),
+ ([0x24, 0xc9, 0xdf, 0x0d], "ld1r {v4.2s}, [x9], 0x4"), // TODO: could use a test for "ld1r {v4.2s}, [x9]"
([0x88, 0x28, 0x00, 0x4c], "st1 {v8.4s-v11.4s}, [x4]"),
([0x60, 0x2d, 0x00, 0x4c], "st1 {v0.2d-v3.2d}, [x11]"),
([0x9c, 0x2e, 0x00, 0x4c], "st1 {v28.2d-v31.2d}, [x20]"),
@@ -2801,22 +2801,22 @@ fn test_openblas_simd_loadstore() {
([0xa4, 0x8d, 0x40, 0x4c], "ld2 {v4.2d, v5.2d}, [x13]"),
([0xa6, 0x8d, 0x40, 0x4c], "ld2 {v6.2d, v7.2d}, [x13]"),
([0xa3, 0x7c, 0x86, 0x4c], "st1 {v3.2d}, [x5], x6"),
- ([0x61, 0x2c, 0x9f, 0x4c], "st1 {v1.2d-v4.2d}, [x3], 64"),
- ([0xb0, 0x2c, 0x9f, 0x4c], "st1 {v16.2d-v19.2d}, [x5], 64"),
+ ([0x61, 0x2c, 0x9f, 0x4c], "st1 {v1.2d-v4.2d}, [x3], 0x40"),
+ ([0xb0, 0x2c, 0x9f, 0x4c], "st1 {v16.2d-v19.2d}, [x5], 0x40"),
([0x24, 0x78, 0x9f, 0x4c], "st1 {v4.4s}, [x1], 0x10"),
([0xa5, 0x7d, 0x9f, 0x4c], "st1 {v5.2d}, [x13], 0x10"),
- ([0xa4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x5], 32"),
- ([0xc4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x6], 32"),
- ([0xb0, 0xad, 0x9f, 0x4c], "st1 {v16.2d, v17.2d}, [x13], 32"),
+ ([0xa4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x5], 0x20"),
+ ([0xc4, 0x88, 0x9f, 0x4c], "st2 {v4.4s, v5.4s}, [x6], 0x20"),
+ ([0xb0, 0xad, 0x9f, 0x4c], "st1 {v16.2d, v17.2d}, [x13], 0x20"),
([0x20, 0x7c, 0xc2, 0x4c], "ld1 {v0.2d}, [x1], x2"),
([0x46, 0x7d, 0xc6, 0x4c], "ld1 {v6.2d}, [x10], x6"),
- ([0x20, 0x0c, 0xdf, 0x4c], "ld4 {v0.2d-v3.2d}, [x1], 64"),
- ([0x51, 0x2d, 0xdf, 0x4c], "ld1 {v17.2d-v20.2d}, [x10], 64"),
+ ([0x20, 0x0c, 0xdf, 0x4c], "ld4 {v0.2d-v3.2d}, [x1], 0x40"),
+ ([0x51, 0x2d, 0xdf, 0x4c], "ld1 {v17.2d-v20.2d}, [x10], 0x40"),
([0x20, 0x78, 0xdf, 0x4c], "ld1 {v0.4s}, [x1], 0x10"),
([0x21, 0x78, 0xdf, 0x4c], "ld1 {v1.4s}, [x1], 0x10"),
([0x46, 0x7d, 0xdf, 0x4c], "ld1 {v6.2d}, [x10], 0x10"),
- ([0x20, 0x88, 0xdf, 0x4c], "ld2 {v0.4s, v1.4s}, [x1], 32"),
- ([0x50, 0xad, 0xdf, 0x4c], "ld1 {v16.2d, v17.2d}, [x10], 32"),
+ ([0x20, 0x88, 0xdf, 0x4c], "ld2 {v0.4s, v1.4s}, [x1], 0x20"),
+ ([0x50, 0xad, 0xdf, 0x4c], "ld1 {v16.2d, v17.2d}, [x10], 0x20"),
([0xa8, 0x85, 0x00, 0x4d], "st1 {v8.d}[1], [x13]"),
([0xac, 0x85, 0x00, 0x4d], "st1 {v12.d}[1], [x13]"),
([0xec, 0x85, 0x00, 0x4d], "st1 {v12.d}[1], [x15]"),
@@ -2824,8 +2824,8 @@ fn test_openblas_simd_loadstore() {
([0xa8, 0x85, 0x40, 0x4d], "ld1 {v8.d}[1], [x13]"),
([0xec, 0x85, 0x40, 0x4d], "ld1 {v12.d}[1], [x15]"),
([0x64, 0x84, 0x84, 0x4d], "st1 {v4.d}[1], [x3], x4"),
- ([0x64, 0x84, 0x9f, 0x4d], "st1 {v4.d}[1], [x3], 8"),
- ([0x24, 0xcd, 0xdf, 0x4d], "ld1r {v4.2d}, [x9], 8"),
+ ([0x64, 0x84, 0x9f, 0x4d], "st1 {v4.d}[1], [x3], 0x8"),
+ ([0x24, 0xcd, 0xdf, 0x4d], "ld1r {v4.2d}, [x9], 0x8"),
([0x60, 0x04, 0x81, 0x3c], "str q0, [x3], 0x10"),
([0x61, 0x00, 0x9f, 0x3c], "stur q1, [x3, -0x10]"),
([0xa0, 0x00, 0x9f, 0x3c], "stur q0, [x5, -0x10]"),