From a0fd5a24cb0aa0b697f680c451d928cefe8323b4 Mon Sep 17 00:00:00 2001
From: iximeow <me@iximeow.net>
Date: Thu, 21 May 2020 23:09:39 -0700
Subject: add sha, lzcnt, tsx, f16c, svm, movbe, adx, and prefetchw extensions

also add builders to get decoders appropriate for specific
microarchitectures from intel and amd
* low-power architectures are not yet present
---
 src/long_mode/display.rs |  48 +++++
 src/long_mode/mod.rs     | 514 +++++++++++++++++++++++++++++++++++++++++++++--
 src/long_mode/uarch.rs   | 221 ++++++++++++++++++++
 3 files changed, 763 insertions(+), 20 deletions(-)
 create mode 100644 src/long_mode/uarch.rs

(limited to 'src/long_mode')
diff --git a/src/long_mode/display.rs b/src/long_mode/display.rs
index 49d1600..5318ebb 100644
--- a/src/long_mode/display.rs
+++ b/src/long_mode/display.rs
@@ -1076,6 +1076,30 @@ impl fmt::Display for Opcode {
             &Opcode::HSUBPD => write!(f, "hsubpd"),
             &Opcode::HADDPD => write!(f, "haddpd"),
             &Opcode::ADDSUBPD => write!(f, "addsubpd"),
+            &Opcode::XABORT => write!(f, "xabort"),
+            &Opcode::XBEGIN => write!(f, "xbegin"),
+            &Opcode::RDSEED => write!(f, "rdseed"),
+            &Opcode::LZCNT => write!(f, "lzcnt"),
+            &Opcode::CLGI => write!(f, "clgi"),
+            &Opcode::STGI => write!(f, "stgi"),
+            &Opcode::SKINIT => write!(f, "skinit"),
+            &Opcode::VMLOAD => write!(f, "vmload"),
+            &Opcode::VMMCALL => write!(f, "vmmcall"),
+            &Opcode::VMSAVE => write!(f, "vmsave"),
+            &Opcode::VMRUN => write!(f, "vmrun"),
+            &Opcode::INVLPGA => write!(f, "invlpga"),
+            &Opcode::MOVBE => write!(f, "movbe"),
+            &Opcode::ADCX => write!(f, "adcx"),
+            &Opcode::ADOX => write!(f, "adox"),
+            &Opcode::PREFETCHW => write!(f, "prefetchw"),
+            &Opcode::RDRAND => write!(f, "rdrand"),
+            &Opcode::SHA1RNDS4 => write!(f, "sha1rnds4"),
+            &Opcode::SHA1NEXTE => write!(f, "sha1nexte"),
+            &Opcode::SHA1MSG1 => write!(f, "sha1msg1"),
+            &Opcode::SHA1MSG2 => write!(f, "sha1msg2"),
+            &Opcode::SHA256RNDS2 => write!(f, "sha256rnds2"),
+            &Opcode::SHA256MSG1 => write!(f, "sha256msg1"),
+            &Opcode::SHA256MSG2 => write!(f, "sha256msg2"),
             &Opcode::Invalid => write!(f, "invalid"),
         }
     }
@@ -1306,8 +1330,11 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::LEA |
             Opcode::ADD |
             Opcode::ADC |
+            Opcode::ADCX |
+            Opcode::ADOX |
             Opcode::SUB |
             Opcode::POPCNT |
+            Opcode::LZCNT |
             Opcode::BT |
             Opcode::BTS |
             Opcode::BTR |
@@ -1399,6 +1426,7 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::PREFETCH0 |
             Opcode::PREFETCH1 |
             Opcode::PREFETCH2 |
+            Opcode::PREFETCHW |
             Opcode::NOP => { write!(out, "{}", colors.nop_op(self)) }
 
             /* Control flow */
@@ -1680,6 +1708,7 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::PEXTRW |
             Opcode::PINSRW |
             Opcode::MOV |
+            Opcode::MOVBE |
             Opcode::LODS |
             Opcode::STOS |
             Opcode::LAHF |
@@ -1830,6 +1859,7 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::SWAPGS |
             Opcode::RDTSCP |
             Opcode::INVLPG |
+            Opcode::INVLPGA |
             Opcode::CPUID |
             Opcode::WBINVD |
             Opcode::INVD |
@@ -1860,9 +1890,16 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::VMCALL |
             Opcode::VMLAUNCH |
             Opcode::VMRESUME |
+            Opcode::VMLOAD |
+            Opcode::VMMCALL |
+            Opcode::VMSAVE |
+            Opcode::VMRUN |
             Opcode::VMXOFF |
             Opcode::MONITOR |
             Opcode::MWAIT |
+            Opcode::SKINIT |
+            Opcode::CLGI |
+            Opcode::STGI |
             Opcode::CLAC |
             Opcode::STAC |
             Opcode::ENCLS |
@@ -1872,11 +1909,22 @@ impl <T: fmt::Write, Color: fmt::Display, Y: YaxColors<Color>> Colorize<T, Color
             Opcode::VMFUNC |
             Opcode::XEND |
             Opcode::XTEST |
+            Opcode::XABORT |
+            Opcode::XBEGIN |
             Opcode::ENCLU |
             Opcode::RDPKRU |
             Opcode::WRPKRU |
             Opcode::LAR => { write!(out, "{}", colors.platform_op(self)) }
 
+            Opcode::RDSEED |
+            Opcode::RDRAND |
+            Opcode::SHA1RNDS4 |
+            Opcode::SHA1NEXTE |
+            Opcode::SHA1MSG1 |
+            Opcode::SHA1MSG2 |
+            Opcode::SHA256RNDS2 |
+            Opcode::SHA256MSG1 |
+            Opcode::SHA256MSG2 |
             Opcode::AESDEC |
             Opcode::AESDECLAST |
             Opcode::AESENC |
diff --git a/src/long_mode/mod.rs b/src/long_mode/mod.rs
index e0a1fdf..20abe1f 100644
--- a/src/long_mode/mod.rs
+++ b/src/long_mode/mod.rs
@@ -1,5 +1,6 @@
 mod vex;
 mod display;
+pub mod uarch;
 
 use core::hint::unreachable_unchecked;
 
@@ -73,7 +74,6 @@ impl RegSpec {
 
     #[inline]
     fn gp_from_parts(num: u8, extended: bool, width: u8, rex: bool) -> RegSpec {
-//        println!("from_parts width: {}, num: {}, extended: {}", width, num, extended);
         RegSpec {
             num: num + if extended { 0b1000 } else { 0 },
             bank: width_to_gp_reg_bank(width, rex)
@@ -129,6 +129,22 @@ impl RegSpec {
     }
 
     #[inline]
+    pub fn esp() -> RegSpec {
+        RegSpec {
+            num: 4,
+            bank: RegisterBank::D
+        }
+    }
+
+    #[inline]
+    pub fn sp() -> RegSpec {
+        RegSpec {
+            num: 4,
+            bank: RegisterBank::W
+        }
+    }
+
+    #[inline]
     pub fn fs() -> RegSpec {
         RegSpec { bank: RegisterBank::S, num: 3 }
     }
@@ -779,12 +795,17 @@ pub enum Opcode {
     XGETBV,
     XSETBV,
     VMFUNC,
+    XABORT,
+    XBEGIN,
     XEND,
     XTEST,
     ENCLU,
     RDPKRU,
     WRPKRU,
 
+    RDSEED,
+    RDRAND,
+
     ADDPS,
     ADDPD,
     ANDNPS,
@@ -1330,6 +1351,31 @@ pub enum Opcode {
     PHADDW,
     HSUBPD,
     HADDPD,
+
+    SHA1RNDS4,
+    SHA1NEXTE,
+    SHA1MSG1,
+    SHA1MSG2,
+    SHA256RNDS2,
+    SHA256MSG1,
+    SHA256MSG2,
+
+    LZCNT,
+    CLGI,
+    STGI,
+    SKINIT,
+    VMLOAD,
+    VMMCALL,
+    VMSAVE,
+    VMRUN,
+    INVLPGA,
+
+    MOVBE,
+
+    ADCX,
+    ADOX,
+
+    PREFETCHW,
 }
 
 #[derive(Debug)]
@@ -1500,6 +1546,14 @@ pub struct InstDecoder {
     // 53. intel quirks
     // 54. amd quirks
     // 55. avx (intel ?, amd ?)
+    // 56. amd-v/svm
+    // 57. lahfsahf
+    // 58. cmov
+    // 59. f16c
+    // 60. fma4
+    // 61. prefetchw
+    // 62. tsx
+    // 63. lzcnt
     flags: u64,
 }
 
@@ -1586,6 +1640,12 @@ impl InstDecoder {
         self
     }
 
+    pub fn with_sse4(self) -> Self {
+        self
+            .with_sse4_1()
+            .with_sse4_2()
+    }
+
     pub fn movbe(&self) -> bool {
         self.flags & (1 << 8) != 0
     }
@@ -1658,6 +1718,9 @@ impl InstDecoder {
         self
     }
 
+    /// `bmi2` indicates support for the `BZHI`, `MULX`, `PDEP`, `PEXT`, `RORX`, `SARX`, `SHRX`,
+    /// and `SHLX` instructions. `bmi2` is implemented in all x86_64 chips that implement `bmi`,
+    /// except the amd `piledriver` and `steamroller` microarchitectures.
     pub fn bmi2(&self) -> bool {
         self.flags & (1 << 16) != 0
     }
@@ -2018,6 +2081,94 @@ impl InstDecoder {
         self
     }
 
+    pub fn svm(&self) -> bool {
+        self.flags & (1 << 56) != 0
+    }
+
+    pub fn with_svm(mut self) -> Self {
+        self.flags |= 1 << 56;
+        self
+    }
+
+    /// `lahfsahf` is only unset for early revisions of 64-bit amd and intel chips. unfortunately
+    /// the clearest documentation on when these instructions were reintroduced into 64-bit
+    /// architectures seems to be
+    /// [wikipedia](https://en.wikipedia.org/wiki/X86-64#Older_implementations):
+    /// ```text
+    /// Early AMD64 and Intel 64 CPUs lacked LAHF and SAHF instructions in 64-bit mode. AMD
+    /// introduced these instructions (also in 64-bit mode) with their Athlon 64, Opteron and
+    /// Turion 64 revision D processors in March 2005[48][49][50] while Intel introduced the
+    /// instructions with the Pentium 4 G1 stepping in December 2005. The 64-bit version of Windows
+    /// 8.1 requires this feature.[47]
+    /// ```
+    ///
+    /// this puts reintroduction of these instructions somewhere in the middle of prescott and k8
+    /// lifecycles, for intel and amd respectively. because there is no specific uarch where these
+    /// features become enabled, prescott and k8 default to not supporting these instructions,
+    /// while later uarches support these instructions.
+    pub fn lahfsahf(&self) -> bool {
+        self.flags & (1 << 57) != 0
+    }
+
+    pub fn with_lahfsahf(mut self) -> Self {
+        self.flags |= 1 << 57;
+        self
+    }
+
+    pub fn cmov(&self) -> bool {
+        self.flags & (1 << 58) != 0
+    }
+
+    pub fn with_cmov(mut self) -> Self {
+        self.flags |= 1 << 58;
+        self
+    }
+
+    pub fn f16c(&self) -> bool {
+        self.flags & (1 << 59) != 0
+    }
+
+    pub fn with_f16c(mut self) -> Self {
+        self.flags |= 1 << 59;
+        self
+    }
+
+    pub fn fma4(&self) -> bool {
+        self.flags & (1 << 60) != 0
+    }
+
+    pub fn with_fma4(mut self) -> Self {
+        self.flags |= 1 << 60;
+        self
+    }
+
+    pub fn prefetchw(&self) -> bool {
+        self.flags & (1 << 61) != 0
+    }
+
+    pub fn with_prefetchw(mut self) -> Self {
+        self.flags |= 1 << 61;
+        self
+    }
+
+    pub fn tsx(&self) -> bool {
+        self.flags & (1 << 62) != 0
+    }
+
+    pub fn with_tsx(mut self) -> Self {
+        self.flags |= 1 << 62;
+        self
+    }
+
+    pub fn lzcnt(&self) -> bool {
+        self.flags & (1 << 63) != 0
+    }
+
+    pub fn with_lzcnt(mut self) -> Self {
+        self.flags |= 1 << 63;
+        self
+    }
+
     /// Optionally reject or reinterpret instruction according to the decoder's
     /// declared extensions.
     fn revise_instruction(&self, inst: &mut Instruction) -> Result<(), DecodeError> {
@@ -2150,21 +2301,15 @@ impl InstDecoder {
                     return Err(DecodeError::InvalidOpcode);
                 }
             }
-            // AVX...
-            /* // TODO
             Opcode::XABORT |
-            Opcode::XACQUIRE |
-            Opcode::XRELEASE |
             Opcode::XBEGIN |
             Opcode::XEND |
             Opcode::XTEST => {
                 if !self.tsx() {
                     inst.opcode = Opcode::Invalid;
-                    return Err(());
+                    return Err(DecodeError::InvalidOpcode);
                 }
             }
-            */
-            /* // TODO
             Opcode::SHA1MSG1 |
             Opcode::SHA1MSG2 |
             Opcode::SHA1NEXTE |
@@ -2174,9 +2319,9 @@ impl InstDecoder {
             Opcode::SHA256RNDS2 => {
                 if !self.sha() {
                     inst.opcode = Opcode::Invalid;
-                    return Err(());
+                    return Err(DecodeError::InvalidOpcode);
                 }
-            }*/
+            }
             Opcode::ENCLV |
             Opcode::ENCLS |
             Opcode::ENCLU => {
@@ -2185,6 +2330,7 @@ impl InstDecoder {
                     return Err(DecodeError::InvalidOpcode);
                 }
             }
+            // AVX...
             Opcode::VMOVDDUP |
             Opcode::VPSHUFLW |
             Opcode::VHADDPS |
@@ -2216,7 +2362,6 @@ impl InstDecoder {
             Opcode::VCVTDQ2PD |
             Opcode::VCVTDQ2PS |
             Opcode::VCVTPD2PS |
-            Opcode::VCVTPH2PS |
             Opcode::VCVTPS2DQ |
             Opcode::VCVTPS2PD |
             Opcode::VCVTSS2SD |
@@ -2224,7 +2369,6 @@ impl InstDecoder {
             Opcode::VCVTSI2SD |
             Opcode::VCVTSD2SI |
             Opcode::VCVTSD2SS |
-            Opcode::VCVTPS2PH |
             Opcode::VCVTSS2SI |
             Opcode::VCVTTPD2DQ |
             Opcode::VCVTTPS2DQ |
@@ -2527,6 +2671,124 @@ impl InstDecoder {
                     return Err(DecodeError::InvalidOpcode);
                 }
             }
+            Opcode::MOVBE => {
+                if !self.movbe() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::POPCNT => {
+                /*
+                 * from the intel SDM:
+                 * ```
+                 * Before an application attempts to use the POPCNT instruction, it must check that
+                 * the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1) and POPCNT
+                 * (if CPUID.01H:ECX.POPCNT[bit 23] = 1).
+                 * ```
+                 */
+                if self.intel_quirks() && (!self.sse4_2() || !self.popcnt()) {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                } else if !self.popcnt() {
+                    /*
+                     * elsewhere from the amd APM:
+                     * `Instruction Subsets and CPUID Feature Flags` on page 507 indicates that
+                     * popcnt is present when the popcnt bit is reported by cpuid. this seems to be
+                     * the less quirky default, so `intel_quirks` is considered the outlier, and
+                     * before this default.
+                     * */
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::LZCNT => {
+                /*
+                 * amd APM, `LZCNT` page 212:
+                 * LZCNT is an Advanced Bit Manipulation (ABM) instruction. Support for the LZCNT
+                 * instruction is indicated by CPUID Fn8000_0001_ECX[ABM] = 1.
+                 *
+                 * meanwhile the intel SDM simply states:
+                 * ```
+                 * CPUID.EAX=80000001H:ECX.LZCNT[bit 5]: if 1 indicates the processor supports the
+                 * LZCNT instruction.
+                 * ```
+                 *
+                 * so that's considered the less-quirky (default) case here.
+                 * */
+                if self.amd_quirks() && !self.abm() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                } else if !self.lzcnt() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::ADCX |
+            Opcode::ADOX => {
+                if !self.adx() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::VMRUN |
+            Opcode::VMLOAD |
+            Opcode::VMSAVE |
+            Opcode::CLGI |
+            Opcode::VMMCALL |
+            Opcode::INVLPGA => {
+                if !self.svm() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::STGI |
+            Opcode::SKINIT => {
+                if !self.svm() || !self.skinit() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::LAHF |
+            Opcode::SAHF => {
+                if !self.lahfsahf() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::VCVTPS2PH |
+            Opcode::VCVTPH2PS => {
+                /*
+                 * from intel SDM:
+                 * ```
+                 * 14.4.1 Detection of F16C Instructions Application using float 16 instruction
+                 *    must follow a detection sequence similar to AVX to ensure: • The OS has
+                 *    enabled YMM state management support, • The processor support AVX as
+                 *    indicated by the CPUID feature flag, i.e. CPUID.01H:ECX.AVX[bit 28] = 1.  •
+                 *    The processor support 16-bit floating-point conversion instructions via a
+                 *    CPUID feature flag (CPUID.01H:ECX.F16C[bit 29] = 1).
+                 * ```
+                 *
+                 * TODO: only the VEX-coded variant of this instruction should be gated on `f16c`.
+                 * the EVEX-coded variant should be gated on `avx512f` or `avx512vl` if not
+                 * EVEX.512-coded.
+                 */
+                if !self.avx() || !self.f16c() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::RDRAND => {
+                if !self.rdrand() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
+            Opcode::RDSEED => {
+                if !self.rdseed() {
+                    inst.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            }
             other => {
                 if !self.bmi1() {
                     if BMI1.contains(&other) {
@@ -2871,11 +3133,13 @@ impl PrefixRex {
 pub enum OperandCode {
     ModRM_0x0f00,
     ModRM_0x0f01,
+    ModRM_0x0f0d,
     ModRM_0x0fae,
     ModRM_0x0fba,
     ModRM_0xf238,
     ModRM_0xf30fc7,
     ModRM_0x660f38,
+    ModRM_0xf30f38,
     ModRM_0x660f3a,
     CVT_AA,
     CVT_DA,
@@ -3686,7 +3950,7 @@ const OPCODE_F30F_MAP: [OpcodeRecord; 256] = [
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
-    OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
+    OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::ModRM_0xf30f38),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
@@ -3827,7 +4091,7 @@ const OPCODE_F30F_MAP: [OpcodeRecord; 256] = [
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
-    OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
+    OpcodeRecord(Interpretation::Instruction(Opcode::LZCNT), OperandCode::Gv_Ev),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
 // 0xc0
@@ -3950,7 +4214,7 @@ const OPCODE_0F_MAP: [OpcodeRecord; 256] = [
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::UD2), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
-    OpcodeRecord(Interpretation::Instruction(Opcode::NOP), OperandCode::Ev),
+    OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::ModRM_0x0f0d),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
     OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing),
 // 0x10
@@ -5104,8 +5368,28 @@ fn read_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter: T,
             };
             instruction.operand_count = 2;
         },
-        _op @ OperandCode::ModRM_0xc6_Eb_Ib |
-        _op @ OperandCode::ModRM_0xc7_Ev_Iv => {
+        op @ OperandCode::ModRM_0xc6_Eb_Ib |
+        op @ OperandCode::ModRM_0xc7_Ev_Iv => {
+            if modrm == 0xf8 {
+                if op == OperandCode::ModRM_0xc6_Eb_Ib {
+                    instruction.opcode = Opcode::XABORT;
+                    instruction.imm = read_imm_signed(&mut bytes_iter, 1, length)? as u64;
+                    instruction.operands[0] = OperandSpec::ImmI8;
+                    instruction.operand_count = 1;
+                    return Ok(());
+                } else {
+                    instruction.opcode = Opcode::XBEGIN;
+                    instruction.disp = if opwidth == 2 {
+                        read_imm_signed(&mut bytes_iter, 2, length)? as i16 as i64 as u64
+                    } else {
+                        read_imm_signed(&mut bytes_iter, 4, length)? as i32 as i64 as u64
+                    };
+                    instruction.modrm_mmm = RegSpec::rip();
+                    instruction.operands[0] = OperandSpec::RegDisp;
+                    instruction.operand_count = 1;
+                    return Ok(());
+                }
+            }
             if (modrm & 0b00111000) != 0 {
                 instruction.opcode = Opcode::Invalid;
                 return Err(DecodeError::InvalidOperand); // Err("Invalid modr/m for opcode 0xc7".to_string());
@@ -5490,6 +5774,115 @@ fn unlikely_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter
                 instruction.opcode = Opcode::MOVD;
             }
         }
+        OperandCode::ModRM_0x0f0d => {
+            let modrm = read_modrm(&mut bytes_iter, length)?;
+            // `0f 0d /1` is PREFETCHW; the `/digit` is ModRM.reg (bits 5:3), not ModRM.rm.
+            let r = (modrm >> 3) & 0b111;
+            let opwidth = imm_width_from_prefixes_64(SizeCode::vq, instruction.prefixes);
+
+            match r {
+                1 => {
+                    instruction.opcode = Opcode::PREFETCHW;
+                }
+                _ => {
+                    instruction.opcode = Opcode::NOP; // as before this arm existed: `nop Ev`
+                }
+            }
+            instruction.operands[0] = read_E(&mut bytes_iter, instruction, modrm, opwidth, length)?;
+            instruction.operand_count = 1;
+        }
+        OperandCode::ModRM_0x0f38 => {
+            let opcode = read_modrm(&mut bytes_iter, length)?;
+
+            let high = opcode >> 4;
+            let low = opcode & 0xf;
+
+            let operands = match high {
+                0 => {
+                    // PqQq
+                    OperandCode::G_E_mm
+                },
+                1 => {
+                    // PqQq
+                    OperandCode::G_E_mm
+                },
+                0xc => {
+                    // Vdq,Wdq
+                    OperandCode::G_E_xmm
+                }
+                0xf => {
+                    match low {
+                        0 => OperandCode::Gv_Ev,
+                        1 => OperandCode::Ev_Gv,
+                        _ => {
+                            instruction.opcode = Opcode::Invalid;
+                            return Err(DecodeError::InvalidOpcode);
+                        }
+                    }
+                }
+                _ => {
+                    instruction.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            };
+            instruction.opcode = match opcode {
+                0xc8 => Opcode::SHA1NEXTE,
+                0xc9 => Opcode::SHA1MSG1,
+                0xca => Opcode::SHA1MSG2,
+                0xcb => Opcode::SHA256RNDS2,
+                0xcc => Opcode::SHA256MSG1,
+                0xcd => Opcode::SHA256MSG2,
+                0xf0 | 0xf1 => Opcode::MOVBE,
+                _ => {
+                    instruction.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            };
+
+            return read_operands(decoder, bytes_iter, instruction, operands, length);
+        },
+        OperandCode::ModRM_0x0f3a => {
+        },
+        OperandCode::ModRM_0x0fc7 => {
+            let modrm = read_modrm(&mut bytes_iter, length)?;
+            if modrm >> 6 == 0b11 { // only register-direct (mod == 0b11) forms are decoded here
+                match (modrm >> 3) & 0b111 {
+                    0b111 => { // 0f c7 /7
+                        instruction.opcode = Opcode::RDSEED;
+                        instruction.operand_count = 1;
+                        instruction.operands[0] = OperandSpec::RegRRR;
+                        let opwidth = imm_width_from_prefixes_64(SizeCode::vq, instruction.prefixes);
+                        instruction.modrm_rrr = // register comes from ModRM.rm, which REX.B extends
+                            RegSpec::from_parts(modrm & 7, instruction.prefixes.rex().b(), match opwidth {
+                                8 => RegisterBank::Q,
+                                4 => RegisterBank::D,
+                                2 => RegisterBank::W,
+                                _ => unreachable!()
+                            });
+                    }
+                    0b110 => { // 0f c7 /6
+                        instruction.opcode = Opcode::RDRAND;
+                        instruction.operand_count = 1;
+                        instruction.operands[0] = OperandSpec::RegRRR;
+                        let opwidth = imm_width_from_prefixes_64(SizeCode::vq, instruction.prefixes);
+                        instruction.modrm_rrr = // register comes from ModRM.rm, which REX.B extends
+                            RegSpec::from_parts(modrm & 7, instruction.prefixes.rex().b(), match opwidth {
+                                8 => RegisterBank::Q,
+                                4 => RegisterBank::D,
+                                2 => RegisterBank::W,
+                                _ => unreachable!()
+                            });
+                    }
+                    _ => {
+                        instruction.opcode = Opcode::Invalid;
+                        return Err(DecodeError::InvalidOpcode);
+                    }
+                }
+            } else {
+                instruction.opcode = Opcode::Invalid;
+                return Err(DecodeError::InvalidOpcode);
+            }
+        },
         OperandCode::ModRM_0x0f71 => {
             instruction.operand_count = 2;
 
@@ -5604,6 +5997,19 @@ fn unlikely_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter
             instruction.operands[1] = read_E_xmm(&mut bytes_iter, instruction, modrm, length)?;
             instruction.operand_count = 2;
         }
+        OperandCode::ModRM_0xf30f38 => {
+            let op = bytes_iter.next().ok_or(DecodeError::ExhaustedInput).map(|b| { *length += 1; b })?;
+            match op {
+                0xf6 => {
+                    instruction.opcode = Opcode::ADOX;
+                    return read_operands(decoder, bytes_iter, instruction, OperandCode::Gv_Ev, length);
+                }
+                _ => {
+                    instruction.opcode = Opcode::Invalid;
+                    return Err(DecodeError::InvalidOpcode);
+                }
+            };
+        }
         OperandCode::ModRM_0x660f38 => {
             let op = bytes_iter.next().ok_or(DecodeError::ExhaustedInput).map(|b| { *length += 1; b })?;
             match op {
@@ -5612,6 +6018,10 @@ fn unlikely_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter
                 0xdd => { instruction.opcode = Opcode::AESENCLAST; }
                 0xde => { instruction.opcode = Opcode::AESDEC; }
                 0xdf => { instruction.opcode = Opcode::AESDECLAST; }
+                0xf6 => {
+                    instruction.opcode = Opcode::ADCX;
+                    return read_operands(decoder, bytes_iter, instruction, OperandCode::Gv_Ev, length);
+                }
                 _ => {
                     instruction.opcode = Opcode::Invalid;
                     return Err(DecodeError::InvalidOpcode);
@@ -5630,6 +6040,21 @@ fn unlikely_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter
         OperandCode::ModRM_0x660f3a => {
             let op = bytes_iter.next().ok_or(DecodeError::ExhaustedInput).map(|b| { *length += 1; b })?;
             match op {
+                0xcc => {
+                    instruction.opcode = Opcode::SHA1RNDS4;
+
+                    let modrm = read_modrm(&mut bytes_iter, length)?;
+                    instruction.modrm_rrr =
+                        RegSpec::from_parts((modrm >> 3) & 7, instruction.prefixes.rex().r(), RegisterBank::X);
+
+
+                    instruction.operands[0] = OperandSpec::RegRRR;
+                    instruction.operands[1] = read_E_xmm(&mut bytes_iter, instruction, modrm, length)?;
+                    instruction.imm =
+                        read_imm_unsigned(&mut bytes_iter, 1, length)?;
+                    instruction.operands[2] = OperandSpec::ImmU8;
+                    instruction.operand_count = 3;
+                }
                 0xdf => {
                     instruction.opcode = Opcode::AESKEYGENASSIST;
                     // read operands right here right now
@@ -6214,10 +6639,59 @@ fn unlikely_operands<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter
                 }
             } else if r == 3 {
                 let mod_bits = modrm >> 6;
+                let m = modrm & 7;
                 if mod_bits == 0b11 {
-                    instruction.opcode = Opcode::Invalid;
-                    instruction.operand_count = 0;
-                    return Err(DecodeError::InvalidOperand);
+                    match m {
+                        0b000 => {
+                            instruction.opcode = Opcode::VMRUN;
+                            instruction.operand_count = 1;
+                            instruction.modrm_rrr = RegSpec::rax();
+                            instruction.operands[0] = OperandSpec::RegRRR;
+                        },
+                        0b001 => {
+                            instruction.opcode = Opcode::VMMCALL;
+                            instruction.operand_count = 0;
+                        },
+                        0b010 => {
+                            instruction.opcode = Opcode::VMLOAD;
+                            instruction.operand_count = 1;
+                            instruction.modrm_rrr = RegSpec::rax();
+                            instruction.operands[0] = OperandSpec::RegRRR;
+                        },
+                        0b011 => {
+                            instruction.opcode = Opcode::VMSAVE;
+                            instruction.operand_count = 1;
+                            instruction.modrm_rrr = RegSpec::rax();
+                            instruction.operands[0] = OperandSpec::RegRRR;
+                        },
+                        0b100 => {
+                            instruction.opcode = Opcode::STGI;
+                            instruction.operand_count = 0;
+                        },
+                        0b101 => {
+                            instruction.opcode = Opcode::CLGI;
+                            instruction.operand_count = 0;
+                        },
+                        0b110 => {
+                            instruction.opcode = Opcode::SKINIT;
+                            instruction.operand_count = 1;
+                            instruction.operands[0] = OperandSpec::RegRRR;
+                            instruction.modrm_rrr = RegSpec::eax();
+                        },
+                        0b111 => {
+                            instruction.opcode = Opcode::INVLPGA;
+                            instruction.operand_count = 2;
+                            instruction.operands[0] = OperandSpec::RegRRR;
+                            instruction.operands[1] = OperandSpec::RegMMM;
+                            instruction.modrm_rrr = RegSpec::rax();
+                            instruction.modrm_mmm = RegSpec::ecx();
+                        },
+                        _ => {
+                            instruction.opcode = Opcode::Invalid;
+                            instruction.operand_count = 0;
+                            return Err(DecodeError::InvalidOperand);
+                        }
+                    }
                 } else {
                     instruction.opcode = Opcode::LIDT;
                     instruction.operand_count = 1;
diff --git a/src/long_mode/uarch.rs b/src/long_mode/uarch.rs
new file mode 100644
index 0000000..b2b1201
--- /dev/null
+++ b/src/long_mode/uarch.rs
@@ -0,0 +1,221 @@
+pub mod amd {
+    //! most information about instruction set extensions for microarchitectures here was sourced
+    //! from https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview and
+    //! https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features. these mappings are best-effort
+    //! but fairly unused, so a critical eye should be kept towards these decoders rejecting
+    //! instructions they should not, or incorrectly accepting instructions.
+    //!
+    //! microarchitectures as defined here are with respect to flags reported by CPUID. notably,
+    //! `Zen` does not report `FMA4` support by `CPUID`, but instructions in that extension
+    //! reportedly function correctly (agner p217).
+    //!
+    //! [agner](https://www.agner.org/optimize/microarchitecture.pdf)
+    //! as retrieved 2020 may 19
+    //! `sha256: 87ff152ae18c017dcbfb9f7ee6e88a9f971f6250fd15a70a3dd87c3546323bd5`
+
+    use long_mode::InstDecoder;
+
+    /// `k8` was the first AMD microarchitecture to implement x86_64, launched in 2003. while later
+    /// `k8`-based processors supported SSE3, this decoder picks the lower end of support (SSE2
+    /// and no later): it returns `InstDecoder::minimal()` with no extensions enabled on top.
+    pub fn k8() -> InstDecoder {
+        InstDecoder::minimal()
+    }
+
+    /// `k10` was the successor to `k8`, launched in 2007. this decoder extends `k8()` with SSE3,
+    /// SSSE3, SSE4, SSE4.2, and SSE4a, plus `cmov`, `cmpxchg16b`, SVM, ABM, and `lahf`/`sahf`.
+    pub fn k10() -> InstDecoder {
+        k8()
+            .with_cmov()
+            .with_cmpxchg16b()
+            .with_svm()
+            .with_abm()
+            .with_lahfsahf()
+            .with_sse3()
+            .with_ssse3()
+            .with_sse4()
+            .with_sse4_2()
+            .with_sse4a()
+    }
+
+    /// `Bulldozer` was the successor to `K10`, launched in 2011. this decoder extends `k10()` with
+    /// AVX, `AESNI`, `PCLMULQDQ`, `F16C`, `BMI1`, `FMA4`, and `XOP`.
+    pub fn bulldozer() -> InstDecoder {
+        k10()
+            .with_bmi1()
+            .with_aesni()
+            .with_pclmulqdq()
+            .with_f16c()
+            .with_avx()
+            .with_fma4()
+            .with_xop()
+    }
+
+    /// `Piledriver` was the successor to `Bulldozer`, launched in 2012. this decoder extends
+    /// `bulldozer()` (which already enables `FMA4`) with `TBM` and `FMA3`.
+    pub fn piledriver() -> InstDecoder {
+        bulldozer()
+            .with_tbm()
+            .with_fma3()
+    }
+
+    /// `Steamroller` was the successor to `Piledriver`, launched in 2014. this decoder starts from
+    /// `bulldozer()`, leaving `TBM` and `FMA3` disabled. NOTE(review): verify against AMD docs.
+    pub fn steamroller() -> InstDecoder {
+        bulldozer()
+    }
+
+    /// `Excavator` was the successor to `Steamroller`, launched in 2015. this decoder extends
+    /// `steamroller()` with `MOVBE`, `BMI2`, `RDRAND`, `SHA`, and `AVX2`, each enabled once.
+    ///
+    /// `AVX` and `XOP` are not repeated here: `steamroller()` is `bulldozer()`, which already
+    /// enables both. NOTE(review): `zen()` docs say Zen gained `SHA`; confirm it belongs here.
+    pub fn excavator() -> InstDecoder {
+        steamroller()
+            .with_movbe()
+            .with_bmi2()
+            .with_rdrand()
+            .with_sha()
+            .with_avx2()
+    }
+
+    /// `Zen` was the successor to `Excavator`, launched in 2017. `Zen` cores discarded FMA4, TBM,
+    /// and XOP, so this decoder is built up from `k10()` rather than `excavator()`. they extend
+    /// SIMD support to AVX2 and also gained ADX, SHA, RDSEED, and other extensions.
+    pub fn zen() -> InstDecoder {
+        k10()
+            .with_avx()
+            .with_avx2()
+            .with_bmi1()
+            .with_aesni()
+            .with_pclmulqdq()
+            .with_f16c()
+            .with_movbe()
+            .with_bmi2()
+            .with_rdrand()
+            .with_adx()
+            .with_sha()
+            .with_rdseed()
+            .with_fma3()
+            // TODO: XSAVEC, XSAVES, XRSTORS, CLFLUSHOPT, CLZERO?
+    }
+}
+
+pub mod intel {
+    //! sourced by walking wikipedia pages. seriously! this stuff is kinda hard to figure out!
+
+    use long_mode::InstDecoder;
+
+    /// `Netburst` was the first Intel microarchitecture to implement x86_64, beginning with the
+    /// `Prescott` family launched in 2004. while the wider `Netburst` family launched in 2000
+    /// with only SSE2, the first `x86_64`-supporting incarnation was `Prescott` which indeed
+    /// included SSE3, so this decoder is `InstDecoder::minimal()` plus `cmov` and SSE3.
+    pub fn netburst() -> InstDecoder {
+        InstDecoder::minimal()
+            .with_cmov()
+            .with_sse3()
+    }
+
+    /// `Core` was the successor to `Netburst`, launched in 2006. this decoder extends `netburst()`
+    /// with SSSE3 and SSE4. processors using this architecture shipped as "Merom", "Conroe", and
+    /// "Woodcrest", for mobile, desktop, and server processors respectively. not to be confused
+    /// with the later `Nehalem` microarchitecture that introduced the `Core i*` product lines,
+    /// `Core 2 *` processors used the `Core` architecture.
+    pub fn core() -> InstDecoder {
+        netburst()
+            .with_ssse3()
+            .with_sse4()
+    }
+
+    /// `Penryn` (spelled `peryn` here) was the successor to `Core`, launched in early 2008. it
+    /// added SSE4.1 and virtualization extensions; this decoder only adds SSE4.1 to `core()`.
+    pub fn peryn() -> InstDecoder {
+        core()
+            .with_sse4_1()
+    }
+
+    /// `Nehalem` was the successor to `Penryn`, launched in late 2008. not to be confused with the
+    /// earlier `Core` microarchitecture, the `Core i*` products were based on `Nehalem` cores.
+    /// this decoder extends `peryn()` with SSE4.2 and the `POPCNT` instruction.
+    pub fn nehalem() -> InstDecoder {
+        peryn()
+            .with_sse4_2()
+            .with_popcnt()
+    }
+
+    /// `Westmere` was the successor to `Nehalem`, launched in 2010. this decoder extends
+    /// `nehalem()` with the AES-NI and CLMUL (`pclmulqdq`) extensions.
+    pub fn westmere() -> InstDecoder {
+        nehalem()
+            .with_aesni()
+            .with_pclmulqdq()
+    }
+
+    /// `Sandy Bridge` was the successor to `Westmere`, launched in 2011. this decoder extends
+    /// `westmere()` with AVX instructions.
+    pub fn sandybridge() -> InstDecoder {
+        westmere()
+            .with_avx()
+    }
+
+    /// `Ivy Bridge` was the successor to `Sandy Bridge`, launched in 2012. this decoder extends
+    /// `sandybridge()` with F16C (16-bit floating point conversion) and the RDRAND instruction.
+    pub fn ivybridge() -> InstDecoder {
+        sandybridge()
+            .with_f16c()
+            .with_rdrand()
+    }
+
+    /// `Haswell` was the successor to `Ivy Bridge`, launched in 2013. this decoder extends
+    /// `ivybridge()` with several instruction set extensions: AVX2, BMI1, BMI2, ABM, and FMA3.
+    pub fn haswell() -> InstDecoder {
+        ivybridge()
+            .with_bmi1()
+            .with_bmi2()
+            .with_abm()
+            .with_fma3()
+            .with_avx2()
+    }
+
+    /// `Haswell-EX` was a variant of `Haswell` launched in 2015 with functional TSX. these cores
+    /// were shipped as `E7-48xx/E7-88xx v3` models of processors. this is `haswell()` plus TSX.
+    pub fn haswell_ex() -> InstDecoder {
+        haswell()
+            .with_tsx()
+    }
+
+    /// `Broadwell` was the successor to `Haswell`, launched in late 2014. it added ADX, RDSEED,
+    /// and PREFETCHW, as well as broadly rolling out TSX. TSX is enabled on this decoder (by
+    /// starting from `haswell_ex()`) because some chips of this generation shipped with TSX,
+    /// and lack of TSX seems to be reported as an erratum (for example, the `Broadwell-Y` parts).
+    pub fn broadwell() -> InstDecoder {
+        haswell_ex()
+            .with_adx()
+            .with_rdseed()
+            .with_prefetchw()
+    }
+
+    /// `Skylake` was the successor to `Broadwell`, launched in mid 2015. it added MPX and SGX
+    /// extensions, as well as a mixed rollout of AVX512 in different subsets for different product
+    /// lines.
+    ///
+    /// AVX512 is not enabled on this decoder by default because there doesn't seem to be a lowest
+    /// common denominator: if you want a `Skylake` decoder with AVX512, something like the
+    /// following:
+    /// ```ignore
+    /// long_mode::uarch::intel::skylake().with_avx512_f().with_avx512_dq()
+    /// ```
+    /// is likely your best option.
+    pub fn skylake() -> InstDecoder {
+        broadwell()
+            .with_mpx()
+            .with_sgx()
+    }
+
+    /// `Kaby Lake` was the successor to `Skylake`, launched in 2016. it adds no extensions to the
+    /// x86_64 implementation beyond `skylake`, so this decoder is `skylake()` unchanged.
+    pub fn kabylake() -> InstDecoder {
+        skylake()
+    }
+    // ice lake is shipping so that should probably be included...
+}
-- 
cgit v1.1