From 016cb77445857b63b3c5ba3ea095c5a36a357fbd Mon Sep 17 00:00:00 2001 From: iximeow Date: Sun, 1 Jun 2025 23:51:42 +0000 Subject: 3dnow was still supported on K8, K10. 32-bit mode should learn about uarch tweaks too --- src/long_mode/uarch.rs | 39 ++++++++++---- src/protected_mode/uarch.rs | 129 +++++++++++++++++++++++++++++++++++++------- src/real_mode/uarch.rs | 129 +++++++++++++++++++++++++++++++++++++------- test/long_mode/mod.rs | 3 +- test/protected_mode/mod.rs | 25 ++++++--- 5 files changed, 271 insertions(+), 54 deletions(-) diff --git a/src/long_mode/uarch.rs b/src/long_mode/uarch.rs index 5af0175..63fa972 100644 --- a/src/long_mode/uarch.rs +++ b/src/long_mode/uarch.rs @@ -35,13 +35,15 @@ pub mod amd { /// support - SSE2 and no later. pub fn k8() -> InstDecoder { InstDecoder::minimal() + .with_3dnow() + .with_3dnowprefetch() + .with_cmov() } /// `k10` was the successor to `k8`, launched in 2007. `k10` cores extended SSE support through /// to SSE4.2a, as well as consistent `cmov` support, among other features. pub fn k10() -> InstDecoder { k8() - .with_cmov() .with_cmpxchg16b() .with_svm() .with_abm() @@ -51,9 +53,20 @@ pub mod amd { } /// `Bulldozer` was the successor to `K10`, launched in 2011. `Bulldozer` cores include AVX - /// support among other extensions, and are notable for including `AESNI`. + /// support among other extensions, and are notable for including `AESNI`. `Bulldozer` was also + /// the first microarchitecture to *remove* support for 3DNow instructions. pub fn bulldozer() -> InstDecoder { - k10() + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now the new extensions .with_ssse3() .with_sse4() .with_sse4_2() @@ -101,9 +114,18 @@ pub mod amd { /// SHA, RDSEED, and other extensions. 
pub fn zen() -> InstDecoder { // no nice way to *un*set feature bits, but several extensions were dropped. - // so, start again from K10. - k10() - // first, bundle all the K10->Bulldozer features.. + // so, start again. + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now, bundle all the K10->Bulldozer features.. .with_ssse3() .with_sse4() .with_sse4_2() @@ -114,11 +136,8 @@ pub mod amd { .with_avx() .with_xsave() .with_skinit() - // now all the Bulldozer (/Piledriver/Steamroller/Excavator)->Zen features + // finally all the Bulldozer (/Piledriver/Steamroller/Excavator)->Zen features .with_avx2() - .with_aesni() - .with_pclmulqdq() - .with_f16c() .with_movbe() .with_bmi2() .with_adx() diff --git a/src/protected_mode/uarch.rs b/src/protected_mode/uarch.rs index cbe3e89..6914348 100644 --- a/src/protected_mode/uarch.rs +++ b/src/protected_mode/uarch.rs @@ -1,12 +1,24 @@ +//! information for AMD and Intel microarchitectures in the modules below is sourced from a +//! combination of Wikipedia (especially for dates), one-off research for particular +//! microarchitectures, and `InstLatx64`'s CPUID dumps via [chip directory](https://github.com/iximeow/chip_directory). +//! +//! these microarchitecture-specific decoders are relatively rarely used, but generally should be +//! accurate. + pub mod amd { - //! most information about instruction set extensions for microarchitectures here was sourced - //! from - //! [https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview](https://docs.rs/yaxpeax-x86/0.0.12/yaxpeax_x86/protected_mode/uarch/intel/index.html) + //! initial information for the microarchitecture (families) described here came from a + //! combination of the Wikipedia pages + //! 
[https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview](https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview) //! and - //! [https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features](https://docs.rs/yaxpeax-x86/0.0.12/yaxpeax_x86/protected_mode/uarch/intel/index.html). - //! these mappings are best-effort but fairly unused, so a critical eye should be kept towards - //! these decoders rejecting instructions they should not, or incorrectly accepting - //! instructions. + //! [https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features](https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features). + //! it has been since "augmented" by the CPUID dumps from InstLatx64, via [chip + //! directory](https://github.com/iximeow/chip_directory/tree/no-gods-no-/x86). scare quotes + //! because in several cases CPUID measurement error adds, rather than removes, ambiguity. + //! additionally, for some CPU features, InstLatx64 has CPUID dumps of early engineering + //! samples where features are not present. later production steppings of those parts do + //! universally have the corresponding feature, which makes it less obvious which features are + //! universally present in a family, standardized in a following architecture, unevenly present + //! due to market segmentation, and so on. //! //! microarchitectures as defined here are with respect to flags reported by CPUID. notably, //! `Zen` does not report `FMA4` support by `CPUID`, but instructions in that extension @@ -23,28 +35,41 @@ pub mod amd { /// support - SSE2 and no later. pub fn k8() -> InstDecoder { InstDecoder::minimal() + .with_3dnow() + .with_3dnowprefetch() + .with_cmov() } /// `k10` was the successor to `k8`, launched in 2007. `k10` cores extended SSE support through /// to SSE4.2a, as well as consistent `cmov` support, among other features. 
pub fn k10() -> InstDecoder { k8() - .with_cmov() .with_cmpxchg16b() .with_svm() .with_abm() .with_lahfsahf() .with_sse3() - .with_ssse3() - .with_sse4() - .with_sse4_2() .with_sse4a() } /// `Bulldozer` was the successor to `K10`, launched in 2011. `Bulldozer` cores include AVX - /// support among other extensions, and are notable for including `AESNI`. + /// support among other extensions, and are notable for including `AESNI`. `Bulldozer` was also + /// the first microarchitecture to *remove* support for 3DNow instructions. pub fn bulldozer() -> InstDecoder { - k10() + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now the new extensions + .with_ssse3() + .with_sse4() + .with_sse4_2() .with_bmi1() .with_aesni() .with_pclmulqdq() @@ -52,6 +77,8 @@ pub mod amd { .with_avx() .with_fma4() .with_xop() + .with_xsave() + .with_skinit() } /// `Piledriver` was the successor to `Bulldozer`, launched in 2012. @@ -86,21 +113,87 @@ pub mod amd { /// instructions to AVX2 and discarded FMA4, TBM, and XOP extensions. they also gained ADX, /// SHA, RDSEED, and other extensions. pub fn zen() -> InstDecoder { - k10() - .with_avx() - .with_avx2() + // no nice way to *un*set feature bits, but several extensions were dropped. + // so, start again. + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now, bundle all the K10->Bulldozer features.. 
+ .with_ssse3() + .with_sse4() + .with_sse4_2() .with_bmi1() .with_aesni() .with_pclmulqdq() .with_f16c() + .with_avx() + .with_xsave() + .with_skinit() + // finally all the Bulldozer (/Piledriver/Steamroller/Excavator)->Zen features + .with_avx2() .with_movbe() .with_bmi2() - .with_rdrand() .with_adx() .with_sha() + .with_rdrand() .with_rdseed() .with_fma3() - // TODO: XSAVEC, XSAVES, XRSTORS, CLFLUSHOPT, CLZERO? + + .with_xsavec() + .with_xsaves() + .with_xsaveopt() + .with_clflushopt() + .with_clwb() + .with_fsgsbase() + .with_monitorx() + } + + /// `Zen 2`, launched in 2019, succeeded `Zen`/`Zen+`. there aren't many instruction set + /// extensions here, but `clwb`, `rdpid`, and `wbnoinvd` show up here. + pub fn zen2() -> InstDecoder { + zen() + .with_clwb() + .with_rdpid() + .with_wbnoinvd() + } + + /// `Zen 3`, launched in 2020, succeeded `Zen 2`. like `Zen 2`, there aren't many instruction + /// set extensions here. + pub fn zen3() -> InstDecoder { + zen2() + .with_invpcid() + .with_vaes() + .with_vpclmulqdq() + } + + /// `Zen 4`, launched in 2022, succeeded `Zen 3`. `Zen 4` is notable for being the first AMD + /// processor family supporting AVX-512. + pub fn zen4() -> InstDecoder { + zen3() + .with_avx512_f() + .with_avx512_vl() + .with_avx512_bw() + .with_avx512_cd() + .with_avx512_cd() + .with_avx512_vbmi() + .with_avx512_vbmi2() + .with_avx512_vpopcntdq() + .with_gfni() + } + + /// `Zen 5`, launched in 2024, succeeded `Zen 4`. `Zen 5` adds only a few additional + /// instructions; some AVX-512 features, `enqcmd`, and `movdir64b`. + pub fn zen5() -> InstDecoder { + zen4() + .with_movdir64b() + .with_enqcmd() } } diff --git a/src/real_mode/uarch.rs b/src/real_mode/uarch.rs index 60bf168..8df4213 100644 --- a/src/real_mode/uarch.rs +++ b/src/real_mode/uarch.rs @@ -1,12 +1,24 @@ +//! information for AMD and Intel microarchitectures in the modules below is sourced from a +//! 
combination of Wikipedia (especially for dates), one-off research for particular +//! microarchitectures, and `InstLatx64`'s CPUID dumps via [chip directory](https://github.com/iximeow/chip_directory). +//! +//! these microarchitecture-specific decoders are relatively rarely used, but generally should be +//! accurate. + pub mod amd { - //! most information about instruction set extensions for microarchitectures here was sourced - //! from - //! [https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview](https://docs.rs/yaxpeax-x86/0.0.12/yaxpeax_x86/real_mode/uarch/intel/index.html) + //! initial information for the microarchitecture (families) described here came from a + //! combination of the Wikipedia pages + //! [https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview](https://en.wikipedia.org/wiki/AMD_Accelerated_Processing_Unit#Feature_overview) //! and - //! [https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features](https://docs.rs/yaxpeax-x86/0.0.12/yaxpeax_x86/real_mode/uarch/intel/index.html). - //! these mappings are best-effort but fairly unused, so a critical eye should be kept towards - //! these decoders rejecting instructions they should not, or incorrectly accepting - //! instructions. + //! [https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features](https://en.wikipedia.org/wiki/Template:AMD_x86_CPU_features). + //! it has been since "augmented" by the CPUID dumps from InstLatx64, via [chip + //! directory](https://github.com/iximeow/chip_directory/tree/no-gods-no-/x86). scare quotes + //! because in several cases CPUID measurement error adds, rather than removes, ambiguity. + //! additionally, for some CPU features, InstLatx64 has CPUID dumps of early engineering + //! samples where features are not present. later production steppings of those parts do + //! universally have the corresponding feature, which makes it less obvious which features are + //! 
universally present in a family, standardized in a following architecture, unevenly present + //! due to market segmentation, and so on. //! //! microarchitectures as defined here are with respect to flags reported by CPUID. notably, //! `Zen` does not report `FMA4` support by `CPUID`, but instructions in that extension @@ -23,28 +35,41 @@ pub mod amd { /// support - SSE2 and no later. pub fn k8() -> InstDecoder { InstDecoder::minimal() + .with_3dnow() + .with_3dnowprefetch() + .with_cmov() } /// `k10` was the successor to `k8`, launched in 2007. `k10` cores extended SSE support through /// to SSE4.2a, as well as consistent `cmov` support, among other features. pub fn k10() -> InstDecoder { k8() - .with_cmov() .with_cmpxchg16b() .with_svm() .with_abm() .with_lahfsahf() .with_sse3() - .with_ssse3() - .with_sse4() - .with_sse4_2() .with_sse4a() } /// `Bulldozer` was the successor to `K10`, launched in 2011. `Bulldozer` cores include AVX - /// support among other extensions, and are notable for including `AESNI`. + /// support among other extensions, and are notable for including `AESNI`. `Bulldozer` was also + /// the first microarchitecture to *remove* support for 3DNow instructions. pub fn bulldozer() -> InstDecoder { - k10() + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now the new extensions + .with_ssse3() + .with_sse4() + .with_sse4_2() .with_bmi1() .with_aesni() .with_pclmulqdq() @@ -52,6 +77,8 @@ pub mod amd { .with_avx() .with_fma4() .with_xop() + .with_xsave() + .with_skinit() } /// `Piledriver` was the successor to `Bulldozer`, launched in 2012. @@ -86,21 +113,87 @@ pub mod amd { /// instructions to AVX2 and discarded FMA4, TBM, and XOP extensions. they also gained ADX, /// SHA, RDSEED, and other extensions. 
pub fn zen() -> InstDecoder { - k10() - .with_avx() - .with_avx2() + // no nice way to *un*set feature bits, but several extensions were dropped. + // so, start again. + InstDecoder::minimal() + // first, apply all the K8 extensions again, sans 3DNow + // .. should be sse, sse2 + // then the K10 + .with_cmpxchg16b() + .with_svm() + .with_abm() + .with_lahfsahf() + .with_sse3() + .with_sse4a() + // now, bundle all the K10->Bulldozer features.. + .with_ssse3() + .with_sse4() + .with_sse4_2() .with_bmi1() .with_aesni() .with_pclmulqdq() .with_f16c() + .with_avx() + .with_xsave() + .with_skinit() + // finally all the Bulldozer (/Piledriver/Steamroller/Excavator)->Zen features + .with_avx2() .with_movbe() .with_bmi2() - .with_rdrand() .with_adx() .with_sha() + .with_rdrand() .with_rdseed() .with_fma3() - // TODO: XSAVEC, XSAVES, XRSTORS, CLFLUSHOPT, CLZERO? + + .with_xsavec() + .with_xsaves() + .with_xsaveopt() + .with_clflushopt() + .with_clwb() + .with_fsgsbase() + .with_monitorx() + } + + /// `Zen 2`, launched in 2019, succeeded `Zen`/`Zen+`. there aren't many instruction set + /// extensions here, but `clwb`, `rdpid`, and `wbnoinvd` show up here. + pub fn zen2() -> InstDecoder { + zen() + .with_clwb() + .with_rdpid() + .with_wbnoinvd() + } + + /// `Zen 3`, launched in 2020, succeeded `Zen 2`. like `Zen 2`, there aren't many instruction + /// set extensions here. + pub fn zen3() -> InstDecoder { + zen2() + .with_invpcid() + .with_vaes() + .with_vpclmulqdq() + } + + /// `Zen 4`, launched in 2022, succeeded `Zen 3`. `Zen 4` is notable for being the first AMD + /// processor family supporting AVX-512. + pub fn zen4() -> InstDecoder { + zen3() + .with_avx512_f() + .with_avx512_vl() + .with_avx512_bw() + .with_avx512_cd() + .with_avx512_cd() + .with_avx512_vbmi() + .with_avx512_vbmi2() + .with_avx512_vpopcntdq() + .with_gfni() + } + + /// `Zen 5`, launched in 2024, succeeded `Zen 4`. 
`Zen 5` adds only a few additional + /// instructions; some AVX-512 features, `enqcmd`, and `movdir64b`. + pub fn zen5() -> InstDecoder { + zen4() + .with_movdir64b() + .with_enqcmd() } } diff --git a/test/long_mode/mod.rs b/test/long_mode/mod.rs index 9493a63..2c7771c 100644 --- a/test/long_mode/mod.rs +++ b/test/long_mode/mod.rs @@ -3454,7 +3454,8 @@ fn test_3dnow() { test_display_under(&InstDecoder::default(), bytes, text); test_invalid_under(&InstDecoder::minimal(), bytes); test_invalid_under(&InstDecoder::minimal(), bytes); - test_invalid_under(&yaxpeax_x86::long_mode::uarch::amd::k8(), bytes); + test_display_under(&yaxpeax_x86::long_mode::uarch::amd::k8(), bytes, text); + test_invalid_under(&yaxpeax_x86::long_mode::uarch::amd::bulldozer(), bytes); test_invalid_under(&yaxpeax_x86::long_mode::uarch::intel::netburst(), bytes); } diff --git a/test/protected_mode/mod.rs b/test/protected_mode/mod.rs index 8fecdab..29e9ec4 100644 --- a/test/protected_mode/mod.rs +++ b/test/protected_mode/mod.rs @@ -3068,13 +3068,24 @@ fn test_sse4a() { #[test] fn test_3dnow() { - test_display(&[0x0f, 0x0f, 0xe0, 0x8a], "pfnacc mm4, mm0"); - test_display(&[0x0f, 0x0f, 0x38, 0x8e], "pfpnacc mm7, qword [eax]"); - test_display(&[0x65, 0x67, 0x65, 0x65, 0x0f, 0x0e], "femms"); - test_display(&[0x3e, 0xf3, 0x2e, 0xf2, 0x0f, 0x0f, 0x64, 0x93, 0x93, 0xa4], "pfmax mm4, qword cs:[ebx + edx * 4 - 0x6d]"); - test_display(&[0x26, 0x36, 0x0f, 0x0f, 0x70, 0xfb, 0x0c], "pi2fw mm6, qword ss:[eax - 0x5]"); - test_display(&[0x66, 0x0f, 0x0f, 0xc6, 0xb7], "pmulhrw mm0, mm6"); - test_display(&[0x0f, 0x0f, 0xc6, 0xb7], "pmulhrw mm0, mm6"); + fn test_instr(bytes: &[u8], text: &'static str) { + test_display_under(&InstDecoder::minimal().with_3dnow(), bytes, text); + test_display_under(&InstDecoder::default(), bytes, text); + test_invalid_under(&InstDecoder::minimal(), bytes); + test_invalid_under(&InstDecoder::minimal(), bytes); + test_display_under(&yaxpeax_x86::protected_mode::uarch::amd::k8(), bytes, 
text); + test_invalid_under(&yaxpeax_x86::protected_mode::uarch::amd::bulldozer(), bytes); + test_invalid_under(&yaxpeax_x86::protected_mode::uarch::intel::netburst(), bytes); + } + + test_instr(&[0x0f, 0x0f, 0xe0, 0x8a], "pfnacc mm4, mm0"); + test_instr(&[0x0f, 0x0f, 0x38, 0x8e], "pfpnacc mm7, qword [eax]"); + test_instr(&[0x65, 0x67, 0x65, 0x65, 0x0f, 0x0e], "femms"); + test_instr(&[0x3e, 0xf3, 0x2e, 0xf2, 0x0f, 0x0f, 0x64, 0x93, 0x93, 0xa4], "pfmax mm4, qword cs:[ebx + edx * 4 - 0x6d]"); + test_instr(&[0x26, 0x36, 0x0f, 0x0f, 0x70, 0xfb, 0x0c], "pi2fw mm6, qword ss:[eax - 0x5]"); + test_instr(&[0x66, 0x0f, 0x0f, 0xc6, 0xb7], "pmulhrw mm0, mm6"); + test_instr(&[0x0f, 0x0f, 0xc6, 0xb7], "pmulhrw mm0, mm6"); + test_instr(&[0x0f, 0x0e], "femms"); } // first appeared in tremont -- cgit v1.1