From d8083b08dc987adeda73fb13298383c6cf519596 Mon Sep 17 00:00:00 2001
From: iximeow <me@iximeow.net>
Date: Fri, 15 Jan 2021 18:15:04 -0800
Subject: small perf tweaks

resetting modrm_mmm and sib_index more efficiently (assigning the whole
RegSpec rather than only its .bank) is an extremely small win, but a win

read_imm_signed generally should inline well, but it runs afoul of some
inlining heuristic. forcing it with #[inline(always)] gets about 8%
improved throughput on the (unrealistic) in-repo benchmark

it would be great to be able to avoid bounds checks somehow; it looks
like they alone are another ~10% of decode time. i'm not sure how to
pull that off while retaining the generic iterator parameter. might just
not be possible.
---
 src/long_mode/mod.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'src/long_mode/mod.rs')
diff --git a/src/long_mode/mod.rs b/src/long_mode/mod.rs
index f15e2a1..f9be9ab 100644
--- a/src/long_mode/mod.rs
+++ b/src/long_mode/mod.rs
@@ -5835,8 +5835,10 @@ fn read_instr<T: Iterator<Item=u8>>(decoder: &InstDecoder, mut bytes_iter: T, in
 //    use core::intrinsics::unlikely;
     let mut prefixes = Prefixes::new(0);
 
-    instruction.modrm_mmm.bank = RegisterBank::Q;
-    instruction.sib_index.bank = RegisterBank::Q;
+    // ever so slightly faster than just setting .bank: this allows the two assignments to merge
+    // into one `mov 0, dword [instruction + modrm_mmm_offset]`
+    instruction.modrm_mmm = RegSpec::rax();
+    instruction.sib_index = RegSpec::rax();
 
     fn escapes_are_prefixes_actually(prefixes: &mut Prefixes, opc_map: &mut Option<OpcodeMap>) {
         match opc_map {
@@ -8881,7 +8883,7 @@ fn read_imm_ivq<T: Iterator<Item=u8>>(bytes: &mut T, width: u8, length: &mut u8)
     }
 }
 
-#[inline]
+#[inline(always)]
 fn read_imm_signed<T: Iterator<Item=u8>>(bytes: &mut T, num_width: u8, length: &mut u8) -> Result<i64, DecodeError> {
     if num_width == 1 {
         *length += 1;
-- 
cgit v1.1