adjust decode logic for better pipelining

at least on my zen2. when reading prefixes, optimize for the likely case of reading an instruction rather than an invalid run of prefixes. checking if we've exceeded the x86 length bound immediately after reading the byte is only a benefit if we'd otherwise read an impossibly-long instruction; in this case we can exit exactly at prefix byte 15 rather than potentially later at byte 16 (assuming a one-byte instruction like `c3`), or byte ~24 (a more complex store with immediate and displacement). these cases are extremely unlikely in practice. more likely is that reading a prefix byte is one of the first two or three bytes in an instruction, and we will never benefit from checking the x86 length bound at this point. instead, only check length bounds after decoding the entire instruction. this penalizes the slowest path through the decoder but speeds up the likely path about 5% on my zen2 processor. additionally, begin reading instruction bytes as soon as we enter the decoder, and before initial clearing of instruction data. again, this is for zen2 pipeline reasons. reading the first byte and corresponding `OPCODES` entry improves the odds that this data is available by the time we check for `Interpretation::Prefix` in the opcode scanning loop. then, if we did *not* load an instruction, we immediately know another byte must be read; begin reading this byte before applying `rex` prefixes, and as soon as a prefix is known to not be one of the escape-code prefix bytes (c5, c4, 62, 0f). this clocked in at another ~5% in total. i've found that `read_volatile` is necessary to force rust to begin the load where it's written, rather than reordering it over other data. i'm not committed to this being a guaranteed truth. also, don't bother checking for `Invalid`. again, `Opcode::Invalid` is a relatively unlikely path through the decoder and `Nothing` is already optimized for `None` cases.
this appears to be another small improvement in throughput but i wouldn't want to give it a number - it was relatively small and may not be attributable to this effect.
author: iximeow <me@iximeow.net> 2021-07-02 19:47:26 -0700
committer: iximeow <me@iximeow.net> 2021-07-02 19:47:26 -0700
commit: 312591aabadc6d43a80699e2a6da611932a643c7 (patch)
tree: 4b3470b4aadb2e5485c523c658f0fd851e81f77c
parent: 7394c9dc4727d42b3ccbdf38f114ae0b5d28069d (diff)
1 files changed, 20 insertions, 10 deletions
diff --git a/src/long_mode/mod.rs b/src/long_mode/mod.rs
index e8947ea..40e7fb2 100644
--- a/src/long_mode/mod.rs
+++ b/src/long_mode/mod.rs
@@ -6984,6 +6984,8 @@ fn read_0f3a_opcode(opcode: u8, prefixes: &mut Prefixes) -> OpcodeRecord {
 
 fn read_instr<T: Reader<<Arch as yaxpeax_arch::Arch>::Address, <Arch as yaxpeax_arch::Arch>::Word>>(decoder: &InstDecoder, words: &mut T, instruction: &mut Instruction) -> Result<(), DecodeError> {
     words.mark();
+    let mut nextb = words.next().ok().ok_or(DecodeError::ExhaustedInput)?;
+    let mut next_rec = OPCODES[nextb as usize];
 //    use core::intrinsics::unlikely;
     let mut prefixes = Prefixes::new(0);
 
@@ -6996,16 +6998,23 @@ fn read_instr<T: Reader<<Arch as yaxpeax_arch::Arch>::Address, <Arch as yaxpeax_
 
 
     let record: OpcodeRecord = loop {
-        let b = words.next().ok().ok_or(DecodeError::ExhaustedInput)?;
-        if words.offset() >= 15 {
-            return Err(DecodeError::TooLong);
-        }
-        let record = OPCODES[b as usize];
-        if b >= 0x40 && b < 0x50 {
-            prefixes.rex_from(b);
+        let record = next_rec;
+        if nextb >= 0x40 && nextb < 0x50 {
+            if words.offset() >= 15 {
+                return Err(DecodeError::TooLong);
+            }
+            nextb = words.next().ok().ok_or(DecodeError::ExhaustedInput)?;
+            next_rec = unsafe {
+                core::ptr::read_volatile(&OPCODES[nextb as usize])
+            };
+            prefixes.rex_from(nextb);
         } else if let Interpretation::Instruction(_) = record.0 {
             break record;
         } else {
+            let b = nextb;
+            if words.offset() >= 15 {
+                return Err(DecodeError::TooLong);
+            }
             if b == 0x0f {
                 let b = words.next().ok().ok_or(DecodeError::ExhaustedInput)?;
                 if b == 0x38 {
@@ -7063,6 +7072,10 @@ fn read_instr<T: Reader<<Arch as yaxpeax_arch::Arch>::Address, <Arch as yaxpeax_
                 }
             }
 
+            nextb = words.next().ok().ok_or(DecodeError::ExhaustedInput)?;
+            next_rec = unsafe {
+                core::ptr::read_volatile(&OPCODES[nextb as usize])
+            };
             prefixes.rex_from(0);
             match b {
                 0x26 |
@@ -7096,9 +7109,6 @@ fn read_instr<T: Reader<<Arch as yaxpeax_arch::Arch>::Address, <Arch as yaxpeax_
             }
         }
     };
-    if record == OpcodeRecord(Interpretation::Instruction(Opcode::Invalid), OperandCode::Nothing) {
-        return Err(DecodeError::InvalidOpcode);
-    }
     if let Interpretation::Instruction(opcode) = record.0 {
         instruction.opcode = opcode;
     } else {
author	iximeow <me@iximeow.net>	2021-07-02 19:47:26 -0700
committer	iximeow <me@iximeow.net>	2021-07-02 19:47:26 -0700
commit	312591aabadc6d43a80699e2a6da611932a643c7 (patch)
tree	4b3470b4aadb2e5485c523c658f0fd851e81f77c
parent	7394c9dc4727d42b3ccbdf38f114ae0b5d28069d (diff)