1 files changed, 172 insertions, 21 deletions
diff --git a/src/long_mode/behavior.rs b/src/long_mode/behavior.rs
index dd9ec3e..97ae3fd 100644
--- a/src/long_mode/behavior.rs
+++ b/src/long_mode/behavior.rs
@@ -1,9 +1,41 @@
+//! behavior information for x86-64 instructions.
+//!
+//! this module allows users of yaxpeax-x86 to collect operand read/write information about
+//! instructions ([`InstBehavior::get_operand()`]), as well as allowed execution level
+//! ([`InstBehavior::privilege_level()`]), potential exceptions ([`InstBehavior::exceptions()`]),
+//! and iterating all explicit and implicit operands ([`InstBehavior::all_operands()`]).
+//!
+//! additionally, [`ComplexOp`] enumerates instructions that may be considered "complex" - either
+//! involving architectural state not expressed in yaxpeax-x86's API or otherwise affecting machine
+//! state in a way that simply considering the operands as-presented would be inaccurate. where
+//! possible, `ComplexOp` tries to guide users towards how to handle such instructions.
+//!
+//! some behavior information in this module is "unstable", meaning it must be opted into with
+//! `features = ["unstable"]` on yaxpeax-x86; information from "unstable" interfaces may be
+//! less-tested and change across semver-compatible releases! if you want to use unstable
+//! interfaces here, first: thank you!! please report any issues, and second: consider pinning to a
+//! specific minor version while setting `features = ["unstable"]` if instruction behavior becoming
+//! more correct might present an issue in your application.
+
+#![deny(missing_docs)]
+
+// a lot of people have told me that we don't need to read code anymore, just like we "never look at
+// machine code anymore". sit with me and have a sad laugh for a moment. if we weren't here, reading
+// and thinking and talking about how to model the computer, where would the training data come
+// from? "you're not being left behind, it's a personal choice!", i've heard. a year later this has
+// evolved into "it is an abdication of your responsibility as an engineer to not pay Anthropic".
+// it is our moral responsibility to build the highest quality software in service of furthering
+// the rot? it is an abdication of ethics to claim all works are good.
+
 use super::{Instruction, Opcode, Operand, OperandSpec};
 use super::RegSpec;
 
 /// an accessor for run-time characteristics of instructions.
 ///
-/// ... TODO words ...
+/// generally, behavior accessors across architectures are expected to have a `behavior()`
+/// entrypoint on a decoded instruction. it is not clear which properties of `behavior()`
+/// generalize across architectures (yet!) but presumably something like `all_operands()` and
+/// `Access` do.
 ///
 /// additionally, of note for x86:
 ///
@@ -15,11 +47,10 @@ use super::RegSpec;
 ///   operation (consider `call qword [rcx]`; `qword [rcx]` is one memory access, but the implied
 ///   push of a return address is a second memory operation).
 /// * `{,e,r}flags` is often written and sometimes read, but almost never as an explicit source or
-///   destination operand. this can be queried with `flags_access()`, in addition to its inclusion
-///   as an implicit operand.
+///   destination operand. this can be queried with [`InstBehavior::flags_access()`].
 ///
 /// it's also useful to know if implicit and explicit operands are reads, writes, or both, such as
-/// when diagnosing a run-time fault. to iterate over this information, `operands().accesses()`. or
+/// when diagnosing a run-time fault. to iterate over this information, `all_operands().iter()`. or
 /// `visit_accesses(&mut ..)` to collect all operand/access information for this instruction.
 #[derive(Copy, Clone)]
 pub struct InstBehavior<'inst> {
@@ -28,6 +59,14 @@ pub struct InstBehavior<'inst> {
 }
 
 impl Instruction {
+    /// get a struct to query behaviors of an instruction.
+    ///
+    /// "behaviors" is broad! as of writing, "behavior" covers "implicit and explicit operand
+    /// reads/writes", "possible exceptions", "allowed privilege levels", and "instruction has
+    /// additional semantics not easily expressed by this library".
+    ///
+    /// see the documentation for [`InstBehavior`] as well as the
+    /// [`behavior`][crate::long_mode::behavior] module for more information.
     pub fn behavior<'inst>(&'inst self) -> InstBehavior<'inst> {
         let mut behavior = opcode2behavior(&self.opcode);
 
@@ -252,6 +291,10 @@ impl Exception {
     /// Control Protection Exception
     pub const CP: Exception = Exception::vector(21);
 
+    /// construct an `Exception` for the provided exception vector number.
+    ///
+    /// this is provided for convenience when converting (for example) the number in an x86
+    /// exception handler to the kinds of `Exception` in this library.
     pub const fn vector(vector: u8) -> Self {
         Self { vector }
     }
@@ -262,6 +305,11 @@ impl Exception {
     }
 
     #[cfg(feature = "fmt")]
+    /// get the typical mnemonic for this `Exception`, if one is documented.
+    ///
+    /// the names returned by this helper do not include a leading `#`. they come from the Intel SDM
+    /// chapter 7.3 "SOURCES OF INTERRUPTS" table 7-1 "Protected-Mode Exceptions and Interrupts".
+    /// similar descriptions can be found in the AMD APM.
     pub fn name(&self) -> Option<&'static str> {
         static NAMES: [Option<&'static str>; 22] = [
             Some("DE"), Some("DB"), Some("NMI"), Some("BP"),
@@ -294,20 +342,24 @@ impl fmt::Debug for Exception {
 }
 
 impl ExceptionInfo {
+    /// construct an empty set of possible exception vectors.
     pub fn empty() -> Self {
         Self {
             possible_vectors: 0,
         }
     }
 
+    /// test if this `ExceptionInfo` has any possible vector set.
     pub fn any(&self) -> bool {
         self.possible_vectors != 0
     }
 
+    /// test if this `ExceptionInfo` has no vector set.
     pub fn none(&self) -> bool {
         !self.any()
     }
 
+    /// test if this `ExceptionInfo` indicates that exception `e` may be raised.
     pub fn may(&self, e: Exception) -> bool {
         (self.possible_vectors & (1 << e.vector)) != 0
     }
@@ -323,6 +375,8 @@ impl ExceptionInfo {
         self.possible_vectors |= bit;
     }
 
+    /// record that exception `e` is or is not (`b`) possible in this `ExceptionInfo` record, but
+    /// in a more chaining-friendly way.
     pub const fn with(mut self, e: Exception, b: bool) -> Self {
         self.set(e, b);
         self
@@ -351,13 +405,25 @@ fn test_exception_info() {
     assert_eq!(info.possible_vectors, 0x12000);
 }
 
+/// a description of the privilege level (that is, value of `CPL` in the current code selector)
+/// that allows executing the corresponding instruction.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum PrivilegeLevel {
+    /// the corresponding instruction can run at any privilege level.
     Any = 0b00,
+    /// the corresponding instruction can only run when `CPL=0` (aka "in ring 0").
     PL0 = 0b01,
+    /// the corresponding instruction has a more complex rule for when it is allowed.
+    ///
+    /// this may mean the instruction is either "Any" or "PL0" depending on other processor state
+    /// (such as `rdtsc`), or it may mean the instruction simply does not relate directly to
+    /// `CPL=3`/`CPL=0` (such as for `iret`).
     Special = 0b10,
 }
 
+/// a handle for an instruction, its behavior, and any related implicit operands.
+///
+/// this is only useful for [`InstOperands::iter()`].
 #[derive(Copy, Clone)]
 pub struct InstOperands<'inst> {
     inst: InstBehavior<'inst>,
@@ -365,11 +431,17 @@ pub struct InstOperands<'inst> {
 }
 
 impl<'inst> InstOperands<'inst> {
+    /// establish an iterator over the operands described in this `InstOperands`.
     pub fn iter(self) -> AccessIter<'inst> {
         AccessIter::new(self)
     }
 }
 
+/// this struct implements [`Iterator`] to allow library users to walk all explicit and implicit
+/// operands for the corresponding instruction, along with if they are used for reading or for
+/// writing.
+///
+/// implicit operands are always walked first, explicit operands are walked last.
 pub struct AccessIter<'inst> {
     operands: InstOperands<'inst>,
     explicit: bool,
@@ -385,6 +457,8 @@ impl<'inst> AccessIter<'inst> {
         }
     }
 
+    /// weaken this iterator to only return the operands corresponding to this instruction,
+    /// without specific access information.
     pub fn operands(self) -> OperandIter<'inst> {
         OperandIter { inner: self }
     }
@@ -437,7 +511,8 @@ impl<'inst> Iterator for AccessIter<'inst> {
 
         if self.next < self.operands.inst.inst.operand_count() {
             let op = self.operands.inst.inst.operand(self.next);
-            let access = self.operands.inst.operand_access(self.next).expect("defined operand has defined access");
+            let access = self.operands.inst.operand_access(self.next)
+                .expect("defined operand has defined access");
             debug_assert!(
                 access != Access::None || (
                     self.operands.inst.inst.opcode == Opcode::NOP ||
@@ -454,6 +529,11 @@ impl<'inst> Iterator for AccessIter<'inst> {
     }
 }
 
+/// a reduced-strength iterator of an instruction's implicit and explicit operands.
+///
+/// unlike `AccessIter`, this iterator does not provide read/write information, simply that
+/// operands are or are not present. this is more likely useful for some kinds of instruction
+/// printing than automated instruction analysis.
 pub struct OperandIter<'inst> {
     inner: AccessIter<'inst>,
 }
@@ -512,6 +592,11 @@ impl<'inst> Iterator for OperandIter<'inst> {
 }
 
 impl<'inst> InstBehavior<'inst> {
+    #[cfg(feature = "unstable")]
+    /// get the [`PrivilegeLevel`] for this instruction.
+    ///
+    /// returns `None` if no privilege level information is recorded for the instruction. such
+    /// cases are a bug, please report if you see them.
     pub fn privilege_level(&self) -> Option<PrivilegeLevel> {
         let pl_bits = self.behavior.behavior & 0b11;
         const LUT: [Option<PrivilegeLevel>; 4] = [
@@ -522,8 +607,11 @@ impl<'inst> InstBehavior<'inst> {
         LUT[pl_bits as usize]
     }
 
-    /// 
-//    #[cfg(feature = "unstable")]
+    #[cfg(feature = "unstable")]
+    /// get the [`ExceptionInfo`] for this instruction.
+    ///
+    /// this is very much best-effort and poorly tested. it is behind the `unstable` feature for a
+    /// reason!
     pub fn exceptions(&self) -> ExceptionInfo {
         let mut exceptions = ExceptionInfo::empty();
         if self.privilege_level() != Some(PrivilegeLevel::Any) {
@@ -545,15 +633,21 @@ impl<'inst> InstBehavior<'inst> {
         exceptions
     }
 
-    fn as_complex_op(&self) -> Option<ComplexOp> {
-        // if the behavior is not complex, it is *definitely* not complex. if the behavior is
+    /// transform this instruction's [`Opcode`] into a [`ComplexOp`], if the instruction is
+    /// "complex".
+    ///
+    /// documentation on [`ComplexOp`] covers what instructions are considered "complex" by
+    /// yaxpeax-x86 and why in more detail. correct analysis of a function (or program!) in the
+    /// presence of complex instructions may require consulting the Intel Software Developer's
+    /// Manual or AMD Architecture Programmer's Manual.
+    pub fn as_complex_op(&self) -> Option<ComplexOp> {
+        // if the behavior is not complex, it is *definitely* not a complex op. if the behavior is
         // complex, it's really a "depending on the specific instruction and operands it might
         // be"...
         if !self.behavior.is_complex() {
             return None;
         }
 
-        // TODO: all of these should be a `set_complex` bit.
         if self.inst.opcode == Opcode::BT {
             if self.inst.operands[0] != OperandSpec::RegMMM {
                 Some(ComplexOp::BT)
@@ -614,12 +708,17 @@ impl<'inst> InstBehavior<'inst> {
         })
     }
 
+    /// get the `Access` behavior this instruction has for `rflags`.
+    ///
+    /// note that as the documentation for [`Access`] describes, "read" and "write" have slightly
+    /// different meanings for the flags register than other locations.
+    // this implies that `rflags` must never appear in an implicit operand list.
     pub fn flags_access(&self) -> Option<Access> {
         let flag_acc = (self.behavior.behavior >> 2) & 0b11;
         Access::from_bits(flag_acc)
     }
 
-    pub fn implicit_oplist(&self) -> Option<&'static [ImplicitOperand]> {
+    fn implicit_oplist(&self) -> Option<&'static [ImplicitOperand]> {
         let ops_idx = self.behavior.extra;
         if ops_idx == 0 {
             return None;
@@ -629,8 +728,13 @@ impl<'inst> InstBehavior<'inst> {
         Some(&IMPLICIT_OPS_LIST[ops_idx as usize])
     }
 
+    /// get the `Access` behavior for an explicit operand of this instruction.
+    ///
+    /// `None` means that there is no operand at the given index, while `Some(Access::None)` means
+    /// there is an operand, and the instruction does not actually access it (as for `nop`, `ud0`,
+    /// and `ud1`).
     pub fn operand_access(&self, idx: u8) -> Option<Access> {
-        if idx >= 4 {
+        if idx >= self.inst.operand_count {
             return None;
         }
 
@@ -638,13 +742,24 @@ impl<'inst> InstBehavior<'inst> {
         Access::from_bits(op_acc)
     }
 
+    /// iterate all operands in the instruction and report them to the provided `AccessVisitor`.
+    ///
+    /// this is a more informative, but somewhat more specialized, interface than simply iterating
+    /// [`InstBehavior::all_operands()`]. for memory operands, address calculations are reported to
+    /// the access visitor as reads of the relevant registers. if all dependent values are
+    /// available, the resulting effective address is computed and reported as part of the memory
+    /// access.
+    ///
+    /// `visit_accesses()` is also slightly more efficient than iterating `all_operands()`, as it
+    /// uses unstable internal representations directly, rather than converting to API types and
+    /// back for every operand.
     pub fn visit_accesses<T: AccessVisitor>(&self, v: &mut T) -> Result<(), ComplexOp> {
         if let Some(op) = self.as_complex_op() {
             return Err(op);
         }
 
         fn compute_addr<T: AccessVisitor>(v: &mut T, inst: &Instruction, op_spec: OperandSpec) -> Option<u64> {
-            // TODO: test assertions feature?
+            #[cfg(feature = "_debug_internal_asserts")]
             if !op_spec.is_memory() {
                 panic!("expected memory operand but got {:?}", op_spec);
             }
@@ -753,6 +868,8 @@ impl<'inst> InstBehavior<'inst> {
                     Some(inst.disp as u32 as u64)
                 }
                 other => {
+                    // this could be `_debug_internal_asserts`-gated, but i'm not quite that
+                    // confident yet..
                     panic!("not-yet-handled memory operand: {:?}", other);
                 }
             }
@@ -776,7 +893,11 @@ impl<'inst> InstBehavior<'inst> {
                         OperandSpec::Deref_edi => RegSpec::edi(),
                         OperandSpec::Deref_esi => RegSpec::esi(),
                         OperandSpec::Deref => self.inst.regs[1],
-                        other => { panic!("TODO: unreachable {:?}", other); }
+                        other => {
+                            // this could be `_debug_internal_asserts`-gated, but i'm not quite
+                            // that confident yet..
+                            panic!("TODO: unreachable {:?}", other);
+                        }
                     };
                     if op.write {
                         v.register_write(reg);
@@ -796,7 +917,9 @@ impl<'inst> InstBehavior<'inst> {
                             }
                         }
                         OperandSpec::MemIndexScale => {
-                            // HACK HACK HACK
+                            // HACK HACK HACK this is just how i've decided to interpret
+                            // `MemIndexScale` as an operand spec; it's only for xlat. adding
+                            // another field to implicit operands just for this is a little silly..
                             let base = v.get_register(op.reg);
                             let index = v.get_register(RegSpec::al());
                             if let (Some(base), Some(index)) = (base, index) {
@@ -806,6 +929,8 @@ impl<'inst> InstBehavior<'inst> {
                             }
                         }
                         other => {
+                            // this could be `_debug_internal_asserts`-gated, but i'm not quite
+                            // that confident yet..
                             panic!("impossible operand spec {:?}", other);
                         }
                     };
@@ -969,7 +1094,7 @@ impl<'inst> InstBehavior<'inst> {
                         if other.is_masked() && self.inst.prefixes.evex_unchecked().mask_reg() != 0 {
                             v.register_read(RegSpec::mask(self.inst.prefixes.evex_unchecked().mask_reg()));
                         }
-                        // no lea check necessary: the memory access is coded as a read and no
+                        // no lea check necessary: its memory access is coded as a read and no
                         // instruction has a similar "fake" memory write.
                         v.memory_write(addr, size as u32);
                     }
@@ -1045,17 +1170,23 @@ impl Access {
         LUT[bits as usize]
     }
 
+    /// is this access a read?
+    ///
+    /// if it is `ReadWrite`, this will be `true` as will `is_write`.
     pub fn is_read(&self) -> bool {
         *self as u8 & 0b01 != 0
     }
 
+    /// is this access a write?
+    ///
+    /// if it is `ReadWrite`, this will be `true` as will `is_read`.
     pub fn is_write(&self) -> bool {
         *self as u8 & 0b10 != 0
     }
 }
 
 #[derive(Copy, Clone, PartialEq, Debug)]
-pub struct BehaviorDigest {
+struct BehaviorDigest {
     // laid out like:
     //
     // |7 6|5 4|3 2|1 0|
@@ -1081,7 +1212,6 @@ pub struct BehaviorDigest {
     extra: u16,
 }
 
-// TODO: the various `set_pl*()` are not actually used yet..
 #[allow(dead_code)]
 impl BehaviorDigest {
     const fn empty() -> BehaviorDigest {
@@ -1457,9 +1587,13 @@ impl BehaviorDigest {
 ///
 /// TDX-related instructions are considered complex because they are not more precisely tested and
 /// are assumed as-complex-as-VMX in the first place.
+// TODO: this could be declared through a macro that does something like:
+// "declare_opcode_subset! { }" which gets a list of identifiers and generates the
+// `Opcode::<ident> as u32` rhs. but a vim macro will do for now.
 #[non_exhaustive]
 #[repr(u32)] // same repr as `Opcode`
 #[derive(Copy, Clone, Debug)]
+#[allow(missing_docs)]
 pub enum ComplexOp {
     /// rdmsr/wrmsr are considered "complex" for reasons in the enum doc comment.
     RDMSR = (Opcode::RDMSR as u32),
@@ -1765,13 +1899,30 @@ pub trait AccessVisitor {
     /// if any `get_register()` returns `None` in an address calculation, the subsequent
     /// `memory_read()` or `memory_write()` for that operand will be given an `address` of `None`.
     ///
-    /// if `get_register` is implemented withhout calling `register_read`, the 
-    /// if `get_register()` is given a custom implementation, be sure to either call `
+    /// `get_register()` may be implemented without calling `register_read()`, in which case when
+    /// used with `visit_accesses` the register/memory read/writes will all correspond directly to
+    /// implicit and explicit operands.
     fn get_register(&mut self, reg: RegSpec) -> Option<u64> {
         self.register_read(reg);
         None
     }
+    /// record that the instruction reads a memory location.
+    ///
+    /// when used with `visit_accesses`, an address is only provided when yaxpeax-x86 can calculate
+    /// an effective address (i.e. `get_register()` calls for all dependent registers return a
+    /// value). all non-`ComplexOp` instructions have a known memory access size, so `size` is
+    /// always reported, even when *where* is not known.
+    ///
+    /// some instructions can both read and write memory (consider `call [addr]`).
     fn memory_read(&mut self, address: Option<u64>, size: u32);
+    /// record that the instruction writes a memory location.
+    ///
+    /// when used with `visit_accesses`, an address is only provided when yaxpeax-x86 can calculate
+    /// an effective address (i.e. `get_register()` calls for all dependent registers return a
+    /// value). all non-`ComplexOp` instructions have a known memory access size, so `size` is
+    /// always reported, even when *where* is not known.
+    ///
+    /// some instructions can both read and write memory (consider `call [addr]`).
     fn memory_write(&mut self, address: Option<u64>, size: u32);
 }
 
@@ -4038,7 +4189,7 @@ static TABLE: [BehaviorDigest; 1413] = [
             .set_operand(0, Access::Read)
             .set_complex(true),
     /* VERR => */ GENERAL_R_FLAGWRITE,
-    /* VERW => */  GENERAL_R_FLAGWRITE,
+    /* VERW => */ GENERAL_R_FLAGWRITE,
     /* CMC => */ GENERAL_FLAGRW,
     /* CLC => */ GENERAL_FLAGRW,
     /* STC => */ GENERAL_FLAGRW,