11 files changed, 1851 insertions, 36 deletions
diff --git a/src/annotation/mod.rs b/src/annotation/mod.rs
index 0248b94..af8b4bf 100644
--- a/src/annotation/mod.rs
+++ b/src/annotation/mod.rs
@@ -19,6 +19,8 @@
 //! in a generic setting, there isn't much to do with a `FieldDescription` other than display it. a
 //! typical use might look something like:
 //! ```
+//! # #[cfg(feature="std")]
+//! # {
 //! use core::fmt;
 //!
 //! use yaxpeax_arch::annotation::{AnnotatingDecoder, VecSink};
@@ -40,6 +42,7 @@
 //!         println!("  bits [{}, {}]: {}", start, end, desc);
 //!     }
 //! }
+//! # }
 //! ```
 //!
 //! note that the range `[start, end]` for a reported span is _inclusive_. the `end`-th bit of a
@@ -73,7 +76,7 @@ use crate::{Arch, Reader};
 
 use core::fmt::Display;
 
-/// implementors of `DescriptionSink` receive descriptions of an instruction's disassembly process
+/// implementers of `DescriptionSink` receive descriptions of an instruction's disassembly process
 /// and relevant offsets in the bitstream being decoded. descriptions are archtecture-specific, and
 /// architectures are expected to be able to turn the bit-level `start` and `width` values into a
 /// meaningful description of bits in the original instruction stream.
@@ -91,24 +94,34 @@ impl<T> DescriptionSink<T> for NullSink {
     fn record(&mut self, _start: u32, _end: u32, _description: T) { }
 }
 
-#[cfg(feature = "std")]
-pub struct VecSink<T: Clone + Display> {
-    pub records: std::vec::Vec<(u32, u32, T)>
-}
+#[cfg(feature = "alloc")]
+mod vec_sink {
+    use alloc::vec::Vec;
+    use core::fmt::Display;
+    use crate::annotation::DescriptionSink;
 
-#[cfg(feature = "std")]
-impl<T: Clone + Display> VecSink<T> {
-    pub fn new() -> Self {
-        VecSink { records: std::vec::Vec::new() }
+    pub struct VecSink<T: Clone + Display> {
+        pub records: Vec<(u32, u32, T)>
+    }
+
+    impl<T: Clone + Display> VecSink<T> {
+        pub fn new() -> Self {
+            VecSink { records: Vec::new() }
+        }
+
+        pub fn into_inner(self) -> Vec<(u32, u32, T)> {
+            self.records
+        }
     }
-}
 
-#[cfg(feature = "std")]
-impl<T: Clone + Display> DescriptionSink<T> for VecSink<T> {
-    fn record(&mut self, start: u32, end: u32, description: T) {
-        self.records.push((start, end, description));
+    impl<T: Clone + Display> DescriptionSink<T> for VecSink<T> {
+        fn record(&mut self, start: u32, end: u32, description: T) {
+            self.records.push((start, end, description));
+        }
     }
 }
+#[cfg(feature = "alloc")]
+pub use vec_sink::VecSink;
 
 pub trait FieldDescription {
     fn id(&self) -> u32;
@@ -118,7 +131,7 @@ pub trait FieldDescription {
 /// an interface to decode [`Arch::Instruction`] words from a reader of [`Arch::Word`]s, with the
 /// decoder able to report descriptions of bits or fields in the instruction to a sink implementing
 /// [`DescriptionSink`]. the sink may be [`NullSink`] to discard provided data. decoding with a
-/// `NullSink` should behave identically to `Decoder::decode_into`. implementors are recommended to
+/// `NullSink` should behave identically to `Decoder::decode_into`. implementers are recommended to
 /// implement `Decoder::decode_into` as a call to `AnnotatingDecoder::decode_with_annotation` if
 /// implementing both traits.
 pub trait AnnotatingDecoder<A: Arch + ?Sized> {
diff --git a/src/color_new.rs b/src/color_new.rs
new file mode 100644
index 0000000..1d3e358
--- /dev/null
+++ b/src/color_new.rs
@@ -0,0 +1,281 @@
+#[non_exhaustive]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
+pub enum Color {
+    Black,
+    DarkGrey,
+    Red,
+    DarkRed,
+    Green,
+    DarkGreen,
+    Yellow,
+    DarkYellow,
+    Blue,
+    DarkBlue,
+    Magenta,
+    DarkMagenta,
+    Cyan,
+    DarkCyan,
+    White,
+    Grey,
+}
+
+pub trait YaxColors {
+    fn arithmetic_op(&self) -> Color;
+    fn stack_op(&self) -> Color;
+    fn nop_op(&self) -> Color;
+    fn stop_op(&self) -> Color;
+    fn control_flow_op(&self) -> Color;
+    fn data_op(&self) -> Color;
+    fn comparison_op(&self) -> Color;
+    fn invalid_op(&self) -> Color;
+    fn platform_op(&self) -> Color;
+    fn misc_op(&self) -> Color;
+
+    fn register(&self) -> Color;
+    fn program_counter(&self) -> Color;
+    fn number(&self) -> Color;
+    fn zero(&self) -> Color;
+    fn one(&self) -> Color;
+    fn minus_one(&self) -> Color;
+    fn address(&self) -> Color;
+    fn symbol(&self) -> Color;
+    fn function(&self) -> Color;
+}
+
+/// support for colorizing text with ANSI control sequences.
+///
+/// the most useful item in this module is [`ansi::AnsiDisplaySink`], which interprets span entry
+/// and exit as points at which ANSI sequences may need to be written into the output it wraps -
+/// that output may be any type implementing [`crate::display::DisplaySink`], including
+/// [`crate::display::FmtSink`] to adapt any implementer of `fmt::Write` such as standard out.
+///
+/// ## example
+///
+/// to write colored text to standard out:
+///
+/// ```
+/// # #[cfg(feature="alloc")]
+/// # {
+/// # extern crate alloc;
+/// # use alloc::string::String;
+/// use yaxpeax_arch::color_new::DefaultColors;
+/// use yaxpeax_arch::color_new::ansi::AnsiDisplaySink;
+/// use yaxpeax_arch::display::FmtSink;
+///
+/// let mut s = String::new();
+/// let mut s_sink = FmtSink::new(&mut s);
+///
+/// let mut writer = AnsiDisplaySink::new(&mut s_sink, DefaultColors);
+///
+/// // this might be a yaxpeax crate's `display_into`, or other library implementation code
+/// mod fake_yaxpeax_crate {
+///     use yaxpeax_arch::display::DisplaySink;
+///
+///     pub fn format_memory_operand<T: DisplaySink>(out: &mut T) -> core::fmt::Result {
+///         out.span_start_immediate();
+///         out.write_prefixed_u8(0x80)?;
+///         out.span_end_immediate();
+///         out.write_fixed_size("(")?;
+///         out.span_start_register();
+///         out.write_fixed_size("rbp")?;
+///         out.span_end_register();
+///         out.write_fixed_size(")")?;
+///         Ok(())
+///     }
+/// }
+///
+/// // this might be how a user uses `AnsiDisplaySink`, which will write ANSI-ful text to `s` and
+/// // print it.
+///
+/// fake_yaxpeax_crate::format_memory_operand(&mut writer).expect("write succeeds");
+///
+/// println!("{}", s);
+/// # }
+/// ```
+pub mod ansi {
+    use crate::color_new::Color;
+
+    // color sequences as described by ECMA-48 and, apparently, `man 4 console_codes`
+    /// translate [`yaxpeax_arch::color_new::Color`] to an ANSI control code that changes the
+    /// foreground color to match.
+    #[allow(dead_code)] // allowing this to be dead code because if colors are enabled and alloc is not, there will not be an AnsiDisplaySink, which is the sole user of this function.
+    fn color2ansi(color: Color) -> &'static str {
+        // for most of these, in 256 color space the darker color can be picked by the same color
+        // index as the brighter form (from the 8 color command set). dark grey is an outlier,
+        // where 38;5;0 and 30 both are black. there is no "grey" in the shorter command set to
+        // map to. but it turns out that 38;5;8m is exactly the darker grey to use.
+        match color {
+            Color::Black => "\x1b[30m",
+            Color::DarkGrey => "\x1b[38;5;8m",
+            Color::Red => "\x1b[31m",
+            Color::DarkRed => "\x1b[38;5;1m",
+            Color::Green => "\x1b[32m",
+            Color::DarkGreen => "\x1b[38;5;2m",
+            Color::Yellow => "\x1b[33m",
+            Color::DarkYellow => "\x1b[38;5;3m",
+            Color::Blue => "\x1b[34m",
+            Color::DarkBlue => "\x1b[38;5;4m",
+            Color::Magenta => "\x1b[35m",
+            Color::DarkMagenta => "\x1b[38;5;5m",
+            Color::Cyan => "\x1b[36m",
+            Color::DarkCyan => "\x1b[38;5;6m",
+            Color::White => "\x1b[37m",
+            Color::Grey => "\x1b[38;5;7m",
+        }
+    }
+
+    // could reasonably be always present, but only used if feature="alloc"
+    #[cfg(feature="alloc")]
+    const DEFAULT_FG: &'static str = "\x1b[39m";
+
+    #[cfg(feature="alloc")]
+    mod ansi_display_sink {
+        use crate::color_new::{Color, YaxColors};
+        use crate::display::DisplaySink;
+
+        /// adapter to insert ANSI color command sequences in formatted text to style printed
+        /// instructions.
+        ///
+        /// this enables similar behavior as the deprecated [`crate::Colorize`] trait,
+        /// for outputs that can process ANSI color commands.
+        ///
+        /// `AnsiDisplaySink` will silently ignore errors from writes to the underlying `T:
+        /// DisplaySink`. when writing to a string or other growable buffer, errors are likely
+        /// inseparable from `abort()`. when writing to stdout or stderr, write failures likely
+        /// mean output is piped to a process which has closed the pipe but are otherwise harmless.
+        /// `span_start_*` and `span_end_*` don't have error reporting mechanisms in their return
+        /// type, so the only available error mechanism would be to also `abort()`.
+        ///
+        /// if this turns out to be a bad decision, it'll have to be rethought!
+        pub struct AnsiDisplaySink<'sink, T: DisplaySink, Y: YaxColors> {
+            out: &'sink mut T,
+            span_stack: alloc::vec::Vec<Color>,
+            colors: Y
+        }
+
+        impl<'sink, T: DisplaySink, Y: YaxColors> AnsiDisplaySink<'sink, T, Y> {
+            pub fn new(out: &'sink mut T, colors: Y) -> Self {
+                Self {
+                    out,
+                    span_stack: alloc::vec::Vec::new(),
+                    colors,
+                }
+            }
+
+            fn push_color(&mut self, color: Color) {
+                self.span_stack.push(color);
+                let _ = self.out.write_fixed_size(super::color2ansi(color));
+            }
+
+            fn restore_prev_color(&mut self) {
+                let _ = self.span_stack.pop();
+                if let Some(prev_color) = self.span_stack.last() {
+                    let _ = self.out.write_fixed_size(super::color2ansi(*prev_color));
+                } else {
+                    let _ = self.out.write_fixed_size(super::DEFAULT_FG);
+                };
+            }
+        }
+
+        impl<'sink, T: DisplaySink, Y: YaxColors> core::fmt::Write for AnsiDisplaySink<'sink, T, Y> {
+            fn write_str(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+                self.out.write_str(s)
+            }
+            fn write_char(&mut self, c: char) -> Result<(), core::fmt::Error> {
+                self.out.write_char(c)
+            }
+        }
+
+        impl<'sink, T: DisplaySink, Y: YaxColors> DisplaySink for AnsiDisplaySink<'sink, T, Y> {
+            fn span_start_immediate(&mut self) { self.push_color(self.colors.number()); }
+            fn span_end_immediate(&mut self) { self.restore_prev_color() }
+
+            fn span_start_register(&mut self) { self.push_color(self.colors.register()); }
+            fn span_end_register(&mut self) { self.restore_prev_color() }
+
+            // ah.. the right way, currently, to colorize opcodes would be to collect text while in the
+            // opcode span, and request some kind of user-provided decoder ring to translate mnemonics
+            // into the right color. that's very unfortunate. maybe there should be another span for
+            // `opcode_kind(u8)` for impls to report what kind of opcode they'll be emitting..
+            fn span_start_opcode(&mut self) { self.push_color(self.colors.misc_op()); }
+            fn span_end_opcode(&mut self) { self.restore_prev_color() }
+
+            fn span_start_program_counter(&mut self) { self.push_color(self.colors.program_counter()); }
+            fn span_end_program_counter(&mut self) { self.restore_prev_color() }
+
+            fn span_start_number(&mut self) { self.push_color(self.colors.number()); }
+            fn span_end_number(&mut self) { self.restore_prev_color() }
+
+            fn span_start_address(&mut self) { self.push_color(self.colors.address()); }
+            fn span_end_address(&mut self) { self.restore_prev_color() }
+
+            fn span_start_function_expr(&mut self) { self.push_color(self.colors.function()); }
+            fn span_end_function_expr(&mut self) { self.restore_prev_color() }
+        }
+    }
+    #[cfg(feature="alloc")]
+    pub use ansi_display_sink::AnsiDisplaySink;
+}
+
+pub struct DefaultColors;
+
+impl YaxColors for DefaultColors {
+    fn arithmetic_op(&self) -> Color {
+        Color::Yellow
+    }
+    fn stack_op(&self) -> Color {
+        Color::DarkMagenta
+    }
+    fn nop_op(&self) -> Color {
+        Color::DarkBlue
+    }
+    fn stop_op(&self) -> Color {
+        Color::Red
+    }
+    fn control_flow_op(&self) -> Color {
+        Color::DarkGreen
+    }
+    fn data_op(&self) -> Color {
+        Color::Magenta
+    }
+    fn comparison_op(&self) -> Color {
+        Color::DarkYellow
+    }
+    fn invalid_op(&self) -> Color {
+        Color::DarkRed
+    }
+    fn misc_op(&self) -> Color {
+        Color::Cyan
+    }
+    fn platform_op(&self) -> Color {
+        Color::DarkCyan
+    }
+
+    fn register(&self) -> Color {
+        Color::DarkCyan
+    }
+    fn program_counter(&self) -> Color {
+        Color::DarkRed
+    }
+    fn number(&self) -> Color {
+        Color::White
+    }
+    fn zero(&self) -> Color {
+        Color::White
+    }
+    fn one(&self) -> Color {
+        Color::White
+    }
+    fn minus_one(&self) -> Color {
+        Color::White
+    }
+    fn address(&self) -> Color {
+        Color::DarkGreen
+    }
+    fn symbol(&self) -> Color {
+        Color::Green
+    }
+    fn function(&self) -> Color {
+        Color::Green
+    }
+}
diff --git a/src/display.rs b/src/display.rs
index 789919e..754d3e6 100644
--- a/src/display.rs
+++ b/src/display.rs
@@ -1,9 +1,35 @@
+// allow use of deprecated items in this module since some functions using `SignedHexDisplay` still
+// exist here
+#![allow(deprecated)]
+
 use crate::YaxColors;
 
 use core::fmt;
 use core::num::Wrapping;
 use core::ops::Neg;
 
+mod display_sink;
+
+pub use display_sink::{DisplaySink, FmtSink};
+#[cfg(feature = "alloc")]
+pub use display_sink::InstructionTextSink;
+
+/// translate a byte in range `[0, 15]` to a lowercase base-16 digit.
+///
+/// if `c` is in range, the output is always valid as the sole byte in a utf-8 string. if `c` is out
+/// of range, the returned character might not be a valid single-byte utf-8 codepoint.
+#[cfg(feature = "alloc")] // this function is of course not directly related to alloc, but it's only needed by impls that themselves are only present with alloc.
+fn u8_to_hex(c: u8) -> u8 {
+    // this conditional branch is faster than a lookup for... most architectures (especially x86
+    // with cmov)
+    if c < 10 {
+        b'0' + c
+    } else {
+        b'a' + c - 10
+    }
+}
+
+#[deprecated(since="0.3.0", note="format_number_i32 does not optimize as expected and will be removed in the future. see DisplaySink instead.")]
 pub enum NumberStyleHint {
     Signed,
     HexSigned,
@@ -17,36 +43,37 @@ pub enum NumberStyleHint {
     HexUnsignedWithSign
 }
 
-pub fn format_number_i32<W: fmt::Write, Y: YaxColors>(colors: &Y, f: &mut W, i: i32, hint: NumberStyleHint) -> fmt::Result {
+#[deprecated(since="0.3.0", note="format_number_i32 is both slow and incorrect: YaxColors may not result in correct styling when writing anywhere other than a terminal, and both stylin and formatting does not inline as well as initially expected. see DisplaySink instead.")]
+pub fn format_number_i32<W: fmt::Write, Y: YaxColors>(_colors: &Y, f: &mut W, i: i32, hint: NumberStyleHint) -> fmt::Result {
     match hint {
         NumberStyleHint::Signed => {
-            write!(f, "{}", colors.number(i))
+            write!(f, "{}", (i))
         },
         NumberStyleHint::HexSigned => {
-            write!(f, "{}", colors.number(signed_i32_hex(i)))
+            write!(f, "{}", signed_i32_hex(i))
         },
         NumberStyleHint::Unsigned => {
-            write!(f, "{}", colors.number(i as u32))
+            write!(f, "{}", i as u32)
         },
         NumberStyleHint::HexUnsigned => {
-            write!(f, "{}", colors.number(u32_hex(i as u32)))
+            write!(f, "{}", u32_hex(i as u32))
         },
         NumberStyleHint::SignedWithSignSplit => {
             if i == core::i32::MIN {
-                write!(f, "- {}", colors.number("2147483647"))
+                write!(f, "- {}", "2147483647")
             } else if i < 0 {
-                write!(f, "- {}", colors.number(-Wrapping(i)))
+                write!(f, "- {}", -Wrapping(i))
             } else {
-                write!(f, "+ {}", colors.number(i))
+                write!(f, "+ {}", i)
             }
         }
         NumberStyleHint::HexSignedWithSignSplit => {
             if i == core::i32::MIN {
-                write!(f, "- {}", colors.number("0x7fffffff"))
+                write!(f, "- {}", ("0x7fffffff"))
             } else if i < 0 {
-                write!(f, "- {}", colors.number(u32_hex((-Wrapping(i)).0 as u32)))
+                write!(f, "- {}", u32_hex((-Wrapping(i)).0 as u32))
             } else {
-                write!(f, "+ {}", colors.number(u32_hex(i as u32)))
+                write!(f, "+ {}", u32_hex(i as u32))
             }
         },
         NumberStyleHint::HexSignedWithSign => {
@@ -64,6 +91,7 @@ pub fn format_number_i32<W: fmt::Write, Y: YaxColors>(colors: &Y, f: &mut W, i:
     }
 }
 
+#[deprecated(since="0.3.0", note="SignedHexDisplay does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub struct SignedHexDisplay<T: core::fmt::LowerHex + Neg> {
     value: T,
     negative: bool
@@ -79,6 +107,7 @@ impl<T: fmt::LowerHex + Neg + Copy> fmt::Display for SignedHexDisplay<T> where W
     }
 }
 
+#[deprecated(since="0.3.0", note="u8_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn u8_hex(value: u8) -> SignedHexDisplay<i8> {
     SignedHexDisplay {
         value: value as i8,
@@ -86,6 +115,7 @@ pub fn u8_hex(value: u8) -> SignedHexDisplay<i8> {
     }
 }
 
+#[deprecated(since="0.3.0", note="signed_i8_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn signed_i8_hex(imm: i8) -> SignedHexDisplay<i8> {
     SignedHexDisplay {
         value: imm,
@@ -93,6 +123,7 @@ pub fn signed_i8_hex(imm: i8) -> SignedHexDisplay<i8> {
     }
 }
 
+#[deprecated(since="0.3.0", note="u16_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn u16_hex(value: u16) -> SignedHexDisplay<i16> {
     SignedHexDisplay {
         value: value as i16,
@@ -100,6 +131,7 @@ pub fn u16_hex(value: u16) -> SignedHexDisplay<i16> {
     }
 }
 
+#[deprecated(since="0.3.0", note="signed_i16_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn signed_i16_hex(imm: i16) -> SignedHexDisplay<i16> {
     SignedHexDisplay {
         value: imm,
@@ -107,6 +139,7 @@ pub fn signed_i16_hex(imm: i16) -> SignedHexDisplay<i16> {
     }
 }
 
+#[deprecated(since="0.3.0", note="u32_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn u32_hex(value: u32) -> SignedHexDisplay<i32> {
     SignedHexDisplay {
         value: value as i32,
@@ -114,6 +147,7 @@ pub fn u32_hex(value: u32) -> SignedHexDisplay<i32> {
     }
 }
 
+#[deprecated(since="0.3.0", note="signed_i32_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn signed_i32_hex(imm: i32) -> SignedHexDisplay<i32> {
     SignedHexDisplay {
         value: imm,
@@ -121,6 +155,7 @@ pub fn signed_i32_hex(imm: i32) -> SignedHexDisplay<i32> {
     }
 }
 
+#[deprecated(since="0.3.0", note="u64_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn u64_hex(value: u64) -> SignedHexDisplay<i64> {
     SignedHexDisplay {
         value: value as i64,
@@ -128,6 +163,7 @@ pub fn u64_hex(value: u64) -> SignedHexDisplay<i64> {
     }
 }
 
+#[deprecated(since="0.3.0", note="signed_i64_hex does not optimize like expected and will be removed in the future. see DisplaySink instead.")]
 pub fn signed_i64_hex(imm: i64) -> SignedHexDisplay<i64> {
     SignedHexDisplay {
         value: imm,
diff --git a/src/display/display_sink.rs b/src/display/display_sink.rs
new file mode 100644
index 0000000..9aa3c85
--- /dev/null
+++ b/src/display/display_sink.rs
@@ -0,0 +1,1017 @@
+use core::fmt;
+
+// `imp_x86.rs` has `asm!()` macros, and so is not portable at all.
+#[cfg(all(feature="alloc", target_arch = "x86_64"))]
+#[path="./display_sink/imp_x86.rs"]
+mod imp;
+
+// for other architectures, fall back on possibly-slower portable functions.
+#[cfg(all(feature="alloc", not(target_arch = "x86_64")))]
+#[path="./display_sink/imp_generic.rs"]
+mod imp;
+
+
+/// `DisplaySink` allows client code to collect output and minimal markup. this is currently used
+/// in formatting instructions for two reasons:
+/// * `DisplaySink` implementations have the opportunity to collect starts and ends of tokens at
+///   the same time as collecting output itself.
+/// * `DisplaySink` implementations provide specialized functions for writing strings in
+///   circumstances where a simple "use `core::fmt`" might incur unwanted overhead.
+///
+/// ## spans
+///
+/// spans are out-of-band indicators for the meaning of data written to this sink. when a
+/// `span_start_<foo>` function is called, data written until a matching `span_end_<foo>` can be
+/// considered the text corresponding to `<foo>`.
+///
+/// spans are entered and exited in a FILO manner. implementations of `DisplaySink` are explicitly
+/// allowed to depend on this fact. functions writing to a `DisplaySink` must exit spans in reverse
+/// order to when they are entered. a function that has a call sequence like
+/// ```text
+/// sink.span_start_operand();
+/// sink.span_start_immediate();
+/// sink.span_end_operand();
+/// ```
+/// is in error.
+///
+/// spans are reported through the `span_start_*` and `span_end_*` families of functions to avoid
+/// constraining implementations into tracking current output offset (which may not be knowable) or
+/// span size (which may be knowable, but incur additional overhead to compute or track). if the
+/// task for a span is to simply emit VT100 color codes, for example, implementations avoid the
+/// overhead of tracking offsets.
+///
+/// default implementations of the `span_start_*` and `span_end_*` functions are to do nothing. a
+/// no-op `span_start_*` or `span_end_*` allows rustc to eliminate such calls at compile time for
+/// `DisplaySink` that are uninterested in the corresponding span type.
+///
+/// # write helpers (`write_*`)
+///
+/// the `write_*` helpers on `DisplaySink` may be able to take advantage of constraints described in
+/// documentation here to better support writing some kinds of inputs than a fully-general solution
+/// (such as `core::fmt`) might be able to yield.
+///
+/// currently there are two motivating factors for `write_*` helpers:
+///
+/// instruction formatting often involves writing small but variable-size strings, such as register
+/// names, which is something of a pathological case for string appending as Rust currently exists:
+/// this often becomes `memcpy` and specifically a call to the platform's `memcpy` (rather than an
+/// inlined `rep movsb`) just to move 3-5 bytes. one relevant Rust issue for reference:
+/// <https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232>
+///
+/// there are similar papercuts around formatting integers as base-16 numbers, such as
+/// <https://github.com/rust-lang/rust/pull/122770>. in isolation and in most applications these are
+/// not a significant source of overhead. but for programs bounded on decoding and printing
+/// instructions, these can add up to significant overhead - on the order of 10-20% of total
+/// runtime.
+///
+/// ## example
+///
+/// a simple call sequence to `DisplaySink` might look something like:
+/// ```compile_fail
+/// sink.span_start_operand()
+/// sink.write_char('[')
+/// sink.span_start_register()
+/// sink.write_fixed_size("rbp")
+/// sink.span_end_register()
+/// sink.write_char(']')
+/// sink.span_end_operand()
+/// ```
+/// which writes the text `[rbp]`, telling sinks that the operand begins at `[`, ends after `]`,
+/// and `rbp` is a register in that operand.
+///
+/// ## extensibility
+///
+/// additional `span_{start,end}_*` helpers may be added over time - in the above example, one
+/// future addition might be to add a new `effective_address` span that is started before
+/// `register` and ended after `register`. for an operand like `[rbp]` the effective address span
+/// would exactly match a corresponding register span, but in more complicated scenarios like
+/// `[rsp + rdi * 4 + 0x50]` the effective address would be all of `rsp + rdi * 4 + 0x50`.
+///
+/// additional spans are expected to be added as needed. it is not yet clear how support for more
+/// architecture-specific concepts (such as itanium predicate registers) would be added, and so
+/// architecture-specific concepts may be expressed on `DisplaySink` if the need
+/// arises.
+///
+/// new `span_{start,end}_*` helpers will be defaulted as no-op. additions to this trait will be
+/// minor version bumps, so users should take care to not add custom functions starting with
+/// `span_start_` or `span_end_` to structs implementing `DisplaySink`.
+pub trait DisplaySink: fmt::Write {
+    #[inline(always)]
+    fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.write_str(s)
+    }
+
+    /// write a string to this sink that is less than 32 bytes. this is provided for optimization
+    /// opportunities when writing a variable-length string with known max size.
+    ///
+    /// SAFETY: the provided `s` must be less than 32 bytes. if the provided string is longer than
+    /// 31 bytes, implementations may only copy part of a multi-byte codepoint while writing to a
+    /// utf-8 string. this may corrupt Rust strings.
+    unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.write_str(s)
+    }
+    /// write a string to this sink that is less than 16 bytes. this is provided for optimization
+    /// opportunities when writing a variable-length string with known max size.
+    ///
+    /// SAFETY: the provided `s` must be less than 16 bytes. if the provided string is longer than
+    /// 15 bytes, implementations may only copy part of a multi-byte codepoint while writing to a
+    /// utf-8 string. this may corrupt Rust strings.
+    unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.write_str(s)
+    }
+    /// write a string to this sink that is less than 8 bytes. this is provided for optimization
+    /// opportunities when writing a variable-length string with known max size.
+    ///
+    /// SAFETY: the provided `s` must be less than 8 bytes. if the provided string is longer than
+    /// 7 bytes, implementations may only copy part of a multi-byte codepoint while writing to a
+    /// utf-8 string. this may corrupt Rust strings.
+    unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.write_str(s)
+    }
+
+    /// write a u8 to the output as a base-16 integer.
+    ///
+    /// this corresponds to the Rust format specifier `{:x}` - see [`std::fmt::LowerHex`] for more.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_u8(&mut self, v: u8) -> Result<(), core::fmt::Error> {
+        write!(self, "{:x}", v)
+    }
+    /// write a u8 to the output as a base-16 integer with leading `0x`.
+    ///
+    /// this corresponds to the Rust format specifier `{:#x}` - see [`std::fmt::LowerHex`] for more.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_u8(&mut self, v: u8) -> Result<(), core::fmt::Error> {
+        self.write_fixed_size("0x")?;
+        self.write_u8(v)
+    }
+    /// write an i8 to the output as a base-16 integer with leading `0x`, and leading `-` if the
+    /// value is negative.
+    ///
+    /// there is no matching `std` formatter, so some examples here:
+    /// ```text
+    /// sink.write_prefixed_i8(-0x60); // writes `-0x60` to the sink
+    /// sink.write_prefixed_i8(127); // writes `0x7f` to the sink
+    /// sink.write_prefixed_i8(-128); // writes `-0x80` to the sink
+    /// ```
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_i8(&mut self, v: i8) -> Result<(), core::fmt::Error> {
+        let v = if v < 0 {
+            self.write_char('-')?;
+            v.unsigned_abs()
+        } else {
+            v as u8
+        };
+        self.write_prefixed_u8(v)
+    }
+    /// write a u16 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_u16(&mut self, v: u16) -> Result<(), core::fmt::Error> {
+        write!(self, "{:x}", v)
+    }
+    /// write a u16 to the output as a base-16 integer with leading `0x`.
+    ///
+    /// this corresponds to the Rust format specifier `{:#x}` - see [`std::fmt::LowerHex`] for more.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_u16(&mut self, v: u16) -> Result<(), core::fmt::Error> {
+        self.write_fixed_size("0x")?;
+        self.write_u16(v)
+    }
+    /// write an i16 to the output as a base-16 integer with leading `0x`, and leading `-` if the
+    /// value is negative.
+    ///
+    /// there is no matching `std` formatter, so some examples here:
+    /// ```text
+    /// sink.write_prefixed_i16(-0x60); // writes `-0x60` to the sink
+    /// sink.write_prefixed_i16(127); // writes `0x7f` to the sink
+    /// sink.write_prefixed_i16(-128); // writes `-0x80` to the sink
+    /// ```
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_i16(&mut self, v: i16) -> Result<(), core::fmt::Error> {
+        let v = if v < 0 {
+            self.write_char('-')?;
+            v.unsigned_abs()
+        } else {
+            v as u16
+        };
+        self.write_prefixed_u16(v)
+    }
+    /// write a u32 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_u32(&mut self, v: u32) -> Result<(), core::fmt::Error> {
+        write!(self, "{:x}", v)
+    }
+    /// write a u32 to the output as a base-16 integer with leading `0x`.
+    ///
+    /// this corresponds to the Rust format specifier `{:#x}` - see [`std::fmt::LowerHex`] for more.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_u32(&mut self, v: u32) -> Result<(), core::fmt::Error> {
+        self.write_fixed_size("0x")?;
+        self.write_u32(v)
+    }
+    /// write an i32 to the output as a base-16 integer with leading `0x`, and leading `-` if the
+    /// value is negative.
+    ///
+    /// there is no matching `std` formatter, so some examples here:
+    /// ```text
+    /// sink.write_prefixed_i32(-0x60); // writes `-0x60` to the sink
+    /// sink.write_prefixed_i32(127); // writes `0x7f` to the sink
+    /// sink.write_prefixed_i32(-128); // writes `-0x80` to the sink
+    /// ```
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_i32(&mut self, v: i32) -> Result<(), core::fmt::Error> {
+        let v = if v < 0 {
+            self.write_char('-')?;
+            v.unsigned_abs()
+        } else {
+            v as u32
+        };
+        self.write_prefixed_u32(v)
+    }
+    /// write a u64 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_u64(&mut self, v: u64) -> Result<(), core::fmt::Error> {
+        write!(self, "{:x}", v)
+    }
+    /// write a u64 to the output as a base-16 integer with leading `0x`.
+    ///
+    /// this corresponds to the Rust format specifier `{:#x}` - see [`std::fmt::LowerHex`] for more.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_u64(&mut self, v: u64) -> Result<(), core::fmt::Error> {
+        self.write_fixed_size("0x")?;
+        self.write_u64(v)
+    }
+    /// write an i64 to the output as a base-16 integer with leading `0x`, and leading `-` if the
+    /// value is negative.
+    ///
+    /// there is no matching `std` formatter, so some examples here:
+    /// ```text
+    /// sink.write_prefixed_i64(-0x60); // writes `-0x60` to the sink
+    /// sink.write_prefixed_i64(127); // writes `0x7f` to the sink
+    /// sink.write_prefixed_i64(-128); // writes `-0x80` to the sink
+    /// ```
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    fn write_prefixed_i64(&mut self, v: i64) -> Result<(), core::fmt::Error> {
+        let v = if v < 0 {
+            self.write_char('-')?;
+            v.unsigned_abs()
+        } else {
+            v as u64
+        };
+        self.write_prefixed_u64(v)
+    }
+
+    /// enter a region inside which output corresponds to an immediate.
+    fn span_start_immediate(&mut self) { }
+    /// end a region where an immediate was written. see docs on [`DisplaySink`] for more.
+    fn span_end_immediate(&mut self) { }
+
+    /// enter a region inside which output corresponds to a register.
+    fn span_start_register(&mut self) { }
+    /// end a region where a register was written. see docs on [`DisplaySink`] for more.
+    fn span_end_register(&mut self) { }
+
+    /// enter a region inside which output corresponds to an opcode.
+    fn span_start_opcode(&mut self) { }
+    /// end a region where an opcode was written. see docs on [`DisplaySink`] for more.
+    fn span_end_opcode(&mut self) { }
+
+    /// enter a region inside which output corresponds to the program counter.
+    fn span_start_program_counter(&mut self) { }
+    /// end a region where the program counter was written. see docs on [`DisplaySink`] for more.
+    fn span_end_program_counter(&mut self) { }
+
+    /// enter a region inside which output corresponds to a number, such as a memory offset or
+    /// immediate.
+    fn span_start_number(&mut self) { }
+    /// end a region where a number was written. see docs on [`DisplaySink`] for more.
+    fn span_end_number(&mut self) { }
+
+    /// enter a region inside which output corresponds to an address. this is a best guess;
+    /// instructions like x86's `lea` may involve an "address" that is not, and arithmetic
+    /// instructions may operate on addresses held in registers.
+    ///
+    /// where possible, the presence of this span will be informed by ISA semantics - if an
+    /// instruction has a memory operand, the effective address calculation of that operand should
+    /// be in an address span.
+    fn span_start_address(&mut self) { }
+    /// end a region where an address was written. the specifics of an "address" are ambiguous and
+    /// best-effort; see [`DisplaySink::span_start_address`] for more about this. otherwise, see
+    /// docs on [`DisplaySink`] for more about spans.
+    fn span_end_address(&mut self) { }
+
+    /// enter a region inside which output corresponds to a function address, or expression
+    /// evaluating to a function address. this is a best guess; instructions like `call` may call
+    /// to a non-function address, `jmp` may jump to a function (as with tail calls), function
+    /// addresses may be computed via table lookup without semantic hints.
+    ///
+    /// where possible, the presence of this span will be informed by ISA semantics - if an
+    /// instruction is like a "call", an address operand should be a `function` span. if other
+    /// instructions can be expected to handle subroutine starting addresses purely from ISA
+    /// semantics, address operand(s) should be in a `function` span.
+    fn span_start_function_expr(&mut self) { }
+    /// end a region where a function address expression was written. the specifics of a "function
+    /// address" are ambiguous and best-effort; see [`DisplaySink::span_start_function_expr`] for more
+    /// about this. otherwise, see docs on [`DisplaySink`] for more about spans.
+    fn span_end_function_expr(&mut self) { }
+}
+
+/// `FmtSink` can be used to adapt any `fmt::Write`-implementing type into a `DisplaySink` to
+/// format an instruction while discarding all span information at zero cost.
+pub struct FmtSink<'a, T: fmt::Write> {
+    out: &'a mut T,
+}
+
+impl<'a, T: fmt::Write> FmtSink<'a, T> {
+    pub fn new(f: &'a mut T) -> Self {
+        Self { out: f }
+    }
+
+    pub fn inner_ref(&self) -> &T {
+        &self.out
+    }
+}
+
+/// blanket impl that discards all span information, forwards writes to the underlying `fmt::Write`
+/// type.
+impl<'a, T: fmt::Write> DisplaySink for FmtSink<'a, T> { }
+
+impl<'a, T: fmt::Write> fmt::Write for FmtSink<'a, T> {
+    fn write_str(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.out.write_str(s)
+    }
+    fn write_char(&mut self, c: char) -> Result<(), core::fmt::Error> {
+        self.out.write_char(c)
+    }
+    fn write_fmt(&mut self, f: fmt::Arguments) -> Result<(), core::fmt::Error> {
+        self.out.write_fmt(f)
+    }
+}
+
+#[cfg(feature = "alloc")]
+mod instruction_text_sink {
+    use core::fmt;
+
+    use super::{DisplaySink, u8_to_hex};
+
+    /// this is an implementation detail of yaxpeax-arch and related crates. if you are a user of the
+    /// disassemblers, do not use this struct. do not depend on this struct existing. this struct is
+    /// not stable. this struct is not safe for general use. if you use this struct you and your
+    /// program will be eaten by gremlins.
+    ///
+    /// if you are implementing an instruction formatter for the yaxpeax family of crates: this struct
+    /// is guaranteed to contain a string that is long enough to hold a fully-formatted instruction.
+    /// because the buffer is guaranteed to be long enough, writes through `InstructionTextSink` are
+    /// not bounds-checked, and the buffer is never grown.
+    ///
+    /// this is wildly dangerous in general use. the public constructor of `InstructionTextSink` is
+    /// unsafe as a result. as used in `InstructionFormatter`, the buffer is guaranteed to be
+    /// `clear()`ed before use, `InstructionFormatter` ensures the buffer is large enough, *and*
+    /// `InstructionFormatter` never allows `InstructionTextSink` to exist in a context where it would
+    /// be written to without being rewound first.
+    ///
+    /// because this opens a very large hole through which `fmt::Write` can become unsafe, incorrect
+    /// uses of this struct will be hard to debug in general. `InstructionFormatter` is probably at the
+    /// limit of easily-reasoned-about lifecycle of the buffer, which "only" leaves the problem of
+    /// ensuring that instruction formatting impls this buffer is passed to are appropriately sized.
+    ///
+    /// this is intended to be hidden in docs. if you see this in docs, it's a bug.
+    #[doc(hidden)]
+    pub struct InstructionTextSink<'buf> {
+        buf: &'buf mut alloc::string::String
+    }
+
+    impl<'buf> InstructionTextSink<'buf> {
+        /// create an `InstructionTextSink` using the provided buffer for storage.
+        ///
+        /// SAFETY: callers must ensure that this sink will never have more content written than
+        /// this buffer can hold. while the buffer may appear growable, `write_*` methods here may
+        /// *bypass bounds checks* and so will never trigger the buffer to grow. writing more data
+        /// than the buffer's size when provided to `new` will cause out-of-bounds writes and
+        /// memory corruption.
+        pub unsafe fn new(buf: &'buf mut alloc::string::String) -> Self {
+            Self { buf }
+        }
+    }
+
+    impl<'buf> fmt::Write for InstructionTextSink<'buf> {
+        fn write_str(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+            self.buf.write_str(s)
+        }
+        fn write_char(&mut self, c: char) -> Result<(), core::fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + 1 {
+                    panic!("InstructionTextSink::write_char would overflow output");
+                }
+            }
+
+            // SAFETY: `buf` is assumed to be long enough to hold all input, `buf` at `underlying.len()`
+            // is valid for writing, but may be uninitialized.
+            //
+            // this function is essentially equivalent to `Vec::push` specialized for the case that
+            // `len < buf.capacity()`:
+            // https://github.com/rust-lang/rust/blob/be9e27e/library/alloc/src/vec/mod.rs#L1993-L2006
+            unsafe {
+                let underlying = self.buf.as_mut_vec();
+                // `InstructionTextSink::write_char` is only used by yaxpeax-x86, and is only used to
+                // write single ASCII characters. this is wrong in the general case, but `write_char`
+                // here is not going to be used in the general case.
+                if cfg!(debug_assertions) {
+                    if c > '\x7f' {
+                        panic!("InstructionTextSink::write_char would truncate output");
+                    }
+                }
+                let to_push = c as u8;
+                // `ptr::write` here because `underlying.add(underlying.len())` may not point to an
+                // initialized value, which would mean that turning that pointer into a `&mut u8` to
+                // store through would be UB. `ptr::write` avoids taking the mut ref.
+                underlying.as_mut_ptr().offset(underlying.len() as isize).write(to_push);
+                // we have initialized all (one) bytes that `set_len` is increasing the length to
+                // include.
+                underlying.set_len(underlying.len() + 1);
+            }
+            Ok(())
+        }
+    }
+
+    impl<'buf> DisplaySink for InstructionTextSink<'buf> {
+        #[inline(always)]
+        fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_fixed_size would overflow output");
+                }
+            }
+
+            // Safety: we are appending only valid utf8 strings to `self.buf`, as `s` is known to
+            // be valid utf8
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_bytes = s.as_bytes();
+
+            if new_bytes.len() == 0 {
+                return Ok(());
+            }
+
+            unsafe {
+                let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+
+                // this used to be enough to bamboozle llvm away from
+                // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
+                // if `s` is not fixed size. somewhere between Rust 1.68 and Rust 1.74 this stopped
+                // being sufficient, so `write_fixed_size` truly should only be used for fixed size `s`
+                // (otherwise this is a libc memcpy call in disguise). for fixed-size strings this
+                // unrolls into some kind of appropriate series of `mov`.
+                dest.offset(0 as isize).write(new_bytes[0]);
+                for i in 1..new_bytes.len() {
+                    dest.offset(i as isize).write(new_bytes[i]);
+                }
+
+                buf.set_len(buf.len() + new_bytes.len());
+            }
+
+            Ok(())
+        }
+        unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_32 would overflow output");
+                }
+            }
+
+            // Safety: `new` requires callers promise there is enough space to hold `s`.
+            unsafe {
+                super::imp::append_string_lt_32_unchecked(&mut self.buf, s);
+            }
+
+            Ok(())
+        }
+        unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_16 would overflow output");
+                }
+            }
+
+            // Safety: `new` requires callers promise there is enough space to hold `s`.
+            unsafe {
+                super::imp::append_string_lt_16_unchecked(&mut self.buf, s);
+            }
+
+            Ok(())
+        }
+        unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_8 would overflow output");
+                }
+            }
+
+            // Safety: `new` requires callers promise there is enough space to hold `s`.
+            unsafe {
+                super::imp::append_string_lt_8_unchecked(&mut self.buf, s);
+            }
+
+            Ok(())
+        }
+        /// write a u8 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u8(&mut self, mut v: u8) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((8 - v.leading_zeros() + 3) >> 2) as usize;
+
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u8 would overflow output");
+                }
+            }
+
+            // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self.buf`
+            // remains valid utf8
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
+
+            // Safety: there is no way to exit this function without initializing all bytes up to
+            // `new_len`
+            unsafe {
+                buf.set_len(new_len);
+            }
+            // Safety: `new()` requires callers promise there is space through to `new_len`
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+                // to a location valid for writing.
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
+            }
+
+            Ok(())
+        }
+        /// write a u16 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u16(&mut self, mut v: u16) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
+
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((16 - v.leading_zeros() + 3) >> 2) as usize;
+
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u16 would overflow output");
+                }
+            }
+
+            // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self.buf`
+            // remains valid utf8
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
+
+            // Safety: there is no way to exit this function without initializing all bytes up to
+            // `new_len`
+            unsafe {
+                buf.set_len(new_len);
+            }
+            // Safety: `new()` requires callers promise there is space through to `new_len`
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+                // to a location valid for writing.
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
+            }
+
+            Ok(())
+        }
+        /// write a u32 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u32(&mut self, mut v: u32) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
+
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((32 - v.leading_zeros() + 3) >> 2) as usize;
+
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u32 would overflow output");
+                }
+            }
+
+            // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self.buf`
+            // remains valid utf8
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
+
+            // Safety: there is no way to exit this function without initializing all bytes up to
+            // `new_len`
+            unsafe {
+                buf.set_len(new_len);
+            }
+            // Safety: `new()` requires callers promise there is space through to `new_len`
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+                // to a location valid for writing.
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
+            }
+
+            Ok(())
+        }
+        /// write a u64 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u64(&mut self, mut v: u64) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
+
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((64 - v.leading_zeros() + 3) >> 2) as usize;
+
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u64 would overflow output");
+                }
+            }
+
+            // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self.buf`
+            // remains valid utf8
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
+
+            // Safety: there is no way to exit this function without initializing all bytes up to
+            // `new_len`
+            unsafe {
+                buf.set_len(new_len);
+            }
+            // Safety: `new()` requires callers promise there is space through to `new_len`
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+                // to a location valid for writing.
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
+            }
+
+            Ok(())
+        }
+    }
+}
+#[cfg(feature = "alloc")]
+pub use instruction_text_sink::InstructionTextSink;
+
+
+#[cfg(feature = "alloc")]
+use crate::display::u8_to_hex;
+
+/// this [`DisplaySink`] impl exists to support somewhat more performant buffering of the kinds of
+/// strings `yaxpeax-x86` uses in formatting instructions.
+///
+/// span information is discarded at zero cost.
+#[cfg(feature = "alloc")]
+impl DisplaySink for alloc::string::String {
+    #[inline(always)]
+    fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+        self.reserve(s.len());
+        // Safety: we are appending only valid utf8 strings to `self`, as `s` is known to
+        // be valid utf8
+        let buf = unsafe { self.as_mut_vec() };
+        let new_bytes = s.as_bytes();
+
+        if new_bytes.len() == 0 {
+            return Ok(());
+        }
+
+        // Safety: we have reserved space for all `new_bytes` bytes, above.
+        unsafe {
+            let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+
+            // this used to be enough to bamboozle llvm away from
+            // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
+            // if `s` is not fixed size. somewhere between Rust 1.68 and Rust 1.74 this stopped
+            // being sufficient, so `write_fixed_size` truly should only be used for fixed size `s`
+            // (otherwise this is a libc memcpy call in disguise). for fixed-size strings this
+            // unrolls into some kind of appropriate series of `mov`.
+            dest.offset(0 as isize).write(new_bytes[0]);
+            for i in 1..new_bytes.len() {
+                dest.offset(i as isize).write(new_bytes[i]);
+            }
+
+            // Safety: we have initialized all bytes from where `self` initially ended, through to
+            // all `new_bytes` additional elements.
+            buf.set_len(buf.len() + new_bytes.len());
+        }
+
+        Ok(())
+    }
+    unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
+        self.reserve(s.len());
+
+        // Safety: we have reserved enough space for `s`.
+        unsafe {
+            imp::append_string_lt_32_unchecked(self, s);
+        }
+
+        Ok(())
+    }
+    unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
+        self.reserve(s.len());
+
+        // Safety: we have reserved enough space for `s`.
+        unsafe {
+            imp::append_string_lt_16_unchecked(self, s);
+        }
+
+        Ok(())
+    }
+    unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
+        self.reserve(s.len());
+
+        // Safety: we have reserved enough space for `s`.
+        unsafe {
+            imp::append_string_lt_8_unchecked(self, s);
+        }
+
+        Ok(())
+    }
+    /// write a u8 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    #[inline(always)]
+    fn write_u8(&mut self, mut v: u8) -> Result<(), core::fmt::Error> {
+        if v == 0 {
+            return self.write_fixed_size("0");
+        }
+        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+        // means we can write directly into the correct offsets of the output string.
+        let printed_size = ((8 - v.leading_zeros() + 3) >> 2) as usize;
+
+        self.reserve(printed_size);
+
+        // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self`
+        // remains valid utf8
+        let buf = unsafe { self.as_mut_vec() };
+        let new_len = buf.len() + printed_size;
+
+        // Safety: there is no way to exit this function without initializing all bytes up to
+        // `new_len`
+        unsafe {
+            buf.set_len(new_len);
+        }
+        // Safety: we have reserved space through to `new_len` by calling `reserve` above.
+        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+        loop {
+            let digit = v % 16;
+            let c = u8_to_hex(digit as u8);
+            // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+            // to a location valid for writing.
+            unsafe {
+                p = p.offset(-1);
+                p.write(c);
+            }
+            v = v / 16;
+            if v == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+    /// write a u16 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    #[inline(always)]
+    fn write_u16(&mut self, mut v: u16) -> Result<(), core::fmt::Error> {
+        if v == 0 {
+            return self.write_fixed_size("0");
+        }
+        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+        // means we can write directly into the correct offsets of the output string.
+        let printed_size = ((16 - v.leading_zeros() + 3) >> 2) as usize;
+
+        self.reserve(printed_size);
+
+        // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self`
+        // remains valid utf8
+        let buf = unsafe { self.as_mut_vec() };
+        let new_len = buf.len() + printed_size;
+
+        // Safety: there is no way to exit this function without initializing all bytes up to
+        // `new_len`
+        unsafe {
+            buf.set_len(new_len);
+        }
+        // Safety: we have reserved space through to `new_len` by calling `reserve` above.
+        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+        loop {
+            let digit = v % 16;
+            let c = u8_to_hex(digit as u8);
+            // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+            // to a location valid for writing.
+            unsafe {
+                p = p.offset(-1);
+                p.write(c);
+            }
+            v = v / 16;
+            if v == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+    /// write a u32 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    #[inline(always)]
+    fn write_u32(&mut self, mut v: u32) -> Result<(), core::fmt::Error> {
+        if v == 0 {
+            return self.write_fixed_size("0");
+        }
+        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+        // means we can write directly into the correct offsets of the output string.
+        let printed_size = ((32 - v.leading_zeros() + 3) >> 2) as usize;
+
+        self.reserve(printed_size);
+
+        // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self`
+        // remains valid utf8
+        let buf = unsafe { self.as_mut_vec() };
+        let new_len = buf.len() + printed_size;
+
+        // Safety: there is no way to exit this function without initializing all bytes up to
+        // `new_len`
+        unsafe {
+            buf.set_len(new_len);
+        }
+        // Safety: we have reserved space through to `new_len` by calling `reserve` above.
+        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+        loop {
+            let digit = v % 16;
+            let c = u8_to_hex(digit as u8);
+            // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+            // to a location valid for writing.
+            unsafe {
+                p = p.offset(-1);
+                p.write(c);
+            }
+            v = v / 16;
+            if v == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+    /// write a u64 to the output as a base-16 integer.
+    ///
+    /// this is provided for optimization opportunities when the formatted integer can be written
+    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+    /// followup step)
+    #[inline(always)]
+    fn write_u64(&mut self, mut v: u64) -> Result<(), core::fmt::Error> {
+        if v == 0 {
+            return self.write_fixed_size("0");
+        }
+        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+        // means we can write directly into the correct offsets of the output string.
+        let printed_size = ((64 - v.leading_zeros() + 3) >> 2) as usize;
+
+        self.reserve(printed_size);
+
+        // Safety: the only bytes appended are hex digits from `u8_to_hex`, so `self`
+        // remains valid utf8
+        let buf = unsafe { self.as_mut_vec() };
+        let new_len = buf.len() + printed_size;
+
+        // Safety: there is no way to exit this function without initializing all bytes up to
+        // `new_len`
+        unsafe {
+            buf.set_len(new_len);
+        }
+        // Safety: we have reserved space through to `new_len` by calling `reserve` above.
+        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+        loop {
+            let digit = v % 16;
+            let c = u8_to_hex(digit as u8);
+            // Safety: `p` will not move before `buf`'s length at function entry, so `p` points
+            // to a location valid for writing.
+            unsafe {
+                p = p.offset(-1);
+                p.write(c);
+            }
+            v = v / 16;
+            if v == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/display/display_sink/imp_generic.rs b/src/display/display_sink/imp_generic.rs
new file mode 100644
index 0000000..8819243
--- /dev/null
+++ b/src/display/display_sink/imp_generic.rs
@@ -0,0 +1,26 @@
+/// append `data` to `buf`, assuming `data` is less than 8 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure that `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_8_unchecked(buf: &mut alloc::string::String, data: &str) {
+    buf.push_str(data); // safe append: `push_str` grows `buf` itself if space is short
+}
+
+/// append `data` to `buf`, assuming `data` is less than 16 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure that `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_16_unchecked(buf: &mut alloc::string::String, data: &str) {
+    buf.push_str(data); // safe append: `push_str` grows `buf` itself if space is short
+}
+
+/// append `data` to `buf`, assuming `data` is less than 32 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure that `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_32_unchecked(buf: &mut alloc::string::String, data: &str) {
+    buf.push_str(data); // safe append: `push_str` grows `buf` itself if space is short
+}
diff --git a/src/display/display_sink/imp_x86.rs b/src/display/display_sink/imp_x86.rs
new file mode 100644
index 0000000..902ea69
--- /dev/null
+++ b/src/display/display_sink/imp_x86.rs
@@ -0,0 +1,187 @@
+//! `imp_x86` has specialized copies to append short strings to strings. buffer sizing must be
+//! handled by callers, in all cases.
+//!
+//! the structure of all implementations here is, essentially, to take the size of the data to
+//! append and execute a copy for each bit set in that size, from highest to lowest. some bits are
+//! simply never checked if the input is promised to never be that large - if a string to append is
+//! only 0..7 bytes long, it is sufficient to only look at the low three bits to copy all bytes.
+//!
+//! in this way, it is slightly more efficient to right-size which append function is used, if the
+//! maximum size of input strings can be bounded well. if the maximum size of input strings cannot
+//! be bounded, you shouldn't be using these functions.
+
+/// append `data` to `buf`, assuming `data` is less than 8 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure `data.len() < 8` and `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_8_unchecked(buf: &mut alloc::string::String, data: &str) {
+    // Safety: only the bytes of `data` are appended to `buf`, and `data` is a `&str`, so it is
+    // known to be valid utf8
+    let buf = unsafe { buf.as_mut_vec() };
+    let new_bytes = data.as_bytes();
+
+    unsafe {
+        let dest = buf.as_mut_ptr().offset(buf.len() as isize); // one past the current contents
+        let src = new_bytes.as_ptr();
+
+        let rem = new_bytes.len() as isize;
+
+        // set_len early because there is no way to avoid the following asm!() writing that
+        // same number of bytes into buf (given `data.len() < 8`: at most 4 + 2 + 1 are copied)
+        buf.set_len(buf.len() + new_bytes.len());
+
+        core::arch::asm!(
+            "8:",
+            "cmp {rem:e}, 4", // four or more bytes left?
+            "jb 9f",
+            "mov {buf:e}, dword ptr [{src} + {rem} - 4]", // copy the last four remaining bytes
+            "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+            "sub {rem:e}, 4",
+            "jz 11f", // nothing left to copy
+            "9:",
+            "cmp {rem:e}, 2", // two or more bytes left?
+            "jb 10f",
+            "mov {buf:x}, word ptr [{src} + {rem} - 2]", // copy the last two remaining bytes
+            "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+            "sub {rem:e}, 2",
+            "jz 11f", // nothing left to copy
+            "10:",
+            "cmp {rem:e}, 1", // one byte left?
+            "jb 11f",
+            "mov {buf:l}, byte ptr [{src} + {rem} - 1]", // copy the final byte
+            "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+            "11:",
+            src = in(reg) src,
+            dest = in(reg) dest,
+            rem = inout(reg) rem => _, // counted down by the asm; final value is discarded
+            buf = out(reg) _, // scratch register for the copy (not the `buf` vec above)
+            options(nostack),
+        );
+    }
+}
+
+/// append `data` to `buf`, assuming `data` is less than 16 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure `data.len() < 16` and `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_16_unchecked(buf: &mut alloc::string::String, data: &str) {
+    // Safety: only the bytes of `data` are appended to `buf`, and `data` is a `&str`, so it is
+    // known to be valid utf8
+    let buf = unsafe { buf.as_mut_vec() };
+    let new_bytes = data.as_bytes();
+
+    unsafe {
+        let dest = buf.as_mut_ptr().offset(buf.len() as isize); // one past the current contents
+        let src = new_bytes.as_ptr();
+
+        let rem = new_bytes.len() as isize;
+
+        // set_len early because there is no way to avoid the following asm!() writing that
+        // same number of bytes into buf (given `data.len() < 16`: at most 8 + 4 + 2 + 1)
+        buf.set_len(buf.len() + new_bytes.len());
+
+        core::arch::asm!(
+            "7:",
+            "cmp {rem:e}, 8", // eight or more bytes left?
+            "jb 8f",
+            "mov {buf:r}, qword ptr [{src} + {rem} - 8]", // 64-bit register — TODO confirm x86_64-only
+            "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+            "sub {rem:e}, 8",
+            "jz 11f", // nothing left to copy
+            "8:",
+            "cmp {rem:e}, 4", // four or more bytes left?
+            "jb 9f",
+            "mov {buf:e}, dword ptr [{src} + {rem} - 4]", // copy the last four remaining bytes
+            "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+            "sub {rem:e}, 4",
+            "jz 11f", // nothing left to copy
+            "9:",
+            "cmp {rem:e}, 2", // two or more bytes left?
+            "jb 10f",
+            "mov {buf:x}, word ptr [{src} + {rem} - 2]", // copy the last two remaining bytes
+            "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+            "sub {rem:e}, 2",
+            "jz 11f", // nothing left to copy
+            "10:",
+            "cmp {rem:e}, 1", // one byte left?
+            "jb 11f",
+            "mov {buf:l}, byte ptr [{src} + {rem} - 1]", // copy the final byte
+            "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+            "11:",
+            src = in(reg) src,
+            dest = in(reg) dest,
+            rem = inout(reg) rem => _, // counted down by the asm; final value is discarded
+            buf = out(reg) _, // scratch register for the copy (not the `buf` vec above)
+            options(nostack),
+        );
+    }
+}
+
+/// append `data` to `buf`, assuming `data` is less than 32 bytes and that `buf` has enough space
+/// remaining to hold all bytes in `data`.
+///
+/// Safety: callers must ensure `data.len() < 32` and `buf.capacity() - buf.len() >= data.len()`.
+#[inline(always)]
+pub unsafe fn append_string_lt_32_unchecked(buf: &mut alloc::string::String, data: &str) {
+    // Safety: only the bytes of `data` are appended to `buf`, and `data` is a `&str`, so it is
+    // known to be valid utf8
+    let buf = unsafe { buf.as_mut_vec() };
+    let new_bytes = data.as_bytes();
+
+    unsafe {
+        let dest = buf.as_mut_ptr().offset(buf.len() as isize); // one past the current contents
+        let src = new_bytes.as_ptr();
+
+        let rem = new_bytes.len() as isize;
+
+        // set_len early because there is no way to avoid the following asm!() writing that
+        // same number of bytes into buf (given `data.len() < 32`: at most 16 + 8 + 4 + 2 + 1)
+        buf.set_len(buf.len() + new_bytes.len());
+
+        core::arch::asm!(
+            "6:",
+            "cmp {rem:e}, 16", // sixteen or more bytes left?
+            "jb 7f",
+            "mov {buf:r}, qword ptr [{src} + {rem} - 16]", // last sixteen bytes, as two qword copies
+            "mov qword ptr [{dest} + {rem} - 16], {buf:r}",
+            "mov {buf:r}, qword ptr [{src} + {rem} - 8]", // 64-bit register — TODO confirm x86_64-only
+            "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+            "sub {rem:e}, 16",
+            "jz 11f", // nothing left to copy
+            "7:",
+            "cmp {rem:e}, 8", // eight or more bytes left?
+            "jb 8f",
+            "mov {buf:r}, qword ptr [{src} + {rem} - 8]", // copy the last eight remaining bytes
+            "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+            "sub {rem:e}, 8",
+            "jz 11f", // nothing left to copy
+            "8:",
+            "cmp {rem:e}, 4", // four or more bytes left?
+            "jb 9f",
+            "mov {buf:e}, dword ptr [{src} + {rem} - 4]", // copy the last four remaining bytes
+            "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+            "sub {rem:e}, 4",
+            "jz 11f", // nothing left to copy
+            "9:",
+            "cmp {rem:e}, 2", // two or more bytes left?
+            "jb 10f",
+            "mov {buf:x}, word ptr [{src} + {rem} - 2]", // copy the last two remaining bytes
+            "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+            "sub {rem:e}, 2",
+            "jz 11f", // nothing left to copy
+            "10:",
+            "cmp {rem:e}, 1", // one byte left?
+            "jb 11f",
+            "mov {buf:l}, byte ptr [{src} + {rem} - 1]", // copy the final byte
+            "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+            "11:",
+            src = in(reg) src,
+            dest = in(reg) dest,
+            rem = inout(reg) rem => _, // counted down by the asm; final value is discarded
+            buf = out(reg) _, // scratch register for the copy (not the `buf` vec above)
+            options(nostack),
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7aaba21..a0c237b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,12 +1,14 @@
 #![no_std]
 #![doc = include_str!("../README.md")]
 
+#[cfg(feature = "alloc")]
+extern crate alloc;
+
 use core::fmt::{self, Debug, Display};
 use core::hash::Hash;
 
 #[cfg(feature="use-serde")]
 #[macro_use] extern crate serde_derive;
-
 #[cfg(feature="use-serde")]
 use serde::{Serialize, Deserialize};
 
@@ -18,19 +20,25 @@ pub use address::AddrParse;
 
 pub mod annotation;
 
+#[deprecated(since="0.3.0", note="yaxpeax_arch::color conflates output mechanism and styling, leaving it brittle and overly-restrictive. see `yaxpeax_arch::color_new`, which will replace `color` in a future version.")]
 mod color;
+#[allow(deprecated)] // allow exporting the deprecated items here to not break downstreams even further...
 pub use color::{Colorize, NoColors, YaxColors};
-
-#[cfg(feature="colors")]
-pub use color::ColorSettings;
+#[cfg(feature="color-new")]
+pub mod color_new;
 
 pub mod display;
+
 mod reader;
 pub use reader::{Reader, ReaderBuilder, ReadError, U8Reader, U16le, U16be, U32le, U32be, U64le, U64be};
 
+pub mod safer_unchecked;
+
+pub mod testkit;
+
 /// the minimum set of errors a `yaxpeax-arch` disassembler may produce.
 ///
-/// it is permissible for an implementor of `DecodeError` to have items that return `false` for
+/// it is permissible for an implementer of `DecodeError` to have items that return `false` for
 /// all these functions; decoders are permitted to error in way that `yaxpeax-arch` does not know
 /// about.
 pub trait DecodeError: PartialEq + Display + Debug + Send + Sync + 'static {
@@ -42,12 +50,12 @@ pub trait DecodeError: PartialEq + Display + Debug + Send + Sync + 'static {
     /// generally indicate an issue with the instruction itself. this is in contrast to one
     /// specific operand being invalid for the instruction, or some other issue to do with decoding
     /// data beyond the top-level instruction. the "opcode"/"operand" distinction is often fuzzy
-    /// and left as best-effort for decoder implementors.
+    /// and left as best-effort for decoder implementers.
     fn bad_opcode(&self) -> bool;
     /// did the decoder error because an operand of the instruction to decode is invalid?
     ///
     /// similar to [`DecodeError::bad_opcode`], this is a subjective distinction and best-effort on
-    /// the part of implementors.
+    /// the part of implementers.
     fn bad_operand(&self) -> bool;
     /// a human-friendly description of this decode error.
     fn description(&self) -> &'static str;
@@ -127,6 +135,7 @@ impl DecodeError for StandardPartialDecoderError {
     }
 }
 
+/*
 #[derive(Copy, Clone)]
 struct NoDescription {}
 
@@ -135,6 +144,7 @@ impl fmt::Display for NoDescription {
         Ok(())
     }
 }
+*/
 
 /// an interface to decode [`Arch::Instruction`] words from a reader of [`Arch::Word`]s. errors are
 /// the architecture-defined [`DecodeError`] implemention.
@@ -152,7 +162,7 @@ pub trait Decoder<A: Arch + ?Sized> {
     /// SAFETY:
     ///
     /// while `inst` MUST be left in a state that does not violate Rust's safety guarantees,
-    /// implementors are NOT obligated to leave `inst` in a semantically meaningful state if
+    /// implementers are NOT obligated to leave `inst` in a semantically meaningful state if
     /// decoding fails. if `decode_into` returns an error, callers may find contradictory and
     /// useless information in `inst`, as well as *stale data* from whatever was passed in.
     fn decode_into<T: Reader<A::Address, A::Word>>(&self, inst: &mut A::Instruction, words: &mut T) -> Result<(), A::DecodeError>;
@@ -227,6 +237,8 @@ pub trait Instruction {
     fn well_defined(&self) -> bool;
 }
 
+#[allow(deprecated)]
+#[deprecated(since="0.3.0", note="ShowContextual ties YaxColors and fmt::Write in a way that only sometimes composes. simultaneously, it is too generic on Ctx, making it difficult to implement and use. it will be revisited in the future.")]
 pub trait ShowContextual<Addr, Ctx: ?Sized, T: fmt::Write, Y: YaxColors> {
     fn contextualize(&self, colors: &Y, address: Addr, context: Option<&Ctx>, out: &mut T) -> fmt::Result;
 }
diff --git a/src/reader.rs b/src/reader.rs
index 028d835..8b68486 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -24,8 +24,9 @@ pub enum ReadError {
 /// isn't a multiple of 8 bits, `U8Reader` won't be sufficient.
 pub trait Reader<Address, Item> {
     fn next(&mut self) -> Result<Item, ReadError>;
-    /// read `buf`-many items from this reader in bulk. if `Reader` cannot read `buf`-many items,
-    /// return `ReadError::ExhaustedInput`.
+    /// read `buf`-many items from this reader in bulk.
+    ///
+    /// if `Reader` cannot read `buf`-many items, return `ReadError::ExhaustedInput`.
     fn next_n(&mut self, buf: &mut [Item]) -> Result<(), ReadError>;
     /// mark the current position as where to measure `offset` against.
     fn mark(&mut self);
diff --git a/src/safer_unchecked.rs b/src/safer_unchecked.rs
new file mode 100644
index 0000000..b556a6f
--- /dev/null
+++ b/src/safer_unchecked.rs
@@ -0,0 +1,40 @@
+//! tools to help validate correct use of `unchecked` functions.
+//!
+//! when the `debug_assertions` config is present, these `kinda_unchecked` functions use
+//! equivalent implementations that panic if invariants are violated; otherwise they use the
+//! corresponding `*_unchecked` functions.
+//!
+//! for example, `GetSaferUnchecked` uses a normal index when debug assertions are enabled, but
+//! `.get_unchecked()` otherwise. this means that tests and even fuzzing can be made to exercise
+//! panic-on-error cases as desired.
+
+use core::slice::SliceIndex;
+
+pub trait GetSaferUnchecked<T> { // lookup meant to be bounds-checked only under debug assertions
+    unsafe fn get_kinda_unchecked<I>(&self, index: I) -> &<I as SliceIndex<[T]>>::Output
+    where
+        I: SliceIndex<[T]>;
+}
+
+impl<T> GetSaferUnchecked<T> for [T] {
+    #[inline(always)]
+    unsafe fn get_kinda_unchecked<I>(&self, index: I) -> &I::Output
+    where
+        I: SliceIndex<[T]>,
+    {
+        // ordinary (panicking) indexing with debug assertions on, `get_unchecked` otherwise
+        match cfg!(debug_assertions) {
+            true => &self[index],
+            false => self.get_unchecked(index),
+        }
+    }
+}
+
+#[inline(always)]
+pub unsafe fn unreachable_kinda_unchecked() -> ! {
+    // without debug assertions this is purely an optimizer hint; with them, fail loudly
+    if !cfg!(debug_assertions) {
+        core::hint::unreachable_unchecked()
+    }
+    panic!("UB: Unreachable unchecked was executed")
+}
diff --git a/src/testkit.rs b/src/testkit.rs
new file mode 100644
index 0000000..215a062
--- /dev/null
+++ b/src/testkit.rs
@@ -0,0 +1,10 @@
+//! utilities to validate that implementations of traits in `yaxpeax-arch` uphold requirements
+//! described in this crate.
+//!
+//! currently, this only includes tools to validate correct use of
+//! [`crate::display::DisplaySink`], but may grow in the future.
+
+#[cfg(feature="alloc")]
+mod display;
+#[cfg(feature="alloc")]
+pub use display::{DisplaySinkValidator, DisplaySinkWriteComparator};
diff --git a/src/testkit/display.rs b/src/testkit/display.rs
new file mode 100644
index 0000000..3cef59c
--- /dev/null
+++ b/src/testkit/display.rs
@@ -0,0 +1,192 @@
+//! tools to test the correctness of `yaxpeax-arch` trait implementations.
+
+use core::fmt;
+use core::fmt::Write;
+
+use crate::display::DisplaySink;
+
+/// `DisplaySinkValidator` is a `DisplaySink` that panics if invariants required of
+/// `DisplaySink`-writing functions are not upheld.
+///
+/// there are two categories of invariants that `DisplaySinkValidator` validates.
+///
+/// first, this panics if spans are not `span_end_*`-ed in first-in-last-out order with
+/// corresponding `span_start_*`. second, this panics if `write_lt_*` functions are ever provided
+/// inputs longer than the corresponding maximum length.
+///
+/// functions that write to a `DisplaySink` are strongly encouraged to come with fuzzing which
+/// checks that `DisplaySinkValidator` does not panic for any input.
+pub struct DisplaySinkValidator {
+    spans: alloc::vec::Vec<&'static str>, // names of the currently-open spans, innermost last
+}
+
+impl DisplaySinkValidator {
+    /// create a validator with an empty span stack; entries are pushed and popped as the
+    /// `span_start_*`/`span_end_*` functions are called.
+    pub fn new() -> Self { DisplaySinkValidator { spans: alloc::vec::Vec::new() } }
+}
+
+impl core::ops::Drop for DisplaySinkValidator {
+    /// every span that was started must have been ended by the time the validator is dropped.
+    fn drop(&mut self) {
+        let all_closed = self.spans.is_empty();
+        assert!(all_closed, "DisplaySinkValidator dropped with open spans");
+    }
+}
+
+impl fmt::Write for DisplaySinkValidator {
+    fn write_str(&mut self, _: &str) -> fmt::Result {
+        Ok(()) // written text is accepted and discarded
+    }
+    fn write_char(&mut self, _: char) -> fmt::Result {
+        Ok(()) // likewise for single characters
+    }
+}
+
+impl DisplaySink for DisplaySinkValidator {
+    unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
+        assert!(
+            s.len() < 32,
+            "DisplaySinkValidator::write_lt_32 was given a string longer than the maximum permitted length"
+        );
+        self.write_str(s)
+    }
+    unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
+        assert!(
+            s.len() < 16,
+            "DisplaySinkValidator::write_lt_16 was given a string longer than the maximum permitted length"
+        );
+        self.write_str(s)
+    }
+    unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
+        assert!(
+            s.len() < 8,
+            "DisplaySinkValidator::write_lt_8 was given a string longer than the maximum permitted length"
+        );
+        self.write_str(s)
+    }
+
+    fn span_start_immediate(&mut self) {
+        self.spans.push("immediate");
+    }
+
+    fn span_end_immediate(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "immediate");
+    }
+
+    fn span_start_register(&mut self) {
+        self.spans.push("register");
+    }
+
+    fn span_end_register(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "register");
+    }
+
+    fn span_start_opcode(&mut self) {
+        self.spans.push("opcode");
+    }
+
+    fn span_end_opcode(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "opcode");
+    }
+
+    fn span_start_program_counter(&mut self) {
+        self.spans.push("program counter");
+    }
+
+    fn span_end_program_counter(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "program counter");
+    }
+
+    fn span_start_number(&mut self) {
+        self.spans.push("number");
+    }
+
+    fn span_end_number(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "number");
+    }
+
+    fn span_start_address(&mut self) {
+        self.spans.push("address");
+    }
+
+    fn span_end_address(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "address");
+    }
+
+    fn span_start_function_expr(&mut self) {
+        self.spans.push("function expr");
+    }
+
+    fn span_end_function_expr(&mut self) {
+        let innermost = self.spans.pop().expect("item to pop");
+        assert_eq!(innermost, "function expr");
+    }
+}
+
+/// `DisplaySinkWriteComparator` helps test that two `DisplaySink` implementations which should
+/// produce the same output actually do.
+///
+/// this is most useful for cases like testing specialized `write_lt_*` functions, which ought to
+/// behave the same as if `write_str()` were called instead and so can be used as a very simple
+/// oracle.
+///
+/// this is somewhat less useful when the sinks are expected to produce unequal text, such as when
+/// one sink writes ANSI color sequences and the other does not.
+pub struct DisplaySinkWriteComparator<'sinks, T: DisplaySink, U: DisplaySink> {
+    sink1: &'sinks mut T,
+    sink1_check: fn(&T) -> &str, // yields the text of `sink1` that gets compared
+    sink2: &'sinks mut U,
+    sink2_check: fn(&U) -> &str, // yields the text of `sink2` that gets compared
+}
+
+impl<'sinks, T: DisplaySink, U: DisplaySink> DisplaySinkWriteComparator<'sinks, T, U> {
+    /// pair two sinks, each with the function that yields the text to compare for that sink.
+    /// both sinks stay mutably borrowed for as long as the comparator lives.
+    pub fn new(
+        t: &'sinks mut T, t_check: fn(&T) -> &str,
+        u: &'sinks mut U, u_check: fn(&U) -> &str
+    ) -> Self {
+        DisplaySinkWriteComparator { sink1: t, sink1_check: t_check, sink2: u, sink2_check: u_check }
+    }
+
+    /// panic if the text yielded for the first sink differs from the text yielded for the
+    /// second sink.
+    fn compare_sinks(&self) {
+        let left = (self.sink1_check)(self.sink1);
+        let right = (self.sink2_check)(self.sink2);
+
+        assert!(
+            left == right,
+            "sinks produced different output: {} != {}", left, right
+        );
+    }
+}
+
+impl<'sinks, T: DisplaySink, U: DisplaySink> DisplaySink for DisplaySinkWriteComparator<'sinks, T, U> {
+    fn write_u8(&mut self, v: u8) -> Result<(), fmt::Error> {
+        self.sink1.write_u8(v).expect("write to sink1 succeeds"); // a failed write panics here
+        self.sink2.write_u8(v).expect("write to sink2 succeeds");
+        self.compare_sinks(); // panics if the text of the two sinks differs
+        Ok(())
+    }
+}
+
+impl<'sinks, T: DisplaySink, U: DisplaySink> fmt::Write for DisplaySinkWriteComparator<'sinks, T, U> {
+    fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> {
+        self.sink1.write_str(s).expect("write to sink1 succeeds"); // a failed write panics here
+        self.sink2.write_str(s).expect("write to sink2 succeeds");
+        Ok(()) // NOTE(review): no `compare_sinks` call here, unlike `write_u8` — confirm intended
+    }
+    fn write_char(&mut self, c: char) -> Result<(), fmt::Error> {
+        self.sink1.write_char(c).expect("write to sink1 succeeds"); // a failed write panics here
+        self.sink2.write_char(c).expect("write to sink2 succeeds");
+        Ok(()) // NOTE(review): no `compare_sinks` call here, unlike `write_u8` — confirm intended
+    }
+}