From 2bbeeec8cf26c1b165cdc5e6548b28bbc3c1d6a3 Mon Sep 17 00:00:00 2001
From: iximeow <me@iximeow.net>
Date: Sat, 22 Jun 2024 12:26:04 -0700
Subject: be more careful about what does and doesn't need alloc

---
 goodfile                    |   1 +
 src/display.rs              |   5 +-
 src/display/display_sink.rs | 896 ++++++++++++++++++++++----------------------
 src/lib.rs                  |   1 -
 tests/display.rs            |  17 +
 5 files changed, 477 insertions(+), 443 deletions(-)

diff --git a/goodfile b/goodfile
index 51a1399..5264b18 100644
--- a/goodfile
+++ b/goodfile
@@ -5,6 +5,7 @@ Step.push("build")
 Build.run({"cargo", "build"})
 
 Step.advance("test")
+-- TODO: set `-D warnings` here and below...
 Build.run({"cargo", "test"}, {name="test default features"})
 
 -- `cargo test` ends up running doc tests. great! but yaxpeax-arch's docs reference items in std only.
diff --git a/src/display.rs b/src/display.rs
index 77f6ba9..3965bdc 100644
--- a/src/display.rs
+++ b/src/display.rs
@@ -10,12 +10,15 @@ use core::ops::Neg;
 
 mod display_sink;
 
-pub use display_sink::{DisplaySink, FmtSink, InstructionTextSink};
+pub use display_sink::{DisplaySink, FmtSink};
+#[cfg(feature = "alloc")]
+pub use display_sink::InstructionTextSink;
 
 /// translate a byte in range `[0, 15]` to a lowercase base-16 digit.
 ///
 /// if `c` is in range, the output is always valid as the sole byte in a utf-8 string. if `c` is out
 /// of range, the returned character might not be a valid single-byte utf-8 codepoint.
+#[cfg(feature = "alloc")] // this function is of course not directly related to alloc, but it's only needed by impls that themselves are only present with alloc.
 fn u8_to_hex(c: u8) -> u8 {
     // this conditional branch is faster than a lookup for... most architectures (especially x86
     // with cmov)
diff --git a/src/display/display_sink.rs b/src/display/display_sink.rs
index 418b6aa..1fb8837 100644
--- a/src/display/display_sink.rs
+++ b/src/display/display_sink.rs
@@ -1,9 +1,5 @@
 use core::fmt;
 
-use crate::display::u8_to_hex;
-
-use crate::safer_unchecked::unreachable_kinda_unchecked;
-
 /// `DisplaySink` allows client code to collect output and minimal markup. this is currently used
 /// in formatting instructions for two reasons:
 /// * `DisplaySink` implementations have the opportunity to collect starts and ends of tokens at
@@ -372,469 +368,518 @@ impl<'a, T: fmt::Write> fmt::Write for FmtSink<'a, T> {
     }
 }
 
-/// this is an implementation detail of yaxpeax-arch and related crates. if you are a user of the
-/// disassemblers, do not use this struct. do not depend on this struct existing. this struct is
-/// not stable. this struct is not safe for general use. if you use this struct you and your
-/// program will be eaten by gremlins.
-///
-/// if you are implementing an instruction formatter for the yaxpeax family of crates: this struct
-/// is guaranteed to contain a string that is long enough to hold a fully-formatted instruction.
-/// because the buffer is guaranteed to be long enough, writes through `InstructionTextSink` are
-/// not bounds-checked, and the buffer is never grown.
-///
-/// this is wildly dangerous in general use. the public constructor of `InstructionTextSink` is
-/// unsafe as a result. as used in `InstructionFormatter`, the buffer is guaranteed to be
-/// `clear()`ed before use, `InstructionFormatter` ensures the buffer is large enough, *and*
-/// `InstructionFormatter` never allows `InstructionTextSink` to exist in a context where it would
-/// be written to without being rewound first.
-///
-/// because this opens a very large hole through which `fmt::Write` can become unsafe, incorrect
-/// uses of this struct will be hard to debug in general. `InstructionFormatter` is probably at the
-/// limit of easily-reasoned-about lifecycle of the buffer, which "only" leaves the problem of
-/// ensuring that instruction formatting impls this buffer is passed to are appropriately sized.
-///
-/// this is intended to be hidden in docs. if you see this in docs, it's a bug.
-#[doc(hidden)]
-pub struct InstructionTextSink<'buf> {
-    buf: &'buf mut alloc::string::String
-}
+#[cfg(feature = "alloc")]
+mod instruction_text_sink {
+    use core::fmt;
 
-impl<'buf> InstructionTextSink<'buf> {
-    // TODO: safety
-    pub unsafe fn new(buf: &'buf mut alloc::string::String) -> Self {
-        Self { buf }
-    }
-}
+    use super::{DisplaySink, u8_to_hex};
+    use crate::safer_unchecked::unreachable_kinda_unchecked;
 
-impl<'buf> fmt::Write for InstructionTextSink<'buf> {
-    fn write_str(&mut self, s: &str) -> Result<(), core::fmt::Error> {
-        self.buf.write_str(s)
-    }
-    fn write_char(&mut self, c: char) -> Result<(), core::fmt::Error> {
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + 1 {
-                panic!("InstructionTextSink::write_char would overflow output");
-            }
-        }
-        // SAFETY: `buf` is assumed to be long enough to hold all input, `buf` at `underlying.len()`
-        // is valid for writing, but may be uninitialized.
-        //
-        // this function is essentially equivalent to `Vec::push` specialized for the case that
-        // `len < buf.capacity()`:
-        // https://github.com/rust-lang/rust/blob/be9e27e/library/alloc/src/vec/mod.rs#L1993-L2006
-        unsafe {
-            let underlying = self.buf.as_mut_vec();
-            // `InstructionTextSink::write_char` is only used by yaxpeax-x86, and is only used to
-            // write single ASCII characters. this is wrong in the general case, but `write_char`
-            // here is not going to be used in the general case.
-            if cfg!(debug_asertions) {
-                panic!("InstructionTextSink::write_char would truncate output");
-            }
-            let to_push = c as u8;
-            // `ptr::write` here because `underlying.add(underlying.len())` may not point to an
-            // initialized value, which would mean that turning that pointer into a `&mut u8` to
-            // store through would be UB. `ptr::write` avoids taking the mut ref.
-            underlying.as_mut_ptr().offset(underlying.len() as isize).write(to_push);
-            // we have initialized all (one) bytes that `set_len` is increasing the length to
-            // include.
-            underlying.set_len(underlying.len() + 1);
-        }
-        Ok(())
+    /// this is an implementation detail of yaxpeax-arch and related crates. if you are a user of the
+    /// disassemblers, do not use this struct. do not depend on this struct existing. this struct is
+    /// not stable. this struct is not safe for general use. if you use this struct you and your
+    /// program will be eaten by gremlins.
+    ///
+    /// if you are implementing an instruction formatter for the yaxpeax family of crates: this struct
+    /// is guaranteed to contain a string that is long enough to hold a fully-formatted instruction.
+    /// because the buffer is guaranteed to be long enough, writes through `InstructionTextSink` are
+    /// not bounds-checked, and the buffer is never grown.
+    ///
+    /// this is wildly dangerous in general use. the public constructor of `InstructionTextSink` is
+    /// unsafe as a result. as used in `InstructionFormatter`, the buffer is guaranteed to be
+    /// `clear()`ed before use, `InstructionFormatter` ensures the buffer is large enough, *and*
+    /// `InstructionFormatter` never allows `InstructionTextSink` to exist in a context where it would
+    /// be written to without being rewound first.
+    ///
+    /// because this opens a very large hole through which `fmt::Write` can become unsafe, incorrect
+    /// uses of this struct will be hard to debug in general. `InstructionFormatter` is probably at the
+    /// limit of easily-reasoned-about lifecycle of the buffer, which "only" leaves the problem of
+    /// ensuring that instruction formatting impls this buffer is passed to are appropriately sized.
+    ///
+    /// this is intended to be hidden in docs. if you see this in docs, it's a bug.
+    #[doc(hidden)]
+    pub struct InstructionTextSink<'buf> {
+        buf: &'buf mut alloc::string::String
     }
-}
 
-/// this [`DisplaySink`] impl exists to support somewhat more performant buffering of the kinds of
-/// strings `yaxpeax-x86` uses in formatting instructions.
-///
-/// span information is discarded at zero cost.
-impl DisplaySink for alloc::string::String {
-    #[inline(always)]
-    fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
-        self.reserve(s.len());
-        let buf = unsafe { self.as_mut_vec() };
-        let new_bytes = s.as_bytes();
-
-        if new_bytes.len() == 0 {
-            unsafe { unreachable_kinda_unchecked() }
+    impl<'buf> InstructionTextSink<'buf> {
+        // TODO: safety
+        pub unsafe fn new(buf: &'buf mut alloc::string::String) -> Self {
+            Self { buf }
         }
+    }
 
-        if new_bytes.len() >= 16 {
-            unsafe { unreachable_kinda_unchecked() }
+    impl<'buf> fmt::Write for InstructionTextSink<'buf> {
+        fn write_str(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+            self.buf.write_str(s)
         }
-
-        unsafe {
-            let dest = buf.as_mut_ptr().offset(buf.len() as isize);
-
-            // this used to be enough to bamboozle llvm away from
-            // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
-            // if `s` is not fixed size. somewhere between Rust 1.68 and Rust 1.74 this stopped
-            // being sufficient, so `write_fixed_size` truly should only be used for fixed size `s`
-            // (otherwise this is a libc memcpy call in disguise). for fixed-size strings this
-            // unrolls into some kind of appropriate series of `mov`.
-            dest.offset(0 as isize).write(new_bytes[0]);
-            for i in 1..new_bytes.len() {
-                dest.offset(i as isize).write(new_bytes[i]);
+        fn write_char(&mut self, c: char) -> Result<(), core::fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + 1 {
+                    panic!("InstructionTextSink::write_char would overflow output");
+                }
             }
-
-            buf.set_len(buf.len() + new_bytes.len());
+            // SAFETY: `buf` is assumed to be long enough to hold all input, `buf` at `underlying.len()`
+            // is valid for writing, but may be uninitialized.
+            //
+            // this function is essentially equivalent to `Vec::push` specialized for the case that
+            // `len < buf.capacity()`:
+            // https://github.com/rust-lang/rust/blob/be9e27e/library/alloc/src/vec/mod.rs#L1993-L2006
+            unsafe {
+                let underlying = self.buf.as_mut_vec();
+                // `InstructionTextSink::write_char` is only used by yaxpeax-x86, and is only used to
+                // write single ASCII characters. this is wrong in the general case, but `write_char`
+                // here is not going to be used in the general case.
+                if cfg!(debug_assertions) && !c.is_ascii() {
+                    panic!("InstructionTextSink::write_char would truncate output");
+                }
+                let to_push = c as u8;
+                // `ptr::write` here because `underlying.add(underlying.len())` may not point to an
+                // initialized value, which would mean that turning that pointer into a `&mut u8` to
+                // store through would be UB. `ptr::write` avoids taking the mut ref.
+                underlying.as_mut_ptr().offset(underlying.len() as isize).write(to_push);
+                // we have initialized all (one) bytes that `set_len` is increasing the length to
+                // include.
+                underlying.set_len(underlying.len() + 1);
+            }
+            Ok(())
         }
-
-        Ok(())
     }
-    unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
-        self.reserve(s.len());
 
-        // SAFETY: todo
-        let buf = unsafe { self.as_mut_vec() };
-        let new_bytes = s.as_bytes();
-
-        // should get DCE
-        if new_bytes.len() >= 32 {
-            unsafe { core::hint::unreachable_unchecked() }
-        }
-
-        unsafe {
-            let dest = buf.as_mut_ptr().offset(buf.len() as isize);
-            let src = new_bytes.as_ptr();
+    impl<'buf> DisplaySink for InstructionTextSink<'buf> {
+        #[inline(always)]
+        fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_fixed_size would overflow output");
+                }
+            }
 
-            let rem = new_bytes.len() as isize;
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_bytes = s.as_bytes();
 
-            // set_len early because there is no way to avoid the following asm!() writing that
-            // same number of bytes into buf
-            buf.set_len(buf.len() + new_bytes.len());
-
-            core::arch::asm!(
-                "6:",
-                "cmp {rem:e}, 16",
-                "jb 7f",
-                "mov {buf:r}, qword ptr [{src} + {rem} - 16]",
-                "mov qword ptr [{dest} + {rem} - 16], {buf:r}",
-                "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
-                "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
-                "sub {rem:e}, 16",
-                "jz 11f",
-                "7:",
-                "cmp {rem:e}, 8",
-                "jb 8f",
-                "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
-                "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
-                "sub {rem:e}, 8",
-                "jz 11f",
-                "8:",
-                "cmp {rem:e}, 4",
-                "jb 9f",
-                "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
-                "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
-                "sub {rem:e}, 4",
-                "jz 11f",
-                "9:",
-                "cmp {rem:e}, 2",
-                "jb 10f",
-                "mov {buf:x}, word ptr [{src} + {rem} - 2]",
-                "mov word ptr [{dest} + {rem} - 2], {buf:x}",
-                "sub {rem:e}, 2",
-                "jz 11f",
-                "10:",
-                "cmp {rem:e}, 1",
-                "jb 11f",
-                "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
-                "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
-                "11:",
-                src = in(reg) src,
-                dest = in(reg) dest,
-                rem = inout(reg) rem => _,
-                buf = out(reg) _,
-                options(nostack),
-            );
-        }
+            if new_bytes.len() == 0 {
+                return Ok(());
+            }
 
-        Ok(())
-    }
-    unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
-        self.reserve(s.len());
+            if new_bytes.len() >= 16 {
+                unsafe { unreachable_kinda_unchecked() }
+            }
 
-        // SAFETY: todo
-        let buf = unsafe { self.as_mut_vec() };
-        let new_bytes = s.as_bytes();
+            unsafe {
+                let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+
+                // this used to be enough to bamboozle llvm away from
+                // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
+                // if `s` is not fixed size. somewhere between Rust 1.68 and Rust 1.74 this stopped
+                // being sufficient, so `write_fixed_size` truly should only be used for fixed size `s`
+                // (otherwise this is a libc memcpy call in disguise). for fixed-size strings this
+                // unrolls into some kind of appropriate series of `mov`.
+                dest.offset(0 as isize).write(new_bytes[0]);
+                for i in 1..new_bytes.len() {
+                    dest.offset(i as isize).write(new_bytes[i]);
+                }
+
+                buf.set_len(buf.len() + new_bytes.len());
+            }
 
-        // should get DCE
-        if new_bytes.len() >= 16 {
-            unsafe { core::hint::unreachable_unchecked() }
+            Ok(())
         }
+        unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_32 would overflow output");
+                }
+            }
 
-        unsafe {
-            let dest = buf.as_mut_ptr().offset(buf.len() as isize);
-            let src = new_bytes.as_ptr();
-
-            let rem = new_bytes.len() as isize;
-
-            // set_len early because there is no way to avoid the following asm!() writing that
-            // same number of bytes into buf
-            buf.set_len(buf.len() + new_bytes.len());
-
-            core::arch::asm!(
-                "7:",
-                "cmp {rem:e}, 8",
-                "jb 8f",
-                "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
-                "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
-                "sub {rem:e}, 8",
-                "jz 11f",
-                "8:",
-                "cmp {rem:e}, 4",
-                "jb 9f",
-                "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
-                "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
-                "sub {rem:e}, 4",
-                "jz 11f",
-                "9:",
-                "cmp {rem:e}, 2",
-                "jb 10f",
-                "mov {buf:x}, word ptr [{src} + {rem} - 2]",
-                "mov word ptr [{dest} + {rem} - 2], {buf:x}",
-                "sub {rem:e}, 2",
-                "jz 11f",
-                "10:",
-                "cmp {rem:e}, 1",
-                "jb 11f",
-                "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
-                "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
-                "11:",
-                src = in(reg) src,
-                dest = in(reg) dest,
-                rem = inout(reg) rem => _,
-                buf = out(reg) _,
-                options(nostack),
-            );
-        }
+            // SAFETY: todo
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_bytes = s.as_bytes();
 
-        Ok(())
-    }
-    unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
-        self.reserve(s.len());
+            // should get DCE
+            if new_bytes.len() >= 32 {
+                unsafe { core::hint::unreachable_unchecked() }
+            }
 
-        // SAFETY: todo
-        let buf = unsafe { self.as_mut_vec() };
-        let new_bytes = s.as_bytes();
+            unsafe {
+                let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+                let src = new_bytes.as_ptr();
+
+                let rem = new_bytes.len() as isize;
+
+                // set_len early because there is no way to avoid the following asm!() writing that
+                // same number of bytes into buf
+                buf.set_len(buf.len() + new_bytes.len());
+
+                core::arch::asm!(
+                    "6:",
+                    "cmp {rem:e}, 16",
+                    "jb 7f",
+                    "mov {buf:r}, qword ptr [{src} + {rem} - 16]",
+                    "mov qword ptr [{dest} + {rem} - 16], {buf:r}",
+                    "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
+                    "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+                    "sub {rem:e}, 16",
+                    "jz 11f",
+                    "7:",
+                    "cmp {rem:e}, 8",
+                    "jb 8f",
+                    "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
+                    "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+                    "sub {rem:e}, 8",
+                    "jz 11f",
+                    "8:",
+                    "cmp {rem:e}, 4",
+                    "jb 9f",
+                    "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
+                    "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+                    "sub {rem:e}, 4",
+                    "jz 11f",
+                    "9:",
+                    "cmp {rem:e}, 2",
+                    "jb 10f",
+                    "mov {buf:x}, word ptr [{src} + {rem} - 2]",
+                    "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+                    "sub {rem:e}, 2",
+                    "jz 11f",
+                    "10:",
+                    "cmp {rem:e}, 1",
+                    "jb 11f",
+                    "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
+                    "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+                    "11:",
+                    src = in(reg) src,
+                    dest = in(reg) dest,
+                    rem = inout(reg) rem => _,
+                    buf = out(reg) _,
+                    options(nostack),
+                );
+            }
 
-        // should get DCE
-        if new_bytes.len() >= 8 {
-            unsafe { core::hint::unreachable_unchecked() }
+            Ok(())
         }
+        unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_16 would overflow output");
+                }
+            }
 
-        unsafe {
-            let dest = buf.as_mut_ptr().offset(buf.len() as isize);
-            let src = new_bytes.as_ptr();
+            // SAFETY: todo
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_bytes = s.as_bytes();
 
-            let rem = new_bytes.len() as isize;
+            // should get DCE
+            if new_bytes.len() >= 16 {
+                unsafe { core::hint::unreachable_unchecked() }
+            }
 
-            // set_len early because there is no way to avoid the following asm!() writing that
-            // same number of bytes into buf
-            buf.set_len(buf.len() + new_bytes.len());
+            unsafe {
+                let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+                let src = new_bytes.as_ptr();
+
+                let rem = new_bytes.len() as isize;
+
+                // set_len early because there is no way to avoid the following asm!() writing that
+                // same number of bytes into buf
+                buf.set_len(buf.len() + new_bytes.len());
+
+                core::arch::asm!(
+                    "7:",
+                    "cmp {rem:e}, 8",
+                    "jb 8f",
+                    "mov {buf:r}, qword ptr [{src} + {rem} - 8]",
+                    "mov qword ptr [{dest} + {rem} - 8], {buf:r}",
+                    "sub {rem:e}, 8",
+                    "jz 11f",
+                    "8:",
+                    "cmp {rem:e}, 4",
+                    "jb 9f",
+                    "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
+                    "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+                    "sub {rem:e}, 4",
+                    "jz 11f",
+                    "9:",
+                    "cmp {rem:e}, 2",
+                    "jb 10f",
+                    "mov {buf:x}, word ptr [{src} + {rem} - 2]",
+                    "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+                    "sub {rem:e}, 2",
+                    "jz 11f",
+                    "10:",
+                    "cmp {rem:e}, 1",
+                    "jb 11f",
+                    "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
+                    "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+                    "11:",
+                    src = in(reg) src,
+                    dest = in(reg) dest,
+                    rem = inout(reg) rem => _,
+                    buf = out(reg) _,
+                    options(nostack),
+                );
+            }
 
-            core::arch::asm!(
-                "8:",
-                "cmp {rem:e}, 4",
-                "jb 9f",
-                "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
-                "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
-                "sub {rem:e}, 4",
-                "jz 11f",
-                "9:",
-                "cmp {rem:e}, 2",
-                "jb 10f",
-                "mov {buf:x}, word ptr [{src} + {rem} - 2]",
-                "mov word ptr [{dest} + {rem} - 2], {buf:x}",
-                "sub {rem:e}, 2",
-                "jz 11f",
-                "10:",
-                "cmp {rem:e}, 1",
-                "jb 11f",
-                "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
-                "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
-                "11:",
-                src = in(reg) src,
-                dest = in(reg) dest,
-                rem = inout(reg) rem => _,
-                buf = out(reg) _,
-                options(nostack),
-            );
+            Ok(())
         }
+        unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + s.len() {
+                    panic!("InstructionTextSink::write_lt_8 would overflow output");
+                }
+            }
 
-        Ok(())
-    }
-    /// write a u8 to the output as a base-16 integer.
-    ///
-    /// this is provided for optimization opportunities when the formatted integer can be written
-    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
-    /// followup step)
-    #[inline(always)]
-    fn write_u8(&mut self, mut v: u8) -> Result<(), core::fmt::Error> {
-        if v == 0 {
-            return self.write_fixed_size("0");
-        }
-        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
-        // means we can write directly into the correct offsets of the output string.
-        let printed_size = ((8 - v.leading_zeros() + 3) >> 2) as usize;
+            // SAFETY: todo
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_bytes = s.as_bytes();
 
-        self.reserve(printed_size);
+            // should get DCE
+            if new_bytes.len() >= 8 {
+                unsafe { core::hint::unreachable_unchecked() }
+            }
 
-        let buf = unsafe { self.as_mut_vec() };
-        let new_len = buf.len() + printed_size;
+            unsafe {
+                let dest = buf.as_mut_ptr().offset(buf.len() as isize);
+                let src = new_bytes.as_ptr();
+
+                let rem = new_bytes.len() as isize;
+
+                // set_len early because there is no way to avoid the following asm!() writing that
+                // same number of bytes into buf
+                buf.set_len(buf.len() + new_bytes.len());
+
+                core::arch::asm!(
+                    "8:",
+                    "cmp {rem:e}, 4",
+                    "jb 9f",
+                    "mov {buf:e}, dword ptr [{src} + {rem} - 4]",
+                    "mov dword ptr [{dest} + {rem} - 4], {buf:e}",
+                    "sub {rem:e}, 4",
+                    "jz 11f",
+                    "9:",
+                    "cmp {rem:e}, 2",
+                    "jb 10f",
+                    "mov {buf:x}, word ptr [{src} + {rem} - 2]",
+                    "mov word ptr [{dest} + {rem} - 2], {buf:x}",
+                    "sub {rem:e}, 2",
+                    "jz 11f",
+                    "10:",
+                    "cmp {rem:e}, 1",
+                    "jb 11f",
+                    "mov {buf:l}, byte ptr [{src} + {rem} - 1]",
+                    "mov byte ptr [{dest} + {rem} - 1], {buf:l}",
+                    "11:",
+                    src = in(reg) src,
+                    dest = in(reg) dest,
+                    rem = inout(reg) rem => _,
+                    buf = out(reg) _,
+                    options(nostack),
+                );
+            }
 
-        unsafe {
-            buf.set_len(new_len);
+            Ok(())
         }
-        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+        /// write a u8 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u8(&mut self, mut v: u8) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((8 - v.leading_zeros() + 3) >> 2) as usize;
+
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u8 would overflow output");
+                }
+            }
+
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
 
-        loop {
-            let digit = v % 16;
-            let c = u8_to_hex(digit as u8);
             unsafe {
-                p = p.offset(-1);
-                p.write(c);
+                buf.set_len(new_len);
             }
-            v = v / 16;
-            if v == 0 {
-                break;
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
             }
-        }
 
-        Ok(())
-    }
-    /// write a u16 to the output as a base-16 integer.
-    ///
-    /// this is provided for optimization opportunities when the formatted integer can be written
-    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
-    /// followup step)
-    #[inline(always)]
-    fn write_u16(&mut self, mut v: u16) -> Result<(), core::fmt::Error> {
-        if v == 0 {
-            return self.write_fixed_size("0");
+            Ok(())
         }
-        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
-        // means we can write directly into the correct offsets of the output string.
-        let printed_size = ((16 - v.leading_zeros() + 3) >> 2) as usize;
+        /// write a u16 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u16(&mut self, mut v: u16) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
 
-        self.reserve(printed_size);
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((16 - v.leading_zeros() + 3) >> 2) as usize;
 
-        let buf = unsafe { self.as_mut_vec() };
-        let new_len = buf.len() + printed_size;
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u16 would overflow output");
+                }
+            }
 
-        unsafe {
-            buf.set_len(new_len);
-        }
-        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
 
-        loop {
-            let digit = v % 16;
-            let c = u8_to_hex(digit as u8);
             unsafe {
-                p = p.offset(-1);
-                p.write(c);
+                buf.set_len(new_len);
             }
-            v = v / 16;
-            if v == 0 {
-                break;
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
             }
-        }
 
-        Ok(())
-    }
-    /// write a u32 to the output as a base-16 integer.
-    ///
-    /// this is provided for optimization opportunities when the formatted integer can be written
-    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
-    /// followup step)
-    #[inline(always)]
-    fn write_u32(&mut self, mut v: u32) -> Result<(), core::fmt::Error> {
-        if v == 0 {
-            return self.write_fixed_size("0");
+            Ok(())
         }
-        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
-        // means we can write directly into the correct offsets of the output string.
-        let printed_size = ((32 - v.leading_zeros() + 3) >> 2) as usize;
+        /// write a u32 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u32(&mut self, mut v: u32) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
 
-        self.reserve(printed_size);
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((32 - v.leading_zeros() + 3) >> 2) as usize;
 
-        let buf = unsafe { self.as_mut_vec() };
-        let new_len = buf.len() + printed_size;
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u32 would overflow output");
+                }
+            }
 
-        unsafe {
-            buf.set_len(new_len);
-        }
-        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
 
-        loop {
-            let digit = v % 16;
-            let c = u8_to_hex(digit as u8);
             unsafe {
-                p = p.offset(-1);
-                p.write(c);
+                buf.set_len(new_len);
             }
-            v = v / 16;
-            if v == 0 {
-                break;
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
             }
-        }
 
-        Ok(())
-    }
-    /// write a u64 to the output as a base-16 integer.
-    ///
-    /// this is provided for optimization opportunities when the formatted integer can be written
-    /// directly to the sink (rather than formatted to an intermediate buffer and output as a
-    /// followup step)
-    #[inline(always)]
-    fn write_u64(&mut self, mut v: u64) -> Result<(), core::fmt::Error> {
-        if v == 0 {
-            return self.write_fixed_size("0");
+            Ok(())
         }
-        // we can fairly easily predict the size of a formatted string here with lzcnt, which also
-        // means we can write directly into the correct offsets of the output string.
-        let printed_size = ((64 - v.leading_zeros() + 3) >> 2) as usize;
+        /// write a u64 to the output as a base-16 integer.
+        ///
+        /// this is provided for optimization opportunities when the formatted integer can be written
+        /// directly to the sink (rather than formatted to an intermediate buffer and output as a
+        /// followup step)
+        #[inline(always)]
+        fn write_u64(&mut self, mut v: u64) -> Result<(), core::fmt::Error> {
+            if v == 0 {
+                return self.write_fixed_size("0");
+            }
 
-        self.reserve(printed_size);
+            // we can fairly easily predict the size of a formatted string here with lzcnt, which also
+            // means we can write directly into the correct offsets of the output string.
+            let printed_size = ((64 - v.leading_zeros() + 3) >> 2) as usize;
 
-        let buf = unsafe { self.as_mut_vec() };
-        let new_len = buf.len() + printed_size;
+            if cfg!(debug_assertions) {
+                if self.buf.capacity() < self.buf.len() + printed_size {
+                    panic!("InstructionTextSink::write_u64 would overflow output");
+                }
+            }
 
-        unsafe {
-            buf.set_len(new_len);
-        }
-        let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+            let buf = unsafe { self.buf.as_mut_vec() };
+            let new_len = buf.len() + printed_size;
 
-        loop {
-            let digit = v % 16;
-            let c = u8_to_hex(digit as u8);
             unsafe {
-                p = p.offset(-1);
-                p.write(c);
+                buf.set_len(new_len);
             }
-            v = v / 16;
-            if v == 0 {
-                break;
+            let mut p = unsafe { buf.as_mut_ptr().offset(new_len as isize) };
+
+            loop {
+                let digit = v % 16;
+                let c = u8_to_hex(digit as u8);
+                unsafe {
+                    p = p.offset(-1);
+                    p.write(c);
+                }
+                v = v / 16;
+                if v == 0 {
+                    break;
+                }
             }
-        }
 
-        Ok(())
+            Ok(())
+        }
     }
 }
+#[cfg(feature = "alloc")]
+pub use instruction_text_sink::InstructionTextSink;
+
+
+#[cfg(feature = "alloc")]
+use crate::display::u8_to_hex;
+
+#[cfg(feature = "alloc")]
+use crate::safer_unchecked::unreachable_kinda_unchecked;
 
-impl<'buf> DisplaySink for InstructionTextSink<'buf> {
+/// this [`DisplaySink`] impl exists to support somewhat more performant buffering of the kinds of
+/// strings `yaxpeax-x86` uses in formatting instructions.
+///
+/// span information is discarded at zero cost.
+#[cfg(feature = "alloc")]
+impl DisplaySink for alloc::string::String {
     #[inline(always)]
     fn write_fixed_size(&mut self, s: &str) -> Result<(), core::fmt::Error> {
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + s.len() {
-                panic!("InstructionTextSink::write_fixed_size would overflow output");
-            }
-        }
-
-        let buf = unsafe { self.buf.as_mut_vec() };
+        self.reserve(s.len());
+        let buf = unsafe { self.as_mut_vec() };
         let new_bytes = s.as_bytes();
 
         if new_bytes.len() == 0 {
-            return Ok(());
+            unsafe { unreachable_kinda_unchecked() }
         }
 
         if new_bytes.len() >= 16 {
@@ -845,7 +890,7 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
             let dest = buf.as_mut_ptr().offset(buf.len() as isize);
 
             // this used to be enough to bamboozle llvm away from
-            // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
+            // https://github.com/rust-lang/rust/issues/92993#issuecomment-2028915232
             // if `s` is not fixed size. somewhere between Rust 1.68 and Rust 1.74 this stopped
             // being sufficient, so `write_fixed_size` truly should only be used for fixed size `s`
             // (otherwise this is a libc memcpy call in disguise). for fixed-size strings this
@@ -861,14 +906,10 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         Ok(())
     }
     unsafe fn write_lt_32(&mut self, s: &str) -> Result<(), fmt::Error> {
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + s.len() {
-                panic!("InstructionTextSink::write_lt_32 would overflow output");
-            }
-        }
+        self.reserve(s.len());
 
         // SAFETY: todo
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_bytes = s.as_bytes();
 
         // should get DCE
@@ -934,14 +975,10 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         Ok(())
     }
     unsafe fn write_lt_16(&mut self, s: &str) -> Result<(), fmt::Error> {
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + s.len() {
-                panic!("InstructionTextSink::write_lt_16 would overflow output");
-            }
-        }
+        self.reserve(s.len());
 
         // SAFETY: todo
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_bytes = s.as_bytes();
 
         // should get DCE
@@ -998,14 +1035,10 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         Ok(())
     }
     unsafe fn write_lt_8(&mut self, s: &str) -> Result<(), fmt::Error> {
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + s.len() {
-                panic!("InstructionTextSink::write_lt_8 would overflow output");
-            }
-        }
+        self.reserve(s.len());
 
         // SAFETY: todo
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_bytes = s.as_bytes();
 
         // should get DCE
@@ -1068,13 +1101,9 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         // means we can write directly into the correct offsets of the output string.
         let printed_size = ((8 - v.leading_zeros() + 3) >> 2) as usize;
 
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + printed_size {
-                panic!("InstructionTextSink::write_u8 would overflow output");
-            }
-        }
+        self.reserve(printed_size);
 
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_len = buf.len() + printed_size;
 
         unsafe {
@@ -1107,18 +1136,13 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         if v == 0 {
             return self.write_fixed_size("0");
         }
-
         // we can fairly easily predict the size of a formatted string here with lzcnt, which also
         // means we can write directly into the correct offsets of the output string.
         let printed_size = ((16 - v.leading_zeros() + 3) >> 2) as usize;
 
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + printed_size {
-                panic!("InstructionTextSink::write_u16 would overflow output");
-            }
-        }
+        self.reserve(printed_size);
 
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_len = buf.len() + printed_size;
 
         unsafe {
@@ -1151,18 +1175,13 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         if v == 0 {
             return self.write_fixed_size("0");
         }
-
         // we can fairly easily predict the size of a formatted string here with lzcnt, which also
         // means we can write directly into the correct offsets of the output string.
         let printed_size = ((32 - v.leading_zeros() + 3) >> 2) as usize;
 
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + printed_size {
-                panic!("InstructionTextSink::write_u32 would overflow output");
-            }
-        }
+        self.reserve(printed_size);
 
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_len = buf.len() + printed_size;
 
         unsafe {
@@ -1195,18 +1214,13 @@ impl<'buf> DisplaySink for InstructionTextSink<'buf> {
         if v == 0 {
             return self.write_fixed_size("0");
         }
-
         // we can fairly easily predict the size of a formatted string here with lzcnt, which also
         // means we can write directly into the correct offsets of the output string.
         let printed_size = ((64 - v.leading_zeros() + 3) >> 2) as usize;
 
-        if cfg!(debug_assertions) {
-            if self.buf.capacity() < self.buf.len() + printed_size {
-                panic!("InstructionTextSink::write_u64 would overflow output");
-            }
-        }
+        self.reserve(printed_size);
 
-        let buf = unsafe { self.buf.as_mut_vec() };
+        let buf = unsafe { self.as_mut_vec() };
         let new_len = buf.len() + printed_size;
 
         unsafe {
diff --git a/src/lib.rs b/src/lib.rs
index a194942..db438c2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,7 +27,6 @@ pub use color::ColorSettings;
 #[cfg(feature = "alloc")]
 extern crate alloc;
 
-#[cfg(feature = "alloc")]
 pub mod display;
 
 pub mod testkit;
diff --git a/tests/display.rs b/tests/display.rs
index 9a8ef2e..887db53 100644
--- a/tests/display.rs
+++ b/tests/display.rs
@@ -22,6 +22,23 @@ fn sinks_are_equivalent() {
 }
 */
 
+#[test]
+#[allow(deprecated)]
+fn formatters_are_not_feature_gated() {
+    use yaxpeax_arch::display::{
+        u8_hex, u16_hex, u32_hex, u64_hex,
+        signed_i8_hex, signed_i16_hex, signed_i32_hex, signed_i64_hex
+    };
+    let _ = u8_hex(10);
+    let _ = u16_hex(10);
+    let _ = u32_hex(10);
+    let _ = u64_hex(10);
+    let _ = signed_i8_hex(10);
+    let _ = signed_i16_hex(10);
+    let _ = signed_i32_hex(10);
+    let _ = signed_i64_hex(10);
+}
+
 #[cfg(feature="alloc")]
 #[test]
 fn display_sink_write_hex_helpers() {
-- 
cgit v1.1