rustzmq2 0.1.0

A native async Rust implementation of ZeroMQ
//! Hand-rolled vectored-write sink for the engine writer task.
//!
//! Replaces `tokio_util::codec::FramedWrite`'s single-buffer `poll_write` with
//! `OwnedWriteHalf::try_write_vectored` + `IoSlice`. The encoder's
//! `extend_from_slice` payload copy (`src/codec/zmq_codec.rs:191`) is gone:
//! frame headers are built inline into a stack `[u8; 9]`, frame payloads are
//! referenced directly out of the `ZmqMessage`'s `Bytes` (refcount clone, no
//! byte copy), and both go out as separate `IoSlice` entries in one
//! `writev(2)` syscall.
//!
//! Partial writes are handled by keeping a `VecDeque<PendingMsg>` with a
//! per-front-frame `header_pos` cursor and `Bytes::advance` on the payload.
//! A partial write trims the front slices without re-allocating; the next
//! `try_write_vectored` builds a fresh `IoSlice` array from whatever's left.
//!
//! SNDMORE atomicity is preserved by construction: `enqueue` pushes the whole
//! `ZmqMessage` into the pending queue under the writer task's `&mut`, and
//! `flush_all` drains front-to-back — no logical message can get interleaved
//! with another. The "whole messages flushed" counter only bumps when a
//! `PendingMsg`'s last byte is on the wire, matching the flush-waiter
//! semantics on `FlushState.flushed` (src/engine.rs:453).
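//!
//! A minimal sketch of the frame layout this module relies on, using std
//! types only and assuming the ZMTP 3.x short-frame header `[flags, len]`
//! (the crate's real encoding lives in `encode_frame_header`):
//!
//! ```ignore
//! use std::io::IoSlice;
//!
//! let payload: &[u8] = b"hello";
//! // flags = 0x00 (final frame, not a command); 1-byte size for len <= 255.
//! let header = [0u8, payload.len() as u8];
//! let iovs = [IoSlice::new(&header), IoSlice::new(payload)];
//! // Both slices go out in one writev(2); the payload is never copied into
//! // a staging buffer.
//! ```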

use crate::codec::zmq_codec::encode_frame_header;
use crate::codec::Message;
use crate::error::CodecError;
use crate::io_compat::AsyncVectoredWrite;

use bytes::{Buf, Bytes};
use parking_lot::Mutex;
use smallvec::SmallVec;

use std::collections::VecDeque;
use std::io::{self, IoSlice};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

/// Maximum `IoSlice` entries per `try_write_vectored` call. 64 slices = up to
/// 32 frames per syscall at 2 slices/frame (header + payload). Well under
/// `IOV_MAX = 1024` on Linux/macOS; enough to keep batch amortization cheap.
const IOV_CAP: usize = 64;

/// Pending-queue pre-allocation. Bounded by the writer's
/// `out_batch_size` drain; a small multiplier absorbs the rare case of
/// a partial write leaving a message at the front while the next batch lands
/// behind it. 16 fits comfortably, and occasional regrowth is cheap since
/// we drain in bulk anyway.
const PENDING_CAPACITY: usize = 16;

/// Upper bound on payload size for the writer fast path. Below this,
/// single-frame messages arriving at an empty writer bypass the
/// peer-loop side's `PendingMsg` / `VecDeque` and go straight out as
/// one `try_write_vectored` with 2 `IoSlice`s (header + payload).
///
/// 64 KiB covers the typical-message regime (256 B / 4 KiB benches).
/// On partial write at any size, the remainder is materialized into
/// `SharedHalf::pending_overflow` and the peer loop drains it before
/// its own queue — preserves wire-order without corruption.
///
/// Wake-count instrumentation (`RUSTZMQ2_WAKE_COUNT=1`) shows that when
/// inline fires, `peer_loop_iters` drops from 2 to 1 per RTT and
/// per-iter latency improves by ~5 µs on TCP. Sub-RTT bench is 256 B,
/// well inside the 64 KiB cap.
const FAST_PATH_MAX_PAYLOAD: usize = 65536;

/// Owned write half of the underlying transport, generic over the concrete
/// half type. `W` must implement [`AsyncVectoredWrite`], which provides the
/// readiness-based `try_write_vectored` + `writable` pair the engine needs
/// to avoid copying frame payloads into a single contiguous buffer.
#[cfg(any(feature = "tcp", all(feature = "ipc", target_family = "unix")))]
pub(crate) struct EngineWriteHalf<W>(W);

#[cfg(any(feature = "tcp", all(feature = "ipc", target_family = "unix")))]
impl<W: AsyncVectoredWrite> EngineWriteHalf<W> {
    pub(crate) fn new(inner: W) -> Self {
        Self(inner)
    }

    #[inline]
    fn try_write_vectored(&self, bufs: &[IoSlice<'_>]) -> io::Result<usize> {
        self.0.try_write_vectored(bufs)
    }

    async fn writable(&self) -> io::Result<()> {
        self.0.writable().await
    }
}

/// Inline-allocated ZMTP frame header. Two bytes for frames ≤ 255, nine
/// bytes otherwise.
#[derive(Debug)]
struct HeaderBuf {
    bytes: [u8; 9],
    len: u8,
}

#[derive(Debug)]
struct PendingFrame {
    header: HeaderBuf,
    /// Cursor into `header.bytes`. Advances as partial writes drain header
    /// bytes; once equal to `header.len`, the header is done.
    header_pos: u8,
    /// Remaining payload. `Bytes::advance` trims the front on partial writes;
    /// a fully-drained payload is empty.
    payload: Bytes,
}

impl PendingFrame {
    #[inline]
    fn is_drained(&self) -> bool {
        self.header_pos == self.header.len && self.payload.is_empty()
    }
}

#[derive(Debug)]
pub(crate) struct PendingMsg {
    /// `SmallVec` inline size matches `ZmqMessage::INLINE_FRAMES = 2` so the
    /// dominant single-frame / 2-frame envelope shapes stay heap-free.
    frames: SmallVec<[PendingFrame; 2]>,
}

impl PendingMsg {
    fn from_zmq_message(msg: &crate::ZmqMessage) -> Self {
        let mut frames: SmallVec<[PendingFrame; 2]> = SmallVec::with_capacity(msg.len());
        let last = msg.len() - 1;
        for (idx, frame) in msg.iter().enumerate() {
            let (header_bytes, header_len) = encode_frame_header(frame.len(), idx != last);
            frames.push(PendingFrame {
                header: HeaderBuf {
                    bytes: header_bytes,
                    len: header_len,
                },
                header_pos: 0,
                payload: frame.clone(),
            });
        }
        PendingMsg { frames }
    }

    /// Build a single-frame pending message from a payload + already-encoded
    /// header, then advance the internal cursor by `bytes_already_written`.
    /// Used by the inline path to materialize a partial-write remainder
    /// into the overflow queue without re-encoding.
    fn from_single_frame_partial(
        header: HeaderBuf,
        payload: Bytes,
        bytes_already_written: usize,
    ) -> Self {
        let mut frame = PendingFrame {
            header,
            header_pos: 0,
            payload,
        };
        let mut n = bytes_already_written;
        let hdr_left = (frame.header.len - frame.header_pos) as usize;
        let take = n.min(hdr_left);
        frame.header_pos += take as u8;
        n -= take;
        if n > 0 {
            frame.payload.advance(n.min(frame.payload.len()));
        }
        let mut frames: SmallVec<[PendingFrame; 2]> = SmallVec::new();
        frames.push(frame);
        PendingMsg { frames }
    }

    /// Wrap a pre-encoded single-buffer payload (greeting / command) as a
    /// "headerless" pending message — the payload `Bytes` already contains
    /// the full wire-format bytes, so the header is zero-length.
    fn from_raw_bytes(payload: Bytes) -> Self {
        let mut frames: SmallVec<[PendingFrame; 2]> = SmallVec::new();
        frames.push(PendingFrame {
            header: HeaderBuf {
                bytes: [0; 9],
                len: 0,
            },
            header_pos: 0,
            payload,
        });
        PendingMsg { frames }
    }
}

/// Result of a fast-path attempt. See
/// [`VectoredWriter::try_fast_path_single_frame`].
#[derive(Debug)]
pub(crate) enum FastPath {
    /// Message fully sent; no pending-queue work needed.
    Sent,
    /// Partial write; remainder pushed into the pending queue. Caller must
    /// drive `flush_all` to finish draining.
    Enqueued,
    /// Precondition miss or `WouldBlock`; caller owns the message and
    /// should enqueue via the normal path.
    NotTaken(Message),
}

/// Concrete write-half enum for the default tokio transports. This is the
/// type that crosses the codec → engine boundary today; when a new runtime is
/// added it contributes its own variant here (or its own enum alias).
#[cfg(all(
    feature = "tokio",
    any(feature = "tcp", all(feature = "ipc", target_family = "unix"))
))]
pub enum ZmqEngineWriteHalf {
    #[cfg(feature = "tcp")]
    Tcp(EngineWriteHalf<tokio::net::tcp::OwnedWriteHalf>),
    #[cfg(all(feature = "ipc", feature = "tokio", target_family = "unix"))]
    Ipc(EngineWriteHalf<tokio::net::unix::OwnedWriteHalf>),
}

#[cfg(all(
    feature = "tokio",
    any(feature = "tcp", all(feature = "ipc", target_family = "unix"))
))]
impl AsyncVectoredWrite for ZmqEngineWriteHalf {
    #[inline]
    fn try_write_vectored(&self, bufs: &[IoSlice<'_>]) -> io::Result<usize> {
        match self {
            #[cfg(feature = "tcp")]
            ZmqEngineWriteHalf::Tcp(h) => h.try_write_vectored(bufs),
            #[cfg(all(feature = "ipc", feature = "tokio", target_family = "unix"))]
            ZmqEngineWriteHalf::Ipc(h) => h.try_write_vectored(bufs),
        }
    }

    async fn writable(&self) -> io::Result<()> {
        match self {
            #[cfg(feature = "tcp")]
            ZmqEngineWriteHalf::Tcp(h) => h.writable().await,
            #[cfg(all(feature = "ipc", feature = "tokio", target_family = "unix"))]
            ZmqEngineWriteHalf::Ipc(h) => h.writable().await,
        }
    }
}

/// Concrete write-half newtype for the smol TCP transport.
///
/// `smol::net::TcpStream` converts into `Arc<async_io::Async<std::net::TcpStream>>`
/// via its `From` impl, which exposes `get_ref()` and `writable()` needed by
/// `AsyncVectoredWrite`.
#[cfg(all(feature = "smol", not(feature = "tokio"), feature = "tcp"))]
pub struct SmolEngineWriteHalf(
    pub(crate) EngineWriteHalf<std::sync::Arc<async_io::Async<std::net::TcpStream>>>,
);

#[cfg(all(feature = "smol", not(feature = "tokio"), feature = "tcp"))]
impl AsyncVectoredWrite for SmolEngineWriteHalf {
    #[inline]
    fn try_write_vectored(&self, bufs: &[IoSlice<'_>]) -> io::Result<usize> {
        self.0.try_write_vectored(bufs)
    }

    async fn writable(&self) -> io::Result<()> {
        self.0.writable().await
    }
}

/// Shared state behind the transport write half.
///
/// The peer loop's `VectoredWriter` and the optional caller-thread
/// inline-write path both write through `write_lock` so frame headers
/// and payloads land contiguously on the wire.
///
/// `pending_len` is a cheap FIFO-ordering observable: non-zero means
/// the peer loop owes bytes on the wire, so an inline write would
/// jump ahead. `peer_loop_busy` covers the dequeue-but-not-yet-disposed
/// window. `pending_overflow` holds partial-write remainders from the
/// inline path; the peer loop drains them before its own queue.
///
/// All of these are zero-cost when inline is disabled — the inline
/// call sites short-circuit on `inline_write.is_none()` before
/// touching them.
pub(crate) struct SharedHalf<W> {
    half: W,
    write_lock: Mutex<()>,
    pending_len: AtomicUsize,
    peer_loop_busy: std::sync::atomic::AtomicBool,
    pending_overflow: Mutex<VecDeque<PendingMsg>>,
    pending_overflow_len: AtomicUsize,
    /// Wake source for the peer loop when the inline path leaves a
    /// partial-write remainder in `pending_overflow`.
    pub(crate) overflow_notify: crate::async_rt::notify::RuntimeNotify,
}

impl<W> SharedHalf<W> {
    fn new(half: W) -> Self {
        Self {
            half,
            write_lock: Mutex::new(()),
            pending_len: AtomicUsize::new(0),
            peer_loop_busy: std::sync::atomic::AtomicBool::new(false),
            pending_overflow: Mutex::new(VecDeque::new()),
            pending_overflow_len: AtomicUsize::new(0),
            overflow_notify: crate::async_rt::notify::RuntimeNotify::new(),
        }
    }

    /// Cheap check used by the inline path's queue gate and by the peer
    /// loop's iteration top. Acquire-ordered with the overflow push.
    #[inline]
    pub(crate) fn has_overflow(&self) -> bool {
        self.pending_overflow_len.load(Ordering::Acquire) > 0
    }

    /// Drain the overflow queue into `dst` (the peer-loop's `pending`)
    /// under the overflow mutex. Caller is then responsible for
    /// flushing `dst` to the wire in FIFO order.
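    ///
    /// A short worked example of the order-preservation argument (plain
    /// `VecDeque<&str>` stands in for `PendingMsg` here):
    ///
    /// ```ignore
    /// use std::collections::VecDeque;
    ///
    /// let mut overflow: VecDeque<&str> = VecDeque::from(["A", "B"]); // A oldest
    /// let mut dst: VecDeque<&str> = VecDeque::from(["C"]);
    /// while let Some(m) = overflow.pop_back() {
    ///     dst.push_front(m);
    /// }
    /// assert_eq!(dst, VecDeque::from(["A", "B", "C"])); // FIFO restored
    /// ```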
    pub(crate) fn drain_overflow_into(&self, dst: &mut VecDeque<PendingMsg>) {
        let mut g = self.pending_overflow.lock();
        // push_front in reverse pop order preserves the original FIFO.
        while let Some(msg) = g.pop_back() {
            dst.push_front(msg);
        }
        self.pending_overflow_len.store(0, Ordering::Release);
    }

    /// Mark the peer loop as busy processing a dequeued message.
    /// Inline-path writers will bail until this clears.
    #[inline]
    pub(crate) fn mark_peer_loop_busy(&self) {
        self.peer_loop_busy.store(true, Ordering::Release);
    }

    /// Pair to [`Self::mark_peer_loop_busy`]. Clear once the dequeued message
    /// has reached the wire (fast-path Sent) or `pending` (Enqueued).
    #[inline]
    pub(crate) fn clear_peer_loop_busy(&self) {
        self.peer_loop_busy.store(false, Ordering::Release);
    }
}

/// Maximum frame count the inline multi-frame path handles. Covers the
/// three multi-frame shapes on the hot path:
///
///   * REQ: empty delimiter + body (2 frames)
///   * REP: routing envelope + empty delimiter + body (3 frames; more
///     if the client's REQ chain has extra hops)
///   * ROUTER: routing prefix + body (2 frames)
///
/// Capped at 4 so any realistic envelope shape fits in one syscall
/// without bloating the stack iovec array past 8 entries.
const FAST_PATH_MAX_FRAMES: usize = 4;

/// Narrow type-erased handle for the caller-thread inline-write fast
/// path. Kept as a trait object because `PeerEngine` is not generic
/// over `W`. The optional `cap` parameter on each method is the
/// per-engine payload-size gate (resolved from
/// `SocketOptions::inline_write_max` at engine spawn): `None` means
/// "no cap" (any payload size accepted), `Some(n)` means decline
/// payloads `>= n`.
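///
/// Hedged sketch of a call site (the real caller holds the handle on
/// `PeerEngine.inline_write`; names below are illustrative only):
///
/// ```ignore
/// if let Some(target) = inline_write.as_deref() {
///     match target.try_inline_single_frame(&payload, cap) {
///         Some(Ok(())) => return Ok(()),        // already on the wire
///         Some(Err(e)) => return Err(e.into()), // peer gone: surface it
///         None => {}                            // declined: fall through
///     }
/// }
/// // fall back to the outbound channel / writer task
/// ```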
pub(crate) trait InlineWriteTarget: Send + Sync {
    /// Attempt to encode + write a single-frame payload inline.
    ///
    /// Returns:
    /// - `Some(Ok(()))` — message fully written.
    /// - `Some(Err(e))` — unrecoverable I/O error (peer gone).
    /// - `None` — fast path declined (cap exceeded, pending queue
    ///   non-empty, peer loop mid-dequeue, write lock held, or kernel
    ///   `WouldBlock`). Caller must fall back to the channel.
    fn try_inline_single_frame(&self, payload: &[u8], cap: Option<usize>)
        -> Option<io::Result<()>>;

    /// Multi-frame variant for REQ envelopes (empty delimiter + body)
    /// and REP / ROUTER routing envelopes. Frames are handed in wire
    /// order; the last frame gets `more=false`, all earlier frames
    /// get `more=true`.
    ///
    /// Same decline-vs-error semantics as
    /// [`Self::try_inline_single_frame`]. Caller must ensure
    /// `frames.len() ∈ 1..=FAST_PATH_MAX_FRAMES` and total payload is
    /// within `cap`; violations return `None` (silently decline).
    fn try_inline_multi_frame(
        &self,
        frames: &[&[u8]],
        cap: Option<usize>,
    ) -> Option<io::Result<()>>;
}

#[inline]
fn cap_exceeded(payload_len: usize, cap: Option<usize>) -> bool {
    matches!(cap, Some(c) if payload_len >= c)
}

impl<W: AsyncVectoredWrite + Send + Sync + 'static> InlineWriteTarget for SharedHalf<W> {
    fn try_inline_single_frame(
        &self,
        payload: &[u8],
        cap: Option<usize>,
    ) -> Option<io::Result<()>> {
        if cap_exceeded(payload.len(), cap) {
            return None;
        }
        // Cheap pre-checks (re-done under the write lock).
        if self.pending_len.load(Ordering::Acquire) > 0 {
            return None;
        }
        if self.peer_loop_busy.load(Ordering::Acquire) {
            return None;
        }
        if self.has_overflow() {
            return None;
        }
        let (hdr_bytes, hdr_len) = encode_frame_header(payload.len(), false);
        let hdr_len_usize = hdr_len as usize;
        let total = hdr_len_usize + payload.len();
        let iovs: [IoSlice<'_>; 2] = [
            IoSlice::new(&hdr_bytes[..hdr_len_usize]),
            IoSlice::new(payload),
        ];

        // try_lock so we don't block the caller thread if the peer loop
        // is mid-writev; on contention, fall back to the channel.
        let _guard = self.write_lock.try_lock()?;
        // Re-check under the lock — peer loop may have started a
        // dequeue, or another inline call may have left overflow,
        // between our first check and lock acquisition.
        if self.pending_len.load(Ordering::Acquire) > 0
            || self.peer_loop_busy.load(Ordering::Acquire)
            || self.has_overflow()
        {
            return None;
        }
        match self.half.try_write_vectored(&iovs) {
            Ok(n) if n == total => Some(Ok(())),
            Ok(0) => Some(Err(io::Error::from(CodecError::WriteZero))),
            Ok(n) => {
                // Partial write: `n` bytes are on the wire; the
                // remainder must follow contiguously. Materialize the
                // unwritten tail into the overflow queue with the
                // cursor advanced by `n`. Peer loop drains overflow
                // before its own queue. Reports success — the message
                // is "in the pipeline" exactly as if it had taken the
                // channel path.
                let header = HeaderBuf {
                    bytes: hdr_bytes,
                    len: hdr_len,
                };
                let payload_owned = Bytes::copy_from_slice(payload);
                let pending = PendingMsg::from_single_frame_partial(header, payload_owned, n);
                {
                    let mut g = self.pending_overflow.lock();
                    g.push_back(pending);
                    self.pending_overflow_len.store(g.len(), Ordering::Release);
                }
                // Wake the peer loop so it adopts overflow and finishes
                // the wire handoff. `notify_one` stores a permit if no
                // waiter is currently parked, closing the
                // notify-before-wait race; without this the receiver
                // would hang waiting for the partial bytes' continuation.
                use crate::async_rt::notify::AsyncNotify;
                self.overflow_notify.notify_one();
                Some(Ok(()))
            }
            Err(e) if e.kind() == io::ErrorKind::WouldBlock => None,
            Err(e) => Some(Err(e)),
        }
    }

    fn try_inline_multi_frame(
        &self,
        frames: &[&[u8]],
        cap: Option<usize>,
    ) -> Option<io::Result<()>> {
        let n_frames = frames.len();
        if !(1..=FAST_PATH_MAX_FRAMES).contains(&n_frames) {
            return None;
        }
        let payload_total: usize = frames.iter().map(|f| f.len()).sum();
        if cap_exceeded(payload_total, cap) {
            return None;
        }
        if self.pending_len.load(Ordering::Acquire) > 0 {
            return None;
        }
        if self.peer_loop_busy.load(Ordering::Acquire) {
            return None;
        }
        if self.has_overflow() {
            // A partial-write remainder from the inline path is queued;
            // writing now would interleave mid-frame. Decline so the peer
            // loop can drain overflow first.
            return None;
        }

        // Encode all N headers into a stack-allocated fixed-size array.
        let mut hdr_bufs: [[u8; 9]; FAST_PATH_MAX_FRAMES] = [[0u8; 9]; FAST_PATH_MAX_FRAMES];
        let mut hdr_lens: [u8; FAST_PATH_MAX_FRAMES] = [0u8; FAST_PATH_MAX_FRAMES];
        let mut total: usize = 0;
        for (i, frame) in frames.iter().enumerate() {
            let more = i + 1 < n_frames;
            let (buf, len) = encode_frame_header(frame.len(), more);
            hdr_bufs[i] = buf;
            hdr_lens[i] = len;
            total += len as usize + frame.len();
        }

        // Build iovec array: [hdr0, pay0, hdr1, pay1, ...]. Slice down
        // to the first 2*n_frames entries before the syscall.
        let mut iov_storage: [IoSlice<'_>; FAST_PATH_MAX_FRAMES * 2] = [
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
            IoSlice::new(&[]),
        ];
        for (i, frame) in frames.iter().enumerate() {
            let hdr_len_usize = hdr_lens[i] as usize;
            iov_storage[i * 2] = IoSlice::new(&hdr_bufs[i][..hdr_len_usize]);
            iov_storage[i * 2 + 1] = IoSlice::new(frame);
        }
        let iovs = &iov_storage[..n_frames * 2];

        let _guard = self.write_lock.try_lock()?;
        if self.pending_len.load(Ordering::Acquire) > 0
            || self.peer_loop_busy.load(Ordering::Acquire)
            || self.has_overflow()
        {
            return None;
        }
        match self.half.try_write_vectored(iovs) {
            Ok(n) if n == total => Some(Ok(())),
            Ok(0) => Some(Err(io::Error::from(CodecError::WriteZero))),
            Ok(_partial) => {
                // Multi-frame partials are not currently recovered —
                // the per-frame partial-resume bookkeeping isn't
                // wired up for the multi-frame inline path. In
                // practice the typical small-envelope sizes
                // (REQ/REP/ROUTER) don't see partials on TCP/UDS
                // under the kernel's send-buffer; surface as
                // WriteZero if it ever fires.
                Some(Err(io::Error::from(CodecError::WriteZero)))
            }
            Err(e) if e.kind() == io::ErrorKind::WouldBlock => None,
            Err(e) => Some(Err(e)),
        }
    }
}

/// Vectored writer sink. Owns the pending queue exclusively (via `&mut`
/// from the peer loop); the transport write half lives behind
/// `Arc<SharedHalf<W>>` so heartbeat/eviction code can hold a clone.
pub(crate) struct VectoredWriter<W> {
    shared: Arc<SharedHalf<W>>,
    pending: VecDeque<PendingMsg>,
}

impl<W: AsyncVectoredWrite + Send + Sync + 'static> VectoredWriter<W> {
    /// Clone the shared handle for the caller-thread inline-write path.
    /// `PeerEngine::spawn` calls this once at engine init when the
    /// socket has `inline_write_max = Some(_)`; the resulting
    /// `Arc<dyn InlineWriteTarget>` lives on `PeerEngine.inline_write`
    /// and is reached from `try_inline_write`.
    pub(crate) fn inline_write_target(&self) -> Arc<dyn InlineWriteTarget> {
        self.shared.clone()
    }
}

impl<W: AsyncVectoredWrite> VectoredWriter<W> {
    pub(crate) fn new(half: W) -> Self {
        VectoredWriter {
            shared: Arc::new(SharedHalf::new(half)),
            pending: VecDeque::with_capacity(PENDING_CAPACITY),
        }
    }

    /// Pull any inline-path partial-write remainders into the front of
    /// the pending queue so the very next flush continues mid-message.
    /// Cheap when overflow is empty (one Acquire load + branch).
    /// Called at the top of every peer-loop iteration when inline is
    /// enabled (no-op when disabled because inline can't push overflow).
    pub(crate) fn pull_inline_overflow(&mut self) {
        if self.shared.has_overflow() {
            self.shared.drain_overflow_into(&mut self.pending);
            self.shared
                .pending_len
                .store(self.pending.len(), Ordering::Release);
        }
    }

    /// Return a `'static` future that resolves on the next inline-path
    /// partial-write overflow event. Clones the shared `Arc` so the
    /// future doesn't borrow `&self` (same pattern as
    /// `writable_owned()`), letting peer-loop's other select arms keep
    /// `&mut writer`.
    ///
    /// Notifications fired between successive `notified()`
    /// registrations are coalesced — a missed wake is recovered by the
    /// `pull_inline_overflow()` call at the top of the next iteration.
    pub(crate) fn overflow_notified(&self) -> impl std::future::Future<Output = ()> + Send + 'static
    where
        W: Send + Sync + 'static,
    {
        use crate::async_rt::notify::AsyncNotify;
        let shared = self.shared.clone();
        async move { shared.overflow_notify.notified().await }
    }

    /// Mark that the peer loop is mid-way through processing a
    /// dequeued outbound message. Inline-write racers will bail until
    /// this clears.
    #[inline]
    pub(crate) fn mark_peer_loop_busy(&self) {
        self.shared.mark_peer_loop_busy();
    }

    /// Pair to [`Self::mark_peer_loop_busy`]. Clear once the dequeued
    /// message has reached the wire (fast-path Sent) or `pending`
    /// (Enqueued).
    #[inline]
    pub(crate) fn clear_peer_loop_busy(&self) {
        self.shared.clear_peer_loop_busy();
    }

    /// Append a logical `Message` to the pending queue. Greeting/command/heartbeat
    /// variants go through the pre-encoded single-buffer path.
    pub(crate) fn enqueue(&mut self, msg: Message) {
        match msg {
            Message::Message(m) => self.pending.push_back(PendingMsg::from_zmq_message(&m)),
            Message::Shared(arc) => self
                .pending
                .push_back(PendingMsg::from_zmq_message(arc.as_ref())),
            Message::Greeting(g) => {
                let buf: bytes::BytesMut = g.into();
                self.pending
                    .push_back(PendingMsg::from_raw_bytes(buf.freeze()));
            }
            Message::Command(c) => {
                let buf: bytes::BytesMut = c.into();
                self.pending
                    .push_back(PendingMsg::from_raw_bytes(buf.freeze()));
            }
            Message::Heartbeat(hb) => {
                let encoded: bytes::BytesMut = hb.into();
                self.pending
                    .push_back(PendingMsg::from_raw_bytes(encoded.freeze()));
            }
            Message::SecurityRaw(raw) => {
                self.pending.push_back(PendingMsg::from_raw_bytes(raw));
            }
        }
        // Keep the speculative-write gate in lock-step with the queue.
        self.shared
            .pending_len
            .store(self.pending.len(), Ordering::Release);
    }

    /// Return whether the pending queue is empty. Used by the unified
    /// `peer_loop` to decide whether to arm the writable-readiness arm.
    pub(crate) fn is_empty(&self) -> bool {
        self.pending.is_empty()
    }

    /// Writer-side fast path. Only used at the top of each batch iteration,
    /// where `pending` is guaranteed empty (prior iterations either fully
    /// drained or returned `Err`).
    ///
    /// Preconditions for firing:
    ///   * `msg` is `Message::Message` or `Message::Shared` with exactly
    ///     one frame whose payload is `< FAST_PATH_MAX_PAYLOAD`.
    ///   * `self.pending.is_empty()` (debug-asserted).
    ///
    /// Outcomes (see [`FastPath`]):
    /// - `Sent` — full write landed. Caller bumps `flushed` by 1 and notifies waiters.
    /// - `Enqueued` — partial write; remainder pushed as `PendingMsg`. Caller drives `flush_all`.
    /// - `NotTaken(m)` — precondition miss or `WouldBlock`. `m` returned so the caller can
    ///   `enqueue` through the normal path with no clone.
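    ///
    /// Hedged sketch of the writer-task call site (illustrative; the real
    /// loop lives in the engine, and `flushed` is a hypothetical counter):
    ///
    /// ```ignore
    /// match writer.try_fast_path_single_frame(msg)? {
    ///     FastPath::Sent => flushed += 1,       // whole message on the wire
    ///     FastPath::Enqueued => { /* drive flush until pending drains */ }
    ///     FastPath::NotTaken(msg) => writer.enqueue(msg), // normal path, no clone
    /// }
    /// ```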
    #[inline]
    pub(crate) fn try_fast_path_single_frame(&mut self, msg: Message) -> io::Result<FastPath> {
        debug_assert!(
            self.pending.is_empty(),
            "fast path requires empty pending queue",
        );

        // Eligibility check + payload borrow in one pass. On miss we hand
        // `msg` back unchanged.
        let payload_slice: &[u8] = match &msg {
            Message::Message(m) if m.len() == 1 => {
                let f = m.get(0).expect("len==1");
                if f.len() >= FAST_PATH_MAX_PAYLOAD {
                    return Ok(FastPath::NotTaken(msg));
                }
                f.as_ref()
            }
            Message::Shared(arc) if arc.len() == 1 => {
                let f = arc.get(0).expect("len==1");
                if f.len() >= FAST_PATH_MAX_PAYLOAD {
                    return Ok(FastPath::NotTaken(msg));
                }
                f.as_ref()
            }
            _ => return Ok(FastPath::NotTaken(msg)),
        };
        let payload_len = payload_slice.len();
        let (hdr_bytes, hdr_len) = encode_frame_header(payload_len, false);
        let hdr_len_usize = hdr_len as usize;
        let total = hdr_len_usize + payload_len;
        let iovs: [IoSlice<'_>; 2] = [
            IoSlice::new(&hdr_bytes[..hdr_len_usize]),
            IoSlice::new(payload_slice),
        ];

        let write_result = {
            let _g = self.shared.write_lock.lock();
            self.shared.half.try_write_vectored(&iovs)
        };
        match write_result {
            Ok(0) => Err(io::Error::from(CodecError::WriteZero)),
            Ok(n) if n == total => Ok(FastPath::Sent),
            Ok(n) => {
                // Rare: partial write. Materialize a `PendingMsg`, advance
                // it by `n`, queue it. Existing `advance()` handles the
                // byte accounting identically to a mid-flush partial.
                let zmsg = match msg {
                    Message::Message(m) => m,
                    Message::Shared(arc) => (*arc).clone(),
                    _ => unreachable!(),
                };
                self.pending.push_back(PendingMsg::from_zmq_message(&zmsg));
                self.shared
                    .pending_len
                    .store(self.pending.len(), Ordering::Release);
                let _ = self.advance(n);
                Ok(FastPath::Enqueued)
            }
            Err(e) if e.kind() == io::ErrorKind::WouldBlock => Ok(FastPath::NotTaken(msg)),
            Err(e) => Err(e),
        }
    }

    /// Drain messages from `outbound` into the pending queue until the
    /// accumulated payload size reaches `max_bytes` OR `max_msgs`
    /// messages are queued, whichever comes first, or the channel is
    /// empty. Either bound may be `None` to disable it; at least one
    /// should be `Some` on production paths, otherwise the drain only
    /// stops when the channel empties. Returns the count enqueued. Does
    /// NOT flush.
    ///
    /// Byte budget (`max_bytes`) matches libzmq's `ZMQ_OUT_BATCH_SIZE`
    /// semantics — libzmq accumulates bytes into one `writev` until
    /// hitting `out_batch_size` (default 8192). A pure byte budget would
    /// under-utilize syscalls for tiny messages (8 msgs × 16 B = 128 B
    /// per writev in our old count-based version, versus libzmq's 512 ×
    /// 16 B = 8 KB). However, the message-count clamp (`max_msgs`)
    /// prevents the other extreme: tiny messages + high PUB-fanout
    /// peer counts otherwise build up per-peer pending queues large
    /// enough to overflow HWM and drop messages (measured: ~50%
    /// regression on `throughput/pub_fanout subs=64` without a clamp).
    ///
    /// Always drains at least one message, even if it alone exceeds
    /// `max_bytes`, so a single large send is not starved.
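    ///
    /// Hedged usage sketch (budget values are illustrative, not the engine's
    /// configured defaults):
    ///
    /// ```ignore
    /// // Pull up to ~8 KiB of payload or 512 messages, whichever hits first.
    /// let queued = writer.drain_batch(&outbound_rx, Some(8192), Some(512));
    /// if queued > 0 {
    ///     let _ = writer.flush_one_pass()?;
    /// }
    /// ```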
    pub(crate) fn drain_batch(
        &mut self,
        outbound: &flume::Receiver<crate::engine::Outbound>,
        max_bytes: Option<usize>,
        max_msgs: Option<usize>,
    ) -> usize {
        use flume::TryRecvError;
        let mut count = 0;
        let mut accumulated: usize = 0;
        loop {
            // Stop once we've enqueued at least one message AND either
            // configured bound is reached. Payload-only byte count
            // (ignoring 2-9 B frame headers) to match libzmq's
            // payload-oriented `out_batch_size`.
            if count > 0 {
                let byte_limit_hit = max_bytes.is_some_and(|m| accumulated >= m);
                let msg_limit_hit = max_msgs.is_some_and(|m| count >= m);
                if byte_limit_hit || msg_limit_hit {
                    break;
                }
            }
            match outbound.try_recv() {
                Ok(o) => {
                    accumulated += msg_payload_size(&o.msg);
                    self.enqueue(o.msg);
                    count += 1;
                }
                Err(TryRecvError::Empty | TryRecvError::Disconnected) => break,
            }
        }
        count
    }

    /// Single-pass flush: attempt one non-blocking `try_write_vectored`.
    /// Returns the number of whole `PendingMsg` values fully written
    /// (may be 0 on `WouldBlock` — caller must `writable().await` to
    /// wait for the socket before retrying). This method MUST NOT block,
    /// otherwise the `peer_loop`'s concurrent read/recv arms starve and
    /// bidirectional patterns (DEALER↔ROUTER, REQ↔REP) can deadlock.
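    ///
    /// Hedged sketch of the driving pattern (the real peer loop lives in
    /// `src/engine.rs`; names here are illustrative):
    ///
    /// ```ignore
    /// while !writer.is_empty() {
    ///     if writer.flush_one_pass()? == 0 {
    ///         // No whole message finished (WouldBlock or partial progress):
    ///         // wait for writable readiness before retrying.
    ///         writer.writable_owned().await?;
    ///     }
    /// }
    /// ```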
    pub(crate) fn flush_one_pass(&mut self) -> std::io::Result<usize> {
        if self.pending.is_empty() {
            return Ok(0);
        }
        let iovs: smallvec::SmallVec<[std::io::IoSlice<'_>; IOV_CAP]> = build_iovs(&self.pending);
        let write_result = {
            let _g = self.shared.write_lock.lock();
            self.shared.half.try_write_vectored(&iovs)
        };
        match write_result {
            Ok(0) => Err(io::Error::from(CodecError::WriteZero)),
            Ok(n) => {
                drop(iovs);
                Ok(self.advance(n) as usize)
            }
            Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => {
                drop(iovs);
                Ok(0)
            }
            Err(e) => Err(e),
        }
    }

    /// Return a `'static` writable-readiness future by cloning the shared
    /// Arc handle into the future. Lets `peer_loop`'s `select!` hold a
    /// pending writable arm without borrowing `&self`, so other arms can
    /// still take `&mut writer` to enqueue / drain.
    ///
    /// Cost vs. `writable()`: one extra `Arc::clone` (atomic refcount bump,
    /// ~5 ns) per call, which trades for not heap-allocating a `Box::pin`
    /// every loop iteration.
    pub(crate) fn writable_owned(
        &self,
    ) -> impl std::future::Future<Output = std::io::Result<()>> + Send + 'static
    where
        W: Send + Sync + 'static,
    {
        let shared = self.shared.clone();
        async move { shared.half.writable().await }
    }

    /// Drain the pending queue to the wire. Returns the number of whole
    /// `PendingMsg` values fully written. Blocks awaiting writable readiness
    /// on `WouldBlock`.
    ///
    /// On an un-retryable I/O error the queue may still contain partially
    /// drained messages — we leave them there; the writer task treats the
    /// error as terminal and exits, which flips `writer_alive=false` and
    /// wakes any in-flight flush waiters with `SendError::Flush`.
    #[cfg(all(test, feature = "tokio"))]
    pub(crate) async fn flush_all(&mut self) -> io::Result<u64> {
        let mut whole: u64 = 0;
        while !self.pending.is_empty() {
            // Build up to IOV_CAP IoSlice entries from the front of the
            // queue. Lifetime is tied to `self.pending` — the slices borrow
            // from `PendingFrame.header.bytes` / `PendingFrame.payload`, both
            // of which are owned by queue entries that won't move while the
            // slice array exists (we don't touch `self.pending` between here
            // and the `try_write_vectored` call).
            let iovs: SmallVec<[IoSlice<'_>; IOV_CAP]> = build_iovs(&self.pending);
            debug_assert!(
                !iovs.is_empty(),
                "non-empty pending must yield at least one slice"
            );

            let write_result = {
                let _g = self.shared.write_lock.lock();
                self.shared.half.try_write_vectored(&iovs)
            };
            match write_result {
                Ok(0) => {
                    return Err(io::Error::from(CodecError::WriteZero));
                }
                Ok(n) => {
                    drop(iovs);
                    whole += self.advance(n);
                }
                Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
                    drop(iovs);
                    self.shared.half.writable().await?;
                }
                Err(e) => return Err(e),
            }
        }
        Ok(whole)
    }

    /// Advance the pending queue by `n` bytes. Returns the number of whole
    /// `PendingMsg` values whose last byte was consumed by this advance.
    fn advance(&mut self, mut n: usize) -> u64 {
        let mut whole: u64 = 0;
        while n > 0 {
            let front = self
                .pending
                .front_mut()
                .expect("advance called with pending empty");
            // Drain frames front-to-back within this message.
            while n > 0 && !front.frames.is_empty() {
                let frame = &mut front.frames[0];
                let hdr_left = (frame.header.len - frame.header_pos) as usize;
                if hdr_left > 0 {
                    let take = n.min(hdr_left);
                    frame.header_pos += take as u8;
                    n -= take;
                    if n == 0 {
                        break;
                    }
                }
                let pay_left = frame.payload.len();
                if pay_left > 0 {
                    let take = n.min(pay_left);
                    frame.payload.advance(take);
                    n -= take;
                }
                if frame.is_drained() {
                    front.frames.remove(0);
                } else {
                    debug_assert_eq!(n, 0, "partial frame must leave n == 0");
                    break;
                }
            }
            if front.frames.is_empty() {
                self.pending.pop_front();
                whole += 1;
            }
        }
        // Keep the inline-write gate in lock-step with the real length.
        self.shared
            .pending_len
            .store(self.pending.len(), Ordering::Release);
        whole
    }
}

/// Approximate payload byte size of a `Message`, used by the byte-based
/// batching in `drain_batch`. Counts every payload frame; ignores the 2-9 B
/// frame headers (matching libzmq's `out_batch_size` convention, which is
/// likewise payload-oriented). Non-application variants (greeting, command,
/// heartbeat) return 0 so they never block the batch from accepting more
/// application messages behind them.
fn msg_payload_size(msg: &Message) -> usize {
    match msg {
        Message::Message(m) => m.iter().map(|f| f.len()).sum(),
        Message::Shared(m) => m.iter().map(|f| f.len()).sum(),
        Message::Greeting(_) | Message::Command(_) | Message::Heartbeat(_) => 0,
        Message::SecurityRaw(b) => b.len(),
    }
}

/// Build an `IoSlice` array from the front of the pending queue, capped at
/// `IOV_CAP`. Kept as a free function so the borrow is scoped tightly — the
/// returned slices hold a shared borrow on `pending`, and the caller drops
/// the array before mutating.
fn build_iovs(pending: &VecDeque<PendingMsg>) -> SmallVec<[IoSlice<'_>; IOV_CAP]> {
    let mut iovs: SmallVec<[IoSlice<'_>; IOV_CAP]> = SmallVec::new();
    'outer: for msg in pending.iter() {
        for frame in &msg.frames {
            let hdr_start = frame.header_pos as usize;
            let hdr_end = frame.header.len as usize;
            if hdr_start < hdr_end {
                if iovs.len() == IOV_CAP {
                    break 'outer;
                }
                iovs.push(IoSlice::new(&frame.header.bytes[hdr_start..hdr_end]));
            }
            if !frame.payload.is_empty() {
                if iovs.len() == IOV_CAP {
                    break 'outer;
                }
                iovs.push(IoSlice::new(frame.payload.as_ref()));
            }
        }
    }
    iovs
}

#[cfg(all(test, feature = "tokio"))]
mod tests {
    use super::*;
    use crate::codec::Message;
    use crate::message::ZmqMessage;
    use bytes::Bytes;
    use futures::StreamExt;
    use tokio::net::{TcpListener, TcpStream};

    /// Convert a bare `TcpStream` into a `VectoredWriter` for testing.
    /// Tests operate on a post-greeting codec to skip the ZMTP handshake;
    /// in production `PeerEngine::spawn` will call
    /// `ZmqFramedWrite::into_engine_writer` on the already-greeted stream.
    fn engine_half_from_tcp(tcp: TcpStream) -> VectoredWriter<tokio::net::tcp::OwnedWriteHalf> {
        let (_r, w) = tcp.into_split();
        VectoredWriter::new(w)
    }

    /// 1000 single-frame messages end-to-end through a `VectoredWriter` +
    /// `FramedRead` pair. Ordered delivery + correct framing.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn vectored_writer_roundtrip() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        // Server side is the writer; skip the greeting (unit test exercises
        // only the encode+writev loop, not ZMTP handshake).
        let mut writer = engine_half_from_tcp(server);

        // Client side reads raw bytes via FramedRead with a fresh ZmqCodec.
        // Bypass greeting by manually seeding the codec state past the
        // greeting expectation.
        use crate::codec::ZmqCodec;
        let (read_half, _write_half) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);

        for i in 0..1000u32 {
            let msg = ZmqMessage::from(Bytes::from(i.to_be_bytes().to_vec()));
            writer.enqueue(Message::Message(msg));
        }
        let flushed = writer.flush_all().await.unwrap();
        assert_eq!(flushed, 1000);
        assert!(writer.is_empty());

        for i in 0..1000u32 {
            match reader.next().await.expect("stream closed").unwrap() {
                Message::Message(m) => {
                    let frame = m.get(0).expect("frame").clone();
                    assert_eq!(&frame[..], &i.to_be_bytes()[..]);
                }
                other => panic!("unexpected variant: {:?}", other),
            }
        }
    }

    /// 10 messages × 4 frames (varied sizes) with SNDMORE preserved. Reader
    /// must decode exactly 10 four-frame `ZmqMessage`s in order.
    ///
    /// The total payload (~690 KiB) exceeds the default TCP send+recv buffer
    /// on some platforms (notably macOS), so we run the reader concurrently
    /// with `flush_all()` to avoid a send-buffer-full → reader-not-yet-draining
    /// deadlock.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn vectored_writer_multiframe_atomicity() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        use crate::codec::ZmqCodec;
        let (read_half, _w) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);

        let sizes = [1usize, 256, 4096, 65_536];
        for i in 0..10u32 {
            let mut m = ZmqMessage::from(Bytes::from(format!("start-{}", i).into_bytes()));
            for (idx, &sz) in sizes.iter().enumerate() {
                if idx == 0 {
                    continue;
                }
                m.push_back(Bytes::from(vec![idx as u8 ^ i as u8; sz]));
            }
            // Now m has 4 frames.
            assert_eq!(m.len(), 4);
            writer.enqueue(Message::Message(m));
        }

        // Spawn reader concurrently so the writer doesn't block waiting for
        // TCP buffer space.
        let reader_task = tokio::spawn(async move {
            let mut received = Vec::with_capacity(10);
            for _ in 0..10u32 {
                match reader.next().await.expect("closed").unwrap() {
                    Message::Message(m) => received.push(m),
                    other => panic!("unexpected variant: {:?}", other),
                }
            }
            received
        });

        let flushed = writer.flush_all().await.unwrap();
        assert_eq!(flushed, 10);

        let received = reader_task.await.unwrap();
        for (i, m) in received.into_iter().enumerate() {
            assert_eq!(m.len(), 4, "message {} frame count", i);
            assert_eq!(
                m.get(0).unwrap().as_ref(),
                format!("start-{}", i).as_bytes()
            );
            assert_eq!(m.get(1).unwrap().len(), 256);
            assert_eq!(m.get(2).unwrap().len(), 4096);
            assert_eq!(m.get(3).unwrap().len(), 65_536);
        }
    }

    /// Exercise partial `try_write_vectored` by sending a single 128 KiB
    /// message, which exceeds the default socket send buffer on most
    /// platforms. The sink must loop `writable()` → `try_write_vectored`
    /// until the whole logical message is on the wire.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn vectored_writer_partial_write() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        // No explicit buffer shrinking: tokio's TcpStream doesn't expose
        // set_send_buffer_size directly (it would require socket2 on the raw
        // socket). NODELAY plus a 128 KiB payload against default buffers
        // still exercises the partial-write path on most kernels, especially
        // with a lockstep consumer.
        server.set_nodelay(true).ok();
        client.set_nodelay(true).ok();

        let mut writer = engine_half_from_tcp(server);
        use crate::codec::ZmqCodec;
        let (read_half, _w) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);

        let payload = Bytes::from(vec![0xa5u8; 128 * 1024]);
        let msg = ZmqMessage::from(payload.clone());
        writer.enqueue(Message::Message(msg));

        let (write_res, read_res) = tokio::join!(writer.flush_all(), reader.next());
        assert_eq!(write_res.unwrap(), 1);
        match read_res.expect("closed").unwrap() {
            Message::Message(m) => {
                assert_eq!(m.len(), 1);
                assert_eq!(m.get(0).unwrap().as_ref(), payload.as_ref());
            }
            other => panic!("unexpected variant: {:?}", other),
        }
    }

    /// Three 64 KiB messages in a row; reader must decode exactly three
    /// with boundaries preserved even if the kernel returned partial writes
    /// in between.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn vectored_writer_interleaved_partial() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        use crate::codec::ZmqCodec;
        let (read_half, _w) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);

        for i in 0u32..3 {
            let payload = Bytes::from(vec![i as u8 + 1; 64 * 1024]);
            writer.enqueue(Message::Message(ZmqMessage::from(payload)));
        }
        let (write_res, r0) = tokio::join!(writer.flush_all(), reader.next());
        assert_eq!(write_res.unwrap(), 3);
        let mut seen = Vec::new();
        match r0.unwrap().unwrap() {
            Message::Message(m) => seen.push(m.get(0).unwrap().clone()),
            _ => unreachable!(),
        }
        for _ in 0..2 {
            match reader.next().await.unwrap().unwrap() {
                Message::Message(m) => seen.push(m.get(0).unwrap().clone()),
                _ => unreachable!(),
            }
        }
        for (i, frame) in seen.iter().enumerate() {
            assert_eq!(frame.len(), 64 * 1024);
            assert!(frame.iter().all(|&b| b == (i as u8 + 1)));
        }
    }

    /// Peer closes mid-flush; the sink returns an `io::Error` instead of
    /// hanging.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn vectored_writer_peer_close_mid_flush() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        // Drop the client, then issue enough writes to blow past the socket
        // buffer so the kernel eventually signals the closed peer.
        drop(client);

        for _ in 0..128 {
            writer.enqueue(Message::Message(ZmqMessage::from(Bytes::from(vec![
                0xau8;
                4096
            ]))));
        }
        let res = writer.flush_all().await;
        assert!(res.is_err(), "flush_all should surface peer-close error");
    }

    // ── Fast-path tests ─────────────────────────────────────────────────

    /// Eligible single-frame small message: fast path must fire and return
    /// `Sent` without touching the pending queue. Reader sees the message.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fast_path_single_small_frame_sent() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        use crate::codec::ZmqCodec;
        let (read_half, _w) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);

        // Wait for the socket to be writable so the fast path doesn't hit
        // WouldBlock on the first try (fresh TCP sockets can be non-writable
        // immediately post-handshake in release builds).
        writer.writable_owned().await.unwrap();

        let payload = Bytes::from(vec![0x77u8; 16]);
        let msg = Message::Message(ZmqMessage::from(payload.clone()));
        match writer.try_fast_path_single_frame(msg).unwrap() {
            FastPath::Sent => {}
            other => panic!("expected Sent, got {:?}", other),
        }
        assert!(writer.is_empty(), "fast path must not queue");

        match reader.next().await.expect("closed").unwrap() {
            Message::Message(m) => {
                assert_eq!(m.len(), 1);
                assert_eq!(m.get(0).unwrap().as_ref(), payload.as_ref());
            }
            other => panic!("unexpected variant: {:?}", other),
        }
    }

    /// Payload at and just below the threshold: cap-1 B must be accepted,
    /// cap B must be rejected (strict `<`). The peer-loop fast path
    /// handles partial writes via its own `PendingMsg` queue, so the cap
    /// only gates against blowing past one writev's reasonable budget.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fast_path_threshold_boundary() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let (accept_res, connect_res) =
            futures::join!(listener.accept(), TcpStream::connect(addr),);
        let (server, _) = accept_res.unwrap();
        let _client = connect_res.unwrap(); // keep alive so server writes succeed

        let mut writer = engine_half_from_tcp(server);
        writer.writable_owned().await.unwrap();

        // cap-1 B — eligible.
        let ok_msg = Message::Message(ZmqMessage::from(Bytes::from(vec![
            0x11u8;
            FAST_PATH_MAX_PAYLOAD - 1
        ])));
        match writer.try_fast_path_single_frame(ok_msg).unwrap() {
            // Sent fully or buffered after partial write — both prove
            // eligibility (cap didn't gate it off).
            FastPath::Sent | FastPath::Enqueued => {}
            other @ FastPath::NotTaken(_) => {
                panic!("expected Sent/Enqueued at cap-1 B, got {:?}", other)
            }
        }

        // Drain whatever the cap-1 send queued (if it took Enqueued) so
        // the cap-B test sees a clean writer.
        let _ = writer.flush_all().await;

        // cap B — NOT eligible (strict `<` threshold).
        let big_msg = Message::Message(ZmqMessage::from(Bytes::from(vec![
            0x22u8;
            FAST_PATH_MAX_PAYLOAD
        ])));
        match writer.try_fast_path_single_frame(big_msg).unwrap() {
            FastPath::NotTaken(_) => {}
            other => panic!("expected NotTaken at cap, got {:?}", other),
        }
        assert!(writer.is_empty(), "NotTaken must not touch queue");
    }

    /// Multi-frame message: fast path must reject (returns `NotTaken`) because
    /// the fast path only handles single-frame sends.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fast_path_skipped_for_multiframe() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let (accept_res, connect_res) =
            futures::join!(listener.accept(), TcpStream::connect(addr),);
        let (server, _) = accept_res.unwrap();
        let _client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        let mut m = ZmqMessage::from(Bytes::from(vec![0xaau8; 16]));
        m.push_back(Bytes::from(vec![0xbbu8; 16]));
        assert_eq!(m.len(), 2);

        let msg = Message::Message(m);
        match writer.try_fast_path_single_frame(msg).unwrap() {
            FastPath::NotTaken(_) => {}
            _ => panic!("2-frame must not fast-path"),
        }
        assert!(writer.is_empty());
    }

    /// `Message::Shared` single-frame small: also eligible.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fast_path_shared_variant_sent() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let (accept_res, connect_res) =
            futures::join!(listener.accept(), TcpStream::connect(addr),);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();

        let mut writer = engine_half_from_tcp(server);
        use crate::codec::ZmqCodec;
        let (read_half, _w) = client.into_split();
        let codec = ZmqCodec::post_greeting();
        let mut reader = tokio_util::codec::FramedRead::new(read_half, codec);
        writer.writable_owned().await.unwrap();

        let payload = Bytes::from(vec![0x3cu8; 32]);
        let arc = std::sync::Arc::new(ZmqMessage::from(payload.clone()));
        let msg = Message::Shared(arc);
        assert!(matches!(
            writer.try_fast_path_single_frame(msg).unwrap(),
            FastPath::Sent
        ));
        match reader.next().await.expect("closed").unwrap() {
            Message::Message(m) => {
                assert_eq!(m.get(0).unwrap().as_ref(), payload.as_ref());
            }
            other => panic!("unexpected: {:?}", other),
        }
    }

    /// Peer close via the fast path must surface as Err, not panic.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fast_path_peer_close_errors() {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let (accept_res, connect_res) =
            futures::join!(listener.accept(), TcpStream::connect(addr),);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();
        drop(client);
        // Give the OS a moment to propagate the close.
        tokio::time::sleep(std::time::Duration::from_millis(10)).await;

        let mut writer = engine_half_from_tcp(server);
        // Write once to get the kernel to notice the closed peer, then try
        // the fast path — the second one should surface the error.
        let _ = writer.try_fast_path_single_frame(Message::Message(ZmqMessage::from(Bytes::from(
            vec![0u8; 8],
        ))));
        let mut saw_err = false;
        for _ in 0..64 {
            if writer
                .try_fast_path_single_frame(Message::Message(ZmqMessage::from(Bytes::from(
                    vec![0u8; 8],
                ))))
                .is_err()
            {
                saw_err = true;
                break;
            }
            tokio::time::sleep(std::time::Duration::from_millis(5)).await;
        }
        assert!(saw_err, "expected I/O error once peer-close is detected");
    }
}