//! Each `PeerEngine` owns the `ZmqFramedRead`/`ZmqFramedWrite` halves of a
//! connection and spawns two background tasks connected to the socket
//! layer by bounded `flume` SPSC pipes:
//!
//! - **`reader_loop`** — drains the framed stream into the inbound channel.
//!   A full inbound channel yields natural receive-side backpressure
//!   (`receive_hwm`).
//! - **`writer_loop`** — drains the outbound channel into the framed sink,
//!   batching up to `out_batch_size` payload bytes between flushes
//!   (default 8192, matching libzmq's `ZMQ_OUT_BATCH_SIZE`). Collapses
//!   N `poll_flush`es into one per batch.
//!
//! The socket layer talks to the engine exclusively through those two
//! channels; it never touches `ZmqFramedRead`/`ZmqFramedWrite` after
//! `PeerEngine::spawn` returns.
//!
//! Shutdown:
//! - **`writer_loop`**: closed outbound channel → `recv` returns `Err` →
//!   loop exits. Dropping the engine drops its `outbound: Sender<_>`;
//!   since the engine is the only sender, that closes the channel.
//! - **`reader_loop`**: the engine owns a `oneshot::Sender<()>`. On drop
//!   it fires a shutdown signal that races the socket read inside a
//!   `select!`. Without this, a reader blocked on `poll_next` would
//!   wait indefinitely for the TCP side to notice the close.
//!
//! Task `JoinHandle`s are detached — we never await them from `Drop`
//! (which would block). If a socket needs hard-shutdown semantics
//! (e.g. unit tests), it can call [`PeerEngine::shutdown`] and await
//! its return.
//!
//! `send_flushed().await` semantics: awaits until the message has been
//! *flushed* to the sink (not merely queued). Callers can rely on bytes
//! being on the wire when the future resolves, via a shared per-engine
//! `FlushState` (a `flushed: AtomicU64` counter plus an fd-based
//! signaler in place of a `Notify`):
//!
//! - An `enqueue_lock: parking_lot::Mutex<()>` guards the critical section
//!   where a send synchronously enqueues into the outbound channel *and*
//!   atomically bumps `enqueued: AtomicU64`. The lock is held only for the
//!   sync `try_send` + counter bump (tens of nanoseconds), never across an
//!   await.
//! - The writer task, after each successful flush of N messages, does
//!   `flushed.fetch_add(N)` and wakes any parked flush waiters. Because the
//!   channel preserves FIFO w.r.t. the locked enqueue order, the absolute
//!   `enqueued` position assigned to each message matches the 1:1 position
//!   the writer sees on dequeue — so a caller waiting on `my_pos` wakes
//!   exactly when its message has been flushed.
//!
//! Cost: one uncontended `parking_lot::Mutex::lock()` + one `fetch_add` per
//! send-that-wants-flush. No heap allocation per message.
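//!
//! As a reduced sketch of that protocol, with `std::sync::Condvar`
//! standing in for the fd signaler and the `flume` channel elided
//! (illustrative only, not the engine's actual types):
//!
//! ```ignore
//! use std::sync::{Condvar, Mutex};
//!
//! struct Flush {
//!     enqueued: Mutex<u64>, // bumped inside the enqueue critical section
//!     flushed: Mutex<u64>,  // bumped by the writer after each sink flush
//!     cv: Condvar,
//! }
//!
//! impl Flush {
//!     /// Sender side: reserve a position, then wait until it's flushed.
//!     fn send_flushed(&self) -> u64 {
//!         // In the real engine the `try_send` into the outbound channel
//!         // happens under this same lock, making position == queue order.
//!         let my_pos = {
//!             let mut e = self.enqueued.lock().unwrap();
//!             *e += 1;
//!             *e
//!         };
//!         let mut f = self.flushed.lock().unwrap();
//!         while *f < my_pos {
//!             f = self.cv.wait(f).unwrap();
//!         }
//!         my_pos
//!     }
//!
//!     /// Writer side: called after flushing a batch of `n` messages.
//!     fn batch_flushed(&self, n: u64) {
//!         *self.flushed.lock().unwrap() += n;
//!         self.cv.notify_all();
//!     }
//! }
//! ```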

pub(crate) mod backend;
pub(crate) mod peer_loop;
pub(crate) mod registry;
pub(crate) mod writer;

use crate::async_rt::task::spawn;
use crate::codec::{CodecError, Message};
use crate::error::{Disconnected, SendError};
use crate::io_compat::AsyncVectoredWrite;
#[cfg(feature = "inproc")]
use crate::message::ZmqMessage;
use crate::PeerIdentity;

#[cfg(feature = "inproc")]
use crate::async_rt::notify::AsyncNotify;
use crate::async_rt::signaler::{AsyncSignaler, RuntimeSignaler};
#[cfg(all(test, feature = "tokio"))]
use flume::Receiver;
use flume::{Sender, TrySendError};
use futures::channel::oneshot;
use futures::FutureExt;
use parking_lot::Mutex as SyncMutex;

use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Duration;

/// Sender for the inproc-only inbound channel.
/// `crossbeam_channel::Sender` is `Send + Sync + Clone`.
#[cfg(feature = "inproc")]
pub(crate) type InprocInboundTx = crossbeam_channel::Sender<(
    crate::engine::registry::PeerKey,
    Result<Message, CodecError>,
)>;

/// Receiver for the inproc-only inbound channel.
/// `crossbeam_channel::Receiver` is `Send + Sync + Clone`.
#[cfg(feature = "inproc")]
pub(crate) type InprocInboundRx = crossbeam_channel::Receiver<(
    crate::engine::registry::PeerKey,
    Result<Message, CodecError>,
)>;

/// Heartbeat configuration for a `PeerEngine`. All durations are wall-clock.
/// The engine sends a PING every `interval`, then waits up to `timeout` for
/// a PONG. If no PONG arrives the peer is considered stalled and the engine
/// exits.
pub(crate) struct HeartbeatConfig {
    pub interval: Duration,
    pub timeout: Duration,
    /// Time-to-live advertised to the peer in each PING (the ZMTP
    /// `heartbeat-ttl` field), telling the remote how long to wait
    /// before considering this side dead.
    pub ttl: Duration,
}

/// Outbound queue item — just the message. Flush completion is tracked
/// via the per-engine shared `FlushState` (see [`PeerEngine`] docs and
/// [`FlushToken`]), not per-message notifiers.
pub(crate) struct Outbound {
    pub(crate) msg: Message,
}

/// Shared flush-progress state. One allocation per engine; every
/// [`FlushToken`] clones the `Arc` into itself. Writer owns the write
/// side of all three fields.
pub(crate) struct FlushState {
    /// Total messages the writer has flushed to the wire. Monotonic.
    pub(crate) flushed: AtomicU64,
    /// Set to false when the writer task exits, so tokens waiting on a
    /// `flushed >= target` that will never be reached can surface an error.
    pub(crate) writer_alive: std::sync::atomic::AtomicBool,
    /// Wakes tokens waiting for `target <= flushed` (or for
    /// `writer_alive` to flip false). Edge-triggered fd signaler:
    /// `eventfd` on Linux, `pipe` on other unix.
    pub(crate) signaler: Arc<RuntimeSignaler>,
    /// Tasks currently parked on `signaler.signaled()` for a flush
    /// target. Producers gate `signal()` on this being > 0 so the
    /// fd write is skipped on the steady-state path where no one
    /// waits — PUB/XPUB fanout never produces flush tokens, so this
    /// stays 0 on the bench's hot path. Acquire matches the Release
    /// on the increment so we never miss a just-registered waiter.
    pub(crate) flush_waiters: AtomicUsize,
}

impl FlushState {
    /// Wake any tasks parked on the flush signaler — but only if at
    /// least one is parked. The fd write is cheap but still a syscall;
    /// gating on the waiter count keeps the no-waiter path syscall-free.
    #[inline]
    pub(crate) fn notify_flush_waiters(&self) {
        if self.flush_waiters.load(Ordering::Acquire) > 0 {
            self.signaler.signal();
        }
    }
}

/// RAII increment of `FlushState::flush_waiters`. Bumps on construction
/// (with `Release` so a producer that observes >0 will also see the
/// flush counter the waiter just sampled), decrements on drop. Used by
/// `FlushToken::await_flush` and `send_flushed`'s slot-wake path so
/// producers can short-circuit `notify_waiters` calls.
struct FlushWaiterGuard<'a> {
    state: &'a FlushState,
}

impl<'a> FlushWaiterGuard<'a> {
    #[inline]
    fn new(state: &'a FlushState) -> Self {
        state.flush_waiters.fetch_add(1, Ordering::Release);
        Self { state }
    }
}

impl Drop for FlushWaiterGuard<'_> {
    #[inline]
    fn drop(&mut self) {
        self.state.flush_waiters.fetch_sub(1, Ordering::Release);
    }
}

/// Returned by [`PeerEngine::try_send_tracked`]; resolves when the
/// message has been flushed to the wire. Drop without awaiting is
/// fine — the engine does not depend on anyone awaiting tokens.
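///
/// A hypothetical call site (the two calls are real items from this
/// module; the surrounding socket code is illustrative):
///
/// ```ignore
/// let token = engine.try_send_tracked(msg)?; // enqueue + reserve a position
/// token.await_flush().await?;                // resolves once on the wire
/// ```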
pub(crate) struct FlushToken {
    target: u64,
    state: Arc<FlushState>,
}

impl FlushToken {
    /// Await the write-side flush. Returns `Err(Disconnected)` if the
    /// writer task exits before the target is reached — the message may
    /// or may not have landed on the wire; the caller should treat the
    /// peer as gone.
    pub(crate) async fn await_flush(self) -> Result<(), Disconnected> {
        // Fast path: target already met, no need to register as a waiter.
        if self.state.flushed.load(Ordering::Acquire) >= self.target {
            return Ok(());
        }
        if !self.state.writer_alive.load(Ordering::Acquire) {
            return Err(Disconnected);
        }
        // Slow path: park on notify. Increment the waiter count so
        // producers know to call `notify_waiters` (skipped when zero).
        let _wg = FlushWaiterGuard::new(&self.state);
        loop {
            if self.state.flushed.load(Ordering::Acquire) >= self.target {
                return Ok(());
            }
            if !self.state.writer_alive.load(Ordering::Acquire) {
                return Err(Disconnected);
            }
            let fut = self.state.signaler.signaled();
            // Recheck between creating the wait future and awaiting it;
            // the standard notify idiom, closing the "writer bumped
            // between the load and the await" race.
            if self.state.flushed.load(Ordering::Acquire) >= self.target {
                return Ok(());
            }
            if !self.state.writer_alive.load(Ordering::Acquire) {
                return Err(Disconnected);
            }
            fut.await;
        }
    }
}

/// Per-peer engine. Owns the two I/O tasks and the channels that bridge
/// them to the socket layer.
///
/// `Drop` fires a shutdown oneshot so background tasks can exit
/// gracefully without blocking. The tasks also exit naturally when the
/// socket drops all `PeerEngine`s (receiver closed → writer exits; sender
/// closed → reader's push fails → reader exits).
pub(crate) struct PeerEngine {
    /// Socket → writer task. Socket calls `try_send_tracked` (PUB/XPUB,
    /// drop on full per RFC 29) or `send` / `send_flushed` (DEALER/REQ/REP/
    /// ROUTER/PUSH, block on full and optionally await flush).
    outbound: Sender<Outbound>,
    /// Sync lock guarding the enqueue critical section — held just long
    /// enough to make "bump `enqueued` + `try_send` into outbound" atomic
    /// so the writer's dequeue order matches the assigned positions 1:1.
    /// Never held across an `.await`.
    enqueue_lock: SyncMutex<()>,
    /// Total messages accepted into the outbound channel. Bumped under
    /// `enqueue_lock` after a successful `try_send`.
    enqueued: AtomicU64,
    /// Shared flush-progress state. Both tokens and the writer task clone
    /// this Arc; writer owns the write side of all fields inside.
    flush_state: Arc<FlushState>,
    /// Caller-thread inline-write fast path. `Some` only when the
    /// socket opted in via `SocketOptions::inline_write_max =
    /// Some(_)` and the engine isn't CURVE-encrypted. `None` means
    /// every send goes through the channel + peer loop.
    inline_write: Option<Arc<dyn crate::engine::writer::InlineWriteTarget>>,
    /// Per-message payload cap forwarded into every inline call.
    /// `None` (when `inline_write` is `Some`) means uncapped; `Some(n)`
    /// means decline payloads `>= n`. Ignored when `inline_write` is
    /// `None`.
    inline_write_max: Option<usize>,
    /// Drop-fired shutdown signal for the **reader** task. The writer
    /// doesn't subscribe; it exits via outbound-channel close when the
    /// engine is dropped (see module-level doc for why).
    shutdown_tx: Option<oneshot::Sender<()>>,
}

/// Tagged inbound sender type. The reader task tags each decoded
/// message with the engine's `peer_key` (a `Copy` u32) before
/// forwarding. Socket consumers resolve the key to a `PeerIdentity`
/// only at cold path boundaries (ROUTER wire envelope, disconnect).
pub(crate) type TaggedInboundTx = Sender<(
    crate::engine::registry::PeerKey,
    Result<Message, CodecError>,
)>;

#[inline]
fn unwrap_outbound_err(e: TrySendError<Outbound>) -> TrySendError<Message> {
    // Unwrap the `Outbound` envelope so the returned error carries the
    // raw `Message` for caller-side fallback handling. PUB/XPUB pattern-
    // match on `Full` for HWM-drop semantics.
    match e {
        TrySendError::Full(o) => TrySendError::Full(o.msg),
        TrySendError::Disconnected(o) => TrySendError::Disconnected(o.msg),
    }
}

impl PeerEngine {
    /// Spawn the unified peer loop for an established `FramedIo`-like pair.
    /// `send_hwm` bounds the per-peer outbound queue. `config` carries optional
    /// settings (`heartbeat`, `curve`, `max_msg_size`).
    pub(crate) fn spawn<R, W>(
        peer_key: crate::engine::registry::PeerKey,
        _peer_id: PeerIdentity,
        read_half: R,
        write_half: W,
        send_hwm: usize,
        shared_inbound: TaggedInboundTx,
        config: crate::engine::peer_loop::PeerConfig,
    ) -> Self
    where
        R: futures::Stream<Item = Result<Message, CodecError>> + Unpin + Send + 'static,
        W: AsyncVectoredWrite + Send + Sync + 'static,
    {
        let (outbound_tx, outbound_rx) = flume::bounded::<Outbound>(send_hwm);
        let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
        let shutdown = shutdown_rx.shared();

        let flush_state = Arc::new(FlushState {
            flushed: AtomicU64::new(0),
            writer_alive: std::sync::atomic::AtomicBool::new(true),
            signaler: Arc::new(RuntimeSignaler::new().expect("signaler init")),
            flush_waiters: AtomicUsize::new(0),
        });

        // Resolve the inline-write knob. Outer Some on
        // `config.inline_write_max` means inline is enabled; inner
        // `Option<usize>` is the per-message payload cap (None =
        // uncapped). CURVE was already forced to outer-None by the
        // backend when building `PeerConfig`.
        let inline_write_max: Option<usize> = config.inline_write_max.unwrap_or(None);
        let inline_enabled = config.inline_write_max.is_some();
        let inline_write = {
            use crate::engine::peer_loop::{peer_loop, PeerChannels};
            use crate::engine::writer::VectoredWriter;
            let writer = VectoredWriter::new(write_half);
            // Grab the inline target before moving the writer into the
            // peer loop. Only Some when the engine opted in.
            let inline: Option<Arc<dyn crate::engine::writer::InlineWriteTarget>> =
                if inline_enabled {
                    Some(writer.inline_write_target())
                } else {
                    None
                };
            spawn(peer_loop(
                read_half,
                writer,
                PeerChannels {
                    outbound_rx,
                    shared_inbound,
                },
                peer_key,
                flush_state.clone(),
                shutdown.clone(),
                config,
            ));
            inline
        };
        Self {
            outbound: outbound_tx,
            enqueue_lock: SyncMutex::new(()),
            enqueued: AtomicU64::new(0),
            flush_state,
            inline_write,
            inline_write_max,
            shutdown_tx: Some(shutdown_tx),
        }
    }

    /// Non-blocking send, fire-and-forget. Returns an error if the
    /// outbound channel is at its HWM (PUB/XPUB path — drop silently) or
    /// the writer task has exited (peer disconnected). No flush-completion
    /// tracking; the caller accepted that in the try-send contract.
    ///
    /// Does NOT take `enqueue_lock`: fire-and-forget doesn't need position
    /// tracking, so racing with other senders is fine. The writer still
    /// drains FIFO; it just won't correlate these items to any waiter.
    /// Note: the `enqueued` counter is NOT bumped here, which means tracked
    /// sends interleaved with fire-and-forget sends see gaps in their
    /// flush-counter correlation. That's OK because PUB/XPUB sockets only
    /// ever use `try_send_tracked` (not plain `try_send`) when they care
    /// about flush ordering; DEALER/etc. always use `send_flushed`.
    #[cfg(all(test, feature = "tokio"))]
    pub(crate) fn try_send(&self, msg: Message) -> Result<(), TrySendError<Message>> {
        self.outbound
            .try_send(Outbound { msg })
            .map_err(unwrap_outbound_err)
    }

    /// Caller-thread inline-write fast path. Returns `Some(Ok(()))` when
    /// the message landed on the wire directly (no channel bounce, no
    /// peer-loop wake), `Some(Err(_))` on peer disconnect, or `None`
    /// when the fast path was declined and the caller must fall back
    /// to the channel.
    ///
    /// Returns `None` immediately when the engine has inline disabled
    /// (`self.inline_write.is_none()`), which is the steady state for
    /// every socket type except REQ/REP/PAIR (and any other socket the
    /// user explicitly opted in via `SocketOptions::inline_write_max`).
    /// On enabled engines, the inline target enforces the
    /// `inline_write_max` payload cap and the FIFO/peer-loop-busy
    /// gates internally.
    ///
    /// On success bumps both `enqueued` and `flushed` so a concurrent
    /// `FlushToken::await_flush` or `drain_outbound` sees the message
    /// as accounted-for.
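    ///
    /// Intended fast-path-first shape (this is exactly how
    /// [`PeerEngine::send`] below uses it):
    ///
    /// ```ignore
    /// if let Some(result) = self.try_inline_write(&msg) {
    ///     return result; // wrote inline; enqueued/flushed already bumped
    /// }
    /// // declined: fall back to the outbound channel path
    /// ```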
    #[inline]
    fn try_inline_write(&self, msg: &Message) -> Option<Result<(), SendError>> {
        let zmsg: &crate::message::ZmqMessage = match msg {
            Message::Message(m) => m,
            Message::Shared(arc) => arc.as_ref(),
            _ => return None,
        };
        self.try_inline_zmsg(zmsg)
    }

    /// Shared implementation of the inline-write fast path. Dispatches
    /// to the single-frame or multi-frame `InlineWriteTarget` variant
    /// based on `msg.len()`, and on success bumps `enqueued`/`flushed`
    /// atomically under `enqueue_lock` (since the message skipped the
    /// channel entirely).
    #[inline]
    fn try_inline_zmsg(&self, msg: &crate::message::ZmqMessage) -> Option<Result<(), SendError>> {
        let handle = self.inline_write.as_ref()?;
        // Outbound must be empty: queued messages ahead would be jumped.
        if !self.outbound.is_empty() {
            return None;
        }
        let cap = self.inline_write_max;
        let io_result = match msg.len() {
            0 => return None,
            1 => {
                let payload = msg.get(0).expect("len==1").as_ref();
                handle.try_inline_single_frame(payload, cap)?
            }
            _ => {
                // Stack-collect frame slices (max 4 = FAST_PATH_MAX_FRAMES
                // in writer.rs). The trait method declines anything past.
                let mut frame_buf: [&[u8]; 4] = [&[]; 4];
                let n = msg.len();
                if n > frame_buf.len() {
                    return None;
                }
                for (i, frame) in msg.iter().enumerate() {
                    frame_buf[i] = frame.as_ref();
                }
                handle.try_inline_multi_frame(&frame_buf[..n], cap)?
            }
        };
        match io_result {
            Ok(()) => {
                let _g = self.enqueue_lock.lock();
                self.enqueued.fetch_add(1, Ordering::Relaxed);
                self.flush_state.flushed.fetch_add(1, Ordering::Release);
                self.flush_state.notify_flush_waiters();
                crate::wake_counter::bump(&crate::wake_counter::INLINE_WRITES);
                Some(Ok(()))
            }
            Err(_) => Some(Err(SendError::Flush)),
        }
    }

    /// PUB/XPUB fanout-side inline write. Mirrors `try_inline_write`
    /// but takes the `Arc<ZmqMessage>` directly so the fanout doesn't
    /// have to wrap it in `Message::Shared`. Returns `None` when
    /// inline is disabled — the steady state for PUB/XPUB unless the
    /// user explicitly opted in.
    #[inline]
    pub(crate) fn try_inline_fanout(
        &self,
        shared: &Arc<crate::message::ZmqMessage>,
    ) -> Option<Result<(), SendError>> {
        self.try_inline_zmsg(shared.as_ref())
    }

    /// PUB/XPUB fanout helper: enqueues under `enqueue_lock`, returning a
    /// [`FlushToken`] that resolves when the message has been flushed to
    /// the wire. Drop-on-full per RFC 29 (`Err(Full)`).
    pub(crate) fn try_send_tracked(
        &self,
        msg: Message,
    ) -> Result<FlushToken, TrySendError<Message>> {
        let _g = self.enqueue_lock.lock();
        self.outbound
            .try_send(Outbound { msg })
            .map_err(unwrap_outbound_err)?;
        let target = self.enqueued.fetch_add(1, Ordering::Relaxed) + 1;
        Ok(FlushToken {
            target,
            state: self.flush_state.clone(),
        })
    }

    /// PUB fire-and-forget enqueue. Skips both `enqueue_lock` and the
    /// `enqueued` counter since no one waits on a flush token for this
    /// message — PUB is best-effort delivery per RFC 29. Drop-on-full.
    pub(crate) fn try_send_fire_and_forget(
        &self,
        msg: Message,
    ) -> Result<(), TrySendError<Message>> {
        self.outbound
            .try_send(Outbound { msg })
            .map_err(unwrap_outbound_err)
    }

    /// Awaits space in the outbound channel. Returns once the message has
    /// been enqueued. Used by DEALER/REQ/REP/ROUTER/PUSH where the HWM
    /// semantic is "block until there's room" (libzmq `pipe_t`). Does NOT
    /// take `enqueue_lock`: it bumps `enqueued` after the enqueue (so
    /// `drain_outbound` accounting stays correct) but assigns no flush
    /// position; callers that need flush semantics use
    /// [`PeerEngine::send_flushed`].
    pub(crate) async fn send(&self, msg: Message) -> Result<(), SendError> {
        if let Some(result) = self.try_inline_write(&msg) {
            return result;
        }
        self.outbound
            .send_async(Outbound { msg })
            .await
            .map_err(|e| SendError::Enqueue(e.0.msg))?;
        self.enqueued.fetch_add(1, Ordering::Relaxed);
        crate::async_rt::task::yield_now().await;
        Ok(())
    }

    /// Flush-await variant of [`PeerEngine::send`]. Fast path: take
    /// `enqueue_lock`, `try_send`, await the shared flush-completion
    /// notify. Slow path: channel full → drop lock, await the flush
    /// notify (writer drains → slot opens), retry. Either way ends with
    /// a well-defined flush position; the writer bumps `flushed` past
    /// that position as it drains the FIFO.
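    ///
    /// Illustrative caller (a REQ-style socket's send path; the error
    /// propagation shown is hypothetical):
    ///
    /// ```ignore
    /// engine.send_flushed(Message::Message(zmsg)).await?;
    /// // bytes are on the wire (or the peer is gone) when this returns
    /// ```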
    pub(crate) async fn send_flushed(&self, mut msg: Message) -> Result<(), SendError> {
        if let Some(result) = self.try_inline_write(&msg) {
            return result;
        }
        // Bump the waiter count once for the whole HWM-retry loop; producers
        // only call `notify_waiters` when this is > 0. Dropped on loop exit.
        // We intentionally pay the bump even for the fast (channel-not-full)
        // path so the increment-then-try is race-free with a concurrent
        // writer-side `notify_flush_waiters()` check.
        let _wg = FlushWaiterGuard::new(&self.flush_state);
        let token = loop {
            // Create the wait future *before* taking the lock so the
            // HWM-full slow path doesn't miss a wake between the failed
            // try_send and the await: a flush that lands in that window
            // still marks the signaler, so `slot_wake` completes
            // immediately instead of parking forever.
            let slot_wake = self.flush_state.signaler.signaled();
            let attempt = {
                let _g = self.enqueue_lock.lock();
                match self.outbound.try_send(Outbound { msg }) {
                    Ok(()) => {
                        let target = self.enqueued.fetch_add(1, Ordering::Relaxed) + 1;
                        Ok(FlushToken {
                            target,
                            state: self.flush_state.clone(),
                        })
                    }
                    Err(TrySendError::Full(o)) => Err(Err(o.msg)),
                    Err(TrySendError::Disconnected(o)) => Err(Ok(o.msg)),
                }
            };
            match attempt {
                Ok(token) => break token,
                Err(Ok(returned)) => return Err(SendError::Enqueue(returned)),
                Err(Err(returned)) => {
                    msg = returned;
                    if !self.flush_state.writer_alive.load(Ordering::Acquire) {
                        return Err(SendError::Enqueue(msg));
                    }
                    // Wait for the writer to drain something. notify_waiters
                    // fires on every flush; we'll retry then.
                    slot_wake.await;
                }
            }
        };
        token.await_flush().await.map_err(|_e| SendError::Flush)
    }

    /// Returns `true` while the writer task is still running.
    /// Becomes `false` when the `peer_loop` (or writer task) exits —
    /// e.g. after a heartbeat timeout or TCP disconnect.
    pub(crate) fn writer_alive(&self) -> bool {
        self.flush_state.writer_alive.load(Ordering::Acquire)
    }

    /// Wait until all enqueued outbound messages have been flushed to the
    /// wire, or until `timeout` expires (pass `None` to block indefinitely).
    /// Returns immediately if the outbound queue is already empty or if the
    /// writer task has exited.
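    ///
    /// Hypothetical shutdown-path usage (the linger duration is
    /// illustrative):
    ///
    /// ```ignore
    /// // ZMQ_LINGER-style bounded drain before tearing the engine down:
    /// engine.drain_outbound(Some(Duration::from_millis(100))).await;
    /// ```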
    pub(crate) async fn drain_outbound(&self, timeout: Option<Duration>) {
        let fut = async {
            // Bumps `flush_waiters` for the duration so producers know
            // to call `notify_waiters`. Skipped on the early-return
            // (already drained) path so we never touch the atomic when
            // the queue is empty.
            let enqueued = self.enqueued.load(Ordering::Acquire);
            let flushed = self.flush_state.flushed.load(Ordering::Acquire);
            if flushed >= enqueued {
                return;
            }
            if !self.flush_state.writer_alive.load(Ordering::Acquire) {
                return;
            }
            let _wg = FlushWaiterGuard::new(&self.flush_state);
            loop {
                let enqueued = self.enqueued.load(Ordering::Acquire);
                let flushed = self.flush_state.flushed.load(Ordering::Acquire);
                if flushed >= enqueued {
                    return;
                }
                if !self.flush_state.writer_alive.load(Ordering::Acquire) {
                    return;
                }
                self.flush_state.signaler.signaled().await;
            }
        };
        match timeout {
            None => fut.await,
            Some(d) => {
                let _ = crate::async_rt::task::timeout(d, fut).await;
            }
        }
    }
}

impl Drop for PeerEngine {
    fn drop(&mut self) {
        if let Some(tx) = self.shutdown_tx.take() {
            let _ = tx.send(());
        }
    }
}

// ── InprocEngine ──────────────────────────────────────────────────────────────

/// Lightweight send handle for an inproc peer. Replaces `PeerEngine` for
/// inproc connections: no outbound channel, no flush state, no writer task,
/// no reader task. Messages are injected directly into the remote socket's
/// sync inproc inbound channel tagged with the remote's `PeerKey`.
///
/// The send path is fully synchronous: `try_send` on a bounded
/// `crossbeam_channel::Sender` followed by a `notify_one()` on the
/// remote's `RuntimeNotify` to wake a parked `recv_next`. No async
/// executor involvement on the hot path.
///
/// Disconnect signaling: `Drop` sends a synthetic `PeerDisconnected` error
/// and notifies the remote, which `recv_next` handles identically to TCP EOF.
///
/// # HWM semantics — divergence from libzmq
///
/// libzmq (`socket_base.cpp:831-840`) sizes each inproc pipe to
/// `binder.send_hwm + connector.receive_hwm` in one direction and the symmetric
/// sum in the other, using a per-pipe buffer.
///
/// We use a **single shared inbound queue per receiver socket**, sized to
/// `options.receive_hwm` at socket construction. This means:
///
/// - `send_hwm` on an **inproc-only** sender has no knob to attach to: sends
///   write directly into the peer's inbound (no per-peer outbound buffer
///   exists on `InprocEngine`). Backpressure is governed entirely by the
///   receiver's `receive_hwm`.
/// - Effective buffer between two inproc peers is `receiver.receive_hwm`, not
///   `receiver.receive_hwm + sender.send_hwm`.
/// - With many inproc peers hitting the same receiver, the queue is
///   shared across all senders (libzmq gives each pipe its own).
///
/// Practical impact: PUB→SUB drop thresholds and REQ→REP blocking
/// thresholds hit slightly earlier than libzmq would under identical
/// settings. For the common case of both sides using the default `1000`,
/// we give `1000` total vs libzmq's `2000`. Tests and benchmarks have not
/// surfaced a regression from this; tracked as known divergence.
#[cfg(feature = "inproc")]
pub(crate) struct InprocEngine {
    /// Sync sender into the remote socket's inproc-only inbound channel.
    remote_inproc_tx: InprocInboundTx,
    /// Wakes the remote's `recv_next` when a message is pushed.
    remote_notify: Arc<crate::async_rt::notify::RuntimeNotify>,
    /// The `PeerKey` the remote assigned to this connection in its registry.
    remote_peer_key: crate::engine::registry::PeerKey,
}

#[cfg(feature = "inproc")]
impl InprocEngine {
    pub(crate) fn new(
        remote_inproc_tx: InprocInboundTx,
        remote_notify: Arc<crate::async_rt::notify::RuntimeNotify>,
        remote_peer_key: crate::engine::registry::PeerKey,
    ) -> Self {
        Self {
            remote_inproc_tx,
            remote_notify,
            remote_peer_key,
        }
    }

    /// Non-blocking send. Returns `Err(true)` if full (drop silently, PUB
    /// semantics), `Err(false)` if the remote channel is closed (peer gone).
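    ///
    /// The `bool` error encodes the two failure modes; a caller is
    /// expected to match on it like so (sketch):
    ///
    /// ```ignore
    /// match engine.try_send_direct(zm) {
    ///     Ok(()) => {}            // delivered + remote woken
    ///     Err(true) => {}         // HWM full: PUB drops silently
    ///     Err(false) => { /* peer gone: prune this engine */ }
    /// }
    /// ```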
    pub(crate) fn try_send_direct(&self, zm: ZmqMessage) -> Result<(), bool> {
        let item = (self.remote_peer_key, Ok(crate::codec::Message::Message(zm)));
        match self.remote_inproc_tx.try_send(item) {
            Ok(()) => {
                self.remote_notify.notify_one();
                Ok(())
            }
            Err(crossbeam_channel::TrySendError::Full(_)) => Err(true),
            Err(crossbeam_channel::TrySendError::Disconnected(_)) => Err(false),
        }
    }

    /// Synchronous send. Used by REQ/REP/DEALER/ROUTER/PUSH. If the
    /// channel is full (capacity = peer's `receive_hwm`), blocks until
    /// space is available; if closed (peer disconnected), returns
    /// `SendError::Flush`.
    pub(crate) fn send_direct(&self, zm: ZmqMessage) -> Result<(), SendError> {
        let item = (self.remote_peer_key, Ok(crate::codec::Message::Message(zm)));
        match self.remote_inproc_tx.try_send(item) {
            Ok(()) => {
                self.remote_notify.notify_one();
                Ok(())
            }
            Err(crossbeam_channel::TrySendError::Disconnected(_)) => Err(SendError::Flush),
            Err(crossbeam_channel::TrySendError::Full(item)) => {
                // Channel full (capacity = peer's `receive_hwm`; rare).
                // Block until space is available.
                self.remote_inproc_tx
                    .send(item)
                    .map_err(|_e| SendError::Flush)?;
                self.remote_notify.notify_one();
                Ok(())
            }
        }
    }

    #[allow(clippy::unused_self)]
    pub(crate) fn writer_alive(&self) -> bool {
        true
    }
}

#[cfg(feature = "inproc")]
impl Drop for InprocEngine {
    fn drop(&mut self) {
        let _ = self
            .remote_inproc_tx
            .try_send((self.remote_peer_key, Err(CodecError::PeerDisconnected)));
        self.remote_notify.notify_one();
    }
}

/// Perform the two-round inproc handshake and return a fully-wired
/// `InprocEngine`.
///
/// **Ordering constraint:** the caller must insert a placeholder into its
/// registry *before* calling this function so that `local_key` is valid.
/// After this returns, the caller swaps the real engine in via
/// `registry.replace_engine(local_key, AnyEngine::Inproc(Arc::new(engine)))`.
///
/// Round 1: both sides concurrently send their `InprocChannelInfo`
/// (sync tx, notify, socket type, and optional routing id) to the
/// other. The socket-type
/// compatibility check here is the inproc analogue of libzmq's ZMTP
/// greeting check — inproc has no wire greeting, so the same validation
/// happens as part of the handshake.
///
/// Round 2: both sides concurrently send their assigned `PeerKey` to the other.
///
/// Returns an error with reason `"inproc peer dropped during handshake"` if
/// the remote disconnected mid-handshake, or `"inproc peer socket type not
/// compatible"` if the two sockets cannot talk to each other (e.g. REQ↔PUB).
#[cfg(feature = "inproc")]
pub(crate) async fn connect_inproc_engine(
    local_key: crate::engine::registry::PeerKey,
    local_socket_type: crate::SocketType,
    local_routing_id: Option<PeerIdentity>,
    local_inproc_tx: InprocInboundTx,
    local_notify: Arc<crate::async_rt::notify::RuntimeNotify>,
    peer: crate::transport::inproc::InprocPeer,
) -> crate::ZmqResult<(InprocEngine, Option<PeerIdentity>)> {
    const HANDSHAKE_DROPPED: &str = "inproc peer dropped during handshake";

    peer.send_inbound
        .send(crate::transport::inproc::InprocChannelInfo {
            tx: local_inproc_tx,
            notify: local_notify,
            socket_type: local_socket_type,
            routing_id: local_routing_id,
        })
        .map_err(|_info| crate::error::ZmqError::Socket(HANDSHAKE_DROPPED.into()))?;
    let remote_info = peer
        .recv_inbound
        .await
        .map_err(|_canceled| crate::error::ZmqError::Socket(HANDSHAKE_DROPPED.into()))?;

    if !local_socket_type.compatible(remote_info.socket_type) {
        log::warn!(
            "inproc: incompatible socket types ({:?} ↔ {:?}); refusing connection",
            local_socket_type,
            remote_info.socket_type
        );
        return Err(crate::error::ZmqError::Socket(
            "inproc peer socket type not compatible".into(),
        ));
    }

    peer.send_key
        .send(local_key)
        .map_err(|_key| crate::error::ZmqError::Socket(HANDSHAKE_DROPPED.into()))?;
    let remote_key = peer
        .recv_key
        .await
        .map_err(|_canceled| crate::error::ZmqError::Socket(HANDSHAKE_DROPPED.into()))?;

    Ok((
        InprocEngine::new(remote_info.tx, remote_info.notify, remote_key),
        remote_info.routing_id,
    ))
}

/// Create a placeholder `InprocEngine` whose channel receiver is immediately
/// dropped. Used to satisfy `insert_with`'s synchronous closure while the async
/// handshake runs; the real engine is swapped in via `registry.replace_engine`
/// afterwards. The `Drop` impl's `try_send` sees `Disconnected` and is a no-op.
#[cfg(feature = "inproc")]
pub(crate) fn inproc_placeholder_engine() -> InprocEngine {
    let (tx, _rx) = crossbeam_channel::bounded(1);
    // _rx drops here → channel is disconnected → placeholder sends are no-ops
    let notify = Arc::new(crate::async_rt::notify::RuntimeNotify::new());
    InprocEngine::new(tx, notify, 0)
}

#[cfg(all(test, feature = "tokio"))]
#[allow(clippy::type_complexity)]
mod tests {
    use super::*;
    use crate::async_rt;
    use crate::codec::DefaultFramedIo as FramedIo;
    use crate::message::ZmqMessage;
    use bytes::Bytes;
    use tokio::net::{TcpListener, TcpStream};

    /// Establish a pair of connected TCP sockets on an ephemeral port
    /// and complete the ZMTP greeting exchange on both halves. After
    /// this returns the codec state machines are past `Greeting` so the
    /// engine can operate on plain `Message::Message` frames. This is
    /// the real-world handoff: `socket::handshake::peer_connected` performs the
    /// handshake *before* yielding the `FramedIo` to the backend.
    async fn tcp_pair() -> (FramedIo, FramedIo) {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let connect_fut = TcpStream::connect(addr);
        let (accept_res, connect_res) = futures::join!(listener.accept(), connect_fut);
        let (server, _) = accept_res.unwrap();
        let client = connect_res.unwrap();
        let mut io_a = FramedIo::from_tcp(server);
        let mut io_b = FramedIo::from_tcp(client);
        let (greet_a, greet_b) = futures::join!(
            crate::codec::handshake::greet_exchange(&mut io_a),
            crate::codec::handshake::greet_exchange(&mut io_b),
        );
        greet_a.unwrap();
        greet_b.unwrap();
        (io_a, io_b)
    }

    /// Spawn a pair of engines wired to fresh shared-inbound channels.
    /// Returns the two engines and the two receive ends so the test can
    /// drain them. Inline-write defaults to disabled (matches
    /// `PeerConfig::default()`); use `spawn_pair_with_inline` for tests
    /// that need the inline path enabled.
    fn spawn_pair(
        io_a: FramedIo,
        io_b: FramedIo,
        send_hwm: usize,
    ) -> (
        PeerEngine,
        PeerEngine,
        Receiver<(
            crate::engine::registry::PeerKey,
            Result<Message, CodecError>,
        )>,
        Receiver<(
            crate::engine::registry::PeerKey,
            Result<Message, CodecError>,
        )>,
    ) {
        spawn_pair_with_inline(io_a, io_b, send_hwm, None)
    }

    /// Like `spawn_pair` but lets the caller set
    /// `PeerConfig::inline_write_max` (REQ/REP/PAIR-style opt-in).
    #[allow(clippy::option_option)]
    fn spawn_pair_with_inline(
        io_a: FramedIo,
        io_b: FramedIo,
        send_hwm: usize,
        inline_write_max: Option<Option<usize>>,
    ) -> (
        PeerEngine,
        PeerEngine,
        Receiver<(
            crate::engine::registry::PeerKey,
            Result<Message, CodecError>,
        )>,
        Receiver<(
            crate::engine::registry::PeerKey,
            Result<Message, CodecError>,
        )>,
    ) {
        #[cfg(feature = "curve")]
        let (a_read, a_write, _) = io_a.into_parts();
        #[cfg(not(feature = "curve"))]
        let (a_read, a_write) = io_a.into_parts();
        #[cfg(feature = "curve")]
        let (b_read, b_write, _) = io_b.into_parts();
        #[cfg(not(feature = "curve"))]
        let (b_read, b_write) = io_b.into_parts();
        let (a_tx, a_rx) = flume::bounded(1024);
        let (b_tx, b_rx) = flume::bounded(1024);
        let config_a = crate::engine::peer_loop::PeerConfig {
            inline_write_max,
            ..Default::default()
        };
        let config_b = crate::engine::peer_loop::PeerConfig {
            inline_write_max,
            ..Default::default()
        };
        let engine_a = PeerEngine::spawn(
            0,
            PeerIdentity::new(),
            a_read,
            a_write.into_engine_writer(),
            send_hwm,
            a_tx,
            config_a,
        );
        let engine_b = PeerEngine::spawn(
            1,
            PeerIdentity::new(),
            b_read,
            b_write.into_engine_writer(),
            send_hwm,
            b_tx,
            config_b,
        );
        (engine_a, engine_b, a_rx, b_rx)
    }

    /// 100-message ordered round-trip through a pair of `PeerEngine`s.
    /// Validates basic framing, in-order delivery, and drop-triggered
    /// shutdown.
    #[async_rt::test]
    async fn engine_roundtrip_tcp() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, engine_b, _a_rx, b_rx) = spawn_pair(io_a, io_b, 64);

        for i in 0..100u32 {
            let msg = ZmqMessage::from(Bytes::from(i.to_be_bytes().to_vec()));
            engine_a.send(Message::Message(msg)).await.unwrap();
        }

        for i in 0..100u32 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    let frame = m.get(0).expect("frame").clone();
                    assert_eq!(&frame[..], &i.to_be_bytes()[..]);
                }
                other => panic!("unexpected message variant: {:?}", other),
            }
        }

        drop(engine_a);
        drop(engine_b);
    }

    /// Interleave small single-frame and larger sends and verify the
    /// receiver sees them in send order. Regresses any FIFO bug in the
    /// peer loop's drain/fast-path paths.
    #[async_rt::test]
    async fn engine_mixed_size_preserves_order() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, b_rx) = spawn_pair(io_a, io_b, 1024);

        // Alternate small (inline-eligible) and large (channel path).
        for i in 0..50u32 {
            if i.is_multiple_of(2) {
                let msg = ZmqMessage::from(Bytes::from(i.to_be_bytes().to_vec()));
                engine_a.send(Message::Message(msg)).await.unwrap();
            } else {
                // 256 B: larger send forces multi-frame iovec coalescing
                // through the peer loop's drain_batch path.
                let mut payload = vec![i as u8; 256];
                payload[0] = (i >> 24) as u8;
                payload[1] = (i >> 16) as u8;
                payload[2] = (i >> 8) as u8;
                payload[3] = i as u8;
                let msg = ZmqMessage::from(Bytes::from(payload));
                engine_a.send(Message::Message(msg)).await.unwrap();
            }
        }

        for i in 0..50u32 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    let frame = m.get(0).expect("frame");
                    if i.is_multiple_of(2) {
                        assert_eq!(frame.len(), 4, "msg {} was the small variant", i);
                        assert_eq!(&frame[..], &i.to_be_bytes()[..]);
                    } else {
                        assert_eq!(frame.len(), 256, "msg {} was the large variant", i);
                        assert_eq!(&frame[..4], &i.to_be_bytes()[..]);
                    }
                }
                other => panic!("unexpected variant: {:?}", other),
            }
        }
    }

    /// Inline disabled (default config): `try_inline_write` returns
    /// `None` immediately and the message takes the channel path.
    /// `inline_write` is `None` so the field exists as a cheap branch
    /// at the top of `send`.
    #[async_rt::test]
    async fn inline_disabled_returns_none() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, _b_rx) = spawn_pair(io_a, io_b, 64);
        let msg = Message::Message(ZmqMessage::from(Bytes::from_static(b"x")));
        // No inline target → returns None.
        assert!(engine_a.try_inline_write(&msg).is_none());
    }

    /// Inline enabled (uncapped): single-frame small message takes the
    /// inline path, `enqueued`/`flushed` bumped together.
    #[async_rt::test]
    async fn inline_enabled_single_frame_sent() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, b_rx) = spawn_pair_with_inline(io_a, io_b, 64, Some(None));
        let payload = Bytes::from_static(b"hello-inline");
        for _ in 0..10 {
            let msg = Message::Message(ZmqMessage::from(payload.clone()));
            // Run a few times so the peer-loop has cleared its
            // `peer_loop_busy` window between tries.
            engine_a.send(msg).await.unwrap();
        }
        // enqueued + flushed should track each other (inline bumps both).
        assert_eq!(
            engine_a.enqueued.load(Ordering::Acquire),
            engine_a.flush_state.flushed.load(Ordering::Acquire),
        );
        // Drain the receiver to confirm wire delivery.
        for _ in 0..10 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    assert_eq!(&m.get(0).unwrap()[..], &payload[..]);
                }
                other => panic!("unexpected: {:?}", other),
            }
        }
    }

    /// Inline enabled with a payload cap: messages over the cap take
    /// the channel path, under-cap messages take the inline path.
    /// Validates the `cap_exceeded` gate end-to-end and that mixed
    /// inline/channel sends preserve FIFO ordering on the receiver.
    #[async_rt::test]
    async fn inline_payload_cap_partitions_paths() {
        let (io_a, io_b) = tcp_pair().await;
        // Cap at 16 bytes — 8 B sends inline, 32 B sends through channel.
        let (engine_a, _engine_b, _a_rx, b_rx) =
            spawn_pair_with_inline(io_a, io_b, 64, Some(Some(16)));
        let small = Bytes::from_static(b"sml-08-B"); // 8 B
        let large = Bytes::from(vec![0xAB; 32]); // 32 B
        for i in 0..20u32 {
            let payload = if i.is_multiple_of(2) {
                small.clone()
            } else {
                large.clone()
            };
            let msg = Message::Message(ZmqMessage::from(payload));
            engine_a.send(msg).await.unwrap();
        }
        // FIFO across mixed inline/channel sends.
        for i in 0..20u32 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    let frame = m.get(0).unwrap();
                    let expected_len = if i.is_multiple_of(2) { 8 } else { 32 };
                    assert_eq!(
                        frame.len(),
                        expected_len,
                        "message {} wrong size — FIFO broken?",
                        i
                    );
                }
                other => panic!("unexpected: {:?}", other),
            }
        }
    }

    /// Multi-frame inline (REQ-style envelope: empty delimiter + body)
    /// when inline is enabled. Verifies the multi-frame trait variant
    /// fires and frames arrive intact.
    #[async_rt::test]
    async fn inline_multi_frame_delivers() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, b_rx) = spawn_pair_with_inline(io_a, io_b, 64, Some(None));
        for i in 0..5u32 {
            let mut m = ZmqMessage::from(Bytes::new());
            m.push_back(Bytes::from(i.to_be_bytes().to_vec()));
            assert_eq!(m.len(), 2);
            engine_a.send(Message::Message(m)).await.unwrap();
        }
        for i in 0..5u32 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    assert_eq!(m.len(), 2, "frame count");
                    assert!(m.get(0).unwrap().is_empty());
                    assert_eq!(&m.get(1).unwrap()[..], &i.to_be_bytes()[..]);
                }
                other => panic!("unexpected: {:?}", other),
            }
        }
    }

    /// Inline-fanout helper used by PUB/XPUB: when enabled, returns
    /// `Some(Ok(()))` on success.
    #[async_rt::test]
    async fn inline_fanout_enabled_sends() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, b_rx) = spawn_pair_with_inline(io_a, io_b, 64, Some(None));
        let payload = Bytes::from_static(b"fanout-msg");
        for _ in 0..5 {
            let shared = Arc::new(ZmqMessage::from(payload.clone()));
            match engine_a.try_inline_fanout(&shared) {
                Some(Ok(())) => {}
                other => panic!(
                    "expected inline Sent, got {:?}",
                    other.map(|r| r.map_err(|e| format!("{:?}", e)))
                ),
            }
        }
        for _ in 0..5 {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    assert_eq!(&m.get(0).unwrap()[..], &payload[..]);
                }
                other => panic!("unexpected: {:?}", other),
            }
        }
    }

    /// Writer batching smoke test: push enough 16 B messages to span
    /// more than one `out_batch_size = 8192 B` drain cycle (~512 msgs
    /// per batch at this size), and verify every message arrives in
    /// order. Regresses any FIFO bug at drain boundaries.
    #[async_rt::test]
    async fn engine_batch_flush() {
        let (io_a, io_b) = tcp_pair().await;
        let (engine_a, _engine_b, _a_rx, b_rx) = spawn_pair(io_a, io_b, 2048);

        // > 512 messages (one batch at 16 B each) so the writer drains,
        // flushes, then drains the remainder in a second cycle.
        let count = 1024u32;
        for i in 0..count {
            let msg = ZmqMessage::from(Bytes::from(i.to_be_bytes().to_vec()));
            engine_a.send(Message::Message(msg)).await.unwrap();
        }

        for i in 0..count {
            let (_peer, got) = b_rx.recv_async().await.expect("closed");
            match got.expect("codec error") {
                Message::Message(m) => {
                    let frame = m.get(0).expect("frame").clone();
                    assert_eq!(&frame[..], &i.to_be_bytes()[..]);
                }
                other => panic!("unexpected variant: {:?}", other),
            }
        }
    }
}