// freenet 0.2.55 - Docs.rs

//! Manages the state and execution of diverse network operations (e.g., Get, Put, Subscribe).
//!
//! The `OpManager` runs its own event loop (`garbage_cleanup_task`) to handle the lifecycle
//! of operations, ensuring they progress correctly and are eventually cleaned up.
//! It communicates with the main node event loop and the network bridge via channels.
//!
//! See [`../../architecture.md`](../../architecture.md) for details on its role and interaction with other components.

use std::{
    cmp::Reverse,
    collections::{BTreeSet, HashSet},
    net::SocketAddr,
    sync::{Arc, OnceLock, atomic::AtomicBool},
    time::Duration,
};

use dashmap::{DashMap, DashSet};
use either::Either;
use freenet_stdlib::prelude::{ContractInstanceId, ContractKey};
use parking_lot::{Mutex, RwLock};
use tokio::sync::{mpsc, oneshot};
use tracing::Instrument;

use crate::{
    client_events::HostResult,
    config::GlobalExecutor,
    contract::{ContractError, ContractHandlerChannel, ContractHandlerEvent, SenderHalve},
    message::{
        InterestMessage, MessageStats, NetMessage, NetMessageV1, NodeEvent, Transaction,
        TransactionType,
    },
    operations::{
        OpCtx, OpEnum, OpError,
        connect::{ConnectForwardEstimator, ConnectOp, ConnectState},
        get::GetOp,
        orphan_streams::OrphanStreamRegistry,
        put::PutOp,
        subscribe::SubscribeOp,
        update::UpdateOp,
    },
    ring::{
        ConnectionFailureReason, ConnectionManager, LiveTransactionTracker, PeerConnectionBackoff,
        PeerKey, PeerKeyLocation, Ring,
    },
    transport::TransportPublicKey,
    util::time_source::InstantTimeSrc,
};

use super::{
    NetEventRegister, NodeConfig, RequestRouter, neighbor_hosting::NeighborHostingManager,
    network_bridge::EventLoopNotificationsSender,
};

// Debug-only sanity check that a transaction's type matches the kind of
// operation being stored (invoked from `OpManager::push`).
//
// `$get_ty` is an expression yielding the transaction type and `$var` is the
// expected variant, used both as a pattern and as a value. On mismatch the
// macro early-returns `Err(OpError::IncorrectTxType($var, $get_ty))` from the
// *enclosing* function, so it can only be used inside functions returning
// `Result<_, OpError>`. Note that `$get_ty` is evaluated twice on the error
// path (once for the match, once to build the error).
#[cfg(debug_assertions)]
macro_rules! check_id_op {
    ($get_ty:expr, $var:path) => {
        if !matches!($get_ty, $var) {
            return Err(OpError::IncorrectTxType($var, $get_ty));
        }
    };
}

/// Reasons why the state of an operation cannot be handed out.
///
/// NOTE(review): presumably returned when an operation is requested while it is
/// checked out or after it has finished; the producing code is outside this
/// excerpt — confirm against the callers.
#[derive(Debug, thiserror::Error)]
pub(crate) enum OpNotAvailable {
    /// Rendered as "operation running".
    #[error("operation running")]
    Running,
    /// Rendered as "operation completed".
    #[error("operation completed")]
    Completed,
}

/// Per-transaction operation state, one concurrent map per operation kind,
/// plus two bookkeeping sets. Shared between `OpManager` and
/// `garbage_cleanup_task` behind an `Arc`.
#[derive(Default)]
struct Ops {
    /// Stored state of CONNECT operations, keyed by transaction.
    connect: DashMap<Transaction, crate::operations::connect::ConnectOp>,
    /// Stored state of PUT operations, keyed by transaction.
    put: DashMap<Transaction, PutOp>,
    /// Stored state of GET operations, keyed by transaction.
    get: DashMap<Transaction, GetOp>,
    /// Stored state of SUBSCRIBE operations, keyed by transaction.
    subscribe: DashMap<Transaction, SubscribeOp>,
    /// Stored state of UPDATE operations, keyed by transaction.
    update: DashMap<Transaction, UpdateOp>,
    /// Transactions considered finished; `OpManager::push` ignores pushes for
    /// these and the `peek_*` helpers return `None` for them.
    completed: DashSet<Transaction>,
    /// Transactions removed from this set by `OpManager::push`; the `peek_*`
    /// helpers return `None` while a transaction is listed here.
    /// NOTE(review): presumably populated when an op is popped for processing —
    /// the inserting code is outside this excerpt.
    under_progress: DashSet<Transaction>,
}

/// Snapshot of per-map sizes held by `Ops`. Emitted periodically from
/// `garbage_cleanup_task` when `FREENET_MEMORY_STATS=1` is set, to help
/// diagnose retained-state bloat without forcing a full heap profiler
/// run.
///
/// Every field holds the `len()` of the same-named collection in `Ops`
/// (see `Ops::sizes`).
#[derive(Debug, Default)]
struct OpsSizes {
    /// Number of entries in `Ops::connect`.
    connect: usize,
    /// Number of entries in `Ops::put`.
    put: usize,
    /// Number of entries in `Ops::get`.
    get: usize,
    /// Number of entries in `Ops::subscribe`.
    subscribe: usize,
    /// Number of entries in `Ops::update`.
    update: usize,
    /// Number of entries in `Ops::completed`.
    completed: usize,
    /// Number of entries in `Ops::under_progress`.
    under_progress: usize,
}

impl Ops {
    fn sizes(&self) -> OpsSizes {
        OpsSizes {
            connect: self.connect.len(),
            put: self.put.len(),
            get: self.get.len(),
            subscribe: self.subscribe.len(),
            update: self.update.len(),
            completed: self.completed.len(),
            under_progress: self.under_progress.len(),
        }
    }
}

/// Thread safe and friendly data structure to maintain state of the different operations
/// and enable their execution.
pub(crate) struct OpManager {
    /// Shared ring handle; provides the connection manager and subscription
    /// state used by the methods of this type.
    pub ring: Arc<Ring>,
    /// Per-transaction operation state, shared with `garbage_cleanup_task`.
    ops: Arc<Ops>,
    /// Sender side of the channels towards the node event loop (notifications
    /// and op execution).
    pub(crate) to_event_listener: EventLoopNotificationsSender,
    /// Channel used to send events to the contract handler
    /// (see `notify_contract_handler`).
    pub ch_outbound: Arc<ContractHandlerChannel<SenderHalve>>,
    /// Every transaction passed to `push` is announced on this channel; the
    /// receiving end is owned by `garbage_cleanup_task`.
    new_transactions: tokio::sync::mpsc::Sender<Transaction>,
    /// Channel on which `send_client_result` delivers operation results.
    pub result_router_tx: mpsc::Sender<(Transaction, HostResult)>,
    /// Shared estimator for connect forwarding.
    /// NOTE(review): its consumers are outside this excerpt.
    pub(crate) connect_forward_estimator: Arc<RwLock<ConnectForwardEstimator>>,
    /// Indicates whether the peer is ready to process client operations.
    /// For gateways: always true (peer_id is set from config)
    /// For regular peers: true only after first successful network handshake sets peer_id
    pub peer_ready: Arc<AtomicBool>,
    /// Whether this node is a gateway
    pub is_gateway: bool,
    /// Waiters for contract storage notification.
    /// Operations can register to be notified when a specific contract is stored.
    contract_waiters:
        Arc<Mutex<std::collections::HashMap<ContractInstanceId, Vec<oneshot::Sender<()>>>>>,
    /// Neighbor hosting manager for tracking neighbor contract hosting
    pub neighbor_hosting: Arc<NeighborHostingManager>,
    /// Interest manager for delta-based state synchronization
    pub interest_manager: Arc<crate::ring::interest::InterestManager<InstantTimeSrc>>,
    /// Dedup cache for skipping redundant broadcast WASM merges
    pub broadcast_dedup_cache: Arc<crate::operations::update::BroadcastDedupCache>,
    /// Request router for client request deduplication.
    ///
    /// This is initialized lazily from `client_event_handling` because the router is only
    /// available once the client-side handling layer has been constructed. When set, it is
    /// used by operations to clean up stale routing entries as they complete or time out.
    ///
    /// Operations that start and finish before the router has been initialized will *not*
    /// clean up any routing state via this router. In practice this is acceptable because
    /// `client_event_handling` sets the router early in the node startup sequence, before
    /// regular client operations are expected to run.
    ///
    /// Wrapped in Arc for sharing with `garbage_cleanup_task`.
    request_router: Arc<OnceLock<Arc<RequestRouter>>>,
    /// Registry for handling race conditions between stream fragments and metadata messages.
    /// Coordinates transport layer (which receives fragments) with operations layer
    /// (which receives RequestStreaming/ResponseStreaming messages).
    orphan_stream_registry: Arc<OrphanStreamRegistry>,
    /// Size threshold in bytes above which streaming is used.
    pub streaming_threshold: usize,
    /// Backoff tracker for failed gateway connection attempts.
    /// Used to implement exponential backoff when retrying connections.
    pub gateway_backoff: Arc<Mutex<PeerConnectionBackoff>>,
    /// Notifies `initial_join_procedure` and `handle_aborted_op` when gateway
    /// backoff is cleared, so they can wake from backoff sleeps and retry immediately.
    pub gateway_backoff_cleared: Arc<tokio::sync::Notify>,
    /// Addresses blocked by local policy. Used by the connect protocol to reject
    /// join requests from blocked peers at the routing level, allowing the uphill
    /// hop mechanism to find alternate acceptors.
    pub blocked_addresses: Option<Arc<HashSet<SocketAddr>>>,
    /// Configured gateway peers for bootstrap/re-bootstrap.
    /// Used by connection_maintenance to directly attempt gateway connections
    /// when the node has zero ring connections (#3219).
    pub configured_gateways: Arc<Vec<PeerKeyLocation>>,
    /// Tracks contracts for which a self-healing GET has been triggered
    /// (e.g., when an UPDATE broadcast fails due to missing contract parameters).
    /// Maps contract instance ID to the timestamp (ms since epoch via GlobalSimulationTime)
    /// when the fetch was initiated, with a cooldown to avoid repeated fetch attempts.
    pub(crate) pending_contract_fetches: Arc<DashMap<ContractInstanceId, u64>>,
    /// Transactions with an active task-per-tx relay-GET driver at this
    /// node. Populated by `start_relay_get` before spawn and removed by
    /// an RAII guard on the driver task. Consulted by the dispatch gate
    /// in `node.rs` to reject duplicate inbound Requests for a tx that
    /// already has a live relay driver — prevents the 3^HTL spawn
    /// amplification observed in workflow run 24600634908 (6.8M spawns
    /// in 100s, 63GB RSS).
    pub(crate) active_relay_get_txs: Arc<DashSet<Transaction>>,
    /// Set of transactions currently being driven by a relay UPDATE
    /// task-per-tx driver on this node. Same role as
    /// `active_relay_get_txs` but for UPDATE relay (#1454 phase 5
    /// follow-up). UPDATE relay has no retry loop and no upstream
    /// reply, so the amplification risk is structurally lower than GET
    /// — the dedup gate exists primarily for robustness against
    /// GC-spawned re-entries and routing-bloom false-positive
    /// retransmissions.
    pub(crate) active_relay_update_txs: Arc<DashSet<Transaction>>,
    /// Set of transactions currently being driven by a relay PUT
    /// task-per-tx driver on this node. Same role as
    /// `active_relay_get_txs` but for PUT relay (#1454 phase 5
    /// follow-up slice A). PUT relay has req/response semantics like
    /// GET (but forwards once — no per-hop retry), so the amplification
    /// risk is comparable to GET. The dedup gate rejects duplicate
    /// inbound `PutMsg::Request` for a tx that already has a live
    /// relay driver — prevents GC-spawned re-entries and routing-bloom
    /// false-positive retransmissions from spawning redundant drivers.
    pub(crate) active_relay_put_txs: Arc<DashSet<Transaction>>,
    /// Set of transactions currently being driven by a relay SUBSCRIBE
    /// task-per-tx driver on this node. Same role as
    /// `active_relay_get_txs` but for SUBSCRIBE relay (#1454 phase 5
    /// follow-up slice A). SUBSCRIBE relay has req/response semantics
    /// like GET/PUT but forwards once — no per-hop retry — because the
    /// client-init driver (Phase 2b) owns cross-peer retry. The dedup
    /// gate rejects duplicate inbound `SubscribeMsg::Request` for a tx
    /// that already has a live relay driver.
    pub(crate) active_relay_subscribe_txs: Arc<DashSet<Transaction>>,
    /// Set of transactions currently being driven by a relay CONNECT
    /// task-per-tx driver on this node. Same role as
    /// `active_relay_get_txs` but for CONNECT relay (#1454 phase 2c
    /// slice 1). The dedup gate rejects duplicate inbound
    /// `ConnectMsg::Request` for a tx that already has a live relay
    /// driver — prevents bloom-filter rekey re-entries and
    /// uphill-retry false-positive retransmissions from spawning
    /// redundant drivers. Phase 2c slice 1 covers the
    /// Request→Response forward path; Rejected within-relay retries
    /// and ConnectFailed downstream propagation stay on legacy
    /// `process_message`, gated by the dedup set's absence on those
    /// branches.
    pub(crate) active_relay_connect_txs: Arc<DashSet<Transaction>>,
}

impl Clone for OpManager {
    fn clone(&self) -> Self {
        Self {
            ring: self.ring.clone(),
            ops: self.ops.clone(),
            to_event_listener: self.to_event_listener.clone(),
            ch_outbound: self.ch_outbound.clone(),
            new_transactions: self.new_transactions.clone(),
            result_router_tx: self.result_router_tx.clone(),
            connect_forward_estimator: self.connect_forward_estimator.clone(),
            peer_ready: self.peer_ready.clone(),
            is_gateway: self.is_gateway,
            contract_waiters: self.contract_waiters.clone(),
            neighbor_hosting: self.neighbor_hosting.clone(),
            interest_manager: self.interest_manager.clone(),
            broadcast_dedup_cache: self.broadcast_dedup_cache.clone(),
            request_router: self.request_router.clone(),
            orphan_stream_registry: self.orphan_stream_registry.clone(),
            streaming_threshold: self.streaming_threshold,
            gateway_backoff: self.gateway_backoff.clone(),
            gateway_backoff_cleared: self.gateway_backoff_cleared.clone(),
            blocked_addresses: self.blocked_addresses.clone(),
            configured_gateways: self.configured_gateways.clone(),
            pending_contract_fetches: self.pending_contract_fetches.clone(),
            active_relay_get_txs: self.active_relay_get_txs.clone(),
            active_relay_update_txs: self.active_relay_update_txs.clone(),
            active_relay_put_txs: self.active_relay_put_txs.clone(),
            active_relay_subscribe_txs: self.active_relay_subscribe_txs.clone(),
            active_relay_connect_txs: self.active_relay_connect_txs.clone(),
        }
    }
}

impl OpManager {
    /// Builds the operation manager and starts its background tasks.
    ///
    /// Besides assembling the struct this:
    /// - constructs the [`Ring`] from `config` and `connection_manager`;
    /// - spawns `garbage_cleanup_task` and registers it with `task_monitor`
    ///   under the name `"garbage_cleanup"`;
    /// - starts the interest-manager sweep task and the orphan-stream GC task.
    ///
    /// # Errors
    /// Returns the error produced by `Ring::new`, if any.
    pub(super) fn new<ER: NetEventRegister + Clone>(
        notification_channel: EventLoopNotificationsSender,
        ch_outbound: ContractHandlerChannel<SenderHalve>,
        config: &NodeConfig,
        event_register: ER,
        connection_manager: ConnectionManager,
        result_router_tx: mpsc::Sender<(Transaction, HostResult)>,
        task_monitor: &super::background_task_monitor::BackgroundTaskMonitor,
    ) -> anyhow::Result<Self> {
        let ring = Ring::new(
            config,
            notification_channel.clone(),
            event_register.clone(),
            config.is_gateway,
            connection_manager,
            task_monitor,
        )?;
        let ops = Arc::new(Ops::default());

        // `push` announces transactions on this bounded channel (capacity 100);
        // the receiver is handed to the garbage cleanup task below.
        let (new_transactions, rx) = tokio::sync::mpsc::channel(100);
        // Parent the cleanup task's span to the caller's span when there is one.
        let current_span = tracing::Span::current();
        let garbage_span = if current_span.is_none() {
            tracing::info_span!("garbage_cleanup_task")
        } else {
            tracing::info_span!(parent: current_span, "garbage_cleanup_task")
        };
        let connect_forward_estimator = Arc::new(RwLock::new(ConnectForwardEstimator::new()));
        // Starts out empty; filled in later through `set_request_router`.
        let request_router = Arc::new(OnceLock::new());
        let ch_outbound = Arc::new(ch_outbound);
        let contract_waiters: Arc<
            Mutex<std::collections::HashMap<ContractInstanceId, Vec<oneshot::Sender<()>>>>,
        > = Arc::new(Mutex::new(std::collections::HashMap::new()));
        let pending_contract_fetches: Arc<DashMap<ContractInstanceId, u64>> =
            Arc::new(DashMap::new());
        // Relay dedup sets, one per operation kind; each is shared between the
        // returned manager and the cleanup task.
        let active_relay_get_txs: Arc<DashSet<Transaction>> = Arc::new(DashSet::new());
        let active_relay_update_txs: Arc<DashSet<Transaction>> = Arc::new(DashSet::new());
        let active_relay_put_txs: Arc<DashSet<Transaction>> = Arc::new(DashSet::new());
        let active_relay_subscribe_txs: Arc<DashSet<Transaction>> = Arc::new(DashSet::new());
        let active_relay_connect_txs: Arc<DashSet<Transaction>> = Arc::new(DashSet::new());

        // Spawn the cleanup task with clones of all shared state and let the
        // task monitor track its handle.
        task_monitor.register(
            "garbage_cleanup",
            GlobalExecutor::spawn(
                garbage_cleanup_task(
                    rx,
                    ops.clone(),
                    ring.live_tx_tracker.clone(),
                    notification_channel.clone(),
                    event_register,
                    result_router_tx.clone(),
                    request_router.clone(),
                    ring.clone(),
                    ch_outbound.clone(),
                    contract_waiters.clone(),
                    pending_contract_fetches.clone(),
                    active_relay_get_txs.clone(),
                    active_relay_update_txs.clone(),
                    active_relay_put_txs.clone(),
                    active_relay_subscribe_txs.clone(),
                    active_relay_connect_txs.clone(),
                )
                .instrument(garbage_span),
            ),
        );

        // Gateways are ready immediately (peer_id set from config)
        // Regular peers become ready after first handshake
        let is_gateway = config.is_gateway;
        let peer_ready = Arc::new(AtomicBool::new(is_gateway));

        if is_gateway {
            tracing::debug!("Gateway node: peer_ready set to true immediately");
        } else {
            tracing::debug!("Regular peer node: peer_ready will be set after first handshake");
        }

        let neighbor_hosting = Arc::new(NeighborHostingManager::new());
        let interest_manager = Arc::new(crate::ring::interest::InterestManager::new(
            InstantTimeSrc::new(),
        ));

        // Start background sweep task for interest expiration
        crate::ring::interest::InterestManager::start_sweep_task(interest_manager.clone());

        // Extract streaming config from NodeConfig
        let streaming_threshold = config.config.network_api.streaming_threshold;

        tracing::info!(
            streaming_threshold_bytes = streaming_threshold,
            "Streaming transport enabled for large transfers"
        );

        // Create orphan stream registry and start GC task
        let orphan_stream_registry = Arc::new(OrphanStreamRegistry::new());
        OrphanStreamRegistry::start_gc_task(orphan_stream_registry.clone());

        Ok(Self {
            ring,
            ops,
            to_event_listener: notification_channel,
            ch_outbound,
            new_transactions,
            result_router_tx,
            connect_forward_estimator,
            peer_ready,
            is_gateway,
            contract_waiters,
            neighbor_hosting,
            interest_manager,
            broadcast_dedup_cache: Arc::new(crate::operations::update::BroadcastDedupCache::new()),
            request_router,
            orphan_stream_registry,
            streaming_threshold,
            gateway_backoff: Arc::new(Mutex::new(PeerConnectionBackoff::new())),
            gateway_backoff_cleared: Arc::new(tokio::sync::Notify::new()),
            // Copy the configured block list (if any) into its own shared set.
            blocked_addresses: config
                .blocked_addresses
                .as_ref()
                .map(|a| Arc::new(a.clone())),
            // Keep only the peer key/location of every configured gateway.
            configured_gateways: Arc::new(
                config
                    .gateways
                    .iter()
                    .map(|gw| gw.peer_key_location.clone())
                    .collect(),
            ),
            pending_contract_fetches,
            active_relay_get_txs,
            active_relay_update_txs,
            active_relay_put_txs,
            active_relay_subscribe_txs,
            active_relay_connect_txs,
        })
    }

    /// Installs the request router used to clean up stale entries when operations complete.
    ///
    /// Called from client_event_handling once the request_router exists. Without it,
    /// completed operations leave stale entries in the request router's
    /// resource_to_transaction map, causing subsequent requests to hang forever.
    ///
    /// Only the first call takes effect; any later call is logged and ignored.
    pub fn set_request_router(&self, router: Arc<RequestRouter>) {
        match self.request_router.set(router) {
            Ok(()) => {}
            Err(_already_set) => {
                tracing::warn!("Request router already set - ignoring duplicate set");
            }
        }
    }

    /// Delivers an operation result to the client through the result router.
    ///
    /// Both sends use `try_send`, so the caller (possibly the node event loop)
    /// is never blocked. If the result router channel is full or closed the
    /// result is dropped, an error is logged, and no completion notice is sent;
    /// the client will then see a timeout.
    pub(crate) fn send_client_result(&self, tx: Transaction, host_result: HostResult) {
        // Step 1: hand the result to the router. Without it there is nothing to announce.
        match self.result_router_tx.try_send((tx, host_result)) {
            Ok(()) => {}
            Err(err) => {
                tracing::error!(
                    %tx,
                    error = %err,
                    "failed to dispatch operation result to client \
                     (result router channel full or closed)"
                );
                return;
            }
        }

        // Step 2: best-effort completion notice to the event loop.
        let completion = Either::Right(NodeEvent::TransactionCompleted(tx));
        let notified = self
            .to_event_listener
            .notifications_sender
            .try_send(completion);
        if let Err(err) = notified {
            tracing::warn!(
                %tx,
                error = %err,
                "failed to notify event loop about transaction completion"
            );
        }
    }

    /// Upper bound (30 seconds) on how long `notify_op_change` and
    /// `notify_node_event` wait for room in the event-loop notification channel.
    /// If the channel stays full for this long, the event loop is considered
    /// stuck and the send is abandoned with an error.
    const NOTIFICATION_SEND_TIMEOUT: Duration = Duration::from_secs(30);

    /// Fast path for reporting a state change of an on-going operation to the main
    /// message handler without sending anything over the network.
    ///
    /// The operation state `op` is stored first (via [`Self::push`]) and `msg` is then
    /// queued on the event-loop notification channel. Useful for transitions between
    /// states that need no communication with other nodes, such as intermediate
    /// states before returning.
    ///
    /// # Errors
    /// - any error returned by [`Self::push`];
    /// - `OpError::NotificationError` when the notification channel is closed;
    /// - `OpError::NotificationChannelError` when the send does not complete within
    ///   [`Self::NOTIFICATION_SEND_TIMEOUT`].
    pub async fn notify_op_change(&self, msg: NetMessage, op: OpEnum) -> Result<(), OpError> {
        let tx = *msg.id();
        let peer_id = &self.ring.connection_manager.pub_key;
        tracing::debug!(
            tx = %tx,
            msg_type = %msg,
            peer = %peer_id,
            "notify_op_change: Pushing operation and sending notification"
        );

        // Store the state first, then tell the event loop about it.
        self.push(tx, op).await?;

        tracing::debug!(
            tx = %tx,
            peer = %peer_id,
            "notify_op_change: Operation pushed, sending to event listener"
        );

        match tokio::time::timeout(
            Self::NOTIFICATION_SEND_TIMEOUT,
            self.to_event_listener
                .notifications_sender()
                .send(Either::Left(msg)),
        )
        .await
        {
            // Timer fired before the channel accepted the message.
            Err(_elapsed) => {
                tracing::error!(
                    tx = %tx,
                    timeout_secs = Self::NOTIFICATION_SEND_TIMEOUT.as_secs(),
                    channel_pending = self.to_event_listener.notification_channel_pending(),
                    channel_remaining = self.to_event_listener.notifications_sender().capacity(),
                    "notify_op_change: Notification channel full for too long, event loop may be stuck"
                );
                Err(OpError::NotificationChannelError(
                    "notification channel send timed out — event loop is likely stuck".into(),
                ))
            }
            // Receiver side is gone.
            Ok(Err(_closed)) => Err(OpError::NotificationError),
            Ok(Ok(())) => {
                tracing::debug!(
                    tx = %tx,
                    peer = %peer_id,
                    "notify_op_change: Notification sent successfully"
                );
                Ok(())
            }
        }
    }

    /// Non-blocking variant of [`notify_op_change`] that fails fast when the
    /// notification channel is full instead of blocking for 30 seconds.
    /// On failure the pushed operation is cleaned up so it does not leak.
    pub async fn notify_op_change_nonblocking(
        &self,
        msg: NetMessage,
        op: OpEnum,
    ) -> Result<(), OpError> {
        let tx = *msg.id();
        self.push(tx, op).await?;

        match self
            .to_event_listener
            .notifications_sender()
            .try_send(Either::Left(msg))
        {
            Ok(()) => Ok(()),
            Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => {
                tracing::warn!(
                    tx = %tx,
                    channel_pending = self.to_event_listener.notification_channel_pending(),
                    "notify_op_change_nonblocking: channel full, failing fast"
                );
                self.completed(tx);
                Err(OpError::NotificationChannelError(
                    "notification channel full (non-blocking send)".into(),
                ))
            }
            Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => {
                self.completed(tx);
                Err(OpError::NotificationError)
            }
        }
    }

    /// Fast path for handing a node-level event to the main message handler, without
    /// any network transmission and without touching operation state.
    ///
    /// Useful for announcing connection attempts or other events that need no
    /// communication with other nodes.
    ///
    /// # Errors
    /// - the converted channel error when the notification channel is closed;
    /// - `OpError::NotificationChannelError` when the send does not complete within
    ///   [`Self::NOTIFICATION_SEND_TIMEOUT`].
    pub async fn notify_node_event(&self, msg: NodeEvent) -> Result<(), OpError> {
        tracing::debug!(event = %msg, "notify_node_event: queuing node event");

        let delivery = self
            .to_event_listener
            .notifications_sender
            .send(Either::Right(msg));

        match tokio::time::timeout(Self::NOTIFICATION_SEND_TIMEOUT, delivery).await {
            // The send finished in time; only its error type needs converting.
            Ok(sent) => sent.map_err(Into::into),
            Err(_elapsed) => {
                tracing::error!(
                    timeout_secs = Self::NOTIFICATION_SEND_TIMEOUT.as_secs(),
                    channel_pending = self.to_event_listener.notification_channel_pending(),
                    channel_remaining = self.to_event_listener.notifications_sender().capacity(),
                    "notify_node_event: Notification channel full for too long, event loop may be stuck"
                );
                Err(OpError::NotificationChannelError(
                    "notification channel send timed out — event loop is likely stuck".into(),
                ))
            }
        }
    }

    /// Lists the contracts this node is actively subscribed to.
    ///
    /// In the simplified lease-based model per-contract subscriber lists are no
    /// longer tracked, so every contract key is paired with an empty peer list.
    pub fn get_network_subscriptions(&self) -> Vec<(ContractKey, Vec<PeerKeyLocation>)> {
        let mut subscriptions = Vec::new();
        for contract_key in self.ring.get_subscribed_contracts() {
            // No individual subscribers are recorded in the new model.
            subscriptions.push((contract_key, Vec::new()));
        }
        subscriptions
    }

    /// Tells the upstream peer of `contract` that this node is unsubscribing.
    ///
    /// The upstream peer is looked up in the interest manager and resolved to a
    /// socket address; an `Unsubscribe` message is then queued fire-and-forget via
    /// [`Self::notify_op_change_nonblocking`]. Whatever happens, the local
    /// subscription is removed. The upstream interest entry is removed as well
    /// whenever an upstream peer was found.
    pub async fn send_unsubscribe_upstream(&self, contract: &ContractKey) {
        use crate::operations::subscribe::{SubscribeMsg, create_unsubscribe_op};

        // Who is upstream for this contract?
        let upstream = self
            .interest_manager
            .get_interested_peers(contract)
            .into_iter()
            .find(|(_, interest)| interest.is_upstream);
        let peer_key = match upstream {
            Some((key, _)) => key,
            None => {
                tracing::debug!(
                    contract = %contract,
                    "No upstream peer found for unsubscribe"
                );
                self.ring.unsubscribe(contract);
                return;
            }
        };

        // Map the peer's public key to a connected peer location.
        let peer_location = match self
            .ring
            .connection_manager
            .get_peer_by_pub_key(&peer_key.0)
        {
            Some(location) => location,
            None => {
                tracing::debug!(
                    contract = %contract,
                    "Upstream peer address not found, cleaning up locally"
                );
                self.ring.unsubscribe(contract);
                self.interest_manager
                    .remove_peer_interest(contract, &peer_key);
                return;
            }
        };

        // The location must carry a concrete socket address to be reachable.
        let target_addr = match peer_location.peer_addr.as_known() {
            Some(&addr) => addr,
            None => {
                tracing::debug!(
                    contract = %contract,
                    "Upstream peer has no known address, cleaning up locally"
                );
                self.ring.unsubscribe(contract);
                self.interest_manager
                    .remove_peer_interest(contract, &peer_key);
                return;
            }
        };

        // Fresh transaction carrying the Unsubscribe message and its op state.
        let instance_id = *contract.id();
        let tx = Transaction::new::<SubscribeMsg>();
        let msg = NetMessage::from(SubscribeMsg::Unsubscribe {
            id: tx,
            instance_id,
        });
        let op = OpEnum::Subscribe(create_unsubscribe_op(instance_id, tx, target_addr));

        let send_result = self.notify_op_change_nonblocking(msg, op).await;
        match send_result {
            Err(e) => {
                tracing::warn!(
                    contract = %contract,
                    error = %e,
                    "Failed to send Unsubscribe upstream"
                );
            }
            Ok(()) => {
                tracing::debug!(
                    contract = %contract,
                    target = %target_addr,
                    "Sent Unsubscribe upstream"
                );
            }
        }

        // Local state goes away whether or not the message could be queued.
        self.ring.unsubscribe(contract);
        self.interest_manager
            .remove_peer_interest(contract, &peer_key);
    }

    /// Creates an [`OpCtx`] bound to the transaction `tx`.
    ///
    /// Phase 2a factory for the async sub-transaction refactor (#1454). The context
    /// receives its own clone of the event-loop `op_execution_sender`; this is the
    /// only supported way to obtain an `OpCtx` outside this crate's unit tests.
    /// See [`OpCtx`] for scope, single-use semantics, and the "where to call this"
    /// guidance.
    #[allow(dead_code)] // Phase 2a scaffolding: first production caller lands in Phase 2b (#1454).
    pub fn op_ctx(&self, tx: Transaction) -> OpCtx {
        let execution_sender = self.to_event_listener.op_execution_sender.clone();
        OpCtx::new(tx, execution_sender)
    }

    /// Forwards `msg` to the contract handler and waits for its reply.
    ///
    /// Returns the handler's response event, or the `ContractError` reported by the
    /// outbound channel.
    pub async fn notify_contract_handler(
        &self,
        msg: ContractHandlerEvent,
    ) -> Result<ContractHandlerEvent, ContractError> {
        let handler_channel = &self.ch_outbound;
        handler_channel.send_to_handler(msg).await
    }

    /// Forwards `msg` to the contract handler, waiting for the reply under a
    /// caller-supplied `timeout`.
    ///
    /// Broadcast-path callers (e.g., delta computation) should pass a short timeout
    /// so tasks do not pile up when the handler is slow.
    pub async fn notify_contract_handler_with_timeout(
        &self,
        msg: ContractHandlerEvent,
        timeout: std::time::Duration,
    ) -> Result<ContractHandlerEvent, ContractError> {
        let handler_channel = &self.ch_outbound;
        let reply = handler_channel.send_to_handler_with_timeout(msg, timeout);
        reply.await
    }

    /// Stores `op` as the current state of transaction `id`.
    ///
    /// The push is silently ignored (returning `Ok(())`) when the transaction is
    /// already completed, or when it was marked as under progress and has timed out
    /// in the meantime; in the latter case it is recorded as completed. Otherwise
    /// the transaction id is announced on the `new_transactions` channel and the
    /// state is inserted into the map matching the operation kind.
    ///
    /// # Errors
    /// - the converted send error when the `new_transactions` channel is closed;
    /// - in debug builds, `OpError::IncorrectTxType` when the transaction type does
    ///   not match the kind of `op`.
    pub async fn push(&self, id: Transaction, op: OpEnum) -> Result<(), OpError> {
        let ops = &self.ops;

        // Finished transactions never re-enter the maps.
        if ops.completed.contains(&id) {
            tracing::debug!(
                tx = %id,
                "OpManager: Ignoring push for already completed operation"
            );
            return Ok(());
        }

        // Pushing ends the "under progress" phase; retire the tx if it expired meanwhile.
        let was_under_progress = ops.under_progress.remove(&id);
        if let Some(expired) = was_under_progress.filter(|tx| tx.timed_out()) {
            ops.completed.insert(expired);
            return Ok(());
        }

        self.new_transactions.send(id).await?;

        match op {
            OpEnum::Connect(connect_op) => {
                #[cfg(debug_assertions)]
                check_id_op!(id.transaction_type(), TransactionType::Connect);
                ops.connect.insert(id, *connect_op);
            }
            OpEnum::Put(put_op) => {
                #[cfg(debug_assertions)]
                check_id_op!(id.transaction_type(), TransactionType::Put);
                ops.put.insert(id, put_op);
            }
            OpEnum::Get(get_op) => {
                #[cfg(debug_assertions)]
                check_id_op!(id.transaction_type(), TransactionType::Get);
                ops.get.insert(id, get_op);
            }
            OpEnum::Subscribe(subscribe_op) => {
                #[cfg(debug_assertions)]
                check_id_op!(id.transaction_type(), TransactionType::Subscribe);
                ops.subscribe.insert(id, subscribe_op);
            }
            OpEnum::Update(update_op) => {
                #[cfg(debug_assertions)]
                check_id_op!(id.transaction_type(), TransactionType::Update);
                ops.update.insert(id, update_op);
            }
        }

        Ok(())
    }

    /// Looks up the next hop address of an operation while leaving it in place.
    ///
    /// Hop-by-hop routing uses this to decide where an initial outbound message goes.
    /// Yields `None` when the transaction is completed or under progress, when no
    /// operation is stored for it, or when the operation has no next hop address.
    pub fn peek_next_hop_addr(&self, id: &Transaction) -> Option<std::net::SocketAddr> {
        let ops = &self.ops;

        let unavailable = ops.completed.contains(id) || ops.under_progress.contains(id);
        if unavailable {
            return None;
        }

        // Each arm reads from the map that matches the transaction's type.
        match id.transaction_type() {
            TransactionType::Connect => {
                let entry = ops.connect.get(id)?;
                entry.get_next_hop_addr()
            }
            TransactionType::Put => {
                let entry = ops.put.get(id)?;
                entry.get_next_hop_addr()
            }
            TransactionType::Get => {
                let entry = ops.get.get(id)?;
                entry.get_next_hop_addr()
            }
            TransactionType::Subscribe => {
                let entry = ops.subscribe.get(id)?;
                entry.get_next_hop_addr()
            }
            TransactionType::Update => {
                let entry = ops.update.get(id)?;
                entry.get_next_hop_addr()
            }
        }
    }

    /// Looks up the full target peer (public key included) of a stored operation
    /// while leaving the operation in its map.
    ///
    /// Needed when a new connection has to be established and the handshake
    /// requires the peer's public key. Only connect operations can answer this;
    /// every other transaction type yields `None`, as do transactions that are
    /// already completed or under progress.
    pub fn peek_target_peer(&self, id: &Transaction) -> Option<PeerKeyLocation> {
        let unavailable =
            self.ops.completed.contains(id) || self.ops.under_progress.contains(id);
        if unavailable {
            return None;
        }
        match id.transaction_type() {
            TransactionType::Connect => self.ops.connect.get(id)?.get_target_peer(),
            // The remaining operation kinds keep only addresses, not full peer info.
            TransactionType::Put
            | TransactionType::Get
            | TransactionType::Subscribe
            | TransactionType::Update => None,
        }
    }

    /// Reads the hop/HTL value tracked by a stored GET or PUT operation.
    ///
    /// GETs report `get_current_hop()` and PUTs report `get_current_htl()`; the
    /// value feeds the `hop_count` field of success/failure events. Returns
    /// `None` when no such operation is stored, when the operation has no value
    /// to report, or for transaction types that do not expose one here.
    pub fn get_current_hop(&self, id: &Transaction) -> Option<usize> {
        match id.transaction_type() {
            TransactionType::Put => self.ops.put.get(id)?.get_current_htl(),
            TransactionType::Get => self.ops.get.get(id)?.get_current_hop(),
            // TODO: Add support for Subscribe operations when they track HTL
            TransactionType::Connect | TransactionType::Subscribe | TransactionType::Update => None,
        }
    }

    /// Takes the stored operation for `id` out of its per-type map and marks the
    /// transaction as under progress.
    ///
    /// Returns `Ok(None)` when no operation is stored for the transaction; the
    /// transaction is still recorded as under progress in that case.
    ///
    /// # Errors
    ///
    /// - [`OpNotAvailable::Completed`] if the transaction is already in the
    ///   completed set, or if it is under progress but has timed out (it is then
    ///   added to the completed set before returning).
    /// - [`OpNotAvailable::Running`] if the transaction is under progress and has
    ///   not timed out.
    pub fn pop(&self, id: &Transaction) -> Result<Option<OpEnum>, OpNotAvailable> {
        if self.ops.completed.contains(id) {
            return Err(OpNotAvailable::Completed);
        }
        if self.ops.under_progress.contains(id) {
            return Err(if id.timed_out() {
                // A timed-out in-flight transaction is treated as finished.
                self.ops.completed.insert(*id);
                OpNotAvailable::Completed
            } else {
                OpNotAvailable::Running
            });
        }
        let popped = match id.transaction_type() {
            TransactionType::Connect => self
                .ops
                .connect
                .remove(id)
                .map(|(_, connect_op)| OpEnum::Connect(Box::new(connect_op))),
            TransactionType::Put => self.ops.put.remove(id).map(|(_, put_op)| OpEnum::Put(put_op)),
            TransactionType::Get => self.ops.get.remove(id).map(|(_, get_op)| OpEnum::Get(get_op)),
            TransactionType::Subscribe => self
                .ops
                .subscribe
                .remove(id)
                .map(|(_, sub_op)| OpEnum::Subscribe(sub_op)),
            TransactionType::Update => self
                .ops
                .update
                .remove(id)
                .map(|(_, update_op)| OpEnum::Update(update_op)),
        };
        self.ops.under_progress.insert(*id);
        Ok(popped)
    }

    /// Emit a `NodeEvent::TransactionCompleted(tx)` to the event loop,
    /// triggering cleanup of any `pending_op_results` entry keyed by `tx`.
    ///
    /// Used by Phase 2b's task-per-tx subscribe path (#1454) to release
    /// the per-attempt callback slot in `p2p_protoc::pending_op_results`
    /// after each `OpCtx::send_and_await` round-trip finishes. Without
    /// this emission the attempt-tx entries accumulate until either the
    /// periodic 60 s cleanup sweeps closed senders or the node shuts
    /// down — see `test_pending_op_results_bounded` for the regression
    /// guard.
    ///
    /// Distinct from [`Self::send_client_result`] which also emits this
    /// event but additionally pushes a `HostResult` through
    /// `result_router_tx`. The task-per-tx path has many attempt txs
    /// per client tx, so per-attempt cleanup can't go through
    /// `send_client_result` (that would publish N duplicate results to
    /// the client).
    ///
    /// This method is a thin wrapper: the actual send (and its logging on
    /// failure) lives in the free function `release_pending_op_slot_on`.
    ///
    /// # Blocking vs non-blocking send
    ///
    /// Uses `send().await` wrapped in [`Self::NOTIFICATION_SEND_TIMEOUT`]
    /// rather than `try_send` because the cleanup is load-bearing: a
    /// dropped `TransactionCompleted` on a transiently-full notification
    /// channel would leave the `pending_op_results` slot in place until
    /// the 60 s periodic sweep runs, which
    /// `test_pending_op_results_bounded` is designed to catch. Since this
    /// method is only called from spawned task bodies (never from an
    /// event loop), `send().await` is within the `.claude/rules/channel-safety.md`
    /// rules. The `NOTIFICATION_SEND_TIMEOUT` bound guards against a genuinely
    /// wedged event loop — the same timeout [`Self::notify_op_change`] uses.
    ///
    /// # Side effects on other `TransactionCompleted` consumers
    ///
    /// The `p2p_protoc::handle_notification_message` branch for
    /// `TransactionCompleted` also calls `state.tx_to_client.remove(&tx)`.
    /// For task-per-tx attempt txs this is a tolerated no-op:
    /// `tx_to_client` is only populated on client-visible txs via
    /// `ch_outbound.waiting_for_subscription_result` /
    /// `waiting_for_transaction_result`, never on per-attempt txs. If a
    /// future change starts keying `tx_to_client` by attempt tx, this
    /// eager cleanup will silently drop mappings and must be revisited.
    pub(crate) async fn release_pending_op_slot(&self, tx: Transaction) {
        release_pending_op_slot_on(
            self.to_event_listener.notifications_sender(),
            tx,
            Self::NOTIFICATION_SEND_TIMEOUT,
        )
        .await
    }

    /// Returns `true` if there is an active `GetOp` stored in `OpManager.ops.get`
    /// for the given transaction.
    ///
    /// Used by the relay GET dispatch in `node.rs` to distinguish a fresh inbound
    /// relay request (no existing op → spawn the task-per-tx driver) from a
    /// GC-spawned retry or `start_targeted_op` (existing op → fall through to the
    /// legacy `handle_op_request` path).
    ///
    /// Does **not** check `completed` or `under_progress` — those are covered by
    /// `pop` / `load_or_init`. This is a lightweight existence check only: the
    /// entry is neither removed nor modified.
    pub fn has_get_op(&self, id: &Transaction) -> bool {
        // Pure map-membership lookup; no other bookkeeping set is consulted.
        self.ops.get.contains_key(id)
    }

    /// Returns `true` if an `UpdateOp` is currently registered for this
    /// transaction in `OpManager.ops.update`.
    ///
    /// Same role as `has_get_op` but for the relay UPDATE dispatch gate
    /// (#1454 phase 5 follow-up). Used by `node.rs` to distinguish a fresh
    /// inbound relay update (no existing op → spawn the task-per-tx driver)
    /// from a GC-spawned retry or `start_targeted_op`-style internal caller
    /// (existing op → fall through to the legacy `handle_op_request` path).
    ///
    /// Only the `update` map is consulted; the `completed` and
    /// `under_progress` sets are not checked, and the entry is left untouched.
    pub fn has_update_op(&self, id: &Transaction) -> bool {
        // Pure map-membership lookup.
        self.ops.update.contains_key(id)
    }

    /// Returns `true` if a `PutOp` is currently registered for this
    /// transaction in `OpManager.ops.put`.
    ///
    /// Same role as `has_get_op` but for the relay PUT dispatch gate
    /// (#1454 phase 5 follow-up slice A). Used by `node.rs` to
    /// distinguish a fresh inbound relay PUT (no existing op → spawn
    /// the task-per-tx driver) from a GC-spawned speculative retry or
    /// client-initiated PUT loopback (existing op → fall through to the
    /// legacy `handle_op_request` path).
    ///
    /// Only the `put` map is consulted; the `completed` and
    /// `under_progress` sets are not checked, and the entry is left untouched.
    pub fn has_put_op(&self, id: &Transaction) -> bool {
        // Pure map-membership lookup.
        self.ops.put.contains_key(id)
    }

    /// Returns `true` if a `SubscribeOp` is currently registered for this
    /// transaction in `OpManager.ops.subscribe`.
    ///
    /// Same role as `has_get_op` but for the relay SUBSCRIBE dispatch gate
    /// (#1454 phase 5 follow-up slice A). Used by `node.rs` to distinguish
    /// a fresh inbound relay SUBSCRIBE (no existing op → spawn the
    /// task-per-tx driver) from a renewal, PUT sub-op, or GC-spawned retry
    /// (existing op → fall through to the legacy `handle_op_request`
    /// path).
    ///
    /// Only the `subscribe` map is consulted; the `completed` and
    /// `under_progress` sets are not checked, and the entry is left untouched.
    pub fn has_subscribe_op(&self, id: &Transaction) -> bool {
        // Pure map-membership lookup.
        self.ops.subscribe.contains_key(id)
    }

    /// Returns `true` if a `ConnectOp` is currently registered for this
    /// transaction in `OpManager.ops.connect`.
    ///
    /// Same role as `has_get_op` but for the relay CONNECT dispatch
    /// gate (#1454 phase 2c slice 1). Used by `node.rs` to distinguish
    /// a fresh inbound relay CONNECT Request (no existing op → spawn
    /// the task-per-tx driver) from a within-relay Rejected retry, a
    /// ConnectFailed downstream re-route, or any other re-entry that
    /// already has a `ConnectOp` from a prior `process_message` call
    /// (existing op → fall through to the legacy `handle_op_request`
    /// path).
    ///
    /// Only the `connect` map is consulted; the `completed` and
    /// `under_progress` sets are not checked, and the entry is left untouched.
    pub fn has_connect_op(&self, id: &Transaction) -> bool {
        // Pure map-membership lookup.
        self.ops.connect.contains_key(id)
    }

    /// Marks `id` as finished and discards the per-transaction state held by the
    /// manager.
    ///
    /// In order: the transaction is removed from the live-transaction tracker and
    /// from the under-progress set, added to the completed set, its entry (if any)
    /// is dropped from the per-type operation map, and the request router — when
    /// one has been installed — is told the operation is complete.
    pub fn completed(&self, id: Transaction) {
        self.ring.live_tx_tracker.remove_finished_transaction(id);
        self.ops.under_progress.remove(&id);
        self.ops.completed.insert(id);

        // Drop whatever is still stored for this transaction in its per-type map.
        //
        // On the legacy state-machine paths `pop` has already taken the entry out
        // before `completed` runs, so this removal finds nothing.
        //
        // Task-per-tx drivers go through `OpCtx::send_and_await`, which loops the
        // outbound message back through the event loop as an InboundMessage with
        // `source_addr=None`. The originator's multi-hop branch in
        // `process_message` (e.g. put.rs SendAndContinue when forwarding) returns
        // a non-finalized state that `handle_op_result` pushes into `ops.<type>`.
        // The bypass in `handle_pure_network_message_v1` then hands the terminal
        // Response straight to the driver task, so the entry never re-enters
        // `handle_op_request` and is never popped. If it were not removed here it
        // would leak for OPERATION_TTL (60s) and pollute the `has_<type>_op`
        // checks until the GC sweep cleared it.
        match id.transaction_type() {
            TransactionType::Connect => drop(self.ops.connect.remove(&id)),
            TransactionType::Put => drop(self.ops.put.remove(&id)),
            TransactionType::Get => drop(self.ops.get.remove(&id)),
            TransactionType::Subscribe => drop(self.ops.subscribe.remove(&id)),
            TransactionType::Update => drop(self.ops.update.remove(&id)),
        }

        // Stale request-router entries would block subsequent requests, so clear
        // this one as well.
        if let Some(request_router) = self.request_router.get() {
            request_router.complete_operation(id);
        }
    }

    /// Records that `msg` is about to be sent to `peer` over the network.
    ///
    /// When the message carries a requested (contract) location, the request is
    /// recorded on the ring against the peer it is being sent to — this is how
    /// hop-by-hop routing attributes requests. When the peer has a known socket
    /// address, the transaction is also registered with the live-transaction
    /// tracker under that address.
    pub fn sending_transaction(&self, peer: &PeerKeyLocation, msg: &NetMessage) {
        let tx = msg.id();
        if let Some(requested) = msg.requested_location() {
            self.ring
                .record_request(peer.clone(), requested, tx.transaction_type());
        }
        // Without an address there is nothing to key the live transaction on.
        let Some(addr) = peer.socket_addr() else {
            return;
        };
        self.ring.live_tx_tracker.add_transaction(addr, *tx);
    }

    /// Registers interest in the moment a contract gets stored.
    ///
    /// The returned receiver is signalled once `notify_contract_stored` runs for
    /// the same contract instance. This covers the race where a subscription
    /// arrives before the contract has been propagated via PUT.
    pub fn wait_for_contract(&self, instance_id: ContractInstanceId) -> oneshot::Receiver<()> {
        let (notifier, receiver) = oneshot::channel();
        // Append to the (possibly new) waiter list for this instance.
        self.contract_waiters
            .lock()
            .entry(instance_id)
            .or_default()
            .push(notifier);
        receiver
    }

    /// Notify all waiters that a contract has been stored.
    /// Called after successful contract storage in PUT operations.
    ///
    /// The waiter list for the contract is taken out of the map while holding the
    /// `contract_waiters` lock, and the lock is released *before* the waiters are
    /// signalled and the log line is emitted, so concurrent `wait_for_contract`
    /// callers are not blocked behind the notification loop.
    ///
    /// Note: Stale waiters (from timed-out operations) are automatically cleaned up
    /// here because the whole sender list for the key is removed. The `send()`
    /// fails silently for dropped receivers, which is harmless.
    pub fn notify_contract_stored(&self, key: &ContractKey) {
        // Keep the critical section to the map removal only.
        let mut waiters = self.contract_waiters.lock();
        let removed = waiters.remove(key.id());
        drop(waiters);

        let Some(senders) = removed else {
            return;
        };
        let count = senders.len();
        for sender in senders {
            // Receiver may already be dropped (e.g., operation timed out)
            #[allow(clippy::let_underscore_must_use)]
            let _ = sender.send(());
        }
        if count > 0 {
            tracing::debug!(
                %key,
                count,
                "Notified waiters that contract has been stored"
            );
        }
    }

    /// Returns pending operation counts: [connect, put, get, subscribe, update].
    ///
    /// Each count is the current length of the corresponding per-type operation
    /// map. A length that does not fit in `u32` is reported as `u32::MAX` rather
    /// than being silently truncated.
    pub fn pending_op_counts(&self) -> [u32; 5] {
        // Saturating conversion: an `as` cast would wrap on overflow.
        let saturate = |len: usize| u32::try_from(len).unwrap_or(u32::MAX);
        [
            saturate(self.ops.connect.len()),
            saturate(self.ops.put.len()),
            saturate(self.ops.get.len()),
            saturate(self.ops.subscribe.len()),
            saturate(self.ops.update.len()),
        ]
    }

    /// Returns the number of entries (contract instances with a waiter list) in
    /// the contract_waiters map.
    ///
    /// A count that does not fit in `u32` is reported as `u32::MAX` rather than
    /// being silently truncated.
    pub fn contract_waiters_count(&self) -> u32 {
        let entries = self.contract_waiters.lock().len();
        // Saturating conversion: an `as` cast would wrap on overflow.
        u32::try_from(entries).unwrap_or(u32::MAX)
    }

    /// Returns a reference to the orphan stream registry.
    ///
    /// Used by operations layer to claim orphan streams when RequestStreaming
    /// or ResponseStreaming metadata messages arrive.
    ///
    /// The registry is handed out as `&Arc<_>`, so callers that need to keep it
    /// beyond the borrow can clone the `Arc` themselves.
    #[allow(dead_code)] // Phase 3 infrastructure - will be used when streaming handlers are implemented
    pub fn orphan_stream_registry(&self) -> &Arc<OrphanStreamRegistry> {
        &self.orphan_stream_registry
    }

    /// Determines if streaming should be used for a payload of the given size.
    ///
    /// Returns `true` if the payload size exceeds the streaming threshold.
    /// The comparison is strict: a payload exactly equal to
    /// `streaming_threshold` does not use streaming.
    #[allow(dead_code)] // Phase 3 infrastructure - will be used when streaming handlers are implemented
    pub fn should_use_streaming(&self, payload_size: usize) -> bool {
        payload_size > self.streaming_threshold
    }

    /// Builds the messages we need to send to a peer that just joined the ring,
    /// so it learns which contracts we're subscribed to and our cached state.
    pub(crate) fn on_ring_connection_established(
        &self,
        peer_addr: SocketAddr,
        pub_key: &TransportPublicKey,
    ) -> Vec<(SocketAddr, NetMessage)> {
        // Cancel any pending deferred interest removal for this peer.
        // If the peer reconnected within the grace period, their interests
        // are preserved — no re-registration needed via heartbeat.
        self.interest_manager
            .cancel_deferred_removal(&PeerKey::from(pub_key.clone()));

        let mut messages = Vec::with_capacity(2);

        let interest_hashes = self.interest_manager.get_all_interest_hashes();
        if !interest_hashes.is_empty() {
            messages.push((
                peer_addr,
                NetMessage::V1(NetMessageV1::InterestSync {
                    message: InterestMessage::Interests {
                        hashes: interest_hashes,
                    },
                }),
            ));
        }

        if let Some(cache_msg) = self
            .neighbor_hosting
            .on_ring_connection_established(pub_key)
        {
            messages.push((
                peer_addr,
                NetMessage::V1(NetMessageV1::NeighborHosting { message: cache_msg }),
            ));
        }

        // If we're already ready, tell the new peer immediately
        if self.ring.connection_manager.is_self_ready()
            && self.ring.connection_manager.min_ready_connections > 0
        {
            messages.push((
                peer_addr,
                NetMessage::V1(NetMessageV1::ReadyState { ready: true }),
            ));
        }

        messages
    }

    /// Handles a peer leaving the ring.
    ///
    /// The neighbor-hosting manager is told about the disconnect immediately
    /// (the original note here describes this as clearing the proximity cache).
    /// Interest removal is deferred for `INTEREST_DISCONNECT_GRACE_PERIOD` to
    /// survive transient disconnects.
    /// Downstream subscriber entries in the hosting manager are NOT removed here —
    /// they have lease-based TTL and will be cleaned up by the periodic
    /// `expire_stale_downstream_subscribers` sweep, which also decrements the
    /// interest manager's `downstream_subscriber_count` and triggers upstream
    /// unsubscribe when appropriate.
    pub(crate) fn on_ring_connection_lost(&self, pub_key: &TransportPublicKey) {
        self.neighbor_hosting.on_peer_disconnected(pub_key);
        // Scheduled rather than executed: `on_ring_connection_established`
        // cancels this removal if the same peer reconnects.
        self.interest_manager
            .schedule_deferred_removal(&PeerKey::from(pub_key.clone()));
    }
}

/// Sends `NodeEvent::TransactionCompleted(tx)` through the given notification
/// sender, bounded by `timeout` so that a wedged event loop cannot hang the
/// caller forever.
///
/// Split out of [`OpManager::release_pending_op_slot`] so the channel
/// interaction can be unit-tested in isolation, without constructing a full
/// `OpManager` (review finding T-3); the method is a thin wrapper around this
/// function.
///
/// `send().await` (under `timeout`) is used instead of `try_send` because the
/// caller is Phase 2b's task-per-tx subscribe driver, spawned via
/// `GlobalExecutor::spawn`: a short blocking wait is within the channel-safety
/// rules for that context, whereas dropping the event on transient
/// backpressure would re-introduce the `test_pending_op_results_bounded` leak.
///
/// Failures are only logged: a closed channel produces a warning, an elapsed
/// timeout an error. Nothing is returned to the caller in either case.
async fn release_pending_op_slot_on(
    notifications_sender: &mpsc::Sender<Either<NetMessage, NodeEvent>>,
    tx: Transaction,
    timeout: Duration,
) {
    let event = Either::Right(NodeEvent::TransactionCompleted(tx));
    let delivery = tokio::time::timeout(timeout, notifications_sender.send(event)).await;

    // Outer `Err`: the timeout elapsed before the channel accepted the event.
    let Ok(send_result) = delivery else {
        tracing::error!(
            %tx,
            timeout_secs = timeout.as_secs(),
            "release_pending_op_slot: notification channel full for too long; \
             event loop may be stuck; pending_op_results entry will be \
             reclaimed by 60s sweep"
        );
        return;
    };

    // Inner `Err`: the receiving side of the channel is gone.
    if send_result.is_err() {
        tracing::warn!(
            %tx,
            "release_pending_op_slot: notification channel closed; \
             pending_op_results entry will be reclaimed by 60s sweep"
        );
    }
}

/// Notify the event loop about a timed-out transaction without blocking.
///
/// Uses `try_send` instead of `.send().await` to avoid blocking the garbage
/// cleanup task when the notification channel is full. The GC task already
/// cleans up the transaction from the ops maps — this notification only
/// lets the event loop clean up its `tx_to_client` map, so dropping it
/// when the channel is congested is acceptable.
fn notify_transaction_timeout(
    event_loop_notifier: &EventLoopNotificationsSender,
    tx: Transaction,
) -> bool {
    match event_loop_notifier
        .notifications_sender
        .try_send(Either::Right(NodeEvent::TransactionTimedOut(tx)))
    {
        Ok(()) => true,
        Err(mpsc::error::TrySendError::Full(_)) => {
            tracing::warn!(
                tx = %tx,
                "Notification channel full, skipping timeout notification for event loop"
            );
            false
        }
        Err(mpsc::error::TrySendError::Closed(_)) => {
            tracing::warn!(
                tx = %tx,
                "Notification channel closed, receiver likely dropped"
            );
            false
        }
    }
}

/// Reports a timed-out subscription to the contract handler, fire-and-forget.
///
/// The send runs on a freshly spawned task so the garbage cleanup loop never
/// waits for the contract handler's response. A failed send is only logged at
/// debug level.
fn notify_subscription_timeout(
    ch_outbound: &Arc<crate::contract::ContractHandlerChannel<crate::contract::SenderHalve>>,
    instance_id: ContractInstanceId,
) {
    let handler_channel = Arc::clone(ch_outbound);
    crate::config::GlobalExecutor::spawn(async move {
        let event = ContractHandlerEvent::NotifySubscriptionError {
            key: instance_id,
            reason: format!("Subscription timed out for contract {instance_id}"),
        };
        if let Err(e) = handler_channel.send_to_handler(event).await {
            tracing::debug!(
                contract = %instance_id,
                error = %e,
                "Failed to notify subscription timeout"
            );
        }
    });
}

/// Reports a routing failure for a timed-out operation to the router's isotonic model.
///
/// Builds a `RouteEvent` with `RouteOutcome::Failure` for `peer` at
/// `contract_location` and hands it to `Ring::routing_finished`. The event's
/// `op_type` mirrors the transaction type, except for connect transactions,
/// which carry `None`. An info-level line is logged afterwards.
///
/// `peer` is moved into the event; its socket address is read beforehand so the
/// log line does not require cloning the whole `PeerKeyLocation`.
fn report_timeout_failure(
    ring: &crate::ring::Ring,
    tx: &Transaction,
    peer: crate::ring::PeerKeyLocation,
    contract_location: crate::ring::Location,
) {
    let op_type = match tx.transaction_type() {
        crate::message::TransactionType::Get => Some(crate::node::network_status::OpType::Get),
        crate::message::TransactionType::Put => Some(crate::node::network_status::OpType::Put),
        crate::message::TransactionType::Update => {
            Some(crate::node::network_status::OpType::Update)
        }
        crate::message::TransactionType::Subscribe => {
            Some(crate::node::network_status::OpType::Subscribe)
        }
        crate::message::TransactionType::Connect => None,
    };
    // Captured up front: `peer` is consumed by the route event below.
    let peer_addr = peer.socket_addr();
    ring.routing_finished(crate::router::RouteEvent {
        peer,
        contract_location,
        outcome: crate::router::RouteOutcome::Failure,
        op_type,
    });
    tracing::info!(
        tx = %tx,
        peer = ?peer_addr,
        %contract_location,
        "Reported operation timeout as routing failure"
    );
}

/// Handles a timed-out PUT: takes it out of the ops map, reports the timeout as
/// a routing failure when routing info is available, and — for client-initiated
/// PUTs — pushes an error to the client so it does not wait silently for its
/// own timeout.
///
/// Returns `true` if the operation was found and removed, `false` otherwise.
fn remove_put_and_report_failure(
    ops: &Ops,
    tx: &Transaction,
    ring: &crate::ring::Ring,
    result_router_tx: &mpsc::Sender<(Transaction, HostResult)>,
) -> bool {
    let Some((_, put_op)) = ops.put.remove(tx) else {
        return false;
    };
    if let Some((peer, contract_location)) = put_op.failure_routing_info() {
        report_timeout_failure(ring, tx, peer, contract_location);
    }
    tracing::warn!(
        tx = %tx,
        elapsed_ms = tx.elapsed().as_millis(),
        phase = "put_timeout",
        "PUT operation timed out without receiving a response"
    );
    // Only a local client is waiting on a client-initiated PUT.
    if !put_op.is_client_initiated() {
        return true;
    }
    // Give the client an immediate error instead of letting it sit until its
    // own client-side timeout fires (#3451).
    let timeout_error = freenet_stdlib::client_api::ErrorKind::OperationError {
        cause: "PUT operation timed out".into(),
    };
    if let Err(e) = result_router_tx.try_send((*tx, Err(timeout_error.into()))) {
        tracing::warn!(
            %tx,
            error = %e,
            "failed to send PUT timeout error to client"
        );
    }
    true
}

/// Handles a timed-out UPDATE: takes it out of the ops map, reports the timeout
/// as a routing failure when routing info is available, and — for
/// client-initiated UPDATEs — pushes an error to the client so it does not wait
/// silently for its own timeout.
///
/// Returns `true` if the operation was found and removed, `false` otherwise.
fn remove_update_and_report_failure(
    ops: &Ops,
    tx: &Transaction,
    ring: &crate::ring::Ring,
    result_router_tx: &mpsc::Sender<(Transaction, HostResult)>,
) -> bool {
    let Some((_, update_op)) = ops.update.remove(tx) else {
        return false;
    };
    if let Some((peer, contract_location)) = update_op.failure_routing_info() {
        report_timeout_failure(ring, tx, peer, contract_location);
    }
    tracing::warn!(
        tx = %tx,
        elapsed_ms = tx.elapsed().as_millis(),
        phase = "update_timeout",
        "UPDATE operation timed out without receiving a response"
    );
    // Only a local client is waiting on a client-initiated UPDATE.
    if !update_op.is_client_initiated() {
        return true;
    }
    // Give the client an immediate error instead of letting it sit until its
    // own client-side timeout fires (#3451).
    let timeout_error = freenet_stdlib::client_api::ErrorKind::OperationError {
        cause: "UPDATE operation timed out".into(),
    };
    if let Err(e) = result_router_tx.try_send((*tx, Err(timeout_error.into()))) {
        tracing::warn!(
            %tx,
            error = %e,
            "failed to send UPDATE timeout error to client"
        );
    }
    true
}

/// Handles a timed-out SUBSCRIBE: takes it out of the ops map, reports the
/// timeout as a routing failure when routing info is available, notifies the
/// contract handler when the instance id is known, and emits a
/// `subscribe_timeout` telemetry event when this node originated the request.
///
/// Returns `Some(())` if the operation was found and removed, `None` otherwise.
///
/// Note: on intermediate nodes the subscribe times out silently, without a
/// NotFound being sent upstream. That is acceptable because the originator
/// retries on its own after SUBSCRIBE_RETRY_THRESHOLD (5s) and therefore does
/// not rely on NotFound for fast failure detection; the stale op chain left on
/// the intermediate node is harmless.
fn remove_subscribe_and_notify_timeout(
    ops: &Ops,
    tx: &Transaction,
    ch_outbound: &Arc<ContractHandlerChannel<SenderHalve>>,
    ring: &crate::ring::Ring,
) -> Option<()> {
    let (_, sub_op) = ops.subscribe.remove(tx)?;
    let originated_here = sub_op.is_originator();
    if let Some((peer, contract_location)) = sub_op.failure_routing_info() {
        report_timeout_failure(ring, tx, peer, contract_location);
    }
    let instance_id = sub_op.instance_id();
    if let Some(known_id) = instance_id {
        notify_subscription_timeout(ch_outbound, known_id);
    }
    if !originated_here {
        return Some(());
    }
    // Telemetry makes subscribe timeouts visible in the dashboard (#3676).
    // Previously only subscribe_request and subscribe_success were emitted, so
    // a timed-out subscribe just looked like missing data.
    let payload = serde_json::json!({
        "tx": tx.to_string(),
        "instance_id": instance_id.map(|id| id.to_string()),
        "elapsed_ms": tx.elapsed().as_millis() as u64,
    });
    crate::tracing::telemetry::send_standalone_event("subscribe_timeout", payload);
    Some(())
}

/// Handles a timed-out GET: takes it out of the ops map, reports the timeout as
/// a routing failure when routing info is available, and — for client-initiated
/// GETs — pushes an error to the client so it does not wait silently for its
/// own timeout.
///
/// Returns `true` if the operation was found and removed, `false` otherwise.
fn remove_get_and_report_failure(
    ops: &Ops,
    tx: &Transaction,
    ring: &crate::ring::Ring,
    result_router_tx: &mpsc::Sender<(Transaction, HostResult)>,
) -> bool {
    let Some((_, get_op)) = ops.get.remove(tx) else {
        return false;
    };
    if let Some((peer, contract_location)) = get_op.failure_routing_info() {
        report_timeout_failure(ring, tx, peer, contract_location);
    }
    // Surface the timeout in traces, tagged with the contract instance.
    let instance_id = get_op.instance_id();
    tracing::warn!(
        tx = %tx,
        instance_id = ?instance_id,
        elapsed_ms = tx.elapsed().as_millis(),
        phase = "get_timeout",
        "GET operation timed out without receiving a response"
    );
    // Forwarded GETs have no local client waiting; only client-initiated ones
    // (requester is None) get an error pushed back.
    if !get_op.is_client_initiated() {
        return true;
    }
    // Give the client an immediate error instead of letting it sit until its
    // own client-side timeout fires (#3423).
    let timeout_error = freenet_stdlib::client_api::ErrorKind::OperationError {
        cause: "GET operation timed out".into(),
    };
    if let Err(e) = result_router_tx.try_send((*tx, Err(timeout_error.into()))) {
        tracing::warn!(
            %tx,
            error = %e,
            "failed to send GET timeout error to client"
        );
    }
    true
}

/// Log when a connect operation in Relaying state with an outstanding uphill forward times out,
/// and record the unresponsive peer as an acceptor failure for reliability scoring.
fn record_connect_uphill_timeout(
    tx: &Transaction,
    op: &ConnectOp,
    conn_manager: &ConnectionManager,
) {
    let Some(ConnectState::Relaying(state)) = &op.state else {
        return;
    };
    // Don't record timeout for relays that already forwarded a ConnectResponse —
    // forwarded_to is preserved for ConnectFailed propagation, not because
    // we're still waiting for a response.
    if state.response_forwarded {
        return;
    }
    let Some(ref peer) = state.forwarded_to else {
        return;
    };
    let pending_secs = if let Some(ref fwd_at) = state.forwarded_at {
        fwd_at.elapsed().as_secs()
    } else {
        tx.elapsed().as_secs()
    };
    tracing::warn!(
        tx = %tx,
        forwarded_to = %peer.pub_key(),
        forwarded_to_addr = ?peer.socket_addr(),
        pending_secs,
        "connect: uphill route timed out with no response"
    );
    if let Some(addr) = peer.socket_addr() {
        let now = tokio::time::Instant::now();
        conn_manager.record_acceptor_outcome(addr, false, now);
        tracing::info!(
            tx = %tx,
            addr = %addr,
            "recorded GC timeout as acceptor failure"
        );
    }
}

/// Background event loop that expires operations whose transactions outlived their TTL.
///
/// Each iteration waits on one of two sources:
/// - `new_transactions`: every received transaction is inserted into an ordered TTL set.
/// - a 5-second tick (`CLEANUP_INTERVAL`), which runs the cleanup passes listed below.
///
/// On every tick the task:
/// 1. Optionally logs a `memory_stats` event (only when the `FREENET_MEMORY_STATS`
///    environment variable is set). The `active_relay_*_txs` sets are read only here.
/// 2. Every 12th tick, prunes `contract_waiters` entries whose senders are all closed and
///    `pending_contract_fetches` entries at least twice `CONTRACT_FETCH_COOLDOWN_MS` old.
/// 3. Re-examines the transactions that a previous tick parked in the local `delayed` list.
/// 4. Splits every transaction at or past the TTL cutoff off the TTL set and removes its
///    operation, or parks it in `delayed` when it is still marked `under_progress` and is
///    newer than the absolute cutoff (5× TTL).
///
/// For every operation removed because of a timeout, the task logs the event, sends a
/// timeout notification through `event_loop_notifier`, removes the transaction from
/// `live_tx_tracker` and, if a `RequestRouter` has been installed in `request_router`,
/// calls `complete_operation` on it.
///
/// The loop contains no `break` or `return`, so this future never completes on its own.
#[allow(clippy::too_many_arguments)]
async fn garbage_cleanup_task<ER: NetEventRegister>(
    mut new_transactions: tokio::sync::mpsc::Receiver<Transaction>,
    ops: Arc<Ops>,
    live_tx_tracker: LiveTransactionTracker,
    event_loop_notifier: EventLoopNotificationsSender,
    mut event_register: ER,
    result_router_tx: mpsc::Sender<(Transaction, HostResult)>,
    request_router: Arc<OnceLock<Arc<RequestRouter>>>,
    ring: Arc<Ring>,
    ch_outbound: Arc<crate::contract::ContractHandlerChannel<crate::contract::SenderHalve>>,
    contract_waiters: Arc<
        Mutex<std::collections::HashMap<ContractInstanceId, Vec<oneshot::Sender<()>>>>,
    >,
    pending_contract_fetches: Arc<DashMap<ContractInstanceId, u64>>,
    active_relay_get_txs: Arc<DashSet<Transaction>>,
    active_relay_update_txs: Arc<DashSet<Transaction>>,
    active_relay_put_txs: Arc<DashSet<Transaction>>,
    active_relay_subscribe_txs: Arc<DashSet<Transaction>>,
    active_relay_connect_txs: Arc<DashSet<Transaction>>,
) {
    const CLEANUP_INTERVAL: Duration = Duration::from_secs(5);
    /// How often to clean up stale contract_waiters entries (every N ticks).
    const WAITER_CLEANUP_EVERY_N_TICKS: u32 = 12; // every 60s at 5s interval
    let mut tick = tokio::time::interval(CLEANUP_INTERVAL);
    // Consume the first tick up front: a tokio interval completes its first tick
    // immediately, so without this the first cleanup pass would run at startup.
    tick.tick().await;
    let mut tick_count: u32 = 0;

    // Transactions awaiting TTL expiry, wrapped in `Reverse` so that `split_off`
    // below yields the older ones.
    let mut ttl_set = BTreeSet::new();

    // Transactions that reached the TTL but could not be cleaned up yet; they are
    // retried at the start of the next tick.
    let mut delayed = vec![];
    loop {
        crate::deterministic_select! {
            tx = new_transactions.recv() => {
                // NOTE(review): a `None` (channel closed) is silently ignored. Depending on
                // how `deterministic_select!` polls its arms, a closed channel may make this
                // arm resolve immediately on every iteration — confirm this cannot spin.
                if let Some(tx) = tx {
                    ttl_set.insert(Reverse(tx));
                }
            },
            _ = tick.tick() => {
                tick_count = tick_count.wrapping_add(1);

                // Opt-in periodic memory-stats dump. Gated by env var so the
                // hot path stays quiet in prod. Intended for local / CI sim
                // runs where we want to correlate RSS growth with retained
                // state in OpManager.
                if std::env::var("FREENET_MEMORY_STATS").is_ok() {
                    use std::sync::atomic::Ordering;
                    let ops_sizes = ops.sizes();
                    let pending_fetches = pending_contract_fetches.len();
                    let waiters_len = contract_waiters.lock().len();
                    let relay_inflight =
                        crate::operations::get::op_ctx_task::RELAY_INFLIGHT
                            .load(Ordering::Relaxed);
                    let relay_spawned =
                        crate::operations::get::op_ctx_task::RELAY_SPAWNED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_completed =
                        crate::operations::get::op_ctx_task::RELAY_COMPLETED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_dedup_rejects =
                        crate::operations::get::op_ctx_task::RELAY_DEDUP_REJECTS
                            .load(Ordering::Relaxed);
                    let relay_active_txs = active_relay_get_txs.len();
                    let relay_update_inflight =
                        crate::operations::update::op_ctx_task::RELAY_UPDATE_INFLIGHT
                            .load(Ordering::Relaxed);
                    let relay_update_spawned =
                        crate::operations::update::op_ctx_task::RELAY_UPDATE_SPAWNED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_update_completed =
                        crate::operations::update::op_ctx_task::RELAY_UPDATE_COMPLETED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_update_dedup_rejects =
                        crate::operations::update::op_ctx_task::RELAY_UPDATE_DEDUP_REJECTS
                            .load(Ordering::Relaxed);
                    let relay_update_active_txs = active_relay_update_txs.len();
                    let relay_put_inflight =
                        crate::operations::put::op_ctx_task::RELAY_PUT_INFLIGHT
                            .load(Ordering::Relaxed);
                    let relay_put_spawned =
                        crate::operations::put::op_ctx_task::RELAY_PUT_SPAWNED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_put_completed =
                        crate::operations::put::op_ctx_task::RELAY_PUT_COMPLETED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_put_dedup_rejects =
                        crate::operations::put::op_ctx_task::RELAY_PUT_DEDUP_REJECTS
                            .load(Ordering::Relaxed);
                    let relay_put_active_txs = active_relay_put_txs.len();
                    let relay_subscribe_inflight =
                        crate::operations::subscribe::op_ctx_task::RELAY_SUBSCRIBE_INFLIGHT
                            .load(Ordering::Relaxed);
                    let relay_subscribe_spawned =
                        crate::operations::subscribe::op_ctx_task::RELAY_SUBSCRIBE_SPAWNED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_subscribe_completed =
                        crate::operations::subscribe::op_ctx_task::RELAY_SUBSCRIBE_COMPLETED_TOTAL
                            .load(Ordering::Relaxed);
                    let relay_subscribe_dedup_rejects =
                        crate::operations::subscribe::op_ctx_task::RELAY_SUBSCRIBE_DEDUP_REJECTS
                            .load(Ordering::Relaxed);
                    let relay_subscribe_active_txs = active_relay_subscribe_txs.len();
                    let relay_connect_active_txs = active_relay_connect_txs.len();
                    tracing::info!(
                        target: "memory_stats",
                        tick = tick_count,
                        ops_connect = ops_sizes.connect,
                        ops_put = ops_sizes.put,
                        ops_get = ops_sizes.get,
                        ops_subscribe = ops_sizes.subscribe,
                        ops_update = ops_sizes.update,
                        ops_completed = ops_sizes.completed,
                        ops_under_progress = ops_sizes.under_progress,
                        pending_contract_fetches = pending_fetches,
                        contract_waiters = waiters_len,
                        relay_inflight = relay_inflight,
                        relay_spawned = relay_spawned,
                        relay_completed = relay_completed,
                        relay_dedup_rejects = relay_dedup_rejects,
                        relay_active_txs = relay_active_txs,
                        relay_update_inflight = relay_update_inflight,
                        relay_update_spawned = relay_update_spawned,
                        relay_update_completed = relay_update_completed,
                        relay_update_dedup_rejects = relay_update_dedup_rejects,
                        relay_update_active_txs = relay_update_active_txs,
                        relay_put_inflight = relay_put_inflight,
                        relay_put_spawned = relay_put_spawned,
                        relay_put_completed = relay_put_completed,
                        relay_put_dedup_rejects = relay_put_dedup_rejects,
                        relay_put_active_txs = relay_put_active_txs,
                        relay_subscribe_inflight = relay_subscribe_inflight,
                        relay_subscribe_spawned = relay_subscribe_spawned,
                        relay_subscribe_completed = relay_subscribe_completed,
                        relay_subscribe_dedup_rejects = relay_subscribe_dedup_rejects,
                        relay_subscribe_active_txs = relay_subscribe_active_txs,
                        relay_connect_active_txs = relay_connect_active_txs,
                        "memory stats"
                    );
                }

                // Periodically clean up stale contract_waiters entries where the
                // receiver has been dropped (e.g., operation timed out). Without this,
                // the map grows unboundedly under sustained load (#2928).
                if tick_count % WAITER_CLEANUP_EVERY_N_TICKS == 0 {
                    let mut waiters = contract_waiters.lock();
                    let before = waiters.len();
                    waiters.retain(|_id, senders| {
                        // Remove senders whose receiver was dropped
                        senders.retain(|sender| !sender.is_closed());
                        // Drop the whole map entry once no live sender is left.
                        !senders.is_empty()
                    });
                    let after = waiters.len();
                    if before != after {
                        tracing::info!(
                            before,
                            after,
                            removed = before - after,
                            "Cleaned up stale contract_waiters entries"
                        );
                    }
                }

                // Historical note: the shared retry constants (`ACK_TIMEOUT`,
                // `MAX_SPECULATIVE_PATHS`, `PROGRESS_TIMEOUT`) used to be declared
                // here for the GET and SUBSCRIBE speculative-retry blocks. GET
                // speculative retry was retired in #1454 Phase 5-final slice 1 and
                // SUBSCRIBE speculative retry in its follow-up (see the longer
                // comment further down), so the constants are no longer referenced
                // and have been removed.

                // Periodically clean up stale pending_contract_fetches entries.
                // Entries older than 2x cooldown are removed to prevent unbounded growth.
                // The literal 12 has the same value as WAITER_CLEANUP_EVERY_N_TICKS,
                // so both prunes run on the same tick.
                if tick_count % 12 == 0 {
                    let cooldown_ms = crate::operations::update::CONTRACT_FETCH_COOLDOWN_MS;
                    let now_ms = crate::config::GlobalSimulationTime::read_time_ms();
                    pending_contract_fetches.retain(|_, ts| {
                        now_ms.saturating_sub(*ts) < cooldown_ms * 2
                    });
                }

                // SUBSCRIBE speculative-retry GC block was retired in #1454
                // Phase 5-final: every originator-side writer into
                // `ops.subscribe` (client-initiated, executor auto-subscribe,
                // renewal, PUT/GET sub-op) now runs on the task-per-tx
                // driver in `operations/subscribe/op_ctx_task.rs`, which
                // owns its own retry loop via `advance_to_next_peer` and
                // never inserts a `SubscribeOp` into the DashMap. The
                // remaining legacy entries (relay state during multi-hop
                // forwarding, `create_unsubscribe_op` routing entries) are
                // not retry candidates: relay entries fail
                // `is_originator()` and unsubscribe entries fail
                // `failure_routing_info().is_some()`. The DashMap, the
                // `SubscribeOp::ack_received` / `speculative_paths` fields,
                // and `SubscribeMsg::ForwardingAck` survive only as wire-
                // and bookkeeping-compat for legacy relay traffic;
                // they are not load-bearing for retry. Mirrors the GET
                // Phase 5-final retirement in PR #3974.

                // Pass 1: retry the transactions parked by a previous tick. `delayed`
                // is swapped for a fresh vector so entries that are still not
                // removable can be pushed back while the old list is drained.
                let mut old_missing = std::mem::replace(&mut delayed, Vec::with_capacity(200));
                for tx in old_missing.drain(..) {
                    // A tx that has meanwhile been marked completed needs no timeout
                    // handling beyond the optional trace-ot notification.
                    if let Some(tx) = ops.completed.remove(&tx) {
                        if cfg!(feature = "trace-ot") {
                            let op_type = tx.transaction_type().description();
                            event_register.notify_of_time_out(tx, op_type, None).await;
                        } else {
                            _ = tx;
                        }
                        continue;
                    }
                    // `still_waiting` is true when no op could be removed for this tx.
                    // For Connect that is visible below; the other arms rely on the
                    // helpers' return values (presumably "an op was removed" — the
                    // helpers are defined outside this function).
                    let still_waiting = match tx.transaction_type() {
                        TransactionType::Connect => {
                            if let Some((_, mut op)) = ops.connect.remove(&tx) {
                                op.expire_forward_attempts(tokio::time::Instant::now());
                                record_connect_uphill_timeout(&tx, &op, &ring.connection_manager);
                                if let Some(target_loc) = op.desired_location {
                                    ring.record_connection_failure(target_loc, ConnectionFailureReason::Timeout);
                                }
                                false
                            } else {
                                true
                            }
                        }
                        TransactionType::Put => !remove_put_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                        TransactionType::Get => !remove_get_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                        TransactionType::Subscribe => {
                            remove_subscribe_and_notify_timeout(&ops, &tx, &ch_outbound, &ring).is_none()
                        }
                        TransactionType::Update => !remove_update_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                    };
                    if still_waiting {
                        // NOTE(review): this retry path never compares the tx against the
                        // 5× TTL absolute cutoff computed further down, so a tx whose op
                        // never reappears in its map (and is never marked completed) is
                        // re-queued on every tick — confirm that this is intended.
                        delayed.push(tx);
                    } else {
                        ops.under_progress.remove(&tx);
                        ops.completed.remove(&tx);
                        tracing::info!(
                            tx = %tx,
                            tx_type = ?tx.transaction_type(),
                            elapsed_ms = tx.elapsed().as_millis(),
                            ttl_ms = crate::config::OPERATION_TTL.as_millis(),
                            "Transaction timed out"
                        );

                        notify_transaction_timeout(&event_loop_notifier, tx);
                        live_tx_tracker.remove_finished_transaction(tx);

                        // Clean up request router to prevent stale entries from blocking
                        // subsequent requests for the same resource after timeout
                        if let Some(router) = request_router.get() {
                            router.complete_operation(tx);
                        }
                    }
                }

                // Pass 2: expire transactions that have just crossed the TTL.
                // notice the use of reverse so the older transactions are removed instead of the newer ones
                let older_than: Reverse<Transaction> = Reverse(Transaction::ttl_transaction());
                // Absolute cutoff for under_progress ops: 5× normal TTL (5 minutes).
                // Without this, operations stuck in under_progress are exempt from GC forever.
                let absolute_cutoff: Reverse<Transaction> =
                    Reverse(Transaction::ttl_transaction_with_multiplier(5));
                for Reverse(tx) in ttl_set.split_off(&older_than).into_iter() {
                    if ops.under_progress.contains(&tx) {
                        // Allow extended lifetime unless absolute timeout exceeded.
                        // Reverse flips ordering: Reverse(tx) < absolute_cutoff means
                        // tx is newer than the 5× TTL cutoff, so keep it alive.
                        if Reverse(tx) < absolute_cutoff {
                            delayed.push(tx);
                            continue;
                        }
                        tracing::warn!(tx = %tx, "Cleaning up under_progress op that exceeded absolute timeout (5× TTL)");
                        ops.under_progress.remove(&tx);
                        // Fall through to normal cleanup below
                    }
                    // Unlike pass 1 there is no `continue` here: after clearing the
                    // completed marker the per-type removal below still runs.
                    if let Some(tx) = ops.completed.remove(&tx) {
                        tracing::debug!("Clean up timed out: {tx}");
                        if cfg!(feature = "trace-ot") {
                            let op_type = tx.transaction_type().description();
                            event_register.notify_of_time_out(tx, op_type, None).await;
                        } else {
                            _ = tx;
                        }
                    }
                    // `removed` has the opposite polarity of `still_waiting` in pass 1.
                    let removed = match tx.transaction_type() {
                        TransactionType::Connect => {
                            if let Some((_, mut op)) = ops.connect.remove(&tx) {
                                op.expire_forward_attempts(tokio::time::Instant::now());
                                record_connect_uphill_timeout(&tx, &op, &ring.connection_manager);
                                if let Some(target_loc) = op.desired_location {
                                    ring.record_connection_failure(target_loc, ConnectionFailureReason::Timeout);
                                }
                                true
                            } else {
                                false
                            }
                        }
                        TransactionType::Put => remove_put_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                        TransactionType::Get => remove_get_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                        TransactionType::Subscribe => {
                            remove_subscribe_and_notify_timeout(&ops, &tx, &ch_outbound, &ring).is_some()
                        }
                        TransactionType::Update => remove_update_and_report_failure(&ops, &tx, &ring, &result_router_tx),
                    };
                    // When nothing was removed the tx is simply forgotten: it has already
                    // left `ttl_set` and is not pushed to `delayed`.
                    if removed {
                        tracing::info!(
                            tx = %tx,
                            tx_type = ?tx.transaction_type(),
                            elapsed_ms = tx.elapsed().as_millis(),
                            ttl_ms = crate::config::OPERATION_TTL.as_millis(),
                            "Transaction timed out"
                        );

                        notify_transaction_timeout(&event_loop_notifier, tx);
                        live_tx_tracker.remove_finished_transaction(tx);

                        // Clean up request router to prevent stale entries from blocking
                        // subsequent requests for the same resource after timeout
                        if let Some(router) = request_router.get() {
                            router.complete_operation(tx);
                        }
                    }
                }
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::super::network_bridge::event_loop_notification_channel;
    use super::*;
    use crate::node::network_bridge::EventLoopNotificationsReceiver;
    use either::Either;
    use tokio::time::{Duration, timeout};

    /// With a live receiver, `notify_transaction_timeout` reports success and the
    /// receiver observes a `TransactionTimedOut` event carrying the same transaction.
    #[tokio::test]
    async fn notify_timeout_succeeds_when_receiver_alive() {
        let (receiver, notifier) = event_loop_notification_channel();
        let EventLoopNotificationsReceiver {
            notifications_receiver: mut inbox,
            ..
        } = receiver;
        let expired = Transaction::ttl_transaction();

        assert!(
            notify_transaction_timeout(&notifier, expired),
            "notification should be delivered while receiver is alive"
        );

        let wait = Duration::from_millis(100);
        let event = timeout(wait, inbox.recv())
            .await
            .expect("timed out waiting for notification")
            .expect("notification channel closed");

        // Anything other than the expected timeout event fails the test.
        let Either::Right(NodeEvent::TransactionTimedOut(observed)) = event else {
            panic!("unexpected notification: {event:?}")
        };
        assert_eq!(observed, expired, "unexpected transaction in notification");
    }

    /// Once the receiving half is gone, `notify_transaction_timeout` must report
    /// that the notification was not delivered.
    #[tokio::test]
    async fn notify_timeout_handles_dropped_receiver() {
        // Keep only the sending side; the receiver is dropped before any send.
        let notifier = {
            let (receiver, notifier) = event_loop_notification_channel();
            drop(receiver);
            notifier
        };
        let expired = Transaction::ttl_transaction();

        assert!(
            !notify_transaction_timeout(&notifier, expired),
            "notification delivery should fail once receiver is dropped"
        );
    }

    // ──────────────────────────────────────────────────────────
    // `release_pending_op_slot_on` tests (#1454 Phase 2b,
    // review finding T-3). Tests the extracted helper directly
    // so we don't need to build a full OpManager.
    // ──────────────────────────────────────────────────────────

    /// Happy path: the helper sends a `TransactionCompleted` event for the given
    /// transaction on the notification channel.
    #[tokio::test]
    async fn release_pending_op_slot_emits_transaction_completed() {
        let (receiver, notifier) = event_loop_notification_channel();
        let EventLoopNotificationsReceiver {
            notifications_receiver: mut inbox,
            ..
        } = receiver;
        let released = Transaction::ttl_transaction();
        let send_budget = Duration::from_secs(1);

        super::release_pending_op_slot_on(notifier.notifications_sender(), released, send_budget)
            .await;

        let event = timeout(Duration::from_millis(100), inbox.recv())
            .await
            .expect("timed out waiting for TransactionCompleted emission")
            .expect("notification channel closed");

        let Either::Right(NodeEvent::TransactionCompleted(observed)) = event else {
            panic!("expected TransactionCompleted, got {event:?}")
        };
        assert_eq!(observed, released, "emitted tx must match the argument");
    }

    #[tokio::test]
    async fn release_pending_op_slot_blocks_through_backpressure() {
        // Regression guard for review finding M1: the earlier
        // `try_send` implementation would silently drop the cleanup
        // event when the notification channel was transiently full.
        // The `send().await` implementation must block and deliver
        // once the consumer drains one slot.
        let (receiver, notifier) = event_loop_notification_channel();
        let EventLoopNotificationsReceiver {
            mut notifications_receiver,
            ..
        } = receiver;

        // Saturate the channel up to its capacity. The channel
        // capacity is whatever `event_loop_notification_channel`
        // configures — we don't hard-code it. Pre-fill until
        // `try_send` fails, then use that count.
        let filler_tx = Transaction::ttl_transaction();
        let mut pre_filled = 0usize;
        loop {
            match notifier
                .notifications_sender()
                .try_send(Either::Right(NodeEvent::TransactionCompleted(filler_tx)))
            {
                Ok(()) => pre_filled += 1,
                Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => break,
                Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => {
                    panic!("channel unexpectedly closed while pre-filling")
                }
            }
            // Safety valve: don't loop forever if the channel is
            // unbounded or absurdly large. Real channel is bounded
            // at a few hundred entries — if we hit this cap it's a
            // test-config change and deserves an explicit fix.
            if pre_filled > 4096 {
                panic!("channel did not backpressure after 4096 entries");
            }
        }
        assert!(
            pre_filled > 0,
            "expected a bounded channel; got what appears to be unbounded"
        );

        // Spawn the drain side a moment later: it will consume one
        // entry, unblocking the `send().await` inside the helper.
        let release_tx = Transaction::ttl_transaction();
        let consumer = tokio::spawn(async move {
            // Sleep briefly so the helper's `send().await` is already
            // pending when we start draining.
            tokio::time::sleep(Duration::from_millis(20)).await;
            // Drain one entry to create room.
            notifications_receiver
                .recv()
                .await
                .expect("notification channel closed during drain");
            // Keep draining until we see our release event. Additional
            // pre-filled entries may sit ahead of it.
            loop {
                match notifications_receiver.recv().await {
                    Some(Either::Right(NodeEvent::TransactionCompleted(observed)))
                        if observed == release_tx =>
                    {
                        return;
                    }
                    Some(_) => continue,
                    None => panic!("channel closed before release event observed"),
                }
            }
        });

        // The helper must not complete instantaneously (channel is
        // saturated) but must complete once the consumer drains. Give
        // it up to 2 s — plenty of slack for the 20 ms drain delay.
        let release = timeout(
            Duration::from_secs(2),
            super::release_pending_op_slot_on(
                notifier.notifications_sender(),
                release_tx,
                Duration::from_secs(30),
            ),
        )
        .await;
        release.expect("helper must complete once channel has room");

        consumer
            .await
            .expect("consumer task should terminate cleanly");
    }

    /// With the receiver dropped (channel fully closed), the helper must return
    /// promptly instead of hanging on its send. This test only pins "no hang".
    #[tokio::test]
    async fn release_pending_op_slot_returns_on_closed_channel() {
        let notifier = {
            let (receiver, notifier) = event_loop_notification_channel();
            drop(receiver);
            notifier
        };
        let released = Transaction::ttl_transaction();

        // The helper's own budget (30 s) is far larger than the 200 ms allowed here,
        // so finishing in time shows it returned early rather than waiting it out.
        let helper_budget = Duration::from_secs(30);
        let outcome = timeout(
            Duration::from_millis(200),
            super::release_pending_op_slot_on(
                notifier.notifications_sender(),
                released,
                helper_budget,
            ),
        )
        .await;

        assert!(
            outcome.is_ok(),
            "helper must return promptly on closed channel"
        );
    }

    /// The waiter-pruning closure keeps only senders whose receiver is still alive
    /// and drops map entries that end up with no senders at all.
    #[test]
    fn contract_waiters_cleanup_removes_closed_senders() {
        use std::collections::HashMap;

        /// Returns a sender whose receiver has already been dropped, so
        /// `is_closed()` reports true.
        fn closed_sender() -> oneshot::Sender<()> {
            let (sender, receiver) = oneshot::channel();
            drop(receiver);
            sender
        }

        let mixed = ContractInstanceId::new([1; 32]);
        let all_dead = ContractInstanceId::new([2; 32]);

        // `_live_receiver` must stay in scope so its sender remains open.
        let (live_sender, _live_receiver) = oneshot::channel();
        let mut waiters: HashMap<ContractInstanceId, Vec<oneshot::Sender<()>>> = HashMap::from([
            // One live and one dead waiter.
            (mixed, vec![live_sender, closed_sender()]),
            // Only a dead waiter.
            (all_dead, vec![closed_sender()]),
        ]);
        assert_eq!(waiters.len(), 2);

        // Same pruning logic as in garbage_cleanup_task.
        waiters.retain(|_, senders| {
            senders.retain(|candidate| !candidate.is_closed());
            !senders.is_empty()
        });

        // The mixed entry survives with its single live sender; the all-dead entry is gone.
        assert_eq!(waiters.len(), 1);
        assert!(waiters.contains_key(&mixed));
        assert!(!waiters.contains_key(&all_dead));
        assert_eq!(waiters[&mixed].len(), 1);
    }

    mod record_connect_uphill_timeout_tests {
        use super::super::record_connect_uphill_timeout;
        use crate::message::Transaction;
        use crate::operations::VisitedPeers;
        use crate::operations::connect::{
            ConnectMsg, ConnectOp, ConnectRequest, ConnectState, DEFAULT_UPHILL_BUDGET, RelayState,
        };
        use crate::ring::{ConnectionManager, Location, PeerKeyLocation};
        use crate::transport::TransportKeypair;
        use std::net::{Ipv4Addr, SocketAddr};

        /// Creates a loopback peer on `port` with a freshly generated transport key.
        fn make_peer(port: u16) -> PeerKeyLocation {
            let public_key = TransportKeypair::new().public().clone();
            PeerKeyLocation::new(public_key, SocketAddr::from((Ipv4Addr::LOCALHOST, port)))
        }

        /// Builds a `Relaying` connect state whose forward was sent "now", with the
        /// given forward target and response-forwarded flag.
        fn make_relay_state(
            forwarded_to: Option<PeerKeyLocation>,
            response_forwarded: bool,
        ) -> ConnectState {
            let request = ConnectRequest {
                desired_location: Location::new(0.3),
                joiner: make_peer(5002),
                ttl: 5,
                visited: VisitedPeers::new(&Transaction::new::<ConnectMsg>()),
                uphill_budget: DEFAULT_UPHILL_BUDGET,
            };
            let relay = RelayState {
                upstream_addr: "10.0.0.1:5000".parse().unwrap(),
                request,
                forwarded_to,
                forwarded_at: Some(tokio::time::Instant::now()),
                observed_sent: false,
                accepted_locally: false,
                response_forwarded,
            };
            ConnectState::Relaying(Box::new(relay))
        }

        /// Regression test for #3392: a CONNECT forward that expires in GC without an
        /// answer must count as an acceptor failure, lowering the peer's reliability
        /// score used by later routing decisions.
        #[test]
        fn records_failure_for_unresponsive_relay() {
            let silent_peer = make_peer(5001);
            let silent_addr = silent_peer.socket_addr().unwrap();
            let op = ConnectOp::with_state(make_relay_state(Some(silent_peer), false));
            let tx = Transaction::new::<ConnectMsg>();
            let cm = ConnectionManager::test_default();
            let now = tokio::time::Instant::now();

            // A peer with no history starts at the 0.5 prior.
            let initial = cm.peer_acceptor_reliability(silent_addr, now);
            assert!(
                (initial - 0.5).abs() < f64::EPSILON,
                "unknown peer should start at 0.5"
            );

            // Three GC timeouts in a row.
            (0..3).for_each(|_| record_connect_uphill_timeout(&tx, &op, &cm));

            let after = cm.peer_acceptor_reliability(silent_addr, now);
            assert!(
                after < initial,
                "peer reliability should decrease after 3 GC timeouts: {} vs {}",
                after,
                initial
            );
            // Expected score: (0+1)/(3+2) = 0.2
            assert!(
                (after - 0.2).abs() < 0.01,
                "expected ~0.2 after 3 failures, got {}",
                after
            );
        }

        /// With `response_forwarded = true` no failure may be recorded: `forwarded_to`
        /// is only retained for ConnectFailed propagation, not because the peer is
        /// still owed a response.
        #[test]
        fn skips_when_response_already_forwarded() {
            let answered_peer = make_peer(5001);
            let answered_addr = answered_peer.socket_addr().unwrap();
            let op = ConnectOp::with_state(make_relay_state(Some(answered_peer), true));
            let tx = Transaction::new::<ConnectMsg>();
            let cm = ConnectionManager::test_default();

            (0..5).for_each(|_| record_connect_uphill_timeout(&tx, &op, &cm));

            let score = cm.peer_acceptor_reliability(answered_addr, tokio::time::Instant::now());
            assert!(
                (score - 0.5).abs() < f64::EPSILON,
                "peer should still have 0.5 reliability when response was already forwarded, got {}",
                score
            );
        }

        /// A state other than `Relaying` (here `Completed`) is ignored entirely.
        #[test]
        fn skips_for_non_relay_state() {
            let cm = ConnectionManager::test_default();
            let tx = Transaction::new::<ConnectMsg>();
            let op = ConnectOp::with_state(ConnectState::Completed);

            // Passing means the call returned without panicking.
            record_connect_uphill_timeout(&tx, &op, &cm);
        }
    }

    // ── has_get_op unit tests ─────────────────────────────────────────────
    //
    // `has_get_op` is a thin wrapper over `self.ops.get.contains_key`.
    // Since `OpManager` requires complex infrastructure to construct in unit
    // tests, we exercise the underlying `Ops::get` DashMap directly — the
    // same pattern used by `remove_put_returns_false_for_missing_tx` above.
    // The delegation in `has_get_op` is one line and trivially correct.

    /// A default-constructed `Ops` holds no entry in its `get` map for a
    /// transaction that was never inserted — the condition under which
    /// `has_get_op` reports `false`.
    #[test]
    fn has_get_op_returns_false_for_unknown_tx() {
        let registry = Ops::default();
        let unknown_tx = Transaction::new::<crate::operations::get::GetMsg>();

        let is_tracked = registry.get.contains_key(&unknown_tx);
        assert!(
            !is_tracked,
            "ops.get should not contain a never-inserted tx"
        );
    }

    /// Walks one transaction through the `ops.get` map lifecycle: absent
    /// before insertion, present once a `GetOp` is stored, and absent again
    /// after removal. These are the states `has_get_op` reports on.
    #[test]
    fn has_get_op_returns_true_after_insert_false_after_remove() {
        use freenet_stdlib::prelude::ContractInstanceId;

        let registry = Ops::default();
        let contract_id = ContractInstanceId::new([0u8; 32]);
        let txn = Transaction::new::<crate::operations::get::GetMsg>();
        let pending_get =
            crate::operations::get::start_op_with_id(contract_id, false, false, false, txn);

        // Single place for the membership query exercised at every stage.
        let is_tracked = || registry.get.contains_key(&txn);

        // Stage 1: nothing stored yet.
        assert!(
            !is_tracked(),
            "ops.get should not contain tx before insertion"
        );

        // Stage 2: the op has been stored under its transaction.
        registry.get.insert(txn, pending_get);
        assert!(is_tracked(), "ops.get should contain tx after insertion");

        // Stage 3: the entry has been dropped again.
        registry.get.remove(&txn);
        assert!(
            !is_tracked(),
            "ops.get should not contain tx after removal"
        );
    }

    // ── has_update_op unit tests ──────────────────────────────────────────
    //
    // Same shape as has_get_op tests above. `has_update_op` is a one-line
    // wrapper over `self.ops.update.contains_key`, so we exercise the
    // underlying map directly.

    /// A default-constructed `Ops` holds no entry in its `update` map for a
    /// transaction that was never inserted — the condition under which
    /// `has_update_op` reports `false`.
    #[test]
    fn has_update_op_returns_false_for_unknown_tx() {
        let registry = Ops::default();
        let unknown_tx = Transaction::new::<crate::operations::update::UpdateMsg>();

        let is_tracked = registry.update.contains_key(&unknown_tx);
        assert!(
            !is_tracked,
            "ops.update should not contain a never-inserted tx"
        );
    }

    /// Walks one transaction through the `ops.update` map lifecycle: absent
    /// before insertion, present once an `UpdateOp` is stored, and absent
    /// again after removal. These are the states `has_update_op` reports on.
    #[test]
    fn has_update_op_returns_true_after_insert_false_after_remove() {
        use freenet_stdlib::prelude::{
            CodeHash, ContractInstanceId, ContractKey, RelatedContracts, State, UpdateData,
        };

        let registry = Ops::default();
        let contract_id = ContractInstanceId::new([0u8; 32]);
        let contract_key = ContractKey::from_id_and_code(contract_id, CodeHash::new([1u8; 32]));
        let new_state = UpdateData::State(State::from(vec![1, 2, 3]));
        let pending_update = crate::operations::update::start_op(
            contract_key,
            new_state,
            RelatedContracts::default(),
        );
        // The transaction id is read from the op built by `start_op`.
        let txn = pending_update.id;

        // Single place for the membership query exercised at every stage.
        let is_tracked = || registry.update.contains_key(&txn);

        // Stage 1: nothing stored yet.
        assert!(
            !is_tracked(),
            "ops.update should not contain tx before insertion"
        );

        // Stage 2: the op has been stored under its transaction.
        registry.update.insert(txn, pending_update);
        assert!(
            is_tracked(),
            "ops.update should contain tx after insertion"
        );

        // Stage 3: the entry has been dropped again.
        registry.update.remove(&txn);
        assert!(
            !is_tracked(),
            "ops.update should not contain tx after removal"
        );
    }
}