noxu-rep 7.2.1 - Docs.rs

//! The main replicated environment API.
//!
//!
//! A replicated database environment that is a node in a replication group.
//! This is the entry point for replication. It wraps a standard Environment
//! and adds replication capabilities including master election, replica
//! streaming, and commit acknowledgments.
//!
//! # Replication node states
//!
//! The replication node state determines the operations that the application
//! can perform against its replicated environment. The state transitions
//! visible to the application can be summarized by the regular expression:
//!
//! ```text
//! [ MASTER | REPLICA | UNKNOWN ]+ DETACHED
//! ```
//!
//! When the first handle to a `ReplicatedEnvironment` is created and the node
//! is brought up, the node usually establishes Master or Replica state. These
//! states are preceded by the Unknown state. As various remote nodes become
//! unavailable and elections are held, the local node may change between
//! Master and Replica states, always with a (usually brief) transition through
//! Unknown state.
//!
//! When the environment is closed, the node transitions to the Detached state.

use noxu_dbi::{
    AckWaitError, AckWaitErrorKind, EnvironmentImpl, ReplicaAckCoordinator,
    ReplicaAckPolicyKind,
};
use noxu_sync::RwLock;
use std::net::SocketAddr;
use std::sync::Arc;
use std::sync::Mutex as StdMutex;
use std::sync::OnceLock;
use std::sync::Weak;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;

use crate::ack_tracker::AckTracker;
use crate::elections::election_service::{
    ELECTION_SERVICE_NAME, ElectionAcceptorState, ElectionService,
};
use crate::elections::master_tracker::MasterTracker;
use crate::error::{RepError, Result};
use crate::group_service::GroupService;
use crate::master_transfer::MasterTransferConfig;
use crate::net::service_dispatcher::{
    AnyServiceDispatcher, TcpServiceDispatcher,
};
use crate::network_restore::{NetworkRestore, NetworkRestoreConfig};
use crate::network_restore_server::{
    NetworkRestoreServer, RESTORE_SERVICE_NAME,
};
use crate::node_state::{NodeState, NodeStateMachine};
use crate::rep_config::RepConfig;
use crate::rep_stats::RepStats;
use crate::state_change_listener::{StateChangeEvent, StateChangeListener};
use crate::stream::feeder::EnvironmentLogScanner;
use crate::stream::feeder::Feeder;
use crate::stream::feeder::FeederRunner;
use crate::stream::peer_feeder::PeerScannerAdapter;
use crate::stream::peer_feeder::{
    PEER_FEEDER_SERVICE_NAME, PeerFeederService, PeerLogScanner,
};
use crate::stream::replica_stream::{EnvironmentLogWriter, ReplicaStream};
use crate::stream::syncup::{
    Matchpoint, RollbackDecision, find_matchpoint, verify_rollback,
};
use crate::stream::syncup_reader::VlsnIndexView;
use crate::vlsn::vlsn_index::VlsnIndex;
use crate::vlsn::vlsn_range::VlsnRange;
use std::collections::HashMap;

/// Default heartbeat timeout for master liveness detection.
///
/// Passed to `MasterTracker::new` when a `ReplicatedEnvironment` is built.
const DEFAULT_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(30);

/// Outcome of [`ReplicatedEnvironment::syncup_with_feeder`] — the action taken
/// by a live diverged-tail syncup. Port of the branch in JE
/// `ReplicaFeederSyncup.execute` between a soft rollback and a network restore.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SyncupAction {
    /// The divergent tail was rolled back to the matchpoint; resume streaming
    /// from `start_vlsn` (`matchpoint + 1`). `matchpoint_vlsn == last VLSN`
    /// means the replica was not diverged and nothing was truncated.
    RolledBack { matchpoint_vlsn: u64, start_vlsn: u64 },
    /// No safe rollback (no common matchpoint, or it would cross a committed
    /// txn); the replica must do a full network restore.
    NeedsRestore,
}

/// A replicated database environment: a single node in a replication group.
///
/// This is the entry point for replication. It wraps a standard Environment
/// and adds replication capabilities including master election, replica
/// streaming, and commit acknowledgments.
///
/// High Availability (HA) provides a replicated, embedded database
/// management system which provides fast, reliable, and scalable data
/// management. HA enables replication of an environment across a Replication
/// Group. A `ReplicatedEnvironment` is a single node in the replication group.
///
/// `ReplicatedEnvironment` wraps a standard `Environment`. All database
/// operations are executed in the same fashion in both replicated and
/// non-replicated applications. A `ReplicatedEnvironment` must be
/// transactional. All replicated databases created in the replicated
/// environment must be transactional as well.
///
/// A `ReplicatedEnvironment` is meant to join its replication group when it
/// is opened. [`ReplicatedEnvironment::new`] constructs the node and starts
/// its service dispatcher; [`ReplicatedEnvironment::open`] additionally
/// spawns the election driver that resolves the node into Master or Replica
/// state.
///
/// Replicated environments can be created with node type Electable or
/// Secondary. Electable nodes can be masters or replicas, and participate in
/// both master elections and commit durability decisions. Secondary nodes can
/// only be replicas, not masters, and do not participate in either elections or
/// durability decisions.
///
/// # Example
///
/// ```ignore
/// use noxu_rep::{ReplicatedEnvironment, RepConfig};
///
/// let config = RepConfig::builder("my_group", "node1", "localhost")
///     .node_port(5001)
///     .build();
/// let rep_env = ReplicatedEnvironment::new(config).unwrap();
/// ```
pub struct ReplicatedEnvironment {
    /// The replication configuration for this node.
    config: RepConfig,
    /// Tracks the current node state (Detached, Unknown, Master, Replica).
    node_state: NodeStateMachine,
    /// Service for managing the replication group membership.
    group_service: GroupService,
    /// Maps VLSNs to log file positions.
    ///
    /// Wrapped in `Arc` so that background daemons (election driver,
    /// VLSN-index persistence flusher) can share access without
    /// borrowing the env.  Closes finding F11 of the 2026 review.
    vlsn_index: Arc<VlsnIndex>,
    /// Tracks acknowledgments from replicas (used by master).
    ack_tracker: AckTracker,
    /// Replication statistics.
    stats: RepStats,
    /// Active feeder threads (master -> replica streams).
    feeders: RwLock<Vec<Feeder>>,
    /// Replica stream for receiving updates from the master.
    replica_stream: ReplicaStream,
    /// Tracks the current master node.
    master_tracker: MasterTracker,
    /// State change listeners.
    listeners: RwLock<Vec<Arc<dyn StateChangeListener>>>,
    /// Shutdown flag.
    shutdown: AtomicBool,
    /// Service dispatcher — listens on the replication port and routes
    /// incoming connections to the appropriate service handler (feeder, etc.).
    ///
    /// `Plain`: plain TCP (default / Phase-2 behaviour).
    /// `Tls`: TLS + mTLS enforcement (Phase 3, when `RepConfig::tls_config` is set
    /// and `transport_kind` is `Tls`).
    ///
    /// `None` when `new` cannot obtain a socket address from the configured
    /// host/port, or when the dispatcher fails to start; both cases are
    /// logged as warnings rather than returned as errors.
    tcp_dispatcher: Option<AnyServiceDispatcher>,
    /// The address the `tcp_dispatcher` is actually bound to (may differ from
    /// the configured port when port 0 is used in tests).  `None` exactly
    /// when `tcp_dispatcher` is `None`.
    bound_addr: Option<SocketAddr>,

    /// Optional live `EnvironmentImpl` wired in via `with_environment`.
    ///
    /// When set, `become_master` spawns a `FeederRunner` per replica using
    /// `EnvironmentLogScanner`, and `become_replica` spawns a
    /// `ReplicaReceiver` thread using `EnvironmentLogWriter`.
    ///
    /// Starts out as `None`; the VLSN persistence daemon also reads it to
    /// find the last checkpoint end LSN.
    env_impl: StdMutex<Option<Arc<EnvironmentImpl>>>,

    /// Background I/O thread handles spawned during state transitions and by
    /// the VLSN persistence daemon.
    ///
    /// Stored so that `close()` can join them cleanly.  The handles are kept
    /// directly in the `Vec` (they are not wrapped in `Option`).
    /// NOTE(review): `close()` presumably drains this vector to join the
    /// threads — confirm against `close()`.
    io_threads: StdMutex<Vec<std::thread::JoinHandle<()>>>,

    /// Shutdown flag shared with I/O threads so they terminate when the
    /// environment is closed.
    ///
    /// Wrapped in an `Arc` so the replica receive thread (which connects to
    /// an upstream feeder via `catch_up_from_peer`) can poll it directly and
    /// break out of its blocking receive loop on close — otherwise a node
    /// whose upstream stays connected (e.g. a mid-tier replica in a chain,
    /// closed before its upstream) would never observe the close and
    /// `close()`'s thread-join would hang.
    io_shutdown: Arc<AtomicBool>,

    /// Whether the RESTORE service has been registered on the TCP dispatcher.
    ///
    /// When `config.env_home` is `None` at construction time, registration is
    /// deferred until `with_environment()` provides the env home path.
    restore_registered: AtomicBool,

    /// In-memory log queue used by the peer feeder service.
    ///
    /// When this node is a replica, `apply_entry()` pushes each received log
    /// entry here.  The `PeerFeederService` registered on the TCP dispatcher
    /// reads from this queue to stream entries to downstream replicas that
    /// are behind this node (peer-to-peer log distribution, HA style).
    peer_scanner: Arc<PeerLogScanner>,

    /// Durable Transaction VLSN (D7, JE RepNode.dtvlsn): the highest VLSN
    /// known to have been replicated to a *majority* of the electable
    /// replicas. On a master it is computed from feeder ack/heartbeat progress
    /// (`update_dtvlsn_from_feeders`); on a replica it is set from commit/abort
    /// records in the stream (`set_dtvlsn`). It advances monotonically (an
    /// `update_max`). 0 = NULL_VLSN. Used by the election ranking (D2) so the
    /// most-durable node, not merely the highest-raw-VLSN node, wins.
    dtvlsn: std::sync::atomic::AtomicU64,

    /// Shared acceptor state used by the ELECTION service handler.
    /// The election driver updates `own_vlsn` / `own_term` here as the
    /// node progresses; incoming acceptor sessions read it on every
    /// connection so their replies always reflect the local node's
    /// most recent state.  Closes finding F6.
    election_state: Arc<ElectionAcceptorState>,

    /// Self-referential `Weak` populated once the env has been wrapped
    /// in an `Arc`.  Used by the replica I/O thread spawned in
    /// `become_replica` so it can call `bootstrap_via_dispatcher` when
    /// the master signals `NeedsRestore`.
    ///
    /// Populated lazily via [`Self::init_self_weak`] from `open()` and
    /// the test harness.  When unset (callers that build the env via
    /// raw `Arc::new(Self::new(...))` and never call `init_self_weak`)
    /// the I/O thread falls back to operator-driven bootstrap.
    self_weak: OnceLock<Weak<Self>>,

    // -----------------------------------------------------------------------
    // C-C2: active push-feeder infrastructure
    // -----------------------------------------------------------------------
    /// Per-replica channels injected via [`Self::register_feeder_channel`].
    ///
    /// When [`Self::become_master`] is called (or when the node is already
    /// master), a [`FeederRunner`] thread is spawned for each registered
    /// channel, actively streaming entries to that replica over the channel.
    ///
    /// Using `register_feeder_channel` is the primary integration point for
    /// the push-based feeder path.  Production deployments wire in a
    /// `TcpChannel`; test code uses `LocalChannelPair`.
    feeder_channels: StdMutex<HashMap<String, Arc<dyn crate::net::Channel>>>,

    /// Per-replica dedicated entry queues backing the push-feeder path.
    ///
    /// Each `FeederRunner` thread reads exclusively from its replica's queue.
    /// [`Self::replicate_entry`] and [`Self::apply_entry`] fan out into all
    /// registered queues so the push runners receive entries without competing
    /// with [`PeerFeederService`] for ownership of `peer_scanner`.
    feeder_queues: std::sync::RwLock<HashMap<String, Arc<PeerLogScanner>>>,

    /// Active `FeederRunner` references for acked-VLSN queries and
    /// clean shutdown (M-4: wait for replicas to catch up).
    active_feeder_runners: StdMutex<HashMap<String, Arc<FeederRunner>>>,

    /// Monotone VLSN counter shared with the wired `EnvironmentImpl`.
    ///
    /// Installed into the environment via
    /// `EnvironmentImpl::set_replication_vlsn_counter()` when
    /// `with_environment` is called.  Each `log_txn_commit` on the master
    /// atomically increments this counter and writes a VLSN-tagged WAL entry,
    /// which `EnvironmentLogScanner` then picks up without any
    /// `replicate_entry` call from the application.
    wal_vlsn_counter: Arc<std::sync::atomic::AtomicU64>,

    /// Count of downstream connections this node has served via the JE
    /// `Feeder`/`MasterFeederSource` mechanism (`FeederRunner +
    /// EnvironmentLogScanner` reading this node's WAL).  Shared with the
    /// node's [`crate::stream::peer_feeder::PeerFeederService`] when a WAL
    /// source is registered (master in `become_master`, or a cascading
    /// replica in `become_replica`).  A non-zero value PROVES this node fed
    /// a downstream by the SAME mechanism the master uses — the cascade does
    /// not diverge.  See [`Self::wal_feeds_served`].
    wal_feeds_served: Arc<std::sync::atomic::AtomicU64>,

    /// REP-10 (C): the replica-side consistency tracker, built from the
    /// REP-7 `last_applied_vlsn` handle when the replica replay thread starts
    /// (`become_replica`).  `None` on a master or before replay is wired.
    ///
    /// A read that begins on a replica with a non-`NoConsistency` policy waits
    /// on this tracker (`begin_read_consistency`).  Port of
    /// `RepImpl.getConsistency` / `Replica.getConsistencyTracker`.
    consistency_tracker: StdMutex<Option<crate::ConsistencyTracker>>,
}

impl ReplicatedEnvironment {
    /// Create a new replicated environment.
    ///
    ///
    ///
    /// Creates a replicated environment handle and starts participating in the
    /// replication group. The node's state is determined when it joins the
    /// group, and mastership is not preconfigured. If the group has no current
    /// master, creation will trigger an election to determine whether this node
    /// will participate as a Master or a Replica.
    ///
    /// A brand new node will always join an existing group as a Replica, unless
    /// it is the very first electable node that is creating the group. In that
    /// case it joins as the Master of the newly formed singleton group.
    pub fn new(config: RepConfig) -> Result<Self> {
        // mTLS Phase 2 (v3.1.0): peer_allowlist enforcement is real at the
        // TLS channel layer (TlsTcpChannelListener::bind_with_tls_and_allowlist).
        // Phase 3 (this release): when RepConfig::tls_config is set AND
        // transport_kind is Tls, the service dispatcher itself enforces mTLS
        // via TlsTcpServiceDispatcher.  For the remaining cases (no TlsConfig
        // or non-TLS transport) keep the Phase-2 accurate warn.
        if !config.peer_allowlist.is_empty() {
            match config.transport_kind {
                crate::rep_config::RepTransportKind::Tls => {
                    if config.tls_config.is_some() {
                        log::info!(
                            "[{}] peer_allowlist ({} entries) + tls_config set; \
                             mTLS will be enforced on the service dispatcher.",
                            config.node_name,
                            config.peer_allowlist.len(),
                        );
                    } else {
                        log::info!(
                            "[{}] peer_allowlist configured ({} entries) but \
                             tls_config is None — the service dispatcher will \
                             use plain TCP. Set RepConfig::tls_config to \
                             activate end-to-end mTLS on this path.",
                            config.node_name,
                            config.peer_allowlist.len(),
                        );
                    }
                }
                _ => {
                    log::warn!(
                        "[{}] peer_allowlist is configured ({} entries) but \
                         transport_kind is not Tls — the allowlist has no \
                         effect without TLS transport. Set \
                         RepTransportKind::Tls to activate mTLS enforcement.",
                        config.node_name,
                        config.peer_allowlist.len(),
                    );
                }
            }
        }
        let node_state = NodeStateMachine::new();
        let group_service = GroupService::new(config.group_name.clone());
        let vlsn_index = {
            // F11: try to load a previously persisted vlsn.idx from
            // env_home if one exists.  A successfully loaded index lets a
            // restarted replica resume from where it left off without a
            // full network restore; a missing or corrupt file falls back
            // to a fresh in-memory index (caller will need to bootstrap).
            if let Some(ref home) = config.env_home {
                match crate::vlsn::persist::load_from_disk(home) {
                    Ok(Some(idx)) => {
                        log::info!(
                            "Node '{}' loaded persisted VLSN index from {} \
                             ({} entries, latest vlsn={})",
                            config.node_name,
                            home.display(),
                            idx.snapshot_entries().len(),
                            idx.get_latest_vlsn(),
                        );
                        Arc::new(idx)
                    }
                    Ok(None) => Arc::new(VlsnIndex::new(10)),
                    Err(e) => {
                        log::warn!(
                            "Node '{}' failed to load persisted VLSN index \
                             from {}: {} (treating as fresh node — network \
                             restore required)",
                            config.node_name,
                            home.display(),
                            e
                        );
                        // Best-effort: remove the corrupt file so the
                        // next persist cycle writes a clean one.  A
                        // missing file is the "fresh node" baseline.
                        let _ = std::fs::remove_file(
                            crate::vlsn::persist::index_path(home),
                        );
                        Arc::new(VlsnIndex::new(10))
                    }
                }
            } else {
                Arc::new(VlsnIndex::new(10))
            }
        };
        let ack_tracker = AckTracker::new();
        let stats = RepStats::new();
        let feeders = RwLock::new(Vec::new());
        let replica_stream = ReplicaStream::new();
        let master_tracker = MasterTracker::new(DEFAULT_HEARTBEAT_TIMEOUT);

        // Start the service dispatcher.
        //
        // Phase 3: when RepConfig::tls_config is set AND transport_kind is Tls,
        // start a TlsTcpServiceDispatcher (mTLS enforced).  Otherwise fall back
        // to the plain-TCP TcpServiceDispatcher.
        let listen_addr_str =
            format!("{}:{}", config.node_host, config.node_port);
        let mut restore_registered_init = false;

        // Returns (AnyServiceDispatcher, bound_addr) or (None, None) on error.
        let (tcp_dispatcher, bound_addr) = match listen_addr_str
            .parse::<SocketAddr>()
        {
            Ok(addr) => {
                let build_result: Result<(AnyServiceDispatcher, SocketAddr)> =
                    Self::build_dispatcher(&config, addr);
                match build_result {
                    Ok((dispatcher, bound)) => {
                        // Register the network restore handler.
                        if let Some(ref home) = config.env_home {
                            let restore_server =
                                NetworkRestoreServer::new(home.clone());
                            dispatcher.register(
                                RESTORE_SERVICE_NAME,
                                Arc::new(restore_server),
                            );
                            log::debug!(
                                "Node '{}' RESTORE service registered \
                                     (env_home={})",
                                config.node_name,
                                home.display(),
                            );
                            restore_registered_init = true;
                        }
                        let kind =
                            if dispatcher.is_tls() { "TLS" } else { "TCP" };
                        log::info!(
                            "Node '{}' {} service dispatcher started on {}",
                            config.node_name,
                            kind,
                            bound
                        );
                        (Some(dispatcher), Some(bound))
                    }
                    Err(e) => {
                        log::warn!(
                            "Node '{}' failed to start service dispatcher \
                             on {}: {}",
                            config.node_name,
                            listen_addr_str,
                            e
                        );
                        (None, None)
                    }
                }
            }
            Err(e) => {
                log::warn!(
                    "Node '{}' cannot parse listen address '{}': {}",
                    config.node_name,
                    listen_addr_str,
                    e
                );
                (None, None)
            }
        };

        // Build the in-memory peer log scanner; register the peer feeder
        // service on the dispatcher so downstream replicas can connect.
        let peer_scanner = Arc::new(PeerLogScanner::new());
        // F5/F31: build the acceptor state with persistence enabled when
        // env_home is configured.  Crash-durable promises are required
        // for the Paxos safety invariant after a process restart.
        let election_state =
            Arc::new(if let Some(ref home) = config.env_home {
                ElectionAcceptorState::with_env_home(
                    config.node_name.clone(),
                    1,
                    home,
                )
            } else {
                ElectionAcceptorState::new(config.node_name.clone(), 1)
            });
        if let Some(ref dispatcher) = tcp_dispatcher {
            let service = PeerFeederService::new(Arc::clone(&peer_scanner));
            dispatcher.register(PEER_FEEDER_SERVICE_NAME, Arc::new(service));
            log::debug!(
                "Node '{}' PEER_FEEDER service registered",
                config.node_name,
            );
            // F6: register the ELECTION service so peers can run
            // run_acceptor against this node when proposing.
            let election_svc =
                Arc::new(ElectionService::new(Arc::clone(&election_state)));
            dispatcher.register(ELECTION_SERVICE_NAME, election_svc);
            log::debug!(
                "Node '{}' ELECTION service registered",
                config.node_name,
            );
        }

        let env = Self {
            config,
            node_state,
            group_service,
            vlsn_index,
            ack_tracker,
            stats,
            feeders,
            replica_stream,
            master_tracker,
            listeners: RwLock::new(Vec::new()),
            shutdown: AtomicBool::new(false),
            tcp_dispatcher,
            bound_addr,
            env_impl: StdMutex::new(None),
            io_threads: StdMutex::new(Vec::new()),
            io_shutdown: Arc::new(AtomicBool::new(false)),
            restore_registered: AtomicBool::new(restore_registered_init),
            peer_scanner,
            dtvlsn: std::sync::atomic::AtomicU64::new(0),
            election_state,
            self_weak: OnceLock::new(),
            feeder_channels: StdMutex::new(HashMap::new()),
            feeder_queues: std::sync::RwLock::new(HashMap::new()),
            active_feeder_runners: StdMutex::new(HashMap::new()),
            wal_vlsn_counter: Arc::new(std::sync::atomic::AtomicU64::new(0)),
            wal_feeds_served: Arc::new(std::sync::atomic::AtomicU64::new(0)),
            consistency_tracker: StdMutex::new(None),
        };

        Ok(env)
    }

    /// Open a replicated environment with the standard production
    /// lifecycle.
    ///
    /// Constructs the node through [`ReplicatedEnvironment::new`], wraps it
    /// in an `Arc`, and then, in this order:
    ///
    /// 1. records the self-referential `Weak` (`init_self_weak`),
    /// 2. spawns the **election driver** background thread, which runs
    ///    Paxos rounds against known peers until the node has resolved into
    ///    either Master or Replica state,
    /// 3. spawns the VLSN-index persistence daemon,
    /// 4. registers the `ADMIN` service on the TCP dispatcher.
    ///
    /// Closes finding F6 of the 2026 review.
    ///
    /// Use [`ReplicatedEnvironment::new`] directly only when the
    /// caller plans to drive state transitions explicitly (test
    /// harnesses, scripted bootstrap, recovery tooling).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`ReplicatedEnvironment::new`].
    pub fn open(config: RepConfig) -> Result<Arc<Self>> {
        let node = Self::new(config).map(Arc::new)?;

        // The weak back-reference is set first so the background threads
        // started below can already see it.
        node.init_self_weak();
        node.start_election_driver();
        node.start_vlsn_persistence_daemon();
        node.register_admin_service();

        Ok(node)
    }

    /// Build and start the service dispatcher for this node.
    ///
    /// With the `tls-rustls` feature enabled and
    /// `config.transport_kind == Tls`, a
    /// [`crate::net::service_dispatcher::TlsTcpServiceDispatcher`] is
    /// started that enforces mTLS with the configured `peer_allowlist`.
    /// That path is fail-closed: a missing `tls_config` or an empty
    /// allowlist is reported as a `RepError::ConfigError` instead of
    /// downgrading to plain TCP.
    ///
    /// In every other case (non-TLS transport, or the `tls-rustls` feature
    /// compiled out) the plain-TCP [`TcpServiceDispatcher`] is started.
    ///
    /// Returns `(dispatcher, bound_addr)`, where `bound_addr` is the value
    /// returned by the dispatcher's `start()`.
    ///
    /// # Errors
    ///
    /// * `RepError::ConfigError` for the TLS misconfigurations above.
    /// * `RepError::NetworkError` when the plain-TCP dispatcher cannot be
    ///   created.
    /// * Whatever the dispatcher constructors / `start()` return otherwise.
    fn build_dispatcher(
        #[cfg_attr(not(feature = "tls-rustls"), allow(unused_variables))]
        config: &RepConfig,
        addr: SocketAddr,
    ) -> Result<(AnyServiceDispatcher, SocketAddr)> {
        #[cfg(feature = "tls-rustls")]
        if config.transport_kind == crate::rep_config::RepTransportKind::Tls {
            use crate::auth::PeerAllowlist;
            use crate::net::service_dispatcher::TlsTcpServiceDispatcher;

            let Some(tls_settings) = config.tls_config.as_ref() else {
                return Err(RepError::ConfigError(format!(
                    "node '{}': transport_kind=Tls requires a tls_config",
                    config.node_name,
                )));
            };

            let permitted_peers =
                PeerAllowlist::new(config.peer_allowlist.iter().cloned());
            // Fail-closed: an empty allowlist with TLS transport is a
            // misconfiguration. The same policy is enforced at the TLS
            // listener and QUIC constructors; downgrading to plain TCP here
            // would be a silent security regression for a node that asked
            // for TLS.
            if permitted_peers.is_empty() {
                return Err(RepError::ConfigError(format!(
                    "node '{}': transport_kind=Tls requires a non-empty \
                     peer_allowlist (mTLS enforcement is fail-closed)",
                    config.node_name,
                )));
            }

            let tls_dispatcher = TlsTcpServiceDispatcher::new(
                addr,
                tls_settings,
                permitted_peers,
            )?;
            let listening_on = tls_dispatcher.start()?;
            return Ok((AnyServiceDispatcher::Tls(tls_dispatcher), listening_on));
        }

        // Plain-TCP dispatcher: reached for non-TLS transports, and for any
        // transport when the `tls-rustls` feature is compiled out.
        let plain_dispatcher = TcpServiceDispatcher::new(addr).map_err(|e| {
            RepError::NetworkError(format!("TCP dispatcher init: {e}"))
        })?;
        let listening_on = plain_dispatcher.start()?;
        Ok((AnyServiceDispatcher::Plain(plain_dispatcher), listening_on))
    }

    /// Record a `Weak` back-reference to this env inside the env itself.
    ///
    /// Background threads use the back-reference for auto-orchestrated
    /// follow-up actions (e.g. replica auto-bootstrap on `NeedsRestore`).
    ///
    /// Idempotent: the value lives in a [`OnceLock`], which accepts only the
    /// first `set`; later calls change nothing and report nothing.
    ///
    /// Callers that wrap the env in `Arc` and want auto-bootstrap
    /// behaviour should call this immediately after construction.
    /// `Self::open` already does so.  Test harnesses that drive
    /// transitions manually (`RepTestBase`) also call this so the
    /// auto-bootstrap path is exercised in tests.
    pub fn init_self_weak(self: &Arc<Self>) {
        let back_reference = Arc::downgrade(self);
        // `set` returns `Err` when a value is already stored; that is the
        // expected outcome of a repeated call, so the result is discarded.
        let _ = self.self_weak.set(back_reference);
    }

    /// Register the `ADMIN` service handler on the TCP dispatcher.
    ///
    /// Closes findings F7 / F8.  The handler is given a `Weak<Self>`, so it
    /// does not extend the env's lifetime.  Does nothing when the node has
    /// no dispatcher (`tcp_dispatcher` is `None`).
    ///
    /// Calling this more than once is intended to be harmless.
    /// NOTE(review): that relies on the dispatcher's `register` overwriting
    /// an existing handler, which is not visible from here — confirm.
    pub fn register_admin_service(self: &Arc<Self>) {
        // Guard clause: a node without a dispatcher has nowhere to register.
        let Some(dispatcher) = self.tcp_dispatcher.as_ref() else {
            return;
        };

        let handler_owner = Arc::downgrade(self);
        crate::group_admin::register_admin_service(dispatcher, handler_owner);
        log::debug!(
            "Node '{}' ADMIN service registered",
            self.config.node_name,
        );
    }

    /// Spawn the VLSN-index persistence daemon (F11).
    ///
    /// Every 2 seconds the daemon snapshots the in-memory `VlsnIndex` to
    /// disk under `env_home` (via `flush_to_disk_capped`) so a clean restart
    /// can resume from where the replica left off without a full network
    /// restore.  No-op when `config.env_home` is `None`.
    ///
    /// Each flush is capped at the end LSN of the last checkpoint (or
    /// `NULL_LSN` when no environment / checkpointer is wired).  A cycle is
    /// skipped only when *both* the latest VLSN and that cap are unchanged
    /// since the last successful flush: entries withheld by an earlier cap
    /// must still be written once a later checkpoint makes them durable,
    /// even if no new VLSN has arrived in the meantime.
    ///
    /// One final capped flush is attempted when the loop exits on shutdown.
    ///
    /// The daemon thread holds a strong `Arc<Self>`, so the env stays alive
    /// until one of the shutdown flags is observed.
    ///
    /// Idempotent: if `io_threads` already holds a thread whose name starts
    /// with `noxu-vlsn-flush-`, nothing is spawned.
    /// NOTE(review): the check and the spawn are not performed under one
    /// lock, so two concurrent first calls could both spawn — confirm that
    /// callers never race here.
    ///
    /// # Panics
    ///
    /// Panics if the thread cannot be spawned or if the `io_threads` mutex
    /// is poisoned.
    pub fn start_vlsn_persistence_daemon(self: &Arc<Self>) {
        let Some(home) = self.config.env_home.clone() else {
            return;
        };
        {
            let threads = self.io_threads.lock().unwrap();
            if threads.iter().any(|h| {
                h.thread()
                    .name()
                    .is_some_and(|n| n.starts_with("noxu-vlsn-flush-"))
            }) {
                return;
            }
        }

        let vlsn_index = Arc::clone(&self.vlsn_index);
        let name = format!("noxu-vlsn-flush-{}", self.config.node_name);
        let me = Arc::clone(self);
        let interval = Duration::from_secs(2);

        let handle = std::thread::Builder::new()
            .name(name)
            .spawn(move || {
                use std::sync::atomic::Ordering;

                // X-2: cap every flush at the last durable checkpoint's end
                // LSN so the persisted VLSN index never claims VLSNs beyond
                // the durable B-tree state.  After a crash the recovered
                // tree and the index will be coherent.
                let checkpoint_cap = || {
                    // Bind to a local so the mutex guard is released at the
                    // end of this statement, before the value is returned.
                    let cap = me
                        .env_impl
                        .lock()
                        .unwrap()
                        .as_ref()
                        .and_then(|e| e.get_checkpointer())
                        .map(|c| c.get_last_checkpoint_end())
                        .unwrap_or(noxu_util::NULL_LSN);
                    cap
                };

                let mut last_persisted_vlsn: u64 = 0;
                // Cap used by the last successful flush; `None` until the
                // first flush succeeds.
                let mut last_persisted_cap = None;

                while !me.io_shutdown.load(Ordering::SeqCst)
                    && !me.is_shutdown()
                {
                    std::thread::sleep(interval);
                    if me.io_shutdown.load(Ordering::SeqCst) {
                        break;
                    }
                    let latest = vlsn_index.get_latest_vlsn();
                    let cap_lsn = checkpoint_cap();

                    // A moved cap means entries that were withheld last time
                    // may now be persistable, so it forces a flush even when
                    // no new VLSN has arrived.
                    let cap_moved =
                        last_persisted_cap.is_some_and(|c| c != cap_lsn);
                    if latest == last_persisted_vlsn && !cap_moved {
                        // Nothing new to flush.
                        continue;
                    }

                    match crate::vlsn::persist::flush_to_disk_capped(
                        &vlsn_index,
                        &home,
                        cap_lsn,
                    ) {
                        Ok(n) => {
                            log::trace!(
                                "vlsn-flush: persisted {} entries (latest vlsn={}, cap_lsn={:?})",
                                n,
                                latest,
                                cap_lsn,
                            );
                            last_persisted_vlsn = latest;
                            last_persisted_cap = Some(cap_lsn);
                        }
                        Err(e) => {
                            // Bookkeeping is left untouched so the next
                            // cycle retries the flush.
                            log::warn!(
                                "vlsn-flush: failed to persist VLSN index to {}: {}",
                                home.display(),
                                e
                            );
                        }
                    }
                }
                // Final flush on shutdown so a clean close is recoverable.
                // Cap at the last checkpoint even for the shutdown flush.
                let cap_lsn = checkpoint_cap();
                if let Err(e) = crate::vlsn::persist::flush_to_disk_capped(
                    &vlsn_index,
                    &home,
                    cap_lsn,
                ) {
                    log::warn!(
                        "vlsn-flush (final): failed to persist VLSN index: {}",
                        e
                    );
                }
            })
            .expect("failed to spawn noxu-vlsn-flush thread");

        self.io_threads.lock().unwrap().push(handle);
        log::debug!(
            "Node '{}' VLSN persistence daemon started",
            self.config.node_name,
        );
    }

    /// Spawn the election driver background thread.
    ///
    /// The thread is named `noxu-election-<node_name>` and runs
    /// [`Self::run_election_loop`]: while the node is neither `Master` nor
    /// `Replica` and no other master is known, it periodically attempts an
    /// election against the peers listed in `GroupService`.  On success the
    /// loop calls `become_master` (if this node won) or `become_replica`
    /// (otherwise).  After every round it sleeps
    /// `min(config.election_timeout, 500 ms)` before trying again.
    ///
    /// The loop observes `io_shutdown`, so it exits once the env is closed.
    ///
    /// Idempotent: if `io_threads` already holds a handle whose thread name
    /// starts with `noxu-election-`, the call is a no-op.  The check, the
    /// spawn and the registration of the new handle all happen under one
    /// `io_threads` lock, so two concurrent callers cannot both spawn a
    /// driver.
    ///
    /// # Panics
    ///
    /// Panics if the thread cannot be spawned or if the `io_threads` mutex
    /// is poisoned.
    pub fn start_election_driver(self: &Arc<Self>) {
        // Hold the lock from the duplicate check through the push so the
        // "is a driver already registered?" test and the registration are
        // atomic with respect to other callers.
        let mut threads = self.io_threads.lock().unwrap();
        let already_running = threads.iter().any(|h| {
            h.thread()
                .name()
                .is_some_and(|n| n.starts_with("noxu-election-"))
        });
        if already_running {
            return;
        }

        let me = Arc::clone(self);
        let name = format!("noxu-election-{}", self.config.node_name);
        let spawned = std::thread::Builder::new().name(name).spawn(move || {
            me.run_election_loop();
        });
        match spawned {
            Ok(handle) => threads.push(handle),
            Err(e) => {
                // Release the guard first so the panic does not poison
                // `io_threads` for the shutdown path.
                drop(threads);
                panic!("failed to spawn election driver thread: {e:?}");
            }
        }
        drop(threads);

        log::debug!("Node '{}' election driver started", self.config.node_name,);
    }

    /// Body of the election driver loop; runs on the thread spawned by
    /// [`Self::start_election_driver`].
    ///
    /// Each pass of the loop:
    ///
    /// 1. Returns if `io_shutdown` is set, `is_shutdown()` is true, or the
    ///    node state is `Shutdown`.
    /// 2. Idles (200 ms poll) while the node is already `Master` or
    ///    `Replica`.
    /// 3. If `master_tracker` names a master other than this node, calls
    ///    `become_replica` on it instead of electing.  A failure is logged
    ///    and followed by a 200 ms back-off so a persistently failing
    ///    transition cannot spin the loop.
    /// 4. Otherwise dials every other node's ELECTION service, runs one
    ///    election round and applies the outcome through `become_master` /
    ///    `become_replica`.
    /// 5. Bumps the term and sleeps `min(config.election_timeout, 500 ms)`.
    fn run_election_loop(self: Arc<Self>) {
        use std::sync::atomic::Ordering;

        // Poll / back-off interval used whenever there is nothing to elect.
        const IDLE_POLL: Duration = Duration::from_millis(200);

        // Maintain an internal monotonically increasing election term.
        // Every completed round (won, lost, or no quorum) bumps the term so
        // retries do not collide with stale acceptor promises.
        let mut term: u64 = 1;

        loop {
            if self.io_shutdown.load(Ordering::SeqCst) {
                return;
            }
            if self.is_shutdown() {
                return;
            }

            let state = self.node_state.get_state();
            // Stop driving once we've resolved into Master/Replica;
            // re-arm only if the node leaves those states again.
            if matches!(state, NodeState::Master | NodeState::Replica) {
                std::thread::sleep(IDLE_POLL);
                continue;
            }
            if matches!(state, NodeState::Shutdown) {
                return;
            }

            // If a master other than ourselves is already recorded in
            // master_tracker, follow it rather than holding an election.
            if let Some(master_name) = self.master_tracker.get_master()
                && master_name != self.config.node_name
            {
                if let Err(e) = self.become_replica(&master_name) {
                    log::warn!(
                        "election driver: become_replica('{}') failed: {}",
                        master_name,
                        e
                    );
                    // Without this pause a failing transition would be
                    // retried in a tight loop, because the state stays
                    // outside Master/Replica and the tracker still names
                    // the same master.
                    std::thread::sleep(IDLE_POLL);
                }
                continue;
            }

            // Snapshot peers to dial for ELECTION.  Peers whose
            // "host:port" does not parse as a socket address are skipped.
            let peers: Vec<(String, SocketAddr)> = self
                .group_service
                .get_all_nodes()
                .into_iter()
                .filter(|n| n.name != self.config.node_name)
                .filter_map(|n| {
                    format!("{}:{}", n.host, n.port)
                        .parse::<SocketAddr>()
                        .ok()
                        .map(|a| (n.name, a))
                })
                .collect();

            // Build the local rep group view used by run_election to
            // compute quorum and resolve the winner name.  Include
            // self.
            let group = self.local_rep_group_with_self();

            // Update election state for any concurrent acceptor calls.
            let our_vlsn = self.vlsn_index.get_latest_vlsn();
            self.election_state.set_vlsn(our_vlsn);
            self.election_state.set_term(term);
            // D2: advertise our DTVLSN as the major election-ranking key.
            self.election_state.set_dtvlsn(self.get_dtvlsn());

            // Connect to each peer's ELECTION service.  Failures are
            // tolerated: a peer that doesn't answer simply contributes
            // no vote.  The election may still reach quorum in the
            // remaining peers.
            let mut channels: Vec<Arc<dyn crate::net::channel::Channel>> =
                Vec::new();
            for (peer_name, addr) in &peers {
                match crate::net::service_dispatcher::connect_to_service(
                    *addr,
                    ELECTION_SERVICE_NAME,
                ) {
                    Ok(ch) => {
                        let arc: Arc<dyn crate::net::channel::Channel> =
                            Arc::new(ch);
                        channels.push(arc);
                    }
                    Err(e) => {
                        log::trace!(
                            "election driver: peer {} ({}) unreachable: {}",
                            peer_name,
                            addr,
                            e
                        );
                    }
                }
            }

            // Resolve our own node_id from the group; if not present
            // we cannot run an election (closed-world guard — see F22).
            let self_node_id =
                group.get_node(&self.config.node_name).map(|n| n.node_id());
            let self_node_id = match self_node_id {
                Some(id) => id,
                None => {
                    log::warn!(
                        "election driver: node '{}' not registered in \
                         own group view; sleeping",
                        self.config.node_name
                    );
                    std::thread::sleep(IDLE_POLL);
                    continue;
                }
            };

            log::debug!(
                "election driver on '{}': starting term={} with {} peers",
                self.config.node_name,
                term,
                channels.len(),
            );
            let outcome = crate::elections::paxos::run_election_with_phi_dtvlsn(
                self_node_id,
                &self.config.node_name,
                &group,
                &channels,
                our_vlsn,
                /* priority */ 1,
                term,
                /* own_dtvlsn (D2 major ranking key) */
                self.get_dtvlsn(),
                None,
                std::time::Duration::from_millis(500),
            );

            match outcome {
                Some(winner_id) if winner_id == self_node_id => {
                    if let Err(e) = self.become_master(term) {
                        log::warn!(
                            "election driver: become_master failed: {}",
                            e
                        );
                    } else {
                        log::info!(
                            "election driver: '{}' became master at term {}",
                            self.config.node_name,
                            term,
                        );
                    }
                }
                Some(winner_id) => {
                    if let Some(winner_node) = group
                        .get_nodes()
                        .into_iter()
                        .find(|n| n.node_id() == winner_id)
                    {
                        if let Err(e) = self.become_replica(&winner_node.name) {
                            log::warn!(
                                "election driver: become_replica failed: {}",
                                e
                            );
                        } else {
                            log::info!(
                                "election driver: '{}' became replica of '{}' at term {}",
                                self.config.node_name,
                                winner_node.name,
                                term,
                            );
                        }
                    } else {
                        // The round produced a winner we cannot map to a
                        // name; surface it instead of silently retrying.
                        log::warn!(
                            "election driver: winner node_id {} not present \
                             in local group view",
                            winner_id,
                        );
                    }
                }
                None => {
                    log::debug!(
                        "election driver on '{}' term={}: no quorum",
                        self.config.node_name,
                        term,
                    );
                }
            }

            term = term.saturating_add(1);
            // Back off so we don't pin the loop on transient failures.
            // Note the pause is capped at 500 ms even when
            // `election_timeout` is larger.
            std::thread::sleep(
                self.config.election_timeout.min(Duration::from_millis(500)),
            );
        }
    }

    /// Internal: build a `RepGroup` snapshot that is guaranteed to contain
    /// an entry for this node.
    ///
    /// Starts from [`Self::get_rep_group`].  When the snapshot already has a
    /// node with our configured name it is returned untouched; otherwise a
    /// `RepNode` describing the local node is appended with a node id
    /// derived from a hash of the node name.
    fn local_rep_group_with_self(&self) -> crate::rep_group::RepGroup {
        use std::hash::{Hash, Hasher};

        let mut view = self.get_rep_group();
        if view.get_node(&self.config.node_name).is_some() {
            return view;
        }

        // Derive the id from the node name so repeated calls yield the same
        // value.  Only the low 32 bits of the hash are kept, and the lowest
        // bit is forced on, so the result can never be 0 (reserved for
        // "unknown").
        let mut name_hasher = std::collections::hash_map::DefaultHasher::new();
        self.config.node_name.hash(&mut name_hasher);
        let derived_id = (name_hasher.finish() as u32) | 1;

        // The group_service does not auto-register the local node, so add
        // it to the view ourselves.
        let mut local = crate::rep_node::RepNode::new(
            self.config.node_name.clone(),
            self.config.node_type,
            self.config.node_host.clone(),
            self.config.node_port,
            /* node_id */ 0,
        );
        local.node_id = derived_id;
        view.add_node(local);
        view
    }

    /// Return the socket address the TCP service dispatcher is bound to.
    ///
    /// This may differ from the configured `node_port` when port 0 is used
    /// (the OS assigns an ephemeral port). Returns `None` if the dispatcher
    /// could not be started (e.g. the address is not resolvable).
    ///
    /// This is a plain copy of the stored `bound_addr` field; no I/O or
    /// locking happens here.
    pub fn bound_addr(&self) -> Option<SocketAddr> {
        self.bound_addr
    }

    /// Wire a live `EnvironmentImpl` into this replicated environment.
    ///
    /// After this call, state transitions (`become_master`, `become_replica`)
    /// will spawn real feeder/receiver I/O threads backed by the live log.
    ///
    /// Steps, in order:
    ///
    /// 1. If the RESTORE service has not been registered yet (the
    ///    `restore_registered` flag is false) and a TCP dispatcher exists,
    ///    register a `NetworkRestoreServer` rooted at the environment's
    ///    actual home path and set the flag.  This mirrors
    ///    `RepNode.envSetup()`, which registers the restore handler during
    ///    environment wiring.
    /// 2. Re-register every `(vlsn, lsn)` pair in `env.recovery_vlsns` into
    ///    the in-memory VLSN index.
    /// 3. If recovery reported a rollback matchpoint, truncate the VLSN
    ///    index after the last recovered VLSN at or before that matchpoint.
    /// 4. Store `env` in `self.env_impl`.
    /// 5. Install `wal_vlsn_counter` on the environment.
    ///
    /// # Panics
    ///
    /// Panics if the `env_impl` mutex is poisoned.
    pub fn with_environment(&self, env: Arc<EnvironmentImpl>) {
        // Register RESTORE service lazily if not already done.
        if !self.restore_registered.load(Ordering::SeqCst)
            && let Some(ref dispatcher) = self.tcp_dispatcher
        {
            let env_home = env.get_env_home().to_path_buf();
            let restore_server = NetworkRestoreServer::new(env_home.clone());
            dispatcher.register(RESTORE_SERVICE_NAME, Arc::new(restore_server));
            self.restore_registered.store(true, Ordering::SeqCst);
            log::debug!(
                "Node '{}' RESTORE service registered via with_environment \
                 (env_home={})",
                self.config.node_name,
                env_home.display(),
            );
        }

        // X-14: rebuild the VLSN index from recovery-replayed LN entries.
        // After a crash the on-disk vlsn.idx may be stale (either ahead of
        // the recovered B-tree, or behind if vlsn.idx was not flushed
        // after the last checkpoint).  Re-registering all (vlsn, lsn) pairs
        // from the redo pass gives a consistent in-memory index.
        if !env.recovery_vlsns.is_empty() {
            log::info!(
                "Node '{}': rebuilding VLSN index from {} recovered entries",
                self.config.node_name,
                env.recovery_vlsns.len(),
            );
            for &(vlsn, lsn_u64) in &env.recovery_vlsns {
                let lsn = noxu_util::Lsn::from_u64(lsn_u64);
                self.vlsn_index.register(
                    vlsn,
                    lsn.file_number(),
                    lsn.file_offset(),
                );
            }
        }

        // X-1: truncate the VLSN index to the rollback matchpoint if recovery
        // detected a completed rollback period.  The matchpoint is the highest
        // LSN that is still valid after the rollback; entries with higher VLSNs
        // correspond to data that was rolled back and must not appear in the
        // index.
        if let Some(matchpoint_lsn_u64) = env.recovery_rollback_matchpoint {
            // Find the latest VLSN whose LSN is at or before the matchpoint
            // by scanning the recovered pairs from the back.
            // NOTE(review): this assumes `recovery_vlsns` is sorted
            // ascending — confirm against the recovery code.
            // NOTE(review): when no recovered pair qualifies (or the list is
            // empty) `safe_vlsn` falls back to 0, so `truncate_after(0)` is
            // called — confirm that dropping the whole index is intended.
            let safe_vlsn = env
                .recovery_vlsns
                .iter()
                .rev()
                .find(|&&(_, lsn_u64)| lsn_u64 <= matchpoint_lsn_u64)
                .map(|&(vlsn, _)| vlsn)
                .unwrap_or(0);
            log::info!(
                "Node '{}': truncating VLSN index after vlsn={} \
                 (rollback matchpoint lsn={:#x})",
                self.config.node_name,
                safe_vlsn,
                matchpoint_lsn_u64,
            );
            self.vlsn_index.truncate_after(safe_vlsn);
        }

        // Publish the environment only after the index has been repaired.
        *self.env_impl.lock().unwrap() = Some(Arc::clone(&env));

        // C-C2b: install the VLSN counter so log_txn_commit writes
        // VLSN-tagged headers.  When become_master then spawns an
        // EnvironmentLogScanner-backed FeederRunner, it will find these
        // entries and auto-feed them to replicas without any
        // replicate_entry call from the application.
        env.set_replication_vlsn_counter(Arc::clone(&self.wal_vlsn_counter));
    }

    /// Get the current node state.
    ///
    /// Returns the current state of the node associated with this replication
    /// environment, as reported by `node_state`. If the caller's intent is to
    /// track the state of the node, `StateChangeListener` may be a more
    /// convenient and efficient approach.
    pub fn get_state(&self) -> NodeState {
        self.node_state.get_state()
    }

    /// Report whether this node currently holds the master role.
    ///
    /// Returns `true` exactly when the current node state is
    /// `NodeState::Master`.
    pub fn is_master(&self) -> bool {
        matches!(self.node_state.get_state(), NodeState::Master)
    }

    /// Returns true if this node is an *authoritative* master (D4, JE
    /// `ElectionQuorum.isAuthoritativeMaster`): it is the group master AND it
    /// is still connected to enough replicas that, including itself, a
    /// SIMPLE_MAJORITY quorum is present.
    ///
    /// A master on the minority side of a network partition is NOT
    /// authoritative — it must not claim the special election ranking
    /// (`MASTER_RANKING`) nor (eventually) continue accepting writes, so the
    /// majority side can elect a fresh master without it competing
    /// (split-brain prevention).
    ///
    /// "Active replica count" = the number of currently-connected push-feeder
    /// runners serving *electable* peers (Monitors/Secondaries do not count
    /// toward the election quorum). `+ 1` for this master itself.
    ///
    /// The electable total is "electable peers other than this node, plus
    /// one for this node".  The local node is excluded by name before the
    /// `+ 1`, so it is counted exactly once whether or not it also appears
    /// in the group snapshot.
    ///
    /// # Panics
    ///
    /// Panics if the `active_feeder_runners` mutex is poisoned.
    pub fn is_authoritative_master(&self) -> bool {
        if !self.is_master() {
            return false;
        }
        let group = self.get_rep_group();

        // Electable peers, never including ourselves: other code paths treat
        // the local node as optionally present in the group view, so an
        // unconditional "+ 1" on the raw count could count it twice.
        let electable_peers: usize = group
            .get_nodes()
            .iter()
            .filter(|n| {
                n.node_type == crate::node_type::NodeType::Electable
                    && n.name != self.config.node_name
            })
            .count();
        // Total electable nodes = peers + this master.
        let electable_total = electable_peers + 1;

        // Active replicas = connected feeder runners whose peer is electable.
        let active_electable_replicas: usize = {
            let runners = self.active_feeder_runners.lock().unwrap();
            runners
                .keys()
                .filter(|name| {
                    group.get_node(name).is_some_and(|n| {
                        n.node_type == crate::node_type::NodeType::Electable
                    })
                })
                .count()
        };
        Self::authoritative_quorum_met(
            active_electable_replicas,
            electable_total,
        )
    }

    /// Pure SIMPLE_MAJORITY arithmetic behind `is_authoritative_master` (JE
    /// `ElectionQuorum.isAuthoritativeMaster`).
    ///
    /// The quorum size is `electable_total / 2 + 1` (integer division).  The
    /// nodes present are the active electable replicas plus the master
    /// itself.  Returns `true` when the nodes present reach the quorum size.
    fn authoritative_quorum_met(
        active_electable_replicas: usize,
        electable_total: usize,
    ) -> bool {
        // Master counts as one present node on top of its live replicas.
        let nodes_present = active_electable_replicas + 1;
        let needed = electable_total / 2 + 1;
        needed <= nodes_present
    }

    /// Report whether this node is currently acting as a replica.
    ///
    /// Returns `true` exactly when the current node state is
    /// `NodeState::Replica`.
    pub fn is_replica(&self) -> bool {
        matches!(self.node_state.get_state(), NodeState::Replica)
    }

    /// Returns true if the node is currently participating in the group
    /// as a Replica or a Master.
    ///
    /// Delegates to `NodeState::is_active` on the current state.
    pub fn is_active(&self) -> bool {
        self.node_state.get_state().is_active()
    }

    /// Name of this node.
    ///
    /// Borrows `config.node_name`, the unique name used to identify this
    /// replicated environment.
    pub fn get_node_name(&self) -> &str {
        &self.config.node_name
    }

    /// Name of the replication group.
    ///
    /// Borrows `config.group_name`, the name of the replication group this
    /// node belongs to.
    pub fn get_group_name(&self) -> &str {
        &self.config.group_name
    }

    /// Get the current master (if known).
    ///
    /// Returns the name of the node that is currently the master, or None
    /// if the master is not known (e.g. the node is in the Unknown or
    /// Detached state).
    ///
    /// The value is whatever `master_tracker` currently records; it is
    /// returned as an owned `String`.
    pub fn get_master_name(&self) -> Option<String> {
        self.master_tracker.get_master()
    }

    /// Get the replication group info.
    ///
    /// Returns a borrow of this node's `GroupService`, i.e. the replication
    /// group as known by this node.
    /// The replicated group metadata is stored in a replicated database and
    /// updates are propagated by the current master node to all replicas. If
    /// this node is not the master, it is possible for its description of the
    /// group to be out of date.
    pub fn get_group(&self) -> &GroupService {
        &self.group_service
    }

    /// Register a new peer in the replication group at runtime.
    ///
    /// A `NodeInfo` is built from `node` (marked active, `known_vlsn` 0, no
    /// log range, join / last-seen timestamps taken now) and handed to
    /// `GroupService::add_node`, so elections and quorum calculations
    /// reflect the new membership.
    ///
    /// F9: when this node is currently the master and the peer is
    /// `Electable` or `Secondary`, a `Feeder` tracker is added for it right
    /// away (unless one with the same replica name already exists) so
    /// AckTracker bookkeeping and downstream pull-based streaming work
    /// without a forced re-election.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `GroupService::add_node`; in that
    /// case nothing is logged and no feeder is created.
    pub fn add_peer(&self, node: crate::rep_node::RepNode) -> Result<()> {
        use crate::group_service::NodeInfo;
        use std::time::Instant;

        self.group_service.add_node(NodeInfo {
            name: node.name.clone(),
            node_type: node.node_type,
            host: node.host.clone(),
            port: node.port,
            node_id: node.node_id,
            joined_at: Instant::now(),
            last_seen: Instant::now(),
            is_active: true,
            known_vlsn: 0,
            log_range: None,
            read_capacity_pct: node.read_capacity_pct,
            write_capacity_pct: node.write_capacity_pct,
            latency_hint_ms: node.latency_hint_ms,
        })?;
        log::info!(
            "Node '{}': added peer '{}' ({}:{}) to group '{}'",
            self.config.node_name,
            node.name,
            node.host,
            node.port,
            self.config.group_name,
        );

        // Only a master tracks feeders, and only for peer types that
        // receive the replication stream.
        let is_stream_target = matches!(
            node.node_type,
            crate::node_type::NodeType::Electable
                | crate::node_type::NodeType::Secondary
        );
        if !(self.is_master() && is_stream_target) {
            return Ok(());
        }

        let mut feeders = self.feeders.write();
        let already_tracked =
            feeders.iter().any(|f| f.get_replica_name() == node.name);
        if already_tracked {
            return Ok(());
        }
        feeders.push(Feeder::new(node.name.clone()));
        log::debug!(
            "Node '{}' (master): dispatched Feeder for new peer '{}'",
            self.config.node_name,
            node.name,
        );
        Ok(())
    }

    /// Remove a peer node from the replication group by name.
    ///
    /// The node is deregistered from the `GroupService`.  Elections initiated
    /// after this call will not include the removed node in quorum calculations.
    ///
    /// NOTE(review): unlike `add_peer`, which may push a `Feeder` tracker for
    /// the peer, this method does not touch `feeders` — confirm that any
    /// tracker for the removed peer is cleaned up elsewhere.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `GroupService::remove_node`; nothing
    /// is logged in that case.
    pub fn remove_peer(&self, name: &str) -> Result<()> {
        self.group_service.remove_node(name)?;
        log::info!(
            "Node '{}': removed peer '{}' from group '{}'",
            self.config.node_name,
            name,
            self.config.group_name,
        );
        Ok(())
    }

    /// Update the capacity and latency metadata of an existing peer.
    ///
    /// The peer is selected by `name`.  Only three fields are read from
    /// `node` and forwarded to `GroupService::update_node_metadata`:
    ///   - `read_capacity_pct`
    ///   - `write_capacity_pct`
    ///   - `latency_hint_ms`
    ///
    /// Every other field of `node` (including its own name, address, port
    /// and node_type) is ignored by this method.
    ///
    /// NOTE(review): any rebuild of a `Flexible` / `Expression` quorum
    /// system after the change would have to happen inside
    /// `GroupService::update_node_metadata`; it is not performed here —
    /// confirm.
    ///
    /// # Note
    ///
    /// `update_peer_metadata` does not currently re-run
    /// `QuorumPolicy::validate(electable_count)` after the metadata
    /// change.  An LP-optimal `Expression` quorum that was safe before
    /// the update may no longer satisfy the intersection property
    /// afterwards.  Until automatic revalidation lands, deployments
    /// using `QuorumPolicy::Expression` should call
    /// `quorum_policy().validate(get_rep_group().electable_count())`
    /// after every metadata change and fail the operator-facing operation
    /// if validation reports unsafety.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// `GroupService::update_node_metadata`.
    pub fn update_peer_metadata(
        &self,
        name: &str,
        node: crate::rep_node::RepNode,
    ) -> Result<()> {
        // Pull out the only three values this method cares about.
        let read_cap = node.read_capacity_pct;
        let write_cap = node.write_capacity_pct;
        let latency_ms = node.latency_hint_ms;

        self.group_service
            .update_node_metadata(name, read_cap, write_cap, latency_ms)?;
        log::info!(
            "Node '{}': updated metadata for peer '{}' \
             (read_cap={}, write_cap={}, latency={}ms)",
            self.config.node_name,
            name,
            read_cap,
            write_cap,
            latency_ms,
        );
        Ok(())
    }

    /// Build a point-in-time `RepGroup` snapshot of the replication group.
    ///
    /// The group is created with the configured group name and the
    /// `GroupService` group id, then populated with one `RepNode` per entry
    /// returned by `GroupService::get_all_nodes`, carrying over each entry's
    /// capacity and latency metadata.  Later `add_peer` / `remove_peer`
    /// calls are not reflected in an already-returned snapshot.
    pub fn get_rep_group(&self) -> crate::rep_group::RepGroup {
        use crate::rep_group::RepGroup;

        let mut snapshot = RepGroup::new(
            self.config.group_name.clone(),
            self.group_service.get_group_id(),
        );
        // Each `info` is owned here, so its strings are moved into the
        // `RepNode` rather than cloned.
        let members = self.group_service.get_all_nodes().into_iter().map(|info| {
            let mut member = crate::rep_node::RepNode::new(
                info.name,
                info.node_type,
                info.host,
                info.port,
                info.node_id,
            );
            member.read_capacity_pct = info.read_capacity_pct;
            member.write_capacity_pct = info.write_capacity_pct;
            member.latency_hint_ms = info.latency_hint_ms;
            member
        });
        for member in members {
            snapshot.add_node(member);
        }
        snapshot
    }

    /// Get the replication configuration.
    ///
    /// Returns a borrow of the replication configuration that has been used
    /// to create this environment.
    pub fn get_config(&self) -> &RepConfig {
        &self.config
    }

    /// Get the current VLSN range on this node.
    ///
    /// Returns the range of VLSNs currently available on this node, as
    /// reported by the VLSN index.
    pub fn get_vlsn_range(&self) -> VlsnRange {
        self.vlsn_index.get_range()
    }

    /// Get the latest VLSN.
    ///
    /// Returns the most recent VLSN registered on this node, as reported by
    /// the VLSN index.
    pub fn get_current_vlsn(&self) -> u64 {
        self.vlsn_index.get_latest_vlsn()
    }

    /// The replica-side replication stream state (master high-water, applied
    /// VLSN, lag).  Used by the consistency read-gate to learn the master's
    /// latest known commit VLSN (JE `ConsistencyTracker.masterTxnEndVLSN`,
    /// updated by heartbeats).
    ///
    /// Returns a borrow of the `replica_stream` field; nothing is copied.
    pub fn replica_stream(&self) -> &ReplicaStream {
        &self.replica_stream
    }

    /// REP-10 (B): mint a [`CommitToken`] for the most recent commit on this
    /// master.
    ///
    /// Modelled on JE `MasterTxn.getCommitToken`.  A client that has just
    /// written on the master calls this to obtain the token it will hand to
    /// a subsequent replica read.
    ///
    /// The token is built from the configured group name and the current
    /// value of `wal_vlsn_counter` (loaded with `Acquire` ordering), i.e. the
    /// master's latest assigned VLSN.
    ///
    /// Returns `None` when this node is not the master, or whenever
    /// `CommitToken::new` itself yields `None` (presumably for a null VLSN,
    /// as in JE — confirm against `CommitToken::new`).
    pub fn commit_token(&self) -> Option<crate::CommitToken> {
        if self.is_master() {
            let latest_vlsn = self.wal_vlsn_counter.load(Ordering::Acquire);
            crate::CommitToken::new(self.config.group_name.clone(), latest_vlsn)
        } else {
            None
        }
    }

    /// REP-10 (C): the read-gate. Enforce a replica read-consistency policy
    /// before a read transaction proceeds.
    ///
    /// Port of `ReplicaConsistencyPolicy.ensureConsistency` as invoked from a
    /// replica `beginTransaction` (`RepImpl.checkConsistency` /
    /// `Replica.getConsistencyTracker().awaitVLSN`).  Called by the replica
    /// env's transaction-begin / read path.
    ///
    /// - `policy_override`: a per-transaction policy (JE
    ///   `TransactionConfig.setConsistencyPolicy`).  When `None`, the node's
    ///   configured default is used (`ReplicationConfig.setConsistencyPolicy`
    ///   — [`RepConfig::consistency_policy`]).
    ///
    /// Returns `Ok(())` immediately when the effective policy is
    /// [`ConsistencyPolicy::NoConsistency`], or when no consistency tracker
    /// is installed (nothing to wait on).  Otherwise it forwards the master
    /// VLSN known to `replica_stream` to the tracker (when non-zero) and
    /// delegates to the tracker's `await_consistency`, which BLOCKS until the
    /// replica has replayed far enough or the policy timeout expires (a clean
    /// [`RepError`], never a hang).
    ///
    /// NOTE(review): there is no explicit master check here; a master
    /// returns early only through one of the two conditions above — confirm
    /// the tracker is absent or cleared on a master.
    ///
    /// # Panics
    ///
    /// Panics if the `consistency_tracker` mutex is poisoned.
    pub fn begin_read_consistency(
        &self,
        policy_override: Option<&crate::ConsistencyPolicy>,
    ) -> Result<()> {
        // Resolve the effective policy: per-txn override else node default.
        // The default is borrowed straight from the config, so no clone is
        // made on this per-read path.
        let policy = policy_override.unwrap_or(&self.config.consistency_policy);

        // NoConsistency never blocks.
        if matches!(policy, crate::ConsistencyPolicy::NoConsistency) {
            return Ok(());
        }

        // A non-No policy only makes sense on a replica with a live replay
        // (its last_applied_vlsn is the wait predicate).  Without a tracker
        // there is nothing to wait on — treat as immediately consistent
        // rather than block forever.  The guard is released at the end of
        // this statement, so the lock is not held while waiting below.
        let tracker = self.consistency_tracker.lock().unwrap().clone();
        let Some(tracker) = tracker else {
            return Ok(());
        };

        // Surface the master's latest known VLSN for the time policy
        // (heartbeat / feeder high-water).  JE ConsistencyTracker tracks this
        // via trackHeartbeat; here we read the replica_stream high-water.
        let master_vlsn = self.replica_stream.get_master_vlsn();
        if master_vlsn > 0 {
            tracker.set_master_vlsn(master_vlsn);
        }

        tracker.await_consistency(policy)
    }

    /// REP-10 (C) test seam: install a [`ConsistencyTracker`] over an existing
    /// `last_applied_vlsn` handle, exactly as `become_replica` does when it
    /// starts the live replay thread.
    ///
    /// Lets a test drive a real [`noxu_dbi::ReplicaReplay`] and exercise
    /// [`Self::begin_read_consistency`] end-to-end without standing up TCP
    /// feeder/receiver threads.  Not part of the production API.
    ///
    /// Any previously installed tracker is replaced.  A clone of the new
    /// tracker is stored in `self.consistency_tracker` and the tracker itself
    /// is returned to the caller.
    ///
    /// Only compiled for tests or with the `test-harness` feature.
    #[cfg(any(test, feature = "test-harness"))]
    pub fn install_consistency_tracker_for_test(
        &self,
        last_applied_vlsn: std::sync::Arc<std::sync::atomic::AtomicU64>,
    ) -> crate::ConsistencyTracker {
        let tracker = crate::ConsistencyTracker::new(last_applied_vlsn);
        *self.consistency_tracker.lock().unwrap() = Some(tracker.clone());
        tracker
    }

    /// REP-1 STEP 5 (D): run a live syncup against `feeder` and, if this
    /// replica's tail diverged, ROLL IT BACK to the common matchpoint instead
    /// of falling back to a network restore.
    ///
    /// Port of the replica's side of JE `ReplicaFeederSyncup.execute`:
    /// `findMatchpoint` → `verifyRollback` → `replay.rollback` →
    /// `vlsnIndex.truncateFromTail` → resume streaming from `matchpoint + 1`.
    ///
    /// `feeder` is the master's [`crate::stream::syncup::SyncupView`] (built
    /// from its VLSN index, or exchanged over the syncup wire protocol in
    /// [`crate::stream::syncup_protocol`]). The decision uses the same pure
    /// core the protocol drives: `find_matchpoint` + `verify_rollback`.
    ///
    /// Returns:
    /// - [`SyncupAction::RolledBack`] — the divergent tail was truncated to
    ///   the matchpoint; resume streaming from `start_vlsn`. The non-diverged
    ///   case (matchpoint == last VLSN) returns `RolledBack` with an empty
    ///   tail and is a no-op rollback.
    /// - [`SyncupAction::NeedsRestore`] — `verify_rollback` selected
    ///   NetworkRestore (no common matchpoint) or HardRecovery (the rollback
    ///   would cross a committed/aborted txn); the caller must network-restore
    ///   per JE.
    ///
    /// The non-diverged fast path (the replica's range is a prefix of the
    /// feeder's) is still served by the range-check `negotiate_syncup`
    /// (`SyncupResult::CanServe`) in the streaming path; this method is the
    /// DIVERGED case.
    pub fn syncup_with_feeder(
        &self,
        feeder: &dyn crate::stream::syncup::SyncupView,
    ) -> Result<SyncupAction> {
        // Build the replica's SyncupView. When a real LogManager is wired,
        // re-read the log (SyncupLogView) so the per-VLSN fingerprint is the
        // actual record checksum (JE ReplicaSyncupReader). Otherwise (the
        // VLSN-index-only harness model) fall back to the index view, whose
        // fingerprint is the LSN.
        let log_view: Option<crate::stream::syncup_reader::SyncupLogView> =
            self.env_impl.lock().unwrap().clone().and_then(|env| {
                if let Some(lm) = env.get_log_manager() {
                    // Flush so all VLSN-tagged entries are on disk before the
                    // backward re-read (JE flushNoSync in initScan).
                    let _ = lm.flush_sync();
                }
                crate::stream::syncup_reader::SyncupLogView::scan(
                    env.get_env_home(),
                )
            });
        let index_view = VlsnIndexView::from_index(&self.vlsn_index);
        let replica_view: &dyn crate::stream::syncup::SyncupView =
            match &log_view {
                Some(v) => v,
                None => &index_view,
            };

        let range = self.vlsn_index.get_range();
        let last_sync = range.get_last_sync();
        let last_txn_end = range.get_last_txn_end();
        let to_vlsn = |v: u64| {
            if v == 0 {
                noxu_util::NULL_VLSN
            } else {
                noxu_util::Vlsn::new(v as i64)
            }
        };

        // Step 1: find the matchpoint (JE findMatchpoint).
        let matchpoint = find_matchpoint(replica_view, feeder);

        // numPassedCommits: count of txn ends strictly above the matchpoint.
        // When we re-read the log, count them exactly; otherwise rely on the
        // numeric `lastTxnEnd <= matchpoint` test in verify_rollback (which
        // matches JE when sync points == txn ends).
        let num_passed_commits = match (&log_view, &matchpoint) {
            (Some(v), Matchpoint::Found { vlsn, .. }) => {
                v.num_passed_commits(*vlsn)
            }
            _ => 0,
        };
        let decision = verify_rollback(
            &matchpoint,
            to_vlsn(last_txn_end),
            to_vlsn(last_sync),
            num_passed_commits,
        );

        match decision {
            RollbackDecision::RollbackToMatchpoint {
                matchpoint_vlsn,
                start_vlsn,
            } => {
                let matchpoint_lsn = match &matchpoint {
                    Matchpoint::Found { lsn, .. } => *lsn,
                    Matchpoint::None => 0,
                };
                // Collect the rolled-back LSNs (VLSNs strictly above the
                // matchpoint). When the real log was re-read, use its EXACT
                // per-VLSN LSNs so make-invisible flips the right header bytes
                // (the sparse VLSN index only stores boundary/last LSNs).
                let mp = matchpoint_vlsn.sequence().max(0) as u64;
                let rollback_lsns: Vec<noxu_util::Lsn> = match &log_view {
                    Some(v) => v
                        .entries()
                        .filter(|(vlsn, _)| (vlsn.sequence() as u64) > mp)
                        .map(|(_, e)| noxu_util::Lsn::from_u64(e.lsn))
                        .collect(),
                    None => self
                        .vlsn_index
                        .snapshot_entries()
                        .into_iter()
                        .filter(|(vlsn, _, _)| *vlsn > mp)
                        .map(|(_, file, offset)| {
                            noxu_util::Lsn::new(file, offset)
                        })
                        .collect(),
                };
                self.execute_rollback(mp, matchpoint_lsn, &rollback_lsns)?;
                Ok(SyncupAction::RolledBack {
                    matchpoint_vlsn: mp,
                    start_vlsn: start_vlsn.sequence().max(0) as u64,
                })
            }
            RollbackDecision::HardRecovery { .. }
            | RollbackDecision::NetworkRestore => {
                Ok(SyncupAction::NeedsRestore)
            }
        }
    }

    /// Execute the durable + in-memory rollback to `matchpoint_vlsn`
    /// (LSN `matchpoint_lsn`). Port of JE `Replay.rollback` +
    /// `vlsnIndex.truncateFromTail`.
    ///
    /// Durable steps (RollbackStart/End + make-invisible + fsync) go through
    /// [`noxu_recovery::rollback`] when a `LogManager` is wired and
    /// `matchpoint_lsn` is non-zero; the VLSN index is always truncated to
    /// the matchpoint so the reported range matches the rolled-back state and
    /// streaming resumes from `matchpoint + 1`.
    ///
    /// `rollback_lsns` are the LSNs of the entries above the matchpoint; they
    /// are handed to the durable rollback and their count is logged.
    ///
    /// # Errors
    ///
    /// Returns [`RepError::DatabaseError`] when the durable rollback fails;
    /// in that case the VLSN index is left untouched.
    fn execute_rollback(
        &self,
        matchpoint_vlsn: u64,
        matchpoint_lsn: u64,
        rollback_lsns: &[noxu_util::Lsn],
    ) -> Result<()> {
        // Clone the env handle in its own statement so the `env_impl` mutex
        // guard is dropped here, not held for the whole durable rollback
        // (as it would be if the lock sat in the `if let` scrutinee below).
        let env_handle = self.env_impl.lock().unwrap().clone();

        // Durable rollback (RollbackStart … make-invisible … RollbackEnd) when
        // a live LogManager is available. The harness-level env (VLSN-index
        // only, no LogManager) skips the on-disk steps; the index truncation
        // below is what makes the replica converge in that model.
        if let Some(env) = env_handle
            && let Some(log_mgr) = env.get_log_manager()
            && matchpoint_lsn != 0
        {
            let mp_lsn = noxu_util::Lsn::from_u64(matchpoint_lsn);
            // active_txn_ids: the harness/VLSN-index model has no live txn
            // table here; the durable RollbackStart records an empty set, and
            // the per-txn gating (REP-1 STEP 2) applies during recovery when
            // the analysis pass rebuilds the active set. A future pass can
            // thread the live ReplayTxn ids through (JE
            // localActiveTxns.keySet()).
            noxu_recovery::rollback(
                &log_mgr,
                noxu_util::Vlsn::new(matchpoint_vlsn as i64),
                mp_lsn,
                Vec::new(),
                rollback_lsns,
            )
            .map_err(|e| {
                RepError::DatabaseError(format!(
                    "live rollback to matchpoint failed: {e}"
                ))
            })?;
        }

        // JE vlsnIndex.truncateFromTail(startVLSN, matchpointLSN): drop the
        // divergent VLSN tail so the reported range matches the recovered
        // state and streaming resumes from matchpoint + 1.
        self.vlsn_index.truncate_after(matchpoint_vlsn);

        log::info!(
            "Node '{}': live syncup rolled back to matchpoint vlsn={} \
             (lsn={:#x}); {} tail entries truncated",
            self.config.node_name,
            matchpoint_vlsn,
            matchpoint_lsn,
            rollback_lsns.len(),
        );
        Ok(())
    }

    /// Test-only: clone the env's SHARED VLSN index `Arc`.
    ///
    /// REP-6: the replica receive loop (`become_replica` ->
    /// `EnvironmentLogWriter`) must feed THIS index — the one
    /// `get_vlsn_range`, `flush_to_disk`, and election ranking read — not a
    /// throwaway. Tests use this to build a writer the same way
    /// `become_replica` does and assert the shared index advances.
    ///
    /// Only compiled when the `test-harness` feature is enabled.
    #[cfg(feature = "test-harness")]
    pub fn vlsn_index_arc(&self) -> Arc<crate::vlsn::vlsn_index::VlsnIndex> {
        // Hands out another handle to the same index; nothing is copied.
        Arc::clone(&self.vlsn_index)
    }

    /// Names of the replicas that currently have a `Feeder` tracker
    /// registered on this (master) node.
    ///
    /// Used by tests and operator tooling.  The result is a snapshot of the
    /// master's view at the time of the call; later `add_peer`/`remove_peer`
    /// calls may change it.
    pub fn feeder_replica_names(&self) -> Vec<String> {
        let feeders = self.feeders.read();
        let mut names = Vec::with_capacity(feeders.len());
        names.extend(feeders.iter().map(|feeder| feeder.get_replica_name()));
        names
    }

    /// Count of downstream connections this node has served through the JE
    /// `Feeder`/`MasterFeederSource` mechanism (`FeederRunner +
    /// EnvironmentLogScanner` reading this node's OWN WAL).
    ///
    /// A non-zero value PROVES this node fed a downstream replica by the
    /// SAME mechanism the master uses — a cascading replica and the master
    /// run the identical `PeerFeederService` → `FeederRunner` →
    /// `EnvironmentLogScanner` path (JE `FeederManager` → `Feeder` →
    /// `MasterFeederSource`).  Used by the chained-replication test to assert
    /// the cascade does NOT use the in-memory pull fallback.
    pub fn wal_feeds_served(&self) -> u64 {
        use std::sync::atomic::Ordering::SeqCst;

        self.wal_feeds_served.load(SeqCst)
    }

    // -----------------------------------------------------------------------
    // C-C2 — active push feeder API
    // -----------------------------------------------------------------------

    /// Register the channel used to push log entries to `replica_name`.
    ///
    /// The channel is always recorded in `feeder_channels`.  If this node is
    /// **already master**, a [`FeederRunner`] background thread is spawned
    /// for it right away; otherwise the thread is spawned when
    /// [`Self::become_master`] is called.  The thread reads from a dedicated
    /// in-memory queue that is fed by [`Self::replicate_entry`] /
    /// [`Self::apply_entry`], and sends framed log entries to the replica
    /// over `channel`.  Acks sent back by the replica are visible via
    /// [`Self::active_feeder_runner_acked_vlsn`].
    ///
    /// # Production vs. test use
    ///
    /// *Production*: pass a [`crate::net::TcpChannel`] connected to the
    /// replica's inbound feeder service.
    ///
    /// *Tests*: pass one half of a [`crate::net::LocalChannelPair`].
    ///
    /// # Note on push vs. pull
    ///
    /// Registering a channel activates the **push** path: the master
    /// initiates and owns the feeder connection.  The existing **pull** path
    /// (`PeerFeederService` / `catch_up_from_peer`) continues to operate in
    /// parallel for replicas that connect proactively.  Do not register a
    /// channel for a replica that already connects via the pull path, or
    /// entries may be delivered twice.
    ///
    /// If `become_master` was called *before* registering the channel, call
    /// this method afterward; it will spawn the FeederRunner immediately.
    pub fn register_feeder_channel(
        &self,
        replica_name: String,
        channel: Arc<dyn crate::net::Channel>,
    ) {
        // Record the channel first; the guard is a statement temporary, so
        // the map is unlocked again before any runner is spawned.
        self.feeder_channels
            .lock()
            .unwrap()
            .insert(replica_name.clone(), Arc::clone(&channel));

        if !self.is_master() {
            // Not master yet: `become_master` picks the channel up later.
            return;
        }
        self.spawn_feeder_runner(replica_name, channel);
    }

    /// Last VLSN acknowledged through the FeederRunner serving
    /// `replica_name`.
    ///
    /// Yields `0` when no FeederRunner is currently active for that replica
    /// (either `become_master` was not called yet, or no channel was
    /// registered).  Use this to poll catch-up progress before shutdown.
    pub fn active_feeder_runner_acked_vlsn(&self, replica_name: &str) -> u64 {
        let runners = self.active_feeder_runners.lock().unwrap();
        match runners.get(replica_name) {
            Some(runner) => runner.known_replica_vlsn(),
            None => 0,
        }
    }

    /// Start a FeederRunner thread that serves `replica_name` over `channel`.
    ///
    /// Steps, in order: create a dedicated `PeerLogScanner` queue for the
    /// replica and register it in `feeder_queues` (so that future
    /// `replicate_entry` / `apply_entry` calls fan out into it), build the
    /// runner, spawn the `FeederRunner::run` loop on its own thread, record
    /// the `Arc<FeederRunner>` in `active_feeder_runners`, and keep the join
    /// handle in `io_threads`.
    ///
    /// Idempotent: if a FeederRunner is already active for `replica_name`
    /// (from a prior `become_master` call), it is replaced — the old channel
    /// should have been closed already via `close()`.
    ///
    /// **WAL-scanner auto-feed path (C-C2b)**: when a live `EnvironmentImpl`
    /// has been wired via `with_environment`, the FeederRunner thread uses an
    /// `EnvironmentLogScanner` as its source.  Every `log_txn_commit` on the
    /// master writes a VLSN-tagged WAL entry (22-byte header); the scanner
    /// finds these entries and streams them to the replica automatically,
    /// without any `replicate_entry` call from the application.
    ///
    /// **Fallback path**: when no `EnvironmentImpl` is wired, or the scanner
    /// cannot be created, the runner reads from the in-memory
    /// `PeerLogScanner` queue populated by `replicate_entry` / `apply_entry`
    /// — the previous manual behaviour.
    ///
    /// # Panics
    ///
    /// Panics if the OS refuses to spawn the feeder thread.
    fn spawn_feeder_runner(
        &self,
        replica_name: String,
        channel: Arc<dyn crate::net::Channel>,
    ) {
        // Per-replica entry queue, so entries flowing from this master reach
        // the FeederRunner without competing with PeerFeederService.
        let entry_queue = Arc::new(PeerLogScanner::new());
        self.feeder_queues
            .write()
            .unwrap()
            .insert(replica_name.clone(), Arc::clone(&entry_queue));

        // REP-9 Part 1: give the runner an ack sink that forwards every
        // inbound replica ack to `env.record_ack(vlsn, replica_name)`, which
        // reaches BOTH the AckTracker (commit-blocking quorum) and the
        // matching `Feeder::acked_vlsn` (DTVLSN ranking); otherwise the ack
        // would stay in the runner's private `known_replica_vlsn`.  The sink
        // only captures a `Weak<Self>`, so it never extends the env's
        // lifetime.  When `self_weak` was never initialised we build the
        // plain (sink-less) runner — `record_ack` stays reachable from tests.
        let runner = if let Some(env_arc) =
            self.self_weak.get().and_then(Weak::upgrade)
        {
            let env_weak = Arc::downgrade(&env_arc);
            let ack_sink: crate::stream::feeder::AckSink =
                Arc::new(move |name: &str, vlsn: u64| {
                    if let Some(env) = env_weak.upgrade() {
                        env.record_ack(vlsn, name);
                    }
                });
            Arc::new(FeederRunner::new_with_ack_sink(
                Arc::clone(&channel),
                1,
                replica_name.clone(),
                ack_sink,
            ))
        } else {
            Arc::new(FeederRunner::new(Arc::clone(&channel), 1))
        };

        // Everything the thread needs is moved into its closure.
        let thread_runner = Arc::clone(&runner);
        let thread_replica = replica_name.clone();

        // C-C2b: the thread prefers an EnvironmentLogScanner (WAL auto-feed)
        // when an env is wired and falls back to the in-memory queue (manual
        // replicate_entry path) otherwise.
        let env_opt = self.env_impl.lock().unwrap().clone();

        let handle = std::thread::Builder::new()
            .name(format!("noxu-feeder-{}", replica_name))
            .spawn(move || {
                match env_opt {
                    Some(env) => match EnvironmentLogScanner::new(&env, None) {
                        Some(mut wal_scanner) => {
                            log::info!(
                                "FeederRunner for replica '{}': using \
                                 EnvironmentLogScanner (WAL auto-feed)",
                                thread_replica,
                            );
                            let _ = thread_runner.run(&mut wal_scanner);
                        }
                        None => {
                            log::warn!(
                                "FeederRunner for replica '{}': \
                                 EnvironmentLogScanner unavailable, \
                                 falling back to in-memory queue",
                                thread_replica,
                            );
                            let mut queue_source =
                                PeerScannerAdapter::new(entry_queue, 0);
                            let _ = thread_runner.run(&mut queue_source);
                        }
                    },
                    None => {
                        let mut queue_source =
                            PeerScannerAdapter::new(entry_queue, 0);
                        let _ = thread_runner.run(&mut queue_source);
                    }
                }
                log::debug!(
                    "FeederRunner for replica '{}' exited cleanly",
                    thread_replica
                );
            })
            .expect("failed to spawn FeederRunner thread");

        // Publish the runner (replacing any previous one for this replica)
        // and keep the join handle alongside the other I/O threads.
        self.active_feeder_runners
            .lock()
            .unwrap()
            .insert(replica_name.clone(), runner);
        self.io_threads.lock().unwrap().push(handle);

        log::info!(
            "Node '{}' (master): FeederRunner thread spawned for replica '{}'",
            self.config.node_name.as_str(),
            replica_name,
        );
    }

    // -----------------------------------------------------------------------

    /// Bootstrap this node's environment by network-restoring all `.ndb`
    /// files from `peer_name` through the dispatcher's RESTORE service.
    ///
    /// Closes findings F2 / F4 of the 2026 review.
    ///
    /// The standalone `NetworkRestore::execute()` opens raw TCP and
    /// expects to drive the legacy `NetworkRestoreServer::start` listener.
    /// Production replicated environments host the RESTORE handler on the
    /// dispatcher, so this method routes through `execute_via_dispatcher`.
    ///
    /// `peer_name` must be a known peer in `GroupService`; on success the
    /// peer's `.ndb` files are written into `config.env_home`.
    ///
    /// # Errors
    ///
    /// Returns [`RepError::ConfigError`] if `env_home` is `None` or the peer
    /// is unknown, and propagates whatever error the restore itself reports.
    pub fn bootstrap_via_dispatcher(&self, peer_name: &str) -> Result<()> {
        // Guard 1: a restore needs somewhere to write the files.
        let Some(env_home) = self.config.env_home.clone() else {
            return Err(RepError::ConfigError(
                "bootstrap_via_dispatcher requires env_home in RepConfig"
                    .into(),
            ));
        };

        // Guard 2: the source must be a registered member of the group.
        let Some(peer) = self
            .group_service
            .get_all_nodes()
            .into_iter()
            .find(|node| node.name == peer_name)
        else {
            return Err(RepError::ConfigError(format!(
                "peer '{}' not registered in group '{}'",
                peer_name, self.config.group_name,
            )));
        };

        let restore = NetworkRestore::new(NetworkRestoreConfig {
            source_node: peer.name.clone(),
            source_host: peer.host.clone(),
            source_port: peer.port,
            retain_log_files: true,
        })
        .with_local_dir(env_home);
        restore.execute_via_dispatcher()?;

        log::info!(
            "Node '{}' bootstrapped via dispatcher from '{}' ({}:{})",
            self.config.node_name,
            peer.name,
            peer.host,
            peer.port,
        );
        Ok(())
    }

    /// Get replication statistics.
    ///
    /// Returns a shared reference to the [`RepStats`] held by this
    /// environment; the borrow lasts as long as the borrow of `self`.
    pub fn get_stats(&self) -> &RepStats {
        &self.stats
    }

    /// Get the ack tracker.
    ///
    /// Returns a shared reference to the [`AckTracker`] held by this
    /// environment; the borrow lasts as long as the borrow of `self`.
    pub fn get_ack_tracker(&self) -> &AckTracker {
        &self.ack_tracker
    }

    /// Bring the node state machine into the Unknown state.
    ///
    /// - Already Unknown: nothing to do.
    /// - Detached: moved to Unknown, because the state machine only allows
    ///   Detached -> Unknown -> Master/Replica.
    /// - Master / Replica: moved to Unknown as well, since a node must pass
    ///   through Unknown before joining a new group or reconnecting.
    ///
    /// # Errors
    ///
    /// Returns [`RepError::StateError`] when the node is shut down, and
    /// propagates any error from the state transition itself.
    pub fn ensure_unknown_state(&self) -> Result<()> {
        match self.node_state.get_state() {
            NodeState::Unknown => {}
            NodeState::Detached | NodeState::Master | NodeState::Replica => {
                self.node_state.transition_to(NodeState::Unknown)?;
            }
            NodeState::Shutdown => {
                return Err(RepError::StateError(
                    "Node is shut down".to_string(),
                ));
            }
        }
        Ok(())
    }

    /// Transition to master state.
    ///
    /// Moves this node into the Master state for the given election `term`.
    /// As master, the node can accept write operations and feed log entries
    /// to replicas.
    ///
    /// **Active push-feeder** (C-C2): if feeder channels have been registered
    /// via [`Self::register_feeder_channel`] before this call, a
    /// [`FeederRunner`] background thread is spawned per channel.
    ///
    /// **WAL-scanner auto-feed path (C-C2b, v3.3.0)**: when
    /// [`Self::with_environment`] has been called before `become_master`,
    /// each `FeederRunner` thread uses an [`EnvironmentLogScanner`] as its
    /// source.  Every `log_txn_commit` on the master writes a VLSN-tagged
    /// 22-byte WAL entry (via `LogManager::log_with_vlsn`); the scanner
    /// discovers these entries and streams them to replicas automatically,
    /// without any [`Self::replicate_entry`] call from the application.
    ///
    /// **Fallback path**: when no `EnvironmentImpl` is wired, the runner
    /// reads from the in-memory queue populated by [`Self::replicate_entry`] /
    /// [`Self::apply_entry`].
    ///
    /// If no feeder channels are registered, this call registers per-replica
    /// `Feeder` tracker structs for `AckTracker` bookkeeping only.  In that
    /// case replicas must connect proactively to the `PEER_FEEDER` pull
    /// service to receive entries.
    ///
    /// # Errors
    ///
    /// - [`RepError::StateError`] if the environment is already closed.
    /// - [`RepError::InvalidStateTransition`] if this node's type cannot be
    ///   master.
    /// - Any error reported by the underlying state transitions.
    pub fn become_master(&self, term: u64) -> Result<()> {
        let this_node = self.config.node_name.as_str();

        if self.is_shutdown() {
            return Err(RepError::StateError(
                "Cannot become master: environment is closed".to_string(),
            ));
        }

        // JE invariant: only `Electable` nodes can become master.  `Secondary`,
        // `Monitor`, and `Arbiter` are not electable and must be rejected at
        // the API layer (mirrors JE `ExceptionTest`).  See
        // `NodeType::can_be_master`.
        if !self.config.node_type.can_be_master() {
            return Err(RepError::InvalidStateTransition(format!(
                "node '{}' has type {} which is not electable as master",
                this_node, self.config.node_type,
            )));
        }

        // Master is only reachable from Unknown, so normalise the state
        // first (e.g. Detached -> Unknown).
        self.ensure_unknown_state()?;

        let old_state = self.node_state.get_state();
        self.node_state.transition_to(NodeState::Master)?;
        self.master_tracker.set_master(this_node, term);

        // --- F9: Feeder trackers for each known replica --------------------
        //
        // Closes finding F9 of the 2026 review.
        // The architecture is pull-based: replicas pull from the master's
        // `PEER_FEEDER` service via `catch_up_from_peer`.  However, the
        // master must:
        //   1. Track each replica via a `Feeder` so AckTracker bookkeeping
        //      can attribute replica acks to the right node.
        //   2. Push its own writes into `peer_scanner` so replicas pulling
        //      from `PEER_FEEDER` actually receive entries (`replicate_entry`).
        //
        // This block covers step 1: every other Electable or Secondary node
        // known to the group gets a `Feeder` entry.
        {
            let mut feeders = self.feeders.write();
            // Start from a clean slate.  A `Feeder` is just an in-memory
            // tracker; recreating it is cheap and avoids state inversion
            // bugs across role changes.
            feeders.clear();

            let replicas =
                self.group_service.get_all_nodes().into_iter().filter(|peer| {
                    // Skip ourselves and every node type that does not
                    // receive log entries (e.g. Arbiters).
                    peer.name != self.config.node_name
                        && matches!(
                            peer.node_type,
                            crate::node_type::NodeType::Electable
                                | crate::node_type::NodeType::Secondary
                        )
                });
            for peer in replicas {
                feeders.push(Feeder::new(peer.name.clone()));
                log::debug!(
                    "Node '{}' (master, term={}): registered Feeder for \
                     replica '{}'",
                    this_node,
                    term,
                    peer.name,
                );
            }
        }

        // For observability, log the count.
        log::info!(
            "Node '{}' became master for term {} \
             (feeder trackers: {} known replicas)",
            this_node,
            term,
            self.feeders.read().len(),
        );

        // C-C2: spawn FeederRunner threads for pre-registered channels.
        //
        // Channels registered through `register_feeder_channel` before this
        // call are sitting in `feeder_channels`.  Take a snapshot of them
        // under the lock (the map itself is left intact), release the lock,
        // then spawn one FeederRunner per replica.  The FeederRunner reads
        // from a dedicated `PeerLogScanner` queue (populated by
        // `replicate_entry` fan-out) and pushes framed log entries to the
        // replica over the registered channel.  Acks from the replica are
        // tracked in the FeederRunner and visible via
        // `active_feeder_runner_acked_vlsn`.
        let registered: Vec<(String, Arc<dyn crate::net::Channel>)> = {
            let channels = self.feeder_channels.lock().unwrap();
            channels
                .iter()
                .map(|(name, ch)| (name.clone(), Arc::clone(ch)))
                .collect()
        };
        for (replica_name, channel) in registered {
            self.spawn_feeder_runner(replica_name, channel);
        }

        // --- WAL-backed PEER_FEEDER for pull-path replicas -------------------
        //
        // The master's writes go to its WAL (VLSN-tagged 22-byte headers) and
        // its VLSN index, but NOT necessarily to the in-memory `peer_scanner`
        // (e.g. `register_vlsn_typed` only updates the index).  A replica that
        // pulls via the `PEER_FEEDER` service therefore finds an empty
        // in-memory scanner and gets `NeedsRestore`.
        //
        // Re-register PEER_FEEDER with a WAL-backed source so a pulling
        // replica receives the VLSN-tagged stream straight from the master's
        // OWN WAL via the same `EnvironmentLogScanner` + `FeederRunner` used
        // throughout.  Faithful to JE `MasterFeederSource(repImpl, vlsnIndex,
        // startVLSN)`, which reads the VLSNIndex + log regardless of node
        // role; `FeederManager` runs feeders on whatever node holds the data.
        // (The same registration runs, gated on `cascade_feeding`, in
        // `become_replica` so a mid-tier replica can cascade downstream.)
        if let Some(env) = self.env_impl.lock().unwrap().clone()
            && let Some(dispatcher) = &self.tcp_dispatcher
        {
            let wal_source = crate::stream::peer_feeder::WalFeederSource::new(
                env,
                Arc::clone(&self.vlsn_index),
            );
            let wal_service = PeerFeederService::with_wal_source_counted(
                Arc::clone(&self.peer_scanner),
                wal_source,
                Arc::clone(&self.wal_feeds_served),
            );
            dispatcher
                .register(PEER_FEEDER_SERVICE_NAME, Arc::new(wal_service));
            log::debug!(
                "Node '{}' (master): PEER_FEEDER now serves replicas from \
                 its own WAL",
                this_node,
            );
        }

        // -------------------------------------------------------------------

        // Finally tell the listeners about the role change.
        self.notify_listeners(old_state, NodeState::Master);

        Ok(())
    }

    /// Transition to replica state with the given master.
    ///
    /// Transitions this node to Replica state. The node will receive log
    /// entries from the specified master.
    ///
    /// If a live `EnvironmentImpl` has been wired in via `with_environment`,
    /// the method prepares an `EnvironmentLogWriter` so that replicated
    /// entries can be written to the local log.  The actual network connection
    /// is established by the `TcpServiceDispatcher`; this method logs intent.
    ///
    /// In HA.
    ///
    /// # Errors
    ///
    /// Returns `RepError::StateError` when the environment is already shut
    /// down, and propagates any error from `ensure_unknown_state` or from the
    /// node-state transition to `Replica`.
    ///
    /// # Panics
    ///
    /// Panics if the replica thread cannot be spawned, or if one of the
    /// `lock().unwrap()` calls on `env_impl`, `consistency_tracker`, or
    /// `io_threads` fails (presumably only when the mutex is poisoned).
    pub fn become_replica(&self, master_name: &str) -> Result<()> {
        if self.is_shutdown() {
            return Err(RepError::StateError(
                "Cannot become replica: environment is closed".to_string(),
            ));
        }

        // Ensure we can reach Replica state (may need Detached -> Unknown first)
        self.ensure_unknown_state()?;

        // Capture the pre-transition state so listeners receive the real
        // old -> Replica edge at the end of this method.
        let old_state = self.node_state.get_state();
        self.node_state.transition_to(NodeState::Replica)?;
        // NOTE(review): the literal `0` is presumably the master's term —
        // confirm against `MasterTracker::set_master`.
        self.master_tracker.set_master(master_name, 0);
        self.replica_stream.set_master(master_name);
        self.replica_stream.set_state(
            crate::stream::replica_stream::ReplicaStreamState::Connecting,
        );

        // --- G19: start replica receive loop --------------------------------
        //
        // Connects to the master's PEER_FEEDER service and runs a
        // ReplicaReceiver loop in a background thread.  The receiver writes
        // replicated entries via EnvironmentLogWriter.
        //
        // NOTE(review): the guard returned by `lock()` is a temporary in the
        // `if let` scrutinee, so it appears to stay held for the whole body
        // below (including the thread spawn) — confirm this is intended.
        // When `env_impl` holds `None` the whole section is skipped without
        // any log message.
        if let Some(env) = self.env_impl.lock().unwrap().clone() {
            if let Some(log_mgr) = env.get_log_manager() {
                // REP-6: feed the env's SHARED, persisted VLSN index (the one
                // flush_to_disk persists and get_vlsn_range / election ranking
                // read) into the replica receive loop — NOT a throwaway. Using
                // a fresh index would leave the persisted vlsn.idx, the
                // reported VLSN range, and the DTVLSN-ranking own_vlsn lagging
                // the actually-received stream, widening catch-up (or forcing
                // an unnecessary network restore) after a clean restart.
                // JE: the replica's VLSNIndex IS the environment's persisted
                // index (see VLSNIndex).
                let vlsn_index = Arc::clone(&self.vlsn_index);

                // --- Chained replication: start a WAL-backed feeder source ---
                //
                // When `cascade_feeding` is enabled, re-register this node's
                // PEER_FEEDER service with a WAL-backed source so a DOWNSTREAM
                // replica can connect and receive the VLSN-tagged log stream
                // FROM THIS REPLICA's OWN WAL (the bytes it received + persisted
                // via EnvironmentLogWriter::log_with_vlsn).  The feeder uses the
                // same EnvironmentLogScanner + FeederRunner the master uses.
                //
                // Faithful to JE's cascading-feeder model: the same
                // FeederManager/Feeder/FeederSource machinery runs on any node
                // that holds the data.  `FeederSource` is documented as "a real
                // Master OR a Replica in a Replica chain that is replaying log
                // records it received from some other source"
                // (`FeederSource.java`); `MasterFeederSource(repImpl, vlsnIndex,
                // startVLSN)` reads the VLSNIndex + log regardless of role.
                //
                // Default OFF (master-direct) preserves current behaviour: a
                // replica's PEER_FEEDER stays backed by the in-memory pull
                // scanner unless cascade is explicitly enabled.
                if self.config.cascade_feeding {
                    if let Some(ref dispatcher) = self.tcp_dispatcher {
                        let wal_source =
                            crate::stream::peer_feeder::WalFeederSource::new(
                                Arc::clone(&env),
                                Arc::clone(&self.vlsn_index),
                            );
                        let svc = PeerFeederService::with_wal_source_counted(
                            Arc::clone(&self.peer_scanner),
                            wal_source,
                            Arc::clone(&self.wal_feeds_served),
                        );
                        dispatcher
                            .register(PEER_FEEDER_SERVICE_NAME, Arc::new(svc));
                        log::info!(
                            "Node '{}' (replica): cascade feeding ENABLED — \
                             PEER_FEEDER now serves downstream replicas from \
                             its own WAL via the SAME FeederRunner + \
                             EnvironmentLogScanner mechanism the master uses \
                             (JE Feeder + MasterFeederSource)",
                            self.config.node_name.as_str(),
                        );
                    } else {
                        log::warn!(
                            "Node '{}': cascade_feeding set but no TCP \
                             dispatcher; downstream replicas cannot connect",
                            self.config.node_name.as_str(),
                        );
                    }
                }

                // Resolve the master's socket address from the GroupService.
                // The result is `None` both when no node named `master_name`
                // is registered and when its "host:port" string does not
                // parse as a `SocketAddr`.
                let master_addr_opt: Option<SocketAddr> = self
                    .group_service
                    .get_all_nodes()
                    .iter()
                    .find(|n| n.name == master_name)
                    .and_then(|info| {
                        format!("{}:{}", info.host, info.port)
                            .parse::<SocketAddr>()
                            .ok()
                    });

                // Owned copies of everything the spawned thread needs; the
                // closure below is `move` and cannot borrow from `self`.
                let node_name = self.config.node_name.clone();
                let master = master_name.to_string();
                let vlsn_index_clone = Arc::clone(&vlsn_index);
                // Live shutdown flag (shared Arc): the receive loop polls it
                // so `close()` can break the blocking upstream receive and
                // join this thread — vital for a mid-tier replica in a chain
                // that is closed before its upstream feeder.
                let shutdown = Arc::clone(&self.io_shutdown);
                // Wave 9-A fix 2: capture a Weak<Self> so the I/O thread
                // can call `bootstrap_via_dispatcher` automatically when
                // the master signals `NeedsRestore`.  When the env was
                // never registered with `init_self_weak` (raw
                // `Arc::new(Self::new(...))` without going through
                // `open()` or the test harness), the weak ref is `None`
                // and we fall back to operator-driven bootstrap.
                let self_weak: Option<Weak<Self>> =
                    self.self_weak.get().cloned();

                // REP-7 (B): clone the live EnvironmentImpl into the replica
                // thread so the writer can drive a ReplicaReplay that applies
                // each streamed entry to the live in-memory tree.
                let env_for_replay = Arc::clone(&env);

                // REP-10 (C): build the ReplicaReplay HERE (not inside the
                // closure) so we can publish its REP-7 `last_applied_vlsn`
                // handle to a ConsistencyTracker BEFORE the thread starts
                // streaming.  A read on this replica then waits on the same
                // handle the replay thread advances.  Port of
                // RepImpl.getConsistency / Replica.getConsistencyTracker.
                let replay = noxu_dbi::ReplicaReplay::new(env_for_replay);
                let tracker = crate::ConsistencyTracker::new(
                    replay.last_applied_vlsn_handle(),
                );
                *self.consistency_tracker.lock().unwrap() = Some(tracker);

                let handle = std::thread::Builder::new()
                    .name(format!("noxu-replica-{}", node_name))
                    .spawn(move || {
                        // REP-7 (B): wire the live replay-apply path so reads
                        // on the replica see replicated data without a
                        // restart.  JE: the replica writes each entry to its
                        // log, then Replay.replayEntry applies it to the tree.
                        let mut writer = EnvironmentLogWriter::with_replay(
                            log_mgr,
                            vlsn_index_clone,
                            replay,
                        );

                        // Without a resolvable master address this thread
                        // has nothing to connect to: log and exit.  The
                        // writer built above is dropped unused in that case.
                        let Some(addr) = master_addr_opt else {
                            log::warn!(
                                "noxu-replica-{}: master '{}' address not in RepGroup; \
                                 waiting for TCP dispatcher connection",
                                node_name, master,
                            );
                            return;
                        };

                        // Catch-up loop: catch up, observe NeedsRestore,
                        // optionally auto-bootstrap, then retry.  We cap
                        // the number of auto-bootstrap attempts at
                        // MAX_AUTO_BOOTSTRAP_ATTEMPTS (small) so a
                        // misbehaving master does not loop forever
                        // consuming network bandwidth.
                        const MAX_AUTO_BOOTSTRAP_ATTEMPTS: u32 = 2;
                        let mut attempts: u32 = 0;
                        loop {
                            // Observe close before (re)connecting so a
                            // shutdown between catch-up attempts exits
                            // promptly.
                            if shutdown.load(Ordering::SeqCst) {
                                return;
                            }
                            log::info!(
                                "noxu-replica-{}: connecting to master '{}' at {}",
                                node_name, master, addr,
                            );
                            // NOTE(review): the literal `0` is presumably the
                            // VLSN to start streaming from — confirm against
                            // `catch_up_from_peer_until`.
                            match crate::stream::peer_feeder::catch_up_from_peer_until(
                                addr, 0, &mut writer, &shutdown,
                            ) {
                                // Catch-up finished: the thread's work is
                                // done and it exits.
                                Ok(true) => {
                                    log::info!(
                                        "noxu-replica-{}: catch-up complete from '{}'",
                                        node_name, master,
                                    );
                                    return;
                                }
                                Ok(false) => {
                                    // F2/F4: master signals NeedsRestore.
                                    // Wave 9-A fix 2: if a Weak<Self> was
                                    // plumbed in, upgrade it and call
                                    // `bootstrap_via_dispatcher` ourselves
                                    // so the replica auto-bootstraps and
                                    // resumes catch-up without operator
                                    // intervention.
                                    log::warn!(
                                        "noxu-replica-{}: master '{}' requires restore",
                                        node_name, master,
                                    );
                                    // Count this restore request; once the
                                    // count exceeds the cap, give up.
                                    attempts += 1;
                                    if attempts > MAX_AUTO_BOOTSTRAP_ATTEMPTS {
                                        log::error!(
                                            "noxu-replica-{}: exceeded \
                                             auto-bootstrap attempts ({}); giving up",
                                            node_name,
                                            MAX_AUTO_BOOTSTRAP_ATTEMPTS,
                                        );
                                        return;
                                    }
                                    let env_arc = match self_weak
                                        .as_ref()
                                        .and_then(Weak::upgrade)
                                    {
                                        Some(e) => e,
                                        None => {
                                            // No back-ref or env dropped:
                                            // fall back to operator-driven
                                            // bootstrap and exit cleanly.
                                            log::warn!(
                                                "noxu-replica-{}: no back-reference \
                                                 available; operator must call \
                                                 bootstrap_via_dispatcher manually",
                                                node_name,
                                            );
                                            return;
                                        }
                                    };
                                    // Do not start a bootstrap on an
                                    // environment that has been closed.
                                    if env_arc.is_shutdown() {
                                        return;
                                    }
                                    log::info!(
                                        "noxu-replica-{}: auto-bootstrapping via \
                                         dispatcher from '{}' (attempt {})",
                                        node_name, master, attempts,
                                    );
                                    match env_arc
                                        .bootstrap_via_dispatcher(&master)
                                    {
                                        Ok(()) => {
                                            log::info!(
                                                "noxu-replica-{}: auto-bootstrap \
                                                 succeeded; resuming catch-up",
                                                node_name,
                                            );
                                            // Drop the strong ref before
                                            // re-entering catch-up so we
                                            // do not keep the env alive
                                            // longer than necessary.
                                            drop(env_arc);
                                            continue;
                                        }
                                        Err(e) => {
                                            log::error!(
                                                "noxu-replica-{}: auto-bootstrap \
                                                 failed: {}",
                                                node_name, e,
                                            );
                                            return;
                                        }
                                    }
                                }
                                Err(e) => {
                                    // Errors observed after shutdown was
                                    // requested are not logged; the thread
                                    // exits either way.
                                    if !shutdown.load(Ordering::SeqCst) {
                                        log::error!(
                                            "noxu-replica-{}: error from master '{}': {e}",
                                            node_name, master,
                                        );
                                    }
                                    return;
                                }
                            }
                        }
                    })
                    .expect("failed to spawn noxu-replica thread");

                // Keep the join handle in `io_threads` alongside the other
                // I/O thread handles.
                self.io_threads.lock().unwrap().push(handle);

                log::debug!(
                    "Node '{}': replica receive thread started for master '{}'",
                    self.config.node_name.as_str(),
                    master_name,
                );
            } else {
                log::warn!(
                    "Node '{}': no LogManager available (read-only env?); \
                     replica I/O loop not started",
                    self.config.node_name.as_str(),
                );
            }
        }
        // -------------------------------------------------------------------

        // Notify listeners
        self.notify_listeners(old_state, NodeState::Replica);

        log::info!(
            "Node '{}' became replica of master '{}'",
            self.config.node_name.as_str(),
            master_name
        );
        Ok(())
    }

    /// Initiate a master transfer to the target node.
    ///
    /// Hands mastership from this node to `config.target_node`: the target is
    /// signalled to become master at a new term (current observed term + 1),
    /// every other registered peer is told to re-target, and finally this
    /// node demotes itself to a replica of the target.
    ///
    /// Notifying the other peers is best-effort: a peer whose address cannot
    /// be parsed, that rejects the message, or that cannot be reached is
    /// logged at warn level and skipped; it does not fail the transfer.
    ///
    /// # Errors
    ///
    /// - `RepError::StateError` if the environment is closed, or if the
    ///   target rejects the transfer.
    /// - `RepError::InvalidState` if this node is not currently the master.
    /// - `RepError::ConfigError` if the target is not registered in the group
    ///   or its `host:port` does not parse as a socket address.
    /// - `RepError::NetworkError` if the target cannot be signalled.
    /// - Any error returned by `become_replica` during self-demotion.
    pub fn transfer_master(&self, config: MasterTransferConfig) -> Result<()> {
        if self.is_shutdown() {
            return Err(RepError::StateError(
                "Cannot transfer master: environment is closed".to_string(),
            ));
        }

        if !self.is_master() {
            return Err(RepError::InvalidState(
                "Master transfer can only be initiated on the master node"
                    .to_string(),
            ));
        }

        log::info!(
            "Node '{}' initiating master transfer to '{}'",
            self.config.node_name.as_str(),
            config.target_node,
        );

        // Closes finding F7 of the 2026 review.
        //
        // Steps:
        //   1. Locate the target's address.
        //   2. Compute the new term (current observed term + 1).
        //   3. Send TRANSFER_MASTER to the target — it will become master.
        //   4. Send TRANSFER_MASTER (with the same term + new master name) to
        //      every other peer so they re-target.
        //   5. Demote self to Replica of the target.
        //
        // Step 4 is best-effort: a peer that doesn't ack is logged and
        // skipped.  The election driver will reconcile any divergence
        // on the next election round.

        // Step 1: locate the target's address.
        let target_addr = self
            .group_service
            .get_all_nodes()
            .into_iter()
            .find(|n| n.name == config.target_node)
            .and_then(|n| {
                format!("{}:{}", n.host, n.port)
                    .parse::<std::net::SocketAddr>()
                    .ok()
            })
            .ok_or_else(|| {
                RepError::ConfigError(format!(
                    "transfer_master: target '{}' not registered or has bad address",
                    config.target_node
                ))
            })?;

        // Step 2: compute the new term; saturate rather than overflow.
        let new_term = self.master_tracker.get_term().saturating_add(1);

        // Step 3: tell the target to become master at the new term.  A
        // failure or rejection here aborts the transfer before any other
        // peer has been told to re-target.
        let target_ack = crate::group_admin::send_transfer_master(
            target_addr,
            &config.target_node,
            new_term,
        )
        .map_err(|e| {
            RepError::NetworkError(format!(
                "transfer_master: failed to signal target '{}': {}",
                config.target_node, e
            ))
        })?;
        if !target_ack {
            return Err(RepError::StateError(format!(
                "transfer_master: target '{}' rejected the transfer",
                config.target_node
            )));
        }

        // Step 4: inform all other peers (best-effort).  Every failure mode
        // is logged so an operator can see which peers were not re-targeted.
        for peer in self.group_service.get_all_nodes() {
            if peer.name == self.config.node_name
                || peer.name == config.target_node
            {
                continue;
            }
            let addr = match format!("{}:{}", peer.host, peer.port)
                .parse::<std::net::SocketAddr>()
            {
                Ok(addr) => addr,
                Err(e) => {
                    log::warn!(
                        "transfer_master: skipping peer '{}': bad address: {}",
                        peer.name,
                        e,
                    );
                    continue;
                }
            };
            match crate::group_admin::send_transfer_master(
                addr,
                &config.target_node,
                new_term,
            ) {
                Ok(true) => {}
                Ok(false) => log::warn!(
                    "transfer_master: peer '{}' did not acknowledge new master '{}'",
                    peer.name,
                    config.target_node,
                ),
                Err(e) => log::warn!(
                    "transfer_master: failed to notify peer '{}': {}",
                    peer.name,
                    e,
                ),
            }
        }

        // Step 5: demote self to Replica of the new master.
        self.become_replica(&config.target_node)?;

        log::info!(
            "Node '{}' transferred master to '{}' at term {}",
            self.config.node_name.as_str(),
            config.target_node,
            new_term,
        );
        Ok(())
    }

    /// Record where a VLSN lives in the log.
    ///
    /// Adds a mapping from `vlsn` to the log position
    /// (`file_number`, `file_offset`) in this node's VLSN index. Intended for
    /// the master, after it has written a replicated log entry.
    pub fn register_vlsn(&self, vlsn: u64, file_number: u32, file_offset: u32) {
        let index = &self.vlsn_index;
        index.register(vlsn, file_number, file_offset);
    }

    /// Record a VLSN→LSN mapping together with the entry's `LogEntryType`.
    ///
    /// Same as `register_vlsn`, but the entry type is passed through to the
    /// VLSN index so that `lastSync` / `lastTxnEnd` advance (JE
    /// `VLSNRange.getUpdateForNewMapping`). Used by the syncup driver/tests
    /// that apply VLSN-tagged entries to a real log and need the sync/commit
    /// boundaries to track the stream.
    pub fn register_vlsn_typed(
        &self,
        vlsn: u64,
        file_number: u32,
        file_offset: u32,
        entry_type: noxu_log::LogEntryType,
    ) {
        let index = &self.vlsn_index;
        index.register_with_type(vlsn, file_number, file_offset, entry_type);
    }

    /// Publish a log entry written on this node to replicas.
    ///
    /// Closes finding F9 of the 2026 review.
    ///
    /// Performs three steps, in this order:
    ///
    /// 1. Registers `vlsn` → (`file_number`, `file_offset`) in the VLSN
    ///    index. A recognised `entry_type` byte is registered with its
    ///    `LogEntryType`; an unrecognised byte falls back to the untyped
    ///    registration.
    /// 2. Pushes a copy of `data` into the shared `peer_scanner`, which
    ///    serves replicas pulling through the `PEER_FEEDER` service.
    /// 3. Pushes a copy of `data` into every per-replica feeder queue.
    ///
    /// The method is meant for the master. It performs the same work on a
    /// non-master, and additionally emits a trace-level log line so such
    /// calls can be diagnosed.
    ///
    /// # Panics
    ///
    /// Panics if acquiring the read lock on `feeder_queues` fails.
    pub fn replicate_entry(
        &self,
        vlsn: u64,
        file_number: u32,
        file_offset: u32,
        entry_type: u8,
        data: Vec<u8>,
    ) {
        // Step 1: VLSN -> LSN registration, typed when the byte is known so
        // lastSync / lastTxnEnd advance (REP-5; JE
        // VLSNRange.getUpdateForNewMapping).
        let index = &self.vlsn_index;
        if let Some(known_type) =
            noxu_log::LogEntryType::from_type_num(entry_type)
        {
            index.register_with_type(
                vlsn,
                file_number,
                file_offset,
                known_type,
            );
        } else {
            index.register(vlsn, file_number, file_offset);
        }

        // Step 2 (pull path): the shared scanner behind PeerFeederService.
        self.peer_scanner.push(vlsn, entry_type, data.clone());

        // Step 3 (push path, C-C2): one copy per registered replica queue.
        // The read guard lives only for this statement.
        self.feeder_queues.read().unwrap().values().for_each(|queue| {
            queue.push(vlsn, entry_type, data.clone());
        });

        if self.is_master() {
            return;
        }
        log::trace!(
            "replicate_entry called on non-master node '{}': vlsn={}, type={}",
            self.config.node_name,
            vlsn,
            entry_type,
        );
    }

    /// Accept a replicated entry received from the master (replica side).
    ///
    /// `data` is the wire-encoded log-record payload. This method does not
    /// write the payload to a local log itself. It:
    ///
    /// 1. registers `vlsn` in the VLSN index with a placeholder position
    ///    (file 0, offset 0), typed when `entry_type` is a recognised
    ///    `LogEntryType` byte;
    /// 2. forwards a copy of the payload into the in-memory peer scanner so
    ///    downstream replicas attached to the `PEER_FEEDER` service can
    ///    re-stream it;
    /// 3. forwards a copy into every per-replica feeder queue.
    ///
    /// See the 2026 review finding #26 (medium) for the
    /// `with_environment`-required local-apply path.
    ///
    /// # Errors
    ///
    /// Returns `RepError::StateError` if the environment has been closed.
    ///
    /// # Panics
    ///
    /// Panics if acquiring the read lock on `feeder_queues` fails.
    pub fn apply_entry(
        &self,
        vlsn: u64,
        entry_type: u8,
        data: Vec<u8>,
    ) -> Result<()> {
        if self.is_shutdown() {
            let reason = "Cannot apply entry: environment is closed";
            return Err(RepError::StateError(reason.to_string()));
        }

        // Typed registration lets lastSync/lastTxnEnd advance (REP-5; JE
        // VLSNRange.getUpdateForNewMapping); unknown bytes only extend.
        let index = &self.vlsn_index;
        if let Some(known_type) =
            noxu_log::LogEntryType::from_type_num(entry_type)
        {
            index.register_with_type(vlsn, 0, 0, known_type);
        } else {
            index.register(vlsn, 0, 0);
        }

        // Pull path: make the entry visible to PEER_FEEDER clients.
        self.peer_scanner.push(vlsn, entry_type, data.clone());

        // C-C2 push path: one copy per registered replica queue. The read
        // guard lives only for this statement.
        self.feeder_queues.read().unwrap().values().for_each(|queue| {
            queue.push(vlsn, entry_type, data.clone());
        });

        log::trace!(
            "Applied replicated entry: vlsn={}, type={}",
            vlsn,
            entry_type
        );
        Ok(())
    }

    /// Record that `replica_name` has acknowledged up to `vlsn` (master side).
    ///
    /// The ack is applied in four steps:
    ///
    /// 1. If the replica is a known, electable member of the group, the ack
    ///    is recorded in the `AckTracker`. Acks from unknown or non-electable
    ///    nodes are not recorded there (JE
    ///    `DurabilityQuorum.replicaAcksQualify`).
    /// 2. The first feeder whose replica name matches has its acked VLSN
    ///    updated, regardless of electability.
    /// 3. The DTVLSN is recomputed from feeder progress.
    /// 4. Waiters on the `AckTracker` are notified unconditionally.
    pub fn record_ack(&self, vlsn: u64, replica_name: &str) {
        // Step 1: only electable replicas count toward the durability
        // quorum; a node missing from the group view is treated as not
        // qualifying.
        let counts_toward_quorum = self
            .get_rep_group()
            .get_node(replica_name)
            .map_or(false, |node| node.node_type().is_electable());
        if counts_toward_quorum {
            self.ack_tracker.record_ack(vlsn, replica_name);
        }

        // Step 2 (REP-9 Part 1): advance the matching feeder's acked
        // high-water mark, which `update_dtvlsn_from_feeders` reads. This is
        // done for any replica; the electable filter is reapplied when the
        // DTVLSN/quorum is computed.
        if let Some(matching) = self
            .feeders
            .read()
            .iter()
            .find(|feeder| feeder.get_replica_name() == replica_name)
        {
            matching.record_ack(vlsn);
        }

        // Step 3: every ack may move the DTVLSN forward.
        self.update_dtvlsn_from_feeders();

        // Step 4 (REP-9): committers parked in `await_replica_acks` wait on
        // the feeder high-water count rather than an exact-VLSN
        // registration, so they must be woken on every ack.
        self.ack_tracker.notify_waiters();
    }

    /// Returns the current Durable Transaction VLSN (D7, JE RepNode.getDTVLSN).
    /// The highest VLSN replicated to a majority of electable replicas; 0 if
    /// none yet. Used by the election ranking so the most-durable node wins.
    pub fn get_dtvlsn(&self) -> u64 {
        self.dtvlsn.load(std::sync::atomic::Ordering::Acquire)
    }

    /// Advance the DTVLSN to `candidate` if it is greater (JE
    /// `RepNode.updateDTVLSN` — an `AtomicLongMax.updateMax`).
    ///
    /// The DTVLSN can only move forward: a `candidate` at or below the
    /// current value leaves it unchanged.
    ///
    /// Returns the resulting (possibly unchanged) value, i.e. the larger of
    /// `candidate` and the value observed at the time of the update.
    pub fn update_dtvlsn(&self, candidate: u64) -> u64 {
        use std::sync::atomic::Ordering;
        // `fetch_max` is the standard atomic "update to maximum" operation;
        // it stores max(previous, candidate) and returns the previous value,
        // replacing a hand-written compare-exchange retry loop.
        let previous = self.dtvlsn.fetch_max(candidate, Ordering::AcqRel);
        previous.max(candidate)
    }

    /// Set the DTVLSN from the replication stream (JE `RepNode.setDTVLSN`).
    ///
    /// Intended for the replica, which maintains the DTVLSN from
    /// commit/abort records. The value is applied through `update_dtvlsn`,
    /// so it is advance-only: an out-of-order or stale record cannot move
    /// the DTVLSN backward.
    pub fn set_dtvlsn(&self, vlsn: u64) {
        // The resulting value is of no interest to the caller here.
        let _ = self.update_dtvlsn(vlsn);
    }

    /// Master-side DTVLSN computation (D7, JE `FeederManager.updateDTVLSN`).
    ///
    /// Does nothing unless this node is the master. Otherwise it walks the
    /// feeders, considering only those whose replica is electable and whose
    /// acked VLSN is above the current DTVLSN, and tracks the minimum acked
    /// VLSN seen so far. As soon as the number of such feeders reaches the
    /// SIMPLE_MAJORITY ack count, the DTVLSN is advanced to that minimum.
    ///
    /// When the required ack count is zero (no electable peers), the DTVLSN
    /// is advanced to this node's current VLSN instead.
    fn update_dtvlsn_from_feeders(&self) {
        if !self.is_master() {
            return;
        }
        let floor = self.get_dtvlsn();

        // Required ack count for SIMPLE_MAJORITY over the electable group:
        // electable peers plus one for this master, halved (rounding down).
        let group = self.get_rep_group();
        let peer_total: u32 = group
            .get_nodes()
            .iter()
            .filter(|node| {
                node.node_type == crate::node_type::NodeType::Electable
            })
            .count() as u32;
        let needed_acks = (peer_total + 1) / 2;
        if needed_acks == 0 {
            // No peer acks are required: the master's own log is durable up
            // to its latest VLSN.
            self.update_dtvlsn(self.get_current_vlsn());
            return;
        }

        let mut lowest = u64::MAX;
        let mut seen: u32 = 0;
        for feeder in self.feeders.read().iter() {
            // replicaAcksQualify: only electable feeders count (D6); a
            // replica missing from the group view does not qualify.
            let electable = group
                .get_node(&feeder.get_replica_name())
                .map_or(false, |node| {
                    node.node_type == crate::node_type::NodeType::Electable
                });
            if electable {
                let acked = feeder.get_acked_vlsn();
                // Feeders at or below the current DTVLSN cannot advance it.
                if acked > floor {
                    lowest = lowest.min(acked);
                    seen += 1;
                    if seen >= needed_acks {
                        // Enough electable replicas hold >= lowest: durable.
                        self.update_dtvlsn(lowest);
                        return;
                    }
                }
            }
        }
        // Not enough qualifying feeders: DTVLSN unchanged.
    }

    /// REP-9: number of qualifying feeders that have acked `commit_vlsn`.
    ///
    /// A feeder is counted when its replica is electable (D6, JE
    /// `DurabilityQuorum.replicaAcksQualify`), its acked VLSN is non-zero,
    /// and that VLSN is `>= commit_vlsn`. Rust counterpart of JE
    /// `FeederManager.getNumCurrentAckFeeders(commitVLSN)`; the durability
    /// quorum is satisfied when this count reaches the required ack count.
    fn count_ack_feeders_ge(&self, commit_vlsn: u64) -> u32 {
        let group = self.get_rep_group();
        let satisfied = self
            .feeders
            .read()
            .iter()
            .filter(|feeder| {
                let electable = group
                    .get_node(&feeder.get_replica_name())
                    .map_or(false, |node| {
                        node.node_type == crate::node_type::NodeType::Electable
                    });
                // An acked VLSN of 0 is the "no ack yet" sentinel and must
                // never satisfy a commit, even when `commit_vlsn` is 0.
                let high_water = feeder.get_acked_vlsn();
                electable && high_water > 0 && high_water >= commit_vlsn
            })
            .count();
        satisfied as u32
    }

    /// Register a state change listener.
    ///
    /// The listener receives asynchronous replication node state change
    /// events. Each call appends to the list of listeners; it does not
    /// replace previously registered ones.
    ///
    /// Before it is added, the listener is called back once through
    /// `on_state_change` with an event whose old and new states are both the
    /// node's current state, so the application learns the state in effect
    /// at registration time.
    pub fn set_state_change_listener(
        &self,
        listener: Arc<dyn StateChangeListener>,
    ) {
        // Initial callback: report the current state as both old and new.
        let state_now = self.node_state.get_state();
        listener.on_state_change(StateChangeEvent::new(
            state_now,
            state_now,
            self.get_master_name(),
        ));

        // Only then add it to the registered listeners.
        self.listeners.write().push(listener);
    }

    /// Close the replicated environment.
    ///
    /// Closes this handle and releases its resources: feeder state is
    /// dropped, feeder channels are closed, the I/O threads are signalled
    /// and joined, a final VLSN index snapshot is persisted when `env_home`
    /// is configured, and the service dispatcher is stopped.  The node
    /// ceases participation in the replication group. If the node was
    /// currently the master, the rest of the group will hold an election.
    ///
    /// The call is idempotent: only the first caller performs the teardown;
    /// later calls return `Ok(())` immediately.
    ///
    /// The ReplicatedEnvironment should not be closed while any other type of
    /// handle that refers to it is not yet closed.
    ///
    /// # Errors
    ///
    /// Every return path of this method currently yields `Ok(())`; failures
    /// of individual teardown steps are logged rather than propagated.
    pub fn close(&self) -> Result<()> {
        // `swap` returns the previous flag value, so exactly one caller
        // observes `false` and proceeds with the teardown.
        if self.shutdown.swap(true, Ordering::SeqCst) {
            // Already closed
            return Ok(());
        }

        let old_state = self.node_state.get_state();

        // Transition to Shutdown state. The state machine allows this from
        // any non-Shutdown state.
        let _ = self.node_state.transition_to(NodeState::Shutdown);

        // Notify listeners of the shutdown
        self.notify_listeners(old_state, NodeState::Shutdown);

        // Clear feeders
        {
            let mut feeders = self.feeders.write();
            feeders.clear();
        }

        // C-C2: close all registered feeder channels so FeederRunner threads
        // observe ChannelClosed and exit their run() loops cleanly.
        {
            let channels = self.feeder_channels.lock().unwrap();
            for (name, ch) in channels.iter() {
                if let Err(e) = ch.close() {
                    log::debug!(
                        "close: feeder channel for '{}' already closed: {}",
                        name,
                        e
                    );
                }
            }
        }
        // Drop all active runners and queues so their Arcs release.
        self.active_feeder_runners.lock().unwrap().clear();
        self.feeder_queues.write().unwrap().clear();

        // Signal and join all I/O threads spawned by become_master /
        // become_replica / start_vlsn_persistence_daemon.  The vlsn-flush
        // thread does a final flush on its way out so a clean close is
        // recoverable.  Closes finding F11.
        self.io_shutdown.store(true, Ordering::SeqCst);
        {
            let mut threads = self.io_threads.lock().unwrap();
            for handle in threads.drain(..) {
                // `join` only fails when the thread panicked.  Keep tearing
                // down, but leave a trace instead of discarding the failure.
                if handle.join().is_err() {
                    log::warn!(
                        "close: an I/O thread of node '{}' panicked before \
                         it could be joined",
                        self.config.node_name.as_str(),
                    );
                }
            }
        }

        // Belt-and-braces: even when no daemon is running (e.g.
        // `ReplicatedEnvironment::new` without `open`), persist a final
        // snapshot if env_home is configured.
        if let Some(ref home) = self.config.env_home
            && let Err(e) =
                crate::vlsn::persist::flush_to_disk(&self.vlsn_index, home)
        {
            log::warn!(
                "close: failed to persist VLSN index to {}: {}",
                home.display(),
                e
            );
        }

        // Stop the service dispatcher, if one was started for this node.
        if let Some(ref dispatcher) = self.tcp_dispatcher {
            dispatcher.stop();
            let kind = if dispatcher.is_tls() { "TLS" } else { "TCP" };
            log::debug!(
                "Node '{}' {} service dispatcher stopped",
                self.config.node_name.as_str(),
                kind,
            );
        }

        log::info!(
            "Replicated environment '{}' in group '{}' closed",
            self.config.node_name.as_str(),
            self.config.group_name.as_str()
        );

        Ok(())
    }

    /// Close this handle and shut down the Replication Group by forcing all
    /// active Replicas to exit.
    ///
    /// This method must be invoked on the node that's currently the Master
    /// after all other outstanding handles have been closed.
    ///
    /// `replica_shutdown_timeout_ms` is the budget for the whole call,
    /// measured from entry:
    ///
    /// 1. **Catch-up phase** — when push-feeder threads are active
    ///    (registered via [`Self::register_feeder_channel`]), the master
    ///    waits up to half of the budget, shared across all FeederRunner
    ///    replicas, for them to acknowledge the master's last VLSN.
    ///    Replicas that do not catch up in time are logged with a warning;
    ///    the master proceeds regardless.  This closes finding M-4 of the
    ///    v3.x production-readiness review.
    /// 2. **Send phase** — `SHUTDOWN_GROUP` is sent to every other known
    ///    peer until the overall deadline is reached; remaining peers are
    ///    then skipped.  The deadline is checked between peers only.
    ///
    /// Replicas that are not fed via a registered channel (pull-based
    /// `PeerFeederService` path) are sent `SHUTDOWN_GROUP` without a
    /// VLSN-level catch-up wait — that wait requires per-replica ack tracking
    /// which the pull path does not yet provide.
    ///
    /// Peer addresses given as IP literals are used directly; host names are
    /// resolved through the system resolver and the first address is used.
    ///
    /// # Errors
    ///
    /// Returns `RepError::InvalidState` when this node is not the master.
    /// Otherwise returns whatever the final `close()` returns; unreachable
    /// or rejecting peers are logged, not reported as errors.
    pub fn shutdown_group(
        &self,
        replica_shutdown_timeout_ms: u64,
    ) -> Result<()> {
        if !self.is_master() {
            return Err(RepError::InvalidState(
                "shutdownGroup must be invoked on the master".to_string(),
            ));
        }

        // Anchor the overall deadline at entry so that the catch-up phase
        // and the send phase together stay within the caller's budget.
        let deadline = std::time::Instant::now()
            + Duration::from_millis(replica_shutdown_timeout_ms);

        log::info!(
            "Node '{}' shutting down replication group '{}' (replica_timeout={}ms)",
            self.config.node_name.as_str(),
            self.config.group_name.as_str(),
            replica_shutdown_timeout_ms,
        );

        // M-4: Wait for active FeederRunner replicas to ack the master's
        // current VLSN before sending SHUTDOWN_GROUP.  We allow up to half
        // the overall timeout for the catch-up phase so the second half
        // remains for the SHUTDOWN_GROUP send loop.
        let catchup_budget_ms = replica_shutdown_timeout_ms / 2;
        if catchup_budget_ms > 0 {
            let master_vlsn = self.vlsn_index.get_range().last();
            if master_vlsn > 0 {
                // Snapshot the runners so the registry lock is not held
                // while we sleep-poll below.
                let runners: Vec<(String, Arc<FeederRunner>)> = self
                    .active_feeder_runners
                    .lock()
                    .unwrap()
                    .iter()
                    .map(|(k, v)| (k.clone(), Arc::clone(v)))
                    .collect();
                if !runners.is_empty() {
                    // One deadline for the whole phase: once it passes,
                    // every remaining runner is checked exactly once.
                    let catchup_deadline = std::time::Instant::now()
                        + Duration::from_millis(catchup_budget_ms);
                    for (name, runner) in &runners {
                        loop {
                            let acked = runner.known_replica_vlsn();
                            if acked >= master_vlsn
                                || std::time::Instant::now() >= catchup_deadline
                            {
                                if acked < master_vlsn {
                                    log::warn!(
                                        "shutdown_group: replica '{}' acked \
                                         VLSN {} < master VLSN {}; proceeding",
                                        name,
                                        acked,
                                        master_vlsn,
                                    );
                                } else {
                                    log::info!(
                                        "shutdown_group: replica '{}' caught up \
                                         to VLSN {}",
                                        name,
                                        acked,
                                    );
                                }
                                break;
                            }
                            std::thread::sleep(Duration::from_millis(10));
                        }
                    }
                }
            }
        }

        // Closes finding F8 of the 2026 review.
        //
        // Send SHUTDOWN_GROUP to every known peer.  The recipient calls
        // its own `close()` and the per-connection ADMIN handler
        // returns ACK_OK.  Any peer that doesn't ack is logged and the
        // master proceeds.  After signalling every peer, the master
        // closes its own env.
        for peer in self.group_service.get_all_nodes() {
            if peer.name == self.config.node_name {
                continue;
            }
            // Stop contacting further peers once the budget is used up.
            if std::time::Instant::now() >= deadline {
                log::warn!(
                    "shutdown_group: deadline reached; skipping remaining peers"
                );
                break;
            }
            let addr_str = format!("{}:{}", peer.host, peer.port);
            // Fast path: the host is an IP literal.  Otherwise fall back to
            // name resolution so peers registered by host name (for example
            // "localhost") are not skipped as "bad address".
            let resolved = addr_str.parse::<SocketAddr>().or_else(|_| {
                std::net::ToSocketAddrs::to_socket_addrs(addr_str.as_str())
                    .and_then(|mut candidates| {
                        candidates.next().ok_or_else(|| {
                            std::io::Error::new(
                                std::io::ErrorKind::NotFound,
                                "host resolved to no addresses",
                            )
                        })
                    })
            });
            let addr = match resolved {
                Ok(a) => a,
                Err(e) => {
                    log::warn!(
                        "shutdown_group: peer '{}' has bad address {}: {}",
                        peer.name,
                        addr_str,
                        e
                    );
                    continue;
                }
            };
            match crate::group_admin::send_shutdown_group(addr) {
                Ok(true) => log::info!(
                    "shutdown_group: peer '{}' acknowledged",
                    peer.name
                ),
                Ok(false) => log::warn!(
                    "shutdown_group: peer '{}' rejected the request",
                    peer.name
                ),
                Err(e) => log::warn!(
                    "shutdown_group: peer '{}' unreachable: {}",
                    peer.name,
                    e
                ),
            }
        }

        // Master closes itself last.
        self.close()
    }

    /// Check if shutdown is in progress or has completed.
    ///
    /// Reads the `shutdown` flag with `SeqCst` ordering.  `close` sets that
    /// flag as its very first step, so this returns `true` from the moment
    /// `close` has been entered.
    pub fn is_shutdown(&self) -> bool {
        self.shutdown.load(Ordering::SeqCst)
    }

    /// Deliver an `old_state -> new_state` event to every registered
    /// listener.
    ///
    /// The event is built once (the master name is looked up only when at
    /// least one listener exists) and a clone is handed to each listener in
    /// registration order.
    ///
    /// NOTE(review): callbacks run while the listeners read guard is held;
    /// a listener that registers another listener from inside its callback
    /// would need the write guard — confirm this cannot deadlock.
    fn notify_listeners(&self, old_state: NodeState, new_state: NodeState) {
        let registered = self.listeners.read();
        if registered.is_empty() {
            return;
        }

        let template = StateChangeEvent::new(
            old_state,
            new_state,
            self.get_master_name(),
        );
        registered
            .iter()
            .for_each(|target| target.on_state_change(template.clone()));
    }
}

// ---------------------------------------------------------------------------
// F1: ReplicaAckCoordinator impl wires master commits into the AckTracker.
// ---------------------------------------------------------------------------
//
// `noxu_db::Transaction::commit_with_durability` calls
// `await_replica_acks` after the local WAL fsync.  This impl:
//
//   1. Rejects calls on a non-master node with `NotMaster`.
//   2. Rejects calls during shutdown with `Shutdown`.
//   3. Computes the required ack count from `electable_count` and the
//      requested policy.
//   4. Keys the wait on the commit's VLSN (the latest value of
//      `wal_vlsn_counter`), registers the ack requirement on the
//      `AckTracker`, and blocks in `wait_for_predicate` until enough
//      electable feeders have acked a VLSN >= the commit VLSN, the
//      timeout elapses, or shutdown is signalled.
//   5. Cleans up the tracker entry on every exit path.
//
// Closes finding F1 of the 2026 review.
impl ReplicaAckCoordinator for ReplicatedEnvironment {
    fn await_replica_acks(
        &self,
        policy: ReplicaAckPolicyKind,
        timeout: Duration,
    ) -> std::result::Result<u32, AckWaitError> {
        // Fast-path: ReplicaAckPolicy::None never blocks. The trait spec
        // says callers may already short-circuit, but be defensive.
        if matches!(policy, ReplicaAckPolicyKind::None) {
            return Ok(0);
        }

        if self.is_shutdown() {
            return Err(AckWaitError {
                kind: AckWaitErrorKind::Shutdown,
                needed: 0,
                received: 0,
            });
        }

        if !self.is_master() {
            return Err(AckWaitError {
                kind: AckWaitErrorKind::NotMaster,
                needed: 0,
                received: 0,
            });
        }

        // Count electable peers (excluding the master) using the
        // RepGroup view, which counts Arbiters and Electables
        // identically. Only Electable nodes are counted as data
        // replicas able to ack a commit.  The master itself is
        // *implicit*: it is not registered in `group_service` (only
        // peers are), so we add 1 to obtain the total electable
        // count expected by `ReplicaAckPolicyKind::required_acks`.
        let group = self.get_rep_group();
        let electable_peers: u32 = group
            .get_nodes()
            .iter()
            .filter(|n| n.node_type == crate::node_type::NodeType::Electable)
            .count() as u32;
        let electable_count: u32 = electable_peers + 1; // +1 for self/master

        let needed = policy.required_acks(electable_count);
        if needed == 0 {
            // Single-node group, or All with only the master itself.
            return Ok(0);
        }

        // REP-9 Part 2: the commit's VLSN is the key.  The master assigns a
        // VLSN when it logs the TxnCommit (via the shared `wal_vlsn_counter`
        // bumped in `EnvironmentImpl::log_txn_commit`), immediately before
        // this gate runs.  The latest assigned VLSN therefore IS this
        // commit's VLSN (the trait contract: "implementations are responsible
        // for assigning the commit VLSN internally").  We wait until a quorum
        // of qualifying electable replicas have acked a VLSN >= the commit
        // VLSN — faithful to JE `FeederManager.getNumCurrentAckFeeders`, which
        // counts feeders whose `getReplicaTxnEndVLSN() >= commitVLSN` (a
        // high-water `>=` test, NOT an exact-VLSN match).
        //
        // ponytail: reads the global high-water VLSN, so a concurrent later
        // commit can make this gate wait on a slightly higher VLSN than its
        // own. That is strictly SAFE (waiting for >= a newer VLSN never
        // returns early) and only marginally less precise; thread the
        // per-txn VLSN through the trait if exact per-commit granularity is
        // ever needed.
        let commit_vlsn = self.wal_vlsn_counter.load(Ordering::Acquire);

        // Register on the AckTracker too: this is what `record_ack` notifies,
        // so the condvar wakes us as acks land.  The satisfaction decision
        // itself is the high-water feeder count below.
        self.ack_tracker.register(commit_vlsn, needed);

        // Block on the ack condvar until a quorum of electable feeders hold
        // the commit VLSN, the timeout elapses, or shutdown is signalled — no
        // spin-poll (JE FeederTxns.TxnInfo uses a per-transaction
        // CountDownLatch.await; the AckTracker condvar is the shared-mutex
        // equivalent). record_ack notifies us as acks arrive.
        let satisfied = self.ack_tracker.wait_for_predicate(
            timeout,
            || self.count_ack_feeders_ge(commit_vlsn) >= needed,
            || self.is_shutdown(),
        );
        if satisfied {
            self.ack_tracker.cleanup_through(commit_vlsn);
            return Ok(needed);
        }
        if self.is_shutdown() {
            self.ack_tracker.cleanup_through(commit_vlsn);
            return Err(AckWaitError {
                kind: AckWaitErrorKind::Shutdown,
                needed,
                received: 0,
            });
        }
        // Timed out: report the partial ack count (qualifying electable
        // feeders holding the commit VLSN) so the caller can surface
        // InsufficientReplicas.
        let received = self.count_ack_feeders_ge(commit_vlsn);
        self.ack_tracker.cleanup_through(commit_vlsn);
        Err(AckWaitError { kind: AckWaitErrorKind::Timeout, needed, received })
    }

    /// X-3: allocate the next VLSN for a recovered XA commit and register
    /// `lsn` in the VLSN index so feeders can stream the commit.
    ///
    /// Increments off the current latest VLSN so the new VLSN is strictly
    /// monotonically increasing.  In a single-node or master-less environment
    /// (not master) returns 0 (NULL_VLSN — harmless, the default).
    fn alloc_vlsn_for_recovered_commit(&self, lsn: noxu_util::Lsn) -> u64 {
        // Only allocate a VLSN when we are the master; on a replica the
        // recovered XA should have been replicated by the original master.
        if !self.is_master() {
            return 0;
        }
        let next_vlsn = self.vlsn_index.get_latest_vlsn() + 1;
        // A recovered XA commit is a commit log entry; dispatch as TxnCommit
        // so lastTxnEnd/lastSync advance (REP-5).
        self.vlsn_index.register_with_type(
            next_vlsn,
            lsn.file_number(),
            lsn.file_offset(),
            noxu_log::LogEntryType::TxnCommit,
        );
        log::debug!(
            "alloc_vlsn_for_recovered_commit: allocated vlsn={} for lsn={:?}",
            next_vlsn,
            lsn
        );
        next_vlsn
    }

    /// R-3: pre-allocate the next commit VLSN WITHOUT registering in the index.
    ///
    /// The caller writes the `TxnCommit` WAL entry with this VLSN embedded,
    /// then calls `register_recovered_commit_vlsn` with the actual commit LSN.
    /// This two-step approach ensures the WAL entry carries the VLSN so the
    /// X-14 VLSN rebuild on second crash can find it.
    fn pre_alloc_vlsn_for_recovered_commit(&self) -> u64 {
        if !self.is_master() {
            return 0;
        }
        // Peek at the next VLSN without registering.  The actual registration
        // happens in register_recovered_commit_vlsn() after the WAL write.
        self.vlsn_index.get_latest_vlsn() + 1
    }

    /// R-3: register a pre-allocated VLSN in the VLSN index with the actual
    /// commit LSN.  Called after writing the `TxnCommit` WAL entry.
    fn register_recovered_commit_vlsn(
        &self,
        vlsn: u64,
        commit_lsn: noxu_util::Lsn,
    ) {
        if vlsn == 0 || !self.is_master() {
            return;
        }
        // The pre-allocated VLSN is for a TxnCommit WAL entry; dispatch the
        // type so lastTxnEnd/lastSync advance (REP-5).
        self.vlsn_index.register_with_type(
            vlsn,
            commit_lsn.file_number(),
            commit_lsn.file_offset(),
            noxu_log::LogEntryType::TxnCommit,
        );
        log::debug!(
            "register_recovered_commit_vlsn: registered vlsn={} for commit_lsn={:?}",
            vlsn,
            commit_lsn
        );
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering as AtomicOrdering};

    /// Build a `RepConfig` for `node_name` in group `test_group` on host
    /// `localhost` with the fixed port 5001.
    ///
    /// Meant for unit tests that need no real TCP bind: the host name
    /// resolves, but the fixed port might already be in use.  Tests that
    /// open real sockets should use `test_config_port0` instead.
    fn test_config(node_name: &str) -> RepConfig {
        const GROUP: &str = "test_group";
        const HOST: &str = "localhost";
        RepConfig::builder(GROUP, node_name, HOST).node_port(5001).build()
    }

    /// Build a `RepConfig` for `node_name` in group `test_group` on
    /// `127.0.0.1` with port 0, so the OS picks the port at bind time.
    fn test_config_port0(node_name: &str) -> RepConfig {
        const GROUP: &str = "test_group";
        const LOOPBACK: &str = "127.0.0.1";
        RepConfig::builder(GROUP, node_name, LOOPBACK).node_port(0).build()
    }

    #[test]
    fn test_initial_state_is_detached() {
        // A freshly constructed node starts in Detached state, so none of
        // the role predicates may report true.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        assert_eq!(node.get_state(), NodeState::Detached);
        let role_flags =
            [node.is_master(), node.is_replica(), node.is_active()];
        assert_eq!(role_flags, [false, false, false]);
    }

    #[test]
    fn test_become_master() {
        // After promotion the node is Master: active, and not a replica.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        node.become_master(1).unwrap();

        assert_eq!(node.get_state(), NodeState::Master);
        let role_flags =
            [node.is_master(), node.is_replica(), node.is_active()];
        assert_eq!(role_flags, [true, false, true]);
    }

    #[test]
    fn test_become_replica() {
        // Following "node1" makes this node a Replica: active, not master.
        let node = ReplicatedEnvironment::new(test_config("node2")).unwrap();
        node.become_replica("node1").unwrap();

        assert_eq!(node.get_state(), NodeState::Replica);
        let role_flags =
            [node.is_master(), node.is_replica(), node.is_active()];
        assert_eq!(role_flags, [false, true, true]);
    }

    #[test]
    fn test_get_node_name() {
        // The node name reported by the environment is the configured one.
        let config = test_config("my_node");
        let node = ReplicatedEnvironment::new(config).unwrap();
        assert_eq!(node.get_node_name(), "my_node");
    }

    #[test]
    fn test_get_group_name() {
        // `test_config` always places the node in "test_group".
        let config = test_config("node1");
        let node = ReplicatedEnvironment::new(config).unwrap();
        assert_eq!(node.get_group_name(), "test_group");
    }

    #[test]
    fn test_register_vlsn_updates_index() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        // Register VLSNs 1..=3, all in file 0, at increasing offsets.
        for (vlsn, offset) in [(1, 100), (2, 200), (3, 300)] {
            node.register_vlsn(vlsn, 0, offset);
        }

        // The index now spans exactly the registered VLSNs.
        assert_eq!(node.get_current_vlsn(), 3);
        let span = node.get_vlsn_range();
        assert_eq!(span.first(), 1);
        assert_eq!(span.last(), 3);
    }

    #[test]
    fn test_record_ack() {
        use crate::node_type::NodeType;
        use crate::rep_node::RepNode;

        let master = ReplicatedEnvironment::new(test_config("master")).unwrap();
        master.become_master(1).unwrap();

        // Only ELECTABLE replicas count toward durability
        // (replicaAcksQualify), so the acking replica has to be a known
        // electable member of the group.
        let electable_peer = RepNode::new(
            "replica1".to_string(),
            NodeType::Electable,
            "127.0.0.1".to_string(),
            6001,
            2,
        );
        master.add_peer(electable_peer).unwrap();

        master.register_vlsn(1, 0, 100);
        // One ack is required for VLSN 1; a single ack from the electable
        // replica must therefore satisfy it.
        master.get_ack_tracker().register(1, 1);
        master.record_ack(1, "replica1");
        assert!(master.get_ack_tracker().is_satisfied(1));
    }

    #[test]
    fn test_record_ack_from_non_electable_does_not_qualify() {
        use crate::node_type::NodeType;
        use crate::rep_node::RepNode;

        let master = ReplicatedEnvironment::new(test_config("master")).unwrap();
        master.become_master(1).unwrap();

        // A Monitor is not electable, so its ack must not count (JE
        // DurabilityQuorum.replicaAcksQualify).
        let monitor_peer = RepNode::new(
            "monitor1".to_string(),
            NodeType::Monitor,
            "127.0.0.1".to_string(),
            6002,
            3,
        );
        master.add_peer(monitor_peer).unwrap();

        master.register_vlsn(1, 0, 100);
        master.get_ack_tracker().register(1, 1);

        master.record_ack(1, "monitor1");
        assert!(
            !master.get_ack_tracker().is_satisfied(1),
            "non-electable ack must not satisfy durability quorum"
        );

        // A replica the group has never heard of does not qualify either.
        master.record_ack(1, "ghost");
        assert!(!master.get_ack_tracker().is_satisfied(1));
    }

    #[test]
    fn test_authoritative_quorum_met() {
        // Each row: (connected electable replicas, electable total,
        // expected verdict).  The master counts itself, so the check is
        // `connected + 1 >= electable_total / 2 + 1`.
        let cases = [
            // 1-node group (quorum 1): the master alone is authoritative.
            (0, 1, true),
            // 3-node group (quorum 2): an isolated master is the minority.
            (0, 3, false),
            // 3-node group with one connected replica: 1 + 1 >= 2.
            (1, 3, true),
            // 5-node group (quorum 3): two connected replicas are needed.
            (1, 5, false),
            (2, 5, true),
        ];
        for (connected, electable_total, expected) in cases {
            assert_eq!(
                ReplicatedEnvironment::authoritative_quorum_met(
                    connected,
                    electable_total,
                ),
                expected
            );
        }
    }

    #[test]
    fn test_is_authoritative_master_requires_master_role() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        // Before promotion: not a master, hence never authoritative,
        // regardless of connections.
        assert!(!node.is_master());
        assert!(!node.is_authoritative_master());

        // A master of a single-node group (no peers) is authoritative.
        node.become_master(1).unwrap();
        assert!(node.is_authoritative_master());
    }

    #[test]
    fn test_dtvlsn_update_max_advances_only() {
        let node = ReplicatedEnvironment::new(test_config("master")).unwrap();
        assert_eq!(node.get_dtvlsn(), 0);

        // Master path: `update_dtvlsn` returns the resulting DTVLSN and
        // never moves it backward.
        assert_eq!(node.update_dtvlsn(10), 10);
        assert_eq!(node.get_dtvlsn(), 10);
        // Lower candidate: value stays put.
        assert_eq!(node.update_dtvlsn(5), 10);
        assert_eq!(node.get_dtvlsn(), 10);
        // Equal candidate: no-op.
        assert_eq!(node.update_dtvlsn(10), 10);

        // Replica path: `set_dtvlsn` is advance-only as well.
        node.set_dtvlsn(7);
        assert_eq!(node.get_dtvlsn(), 10);
        node.set_dtvlsn(20);
        assert_eq!(node.get_dtvlsn(), 20);
    }

    #[test]
    fn test_dtvlsn_majority_min_across_feeders() {
        use crate::node_type::NodeType;
        use crate::rep_node::RepNode;

        let master = ReplicatedEnvironment::new(test_config("master")).unwrap();
        master.become_master(1).unwrap();

        // Three electable replicas → electable_count = 4 (incl. master) →
        // durable_ack_count = 2.  With the master's self-ack, DTVLSN
        // advances to the min of the 2 highest qualifying feeders that
        // exceed the current DTVLSN.
        for (idx, replica) in ["r1", "r2", "r3"].iter().enumerate() {
            let peer = RepNode::new(
                replica.to_string(),
                NodeType::Electable,
                "127.0.0.1".to_string(),
                6100 + idx as u16,
                (idx + 2) as u32,
            );
            master.add_peer(peer).unwrap();
        }

        // One feeder per replica with a distinct acked VLSN:
        // r1=100, r2=80, r3=50.
        for (replica, acked) in [("r1", 100u64), ("r2", 80), ("r3", 50)] {
            let feeder =
                crate::stream::feeder::Feeder::new(replica.to_string());
            feeder.record_ack(acked);
            master.feeders.write().push(feeder);
        }

        master.update_dtvlsn_from_feeders();

        // r1(100) and r2(80) form the majority (2 of 4), so DTVLSN must
        // reach min(100, 80) = 80; r3=50 is not needed for durability.
        assert!(
            master.get_dtvlsn() >= 80,
            "DTVLSN must reach the majority-min (>=80), got {}",
            master.get_dtvlsn()
        );
    }

    #[test]
    fn test_close_sets_shutdown() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        // Not shut down until `close` is called.
        assert!(!node.is_shutdown());

        node.close().unwrap();

        // Closing flips the shutdown flag and moves the state machine to
        // Shutdown.
        assert!(node.is_shutdown());
        assert_eq!(node.get_state(), NodeState::Shutdown);
    }

    #[test]
    fn test_close_is_idempotent() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        // Closing twice must succeed both times.
        for _ in 0..2 {
            node.close().unwrap();
        }
        assert!(node.is_shutdown());
    }

    #[test]
    fn test_cannot_become_master_when_shutdown() {
        // A closed environment must refuse promotion to master.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        node.close().unwrap();

        assert!(node.become_master(1).is_err());
    }

    #[test]
    fn test_cannot_become_replica_when_shutdown() {
        // A closed environment must refuse to start following a master.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        node.close().unwrap();

        assert!(node.become_replica("master").is_err());
    }

    #[test]
    fn test_cannot_apply_entry_when_shutdown() {
        // A closed environment must reject replicated entries.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        node.close().unwrap();

        let payload = vec![1, 2, 3];
        assert!(node.apply_entry(1, 0, payload).is_err());
    }

    #[test]
    fn test_cannot_transfer_master_when_not_master() {
        // Only the master may start a master transfer; a replica must get
        // an error.
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        node.become_replica("other").unwrap();

        let request = MasterTransferConfig::new(
            "target_node".to_string(),
            Duration::from_secs(30),
        );
        assert!(node.transfer_master(request).is_err());
    }

    #[test]
    fn test_transfer_master_requires_registered_target() {
        // F7: transfer_master is no longer a no-op; it signals the target
        // over TCP with an ADMIN TRANSFER_MASTER message.  A target that
        // was never registered is rejected at the address-resolution step.
        let master = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        master.become_master(1).unwrap();

        let request = MasterTransferConfig::new(
            "unknown_target".to_string(),
            Duration::from_secs(30),
        );
        let outcome = master.transfer_master(request);
        assert!(
            outcome.is_err(),
            "transfer_master to unregistered target must error"
        );
    }

    #[test]
    fn test_apply_entry_registers_vlsn() {
        let replica = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        replica.become_replica("master").unwrap();

        // Apply two consecutive entries; the current VLSN must follow the
        // last one applied.
        for (vlsn, payload) in [(1, vec![1, 2, 3]), (2, vec![4, 5, 6])] {
            replica.apply_entry(vlsn, 0, payload).unwrap();
        }

        assert_eq!(replica.get_current_vlsn(), 2);
    }

    #[test]
    fn test_master_name_tracking() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        // No master is known before any role has been taken.
        assert_eq!(node.get_master_name(), None);

        // Once promoted, the node reports itself as the master.
        node.become_master(1).unwrap();
        assert_eq!(node.get_master_name().as_deref(), Some("node1"));
    }

    #[test]
    fn test_master_to_replica_transition() {
        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        // Start out as master: the node names itself.
        node.become_master(1).unwrap();
        assert_eq!(node.get_master_name().as_deref(), Some("node1"));

        // Master -> Replica is a valid transition; the tracked master name
        // switches to the node being followed.
        node.become_replica("other_master").unwrap();
        assert_eq!(node.get_master_name().as_deref(), Some("other_master"));
        assert!(node.is_replica());
    }

    #[test]
    fn test_state_change_listener_notification() {
        /// Records how many events arrived and the most recent new state.
        struct RecordingListener {
            events_seen: AtomicU32,
            latest_state: noxu_sync::Mutex<Option<NodeState>>,
        }

        impl StateChangeListener for RecordingListener {
            fn on_state_change(&self, event: StateChangeEvent) {
                self.events_seen.fetch_add(1, AtomicOrdering::SeqCst);
                *self.latest_state.lock() = Some(event.new_state);
            }
        }

        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        let recorder = Arc::new(RecordingListener {
            events_seen: AtomicU32::new(0),
            latest_state: noxu_sync::Mutex::new(None),
        });
        let events_so_far =
            |r: &RecordingListener| r.events_seen.load(AtomicOrdering::SeqCst);

        // Registration alone delivers one event describing the current
        // state.
        node.set_state_change_listener(recorder.clone());
        assert_eq!(events_so_far(&recorder), 1);

        // A real transition delivers a second event carrying the new
        // state.
        node.become_master(1).unwrap();
        assert_eq!(events_so_far(&recorder), 2);
        assert_eq!(*recorder.latest_state.lock(), Some(NodeState::Master));
    }

    #[test]
    fn test_close_notifies_listeners() {
        /// Latches once an event with new state `Shutdown` is delivered.
        struct ShutdownProbe {
            saw_shutdown: AtomicBool,
        }

        impl StateChangeListener for ShutdownProbe {
            fn on_state_change(&self, event: StateChangeEvent) {
                if event.new_state == NodeState::Shutdown {
                    self.saw_shutdown.store(true, AtomicOrdering::SeqCst);
                }
            }
        }

        let node = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        let probe = Arc::new(ShutdownProbe {
            saw_shutdown: AtomicBool::new(false),
        });

        // Registration delivers an event for the current (Detached) state,
        // which must not trip the probe.
        node.set_state_change_listener(probe.clone());

        // Become master first so that closing is a meaningful transition.
        node.become_master(1).unwrap();
        assert!(!probe.saw_shutdown.load(AtomicOrdering::SeqCst));

        node.close().unwrap();
        assert!(probe.saw_shutdown.load(AtomicOrdering::SeqCst));
    }

    #[test]
    fn test_shutdown_group_requires_master() {
        // Put the node into Replica state; a group shutdown requested from
        // there must be rejected.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        env.become_replica("other").unwrap();

        assert!(env.shutdown_group(5000).is_err());
    }

    #[test]
    fn test_shutdown_group_as_master() {
        // From Master state the group shutdown must succeed and leave the
        // local environment reporting itself as shut down.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        env.become_master(1).unwrap();

        let outcome = env.shutdown_group(5000);
        assert!(outcome.is_ok());
        assert!(env.is_shutdown());
    }

    #[test]
    fn test_get_config() {
        // The configuration handed to `new` must be readable back through
        // the environment.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        let cfg = env.get_config();
        assert_eq!(cfg.node_name, "node1");
        assert_eq!(cfg.group_name, "test_group");
    }

    #[test]
    fn test_get_stats() {
        // Smoke test only: fetching stats from a freshly built environment
        // must not panic. The returned value is not inspected.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        let _ = env.get_stats();
    }

    // -----------------------------------------------------------------------
    // TCP dispatcher tests (H-5 / H-7)
    // -----------------------------------------------------------------------

    #[test]
    fn test_tcp_dispatcher_starts_on_new() {
        // Port 0 asks the OS for an ephemeral port, so a dispatcher that
        // really started must report a concrete, non-zero port.
        let env =
            ReplicatedEnvironment::new(test_config_port0("tcp_node")).unwrap();
        let bound = env.bound_addr().expect("expected a bound address");
        assert_ne!(bound.port(), 0, "OS should assign a non-zero port");
    }

    #[test]
    fn test_tcp_dispatcher_stops_on_close() {
        let env =
            ReplicatedEnvironment::new(test_config_port0("tcp_node2")).unwrap();

        // A missing dispatcher counts as "not running".
        let dispatcher_running =
            || env.tcp_dispatcher.as_ref().is_some_and(|d| d.is_running());

        // Up immediately after construction.
        assert!(dispatcher_running());

        env.close().unwrap();

        // Closing the environment must bring the dispatcher down.
        assert!(
            !dispatcher_running(),
            "dispatcher should be stopped after close"
        );
    }

    #[test]
    fn test_tcp_dispatcher_accepts_connection() {
        use crate::net::Channel;
        use crate::net::ServiceHandler;
        use crate::net::service_dispatcher::connect_to_service;
        use std::sync::atomic::{AtomicU32, Ordering as AO};
        use std::time::Duration;

        /// Test service that counts how many times it is invoked and echoes
        /// the first message it receives back to the sender.
        struct PingHandler {
            count: AtomicU32,
        }
        impl ServiceHandler for PingHandler {
            fn service_name(&self) -> &str {
                "ping"
            }
            fn handle(&self, ch: Box<dyn Channel>) -> crate::error::Result<()> {
                self.count.fetch_add(1, AO::SeqCst);
                // Echo the first message back. Receive/send failures are
                // deliberately ignored here: the client-side assertions
                // below fail if the echo never arrives.
                if let Ok(Some(msg)) = ch.receive(Duration::from_secs(2)) {
                    let _ = ch.send(&msg);
                }
                Ok(())
            }
        }

        let env =
            ReplicatedEnvironment::new(test_config_port0("tcp_node3")).unwrap();
        let addr = env.bound_addr().expect("dispatcher must be bound");

        // Fail loudly when the dispatcher is absent. Guarding the body with
        // `if let Some(..)` would let the test pass without running a
        // single assertion.
        let disp = env
            .tcp_dispatcher
            .as_ref()
            .expect("TCP dispatcher must be present");

        // Register a ping handler on the running dispatcher.
        let handler = Arc::new(PingHandler { count: AtomicU32::new(0) });
        disp.register("ping", handler.clone());

        // Give the accept thread a moment.
        std::thread::sleep(Duration::from_millis(20));

        // Round-trip one message through the service.
        let client = connect_to_service(addr, "ping").unwrap();
        client.send(b"hello").unwrap();
        let reply = client.receive(Duration::from_secs(2)).unwrap();
        assert_eq!(reply, Some(b"hello".to_vec()));

        // The handler bumps its counter before echoing, so once the reply
        // has arrived exactly one invocation must have been recorded.
        assert_eq!(handler.count.load(AO::SeqCst), 1);

        env.close().unwrap();
    }

    #[test]
    fn test_become_master_auto_transitions_from_detached() {
        // The state machine only permits Detached -> Unknown -> Master, so
        // become_master() is expected to take the intermediate hop itself.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        let before = env.get_state();
        assert_eq!(before, NodeState::Detached);

        env.become_master(1).unwrap();

        let after = env.get_state();
        assert_eq!(after, NodeState::Master);
    }

    #[test]
    fn test_become_replica_auto_transitions_from_detached() {
        // The state machine only permits Detached -> Unknown -> Replica, so
        // become_replica() is expected to take the intermediate hop itself.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        let before = env.get_state();
        assert_eq!(before, NodeState::Detached);

        env.become_replica("master_node").unwrap();

        let after = env.get_state();
        assert_eq!(after, NodeState::Replica);
    }

    #[test]
    fn test_cannot_transfer_master_when_shutdown() {
        // Bring the node up as master, then close it.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();
        env.become_master(1).unwrap();
        env.close().unwrap();

        // A transfer requested on the closed environment must be refused.
        let target = String::from("target");
        let timeout = Duration::from_secs(30);
        let transfer = MasterTransferConfig::new(target, timeout);
        assert!(env.transfer_master(transfer).is_err());
    }

    #[test]
    fn test_full_lifecycle() {
        // Walks one node through Detached -> Master -> Replica -> closed,
        // exercising the VLSN, ack and apply entry points on the way.
        let env = ReplicatedEnvironment::new(test_config("node1")).unwrap();

        // A freshly constructed node is detached.
        assert_eq!(env.get_state(), NodeState::Detached);

        // Promote to master.
        env.become_master(1).unwrap();
        assert!(env.is_master());

        // Register two VLSNs, then record the replica's ack for each.
        for (vlsn, offset) in [(1, 100), (2, 200)] {
            env.register_vlsn(vlsn, 0, offset);
        }
        for vlsn in [1, 2] {
            env.record_ack(vlsn, "replica1");
        }

        // Simulated failover: this node now follows "node2".
        env.become_replica("node2").unwrap();
        assert!(env.is_replica());

        // Apply an entry from the new master.
        env.apply_entry(3, 0, vec![7, 8, 9]).unwrap();

        // Closing must leave the environment shut down.
        env.close().unwrap();
        assert!(env.is_shutdown());
    }

    /// Checks the deferred registration path: when `config.env_home` is not
    /// set at construction, the RESTORE service is only registered once
    /// `with_environment` supplies an environment.
    ///
    /// This mirrors `RepNode.envSetup()`, which registers the restore
    /// handler when the environment is wired into the replicated node.
    #[test]
    fn test_restore_registered_lazily_via_with_environment() {
        use noxu_dbi::EnvironmentImpl;
        use tempfile::TempDir;

        let dir = TempDir::new().expect("temp dir");

        // No env_home in the config, so construction must leave the
        // RESTORE flag unset.
        let rep_env = ReplicatedEnvironment::new(
            RepConfig::builder("test_group", "node1", "127.0.0.1")
                .node_port(0)
                .build(),
        )
        .unwrap();
        let restore_flag = &rep_env.restore_registered;
        assert!(!restore_flag.load(std::sync::atomic::Ordering::SeqCst));

        // Attach a real EnvironmentImpl rooted at the temp dir.
        let env_impl = Arc::new(
            EnvironmentImpl::new(dir.path(), false, false).expect("open env"),
        );
        rep_env.with_environment(env_impl);

        // Attaching the environment must have flipped the flag.
        assert!(restore_flag.load(std::sync::atomic::Ordering::SeqCst));
    }

    /// Checks the eager registration path: with `config.env_home` set at
    /// construction, the RESTORE service is registered by `new` itself
    /// rather than deferred.
    #[test]
    fn test_restore_registered_eagerly_when_env_home_in_config() {
        use tempfile::TempDir;

        let dir = TempDir::new().expect("temp dir");

        let rep_env = ReplicatedEnvironment::new(
            RepConfig::builder("test_group", "node2", "127.0.0.1")
                .node_port(0)
                .env_home(dir.path())
                .build(),
        )
        .unwrap();

        // env_home was supplied up front, so the flag must already be set.
        let restore_flag = &rep_env.restore_registered;
        assert!(restore_flag.load(std::sync::atomic::Ordering::SeqCst));
    }
}