crabka-broker 0.3.6

//! Spawned actor task that owns the only `&mut Log` reference (via the
//! shared `Arc<Mutex<Log>>`) and serializes appends for a single partition.
//!
//! Reads bypass the actor — they take the same mutex briefly. The actor's
//! contribution is: ordered acks back to producers + waking long-poll
//! Fetch consumers via a shared `Notify` after every successful append.

use std::path::PathBuf;
use std::sync::{Arc, Mutex};

use arc_swap::ArcSwap;
use crabka_log::Log;
use tokio::sync::{Notify, mpsc};

use crate::log_dir_status::LogDirRegistry;
use crate::partition::{ProduceJob, SwapOutcome, WriterMessage};
use crate::replica_state::ReplicaState;

/// Inspect a `BrokerError` returned by a partition-writer mutation
/// (`append`, `append_at`, `truncate_to`, `reset_to`, `compact`,
/// `trim_to_offset`) and, if it looks like an underlying storage
/// failure (a `LogError::Io(_)`), mark the partition's owning log dir
/// offline on the broker-wide registry.
///
/// We err on the side of pessimism: any `io::Error` propagated by the
/// log layer is a credible "the disk just went sideways" signal. A
/// false positive (e.g. a transient `ENOENT` from a misconfigured
/// scratch path) costs one partition's availability — KIP-113 fail-over
/// elsewhere on the cluster keeps the topic live. A false negative
/// silently corrupts produce acks, which is the failure mode this
/// whole slice exists to prevent.
fn flag_storage_failure(
    err: &crate::error::BrokerError,
    log_dir: &ArcSwap<PathBuf>,
    log_dir_status: &LogDirRegistry,
) {
    if let crate::error::BrokerError::Log(crabka_log::LogError::Io(io_err)) = err {
        let dir = log_dir.load();
        log_dir_status.mark_offline(&dir, &format!("partition write/fsync failed: {io_err}"));
    }
}

/// Acquire the partition log's mutex, tolerating poisoning.
///
/// A poisoned mutex only records that some other critical section
/// panicked while holding the lock. In this greenfield single-writer
/// model the log data is consistent enough to keep serving afterwards,
/// so the guard is recovered via `PoisonError::into_inner` instead of
/// panicking. The alternative (`expect`) silently kills the writer
/// task (its `JoinHandle` is discarded), taking the whole partition
/// offline.
fn lock_log(log: &Mutex<Log>) -> std::sync::MutexGuard<'_, Log> {
    match log.lock() {
        Ok(guard) => guard,
        // Poisoned: take the guard anyway and keep the partition live.
        Err(poisoned) => poisoned.into_inner(),
    }
}

/// Build a `BrokerError` standing in for a panic inside a
/// `spawn_blocking` storage closure. A panic during `write_all` /
/// `fsync` is a credible "the disk just went sideways" signal, so we
/// model it as a `LogError::Io` — which `flag_storage_failure` then
/// recognizes and uses to mark the owning log dir offline.
fn storage_failure_error(
    context: &str,
    join_err: &tokio::task::JoinError,
) -> crate::error::BrokerError {
    let io = std::io::Error::other(format!("{context}: {join_err}"));
    crate::error::BrokerError::Log(crabka_log::LogError::Io(io))
}

/// Loop on the receive side of the partition's `WriterMessage` channel.
/// Exits when the channel closes (every sender dropped).
#[allow(clippy::too_many_lines)]
pub async fn run(
    log: Arc<Mutex<Log>>,
    log_dir: Arc<ArcSwap<PathBuf>>,
    mut rx: mpsc::Receiver<WriterMessage>,
    append_notify: Arc<Notify>,
    replica_state: Arc<tokio::sync::Mutex<ReplicaState>>,
    hw_advance_notify: Arc<Notify>,
    log_dir_status: LogDirRegistry,
) {
    while let Some(msg) = rx.recv().await {
        match msg {
            WriterMessage::Produce(ProduceJob { mut batch, ack }) => {
                // Broker-side recompression. If the topic's
                // `compression.type` is a concrete codec (i.e. not
                // Kafka's `producer` pass-through), force the batch's
                // attributes to that codec before append — the
                // `RecordBatch::encode` path inside `Log::append`
                // re-compresses the records body according to the
                // attributes we set here.
                let target = lock_log(&log).config_snapshot().compression_type;
                if let Some(target) = target {
                    let current = batch.attributes.compression();
                    if current != target {
                        batch.attributes = batch.attributes.with_compression(target);
                    }
                }
                // Run the blocking `write_all` + `fsync` off the reactor.
                // The loop is a single serial task per partition, so the
                // `.await` on the `JoinHandle` preserves append ordering.
                // We fold the post-append LEO read into the same closure
                // (it needs the lock anyway) and return it alongside the
                // append result.
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    let mut guard = lock_log(&log_for_blocking);
                    let result = guard
                        .append(&mut batch)
                        .map_err(crate::error::BrokerError::from);
                    // LEO is only meaningful on success; read it under the
                    // same lock so the HW recompute below sees this append.
                    let leo = result.as_ref().ok().map(|_| guard.log_end_offset());
                    (result, leo)
                });
                let (result, leo) = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        // A panic inside the closure poisoned/aborted the
                        // append. Treat it as a storage failure for this
                        // message rather than propagating the panic and
                        // killing the writer task.
                        let err = storage_failure_error("append task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                let ok = result.is_ok();
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                // If the receiver dropped, the handler timed out — that's
                // fine, we don't care if the ack is ignored.
                let _ = ack.send(result);
                if ok {
                    append_notify.notify_waiters();
                    // Update HW from the LEO read inside the blocking
                    // closure. The replica_state mutex is tokio::sync so
                    // we .await it cooperatively.
                    let leader_leo = leo.expect("LEO present on successful append");
                    let advanced = {
                        let mut st = replica_state.lock().await;
                        let prev = st.hw;
                        let new = st.recompute_hw_for_leader_append(leader_leo);
                        new > prev
                    };
                    if advanced {
                        hw_advance_notify.notify_waiters();
                    }
                }
            }
            WriterMessage::Replicate { mut batch, ack } => {
                // Replication appends preserve the leader-assigned offset
                // (`batch.base_offset`) — `Log::append_at` rejects with
                // `OffsetMismatch` if it doesn't line up with the local
                // log's end.
                let offset = batch.base_offset;
                // Run the blocking `write_all` + `fsync` off the reactor;
                // serial loop + `.await` preserves ordering.
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    lock_log(&log_for_blocking)
                        .append_at(&mut batch, offset)
                        .map_err(crate::error::BrokerError::from)
                });
                let result = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        let err = storage_failure_error("replicate task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                let ok = result.is_ok();
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                let _ = ack.send(result);
                if ok {
                    append_notify.notify_waiters();
                }
            }
            WriterMessage::Truncate { offset, ack } => {
                // Truncate rewrites segment files + fsyncs — run it off
                // the reactor like the append paths.
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    lock_log(&log_for_blocking)
                        .truncate_to(offset)
                        .map_err(crate::error::BrokerError::from)
                });
                let result = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        let err = storage_failure_error("truncate task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                let _ = ack.send(result);
                // No `append_notify` — truncate doesn't deliver new data.
            }
            WriterMessage::ResetTo { new_base, ack } => {
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    lock_log(&log_for_blocking)
                        .reset_to(new_base)
                        .map_err(crate::error::BrokerError::from)
                });
                let result = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        let err = storage_failure_error("reset_to task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                let _ = ack.send(result);
                // No `append_notify` — reset_to drops data rather than
                // delivering it.
            }
            WriterMessage::TrimToOffset { new_start, ack } => {
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    lock_log(&log_for_blocking)
                        .trim_to_offset(new_start)
                        .map_err(crate::error::BrokerError::from)
                });
                let result = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        let err = storage_failure_error("trim_to_offset task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                let _ = ack.send(result);
                // No `append_notify` — trim drops data rather than producing it.
            }
            WriterMessage::SetLogConfig { config, ack } => {
                lock_log(&log).set_config(config);
                let _ = ack.send(());
            }
            WriterMessage::Compact { ack } => {
                // Compaction rewrites segments + fsyncs — off the reactor.
                let log_for_blocking = log.clone();
                let join = tokio::task::spawn_blocking(move || {
                    lock_log(&log_for_blocking)
                        .compact()
                        .map_err(crate::error::BrokerError::from)
                });
                let result = match join.await {
                    Ok(v) => v,
                    Err(join_err) => {
                        let err = storage_failure_error("compact task panicked", &join_err);
                        flag_storage_failure(&err, &log_dir, &log_dir_status);
                        let _ = ack.send(Err(err));
                        continue;
                    }
                };
                if let Err(ref e) = result {
                    flag_storage_failure(e, &log_dir, &log_dir_status);
                }
                let _ = ack.send(result);
                // No `append_notify` — compaction doesn't produce new
                // records, only consolidates existing ones at the same
                // absolute offsets.
            }
            #[cfg(any(test, feature = "test-helpers"))]
            WriterMessage::TestSetLogStart { new_start, ack } => {
                let result = lock_log(&log)
                    .set_log_start_offset(new_start)
                    .map_err(crate::error::BrokerError::from);
                let _ = ack.send(result);
            }
            WriterMessage::SwapFutureLog {
                target_log_dir,
                future_log,
                future_path,
                target_partition_path,
                ack,
            } => {
                let result = swap_future_log(
                    &log,
                    &log_dir,
                    target_log_dir,
                    &future_log,
                    &future_path,
                    &target_partition_path,
                );
                let _ = ack.send(result);
                // No `append_notify` — swap doesn't deliver new data,
                // and consumers re-read from the swapped `log` against
                // identical offsets.
            }
        }
    }
}

/// KIP-113 intra-broker log-dir swap. Called from the writer task —
/// holds the partition's `log` mutex for the duration of the rename so
/// no other appender sees a half-swapped state.
///
/// The future log MUST be caught up: its LEO == the current log's LEO.
/// If a producer slipped a batch in between the caller's catch-up
/// check and this writer cycle, we report `NotCaughtUp` so the
/// replicator loop drains the lag and retries.
fn swap_future_log(
    log: &Arc<Mutex<Log>>,
    log_dir: &Arc<ArcSwap<PathBuf>>,
    target_log_dir: PathBuf,
    future_log: &Arc<Mutex<Log>>,
    future_path: &std::path::Path,
    target_partition_path: &std::path::Path,
) -> Result<SwapOutcome, crate::error::BrokerError> {
    // Acquire both logs under the writer's serialization and re-check
    // the caught-up invariant. If the future log fell behind between
    // the caller's check and this cycle, refuse the swap and let the
    // replicator catch up.
    let mut log_guard = lock_log(log);
    let config = log_guard.config_snapshot();
    let current_leo = log_guard.log_end_offset();
    let mut future_guard = lock_log(future_log);
    if future_guard.log_end_offset() < current_leo {
        return Ok(SwapOutcome::NotCaughtUp);
    }

    let source_partition_path = log_guard.dir().to_path_buf();

    // Release segment file descriptors on both Logs before mutating
    // the filesystem. `Log::close` consumes the value, so we move
    // both out via `mem::replace` against throwaway Logs anchored to
    // a sacrificial `*.tomb` directory we delete at the end.
    let tomb_dir = future_path.with_extension("crabka-swap-tomb");
    std::fs::create_dir_all(&tomb_dir)?;
    let old_current = std::mem::replace(&mut *log_guard, Log::open(&tomb_dir, config.clone())?);
    old_current.close();
    let old_future = std::mem::replace(&mut *future_guard, Log::open(&tomb_dir, config.clone())?);
    old_future.close();
    drop(future_guard);

    // Atomically promote the future dir into the canonical
    // `<topic>-<partition>` slot under the target log.dir, then
    // remove the source dir. If the rename fails, reopen the source
    // so the partition keeps serving and bubble the error.
    if let Err(e) = std::fs::rename(future_path, target_partition_path) {
        // Best-effort recovery: reopen the original log in the
        // source dir so the partition keeps serving against the
        // pre-swap location.
        match Log::open(&source_partition_path, config) {
            Ok(reopened) => *log_guard = reopened,
            Err(reopen_err) => {
                tracing::error!(
                    error = %reopen_err,
                    "swap_future_log: rename failed AND source reopen failed; \
                     partition is offline until restart"
                );
            }
        }
        let _ = std::fs::remove_dir_all(&tomb_dir);
        return Err(crate::error::BrokerError::from(e));
    }

    if let Err(e) = std::fs::remove_dir_all(&source_partition_path) {
        tracing::warn!(
            source = %source_partition_path.display(),
            error = %e,
            "swap_future_log: failed to remove source partition dir; \
             partition is live at target, source will be cleaned on next restart"
        );
    }
    let _ = std::fs::remove_dir_all(&tomb_dir);

    *log_guard = Log::open(target_partition_path, config)?;
    log_dir.store(Arc::new(target_log_dir));
    Ok(SwapOutcome::Swapped)
}

#[cfg(test)]
mod tests {
    use super::*;
    use assert2::assert;
    use crabka_log::LogConfig;
    use crabka_protocol::records::{Record, RecordBatch};
    use tempfile::{TempDir, tempdir};
    use tokio::sync::oneshot;

    /// Build a batch of `n` default records with consecutive offset
    /// deltas `0..n`.
    fn sample_batch(n: i32) -> RecordBatch {
        let mut b = RecordBatch {
            last_offset_delta: n - 1,
            ..RecordBatch::default()
        };
        for i in 0..n {
            b.records.push(Record {
                offset_delta: i,
                ..Default::default()
            });
        }
        b
    }

    /// One running writer task plus handles to everything it shares
    /// with the test.
    struct Harness {
        log: Arc<Mutex<Log>>,
        tx: mpsc::Sender<WriterMessage>,
        append_notify: Arc<Notify>,
        replica_state: Arc<tokio::sync::Mutex<crate::replica_state::ReplicaState>>,
        hw_advance_notify: Arc<Notify>,
        writer: tokio::task::JoinHandle<()>,
        /// Declared last so the temp dir is removed only after every
        /// other field has been dropped.
        _dir: TempDir,
    }

    impl Harness {
        /// Spawn a writer over an empty log with a fresh `ReplicaState`.
        fn start() -> Self {
            Self::start_with(crate::replica_state::ReplicaState::new(), |_| {})
        }

        /// Spawn a writer over a log in a fresh temp dir.
        ///
        /// `seed` runs against the log before the writer starts, and
        /// `replica_state` is the initial replica state the writer sees.
        /// Must be called from inside a tokio runtime.
        fn start_with(
            replica_state: crate::replica_state::ReplicaState,
            seed: impl FnOnce(&mut Log),
        ) -> Self {
            let dir = tempdir().expect("tempdir");
            let mut opened = Log::open(dir.path(), LogConfig::default()).expect("open log");
            seed(&mut opened);
            let log = Arc::new(Mutex::new(opened));
            let (tx, rx) = mpsc::channel(1);
            let append_notify = Arc::new(Notify::new());
            let replica_state = Arc::new(tokio::sync::Mutex::new(replica_state));
            let hw_advance_notify = Arc::new(Notify::new());
            let writer = tokio::spawn(run(
                log.clone(),
                Arc::new(ArcSwap::from_pointee(dir.path().to_path_buf())),
                rx,
                append_notify.clone(),
                replica_state.clone(),
                hw_advance_notify.clone(),
                crate::log_dir_status::LogDirRegistry::default(),
            ));
            Self {
                log,
                tx,
                append_notify,
                replica_state,
                hw_advance_notify,
                writer,
                _dir: dir,
            }
        }

        /// Close the channel and wait for the writer loop to exit.
        async fn shutdown(self) {
            drop(self.tx);
            self.writer.await.expect("writer join");
        }
    }

    /// Produce acks carry the assigned base offset, and consecutive
    /// appends get consecutive offsets.
    #[tokio::test]
    async fn writer_appends_and_acks() {
        let h = Harness::start();

        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Produce(ProduceJob {
            batch: sample_batch(3),
            ack,
        }))
        .await
        .expect("send job");

        let assigned = ack_rx.await.expect("ack recv").expect("append ok");
        assert!(assigned == 0);

        // Second append assigns offset 3.
        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Produce(ProduceJob {
            batch: sample_batch(2),
            ack,
        }))
        .await
        .expect("send job 2");
        assert!(ack_rx.await.expect("ack recv 2").expect("append 2 ok") == 3);

        h.shutdown().await;
    }

    /// A successful produce wakes waiters on the append notifier.
    #[tokio::test]
    async fn writer_fires_notify_after_append() {
        let h = Harness::start();

        // Subscribe BEFORE sending so we don't miss the notification.
        // The waiter borrows a clone of the notifier so the harness can
        // still be moved into `shutdown` below.
        let notify = h.append_notify.clone();
        let waiter = notify.notified();
        tokio::pin!(waiter);

        let (ack, _ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Produce(ProduceJob {
            batch: sample_batch(1),
            ack,
        }))
        .await
        .expect("send job");

        // Should wake within a short timeout.
        tokio::time::timeout(std::time::Duration::from_secs(1), waiter)
            .await
            .expect("notify did not fire");

        h.shutdown().await;
    }

    /// A replicate at the log's current end offset is appended.
    #[tokio::test]
    async fn writer_handles_replicate_with_caller_offset() {
        let h = Harness::start();

        // First replicate batch must start at offset 0 to match the
        // empty local log's `log_end_offset()`.
        let mut batch = sample_batch(3);
        batch.base_offset = 0;
        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Replicate { batch, ack })
            .await
            .expect("send replicate");
        ack_rx.await.expect("ack recv").expect("replicate ok");
        assert!(h.log.lock().unwrap().log_end_offset() == 3);

        h.shutdown().await;
    }

    /// A replicate at the wrong offset is rejected and leaves the log
    /// untouched.
    #[tokio::test]
    async fn writer_replicate_offset_mismatch_surfaces_error() {
        let h = Harness::start();

        // Wrong offset — log_end_offset is 0 but we claim 7.
        let mut batch = sample_batch(1);
        batch.base_offset = 7;
        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Replicate { batch, ack })
            .await
            .expect("send replicate");
        let err = ack_rx
            .await
            .expect("ack recv")
            .expect_err("expected offset mismatch");
        assert!(matches!(err, crate::error::BrokerError::Log(_)));
        // Local log must not have advanced.
        assert!(h.log.lock().unwrap().log_end_offset() == 0);

        h.shutdown().await;
    }

    /// Truncating to offset 0 empties a log that held data.
    #[tokio::test]
    async fn writer_truncate_drops_records() {
        let h = Harness::start();

        // Produce two batches so the log has some data.
        for _ in 0..2 {
            let (ack, ack_rx) = oneshot::channel();
            h.tx.send(WriterMessage::Produce(ProduceJob {
                batch: sample_batch(2),
                ack,
            }))
            .await
            .expect("send produce");
            ack_rx.await.expect("ack").expect("ok");
        }
        assert!(h.log.lock().unwrap().log_end_offset() == 4);

        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Truncate { offset: 0, ack })
            .await
            .expect("send truncate");
        ack_rx.await.expect("ack").expect("truncate ok");
        assert!(h.log.lock().unwrap().log_end_offset() == 0);

        h.shutdown().await;
    }

    /// With a single-replica ISR, a produce advances the HW to the new
    /// LEO and fires the HW notifier.
    #[tokio::test]
    async fn writer_fires_hw_notify_after_produce_when_rf_one() {
        let mut initial_state = crate::replica_state::ReplicaState::new();
        initial_state.install_isr(&[1], &[1], 1);
        let h = Harness::start_with(initial_state, |_| {});

        // Borrow a clone so the harness can be moved into `shutdown`.
        let hw_notify = h.hw_advance_notify.clone();
        let waiter = hw_notify.notified();
        tokio::pin!(waiter);

        let (ack, _ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Produce(ProduceJob {
            batch: sample_batch(2),
            ack,
        }))
        .await
        .expect("send job");

        tokio::time::timeout(std::time::Duration::from_secs(1), waiter)
            .await
            .expect("hw_advance_notify did not fire");

        assert!(h.replica_state.lock().await.hw == 2);

        h.shutdown().await;
    }

    /// `SetLogConfig` replaces the config observed on the shared log.
    #[tokio::test]
    async fn writer_set_log_config_swaps_config() {
        let h = Harness::start();

        let new_cfg = LogConfig {
            retention_ms: Some(std::time::Duration::from_mins(2)),
            ..LogConfig::default()
        };
        let (ack, ack_rx) = tokio::sync::oneshot::channel();
        h.tx.send(WriterMessage::SetLogConfig {
            config: new_cfg.clone(),
            ack,
        })
        .await
        .expect("send");
        ack_rx.await.expect("ack");

        let observed = h.log.lock().expect("lock").config_snapshot();
        assert!(observed.retention_ms == new_cfg.retention_ms);

        h.shutdown().await;
    }

    /// `TrimToOffset` moves the log start to at least the requested
    /// offset and acks the new start.
    #[tokio::test]
    async fn writer_trim_to_offset_advances_log_start() {
        // Pre-populate with two batches → LEO = 4.
        let h = Harness::start_with(
            crate::replica_state::ReplicaState::new(),
            |log: &mut Log| {
                for _ in 0..2 {
                    log.append(&mut sample_batch(2)).expect("append");
                }
            },
        );

        let (ack, ack_rx) = tokio::sync::oneshot::channel();
        h.tx.send(WriterMessage::TrimToOffset { new_start: 3, ack })
            .await
            .expect("send");
        let new_start = ack_rx.await.expect("ack").expect("trim ok");
        assert!(new_start >= 3);
        assert!(h.log.lock().expect("lock").log_start_offset() == new_start);

        h.shutdown().await;
    }

    /// With a three-replica ISR whose followers have not fetched, a
    /// produce leaves the HW at 0.
    #[tokio::test]
    async fn writer_does_not_advance_hw_when_followers_lagging() {
        let mut initial_state = crate::replica_state::ReplicaState::new();
        initial_state.install_isr(&[1, 2, 3], &[1, 2, 3], 1);
        let h = Harness::start_with(initial_state, |_| {});

        let (ack, ack_rx) = oneshot::channel();
        h.tx.send(WriterMessage::Produce(ProduceJob {
            batch: sample_batch(3),
            ack,
        }))
        .await
        .expect("send job");
        ack_rx.await.expect("ack").expect("append ok");

        assert!(h.replica_state.lock().await.hw == 0);

        h.shutdown().await;
    }
}