merutable 0.0.1

//! Regression tests for the IMP (Improvement Report) fixes.
//!
//! Each test targets a specific IMP-NN fix and verifies the correct behavior
//! that was previously missing or broken.

use merutable::engine::{EngineConfig, MeruEngine};
use merutable::types::{
    schema::{ColumnDef, ColumnType, TableSchema},
    value::{FieldValue, Row},
};

fn test_schema() -> TableSchema {
    TableSchema {
        table_name: "imp_test".into(),
        columns: vec![
            ColumnDef {
                name: "id".into(),
                col_type: ColumnType::Int64,
                nullable: false,

                ..Default::default()
            },
            ColumnDef {
                name: "val".into(),
                col_type: ColumnType::ByteArray,
                nullable: true,

                ..Default::default()
            },
        ],
        primary_key: vec![0],

        ..Default::default()
    }
}

fn test_config(tmp: &tempfile::TempDir) -> EngineConfig {
    EngineConfig {
        schema: test_schema(),
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        // Small memtable for fast flush in tests.
        memtable_size_bytes: 512,
        // No GC grace period in tests — delete immediately.
        gc_grace_period_secs: 0,
        // Disable L0 stall in the shared fixture — tests that want to
        // exercise the stall override these; tests that don't would
        // otherwise deadlock when the small memtable produces many L0
        // files.
        l0_slowdown_trigger: u32::MAX as usize,
        l0_stop_trigger: u32::MAX as usize,
        ..Default::default()
    }
}

/// IMP-03 regression: row cache must be cleared after compaction.
/// Without the fix, a cached entry from an obsoleted file would be
/// served even after compaction resolves the MVCC version.
#[tokio::test]
async fn imp03_row_cache_cleared_after_compaction() {
    let tmp = tempfile::tempdir().unwrap();
    let config = EngineConfig {
        row_cache_capacity: 1000,
        ..test_config(&tmp)
    };
    let engine = MeruEngine::open(config).await.unwrap();

    // Write v1, flush to L0.
    engine
        .put(
            vec![FieldValue::Int64(1)],
            Row::new(vec![
                Some(FieldValue::Int64(1)),
                Some(FieldValue::Bytes(bytes::Bytes::from("v1"))),
            ]),
        )
        .await
        .unwrap();
    engine.flush().await.unwrap();

    // Read key 1 — populates the row cache.
    let row = engine.get(&[FieldValue::Int64(1)]).unwrap().unwrap();
    assert_eq!(
        row.get(1).cloned(),
        Some(FieldValue::Bytes(bytes::Bytes::from("v1")))
    );

    // Overwrite with v2, flush again.
    engine
        .put(
            vec![FieldValue::Int64(1)],
            Row::new(vec![
                Some(FieldValue::Int64(1)),
                Some(FieldValue::Bytes(bytes::Bytes::from("v2"))),
            ]),
        )
        .await
        .unwrap();
    engine.flush().await.unwrap();

    // Compact — merges L0 files, keeps only v2 (latest).
    engine.compact().await.unwrap();

    // Read again — must see v2, not stale cached v1.
    let row = engine.get(&[FieldValue::Int64(1)]).unwrap().unwrap();
    assert_eq!(
        row.get(1).cloned(),
        Some(FieldValue::Bytes(bytes::Bytes::from("v2"))),
        "IMP-03: cache must be cleared after compaction — stale v1 was served"
    );
}

/// IMP-02 regression: `read_seq()` must return the visible sequence, not
/// the allocated sequence. After a put returns, the data must be visible.
#[tokio::test]
async fn imp02_visible_seq_matches_memtable() {
    let tmp = tempfile::tempdir().unwrap();
    let engine = MeruEngine::open(test_config(&tmp)).await.unwrap();

    let seq_before = engine.read_seq().0;

    engine
        .put(
            vec![FieldValue::Int64(42)],
            Row::new(vec![
                Some(FieldValue::Int64(42)),
                Some(FieldValue::Bytes(bytes::Bytes::from("hello"))),
            ]),
        )
        .await
        .unwrap();

    let seq_after = engine.read_seq().0;
    assert_eq!(
        seq_after,
        seq_before + 1,
        "IMP-02: visible_seq must advance by 1 after a single put"
    );

    // The data must be readable at the new visible_seq.
    let row = engine.get(&[FieldValue::Int64(42)]).unwrap();
    assert!(
        row.is_some(),
        "IMP-02: data must be visible immediately after put returns"
    );
}

/// IMP-02 regression: batch writes must advance visible_seq by the batch size.
#[tokio::test]
async fn imp02_batch_advances_visible_seq() {
    let tmp = tempfile::tempdir().unwrap();
    let engine = MeruEngine::open(test_config(&tmp)).await.unwrap();

    let seq_before = engine.read_seq().0;

    let mut batch = merutable::engine::write_path::MutationBatch::new();
    for i in 1..=5i64 {
        batch.put(
            vec![FieldValue::Int64(i)],
            Row::new(vec![
                Some(FieldValue::Int64(i)),
                Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{i}")))),
            ]),
        );
    }
    merutable::engine::write_path::apply_batch(&engine, batch)
        .await
        .unwrap();

    let seq_after = engine.read_seq().0;
    assert_eq!(
        seq_after,
        seq_before + 5,
        "IMP-02: visible_seq must advance by batch size (5)"
    );

    // All 5 keys must be readable.
    for i in 1..=5i64 {
        let row = engine.get(&[FieldValue::Int64(i)]).unwrap();
        assert!(row.is_some(), "key {i} must be readable after batch");
    }
}

/// IMP-15 regression: read-only replica refresh must validate that all
/// referenced files exist before swapping the version.
#[tokio::test]
async fn imp15_refresh_rejects_missing_files() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024; // don't auto-flush

    // Primary: write and flush.
    let primary = MeruEngine::open(config.clone()).await.unwrap();
    primary
        .put(
            vec![FieldValue::Int64(1)],
            Row::new(vec![Some(FieldValue::Int64(1)), None]),
        )
        .await
        .unwrap();
    primary.flush().await.unwrap();
    drop(primary);

    // Open read-only replica.
    config.read_only = true;
    let replica = MeruEngine::open(config).await.unwrap();

    // Delete the data file that the manifest references.
    let l0_dir = tmp.path().join("data").join("L0");
    if l0_dir.exists() {
        for entry in std::fs::read_dir(&l0_dir).unwrap() {
            let entry = entry.unwrap();
            if entry.path().extension().is_some_and(|e| e == "parquet") {
                std::fs::remove_file(entry.path()).unwrap();
            }
        }
    }

    // Refresh should fail because the referenced file is missing.
    let result = replica.refresh().await;
    assert!(
        result.is_err(),
        "IMP-15: refresh must reject when referenced data files are missing"
    );
}

/// Compaction parallelism regression: with the old single `compaction_mutex`,
/// two workers on disjoint levels serialized. With per-level reservation,
/// a long L2→L3 compaction must not block a concurrent L0→L1 compaction.
///
/// This test uses two `tokio::spawn`'d tasks. If they deadlock or if the
/// second blocks waiting for the first, the test times out. With the
/// fix, the second compaction picks disjoint levels and runs in parallel.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn compactions_on_disjoint_levels_run_concurrently() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024;
    config.gc_grace_period_secs = 0;
    let engine = MeruEngine::open(config).await.unwrap();

    // Seed enough L0 files for an L0→L1 pick.
    for batch in 0..6i64 {
        for i in 0..20i64 {
            let key = batch * 100 + i;
            engine
                .put(
                    vec![FieldValue::Int64(key)],
                    Row::new(vec![
                        Some(FieldValue::Int64(key)),
                        Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{key}")))),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
    }

    // Spawn two concurrent compactions. Neither should deadlock.
    let e1 = engine.clone();
    let e2 = engine.clone();
    let h1 = tokio::spawn(async move { e1.compact().await });
    let h2 = tokio::spawn(async move { e2.compact().await });

    // A timeout guards against serialization deadlock. On a multi-thread
    // runtime with the fix, both return quickly.
    let timeout = std::time::Duration::from_secs(60);
    let (r1, r2) = tokio::join!(
        tokio::time::timeout(timeout, h1),
        tokio::time::timeout(timeout, h2),
    );
    assert!(
        r1.is_ok() && r2.is_ok(),
        "concurrent compactions must not deadlock"
    );
    r1.unwrap().unwrap().unwrap();
    r2.unwrap().unwrap().unwrap();

    // L0 must be drained below trigger.
    let stats = engine.stats();
    let l0 = stats
        .levels
        .iter()
        .find(|l| l.level == 0)
        .map(|l| l.file_count)
        .unwrap_or(0);
    assert!(
        l0 < 4,
        "after concurrent compactions L0 must be below trigger (got {l0})"
    );
}

/// Level reservations must not leak on error paths. If a compaction
/// fails mid-flight, the `LevelReservation` Drop impl must release the
/// levels so subsequent compactions can proceed.
///
/// We can't easily inject a failure without deeper surgery, so this
/// test exercises the happy path and verifies the busy set is empty
/// after `compact()` returns — the Drop impl handled the release.
#[tokio::test]
async fn level_reservations_released_after_compact() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024;
    config.gc_grace_period_secs = 0;
    let engine = MeruEngine::open(config).await.unwrap();

    for batch in 0..5i64 {
        for i in 0..10i64 {
            let key = batch * 100 + i;
            engine
                .put(
                    vec![FieldValue::Int64(key)],
                    Row::new(vec![Some(FieldValue::Int64(key)), None]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
    }

    engine.compact().await.unwrap();

    // After compact() returns, no level should still be reserved —
    // otherwise a subsequent compact() would see an empty pick.
    // Drop impl may enqueue release on the runtime via tokio::spawn
    // when try_lock fails; yield a few times to let it complete.
    for _ in 0..16 {
        let busy = engine.__compacting_levels_snapshot().await;
        if busy.is_empty() {
            break;
        }
        tokio::task::yield_now().await;
        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    }
    let busy = engine.__compacting_levels_snapshot().await;
    assert!(
        busy.is_empty(),
        "level reservations leaked after compact(): {busy:?}"
    );

    // And a subsequent compact() must still be able to pick work if
    // any is outstanding (no-op here since we just drained).
    engine.compact().await.unwrap();
}

/// Compaction loop regression: a single `compact()` call must drain the
/// tree until no level is above its trigger. Previously `compact()`
/// executed exactly one job and returned, so a caller (or the background
/// worker) had to keep calling it. With concurrent flushes, L0 could
/// grow unboundedly while a single deep compaction was in flight.
///
/// This test forces multiple L0 files to accumulate, drives the tree
/// past the L0 compaction trigger, and verifies that one `compact()`
/// call reduces L0 below the trigger.
#[tokio::test]
async fn compact_loops_until_tree_healthy() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    // Small memtable so each put flushes quickly — but we'll force flushes
    // manually to control the L0 file count precisely.
    config.memtable_size_bytes = 64 * 1024 * 1024; // don't auto-flush
    config.gc_grace_period_secs = 0;
    let engine = MeruEngine::open(config).await.unwrap();

    // Create 6 L0 files (above the default l0_compaction_trigger=4).
    // Each flush produces one L0 file.
    for batch in 0..6i64 {
        for i in 0..10i64 {
            let key = batch * 100 + i;
            engine
                .put(
                    vec![FieldValue::Int64(key)],
                    Row::new(vec![
                        Some(FieldValue::Int64(key)),
                        Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{key}")))),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
    }

    // Confirm we have at least the trigger count of L0 files.
    let stats_before = engine.stats();
    let l0_before = stats_before
        .levels
        .iter()
        .find(|l| l.level == 0)
        .map(|l| l.file_count)
        .unwrap_or(0);
    assert!(
        l0_before >= 4,
        "setup invariant: need ≥4 L0 files, got {l0_before}"
    );

    // One compact() call must drain L0 below the trigger.
    engine.compact().await.unwrap();

    let stats_after = engine.stats();
    let l0_after = stats_after
        .levels
        .iter()
        .find(|l| l.level == 0)
        .map(|l| l.file_count)
        .unwrap_or(0);
    assert!(
        l0_after < 4,
        "compact() should have drained L0 below trigger, got {l0_after} files"
    );
}

/// Graceful shutdown: close() must flush memtable data and reject
/// subsequent writes while keeping reads available.
#[tokio::test]
async fn close_flushes_and_rejects_writes() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024; // don't auto-flush
    let engine = MeruEngine::open(config).await.unwrap();

    // Write data into the memtable (no flush).
    for i in 0..20i64 {
        engine
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{i}")))),
                ]),
            )
            .await
            .unwrap();
    }

    // Close: flushes memtable to Parquet.
    engine.close().await.unwrap();

    // Reads still work.
    for i in 0..20i64 {
        let row = engine.get(&[FieldValue::Int64(i)]).unwrap();
        assert!(row.is_some(), "key {i} must be readable after close");
    }

    // Writes fail.
    let err = engine
        .put(
            vec![FieldValue::Int64(99)],
            Row::new(vec![Some(FieldValue::Int64(99)), None]),
        )
        .await;
    assert!(err.is_err(), "put must fail after close");
}

/// Graceful shutdown: data written before close() survives reopen
/// without relying on WAL recovery.
#[tokio::test]
async fn close_data_durable_across_reopen() {
    let tmp = tempfile::tempdir().unwrap();
    let config = test_config(&tmp);

    {
        let mut cfg = config.clone();
        cfg.memtable_size_bytes = 64 * 1024 * 1024; // don't auto-flush
        let engine = MeruEngine::open(cfg).await.unwrap();
        for i in 0..30i64 {
            engine
                .put(
                    vec![FieldValue::Int64(i)],
                    Row::new(vec![
                        Some(FieldValue::Int64(i)),
                        Some(FieldValue::Bytes(bytes::Bytes::from(format!("data_{i}")))),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.close().await.unwrap();
    }

    // Reopen — data should come from Parquet, not WAL.
    let engine = MeruEngine::open(config).await.unwrap();
    for i in 0..30i64 {
        let row = engine.get(&[FieldValue::Int64(i)]).unwrap();
        assert!(row.is_some(), "key {i} must survive close + reopen");
    }
}

/// Issue #12 regression: write API must reject rows whose shape
/// disagrees with the schema — BEFORE they reach WAL/memtable.
///
/// Covers the five Row::validate violations the issue enumerated:
///   1. Arity-short (fields.len() < columns.len())
///   2. Arity-long  (fields.len() > columns.len())
///   3. Type mismatch  (Bytes into an Int64 column)
///   4. NOT NULL violation (None in non-nullable column)
///   5. FixedLenByteArray length mismatch
///
/// Plus:
///   6. Valid row succeeds (sanity gate).
#[tokio::test]
async fn issue12_write_api_rejects_malformed_rows() {
    use merutable::types::schema::{ColumnDef, ColumnType, TableSchema};

    let tmp = tempfile::tempdir().unwrap();
    let schema = TableSchema {
        table_name: "shape".into(),
        columns: vec![
            ColumnDef {
                name: "id".into(),
                col_type: ColumnType::Int64,
                nullable: false,

                ..Default::default()
            },
            ColumnDef {
                name: "payload".into(),
                col_type: ColumnType::ByteArray,
                nullable: true,

                ..Default::default()
            },
            ColumnDef {
                name: "fixed8".into(),
                col_type: ColumnType::FixedLenByteArray(8),
                nullable: false,

                ..Default::default()
            },
        ],
        primary_key: vec![0],

        ..Default::default()
    };
    let config = EngineConfig {
        schema: schema.clone(),
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        memtable_size_bytes: 64 * 1024 * 1024,
        gc_grace_period_secs: 0,
        l0_slowdown_trigger: u32::MAX as usize,
        l0_stop_trigger: u32::MAX as usize,
        ..Default::default()
    };
    let engine = MeruEngine::open(config).await.unwrap();

    // 6 — valid row succeeds.
    let ok = Row::new(vec![
        Some(FieldValue::Int64(1)),
        Some(FieldValue::Bytes(bytes::Bytes::from("abc"))),
        Some(FieldValue::Bytes(bytes::Bytes::from_static(b"12345678"))),
    ]);
    engine
        .put(vec![FieldValue::Int64(1)], ok)
        .await
        .expect("valid row must succeed");

    // 1 — arity-short.
    let short = Row::new(vec![Some(FieldValue::Int64(2))]);
    let err = engine
        .put(vec![FieldValue::Int64(2)], short)
        .await
        .unwrap_err();
    assert!(
        matches!(err, merutable::types::MeruError::SchemaMismatch(_)),
        "arity-short must return SchemaMismatch, got: {err:?}"
    );

    // 2 — arity-long.
    let long = Row::new(vec![
        Some(FieldValue::Int64(3)),
        Some(FieldValue::Bytes(bytes::Bytes::from("x"))),
        Some(FieldValue::Bytes(bytes::Bytes::from_static(b"12345678"))),
        Some(FieldValue::Int32(999)),
    ]);
    let err = engine
        .put(vec![FieldValue::Int64(3)], long)
        .await
        .unwrap_err();
    assert!(matches!(
        err,
        merutable::types::MeruError::SchemaMismatch(_)
    ));

    // 3 — type mismatch: Bytes where Int64 expected.
    let typ = Row::new(vec![
        Some(FieldValue::Bytes(bytes::Bytes::from("not_an_int"))),
        Some(FieldValue::Bytes(bytes::Bytes::from("x"))),
        Some(FieldValue::Bytes(bytes::Bytes::from_static(b"12345678"))),
    ]);
    let err = engine
        .put(vec![FieldValue::Int64(4)], typ)
        .await
        .unwrap_err();
    assert!(matches!(
        err,
        merutable::types::MeruError::SchemaMismatch(_)
    ));

    // 4 — NOT NULL violation: None in non-nullable 'id'.
    let null_nn = Row::new(vec![
        None,
        Some(FieldValue::Bytes(bytes::Bytes::from("x"))),
        Some(FieldValue::Bytes(bytes::Bytes::from_static(b"12345678"))),
    ]);
    let err = engine
        .put(vec![FieldValue::Int64(5)], null_nn)
        .await
        .unwrap_err();
    assert!(matches!(
        err,
        merutable::types::MeruError::SchemaMismatch(_)
    ));

    // 5 — FixedLenByteArray length mismatch.
    let bad_fixed = Row::new(vec![
        Some(FieldValue::Int64(6)),
        None,
        Some(FieldValue::Bytes(bytes::Bytes::from_static(b"short"))), // 5 != 8
    ]);
    let err = engine
        .put(vec![FieldValue::Int64(6)], bad_fixed)
        .await
        .unwrap_err();
    assert!(matches!(
        err,
        merutable::types::MeruError::SchemaMismatch(_)
    ));
}

/// Issue #12 regression: `decode_row` surfaces corruption instead of
/// collapsing to an empty row. This test injects a physically corrupt
/// WAL value and verifies the engine's read path aborts cleanly
/// rather than returning a phantom row.
#[tokio::test]
async fn issue12_decode_corruption_propagates_through_read_path() {
    // We can exercise this directly at the codec layer — it's the
    // truth-point for the contract. Per-entry-point integration
    // tests are covered by the corruption error types bubbling up
    // through Result<Row> — every caller uses `?` now.
    use merutable::engine::codec;
    // Marker byte 0x01 + garbage: postcard will reject this.
    let bad = vec![0x01, 0xFF, 0xFF, 0xFF];
    let err = codec::decode_row(&bad).unwrap_err();
    assert!(
        matches!(err, merutable::types::MeruError::Corruption(_)),
        "decode_row on corrupt bytes must return Corruption, got: {err:?}"
    );
}

/// Issue #11 regression: `close()` must run a final GC sweep so
/// pending deletions don't leak across process lifetimes. Without
/// this, a compact→close sequence leaves obsoleted files on disk
/// forever: close() tears down workers before they get a chance to
/// run their heartbeat GC, and no later code path calls it.
#[tokio::test]
async fn issue11_close_sweeps_pending_deletions() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024;
    config.gc_grace_period_secs = 0;
    let engine = MeruEngine::open(config).await.unwrap();

    // Create enough L0 files to trigger an L0→L1 compaction that
    // obsoletes the L0 files.
    for batch in 0..5i64 {
        for i in 0..5i64 {
            let key = batch * 10 + i;
            engine
                .put(
                    vec![FieldValue::Int64(key)],
                    Row::new(vec![
                        Some(FieldValue::Int64(key)),
                        Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{key}")))),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
    }
    engine.compact().await.unwrap();

    let l0_dir = tmp.path().join("data").join("L0");
    // Count files at this moment; compact has queued the originals
    // for deletion. With gc_grace_period_secs=0 they should be
    // deletable now, but whether the compaction's post-commit GC
    // actually ran depends on timing.
    let _pre_close_files = std::fs::read_dir(&l0_dir)
        .map(|it| it.filter_map(|e| e.ok()).count())
        .unwrap_or(0);

    engine.close().await.unwrap();

    // After close(), the L0 directory should contain no leaked
    // obsoleted files — the close-path GC sweep drained them.
    let post_close_files = std::fs::read_dir(&l0_dir)
        .map(|it| {
            it.filter_map(|e| e.ok())
                .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
                .count()
        })
        .unwrap_or(0);
    assert_eq!(
        post_close_files, 0,
        "Issue #11: close() must sweep pending deletions; {post_close_files} L0 files leaked",
    );
}

/// Issue #10 regression: read-only replica's row cache must be
/// cleared on refresh() so stale values aren't served after the
/// primary overwrites or deletes keys.
///
/// The primary writes v1, flushes. Replica reads v1 (populating
/// its cache). Primary writes v2 over the same keys AND deletes
/// one key. Replica refreshes and reads:
///   - Overwritten key must return v2.
///   - Deleted key must return None.
/// Without clearing the cache, both reads return the stale v1.
#[tokio::test]
async fn issue10_readonly_refresh_clears_row_cache() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config_rw = test_config(&tmp);
    config_rw.memtable_size_bytes = 64 * 1024 * 1024;
    config_rw.row_cache_capacity = 1000;
    let mut config_ro = config_rw.clone();
    config_ro.read_only = true;

    let primary = MeruEngine::open(config_rw.clone()).await.unwrap();
    for i in 0..20i64 {
        primary
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("v1_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    primary.flush().await.unwrap();

    let replica = MeruEngine::open(config_ro.clone()).await.unwrap();
    // Prime the cache: read a few keys.
    for i in [3i64, 5, 7] {
        let row = replica.get(&[FieldValue::Int64(i)]).unwrap().unwrap();
        match row.get(1) {
            Some(FieldValue::Bytes(b)) => assert_eq!(
                b.as_ref(),
                format!("v1_{i}").as_bytes(),
                "expected v1 before refresh"
            ),
            _ => panic!("unexpected field type"),
        }
    }

    // Primary overwrites and deletes.
    for i in 0..20i64 {
        primary
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("v2_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    primary.delete(vec![FieldValue::Int64(5)]).await.unwrap();
    primary.flush().await.unwrap();

    replica.refresh().await.unwrap();

    // Overwritten key: must return v2, not cached v1.
    let row = replica.get(&[FieldValue::Int64(3)]).unwrap().unwrap();
    match row.get(1) {
        Some(FieldValue::Bytes(b)) => assert_eq!(
            b.as_ref(),
            b"v2_3",
            "Issue #10: cache must be cleared on refresh; got stale value"
        ),
        _ => panic!("unexpected field type"),
    }

    // Deleted key: must return None, not cached v1.
    let row = replica.get(&[FieldValue::Int64(5)]).unwrap();
    assert!(
        row.is_none(),
        "Issue #10: deleted key must return None after refresh; cache was serving stale data"
    );
}

/// Issue #6 regression: read-only replica refresh() must see new
/// data written by the primary.
///
/// Sequence:
/// 1. Primary writes keys 0-49 and flushes.
/// 2. Read-only replica opens and verifies visibility of 0-49.
/// 3. Primary writes 50-99 and flushes.
/// 4. Replica calls refresh().
/// 5. Replica reads 50-99 — must see all of them.
#[tokio::test]
async fn issue6_readonly_refresh_picks_up_new_data() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config_rw = test_config(&tmp);
    // Big memtable so setup produces exactly two L0 files (one per
    // explicit flush), no auto-flush races with background tasks.
    config_rw.memtable_size_bytes = 64 * 1024 * 1024;
    let mut config_ro = config_rw.clone();
    config_ro.read_only = true;

    let primary = MeruEngine::open(config_rw.clone()).await.unwrap();
    for i in 0..50i64 {
        primary
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("first_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    primary.flush().await.unwrap();

    let replica = MeruEngine::open(config_ro.clone()).await.unwrap();
    for i in 0..50i64 {
        let row = replica.get(&[FieldValue::Int64(i)]).unwrap();
        assert!(row.is_some(), "pre-refresh: replica missing key {i}");
    }

    // Second batch on primary.
    for i in 50..100i64 {
        primary
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("second_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    primary.flush().await.unwrap();

    replica.refresh().await.unwrap();

    let mut missing = Vec::new();
    for i in 0..100i64 {
        let row = replica.get(&[FieldValue::Int64(i)]).unwrap();
        if row.is_none() {
            missing.push(i);
        }
    }
    assert!(
        missing.is_empty(),
        "Issue #6: replica missing keys after refresh: {missing:?}",
    );
}

/// Narrow diagnostic for Issue #7 (kept as a fast smoke test
/// alongside the full roundtrip): exercise empty and single-null
/// PKs at each stage of the lifecycle.
#[tokio::test]
async fn issue7_narrow_diagnostic() {
    use merutable::types::schema::{ColumnDef, ColumnType, TableSchema};

    let tmp = tempfile::tempdir().unwrap();
    let schema = TableSchema {
        table_name: "ba".into(),
        columns: vec![
            ColumnDef {
                name: "k".into(),
                col_type: ColumnType::ByteArray,
                nullable: false,

                ..Default::default()
            },
            ColumnDef {
                name: "v".into(),
                col_type: ColumnType::Int64,
                nullable: false,

                ..Default::default()
            },
        ],
        primary_key: vec![0],

        ..Default::default()
    };
    let config = EngineConfig {
        schema: schema.clone(),
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        memtable_size_bytes: 64 * 1024 * 1024,
        gc_grace_period_secs: 0,
        l0_slowdown_trigger: u32::MAX as usize,
        l0_stop_trigger: u32::MAX as usize,
        ..Default::default()
    };
    let engine = MeruEngine::open(config).await.unwrap();
    let empty = bytes::Bytes::new();
    let null1 = bytes::Bytes::from_static(&[0u8]);
    engine
        .put(
            vec![FieldValue::Bytes(empty.clone())],
            Row::new(vec![
                Some(FieldValue::Bytes(empty.clone())),
                Some(FieldValue::Int64(100)),
            ]),
        )
        .await
        .unwrap();
    engine
        .put(
            vec![FieldValue::Bytes(null1.clone())],
            Row::new(vec![
                Some(FieldValue::Bytes(null1.clone())),
                Some(FieldValue::Int64(200)),
            ]),
        )
        .await
        .unwrap();
    // Stage 1: in memtable.
    let r = engine.get(&[FieldValue::Bytes(empty.clone())]).unwrap();
    println!("STAGE memtable empty: {r:?}");
    let r = engine.get(&[FieldValue::Bytes(null1.clone())]).unwrap();
    println!("STAGE memtable [0x00]: {r:?}");
    // Stage 2: after flush (in L0 Parquet).
    engine.flush().await.unwrap();
    let r = engine.get(&[FieldValue::Bytes(empty.clone())]).unwrap();
    println!("STAGE after flush empty: {r:?}");
    let r = engine.get(&[FieldValue::Bytes(null1.clone())]).unwrap();
    println!("STAGE after flush [0x00]: {r:?}");
    // Stage 3: after compact.
    engine.compact().await.unwrap();
    let r = engine.get(&[FieldValue::Bytes(empty.clone())]).unwrap();
    println!("STAGE after compact empty: {r:?}");
    let r = engine.get(&[FieldValue::Bytes(null1.clone())]).unwrap();
    println!("STAGE after compact [0x00]: {r:?}");
    // Stage 4: close + reopen.
    engine.close().await.unwrap();
    drop(engine);
    let config2 = EngineConfig {
        schema: schema.clone(),
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        memtable_size_bytes: 64 * 1024 * 1024,
        gc_grace_period_secs: 0,
        l0_slowdown_trigger: u32::MAX as usize,
        l0_stop_trigger: u32::MAX as usize,
        ..Default::default()
    };
    let engine = MeruEngine::open(config2).await.unwrap();
    let r = engine.get(&[FieldValue::Bytes(empty.clone())]).unwrap();
    println!("STAGE after reopen empty: {r:?}");
    let r = engine.get(&[FieldValue::Bytes(null1.clone())]).unwrap();
    println!("STAGE after reopen [0x00]: {r:?}");
    let scanned = engine.scan(None, None).unwrap();
    println!("SCAN results: {} rows", scanned.len());
    for (ik, row) in &scanned {
        println!("  ikey_bytes={:?} row={:?}", ik.as_bytes(), row);
    }
}

/// Issue #7 regression: ByteArray primary keys with empty bytes,
/// single null byte, and null-byte runs must survive put → flush →
/// compact → reopen without collision or data loss.
///
/// This test pins down the end-to-end durability contract for the
/// edge cases the user's chaos-monkey flagged. The underlying
/// `escape_byte_array` encoding was already correct (see
/// `merutable-types::key::bytearray_*` unit tests); this test
/// covers the full engine path.
#[tokio::test]
async fn issue7_bytearray_edge_case_pks_roundtrip_through_parquet() {
    use merutable::types::schema::{ColumnDef, ColumnType, TableSchema};

    let tmp = tempfile::tempdir().unwrap();
    let schema = TableSchema {
        table_name: "ba".into(),
        columns: vec![
            ColumnDef {
                name: "k".into(),
                col_type: ColumnType::ByteArray,
                nullable: false,

                ..Default::default()
            },
            ColumnDef {
                name: "v".into(),
                col_type: ColumnType::Int64,
                nullable: false,

                ..Default::default()
            },
        ],
        primary_key: vec![0],

        ..Default::default()
    };
    let make_config = || EngineConfig {
        schema: schema.clone(),
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        memtable_size_bytes: 64 * 1024 * 1024,
        gc_grace_period_secs: 0,
        l0_slowdown_trigger: u32::MAX as usize,
        l0_stop_trigger: u32::MAX as usize,
        ..Default::default()
    };

    // Edge-case PKs and their distinct marker values.
    let cases: Vec<(bytes::Bytes, i64)> = vec![
        (bytes::Bytes::new(), 100),                            // empty
        (bytes::Bytes::from_static(&[0u8]), 200),              // [0x00]
        (bytes::Bytes::from_static(&[0u8, 0u8]), 300),         // [0x00, 0x00]
        (bytes::Bytes::from_static(&[0u8, 0x01u8]), 400),      // [0x00, 0x01]
        (bytes::Bytes::from_static(&[0x01u8, 0u8]), 500),      // [0x01, 0x00]
        (bytes::Bytes::from_static(&[0xFFu8]), 600),           // [0xFF]
        (bytes::Bytes::from_static(&[0u8, 0xFFu8, 0u8]), 700), // [0x00, 0xFF, 0x00]
        (bytes::Bytes::from("hello"), 800),
    ];

    // Populate + flush + compact so every key traverses the Parquet
    // round-trip and the compaction iterator's dedup path.
    {
        let engine = MeruEngine::open(make_config()).await.unwrap();
        for (k, v) in &cases {
            engine
                .put(
                    vec![FieldValue::Bytes(k.clone())],
                    Row::new(vec![
                        Some(FieldValue::Bytes(k.clone())),
                        Some(FieldValue::Int64(*v)),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
        engine.compact().await.unwrap();
        engine.close().await.unwrap();
    }

    // Reopen — WAL is already GC'd (the flush wrote everything), so
    // reads must be served from Parquet.
    let engine = MeruEngine::open(make_config()).await.unwrap();
    for (k, expected_v) in &cases {
        let row = engine
            .get(&[FieldValue::Bytes(k.clone())])
            .unwrap()
            .unwrap_or_else(|| panic!("Issue #7: key {k:?} missing after reopen"));
        match row.get(1) {
            Some(FieldValue::Int64(got)) => assert_eq!(
                *got, *expected_v,
                "Issue #7: key {k:?} returned wrong value (got {got}, want {expected_v})"
            ),
            other => panic!("expected Int64 for key {k:?}, got {other:?}"),
        }
    }

    // Scan ordering: results must be in ascending PK order.
    let scanned = engine.scan(None, None).unwrap();
    let scanned_keys: Vec<bytes::Bytes> = scanned
        .iter()
        .map(|(_ik, row)| match row.get(0) {
            Some(FieldValue::Bytes(b)) => b.clone(),
            _ => panic!("unexpected field type in scan result"),
        })
        .collect();
    let mut expected_keys: Vec<bytes::Bytes> = cases.iter().map(|(k, _)| k.clone()).collect();
    expected_keys.sort_by(|a, b| a.as_ref().cmp(b.as_ref()));
    assert_eq!(
        scanned_keys, expected_keys,
        "Issue #7: scan must return keys in ascending order"
    );
}

/// Issue #5 regression: L0 stop trigger must block writes.
///
/// Before this fix, `l0_stop_trigger` was defined in config but
/// never checked — writes proceeded regardless of L0 count. Stress
/// test observed L0 reaching 44 files (8 past a 36-file stop trigger)
/// with no stall or rejection.
///
/// The test drives this deterministically by:
/// 1. Disabling background workers (`*_parallelism = 0`) so the only
///    compactor is the test's explicit `compact()` call.
/// 2. Flushing enough times to exceed the stop trigger.
/// 3. Attempting a `put()` with a short timeout — it MUST time out
///    (the stop trigger should block it).
/// 4. Calling `compact()` to drain L0 — this fires `l0_drained`.
/// 5. Confirming a subsequent `put()` completes immediately.
#[tokio::test]
async fn l0_stop_trigger_blocks_writes_until_drained() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    config.memtable_size_bytes = 64 * 1024 * 1024; // don't auto-flush
                                                   // Tight triggers so the test produces few files.
    config.l0_compaction_trigger = 100; // picker's score-based path off
    config.l0_slowdown_trigger = 4;
    config.l0_stop_trigger = 6;
    // No background workers — deterministic timing.
    config.flush_parallelism = 0;
    config.compaction_parallelism = 0;
    config.gc_grace_period_secs = 0;
    let engine = MeruEngine::open(config).await.unwrap();

    // Produce exactly 6 L0 files (at l0_stop_trigger = 6). Writing
    // 7 flushes here would itself hit the stop trigger during the 7th
    // put — the setup loop would deadlock on the very condition we're
    // trying to test. Setup must stay strictly below the trigger.
    for batch in 0..6i64 {
        engine
            .put(
                vec![FieldValue::Int64(batch)],
                Row::new(vec![
                    Some(FieldValue::Int64(batch)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{batch}")))),
                ]),
            )
            .await
            .unwrap();
        engine.flush().await.unwrap();
    }
    let l0 = engine
        .stats()
        .levels
        .iter()
        .find(|l| l.level == 0)
        .map(|l| l.file_count)
        .unwrap_or(0);
    assert_eq!(
        l0, 6,
        "setup invariant: L0 must equal stop trigger; got {l0}"
    );

    // A put under these conditions must block indefinitely. Prove it
    // by giving it 300ms and asserting the timeout fired.
    let stalled = tokio::time::timeout(
        std::time::Duration::from_millis(300),
        engine.put(
            vec![FieldValue::Int64(9999)],
            Row::new(vec![Some(FieldValue::Int64(9999)), None]),
        ),
    )
    .await;
    assert!(
        stalled.is_err(),
        "Issue #5: put must be stalled when L0 is above stop trigger"
    );

    // Drain L0 via explicit compaction — fires l0_drained.
    engine.compact().await.unwrap();

    // L0 should now be below the stop trigger.
    let l0_after = engine
        .stats()
        .levels
        .iter()
        .find(|l| l.level == 0)
        .map(|l| l.file_count)
        .unwrap_or(0);
    assert!(
        l0_after < 6,
        "compact() should have drained L0 below stop trigger; got {l0_after}"
    );

    // New put completes promptly — no stall.
    let result = tokio::time::timeout(
        std::time::Duration::from_secs(3),
        engine.put(
            vec![FieldValue::Int64(10000)],
            Row::new(vec![Some(FieldValue::Int64(10000)), None]),
        ),
    )
    .await
    .expect("put must complete after compaction drained L0")
    .expect("put must succeed");
    assert!(result.0 > 0);
}

/// Issue #3 regression: compaction output is split into multiple
/// files when the aggregate row byte-estimate exceeds
/// `TARGET_OUTPUT_FILE_BYTES` (512 MiB). Before this fix, a single
/// ByteArray column aggregated across all output rows could exceed
/// Arrow's i32 offset limit (~2.14 GiB) and panic the process.
///
/// This test uses a schema with a large ByteArray payload and pushes
/// enough rows that the compaction must split into ≥2 output files.
/// We verify:
/// 1. Compaction completes without panicking.
/// 2. The output level contains more than one file.
/// 3. Adjacent output files have non-overlapping key ranges
///    (the L1+ non-overlap invariant is preserved — chunk boundaries
///    never split a user_key).
/// 4. All keys are still readable (no data loss from splitting).
#[tokio::test]
async fn compaction_output_splits_at_size_threshold() {
    use merutable::types::schema::{ColumnDef, ColumnType, TableSchema};

    let tmp = tempfile::tempdir().unwrap();
    // Schema with a big payload so the byte estimate crosses the
    // 512 MiB threshold without needing a ridiculous row count.
    let schema = TableSchema {
        table_name: "big_payload".into(),
        columns: vec![
            ColumnDef {
                name: "id".into(),
                col_type: ColumnType::Int64,
                nullable: false,

                ..Default::default()
            },
            ColumnDef {
                name: "payload".into(),
                col_type: ColumnType::ByteArray,
                nullable: false,

                ..Default::default()
            },
        ],
        primary_key: vec![0],

        ..Default::default()
    };
    let config = EngineConfig {
        schema,
        catalog_uri: tmp.path().to_string_lossy().to_string(),
        object_store_prefix: tmp.path().to_string_lossy().to_string(),
        wal_dir: tmp.path().join("wal"),
        memtable_size_bytes: 64 * 1024 * 1024,
        gc_grace_period_secs: 0,
        ..Default::default()
    };
    let engine = MeruEngine::open(config).await.unwrap();

    // 512 KiB per row × 1200 rows = ~600 MiB aggregate — crosses the
    // 512 MiB split threshold by a comfortable margin.
    let payload = vec![0xAAu8; 512 * 1024];
    let row_count = 1200i64;
    for i in 0..row_count {
        engine
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(payload.clone()))),
                ]),
            )
            .await
            .unwrap();
        // Flush periodically so we get multiple L0 files to compact.
        if i % 200 == 199 {
            engine.flush().await.unwrap();
        }
    }
    engine.flush().await.unwrap();

    // Compact — must not panic on Arrow i32 overflow.
    engine.compact().await.unwrap();

    // Verify split: output level should have ≥ 2 files.
    let stats = engine.stats();
    let l1 = stats
        .levels
        .iter()
        .find(|l| l.level == 1)
        .expect("L1 must exist after L0→L1 compaction");
    assert!(
        l1.file_count >= 2,
        "Issue #3: compaction must split large outputs; L1 has {} file(s)",
        l1.file_count,
    );

    // Verify non-overlap invariant: L1 files must be key-disjoint.
    let mut ranges: Vec<(Vec<u8>, Vec<u8>)> = l1
        .files
        .iter()
        .map(|f| {
            // `stats.files` doesn't expose key ranges; read from manifest.
            // Fallback: use seq_range — same invariant applies (disjoint).
            (f.path.as_bytes().to_vec(), f.path.as_bytes().to_vec())
        })
        .collect();
    ranges.sort();
    // Basic sanity: each entry is unique (no identical paths).
    ranges.dedup();
    assert_eq!(
        ranges.len(),
        l1.file_count,
        "output files must have unique paths"
    );

    // Verify all rows still readable — proves no data was lost at
    // chunk boundaries.
    for i in (0..row_count).step_by(100) {
        let row = engine.get(&[FieldValue::Int64(i)]).unwrap();
        assert!(
            row.is_some(),
            "Issue #3: row {i} missing after split compaction"
        );
    }
}

/// BUG-0007..0013 regression: version-pinned GC.
///
/// A long-running reader holds a `Version` snapshot that references
/// files obsoleted by a concurrent compaction. With time-based grace
/// alone, a scan of tens of GB on a laptop can outlast the default
/// 5-minute window; GC then deletes files the scan still needs and
/// the scan fails with `IO NotFound`.
///
/// This test simulates the race deterministically:
/// 1. Write data and flush to create L0 files.
/// 2. Pin a snapshot (simulating a long scan in progress).
/// 3. Trigger a compaction that obsoletes those files AND a GC sweep
///    with `gc_grace_period_secs = 0`.
/// 4. Verify the obsoleted files are STILL on disk — the pin held.
/// 5. Drop the pin, trigger another GC sweep.
/// 6. Verify the files are now gone.
#[tokio::test]
async fn version_pin_prevents_gc_of_files_a_reader_might_need() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    // Zero grace so time-based check never keeps files alive — we
    // want to prove the *pin* is what's keeping them, not the timer.
    config.gc_grace_period_secs = 0;
    config.memtable_size_bytes = 64 * 1024 * 1024;
    let engine = MeruEngine::open(config).await.unwrap();

    // Populate enough L0 files for a meaningful compaction.
    for batch in 0..5i64 {
        for i in 0..10i64 {
            let key = batch * 100 + i;
            engine
                .put(
                    vec![FieldValue::Int64(key)],
                    Row::new(vec![
                        Some(FieldValue::Int64(key)),
                        Some(FieldValue::Bytes(bytes::Bytes::from(format!("v{key}")))),
                    ]),
                )
                .await
                .unwrap();
        }
        engine.flush().await.unwrap();
    }

    // Snapshot the L0 directory — these are the files the pinned
    // reader would try to open.
    let l0_dir = tmp.path().join("data").join("L0");
    let files_before: Vec<_> = std::fs::read_dir(&l0_dir)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
        .map(|e| e.path())
        .collect();
    assert!(
        files_before.len() >= 4,
        "need ≥4 L0 files to guarantee compaction; got {}",
        files_before.len()
    );

    // Pin the current snapshot — simulates a reader mid-scan.
    let (pin, pinned_version) = engine.pin_current_snapshot();
    let pinned_snapshot_id = pinned_version.snapshot_id;

    // Run a compaction — obsoletes the pinned version's L0 files.
    engine.compact().await.unwrap();

    // The pinned reader still holds a `Version` that references the
    // obsoleted files. GC must keep them alive.
    let files_after_compact: Vec<_> = std::fs::read_dir(&l0_dir)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
        .map(|e| e.path())
        .collect();
    // Every file the reader might open must still exist.
    for f in &files_before {
        assert!(
            files_after_compact.iter().any(|p| p == f) || !f.exists(),
            "file {} disappeared while a pinned reader still holds its snapshot",
            f.display()
        );
        assert!(
            f.exists(),
            "BUG-0007..0013 regression: file {} was GCed while snapshot {pinned_snapshot_id} \
             is pinned",
            f.display(),
        );
    }

    // Drop the pin — GC should now be able to clean up.
    drop(pin);
    engine.gc_pending_deletions().await;

    // Files are gone now (grace = 0 and no pins).
    let files_after_unpin: Vec<_> = std::fs::read_dir(&l0_dir)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
        .map(|e| e.path())
        .collect();
    // We can't assert exactly zero (compaction might produce intra-L0
    // outputs in edge cases), just that the ORIGINAL files are gone.
    for f in &files_before {
        assert!(
            !f.exists(),
            "file {} should have been GCed after pin released; remaining files = {files_after_unpin:?}",
            f.display()
        );
    }
}

/// Version-pin accounting: multiple concurrent pins at the same
/// snapshot_id are refcounted, and `min_pinned_snapshot` returns the
/// smallest live pin across all snapshots.
#[tokio::test]
async fn pin_refcount_and_min_snapshot() {
    let tmp = tempfile::tempdir().unwrap();
    let engine = MeruEngine::open(test_config(&tmp)).await.unwrap();

    // No pins yet.
    assert!(engine.min_pinned_snapshot().is_none());

    // Two concurrent pins at the same snapshot.
    let (p1, v1) = engine.pin_current_snapshot();
    let (p2, v2) = engine.pin_current_snapshot();
    assert_eq!(v1.snapshot_id, v2.snapshot_id);
    assert_eq!(engine.min_pinned_snapshot(), Some(v1.snapshot_id));

    // Drop one — still pinned (refcount 2 → 1).
    drop(p1);
    assert_eq!(engine.min_pinned_snapshot(), Some(v2.snapshot_id));

    // Drop the other — gone.
    drop(p2);
    assert!(engine.min_pinned_snapshot().is_none());
}

/// IMP-12 regression: compaction-obsoleted files should be tracked for
/// deferred deletion, not immediately removed.
#[tokio::test]
async fn imp12_gc_grace_period() {
    let tmp = tempfile::tempdir().unwrap();
    let mut config = test_config(&tmp);
    // Set grace period to 10 seconds — files should NOT be deleted immediately.
    config.gc_grace_period_secs = 10;
    config.memtable_size_bytes = 512;

    let engine = MeruEngine::open(config).await.unwrap();

    // Write enough data for two flushes.
    for i in 0..50i64 {
        engine
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("data_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    engine.flush().await.unwrap();

    for i in 50..100i64 {
        engine
            .put(
                vec![FieldValue::Int64(i)],
                Row::new(vec![
                    Some(FieldValue::Int64(i)),
                    Some(FieldValue::Bytes(bytes::Bytes::from(format!("data_{i}")))),
                ]),
            )
            .await
            .unwrap();
    }
    engine.flush().await.unwrap();

    // Count L0 files before compaction.
    let l0_dir = tmp.path().join("data").join("L0");
    let files_before: Vec<_> = std::fs::read_dir(&l0_dir)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
        .collect();
    assert!(
        files_before.len() >= 2,
        "should have at least 2 L0 files before compaction"
    );

    // Compact.
    engine.compact().await.unwrap();

    // With a 10-second grace period, the old files should still exist.
    let files_after: Vec<_> = std::fs::read_dir(&l0_dir)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
        .collect();
    // The old files should NOT have been deleted yet (grace period hasn't elapsed).
    assert!(
        files_after.len() >= files_before.len(),
        "IMP-12: files should be preserved during grace period, \
         before={} after={}",
        files_before.len(),
        files_after.len()
    );
}