// armdb 0.2.0 - Docs.rs

//! Integration tests for armdb/src/replication/ (Bitcask variable replication).
//!
//! Each test spins up a real `VarTree` leader + server and a follower + client
//! talking over TCP, exercises catch-up or streaming, then asserts convergence.

#![cfg(all(feature = "replication", feature = "var-collections"))]

use std::net::{SocketAddr, TcpListener};
use std::sync::Arc;
use std::time::{Duration, Instant};

use armdb::Config;
use armdb::ShutdownSignal;
use armdb::VarTree;
use armdb::replication::{ReplicationClientOptions, ReplicationRegistry, ReplicationServerOptions};

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Pick a loopback address with a currently free, OS-assigned port.
///
/// A probe listener is bound to `127.0.0.1:0`, its concrete address is read
/// back, and the listener is closed again before returning. Closing it does
/// NOT reserve the port: another process may grab it before the caller binds.
/// That race is accepted here because the window is tiny — test use only.
///
/// Panics if the probe bind or the address lookup fails.
fn next_bind_addr() -> SocketAddr {
    TcpListener::bind("127.0.0.1:0")
        // `probe` is owned by the closure, so it is closed as soon as the
        // address has been read.
        .and_then(|probe| probe.local_addr())
        .unwrap()
}

/// Repeatedly evaluate `f`, pausing 20 ms between attempts, until it yields
/// `true` or `timeout` has elapsed since the first attempt.
///
/// Returns `true` as soon as `f` succeeds. Once the timeout has been reached,
/// `f` is evaluated one final time and that result is returned.
fn wait_until(timeout: Duration, mut f: impl FnMut() -> bool) -> bool {
    const POLL_INTERVAL: Duration = Duration::from_millis(20);

    let started = Instant::now();
    while !f() {
        if started.elapsed() >= timeout {
            // Out of time: give the condition one last chance.
            return f();
        }
        std::thread::sleep(POLL_INTERVAL);
    }
    true
}

/// Configuration shared by the tests that need no custom tuning.
///
/// Simply returns the crate's `Config::test()` preset, which is meant to keep
/// the shard count small so the tests use few file descriptors. The rogue
/// server in `shard_id_validation` assumes this preset has `shard_count = 2`.
fn test_cfg() -> Config {
    Config::test()
}

// ---------------------------------------------------------------------------
// Test 1: catch_up_smoke  (C1, C10)
// ---------------------------------------------------------------------------
/// The leader is populated with 512 entries — more than one replication batch
/// (BATCH_MAX_ENTRIES = 256) — before the follower ever connects. The
/// follower therefore has to obtain everything through the log-reader
/// catch-up path; the test passes once all 512 keys are present on it.
#[test]
fn catch_up_smoke() {
    const ENTRY_COUNT: u64 = 512;

    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    // Populate the leader while no replication server is running yet.
    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), test_cfg()).unwrap();
    (0..ENTRY_COUNT).for_each(|i| {
        let value = format!("v{i}");
        leader.put(&i.to_be_bytes(), value.as_bytes()).unwrap();
    });

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    // Fresh, empty follower pointed at the leader.
    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), test_cfg()).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let follower_signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(addr, registry, follower_signal.clone())
        .unwrap();

    // Converged once every key written on the leader is visible on the follower.
    let all_replicated = || (0..ENTRY_COUNT).all(|i| follower.contains(&i.to_be_bytes()));
    assert!(
        wait_until(Duration::from_secs(15), all_replicated),
        "follower did not catch up within 15s (len={})",
        follower.len()
    );

    follower_signal.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 2: streaming_after_catchup
// ---------------------------------------------------------------------------
/// Two-phase scenario: 64 entries exist before the follower connects
/// (catch-up), then the leader writes 256 more (keys 512..768) while the
/// follower is connected. The second set must reach the follower through the
/// streaming path.
#[test]
fn streaming_after_catchup() {
    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), test_cfg()).unwrap();
    // Phase 1 data: present on the leader before any follower exists.
    for key in (0u64..64).map(u64::to_be_bytes) {
        leader.put(&key, b"initial").unwrap();
    }

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), test_cfg()).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let follower_signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(addr, registry, follower_signal.clone())
        .unwrap();

    // Block until the follower has absorbed the phase 1 data.
    let caught_up = wait_until(Duration::from_secs(10), || follower.len() >= 64);
    assert!(caught_up, "catch-up did not complete within 10s");

    // Phase 2 data: written while the follower is already connected.
    for key in (512u64..768).map(u64::to_be_bytes) {
        leader.put(&key, b"streamed").unwrap();
    }

    let streamed_visible = || (512u64..768).all(|i| follower.contains(&i.to_be_bytes()));
    assert!(
        wait_until(Duration::from_secs(5), streamed_visible),
        "follower did not receive streamed entries within 5s (len={})",
        follower.len()
    );

    follower_signal.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 3: reconnect_no_loss_no_duplicates  (C11, C12)
// ---------------------------------------------------------------------------
/// Reconnect scenario: a first client replicates 100 entries and is then torn
/// down; the leader writes 100 more while nobody is connected; a second client
/// on the same follower tree reconnects. The follower must end up with every
/// entry and no logical duplicates, i.e. an index length of exactly 200.
#[test]
fn reconnect_no_loss_no_duplicates() {
    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), test_cfg()).unwrap();

    let leader_signal = ShutdownSignal::new();
    let server_options = ReplicationServerOptions {
        heartbeat_interval_secs: 1,
    };
    let _server = leader
        .start_replication_server_with_options(addr, leader_signal.clone(), server_options)
        .unwrap();

    // ---- Connection #1 ----
    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), test_cfg()).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let signal1 = ShutdownSignal::new();
    let client1 = follower
        .start_replication_client(addr, registry.clone(), signal1.clone())
        .unwrap();

    // First half of the data set, written while client #1 is live.
    for key in (0u64..100).map(u64::to_be_bytes) {
        leader.put(&key, b"batch1").unwrap();
    }

    let first_half_arrived = wait_until(Duration::from_secs(10), || follower.len() >= 100);
    assert!(first_half_arrived, "follower did not receive first 100 entries");

    // Tear down client #1: release its handle, then signal shutdown explicitly
    // as well.
    drop(client1);
    signal1.shutdown();

    // Short pause so the server side can observe the disconnect.
    std::thread::sleep(Duration::from_millis(50));

    // Second half of the data set, written while no follower is attached.
    for key in (100u64..200).map(u64::to_be_bytes) {
        leader.put(&key, b"batch2").unwrap();
    }

    // ---- Connection #2: same follower tree and registry, fast reconnect backoff ----
    let signal2 = ShutdownSignal::new();
    let client_options = ReplicationClientOptions {
        reconnect_base_ms: 50,
        reconnect_max_ms: 200,
    };
    let _client2 = follower
        .start_replication_client_with_options(addr, registry, signal2.clone(), client_options)
        .unwrap();

    // No loss: all 200 keys must eventually be present.
    assert!(
        wait_until(Duration::from_secs(15), || follower.len() >= 200),
        "follower did not receive all 200 entries after reconnect (len={})",
        follower.len()
    );

    // No duplicates: the index must report exactly 200, not more.
    assert_eq!(
        follower.len(),
        200,
        "expected 200 unique entries, got {}",
        follower.len()
    );

    signal2.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 4: rotation_during_streaming  (C2, C17)
// ---------------------------------------------------------------------------
/// A small `max_file_size` forces the leader's log to rotate many times while
/// the follower is streaming. The follower must still end up with every entry
/// and be able to read values back, i.e. its index must follow the changing
/// file ids.
#[test]
fn rotation_during_streaming() {
    const ENTRY_COUNT: u64 = 1000;
    // 256-byte payload: 1000 entries carry ~250 KiB of values, several times
    // the 32 KiB file limit even when split across both shards. The previous
    // 33-byte payload produced only ~40 KB in total, which guaranteed at most
    // one rotation and possibly none per shard, so the rotation path was not
    // reliably exercised.
    const VALUE: [u8; 256] = [b'r'; 256];

    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    // 32 KiB per file → many rotations with 1000 × ~260-byte entries.
    let cfg = Config {
        shard_count: 2,
        max_file_size: 32 * 1024,
        write_buffer_size: 16 * 1024,
        ..Config::default()
    };

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), cfg.clone()).unwrap();

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    // The follower is connected before any data exists, so all entries (and
    // all rotations) happen while it is streaming.
    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), cfg).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let follower_signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(addr, registry, follower_signal.clone())
        .unwrap();

    for i in 0..ENTRY_COUNT {
        leader.put(&i.to_be_bytes(), &VALUE).unwrap();
    }

    assert!(
        wait_until(Duration::from_secs(15), || {
            follower.len() >= ENTRY_COUNT as usize
        }),
        "follower did not catch up after rotations (len={})",
        follower.len()
    );

    // Spot-check: values must be readable (not stale DiskLoc).
    for i in [0u64, 499, 999] {
        assert!(
            follower.get(&i.to_be_bytes()).is_some(),
            "key {i} missing from follower after rotation"
        );
    }

    follower_signal.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 5: multi_shard
// ---------------------------------------------------------------------------
/// Replication with four shards and default routing (shard_prefix_bits=0,
/// i.e. hash routing). The client runs one thread per shard, each streaming
/// independently; together they must deliver all 200 keys to the follower.
#[test]
fn multi_shard() {
    const ENTRY_COUNT: u64 = 200;

    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    let cfg = Config {
        shard_count: 4,
        ..Config::default()
    };

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), cfg.clone()).unwrap();
    // Hash routing is expected to scatter these keys over the shards.
    (0..ENTRY_COUNT)
        .map(u64::to_be_bytes)
        .for_each(|key| leader.put(&key, b"multi").unwrap());

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), cfg).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let follower_signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(addr, registry, follower_signal.clone())
        .unwrap();

    let every_key_present = || (0..ENTRY_COUNT).all(|i| follower.contains(&i.to_be_bytes()));
    assert!(
        wait_until(Duration::from_secs(15), every_key_present),
        "multi-shard follower did not receive all entries (len={})",
        follower.len()
    );

    follower_signal.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 6: encrypted_catchup  (C3, A6)
// ---------------------------------------------------------------------------
/// Catch-up with encryption turned on. The leader's entries are written
/// without any explicit flush; the server's A6 flush rule is expected to pad
/// the trailing encrypted page before the log reader runs, so catch-up must
/// still deliver every entry to the follower.
#[cfg(feature = "encryption")]
#[test]
fn encrypted_catchup() {
    const ENTRY_COUNT: u64 = 100;

    let leader_dir = tempfile::tempdir().unwrap();
    let follower_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    // The whole test is gated on the `encryption` feature, so the key field
    // needs no per-field `cfg`.
    let cfg = Config {
        shard_count: 2,
        write_buffer_size: 8192,
        encryption_key: Some([0x42u8; 32]),
        ..Config::default()
    };

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), cfg.clone()).unwrap();
    // Deliberately NOT followed by a manual flush: the A6 automatic flush
    // path is what this test is about.
    for key in (0..ENTRY_COUNT).map(u64::to_be_bytes) {
        leader.put(&key, b"encrypted").unwrap();
    }

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    // The follower shares the leader's config, including the encryption key.
    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), cfg).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));
    let follower_signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(addr, registry, follower_signal.clone())
        .unwrap();

    let every_key_present = || (0..ENTRY_COUNT).all(|i| follower.contains(&i.to_be_bytes()));
    assert!(
        wait_until(Duration::from_secs(15), every_key_present),
        "encrypted follower did not catch up (len={})",
        follower.len()
    );

    follower_signal.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 7: reject_second_follower  (A2)
// ---------------------------------------------------------------------------
/// Only one follower may consume a shard. Two clients connect to the same
/// leader; the second is expected to be answered with an Error frame. Its
/// `start_replication_client` call still succeeds — the rejection arrives
/// over the wire, is logged, and the client thread backs off. The test checks
/// that the first follower converges while the second one stays empty.
#[test]
fn reject_second_follower() {
    const ENTRY_COUNT: u64 = 50;

    let leader_dir = tempfile::tempdir().unwrap();
    let follower1_dir = tempfile::tempdir().unwrap();
    let follower2_dir = tempfile::tempdir().unwrap();
    let addr = next_bind_addr();

    let leader = VarTree::<[u8; 8]>::open(leader_dir.path(), test_cfg()).unwrap();

    let leader_signal = ShutdownSignal::new();
    let _server = leader
        .start_replication_server(addr, leader_signal.clone())
        .unwrap();

    // Follower #1 connects first and becomes the shard consumer.
    let follower1 = Arc::new(VarTree::<[u8; 8]>::open(follower1_dir.path(), test_cfg()).unwrap());
    let registry1 = Arc::new(ReplicationRegistry::new(follower1.as_replication_target()));
    let signal1 = ShutdownSignal::new();
    let _client1 = follower1
        .start_replication_client(addr, registry1, signal1.clone())
        .unwrap();

    // Head start so client #1 has connected and claimed the consumer slot.
    std::thread::sleep(Duration::from_millis(300));

    // Follower #2 connects afterwards; the server should reject it with an
    // Error frame.
    let follower2 = Arc::new(VarTree::<[u8; 8]>::open(follower2_dir.path(), test_cfg()).unwrap());
    let registry2 = Arc::new(ReplicationRegistry::new(follower2.as_replication_target()));
    let signal2 = ShutdownSignal::new();
    let _client2 = follower2
        .start_replication_client(addr, registry2, signal2.clone())
        .unwrap();

    // Data written only after both clients have been started.
    (0..ENTRY_COUNT).for_each(|i| {
        let value = format!("v{i}");
        leader.put(&i.to_be_bytes(), value.as_bytes()).unwrap();
    });

    // The accepted follower has to converge.
    assert!(
        wait_until(Duration::from_secs(10), || follower1.len() >= 50),
        "follower1 did not receive entries (len={})",
        follower1.len()
    );

    // The rejected follower must still be empty after a 3 s observation
    // window: while it is being refused and backing off, nothing may be
    // applied to it.
    std::thread::sleep(Duration::from_secs(3));
    assert_eq!(
        follower2.len(),
        0,
        "follower2 should be empty but has {} entries",
        follower2.len()
    );

    signal1.shutdown();
    signal2.shutdown();
    leader_signal.shutdown();
}

// ---------------------------------------------------------------------------
// Test 8: shard_id_validation  (C15)
// ---------------------------------------------------------------------------
/// A hand-rolled "rogue server" on a raw TCP listener accepts a single
/// connection, completes the handshake, and then sends an EntryBatch tagged
/// with shard_id=99 — a value that cannot match whichever shard thread
/// connected, since the follower has only two shards. The client is expected
/// to reject the batch (a DbError::Replication on that shard thread, followed
/// by a reconnect attempt).
///
/// The observable check is that the follower applies nothing from the bad
/// batch: its index stays empty.
#[test]
fn shard_id_validation() {
    use armdb::replication::protocol::{
        EntryBatch, ShardInfo, SyncRequest, WireEntry, read_frame, write_frame,
    };
    use std::io::BufReader;

    // The listener stays bound (no drop-and-rebind), so the address handed to
    // the client is guaranteed to be ours.
    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
    let server_addr: SocketAddr = listener.local_addr().unwrap();

    // Rogue server: one connection, handshake, one bad batch, then hang up.
    let rogue_handle = std::thread::spawn(move || {
        let (stream, _) = listener.accept().expect("accept");
        let _ = stream.set_nodelay(true);

        let mut reader = BufReader::new(stream.try_clone().unwrap());
        let mut writer = stream;

        // Consume the client's SyncRequest. It only has to decode cleanly;
        // its contents are irrelevant because the shard id sent below is
        // invalid for every shard.
        let frame = read_frame(&mut reader).expect("read SyncRequest");
        let _req = SyncRequest::decode(&frame.payload).expect("decode SyncRequest");

        // Send a valid ShardInfo (shard_count must match follower's shard_count).
        // Config::test() uses shard_count=2.
        let info = ShardInfo {
            shard_count: 2,
            max_file_size: 256 * 1024 * 1024,
        };
        write_frame(&mut writer, &info.encode()).expect("write ShardInfo");

        // shard_id=99 is always wrong for a 2-shard engine, regardless of
        // which shard thread this connection belongs to.
        let bad_batch = EntryBatch {
            shard_id: 99, // mismatch — client must reject
            entries: vec![WireEntry {
                entry_len: 0,
                key_len: 8,
                gsn: 1,
                data: vec![],
            }],
        };
        // Best effort: the client may already have gone away.
        let _ = write_frame(&mut writer, &bad_batch.encode());
        // Returning drops the stream and the listener, closing the connection.
    });

    // Set up a follower that connects to the rogue server.
    let follower_dir = tempfile::tempdir().unwrap();
    let follower = Arc::new(VarTree::<[u8; 8]>::open(follower_dir.path(), test_cfg()).unwrap());
    let registry = Arc::new(ReplicationRegistry::new(follower.as_replication_target()));

    // The client will connect, receive the bad batch, detect shard_id mismatch
    // (C15), log the error, and attempt to reconnect (with backoff).
    // We use a signal that shuts down promptly so the test doesn't wait 30s.
    let signal = ShutdownSignal::new();
    let _client = follower
        .start_replication_client(server_addr, registry, signal.clone())
        .unwrap();

    // Wait for the rogue server to finish sending its bad frame.
    rogue_handle.join().expect("rogue server thread panicked");

    // Give the client a moment to process the bad frame and observe the error.
    std::thread::sleep(Duration::from_millis(500));

    // The follower must have received zero entries (bad batch was rejected).
    assert_eq!(
        follower.len(),
        0,
        "follower must not apply entries from a mismatched shard_id"
    );

    // The client is still running (it reconnected) but the rogue server is gone —
    // connect_timeout will fail and the client thread will back off. Shut down.
    signal.shutdown();
}