skippydb 0.2.2 - Docs.rs

/// ~500 GPU unit tests for QMDB CUDA acceleration.
/// Covers: kernel hashing, sync_mtrees_gpu, active bits, upper tree, full pipeline.
///
/// Run with: `cargo test -p qmdb --features cuda -- gpu::gpu_tests`
#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
    use crate::gpu::{GpuHasher, NodeHashJob};
    use crate::merkletree::twig::{
        sync_mtree, sync_mtrees_gpu, ActiveBits, Twig, NULL_MT_FOR_TWIG,
        NULL_NODE_IN_HIGHER_TREE, NULL_TWIG,
    };
    use crate::utils::hasher::{self, Hash32, ZERO_HASH32};
    use sha2::{Digest, Sha256};
    use std::collections::HashSet;

    // ========================================================================
    // Helpers & Macros
    // ========================================================================

    macro_rules! gpu_or_skip {
        ($max_batch:expr) => {
            match GpuHasher::new($max_batch) {
                Ok(g) => g,
                Err(e) => {
                    eprintln!("Skipping GPU test (no CUDA): {}", e);
                    return;
                }
            }
        };
        () => {
            gpu_or_skip!(200_000)
        };
    }

    fn cpu_hash2(level: u8, a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
        let mut h = Sha256::new();
        h.update([level]);
        h.update(a);
        h.update(b);
        h.finalize().into()
    }

    fn cpu_hash(data: &[u8]) -> [u8; 32] {
        let mut h = Sha256::new();
        h.update(data);
        h.finalize().into()
    }

    fn pseudo_random_bytes(seed: u64, len: usize) -> Vec<u8> {
        let mut v = Vec::with_capacity(len);
        let mut s = seed;
        for _ in 0..len {
            s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            v.push((s >> 33) as u8);
        }
        v
    }

    fn pseudo_random_hash(seed: u64) -> [u8; 32] {
        let b = pseudo_random_bytes(seed, 32);
        let mut h = [0u8; 32];
        h.copy_from_slice(&b);
        h
    }

    fn make_job(level: u8, left: [u8; 32], right: [u8; 32]) -> NodeHashJob {
        NodeHashJob { level, left, right }
    }

    fn fill_hash(val: u8) -> [u8; 32] {
        [val; 32]
    }

    fn seq_hash() -> [u8; 32] {
        let mut h = [0u8; 32];
        for i in 0..32 { h[i] = i as u8; }
        h
    }

    fn verify_node_hash(gpu: &GpuHasher, level: u8, left: &[u8; 32], right: &[u8; 32]) {
        let expected = cpu_hash2(level, left, right);
        let jobs = vec![make_job(level, *left, *right)];
        let result = gpu.batch_node_hash(&jobs);
        assert_eq!(result[0], expected,
            "Node hash mismatch level={}: GPU={} CPU={}",
            level, hex::encode(result[0]), hex::encode(expected));
    }

    fn verify_batch(gpu: &GpuHasher, jobs: &[NodeHashJob]) {
        let results = gpu.batch_node_hash(jobs);
        for (i, job) in jobs.iter().enumerate() {
            let expected = cpu_hash2(job.level, &job.left, &job.right);
            assert_eq!(results[i], expected,
                "Batch mismatch at {}: GPU={} CPU={}",
                i, hex::encode(results[i]), hex::encode(expected));
        }
    }

    // Generate N random jobs with deterministic seed
    fn make_random_jobs(seed: u64, n: usize) -> Vec<NodeHashJob> {
        let mut jobs = Vec::with_capacity(n);
        for i in 0..n {
            let s = seed.wrapping_add(i as u64);
            jobs.push(NodeHashJob {
                level: (s % 256) as u8,
                left: pseudo_random_hash(s * 2),
                right: pseudo_random_hash(s * 2 + 1),
            });
        }
        jobs
    }

    // ========================================================================
    // Category 1: batch_node_hash — Level Variations (30 tests)
    // ========================================================================

    macro_rules! test_level {
        ($name:ident, $level:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let left = seq_hash();
                let right = pseudo_random_hash(42);
                verify_node_hash(&gpu, $level, &left, &right);
            }
        };
    }

    test_level!(test_node_hash_level_0, 0);
    test_level!(test_node_hash_level_1, 1);
    test_level!(test_node_hash_level_7, 7);
    test_level!(test_node_hash_level_8, 8);
    test_level!(test_node_hash_level_9, 9);
    test_level!(test_node_hash_level_10, 10);
    test_level!(test_node_hash_level_11, 11);
    test_level!(test_node_hash_level_12, 12);
    test_level!(test_node_hash_level_13, 13);
    test_level!(test_node_hash_level_63, 63);
    test_level!(test_node_hash_level_127, 127);
    test_level!(test_node_hash_level_128, 128);
    test_level!(test_node_hash_level_254, 254);
    test_level!(test_node_hash_level_255, 255);

    #[test]
    fn test_node_hash_all_levels_0_to_15() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..16).map(|l| make_job(l, seq_hash(), fill_hash(0xAB))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_all_levels_0_to_63() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..64).map(|l| make_job(l, seq_hash(), fill_hash(0xCD))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_all_levels_0_to_255() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..=255u8).map(|l| make_job(l, seq_hash(), fill_hash(0xEF))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_same_level_repeated() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..1000).map(|i| {
            make_job(5, pseudo_random_hash(i * 2), pseudo_random_hash(i * 2 + 1))
        }).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_alternating_levels() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..500).map(|i| {
            let l = if i % 2 == 0 { 0 } else { 255 };
            make_job(l, pseudo_random_hash(i * 2), pseudo_random_hash(i * 2 + 1))
        }).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_level_ascending() {
        let gpu = gpu_or_skip!();
        let left = fill_hash(0x11);
        let right = fill_hash(0x22);
        let jobs: Vec<NodeHashJob> = (0..=255u8).map(|l| make_job(l, left, right)).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_level_descending() {
        let gpu = gpu_or_skip!();
        let left = fill_hash(0x33);
        let right = fill_hash(0x44);
        let jobs: Vec<NodeHashJob> = (0..=255u8).rev().map(|l| make_job(l, left, right)).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_even_levels_only() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..128).map(|i| make_job(i * 2, seq_hash(), fill_hash(0x55))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_odd_levels_only() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..128).map(|i| make_job(i * 2 + 1, seq_hash(), fill_hash(0xAA))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_power_of_two_levels() {
        let gpu = gpu_or_skip!();
        let levels = [1u8, 2, 4, 8, 16, 32, 64, 128];
        let jobs: Vec<NodeHashJob> = levels.iter().map(|&l| make_job(l, seq_hash(), fill_hash(0x77))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_merkle_tree_levels() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..13).map(|l| make_job(l, pseudo_random_hash(l as u64), pseudo_random_hash(l as u64 + 100))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_upper_tree_levels() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (12..64).map(|l| make_job(l, pseudo_random_hash(l as u64), pseudo_random_hash(l as u64 + 200))).collect();
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_level_affects_output() {
        let gpu = gpu_or_skip!();
        let left = fill_hash(0x42);
        let right = fill_hash(0x84);
        let mut hashes = HashSet::new();
        for l in 0..=255u8 {
            let result = gpu.batch_node_hash(&[make_job(l, left, right)]);
            assert!(hashes.insert(result[0]), "Duplicate hash at level {}", l);
        }
    }

    #[test]
    fn test_node_hash_level_0_vs_level_1() {
        let gpu = gpu_or_skip!();
        let left = fill_hash(0x42);
        let right = fill_hash(0x84);
        let r0 = gpu.batch_node_hash(&[make_job(0, left, right)]);
        let r1 = gpu.batch_node_hash(&[make_job(1, left, right)]);
        assert_ne!(r0[0], r1[0], "Level 0 and 1 should produce different hashes");
    }

    #[test]
    fn test_node_hash_level_byte_position() {
        let gpu = gpu_or_skip!();
        // Verify by comparing with manual SHA256 construction
        let level = 42u8;
        let left = fill_hash(0x11);
        let right = fill_hash(0x22);
        let mut manual_input = vec![level];
        manual_input.extend_from_slice(&left);
        manual_input.extend_from_slice(&right);
        let expected = cpu_hash(&manual_input);
        let result = gpu.batch_node_hash(&[make_job(level, left, right)]);
        assert_eq!(result[0], expected);
    }

    #[test]
    fn test_node_hash_batch_mixed_levels_1000() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(7777, 1000);
        verify_batch(&gpu, &jobs);
    }

    // ========================================================================
    // Category 2: batch_node_hash — Input Pattern Variations (45 tests)
    // ========================================================================

    #[test]
    fn test_node_hash_all_zeros() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 0, &ZERO_HASH32, &ZERO_HASH32);
    }

    #[test]
    fn test_node_hash_all_ones() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 0, &fill_hash(0xFF), &fill_hash(0xFF));
    }

    #[test]
    fn test_node_hash_left_zero_right_ones() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 5, &ZERO_HASH32, &fill_hash(0xFF));
    }

    #[test]
    fn test_node_hash_left_ones_right_zero() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 5, &fill_hash(0xFF), &ZERO_HASH32);
    }

    #[test]
    fn test_node_hash_sequential_bytes_left() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 3, &seq_hash(), &ZERO_HASH32);
    }

    #[test]
    fn test_node_hash_sequential_bytes_right() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 3, &ZERO_HASH32, &seq_hash());
    }

    #[test]
    fn test_node_hash_sequential_bytes_both() {
        let gpu = gpu_or_skip!();
        let mut right = [0u8; 32];
        for i in 0..32 { right[i] = (i + 32) as u8; }
        verify_node_hash(&gpu, 3, &seq_hash(), &right);
    }

    macro_rules! test_single_bit {
        ($name:ident, $side:ident, $byte_idx:expr, $bit:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let mut left = ZERO_HASH32;
                let mut right = ZERO_HASH32;
                let target = if stringify!($side) == "left" { &mut left } else { &mut right };
                target[$byte_idx] = 1 << $bit;
                verify_node_hash(&gpu, 7, &left, &right);
            }
        };
    }

    test_single_bit!(test_node_hash_single_bit_left_0, left, 0, 0);
    test_single_bit!(test_node_hash_single_bit_left_7, left, 0, 7);
    test_single_bit!(test_node_hash_single_bit_left_255, left, 31, 7);
    test_single_bit!(test_node_hash_single_bit_right_0, right, 0, 0);
    test_single_bit!(test_node_hash_single_bit_right_255, right, 31, 7);

    #[test]
    fn test_node_hash_alternating_0x55() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 4, &fill_hash(0x55), &fill_hash(0x55));
    }

    #[test]
    fn test_node_hash_alternating_0x_aa() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 4, &fill_hash(0xAA), &fill_hash(0xAA));
    }

    #[test]
    fn test_node_hash_left_equals_right() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_hash(123);
        verify_node_hash(&gpu, 6, &data, &data);
    }

    #[test]
    fn test_node_hash_left_right_swapped() {
        let gpu = gpu_or_skip!();
        let left = pseudo_random_hash(100);
        let right = pseudo_random_hash(200);
        let r1 = gpu.batch_node_hash(&[make_job(5, left, right)]);
        let r2 = gpu.batch_node_hash(&[make_job(5, right, left)]);
        assert_ne!(r1[0], r2[0], "hash(L,R) should differ from hash(R,L)");
    }

    #[test]
    fn test_node_hash_null_twig_hashes() {
        let gpu = gpu_or_skip!();
        // Reproduce NULL_MT_FOR_TWIG[1] from leaves
        let left = NULL_MT_FOR_TWIG[2];
        let right = NULL_MT_FOR_TWIG[3];
        let expected = NULL_MT_FOR_TWIG[1];
        let result = gpu.batch_node_hash(&[make_job(10, left, right)]);
        assert_eq!(result[0], expected, "Null twig root level mismatch");
    }

    #[test]
    fn test_node_hash_null_twig_root() {
        let gpu = gpu_or_skip!();
        let expected = NULL_TWIG.twig_root;
        let result = gpu.batch_node_hash(&[make_job(11, NULL_TWIG.left_root, NULL_TWIG.active_bits_mtl3)]);
        assert_eq!(result[0], expected, "NULL_TWIG.twig_root mismatch");
    }

    #[test]
    fn test_node_hash_known_sha256_vector() {
        let gpu = gpu_or_skip!();
        // Create a known 65-byte input and verify against Rust SHA256
        let mut input = [0u8; 65];
        for i in 0..65 { input[i] = i as u8; }
        let expected = cpu_hash(&input);
        let mut left = [0u8; 32];
        let mut right = [0u8; 32];
        left.copy_from_slice(&input[1..33]);
        right.copy_from_slice(&input[33..65]);
        let result = gpu.batch_node_hash(&[make_job(input[0], left, right)]);
        assert_eq!(result[0], expected);
    }

    #[test]
    fn test_node_hash_cascading_pattern() {
        let gpu = gpu_or_skip!();
        let mut left = fill_hash(0x01);
        let mut right = fill_hash(0x02);
        for i in 0..50u8 {
            let result = gpu.batch_node_hash(&[make_job(i, left, right)]);
            let expected = cpu_hash2(i, &left, &right);
            assert_eq!(result[0], expected, "Cascade step {} mismatch", i);
            right = left;
            left = result[0];
        }
    }

    macro_rules! test_random_seed {
        ($name:ident, $seed:expr, $n:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let jobs = make_random_jobs($seed, $n);
                verify_batch(&gpu, &jobs);
            }
        };
    }

    test_random_seed!(test_node_hash_random_seed_0, 0, 100);
    test_random_seed!(test_node_hash_random_seed_42, 42, 100);
    test_random_seed!(test_node_hash_random_seed_12345, 12345, 100);
    test_random_seed!(test_node_hash_random_seed_999999, 999999, 100);

    #[test]
    fn test_node_hash_all_same_input() {
        let gpu = gpu_or_skip!();
        let job = make_job(5, fill_hash(0x42), fill_hash(0x84));
        let jobs: Vec<NodeHashJob> = vec![job; 500];
        let results = gpu.batch_node_hash(&jobs);
        for i in 1..500 {
            assert_eq!(results[0], results[i], "Identical inputs should produce identical outputs");
        }
        assert_eq!(results[0], cpu_hash2(5, &fill_hash(0x42), &fill_hash(0x84)));
    }

    #[test]
    fn test_node_hash_all_unique_inputs() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(55555, 500);
        let results = gpu.batch_node_hash(&jobs);
        let unique: HashSet<[u8; 32]> = results.iter().cloned().collect();
        assert_eq!(unique.len(), 500, "500 unique inputs should give 500 unique outputs");
    }

    #[test]
    fn test_node_hash_max_entropy_left() {
        let gpu = gpu_or_skip!();
        let left = pseudo_random_hash(9999);
        verify_node_hash(&gpu, 3, &left, &ZERO_HASH32);
    }

    #[test]
    fn test_node_hash_max_entropy_right() {
        let gpu = gpu_or_skip!();
        let right = pseudo_random_hash(8888);
        verify_node_hash(&gpu, 3, &ZERO_HASH32, &right);
    }

    #[test]
    fn test_node_hash_max_entropy_both() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 7, &pseudo_random_hash(1111), &pseudo_random_hash(2222));
    }

    #[test]
    fn test_node_hash_repeating_pattern_abcd() {
        let gpu = gpu_or_skip!();
        let mut h = [0u8; 32];
        for i in 0..16 { h[i*2] = 0xAB; h[i*2+1] = 0xCD; }
        verify_node_hash(&gpu, 2, &h, &h);
    }

    #[test]
    fn test_node_hash_repeating_pattern_dead() {
        let gpu = gpu_or_skip!();
        let mut h = [0u8; 32];
        for i in 0..16 { h[i*2] = 0xDE; h[i*2+1] = 0xAD; }
        verify_node_hash(&gpu, 2, &h, &h);
    }

    #[test]
    fn test_node_hash_incrementing_fill() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        for i in 0..100u8 {
            let mut left = [0u8; 32];
            let mut right = [0u8; 32];
            for j in 0..32 { left[j] = i.wrapping_add(j as u8); right[j] = i.wrapping_add(j as u8 + 32); }
            jobs.push(make_job(i % 12, left, right));
        }
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_xor_pattern() {
        let gpu = gpu_or_skip!();
        let mut left = [0u8; 32];
        let mut right = [0u8; 32];
        for i in 0..32 { left[i] = (i as u8) ^ 0x55; right[i] = (i as u8) ^ 0xAA; }
        verify_node_hash(&gpu, 8, &left, &right);
    }

    #[test]
    fn test_node_hash_byte_boundary_0x7f_0x80() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 10, &fill_hash(0x7F), &fill_hash(0x80));
    }

    #[test]
    fn test_node_hash_palindrome_input() {
        let gpu = gpu_or_skip!();
        let mut h = [0u8; 32];
        for i in 0..16 { h[i] = i as u8; h[31-i] = i as u8; }
        verify_node_hash(&gpu, 6, &h, &h);
    }

    #[test]
    fn test_node_hash_left_is_hash_of_right() {
        let gpu = gpu_or_skip!();
        let right = pseudo_random_hash(777);
        let left = cpu_hash(&right);
        verify_node_hash(&gpu, 9, &left, &right);
    }

    #[test]
    fn test_node_hash_one_byte_diff_left() {
        let gpu = gpu_or_skip!();
        let mut left1 = pseudo_random_hash(100);
        let left2 = left1;
        left1[15] ^= 1; // flip 1 bit
        let right = pseudo_random_hash(200);
        let r1 = gpu.batch_node_hash(&[make_job(5, left1, right)]);
        let r2 = gpu.batch_node_hash(&[make_job(5, left2, right)]);
        assert_ne!(r1[0], r2[0], "1-bit diff in left should change hash");
    }

    #[test]
    fn test_node_hash_one_byte_diff_right() {
        let gpu = gpu_or_skip!();
        let left = pseudo_random_hash(300);
        let mut right1 = pseudo_random_hash(400);
        let right2 = right1;
        right1[0] ^= 0x80;
        let r1 = gpu.batch_node_hash(&[make_job(7, left, right1)]);
        let r2 = gpu.batch_node_hash(&[make_job(7, left, right2)]);
        assert_ne!(r1[0], r2[0], "1-bit diff in right should change hash");
    }

    #[test]
    fn test_node_hash_null_node_higher_tree() {
        let gpu = gpu_or_skip!();
        // Reproduce NULL_NODE_IN_HIGHER_TREE[13] (FIRST_LEVEL_ABOVE_TWIG)
        let expected = NULL_NODE_IN_HIGHER_TREE[13];
        let result = gpu.batch_node_hash(&[make_job(12, NULL_TWIG.twig_root, NULL_TWIG.twig_root)]);
        assert_eq!(result[0], expected, "NULL_NODE_IN_HIGHER_TREE[13] mismatch");
    }

    #[test]
    fn test_node_hash_counter_as_bytes() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        for i in 0..100u64 {
            let mut left = ZERO_HASH32;
            let bytes = i.to_be_bytes();
            left[24..32].copy_from_slice(&bytes);
            jobs.push(make_job((i % 12) as u8, left, ZERO_HASH32));
        }
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_hash_of_index() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        for i in 0..100 {
            let left = cpu_hash(&(i as u64).to_le_bytes());
            let right = cpu_hash(&((i + 100) as u64).to_le_bytes());
            jobs.push(make_job((i % 12) as u8, left, right));
        }
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_node_hash_utf8_encoded() {
        let gpu = gpu_or_skip!();
        let mut left = ZERO_HASH32;
        left[..5].copy_from_slice(b"hello");
        let mut right = ZERO_HASH32;
        right[..5].copy_from_slice(b"world");
        verify_node_hash(&gpu, 0, &left, &right);
    }

    #[test]
    fn test_node_hash_runs_of_bits() {
        let gpu = gpu_or_skip!();
        let mut h = [0u8; 32];
        for i in 0..32 { h[i] = if i % 2 == 0 { 0x00 } else { 0xFF }; }
        verify_node_hash(&gpu, 5, &h, &h);
    }

    #[test]
    fn test_node_hash_checkerboard() {
        let gpu = gpu_or_skip!();
        let mut h = [0u8; 32];
        for i in 0..32 { h[i] = if i % 2 == 0 { 0x55 } else { 0xAA }; }
        verify_node_hash(&gpu, 3, &h, &h);
    }

    // ========================================================================
    // Category 3: batch_node_hash — Batch Size Variations (25 tests)
    // ========================================================================

    macro_rules! test_batch_size {
        ($name:ident, $size:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let jobs = make_random_jobs(42424, $size);
                verify_batch(&gpu, &jobs);
            }
        };
    }

    test_batch_size!(test_node_hash_batch_1, 1);
    test_batch_size!(test_node_hash_batch_2, 2);
    test_batch_size!(test_node_hash_batch_3, 3);
    test_batch_size!(test_node_hash_batch_7, 7);
    test_batch_size!(test_node_hash_batch_8, 8);
    test_batch_size!(test_node_hash_batch_15, 15);
    test_batch_size!(test_node_hash_batch_16, 16);
    test_batch_size!(test_node_hash_batch_31, 31);
    test_batch_size!(test_node_hash_batch_32, 32);
    test_batch_size!(test_node_hash_batch_64, 64);
    test_batch_size!(test_node_hash_batch_100, 100);
    test_batch_size!(test_node_hash_batch_255, 255);
    test_batch_size!(test_node_hash_batch_256, 256);
    test_batch_size!(test_node_hash_batch_257, 257);
    test_batch_size!(test_node_hash_batch_500, 500);
    test_batch_size!(test_node_hash_batch_512, 512);
    test_batch_size!(test_node_hash_batch_1000, 1000);
    test_batch_size!(test_node_hash_batch_1023, 1023);
    test_batch_size!(test_node_hash_batch_1024, 1024);
    test_batch_size!(test_node_hash_batch_2048, 2048);
    test_batch_size!(test_node_hash_batch_4096, 4096);
    test_batch_size!(test_node_hash_batch_10000, 10000);
    test_batch_size!(test_node_hash_batch_50000, 50000);
    test_batch_size!(test_node_hash_batch_100000, 100000);
    test_batch_size!(test_node_hash_batch_200000, 200000);


    // ========================================================================
    // Category 4: batch_node_hash_into — Output Buffer Tests (20 tests)
    // ========================================================================

    #[test]
    fn test_node_hash_into_basic() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(111, 10);
        let expected = gpu.batch_node_hash(&jobs);
        let mut out = vec![ZERO_HASH32; 10];
        gpu.batch_node_hash_into(&jobs, &mut out);
        assert_eq!(out, expected);
    }

    #[test]
    fn test_node_hash_into_single_job() {
        let gpu = gpu_or_skip!();
        let jobs = vec![make_job(5, fill_hash(0x11), fill_hash(0x22))];
        let mut out = vec![ZERO_HASH32; 1];
        gpu.batch_node_hash_into(&jobs, &mut out);
        assert_eq!(out[0], cpu_hash2(5, &fill_hash(0x11), &fill_hash(0x22)));
    }

    #[test]
    fn test_node_hash_into_1000_jobs() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(222, 1000);
        let mut out = vec![ZERO_HASH32; 1000];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right), "Into mismatch at {}", i);
        }
    }

    #[test]
    fn test_node_hash_into_matches_batch() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(333, 500);
        let batch_result = gpu.batch_node_hash(&jobs);
        let mut into_result = vec![ZERO_HASH32; 500];
        gpu.batch_node_hash_into(&jobs, &mut into_result);
        assert_eq!(batch_result, into_result);
    }

    #[test]
    fn test_node_hash_into_overwrites_existing() {
        let gpu = gpu_or_skip!();
        let jobs = vec![make_job(0, ZERO_HASH32, ZERO_HASH32)];
        let mut out = vec![fill_hash(0xFF)];
        gpu.batch_node_hash_into(&jobs, &mut out);
        assert_ne!(out[0], fill_hash(0xFF), "Old data should be overwritten");
        assert_eq!(out[0], cpu_hash2(0, &ZERO_HASH32, &ZERO_HASH32));
    }

    #[test]
    fn test_node_hash_into_preallocated_zeros() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(444, 50);
        let mut out = vec![ZERO_HASH32; 50];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_preallocated_ones() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(555, 50);
        let mut out = vec![fill_hash(0xFF); 50];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_empty() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = vec![];
        let mut out: Vec<[u8; 32]> = vec![];
        gpu.batch_node_hash_into(&jobs, &mut out);
        assert!(out.is_empty());
    }

    #[test]
    fn test_node_hash_into_256_jobs() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(666, 256);
        let mut out = vec![ZERO_HASH32; 256];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_257_jobs() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(777, 257);
        let mut out = vec![ZERO_HASH32; 257];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_random_100() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(888, 100);
        let mut out = vec![ZERO_HASH32; 100];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_all_levels() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = (0..=255u8).map(|l| make_job(l, seq_hash(), fill_hash(0x33))).collect();
        let mut out = vec![ZERO_HASH32; 256];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_repeated_calls() {
        let gpu = gpu_or_skip!();
        let mut out = vec![ZERO_HASH32; 10];
        for round in 0..3 {
            let jobs = make_random_jobs(round * 100, 10);
            gpu.batch_node_hash_into(&jobs, &mut out);
            for (i, job) in jobs.iter().enumerate() {
                assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
            }
        }
    }

    #[test]
    fn test_node_hash_into_large_10000() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(10101, 10000);
        let mut out = vec![ZERO_HASH32; 10000];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right), "Into 10K mismatch at {}", i);
        }
    }

    #[test]
    fn test_node_hash_into_deterministic() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(20202, 100);
        let mut out1 = vec![ZERO_HASH32; 100];
        let mut out2 = vec![ZERO_HASH32; 100];
        gpu.batch_node_hash_into(&jobs, &mut out1);
        gpu.batch_node_hash_into(&jobs, &mut out2);
        assert_eq!(out1, out2, "Two identical calls should produce identical results");
    }

    #[test]
    fn test_batch_node_hash_gpu_wrapper() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(30303, 100);
        let mut out = vec![ZERO_HASH32; 100];
        hasher::batch_node_hash_gpu(&gpu, &jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_batch_node_hash_gpu_wrapper_empty() {
        let gpu = gpu_or_skip!();
        let jobs: Vec<NodeHashJob> = vec![];
        let mut out: Vec<[u8; 32]> = vec![];
        hasher::batch_node_hash_gpu(&gpu, &jobs, &mut out);
        assert!(out.is_empty());
    }

    #[test]
    fn test_batch_node_hash_gpu_wrapper_large() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(40404, 5000);
        let mut out = vec![ZERO_HASH32; 5000];
        hasher::batch_node_hash_gpu(&gpu, &jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_node_hash_into_idempotent() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(50505, 200);
        let mut out = vec![ZERO_HASH32; 200];
        gpu.batch_node_hash_into(&jobs, &mut out);
        let first = out.clone();
        gpu.batch_node_hash_into(&jobs, &mut out);
        assert_eq!(first, out, "Idempotent check failed");
    }

    // ========================================================================
    // Category 5: batch_hash_variable — Length Variations (50 tests)
    // ========================================================================

    fn verify_var_hash(gpu: &GpuHasher, data: &[u8]) {
        let expected = cpu_hash(data);
        let result = gpu.batch_hash_variable(&[data]);
        assert_eq!(result[0], expected,
            "Var hash mismatch (len={}): GPU={} CPU={}",
            data.len(), hex::encode(result[0]), hex::encode(expected));
    }

    macro_rules! test_var_len {
        ($name:ident, $len:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let data = pseudo_random_bytes(($len as u64).wrapping_mul(31337), $len);
                verify_var_hash(&gpu, &data);
            }
        };
    }

    test_var_len!(test_var_hash_len_1, 1);
    test_var_len!(test_var_hash_len_2, 2);
    test_var_len!(test_var_hash_len_10, 10);
    test_var_len!(test_var_hash_len_31, 31);
    test_var_len!(test_var_hash_len_32, 32);
    test_var_len!(test_var_hash_len_33, 33);
    test_var_len!(test_var_hash_len_54, 54);
    test_var_len!(test_var_hash_len_55, 55);
    test_var_len!(test_var_hash_len_56, 56);
    test_var_len!(test_var_hash_len_57, 57);
    test_var_len!(test_var_hash_len_63, 63);
    test_var_len!(test_var_hash_len_64, 64);
    test_var_len!(test_var_hash_len_65, 65);
    test_var_len!(test_var_hash_len_100, 100);
    test_var_len!(test_var_hash_len_118, 118);
    test_var_len!(test_var_hash_len_119, 119);
    test_var_len!(test_var_hash_len_120, 120);
    test_var_len!(test_var_hash_len_127, 127);
    test_var_len!(test_var_hash_len_128, 128);
    test_var_len!(test_var_hash_len_129, 129);
    test_var_len!(test_var_hash_len_200, 200);
    test_var_len!(test_var_hash_len_255, 255);
    test_var_len!(test_var_hash_len_256, 256);
    test_var_len!(test_var_hash_len_300, 300);
    test_var_len!(test_var_hash_len_512, 512);
    test_var_len!(test_var_hash_len_1000, 1000);
    test_var_len!(test_var_hash_len_1024, 1024);
    test_var_len!(test_var_hash_len_2048, 2048);
    test_var_len!(test_var_hash_len_4096, 4096);

    #[test]
    fn test_var_hash_all_zeros_various_lengths() {
        let gpu = gpu_or_skip!();
        for len in &[1, 32, 55, 56, 64, 65, 128] {
            let data = vec![0u8; *len];
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_all_ones_various_lengths() {
        let gpu = gpu_or_skip!();
        for len in &[1, 32, 55, 56, 64, 65, 128] {
            let data = vec![0xFFu8; *len];
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_sequential_fill_various() {
        let gpu = gpu_or_skip!();
        for len in &[10, 50, 64, 100, 200] {
            let data: Vec<u8> = (0..*len as u32).map(|i| (i % 256) as u8).collect();
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_block_boundary_sweep() {
        let gpu = gpu_or_skip!();
        for len in 50..=70 {
            let data = pseudo_random_bytes(len as u64, len);
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_two_block_boundary_sweep() {
        let gpu = gpu_or_skip!();
        for len in 113..=135 {
            let data = pseudo_random_bytes(len as u64 + 1000, len);
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_three_block_boundary() {
        let gpu = gpu_or_skip!();
        for len in 183..=193 {
            let data = pseudo_random_bytes(len as u64 + 2000, len);
            verify_var_hash(&gpu, &data);
        }
    }

    #[test]
    fn test_var_hash_identical_inputs_same_length() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(12345, 100);
        let inputs: Vec<&[u8]> = (0..100).map(|_| data.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        let expected = cpu_hash(&data);
        for (i, r) in results.iter().enumerate() {
            assert_eq!(*r, expected, "Identical input {} should give same hash", i);
        }
    }

    #[test]
    fn test_var_hash_unique_inputs_same_length() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..100 {
            all_data.push(pseudo_random_bytes(i as u64 * 7, 100));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        let unique: HashSet<[u8; 32]> = results.iter().cloned().collect();
        assert_eq!(unique.len(), 100, "100 unique inputs should give 100 unique hashes");
    }

    #[test]
    fn test_var_hash_same_data_different_lengths() {
        let gpu = gpu_or_skip!();
        let full_data = pseudo_random_bytes(99999, 200);
        let mut results = Vec::new();
        for len in &[50, 100, 150, 200] {
            let r = gpu.batch_hash_variable(&[&full_data[..*len]]);
            results.push(r[0]);
        }
        // All should be different
        for i in 0..results.len() {
            for j in i+1..results.len() {
                assert_ne!(results[i], results[j], "Different lengths should give different hashes");
            }
        }
    }

    macro_rules! test_var_random_seed {
        ($name:ident, $seed:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let mut all_data: Vec<Vec<u8>> = Vec::new();
                let mut s = $seed as u64;
                for _ in 0..100 {
                    s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
                    let len = 10 + (s % 291) as usize;
                    all_data.push(pseudo_random_bytes(s, len));
                }
                let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
                let results = gpu.batch_hash_variable(&inputs);
                for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
                    let expected = cpu_hash(data);
                    assert_eq!(*result, expected, "Var hash seed {} mismatch at {}", $seed, i);
                }
            }
        };
    }

    test_var_random_seed!(test_var_hash_random_seed_0, 0);
    test_var_random_seed!(test_var_hash_random_seed_42, 42);
    test_var_random_seed!(test_var_hash_random_seed_999, 999);

    #[test]
    fn test_var_hash_entry_like_50_bytes() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(50505, 50);
        verify_var_hash(&gpu, &data);
    }

    #[test]
    fn test_var_hash_entry_like_150_bytes() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(15015, 150);
        verify_var_hash(&gpu, &data);
    }

    #[test]
    fn test_var_hash_entry_like_300_bytes() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(30030, 300);
        verify_var_hash(&gpu, &data);
    }

    #[test]
    fn test_var_hash_single_byte_values_0_to_255() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..=255u8).map(|b| vec![b]).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Single byte {} mismatch", i);
        }
    }

    #[test]
    fn test_var_hash_nist_empty_string() {
        let _gpu = gpu_or_skip!();
        // SHA256("") = e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        // Note: batch_hash_variable with empty input should still work
        // But we test 1-byte to avoid zero-length edge case
        let data = b"";
        // For zero-length we'd need special handling. Let's verify the known value via CPU first.
        let expected = cpu_hash(data);
        assert_eq!(
            hex::encode(expected),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn test_var_hash_nist_abc() {
        let gpu = gpu_or_skip!();
        let data = b"abc";
        let expected_hex = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad";
        let result = gpu.batch_hash_variable(&[data.as_slice()]);
        assert_eq!(hex::encode(result[0]), expected_hex, "SHA256('abc') NIST vector mismatch");
    }

    #[test]
    fn test_var_hash_nist_448_bits() {
        let gpu = gpu_or_skip!();
        let data = b"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
        let expected_hex = "248d6a61d20638b8e5c026930c3e6039a33ce45964ff2167f6ecedd419db06c1";
        let result = gpu.batch_hash_variable(&[data.as_slice()]);
        assert_eq!(hex::encode(result[0]), expected_hex, "SHA256 NIST 448-bit vector mismatch");
    }

    #[test]
    fn test_var_hash_ascii_text() {
        let gpu = gpu_or_skip!();
        let texts: Vec<&[u8]> = vec![
            b"Hello, World!",
            b"The quick brown fox jumps over the lazy dog",
            b"QMDB GPU acceleration test",
            b"0123456789abcdef",
        ];
        for text in &texts {
            let expected = cpu_hash(text);
            let result = gpu.batch_hash_variable(&[*text]);
            assert_eq!(result[0], expected, "ASCII text hash mismatch");
        }
    }

    #[test]
    fn test_var_hash_high_bytes_only() {
        let gpu = gpu_or_skip!();
        let data: Vec<u8> = (128..=255).cycle().take(200).collect();
        verify_var_hash(&gpu, &data);
    }


    // ========================================================================
    // Category 6: batch_hash_variable — Batch & Mix Tests (30 tests)
    // ========================================================================

    macro_rules! test_var_batch_size {
        ($name:ident, $size:expr, $len:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                let mut all_data: Vec<Vec<u8>> = Vec::new();
                for i in 0..$size {
                    all_data.push(pseudo_random_bytes(i as u64 * 7 + 99, $len));
                }
                let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
                let results = gpu.batch_hash_variable(&inputs);
                for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
                    assert_eq!(*result, cpu_hash(data), "Var batch {} mismatch at {}", $size, i);
                }
            }
        };
    }

    test_var_batch_size!(test_var_hash_batch_1, 1, 100);
    test_var_batch_size!(test_var_hash_batch_2, 2, 100);
    test_var_batch_size!(test_var_hash_batch_10, 10, 100);
    test_var_batch_size!(test_var_hash_batch_100_same_length, 100, 64);
    test_var_batch_size!(test_var_hash_batch_256, 256, 100);
    test_var_batch_size!(test_var_hash_batch_500, 500, 80);
    test_var_batch_size!(test_var_hash_batch_1000, 1000, 100);
    test_var_batch_size!(test_var_hash_batch_5000, 5000, 100);
    test_var_batch_size!(test_var_hash_batch_10000, 10000, 80);
    test_var_batch_size!(test_var_hash_batch_50000, 50000, 60);

    #[test]
    fn test_var_hash_batch_10_mixed_lengths() {
        let gpu = gpu_or_skip!();
        let lengths = [10, 55, 56, 64, 65, 100, 128, 200, 256, 300];
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for (i, &len) in lengths.iter().enumerate() {
            all_data.push(pseudo_random_bytes(i as u64 * 17, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Mixed length {} at {} mismatch", data.len(), i);
        }
    }

    #[test]
    fn test_var_hash_batch_100_mixed_lengths() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..100 {
            let len = 10 + (i * 3) % 291;
            all_data.push(pseudo_random_bytes(i as u64 * 13, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Mixed 100 mismatch at {} (len={})", i, data.len());
        }
    }

    #[test]
    fn test_var_hash_empty_batch() {
        let gpu = gpu_or_skip!();
        let inputs: Vec<&[u8]> = vec![];
        let results = gpu.batch_hash_variable(&inputs);
        assert!(results.is_empty());
    }

    #[test]
    fn test_var_hash_mixed_short_and_long() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..100 {
            let len = if i % 2 == 0 { 1 } else { 1000 };
            all_data.push(pseudo_random_bytes(i as u64, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Short/long mix mismatch at {}", i);
        }
    }

    test_var_batch_size!(test_var_hash_all_same_length_55, 200, 55);
    test_var_batch_size!(test_var_hash_all_same_length_56, 200, 56);
    test_var_batch_size!(test_var_hash_all_same_length_64, 200, 64);

    #[test]
    fn test_var_hash_lengths_1_to_200() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for len in 1..=200 {
            all_data.push(pseudo_random_bytes(len as u64 * 41, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Length {} mismatch", i + 1);
        }
    }

    #[test]
    fn test_var_hash_lengths_decreasing() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..200 {
            let len = 200 - i;
            all_data.push(pseudo_random_bytes(len as u64 * 43, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Decreasing len mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_batch_hash_variable_gpu_wrapper() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..50 {
            all_data.push(pseudo_random_bytes(i as u64 * 19, 100));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = hasher::batch_hash_variable_gpu(&gpu, &inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Wrapper mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_gpu_wrapper_empty() {
        let gpu = gpu_or_skip!();
        let inputs: Vec<&[u8]> = vec![];
        let results = hasher::batch_hash_variable_gpu(&gpu, &inputs);
        assert!(results.is_empty());
    }

    #[test]
    fn test_var_hash_gpu_wrapper_large() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..5000 {
            all_data.push(pseudo_random_bytes(i as u64 * 23, 80));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = hasher::batch_hash_variable_gpu(&gpu, &inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Wrapper large mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_interleaved_lengths() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..200 {
            let len = if i % 2 == 0 { 10 } else { 300 };
            all_data.push(pseudo_random_bytes(i as u64 * 29, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Interleaved mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_fibonacci_lengths() {
        let gpu = gpu_or_skip!();
        let mut fibs = vec![1usize, 2];
        while *fibs.last().unwrap() < 1000 {
            let n = fibs.len();
            fibs.push(fibs[n-1] + fibs[n-2]);
        }
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for (i, &len) in fibs.iter().enumerate() {
            if len > 4096 { break; }
            all_data.push(pseudo_random_bytes(i as u64 * 37, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (_i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Fibonacci len {} mismatch", data.len());
        }
    }

    #[test]
    fn test_var_hash_powers_of_2_lengths() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        let mut len = 1;
        while len <= 1024 {
            all_data.push(pseudo_random_bytes(len as u64 * 47, len));
            len *= 2;
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (_i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Power of 2 len {} mismatch", data.len());
        }
    }

    #[test]
    fn test_var_hash_primes_lengths() {
        let gpu = gpu_or_skip!();
        let primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251];
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for (i, &len) in primes.iter().enumerate() {
            all_data.push(pseudo_random_bytes(i as u64 * 53, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (_i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Prime len {} mismatch", data.len());
        }
    }

    #[test]
    fn test_var_hash_all_boundary_lengths() {
        let gpu = gpu_or_skip!();
        let boundaries = [55, 56, 63, 64, 65, 119, 120, 127, 128];
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for (i, &len) in boundaries.iter().enumerate() {
            all_data.push(pseudo_random_bytes(i as u64 * 59, len));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (_i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Boundary len {} mismatch", data.len());
        }
    }

    #[test]
    fn test_var_hash_repeated_calls_same_data() {
        let gpu = gpu_or_skip!();
        let mut all_data: Vec<Vec<u8>> = Vec::new();
        for i in 0..100 {
            all_data.push(pseudo_random_bytes(i as u64, 100));
        }
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let r1 = gpu.batch_hash_variable(&inputs);
        let r2 = gpu.batch_hash_variable(&inputs);
        let r3 = gpu.batch_hash_variable(&inputs);
        assert_eq!(r1, r2, "Repeated call 1 vs 2 mismatch");
        assert_eq!(r2, r3, "Repeated call 2 vs 3 mismatch");
    }

    #[test]
    fn test_var_hash_large_then_small() {
        let gpu = gpu_or_skip!();
        // Large batch first
        let large: Vec<Vec<u8>> = (0..10000).map(|i| pseudo_random_bytes(i, 100)).collect();
        let large_refs: Vec<&[u8]> = large.iter().map(|v| v.as_slice()).collect();
        let lr = gpu.batch_hash_variable(&large_refs);
        // Small batch second
        let small: Vec<Vec<u8>> = (0..10).map(|i| pseudo_random_bytes(i + 99999, 100)).collect();
        let small_refs: Vec<&[u8]> = small.iter().map(|v| v.as_slice()).collect();
        let sr = gpu.batch_hash_variable(&small_refs);
        // Verify small batch
        for (i, (data, result)) in small.iter().zip(sr.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Small after large mismatch at {}", i);
        }
        // Verify sample of large batch
        for i in [0, 100, 1000, 5000, 9999] {
            assert_eq!(lr[i], cpu_hash(&large[i]), "Large batch sample {} mismatch", i);
        }
    }

    #[test]
    fn test_var_hash_small_then_large() {
        let gpu = gpu_or_skip!();
        let small: Vec<Vec<u8>> = (0..10).map(|i| pseudo_random_bytes(i + 88888, 100)).collect();
        let small_refs: Vec<&[u8]> = small.iter().map(|v| v.as_slice()).collect();
        let sr = gpu.batch_hash_variable(&small_refs);
        let large: Vec<Vec<u8>> = (0..10000).map(|i| pseudo_random_bytes(i + 77777, 100)).collect();
        let large_refs: Vec<&[u8]> = large.iter().map(|v| v.as_slice()).collect();
        let lr = gpu.batch_hash_variable(&large_refs);
        for (i, (data, result)) in small.iter().zip(sr.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Small before large mismatch at {}", i);
        }
        for i in [0, 100, 5000, 9999] {
            assert_eq!(lr[i], cpu_hash(&large[i]), "Large after small mismatch at {}", i);
        }
    }

    // ========================================================================
    // Category 7: Determinism & Consistency (20 tests)
    // ========================================================================

    #[test]
    fn test_determinism_node_hash_10_runs() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(11111, 1000);
        let baseline = gpu.batch_node_hash(&jobs);
        for run in 1..10 {
            let result = gpu.batch_node_hash(&jobs);
            assert_eq!(result, baseline, "Determinism failed at run {}", run);
        }
    }

    #[test]
    fn test_determinism_var_hash_10_runs() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..500).map(|i| pseudo_random_bytes(i, 100)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let baseline = gpu.batch_hash_variable(&inputs);
        for run in 1..10 {
            let result = gpu.batch_hash_variable(&inputs);
            assert_eq!(result, baseline, "Var hash determinism failed at run {}", run);
        }
    }

    #[test]
    fn test_determinism_node_hash_different_batch_order() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(22222, 100);
        let r1 = gpu.batch_node_hash(&jobs);
        // Reverse the batch
        let rev_jobs: Vec<NodeHashJob> = jobs.iter().rev().cloned().collect();
        let r2 = gpu.batch_node_hash(&rev_jobs);
        // Each result should match its original position
        for (i, job) in jobs.iter().enumerate() {
            let expected = cpu_hash2(job.level, &job.left, &job.right);
            assert_eq!(r1[i], expected);
            assert_eq!(r2[99 - i], expected);
        }
    }

    #[test]
    fn test_determinism_var_hash_different_batch_order() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..50).map(|i| pseudo_random_bytes(i * 7, 100)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let r1 = gpu.batch_hash_variable(&inputs);
        // Reverse
        let rev_data: Vec<Vec<u8>> = all_data.iter().rev().cloned().collect();
        let rev_inputs: Vec<&[u8]> = rev_data.iter().map(|v| v.as_slice()).collect();
        let r2 = gpu.batch_hash_variable(&rev_inputs);
        for i in 0..50 {
            assert_eq!(r1[i], r2[49 - i], "Reorder var hash mismatch at {}", i);
        }
    }

    #[test]
    fn test_node_hash_cpu_gpu_equivalence_10000() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(33333, 10000);
        let gpu_results = gpu.batch_node_hash(&jobs);
        for (i, job) in jobs.iter().enumerate() {
            let cpu_result = cpu_hash2(job.level, &job.left, &job.right);
            assert_eq!(gpu_results[i], cpu_result, "CPU/GPU equiv mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_cpu_gpu_equivalence_5000() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..5000).map(|i| pseudo_random_bytes(i as u64 * 11, 50 + (i % 251))).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let gpu_results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(gpu_results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "CPU/GPU var equiv mismatch at {} (len={})", i, data.len());
        }
    }

    #[test]
    fn test_node_hash_gpu_matches_sha2_crate() {
        let gpu = gpu_or_skip!();
        for i in 0..100 {
            let level = (i * 3) as u8;
            let left = pseudo_random_hash(i as u64 * 5);
            let right = pseudo_random_hash(i as u64 * 5 + 1);
            let mut sha2_input = Vec::with_capacity(65);
            sha2_input.push(level);
            sha2_input.extend_from_slice(&left);
            sha2_input.extend_from_slice(&right);
            let expected: [u8; 32] = Sha256::digest(&sha2_input).into();
            let result = gpu.batch_node_hash(&[make_job(level, left, right)]);
            assert_eq!(result[0], expected, "SHA2 crate vs GPU mismatch at {}", i);
        }
    }

    #[test]
    fn test_var_hash_gpu_matches_sha2_crate() {
        let gpu = gpu_or_skip!();
        for i in 0..100 {
            let data = pseudo_random_bytes(i as u64 * 17, 50 + (i % 200));
            let expected: [u8; 32] = Sha256::digest(&data).into();
            let result = gpu.batch_hash_variable(&[data.as_slice()]);
            assert_eq!(result[0], expected, "SHA2 crate vs GPU var mismatch at {} (len={})", i, data.len());
        }
    }

    #[test]
    fn test_cross_method_consistency() {
        let gpu = gpu_or_skip!();
        // batch_node_hash vs batch_hash_variable for same 65-byte input
        for i in 0..50 {
            let level = (i * 5) as u8;
            let left = pseudo_random_hash(i as u64 * 3);
            let right = pseudo_random_hash(i as u64 * 3 + 1);
            let node_result = gpu.batch_node_hash(&[make_job(level, left, right)]);
            let mut var_input = Vec::with_capacity(65);
            var_input.push(level);
            var_input.extend_from_slice(&left);
            var_input.extend_from_slice(&right);
            let var_result = gpu.batch_hash_variable(&[var_input.as_slice()]);
            assert_eq!(node_result[0], var_result[0], "Cross-method mismatch at {}", i);
        }
    }

    #[test]
    fn test_node_hash_into_matches_node_hash_10k() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(44444, 10000);
        let batch = gpu.batch_node_hash(&jobs);
        let mut into_buf = vec![ZERO_HASH32; 10000];
        gpu.batch_node_hash_into(&jobs, &mut into_buf);
        assert_eq!(batch, into_buf, "batch vs into 10K mismatch");
    }

    #[test]
    fn test_gpu_hasher_new_twice() {
        let gpu1 = gpu_or_skip!();
        let gpu2 = gpu_or_skip!();
        let jobs = make_random_jobs(55555, 100);
        let r1 = gpu1.batch_node_hash(&jobs);
        let r2 = gpu2.batch_node_hash(&jobs);
        assert_eq!(r1, r2, "Two GpuHashers should produce identical results");
    }

    #[test]
    fn test_node_hash_associativity_check() {
        let gpu = gpu_or_skip!();
        // hash(L,R) != hash(R,L) for many random pairs
        for i in 0..100 {
            let left = pseudo_random_hash(i * 2);
            let right = pseudo_random_hash(i * 2 + 1);
            if left == right { continue; }
            let r1 = gpu.batch_node_hash(&[make_job(5, left, right)]);
            let r2 = gpu.batch_node_hash(&[make_job(5, right, left)]);
            assert_ne!(r1[0], r2[0], "hash(L,R) should != hash(R,L) at {}", i);
        }
    }

    #[test]
    fn test_node_hash_collision_resistance_1000() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(66666, 1000);
        let results = gpu.batch_node_hash(&jobs);
        let unique: HashSet<[u8; 32]> = results.iter().cloned().collect();
        assert_eq!(unique.len(), 1000, "1000 unique inputs should give 1000 unique hashes");
    }

    #[test]
    fn test_var_hash_collision_resistance_1000() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..1000).map(|i| pseudo_random_bytes(i as u64 * 31, 100)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        let unique: HashSet<[u8; 32]> = results.iter().cloned().collect();
        assert_eq!(unique.len(), 1000, "1000 unique var inputs should give 1000 unique hashes");
    }

    #[test]
    fn test_node_hash_avalanche_single_bit() {
        let gpu = gpu_or_skip!();
        let left = pseudo_random_hash(77777);
        let right = pseudo_random_hash(88888);
        let base = gpu.batch_node_hash(&[make_job(5, left, right)]);
        for byte_idx in 0..32 {
            for bit in 0..8 {
                let mut modified_left = left;
                modified_left[byte_idx] ^= 1 << bit;
                let modified = gpu.batch_node_hash(&[make_job(5, modified_left, right)]);
                // Count differing bits
                let mut diff_bits = 0;
                for i in 0..32 {
                    diff_bits += (base[0][i] ^ modified[0][i]).count_ones();
                }
                // Avalanche: expect ~128 bits (50%) to differ, at least 25% = 64 bits
                assert!(diff_bits >= 40, "Avalanche too low: {} bits differ at byte {} bit {}", diff_bits, byte_idx, bit);
            }
        }
    }

    #[test]
    fn test_var_hash_avalanche_single_bit() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(99999, 100);
        let base = gpu.batch_hash_variable(&[data.as_slice()]);
        for byte_idx in 0..data.len().min(32) {
            let mut modified = data.clone();
            modified[byte_idx] ^= 1;
            let result = gpu.batch_hash_variable(&[modified.as_slice()]);
            let mut diff_bits = 0u32;
            for i in 0..32 {
                diff_bits += (base[0][i] ^ result[0][i]).count_ones();
            }
            assert!(diff_bits >= 40, "Var avalanche too low: {} bits at byte {}", diff_bits, byte_idx);
        }
    }

    #[test]
    fn test_node_hash_warmup_then_verify() {
        let gpu = gpu_or_skip!();
        // Warmup
        let warmup = make_random_jobs(10000, 1000);
        let _ = gpu.batch_node_hash(&warmup);
        // Now verify fresh batch
        let fresh = make_random_jobs(20000, 1000);
        verify_batch(&gpu, &fresh);
    }

    #[test]
    fn test_var_hash_warmup_then_verify() {
        let gpu = gpu_or_skip!();
        // Warmup
        let warmup_data: Vec<Vec<u8>> = (0..1000).map(|i| pseudo_random_bytes(i + 30000, 100)).collect();
        let warmup_refs: Vec<&[u8]> = warmup_data.iter().map(|v| v.as_slice()).collect();
        let _ = gpu.batch_hash_variable(&warmup_refs);
        // Verify fresh
        let fresh_data: Vec<Vec<u8>> = (0..1000).map(|i| pseudo_random_bytes(i + 40000, 100)).collect();
        let fresh_refs: Vec<&[u8]> = fresh_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&fresh_refs);
        for (i, (data, result)) in fresh_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Post-warmup var mismatch at {}", i);
        }
    }


    // ========================================================================
    // Category 8: sync_mtrees_gpu — Single Twig Tests (40 tests)
    // ========================================================================

    fn make_mtree_with_leaves(seed: u64, n_leaves: usize) -> Box<[Hash32]> {
        let mut mt = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 0..n_leaves {
            mt[2048 + i] = pseudo_random_hash(seed + i as u64);
        }
        mt
    }

    fn verify_sync_single(gpu: &GpuHasher, seed: u64, n_leaves: usize, start: i32, end: i32) {
        let mut mt_cpu = make_mtree_with_leaves(seed, n_leaves);
        let mut mt_gpu = make_mtree_with_leaves(seed, n_leaves);
        sync_mtree(&mut mt_cpu, start, end);
        sync_mtrees_gpu(gpu, &mut [(&mut mt_gpu, start, end)]);
        for i in 1..2048 {
            assert_eq!(mt_cpu[i], mt_gpu[i],
                "Sync single (seed={},n={},range={}..{}) node {} mismatch",
                seed, n_leaves, start, end, i);
        }
    }

    #[test]
    fn test_sync_mtree_single_full_range() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 1000, 2048, 0, 2047);
    }

    #[test]
    fn test_sync_mtree_single_range_0_0() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 1001, 2048, 0, 0);
    }

    #[test]
    fn test_sync_mtree_single_range_2047_2047() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 1002, 2048, 2047, 2047);
    }

    #[test]
    fn test_sync_mtree_single_range_1023_1024() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 1003, 2048, 1023, 1024);
    }

    macro_rules! test_sync_range {
        ($name:ident, $seed:expr, $start:expr, $end:expr) => {
            #[test]
            fn $name() {
                let gpu = gpu_or_skip!();
                verify_sync_single(&gpu, $seed, 2048, $start, $end);
            }
        };
    }

    test_sync_range!(test_sync_mtree_single_range_0_1, 1004, 0, 1);
    test_sync_range!(test_sync_mtree_single_range_0_3, 1005, 0, 3);
    test_sync_range!(test_sync_mtree_single_range_0_7, 1006, 0, 7);
    test_sync_range!(test_sync_mtree_single_range_0_15, 1007, 0, 15);
    test_sync_range!(test_sync_mtree_single_range_0_31, 1008, 0, 31);
    test_sync_range!(test_sync_mtree_single_range_0_63, 1009, 0, 63);
    test_sync_range!(test_sync_mtree_single_range_0_127, 1010, 0, 127);
    test_sync_range!(test_sync_mtree_single_range_0_255, 1011, 0, 255);
    test_sync_range!(test_sync_mtree_single_range_0_511, 1012, 0, 511);
    test_sync_range!(test_sync_mtree_single_range_0_1023, 1013, 0, 1023);
    test_sync_range!(test_sync_mtree_single_range_1024_2047, 1014, 1024, 2047);
    test_sync_range!(test_sync_mtree_single_range_1000_1100, 1015, 1000, 1100);
    test_sync_range!(test_sync_mtree_single_range_500_600, 1016, 500, 600);
    test_sync_range!(test_sync_mtree_single_range_2000_2047, 1017, 2000, 2047);
    test_sync_range!(test_sync_mtree_single_odd_range, 1018, 3, 17);
    test_sync_range!(test_sync_mtree_single_even_range, 1019, 4, 18);
    test_sync_range!(test_sync_mtree_single_odd_start_even_end, 1020, 3, 18);
    test_sync_range!(test_sync_mtree_single_even_start_odd_end, 1021, 4, 17);
    test_sync_range!(test_sync_mtree_single_range_1_1, 1022, 1, 1);
    test_sync_range!(test_sync_mtree_single_range_2_3, 1023, 2, 3);
    test_sync_range!(test_sync_mtree_single_range_100_199, 1024, 100, 199);
    test_sync_range!(test_sync_mtree_single_range_power_of_2, 1025, 0, 511);

    #[test]
    fn test_sync_mtree_single_all_zero_leaves() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i], "Zero leaves node {} mismatch", i); }
    }

    #[test]
    fn test_sync_mtree_single_all_ff_leaves() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 2048..4096 { mt_cpu[i] = fill_hash(0xFF); mt_gpu[i] = fill_hash(0xFF); }
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i], "FF leaves node {} mismatch", i); }
    }

    #[test]
    fn test_sync_mtree_single_sequential_leaves() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 0..2048 {
            let h = cpu_hash(&(i as u64).to_le_bytes());
            mt_cpu[2048 + i] = h; mt_gpu[2048 + i] = h;
        }
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i]); }
    }

    #[test]
    fn test_sync_mtree_single_random_leaves() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 7777, 2048, 0, 2047);
    }

    #[test]
    fn test_sync_mtree_single_null_mt_check() {
        let gpu = gpu_or_skip!();
        // Start from NULL_MT and verify GPU reproduces it
        use crate::def::ENTRY_BASE_LENGTH;
        use crate::entryfile::entry;
        let mut bz = [0u8; ENTRY_BASE_LENGTH + 8];
        let null_hash = entry::null_entry(&mut bz[..]).hash();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 2048..4096 { mt_gpu[i] = null_hash; }
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 {
            assert_eq!(mt_gpu[i], NULL_MT_FOR_TWIG[i], "Null MT node {} mismatch", i);
        }
    }

    #[test]
    fn test_sync_mtree_single_verify_root() {
        let gpu = gpu_or_skip!();
        let mut mt_gpu = make_mtree_with_leaves(8888, 2048);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        // Root should equal hash(level=10, mt[2], mt[3])
        let expected_root = cpu_hash2(10, &mt_gpu[2], &mt_gpu[3]);
        assert_eq!(mt_gpu[1], expected_root, "Root != hash(mt[2], mt[3])");
    }

    #[test]
    fn test_sync_mtree_single_verify_all_internals() {
        let gpu = gpu_or_skip!();
        let mut mt_gpu = make_mtree_with_leaves(9999, 2048);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        // Verify every internal node
        let mut level: u8 = 0;
        let mut base: usize = 2048;
        while base >= 2 {
            for i in (0..base).step_by(2) {
                let idx = base + i;
                if idx + 1 < 4096 {
                    let expected = cpu_hash2(level, &mt_gpu[idx], &mt_gpu[idx + 1]);
                    assert_eq!(mt_gpu[idx / 2], expected, "Internal node {} at level {} mismatch", idx/2, level);
                }
            }
            level += 1;
            base /= 2;
        }
    }

    #[test]
    fn test_sync_mtree_single_modify_one_leaf() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let mut mt_gpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let new_hash = pseudo_random_hash(55555);
        mt_cpu[2048 + 500] = new_hash;
        mt_gpu[2048 + 500] = new_hash;
        sync_mtree(&mut mt_cpu, 500, 500);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 500, 500)]);
        assert_eq!(mt_cpu[1], mt_gpu[1], "Modify one leaf root mismatch");
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i]); }
    }

    #[test]
    fn test_sync_mtree_single_modify_two_leaves() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let mut mt_gpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        mt_cpu[2048] = pseudo_random_hash(1); mt_gpu[2048] = pseudo_random_hash(1);
        mt_cpu[2048 + 2047] = pseudo_random_hash(2); mt_gpu[2048 + 2047] = pseudo_random_hash(2);
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i], "Modify 2 leaves node {} mismatch", i); }
    }

    #[test]
    fn test_sync_mtree_single_modify_random_10() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let mut mt_gpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let positions = [42, 100, 255, 500, 777, 1000, 1234, 1500, 1800, 2000];
        let mut min_pos = 2047i32;
        let mut max_pos = 0i32;
        for (j, &p) in positions.iter().enumerate() {
            let h = pseudo_random_hash(j as u64 * 1000);
            mt_cpu[2048 + p] = h; mt_gpu[2048 + p] = h;
            min_pos = min_pos.min(p as i32);
            max_pos = max_pos.max(p as i32);
        }
        sync_mtree(&mut mt_cpu, min_pos, max_pos);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, min_pos, max_pos)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i], "Modify 10 random node {} mismatch", i); }
    }

    #[test]
    fn test_sync_mtree_single_sparse_updates() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        let mut mt_gpu = NULL_MT_FOR_TWIG.to_vec().into_boxed_slice();
        for i in (0..2048).step_by(100) {
            let h = pseudo_random_hash(i as u64);
            mt_cpu[2048 + i] = h; mt_gpu[2048 + i] = h;
        }
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i]); }
    }

    #[test]
    fn test_sync_mtree_single_dense_updates() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 12345, 2048, 0, 2047);
    }

    // ========================================================================
    // Category 9: sync_mtrees_gpu — Multi-Twig Tests (35 tests)
    // ========================================================================

    fn verify_sync_multi(gpu: &GpuHasher, n_twigs: usize, base_seed: u64, range_fn: impl Fn(usize) -> (i32, i32)) {
        let mut cpu_mts: Vec<Box<[Hash32]>> = Vec::new();
        let mut gpu_mts: Vec<Box<[Hash32]>> = Vec::new();
        let mut ranges = Vec::new();

        for t in 0..n_twigs {
            let mt_cpu = make_mtree_with_leaves(base_seed + t as u64 * 10000, 2048);
            let mt_gpu = mt_cpu.clone();
            let range = range_fn(t);
            ranges.push(range);
            cpu_mts.push(mt_cpu);
            gpu_mts.push(mt_gpu);
        }

        // CPU
        for (t, mt) in cpu_mts.iter_mut().enumerate() {
            sync_mtree(mt, ranges[t].0, ranges[t].1);
        }

        // GPU batch
        let mut slices: Vec<(&mut [Hash32], i32, i32)> = gpu_mts
            .iter_mut()
            .enumerate()
            .map(|(t, mt)| (mt.as_mut() as &mut [Hash32], ranges[t].0, ranges[t].1))
            .collect();
        sync_mtrees_gpu(gpu, &mut slices);

        for t in 0..n_twigs {
            for i in 1..2048 {
                assert_eq!(cpu_mts[t][i], gpu_mts[t][i],
                    "Multi-twig {} node {} mismatch (range {:?})", t, i, ranges[t]);
            }
        }
    }

    #[test]
    fn test_sync_mtrees_two_twigs_same_range() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 2, 2000, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_two_twigs_different_ranges() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 2, 2001, |t| if t == 0 { (0, 1023) } else { (1024, 2047) });
    }

    #[test]
    fn test_sync_mtrees_three_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 3, 2002, |t| match t { 0 => (0, 500), 1 => (500, 1500), _ => (1500, 2047) });
    }

    #[test]
    fn test_sync_mtrees_five_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 5, 2003, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_ten_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 2004, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_ten_twigs_random_ranges() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 2005, |t| {
            let start = (t * 200) % 2048;
            let end = std::cmp::min(start + 200, 2047);
            (start as i32, end as i32)
        });
    }

    #[test]
    fn test_sync_mtrees_twenty_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 20, 2006, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_fifty_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 50, 2007, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_hundred_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 100, 2008, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_empty_list() {
        let gpu = gpu_or_skip!();
        let mut slices: Vec<(&mut [Hash32], i32, i32)> = vec![];
        sync_mtrees_gpu(&gpu, &mut slices);
        // Should not panic
    }

    #[test]
    fn test_sync_mtrees_all_same_data() {
        let gpu = gpu_or_skip!();
        let seed = 3000u64;
        let template = make_mtree_with_leaves(seed, 2048);
        let mut cpu_mts: Vec<Box<[Hash32]>> = (0..10).map(|_| template.clone()).collect();
        let mut gpu_mts: Vec<Box<[Hash32]>> = (0..10).map(|_| template.clone()).collect();
        for mt in cpu_mts.iter_mut() { sync_mtree(mt, 0, 2047); }
        let mut slices: Vec<(&mut [Hash32], i32, i32)> = gpu_mts.iter_mut().map(|mt| (mt.as_mut() as &mut [Hash32], 0, 2047)).collect();
        sync_mtrees_gpu(&gpu, &mut slices);
        // All should have same root
        for t in 0..10 {
            assert_eq!(cpu_mts[0][1], gpu_mts[t][1], "All same data twig {} root mismatch", t);
        }
    }

    #[test]
    fn test_sync_mtrees_all_different_data() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 3001, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_mixed_range_sizes() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 4, 3002, |t| match t {
            0 => (500, 500),   // 1 leaf
            1 => (100, 109),   // 10 leaves
            2 => (200, 299),   // 100 leaves
            _ => (0, 2047),    // all leaves
        });
    }

    #[test]
    fn test_sync_mtrees_disjoint_small_ranges() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 20, 3003, |t| {
            let start = (t * 3) as i32;
            (start, start + 2)
        });
    }

    #[test]
    fn test_sync_mtrees_order_independence() {
        let gpu = gpu_or_skip!();
        // Sync in order A, B, C and C, B, A — each twig should get same result
        let mut mts_forward: Vec<Box<[Hash32]>> = (0..3).map(|t| make_mtree_with_leaves(4000 + t, 2048)).collect();
        let mut mts_reverse: Vec<Box<[Hash32]>> = mts_forward.iter().rev().cloned().collect();
        let mut slices_f: Vec<(&mut [Hash32], i32, i32)> = mts_forward.iter_mut().map(|mt| (mt.as_mut() as &mut [Hash32], 0, 2047)).collect();
        sync_mtrees_gpu(&gpu, &mut slices_f);
        let mut slices_r: Vec<(&mut [Hash32], i32, i32)> = mts_reverse.iter_mut().map(|mt| (mt.as_mut() as &mut [Hash32], 0, 2047)).collect();
        sync_mtrees_gpu(&gpu, &mut slices_r);
        for t in 0..3 {
            assert_eq!(mts_forward[t][1], mts_reverse[2-t][1], "Order independence twig {} failed", t);
        }
    }

    #[test]
    fn test_sync_mtrees_vs_cpu_two_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 2, 4001, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_vs_cpu_ten_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 4002, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_vs_cpu_fifty_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 50, 4003, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_null_mt_initialization() {
        let gpu = gpu_or_skip!();
        use crate::def::ENTRY_BASE_LENGTH;
        use crate::entryfile::entry;
        let mut bz = [0u8; ENTRY_BASE_LENGTH + 8];
        let null_hash = entry::null_entry(&mut bz[..]).hash();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 2048..4096 { mt_gpu[i] = null_hash; }
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        assert_eq!(mt_gpu[1], NULL_MT_FOR_TWIG[1], "GPU null MT root mismatch");
    }

    #[test]
    fn test_sync_mtrees_with_check_mt() {
        let gpu = gpu_or_skip!();
        use crate::merkletree::check::check_mt;
        let mut mt = make_mtree_with_leaves(5000, 2048);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt, 0, 2047)]);
        check_mt(&mt); // Should not panic
    }

    #[test]
    fn test_sync_mtrees_large_batch_25_twigs_full() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 25, 5001, |_| (0, 2047));
    }

    #[test]
    fn test_sync_mtrees_single_leaf_per_twig() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 50, 5002, |t| {
            let pos = (t * 41) % 2048;
            (pos as i32, pos as i32)
        });
    }

    #[test]
    fn test_sync_mtrees_first_half_only() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 5003, |_| (0, 1023));
    }

    #[test]
    fn test_sync_mtrees_second_half_only() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 5004, |_| (1024, 2047));
    }

    #[test]
    fn test_sync_mtrees_edges_only() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 10, 5005, |t| if t % 2 == 0 { (0, 0) } else { (2047, 2047) });
    }

    #[test]
    fn test_sync_mtrees_all_range_0_0() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 100, 5006, |_| (0, 0));
    }

    #[test]
    fn test_sync_mtrees_realistic_block() {
        let gpu = gpu_or_skip!();
        // Simulate a block where each twig gets ~50-200 modified leaves
        verify_sync_multi(&gpu, 50, 5007, |t| {
            let start = (t * 37) % 2048;
            let count = 50 + (t * 3) % 151;
            let end = std::cmp::min(start + count, 2047);
            (start as i32, end as i32)
        });
    }

    #[test]
    fn test_sync_mtrees_gpu_then_modify_then_resync() {
        let gpu = gpu_or_skip!();
        let mut mt_cpu = make_mtree_with_leaves(6000, 2048);
        let mut mt_gpu = mt_cpu.clone();
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        assert_eq!(mt_cpu[1], mt_gpu[1]);
        // Modify and resync
        let new_hash = pseudo_random_hash(99999);
        mt_cpu[2048 + 100] = new_hash;
        mt_gpu[2048 + 100] = new_hash;
        sync_mtree(&mut mt_cpu, 100, 100);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 100, 100)]);
        assert_eq!(mt_cpu[1], mt_gpu[1], "Post-modify root mismatch");
    }

    #[test]
    fn test_sync_mtrees_vs_cpu_random_100() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 100, 6001, |t| {
            let mut s = (t as u64).wrapping_mul(31337);
            s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
            let start = (s % 2048) as i32;
            s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
            let range = (s % 500) as i32 + 1;
            let end = std::cmp::min(start + range, 2047);
            (start, end)
        });
    }

    #[test]
    fn test_sync_mtrees_deterministic_across_calls() {
        let gpu = gpu_or_skip!();
        let mt_orig = make_mtree_with_leaves(7000, 2048);
        let mut mt1 = mt_orig.clone();
        let mut mt2 = mt_orig.clone();
        sync_mtrees_gpu(&gpu, &mut [(&mut mt1, 0, 2047)]);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt2, 0, 2047)]);
        assert_eq!(mt1[1], mt2[1], "Deterministic sync failed");
        for i in 1..2048 { assert_eq!(mt1[i], mt2[i]); }
    }


    // ========================================================================
    // Category 10: Twig Active Bits GPU Sync (30 tests)
    // ========================================================================

    fn make_active_bits_with_pattern(seed: u64) -> ActiveBits {
        let mut bits = ActiveBits::new();
        let bytes = pseudo_random_bytes(seed, 256);
        for i in 0..256 {
            // Set using internal access via set_bit for each bit
            for b in 0..8u32 {
                if bytes[i] & (1 << b) != 0 {
                    let offset = (i as u32) * 8 + b;
                    if offset < 2048 {
                        bits.set_bit(offset);
                    }
                }
            }
        }
        bits
    }

    fn verify_sync_l1_gpu(gpu: &GpuHasher, active_bits: &ActiveBits, pos: usize) {
        // CPU
        let mut twig_cpu = Twig::new();
        twig_cpu.sync_l1(pos as i32, active_bits);

        // GPU
        let left_page = pos * 2;
        let right_page = pos * 2 + 1;
        let mut left = [0u8; 32];
        let mut right = [0u8; 32];
        left.copy_from_slice(active_bits.get_bits(left_page, 32));
        right.copy_from_slice(active_bits.get_bits(right_page, 32));
        let result = gpu.batch_node_hash(&[NodeHashJob { level: 8, left, right }]);

        assert_eq!(result[0], twig_cpu.active_bits_mtl1[pos],
            "sync_l1 pos={} mismatch: GPU={} CPU={}",
            pos, hex::encode(result[0]), hex::encode(twig_cpu.active_bits_mtl1[pos]));
    }

    #[test]
    fn test_twig_sync_l1_gpu_basic() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(100);
        for pos in 0..4 {
            verify_sync_l1_gpu(&gpu, &active_bits, pos);
        }
    }

    #[test]
    fn test_twig_sync_l1_gpu_pos0() {
        let gpu = gpu_or_skip!();
        verify_sync_l1_gpu(&gpu, &make_active_bits_with_pattern(101), 0);
    }

    #[test]
    fn test_twig_sync_l1_gpu_pos1() {
        let gpu = gpu_or_skip!();
        verify_sync_l1_gpu(&gpu, &make_active_bits_with_pattern(102), 1);
    }

    #[test]
    fn test_twig_sync_l1_gpu_pos2() {
        let gpu = gpu_or_skip!();
        verify_sync_l1_gpu(&gpu, &make_active_bits_with_pattern(103), 2);
    }

    #[test]
    fn test_twig_sync_l1_gpu_pos3() {
        let gpu = gpu_or_skip!();
        verify_sync_l1_gpu(&gpu, &make_active_bits_with_pattern(104), 3);
    }

    #[test]
    fn test_twig_sync_l1_gpu_all_zeros() {
        let gpu = gpu_or_skip!();
        let active_bits = ActiveBits::new();
        for pos in 0..4 { verify_sync_l1_gpu(&gpu, &active_bits, pos); }
    }

    #[test]
    fn test_twig_sync_l1_gpu_all_ones() {
        let gpu = gpu_or_skip!();
        let mut active_bits = ActiveBits::new();
        for i in 0..2048u32 { active_bits.set_bit(i); }
        for pos in 0..4 { verify_sync_l1_gpu(&gpu, &active_bits, pos); }
    }

    #[test]
    fn test_twig_sync_l1_gpu_sequential() {
        let gpu = gpu_or_skip!();
        let mut active_bits = ActiveBits::new();
        for i in 0..255u32 {
            // Set bits so internal bytes are sequential
            active_bits.set_bit(i * 8);
        }
        for pos in 0..4 { verify_sync_l1_gpu(&gpu, &active_bits, pos); }
    }

    #[test]
    fn test_twig_sync_l1_gpu_single_bit_set() {
        let gpu = gpu_or_skip!();
        let mut active_bits = ActiveBits::new();
        active_bits.set_bit(42);
        for pos in 0..4 { verify_sync_l1_gpu(&gpu, &active_bits, pos); }
    }

    #[test]
    fn test_twig_sync_l1_gpu_alternating() {
        let gpu = gpu_or_skip!();
        let mut active_bits = ActiveBits::new();
        for i in (0..2048u32).step_by(2) { active_bits.set_bit(i); }
        for pos in 0..4 { verify_sync_l1_gpu(&gpu, &active_bits, pos); }
    }

    #[test]
    fn test_twig_sync_l1_gpu_multi_twig() {
        let gpu = gpu_or_skip!();
        // Batch 10 twigs' sync_l1 operations (4 per twig = 40 jobs)
        let mut jobs = Vec::new();
        let mut expected = Vec::new();
        for t in 0..10 {
            let active_bits = make_active_bits_with_pattern(200 + t);
            for pos in 0..4 {
                let left_page = pos * 2;
                let right_page = pos * 2 + 1;
                let mut left = [0u8; 32];
                let mut right = [0u8; 32];
                left.copy_from_slice(active_bits.get_bits(left_page, 32));
                right.copy_from_slice(active_bits.get_bits(right_page, 32));
                jobs.push(NodeHashJob { level: 8, left, right });
                let mut twig = Twig::new();
                twig.sync_l1(pos as i32, &active_bits);
                expected.push(twig.active_bits_mtl1[pos]);
            }
        }
        let results = gpu.batch_node_hash(&jobs);
        for i in 0..40 {
            assert_eq!(results[i], expected[i], "Multi-twig sync_l1 job {} mismatch", i);
        }
    }

    #[test]
    fn test_twig_sync_l1_gpu_vs_cpu_random() {
        let gpu = gpu_or_skip!();
        for t in 0..50 {
            let active_bits = make_active_bits_with_pattern(300 + t);
            for pos in 0..4 {
                verify_sync_l1_gpu(&gpu, &active_bits, pos);
            }
        }
    }

    #[test]
    fn test_twig_sync_l2_gpu_pos0() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(400);
        let mut twig_cpu = Twig::new();
        for pos in 0..4 { twig_cpu.sync_l1(pos, &active_bits); }
        twig_cpu.sync_l2(0);
        let result = gpu.batch_node_hash(&[NodeHashJob {
            level: 9,
            left: twig_cpu.active_bits_mtl1[0],
            right: twig_cpu.active_bits_mtl1[1],
        }]);
        assert_eq!(result[0], twig_cpu.active_bits_mtl2[0], "sync_l2 pos0 mismatch");
    }

    #[test]
    fn test_twig_sync_l2_gpu_pos1() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(401);
        let mut twig_cpu = Twig::new();
        for pos in 0..4 { twig_cpu.sync_l1(pos, &active_bits); }
        twig_cpu.sync_l2(1);
        let result = gpu.batch_node_hash(&[NodeHashJob {
            level: 9,
            left: twig_cpu.active_bits_mtl1[2],
            right: twig_cpu.active_bits_mtl1[3],
        }]);
        assert_eq!(result[0], twig_cpu.active_bits_mtl2[1], "sync_l2 pos1 mismatch");
    }

    #[test]
    fn test_twig_sync_l2_gpu_both() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(402);
        let mut twig_cpu = Twig::new();
        for pos in 0..4 { twig_cpu.sync_l1(pos, &active_bits); }
        twig_cpu.sync_l2(0);
        twig_cpu.sync_l2(1);
        let jobs = vec![
            NodeHashJob { level: 9, left: twig_cpu.active_bits_mtl1[0], right: twig_cpu.active_bits_mtl1[1] },
            NodeHashJob { level: 9, left: twig_cpu.active_bits_mtl1[2], right: twig_cpu.active_bits_mtl1[3] },
        ];
        let results = gpu.batch_node_hash(&jobs);
        assert_eq!(results[0], twig_cpu.active_bits_mtl2[0]);
        assert_eq!(results[1], twig_cpu.active_bits_mtl2[1]);
    }

    #[test]
    fn test_twig_sync_l2_gpu_multi_twig() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        let mut expected = Vec::new();
        for t in 0..10 {
            let active_bits = make_active_bits_with_pattern(500 + t);
            let mut twig = Twig::new();
            for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1);
            jobs.push(NodeHashJob { level: 9, left: twig.active_bits_mtl1[0], right: twig.active_bits_mtl1[1] });
            jobs.push(NodeHashJob { level: 9, left: twig.active_bits_mtl1[2], right: twig.active_bits_mtl1[3] });
            expected.push(twig.active_bits_mtl2[0]);
            expected.push(twig.active_bits_mtl2[1]);
        }
        let results = gpu.batch_node_hash(&jobs);
        for i in 0..20 { assert_eq!(results[i], expected[i], "Multi-twig sync_l2 {} mismatch", i); }
    }

    #[test]
    fn test_twig_sync_l2_gpu_vs_cpu() {
        let gpu = gpu_or_skip!();
        for t in 0..20 {
            let active_bits = make_active_bits_with_pattern(600 + t);
            let mut twig = Twig::new();
            for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1);
            let r0 = gpu.batch_node_hash(&[NodeHashJob { level: 9, left: twig.active_bits_mtl1[0], right: twig.active_bits_mtl1[1] }]);
            let r1 = gpu.batch_node_hash(&[NodeHashJob { level: 9, left: twig.active_bits_mtl1[2], right: twig.active_bits_mtl1[3] }]);
            assert_eq!(r0[0], twig.active_bits_mtl2[0], "L2 pos0 twig {} mismatch", t);
            assert_eq!(r1[0], twig.active_bits_mtl2[1], "L2 pos1 twig {} mismatch", t);
        }
    }

    #[test]
    fn test_twig_sync_l3_gpu_basic() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(700);
        let mut twig = Twig::new();
        for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
        twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3();
        let result = gpu.batch_node_hash(&[NodeHashJob { level: 10, left: twig.active_bits_mtl2[0], right: twig.active_bits_mtl2[1] }]);
        assert_eq!(result[0], twig.active_bits_mtl3, "sync_l3 mismatch");
    }

    #[test]
    fn test_twig_sync_l3_gpu_multi_twig() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        let mut expected = Vec::new();
        for t in 0..10 {
            let active_bits = make_active_bits_with_pattern(800 + t);
            let mut twig = Twig::new();
            for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3();
            jobs.push(NodeHashJob { level: 10, left: twig.active_bits_mtl2[0], right: twig.active_bits_mtl2[1] });
            expected.push(twig.active_bits_mtl3);
        }
        let results = gpu.batch_node_hash(&jobs);
        for i in 0..10 { assert_eq!(results[i], expected[i], "Multi-twig sync_l3 {} mismatch", i); }
    }

    #[test]
    fn test_twig_sync_l3_gpu_vs_cpu() {
        let gpu = gpu_or_skip!();
        for t in 0..20 {
            let active_bits = make_active_bits_with_pattern(900 + t);
            let mut twig = Twig::new();
            for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3();
            let r = gpu.batch_node_hash(&[NodeHashJob { level: 10, left: twig.active_bits_mtl2[0], right: twig.active_bits_mtl2[1] }]);
            assert_eq!(r[0], twig.active_bits_mtl3, "L3 twig {} mismatch", t);
        }
    }

    #[test]
    fn test_twig_sync_top_gpu_basic() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(1000);
        let mut twig = Twig::new();
        twig.left_root = pseudo_random_hash(1001);
        for pos in 0..4 { twig.sync_l1(pos, &active_bits); }
        twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3(); twig.sync_top();
        let result = gpu.batch_node_hash(&[NodeHashJob { level: 11, left: twig.left_root, right: twig.active_bits_mtl3 }]);
        assert_eq!(result[0], twig.twig_root, "sync_top mismatch");
    }

    #[test]
    fn test_twig_sync_top_gpu_multi_twig() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        let mut expected = Vec::new();
        for t in 0..10u64 {
            let active_bits = make_active_bits_with_pattern(1100 + t);
            let mut twig = Twig::new();
            twig.left_root = pseudo_random_hash(t * 100);
            for pos in 0..4 { twig.sync_l1(pos as i32, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3(); twig.sync_top();
            jobs.push(NodeHashJob { level: 11, left: twig.left_root, right: twig.active_bits_mtl3 });
            expected.push(twig.twig_root);
        }
        let results = gpu.batch_node_hash(&jobs);
        for i in 0..10 { assert_eq!(results[i], expected[i], "Multi-twig sync_top {} mismatch", i); }
    }

    #[test]
    fn test_twig_sync_top_gpu_vs_cpu() {
        let gpu = gpu_or_skip!();
        for t in 0..20u64 {
            let active_bits = make_active_bits_with_pattern(1200 + t);
            let mut twig = Twig::new();
            twig.left_root = pseudo_random_hash(t * 200);
            for pos in 0..4 { twig.sync_l1(pos as i32, &active_bits); }
            twig.sync_l2(0); twig.sync_l2(1); twig.sync_l3(); twig.sync_top();
            let r = gpu.batch_node_hash(&[NodeHashJob { level: 11, left: twig.left_root, right: twig.active_bits_mtl3 }]);
            assert_eq!(r[0], twig.twig_root, "Top twig {} mismatch", t);
        }
    }

    #[test]
    fn test_twig_full_chain_l1_l2_l3_top_gpu() {
        let gpu = gpu_or_skip!();
        let active_bits = make_active_bits_with_pattern(1300);
        let left_root = pseudo_random_hash(1301);

        // GPU: chain all levels
        let mut l1_jobs = Vec::new();
        for pos in 0..4 {
            let lp = pos * 2;
            let rp = pos * 2 + 1;
            let mut left = [0u8; 32];
            let mut right = [0u8; 32];
            left.copy_from_slice(active_bits.get_bits(lp, 32));
            right.copy_from_slice(active_bits.get_bits(rp, 32));
            l1_jobs.push(NodeHashJob { level: 8, left, right });
        }
        let l1 = gpu.batch_node_hash(&l1_jobs);
        let l2 = gpu.batch_node_hash(&[
            NodeHashJob { level: 9, left: l1[0], right: l1[1] },
            NodeHashJob { level: 9, left: l1[2], right: l1[3] },
        ]);
        let l3 = gpu.batch_node_hash(&[NodeHashJob { level: 10, left: l2[0], right: l2[1] }]);
        let top = gpu.batch_node_hash(&[NodeHashJob { level: 11, left: left_root, right: l3[0] }]);

        // CPU chain
        let mut twig_cpu = Twig::new();
        twig_cpu.left_root = left_root;
        for pos in 0..4 { twig_cpu.sync_l1(pos, &active_bits); }
        twig_cpu.sync_l2(0); twig_cpu.sync_l2(1); twig_cpu.sync_l3(); twig_cpu.sync_top();

        assert_eq!(top[0], twig_cpu.twig_root, "Full chain twig_root mismatch");
    }

    #[test]
    fn test_twig_full_chain_gpu_vs_cpu() {
        let gpu = gpu_or_skip!();
        for t in 0..30u64 {
            let active_bits = make_active_bits_with_pattern(1400 + t);
            let left_root = pseudo_random_hash(t * 500);
            let mut twig_cpu = Twig::new();
            twig_cpu.left_root = left_root;
            for pos in 0..4 { twig_cpu.sync_l1(pos, &active_bits); }
            twig_cpu.sync_l2(0); twig_cpu.sync_l2(1); twig_cpu.sync_l3(); twig_cpu.sync_top();

            // GPU chain
            let mut l1_jobs = Vec::new();
            for pos in 0..4 {
                let mut l = [0u8; 32]; let mut r = [0u8; 32];
                l.copy_from_slice(active_bits.get_bits(pos*2, 32));
                r.copy_from_slice(active_bits.get_bits(pos*2+1, 32));
                l1_jobs.push(NodeHashJob { level: 8, left: l, right: r });
            }
            let l1 = gpu.batch_node_hash(&l1_jobs);
            let l2 = gpu.batch_node_hash(&[
                NodeHashJob { level: 9, left: l1[0], right: l1[1] },
                NodeHashJob { level: 9, left: l1[2], right: l1[3] },
            ]);
            let l3 = gpu.batch_node_hash(&[NodeHashJob { level: 10, left: l2[0], right: l2[1] }]);
            let top = gpu.batch_node_hash(&[NodeHashJob { level: 11, left: left_root, right: l3[0] }]);
            assert_eq!(top[0], twig_cpu.twig_root, "Full chain twig {} mismatch", t);
        }
    }

    #[test]
    fn test_twig_full_chain_null_twig_reproduction() {
        let gpu = gpu_or_skip!();
        let null_ab = ActiveBits::new();
        let mut l1_jobs = Vec::new();
        for pos in 0..4 {
            let mut l = [0u8; 32]; let mut r = [0u8; 32];
            l.copy_from_slice(null_ab.get_bits(pos*2, 32));
            r.copy_from_slice(null_ab.get_bits(pos*2+1, 32));
            l1_jobs.push(NodeHashJob { level: 8, left: l, right: r });
        }
        let l1 = gpu.batch_node_hash(&l1_jobs);
        let l2 = gpu.batch_node_hash(&[
            NodeHashJob { level: 9, left: l1[0], right: l1[1] },
            NodeHashJob { level: 9, left: l1[2], right: l1[3] },
        ]);
        let l3 = gpu.batch_node_hash(&[NodeHashJob { level: 10, left: l2[0], right: l2[1] }]);
        let top = gpu.batch_node_hash(&[NodeHashJob { level: 11, left: NULL_MT_FOR_TWIG[1], right: l3[0] }]);
        assert_eq!(top[0], NULL_TWIG.twig_root,
            "NULL_TWIG reproduction: GPU={} expected={}",
            hex::encode(top[0]), hex::encode(NULL_TWIG.twig_root));
    }

    #[test]
    fn test_twig_full_chain_known_hash_values() {
        let gpu = gpu_or_skip!();
        // Reproduce the known values from twig_tests::test_sync
        let mut active_bits = ActiveBits::new();
        for i in 0..255usize {
            // Set internal bytes to sequential values matching the test
            for b in 0..8u32 {
                if (i as u8) & (1 << b) != 0 {
                    let offset = i as u32 * 8 + b;
                    if offset < 2048 { active_bits.set_bit(offset); }
                }
            }
        }
        let mut l1_jobs = Vec::new();
        for pos in 0..4 {
            let mut l = [0u8; 32]; let mut r = [0u8; 32];
            l.copy_from_slice(active_bits.get_bits(pos*2, 32));
            r.copy_from_slice(active_bits.get_bits(pos*2+1, 32));
            l1_jobs.push(NodeHashJob { level: 8, left: l, right: r });
        }
        let l1 = gpu.batch_node_hash(&l1_jobs);
        // Compare against known hex values from twig_tests
        assert_eq!(hex::encode(l1[0]), "ebdc6bccc0d70075f48ab3c602652a1787d41c05f5a0a851ffe479df0975e683");
        assert_eq!(hex::encode(l1[1]), "3eac125482e6c5682c92af7dd633d9e99d027cf3f53237b46e2507ca2c9cd599");
    }


    // ========================================================================
    // Categories 11-12: UpperTree & Tree GPU method tests (via batch hashing)
    // ========================================================================
    // Note: UpperTree/Tree GPU methods (sync_nodes_by_level_gpu, etc.) require
    // full tree setup with files. We test the GPU hashing primitives they use.

    #[test]
    fn test_upper_tree_level_13_hashing() {
        let gpu = gpu_or_skip!();
        // Simulate sync_nodes_by_level_gpu at FIRST_LEVEL_ABOVE_TWIG (13)
        // where child hashes are twig roots
        let mut twig_roots = Vec::new();
        for i in 0..100 {
            twig_roots.push(pseudo_random_hash(i as u64 * 7));
        }
        // Build node hash jobs: pair adjacent twig roots
        let mut jobs = Vec::new();
        let mut i = 0;
        while i < twig_roots.len() {
            let left = twig_roots[i];
            let right = if i + 1 < twig_roots.len() { twig_roots[i + 1] } else { NULL_TWIG.twig_root };
            jobs.push(NodeHashJob { level: 12, left, right }); // TWIG_ROOT_LEVEL = 12
            i += 2;
        }
        let gpu_results = gpu.batch_node_hash(&jobs);
        for (j, job) in jobs.iter().enumerate() {
            let expected = cpu_hash2(job.level, &job.left, &job.right);
            assert_eq!(gpu_results[j], expected, "Upper level 13 node {} mismatch", j);
        }
    }

    #[test]
    fn test_upper_tree_multi_level_hashing() {
        let gpu = gpu_or_skip!();
        // Simulate a multi-level upper tree sync
        let n_twigs = 64;
        let mut current: Vec<[u8; 32]> = (0..n_twigs).map(|i| pseudo_random_hash(i as u64 * 11)).collect();
        let mut level: u8 = 12;

        while current.len() > 1 {
            let mut jobs = Vec::new();
            let mut i = 0;
            while i < current.len() {
                let left = current[i];
                let right = if i + 1 < current.len() { current[i + 1] } else { NULL_TWIG.twig_root };
                jobs.push(NodeHashJob { level, left, right });
                i += 2;
            }
            let gpu_results = gpu.batch_node_hash(&jobs);
            // Verify
            for (j, job) in jobs.iter().enumerate() {
                assert_eq!(gpu_results[j], cpu_hash2(job.level, &job.left, &job.right),
                    "Multi-level at level {} node {} mismatch", level, j);
            }
            current = gpu_results;
            level += 1;
        }
        // Should end with a single root
        assert_eq!(current.len(), 1);
    }

    #[test]
    fn test_upper_tree_4_twigs_root() {
        let gpu = gpu_or_skip!();
        let roots: Vec<[u8; 32]> = (0..4).map(|i| pseudo_random_hash(i * 100)).collect();
        // Level 12: pair twig roots
        let l12_jobs = vec![
            NodeHashJob { level: 12, left: roots[0], right: roots[1] },
            NodeHashJob { level: 12, left: roots[2], right: roots[3] },
        ];
        let l12 = gpu.batch_node_hash(&l12_jobs);
        // Level 13: pair the results
        let l13_jobs = vec![NodeHashJob { level: 13, left: l12[0], right: l12[1] }];
        let l13 = gpu.batch_node_hash(&l13_jobs);
        // Verify against CPU
        let cpu_l12_0 = cpu_hash2(12, &roots[0], &roots[1]);
        let cpu_l12_1 = cpu_hash2(12, &roots[2], &roots[3]);
        let cpu_root = cpu_hash2(13, &cpu_l12_0, &cpu_l12_1);
        assert_eq!(l13[0], cpu_root, "4-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_8_twigs_root() {
        let gpu = gpu_or_skip!();
        let roots: Vec<[u8; 32]> = (0..8).map(|i| pseudo_random_hash(i * 200)).collect();
        let mut current = roots;
        let mut level: u8 = 12;
        while current.len() > 1 {
            let mut jobs = Vec::new();
            let mut i = 0;
            while i < current.len() {
                let right = if i + 1 < current.len() { current[i + 1] } else { ZERO_HASH32 };
                jobs.push(NodeHashJob { level, left: current[i], right });
                i += 2;
            }
            current = gpu.batch_node_hash(&jobs);
            level += 1;
        }
        // Verify via CPU
        let roots2: Vec<[u8; 32]> = (0..8).map(|i| pseudo_random_hash(i * 200)).collect();
        let mut cpu_current = roots2;
        level = 12;
        while cpu_current.len() > 1 {
            let mut next = Vec::new();
            let mut i = 0;
            while i < cpu_current.len() {
                let right = if i + 1 < cpu_current.len() { cpu_current[i + 1] } else { ZERO_HASH32 };
                next.push(cpu_hash2(level, &cpu_current[i], &right));
                i += 2;
            }
            cpu_current = next;
            level += 1;
        }
        assert_eq!(current[0], cpu_current[0], "8-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_16_twigs_root() {
        let gpu = gpu_or_skip!();
        let n = 16;
        let roots: Vec<[u8; 32]> = (0..n).map(|i| pseudo_random_hash(i * 300)).collect();
        let mut gpu_cur = roots.clone();
        let mut cpu_cur = roots;
        let mut level: u8 = 12;
        while gpu_cur.len() > 1 {
            let mut jobs = Vec::new();
            let mut cpu_next = Vec::new();
            let mut i = 0;
            while i < gpu_cur.len() {
                let right = if i + 1 < gpu_cur.len() { gpu_cur[i + 1] } else { ZERO_HASH32 };
                jobs.push(NodeHashJob { level, left: gpu_cur[i], right });
                cpu_next.push(cpu_hash2(level, &cpu_cur[i], &if i + 1 < cpu_cur.len() { cpu_cur[i + 1] } else { ZERO_HASH32 }));
                i += 2;
            }
            gpu_cur = gpu.batch_node_hash(&jobs);
            cpu_cur = cpu_next;
            level += 1;
        }
        assert_eq!(gpu_cur[0], cpu_cur[0], "16-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_32_twigs_root() {
        let gpu = gpu_or_skip!();
        let n = 32;
        let roots: Vec<[u8; 32]> = (0..n).map(|i| pseudo_random_hash(i * 400)).collect();
        let mut gpu_cur = roots.clone();
        let mut cpu_cur = roots;
        let mut level: u8 = 12;
        while gpu_cur.len() > 1 {
            let mut jobs = Vec::new();
            let mut cpu_next = Vec::new();
            let mut i = 0;
            while i < gpu_cur.len() {
                let right_g = if i + 1 < gpu_cur.len() { gpu_cur[i + 1] } else { ZERO_HASH32 };
                let right_c = if i + 1 < cpu_cur.len() { cpu_cur[i + 1] } else { ZERO_HASH32 };
                jobs.push(NodeHashJob { level, left: gpu_cur[i], right: right_g });
                cpu_next.push(cpu_hash2(level, &cpu_cur[i], &right_c));
                i += 2;
            }
            gpu_cur = gpu.batch_node_hash(&jobs);
            cpu_cur = cpu_next;
            level += 1;
        }
        assert_eq!(gpu_cur[0], cpu_cur[0], "32-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_64_twigs_root() {
        let gpu = gpu_or_skip!();
        let n = 64;
        let roots: Vec<[u8; 32]> = (0..n).map(|i| pseudo_random_hash(i * 500)).collect();
        let mut gpu_cur = roots.clone();
        let mut cpu_cur = roots;
        let mut level: u8 = 12;
        while gpu_cur.len() > 1 {
            let mut jobs = Vec::new();
            let mut cpu_next = Vec::new();
            let mut i = 0;
            while i < gpu_cur.len() {
                let right_g = if i + 1 < gpu_cur.len() { gpu_cur[i + 1] } else { ZERO_HASH32 };
                let right_c = if i + 1 < cpu_cur.len() { cpu_cur[i + 1] } else { ZERO_HASH32 };
                jobs.push(NodeHashJob { level, left: gpu_cur[i], right: right_g });
                cpu_next.push(cpu_hash2(level, &cpu_cur[i], &right_c));
                i += 2;
            }
            gpu_cur = gpu.batch_node_hash(&jobs);
            cpu_cur = cpu_next;
            level += 1;
        }
        assert_eq!(gpu_cur[0], cpu_cur[0], "64-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_128_twigs_root() {
        let gpu = gpu_or_skip!();
        let n = 128;
        let roots: Vec<[u8; 32]> = (0..n).map(|i| pseudo_random_hash(i * 600)).collect();
        let mut gpu_cur = roots.clone();
        let mut cpu_cur = roots;
        let mut level: u8 = 12;
        while gpu_cur.len() > 1 {
            let mut jobs = Vec::new();
            let mut cpu_next = Vec::new();
            let mut i = 0;
            while i < gpu_cur.len() {
                let right_g = if i + 1 < gpu_cur.len() { gpu_cur[i + 1] } else { ZERO_HASH32 };
                let right_c = if i + 1 < cpu_cur.len() { cpu_cur[i + 1] } else { ZERO_HASH32 };
                jobs.push(NodeHashJob { level, left: gpu_cur[i], right: right_g });
                cpu_next.push(cpu_hash2(level, &cpu_cur[i], &right_c));
                i += 2;
            }
            gpu_cur = gpu.batch_node_hash(&jobs);
            cpu_cur = cpu_next;
            level += 1;
        }
        assert_eq!(gpu_cur[0], cpu_cur[0], "128-twig root mismatch");
    }

    #[test]
    fn test_upper_tree_null_higher_tree_reproduction() {
        let gpu = gpu_or_skip!();
        // Reproduce NULL_NODE_IN_HIGHER_TREE for levels 13..63
        let mut computed_nulls = [ZERO_HASH32; 64];
        computed_nulls[13] = gpu.batch_node_hash(&[NodeHashJob {
            level: 12, left: NULL_TWIG.twig_root, right: NULL_TWIG.twig_root
        }])[0];
        for i in 14..64 {
            computed_nulls[i] = gpu.batch_node_hash(&[NodeHashJob {
                level: (i - 1) as u8,
                left: computed_nulls[i - 1],
                right: computed_nulls[i - 1],
            }])[0];
        }
        for i in 13..64 {
            assert_eq!(computed_nulls[i], NULL_NODE_IN_HIGHER_TREE[i],
                "NULL_NODE_IN_HIGHER_TREE[{}] mismatch: GPU={} expected={}",
                i, hex::encode(computed_nulls[i]), hex::encode(NULL_NODE_IN_HIGHER_TREE[i]));
        }
    }

    // Full tree pipeline tests using build_test_tree
    #[test]
    fn test_tree_gpu_sync_youngest_twig_100_entries() {
        let gpu = gpu_or_skip!();
        let dir_cpu = "/tmp/qmdb_gpu_test_yt100_cpu";
        let dir_gpu = "/tmp/qmdb_gpu_test_yt100_gpu";
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
        let (mut t_cpu, _, _, _) = build_test_tree(dir_cpu, &vec![], 50, 50);
        let (mut t_gpu, _, _, _) = build_test_tree(dir_gpu, &vec![], 50, 50);
        t_cpu.sync_mt_for_youngest_twig(false);
        t_gpu.sync_mt_for_youngest_twig_gpu(&gpu);
        for i in 1..2048 {
            assert_eq!(t_cpu.mtree_for_youngest_twig[i], t_gpu.mtree_for_youngest_twig[i],
                "YT 100 entries node {} mismatch", i);
        }
        check_hash_consistency(&t_cpu);
        check_hash_consistency(&t_gpu);
        t_cpu.close(); t_gpu.close();
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
    }

    #[test]
    fn test_tree_gpu_sync_youngest_twig_1000_entries() {
        let gpu = gpu_or_skip!();
        let dir_cpu = "/tmp/qmdb_gpu_test_yt1000_cpu";
        let dir_gpu = "/tmp/qmdb_gpu_test_yt1000_gpu";
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
        let (mut t_cpu, _, _, _) = build_test_tree(dir_cpu, &vec![], 500, 500);
        let (mut t_gpu, _, _, _) = build_test_tree(dir_gpu, &vec![], 500, 500);
        t_cpu.sync_mt_for_youngest_twig(false);
        t_gpu.sync_mt_for_youngest_twig_gpu(&gpu);
        assert_eq!(t_cpu.mtree_for_youngest_twig[1], t_gpu.mtree_for_youngest_twig[1],
            "YT 1000 root mismatch");
        check_hash_consistency(&t_cpu);
        check_hash_consistency(&t_gpu);
        t_cpu.close(); t_gpu.close();
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
    }

    #[test]
    fn test_tree_gpu_sync_youngest_twig_2048_entries() {
        let gpu = gpu_or_skip!();
        let dir_cpu = "/tmp/qmdb_gpu_test_yt2048_cpu";
        let dir_gpu = "/tmp/qmdb_gpu_test_yt2048_gpu";
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
        let (mut t_cpu, _, _, _) = build_test_tree(dir_cpu, &vec![], 1024, 1024);
        let (mut t_gpu, _, _, _) = build_test_tree(dir_gpu, &vec![], 1024, 1024);
        t_cpu.sync_mt_for_youngest_twig(false);
        t_gpu.sync_mt_for_youngest_twig_gpu(&gpu);
        assert_eq!(t_cpu.mtree_for_youngest_twig[1], t_gpu.mtree_for_youngest_twig[1],
            "YT 2048 root mismatch");
        t_cpu.close(); t_gpu.close();
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
    }

    #[test]
    fn test_tree_gpu_with_deactivations() {
        let gpu = gpu_or_skip!();
        let dir_cpu = "/tmp/qmdb_gpu_test_deact_cpu";
        let dir_gpu = "/tmp/qmdb_gpu_test_deact_gpu";
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
        let deact = vec![3u64, 7, 15, 31, 63];
        let (mut t_cpu, _, _, _) = build_test_tree(dir_cpu, &deact, 50, 50);
        let (mut t_gpu, _, _, _) = build_test_tree(dir_gpu, &deact, 50, 50);
        t_cpu.sync_mt_for_youngest_twig(false);
        t_gpu.sync_mt_for_youngest_twig_gpu(&gpu);
        assert_eq!(t_cpu.mtree_for_youngest_twig[1], t_gpu.mtree_for_youngest_twig[1],
            "Deact root mismatch");
        check_hash_consistency(&t_cpu);
        check_hash_consistency(&t_gpu);
        t_cpu.close(); t_gpu.close();
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
    }

    #[test]
    fn test_tree_gpu_phase1_vs_cpu() {
        let gpu = gpu_or_skip!();
        let dir_cpu = "/tmp/qmdb_gpu_test_phase1_cpu";
        let dir_gpu = "/tmp/qmdb_gpu_test_phase1_gpu";
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
        let (mut t_cpu, _, _, _) = build_test_tree(dir_cpu, &vec![], 50, 50);
        let (mut t_gpu, _, _, _) = build_test_tree(dir_gpu, &vec![], 50, 50);
        t_cpu.sync_mt_for_youngest_twig(false);
        t_gpu.sync_mt_for_youngest_twig_gpu(&gpu);
        let cpu_nlist = t_cpu.sync_mt_for_active_bits_phase1();
        let gpu_nlist = t_gpu.sync_mt_for_active_bits_phase1_gpu(&gpu);
        let mut cs: Vec<u64> = cpu_nlist; cs.sort(); cs.dedup();
        let mut gs: Vec<u64> = gpu_nlist; gs.sort(); gs.dedup();
        assert_eq!(cs, gs, "Phase1 n_list mismatch");
        check_hash_consistency(&t_cpu);
        check_hash_consistency(&t_gpu);
        t_cpu.close(); t_gpu.close();
        let _ = std::fs::remove_dir_all(dir_cpu);
        let _ = std::fs::remove_dir_all(dir_gpu);
    }

    // ========================================================================
    // Category 14: Stress & Scale Tests (25 tests)
    // ========================================================================

    #[test]
    fn test_stress_node_hash_200k_batch() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(70000, 200_000);
        let results = gpu.batch_node_hash(&jobs);
        // Spot-check first, last, and middle
        for &idx in &[0, 99999, 199999] {
            assert_eq!(results[idx], cpu_hash2(jobs[idx].level, &jobs[idx].left, &jobs[idx].right),
                "200K batch spot check at {} failed", idx);
        }
    }

    #[test]
    fn test_stress_var_hash_100k_batch() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..100_000).map(|i| pseudo_random_bytes(i, 80)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for &idx in &[0, 49999, 99999] {
            assert_eq!(results[idx], cpu_hash(&all_data[idx]), "100K var batch spot check at {} failed", idx);
        }
    }

    #[test]
    fn test_stress_sync_mtrees_100_twigs_full() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 100, 80000, |_| (0, 2047));
    }

    #[test]
    fn test_stress_sync_mtrees_50_twigs_random() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 50, 80001, |t| {
            let s = (t * 137 + 42) % 2048;
            let e = std::cmp::min(s + 200, 2047);
            (s as i32, e as i32)
        });
    }

    #[test]
    fn test_stress_repeated_gpu_init() {
        for i in 0..10 {
            let gpu = match GpuHasher::new(1000) {
                Ok(g) => g,
                Err(e) => { eprintln!("Skipping at iteration {}: {}", i, e); return; }
            };
            let result = gpu.batch_node_hash(&[make_job(0, ZERO_HASH32, ZERO_HASH32)]);
            assert_eq!(result[0], cpu_hash2(0, &ZERO_HASH32, &ZERO_HASH32));
        }
    }

    #[test]
    fn test_stress_rapid_small_batches() {
        let gpu = gpu_or_skip!();
        let expected = cpu_hash2(5, &fill_hash(0x42), &fill_hash(0x84));
        for i in 0..1000 {
            let jobs = vec![make_job(5, fill_hash(0x42), fill_hash(0x84)); 10];
            let results = gpu.batch_node_hash(&jobs);
            for r in &results {
                assert_eq!(*r, expected, "Rapid small batch {} failed", i);
            }
        }
    }

    #[test]
    fn test_stress_alternating_batch_sizes() {
        let gpu = gpu_or_skip!();
        for i in 0..50 {
            let size = if i % 2 == 0 { 10 } else { 10000 };
            let jobs = make_random_jobs(90000 + i, size);
            verify_batch(&gpu, &jobs);
        }
    }

    #[test]
    fn test_stress_max_batch_then_single() {
        let gpu = gpu_or_skip!();
        let big = make_random_jobs(91000, 200_000);
        let _ = gpu.batch_node_hash(&big);
        let small = make_random_jobs(91001, 1);
        verify_batch(&gpu, &small);
    }

    #[test]
    fn test_stress_single_then_max_batch() {
        let gpu = gpu_or_skip!();
        let small = make_random_jobs(92000, 1);
        verify_batch(&gpu, &small);
        let big = make_random_jobs(92001, 200_000);
        let results = gpu.batch_node_hash(&big);
        assert_eq!(results[0], cpu_hash2(big[0].level, &big[0].left, &big[0].right));
    }

    #[test]
    fn test_stress_gradual_increase() {
        let gpu = gpu_or_skip!();
        for &size in &[1usize, 10, 100, 1000, 10000, 100000] {
            let jobs = make_random_jobs(93000 + size as u64, size);
            verify_batch(&gpu, &jobs);
        }
    }

    #[test]
    fn test_stress_gradual_decrease() {
        let gpu = gpu_or_skip!();
        for &size in &[100000usize, 10000, 1000, 100, 10, 1] {
            let jobs = make_random_jobs(94000 + size as u64, size);
            verify_batch(&gpu, &jobs);
        }
    }

    #[test]
    fn test_stress_mixed_node_and_var_hashing() {
        let gpu = gpu_or_skip!();
        for i in 0..100 {
            // Node hash
            let node_jobs = make_random_jobs(95000 + i * 2, 100);
            verify_batch(&gpu, &node_jobs);
            // Var hash
            let var_data: Vec<Vec<u8>> = (0..100).map(|j| pseudo_random_bytes((i * 100 + j) as u64, 100)).collect();
            let var_refs: Vec<&[u8]> = var_data.iter().map(|v| v.as_slice()).collect();
            let var_results = gpu.batch_hash_variable(&var_refs);
            for (j, (data, result)) in var_data.iter().zip(var_results.iter()).enumerate() {
                assert_eq!(*result, cpu_hash(data), "Mixed round {} var {} mismatch", i, j);
            }
        }
    }

    #[test]
    fn test_stress_var_hash_mixed_1_to_1000() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (1..=1000).map(|len| pseudo_random_bytes(len as u64 * 71, len)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "Mixed len {} mismatch", i + 1);
        }
    }

    #[test]
    fn test_stress_repeated_sync_100_times() {
        let gpu = gpu_or_skip!();
        let mt_orig = make_mtree_with_leaves(96000, 2048);
        for _ in 0..100 {
            let mut mt = mt_orig.clone();
            sync_mtrees_gpu(&gpu, &mut [(&mut mt, 0, 2047)]);
        }
        // Final verification
        let mut mt_cpu = mt_orig.clone();
        let mut mt_gpu = mt_orig;
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        assert_eq!(mt_cpu[1], mt_gpu[1]);
    }

    #[test]
    fn test_stress_many_small_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 200, 97000, |t| {
            let pos = (t * 37) % 2048;
            (pos as i32, pos as i32)
        });
    }

    #[test]
    fn test_stress_few_large_twigs() {
        let gpu = gpu_or_skip!();
        verify_sync_multi(&gpu, 5, 98000, |_| (0, 2047));
    }

    #[test]
    fn test_stress_node_hash_correctness_at_scale() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(99000, 100_000);
        let results = gpu.batch_node_hash(&jobs);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(results[i], cpu_hash2(job.level, &job.left, &job.right),
                "100K correctness mismatch at {}", i);
        }
    }

    #[test]
    fn test_stress_var_hash_correctness_at_scale() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..50_000).map(|i| pseudo_random_bytes(i + 100000, 80)).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        for (i, (data, result)) in all_data.iter().zip(results.iter()).enumerate() {
            assert_eq!(*result, cpu_hash(data), "50K var correctness mismatch at {}", i);
        }
    }

    // ========================================================================
    // Category 15: Edge Cases & Error Handling (33 tests)
    // ========================================================================

    #[test]
    fn test_edge_empty_node_hash_batch() {
        let gpu = gpu_or_skip!();
        let result = gpu.batch_node_hash(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn test_edge_empty_var_hash_batch() {
        let gpu = gpu_or_skip!();
        let result = gpu.batch_hash_variable(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn test_edge_single_job_level_0() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 0, &ZERO_HASH32, &ZERO_HASH32);
    }

    #[test]
    fn test_edge_single_var_input_1_byte() {
        let gpu = gpu_or_skip!();
        verify_var_hash(&gpu, &[0x42]);
    }

    #[test]
    fn test_edge_var_hash_all_zero_byte_inputs() {
        let gpu = gpu_or_skip!();
        let all_data: Vec<Vec<u8>> = (0..100).map(|_| vec![0x00]).collect();
        let inputs: Vec<&[u8]> = all_data.iter().map(|v| v.as_slice()).collect();
        let results = gpu.batch_hash_variable(&inputs);
        let expected = cpu_hash(&[0x00]);
        for r in &results { assert_eq!(*r, expected); }
    }

    #[test]
    fn test_edge_node_hash_max_values() {
        let gpu = gpu_or_skip!();
        verify_node_hash(&gpu, 255, &fill_hash(0xFF), &fill_hash(0xFF));
    }

    #[test]
    fn test_edge_var_hash_near_block_boundary_55() {
        let gpu = gpu_or_skip!();
        verify_var_hash(&gpu, &pseudo_random_bytes(55555, 55));
    }

    #[test]
    fn test_edge_var_hash_near_block_boundary_56() {
        let gpu = gpu_or_skip!();
        verify_var_hash(&gpu, &pseudo_random_bytes(55556, 56));
    }

    #[test]
    fn test_edge_var_hash_near_block_boundary_64() {
        let gpu = gpu_or_skip!();
        verify_var_hash(&gpu, &pseudo_random_bytes(55564, 64));
    }

    #[test]
    fn test_edge_sync_mtree_range_equals() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 60000, 2048, 100, 100);
    }

    #[test]
    fn test_edge_sync_mtree_range_zero() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 60001, 2048, 0, 0);
    }

    #[test]
    fn test_edge_sync_mtree_range_max() {
        let gpu = gpu_or_skip!();
        verify_sync_single(&gpu, 60002, 2048, 0, 2047);
    }

    #[test]
    fn test_edge_batch_size_equals_max() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(61000, 200_000);
        let results = gpu.batch_node_hash(&jobs);
        assert_eq!(results[0], cpu_hash2(jobs[0].level, &jobs[0].left, &jobs[0].right));
    }

    #[test]
    fn test_edge_node_hash_level_match_sha256() {
        let gpu = gpu_or_skip!();
        // Manual SHA256 vs GPU for various inputs
        for level in [0u8, 1, 8, 12, 127, 255] {
            let left = pseudo_random_hash(level as u64 * 100);
            let right = pseudo_random_hash(level as u64 * 100 + 1);
            let mut input = vec![level];
            input.extend_from_slice(&left);
            input.extend_from_slice(&right);
            let expected: [u8; 32] = Sha256::digest(&input).into();
            let result = gpu.batch_node_hash(&[make_job(level, left, right)]);
            assert_eq!(result[0], expected, "SHA256 manual match failed at level {}", level);
        }
    }

    #[test]
    fn test_edge_var_hash_65_bytes_matches_node() {
        let gpu = gpu_or_skip!();
        // 65-byte var hash of (level||left||right) should match node hash
        for i in 0..20 {
            let level = (i * 13) as u8;
            let left = pseudo_random_hash(i * 2);
            let right = pseudo_random_hash(i * 2 + 1);
            let mut var_input = vec![level];
            var_input.extend_from_slice(&left);
            var_input.extend_from_slice(&right);
            let node_result = gpu.batch_node_hash(&[make_job(level, left, right)]);
            let var_result = gpu.batch_hash_variable(&[var_input.as_slice()]);
            assert_eq!(node_result[0], var_result[0], "65-byte node vs var mismatch at {}", i);
        }
    }

    #[test]
    fn test_edge_sync_mtree_leaf_index_boundary() {
        let gpu = gpu_or_skip!();
        // Test with leaves at extreme indices
        let mut mt_cpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        let mut mt_gpu = vec![ZERO_HASH32; 4096].into_boxed_slice();
        mt_cpu[2048] = pseudo_random_hash(1); mt_gpu[2048] = pseudo_random_hash(1);
        mt_cpu[4095] = pseudo_random_hash(2); mt_gpu[4095] = pseudo_random_hash(2);
        sync_mtree(&mut mt_cpu, 0, 2047);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt_gpu, 0, 2047)]);
        for i in 1..2048 { assert_eq!(mt_cpu[i], mt_gpu[i], "Boundary leaf node {} mismatch", i); }
    }

    #[test]
    fn test_edge_null_mt_gpu_matches_lazy_static() {
        let gpu = gpu_or_skip!();
        use crate::def::ENTRY_BASE_LENGTH;
        use crate::entryfile::entry;
        let mut bz = [0u8; ENTRY_BASE_LENGTH + 8];
        let null_hash = entry::null_entry(&mut bz[..]).hash();
        let mut mt = vec![ZERO_HASH32; 4096].into_boxed_slice();
        for i in 2048..4096 { mt[i] = null_hash; }
        sync_mtrees_gpu(&gpu, &mut [(&mut mt, 0, 2047)]);
        assert_eq!(mt[1], NULL_MT_FOR_TWIG[1]);
    }

    #[test]
    fn test_edge_null_twig_gpu_matches_lazy_static() {
        let gpu = gpu_or_skip!();
        let ab = ActiveBits::new();
        let mut l1_jobs = Vec::new();
        for pos in 0..4 {
            let mut l = [0u8; 32]; let mut r = [0u8; 32];
            l.copy_from_slice(ab.get_bits(pos*2, 32));
            r.copy_from_slice(ab.get_bits(pos*2+1, 32));
            l1_jobs.push(NodeHashJob { level: 8, left: l, right: r });
        }
        let l1 = gpu.batch_node_hash(&l1_jobs);
        for i in 0..4 { assert_eq!(l1[i], NULL_TWIG.active_bits_mtl1[i]); }
    }

    #[test]
    fn test_edge_repeated_hash_same_data() {
        let gpu = gpu_or_skip!();
        let job = make_job(7, pseudo_random_hash(1), pseudo_random_hash(2));
        let first = gpu.batch_node_hash(&[job])[0];
        for _ in 0..100 {
            let result = gpu.batch_node_hash(&[job])[0];
            assert_eq!(result, first);
        }
    }

    #[test]
    fn test_edge_node_hash_commutative_check() {
        let gpu = gpu_or_skip!();
        for i in 0..50 {
            let left = pseudo_random_hash(i * 10);
            let right = pseudo_random_hash(i * 10 + 1);
            if left == right { continue; }
            let r_lr = gpu.batch_node_hash(&[make_job(5, left, right)])[0];
            let r_rl = gpu.batch_node_hash(&[make_job(5, right, left)])[0];
            assert_ne!(r_lr, r_rl, "Should not be commutative at {}", i);
        }
    }

    #[test]
    fn test_edge_var_hash_prefix_not_equal() {
        let gpu = gpu_or_skip!();
        let data1 = b"abc";
        let data2 = b"ab";
        let r1 = gpu.batch_hash_variable(&[data1.as_slice()]);
        let r2 = gpu.batch_hash_variable(&[data2.as_slice()]);
        assert_ne!(r1[0], r2[0], "Prefix should give different hash");
    }

    #[test]
    fn test_edge_var_hash_append_byte() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(12345, 100);
        let mut data_plus = data.clone();
        data_plus.push(0x00);
        let r1 = gpu.batch_hash_variable(&[data.as_slice()]);
        let r2 = gpu.batch_hash_variable(&[data_plus.as_slice()]);
        assert_ne!(r1[0], r2[0], "Appending byte should change hash");
    }

    #[test]
    fn test_edge_gpu_hasher_max_batch_1() {
        let gpu = gpu_or_skip!(1);
        let job = make_job(5, pseudo_random_hash(1), pseudo_random_hash(2));
        let result = gpu.batch_node_hash(&[job]);
        assert_eq!(result[0], cpu_hash2(5, &job.left, &job.right));
    }

    #[test]
    fn test_edge_gpu_hasher_max_batch_256() {
        let gpu = gpu_or_skip!(256);
        let jobs = make_random_jobs(62000, 256);
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_edge_gpu_hasher_max_batch_10() {
        let gpu = gpu_or_skip!(10);
        let jobs = make_random_jobs(63000, 10);
        verify_batch(&gpu, &jobs);
    }

    #[test]
    fn test_edge_node_hash_into_exact_size() {
        let gpu = gpu_or_skip!();
        let jobs = make_random_jobs(64000, 42);
        let mut out = vec![ZERO_HASH32; 42];
        gpu.batch_node_hash_into(&jobs, &mut out);
        for (i, job) in jobs.iter().enumerate() {
            assert_eq!(out[i], cpu_hash2(job.level, &job.left, &job.right));
        }
    }

    #[test]
    fn test_edge_var_hash_large_single_input() {
        let gpu = gpu_or_skip!();
        let data = pseudo_random_bytes(65000, 10240);
        verify_var_hash(&gpu, &data);
    }

    #[test]
    fn test_edge_sync_mtrees_single_twig_check_mt() {
        let gpu = gpu_or_skip!();
        use crate::merkletree::check::check_mt;
        let mut mt = make_mtree_with_leaves(66000, 2048);
        sync_mtrees_gpu(&gpu, &mut [(&mut mt, 0, 2047)]);
        check_mt(&mt);
    }

    use crate::merkletree::check::check_hash_consistency;
    use crate::merkletree::helpers::build_test_tree;

    // ========================================================================
    // SoA (Structure-of-Arrays) Layout Tests
    // ========================================================================

    #[test]
    fn test_soa_basic_matches_aos() {
        let gpu = gpu_or_skip!();
        let mut jobs = Vec::new();
        let mut levels = Vec::new();
        let mut lefts = Vec::new();
        let mut rights = Vec::new();

        for level in 0u8..12 {
            let left = [level.wrapping_mul(7).wrapping_add(0x11); 32];
            let right = [level.wrapping_mul(13).wrapping_add(0xAB); 32];
            jobs.push(NodeHashJob { level, left, right });
            levels.push(level);
            lefts.push(left);
            rights.push(right);
        }

        let aos_results = gpu.batch_node_hash(&jobs);
        let soa_results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);

        assert_eq!(aos_results.len(), soa_results.len());
        for (i, (aos, soa)) in aos_results.iter().zip(soa_results.iter()).enumerate() {
            assert_eq!(
                aos, soa,
                "SoA/AoS mismatch at {}: AoS={} SoA={}",
                i,
                hex::encode(aos),
                hex::encode(soa)
            );
        }
    }

    #[test]
    fn test_soa_matches_cpu() {
        let gpu = gpu_or_skip!();
        let n = 100;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);
        let mut expected = Vec::with_capacity(n);

        for i in 0..n {
            let level = (i % 64) as u8;
            let left = [(i as u8).wrapping_mul(3); 32];
            let right = [(i as u8).wrapping_mul(7); 32];
            levels.push(level);
            lefts.push(left);
            rights.push(right);
            expected.push(cpu_hash2(level, &left, &right));
        }

        let soa_results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);

        for (i, (gpu_hash, cpu_hash)) in soa_results.iter().zip(expected.iter()).enumerate() {
            assert_eq!(
                gpu_hash, cpu_hash,
                "SoA/CPU mismatch at {}: SoA={} CPU={}",
                i,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    #[test]
    fn test_soa_large_batch_10000() {
        let gpu = gpu_or_skip!(100_000);
        let n = 10000;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);
        let mut jobs = Vec::with_capacity(n);

        for i in 0..n {
            let level = (i % 12) as u8;
            let mut left = [0u8; 32];
            let mut right = [0u8; 32];
            for j in 0..32 {
                left[j] = ((i * 7 + j * 13) & 0xFF) as u8;
                right[j] = ((i * 11 + j * 17) & 0xFF) as u8;
            }
            levels.push(level);
            lefts.push(left);
            rights.push(right);
            jobs.push(NodeHashJob { level, left, right });
        }

        let aos_results = gpu.batch_node_hash(&jobs);
        let soa_results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);

        assert_eq!(aos_results.len(), soa_results.len());
        for (i, (aos, soa)) in aos_results.iter().zip(soa_results.iter()).enumerate() {
            assert_eq!(
                aos, soa,
                "SoA/AoS mismatch at large batch job {}: AoS={} SoA={}",
                i,
                hex::encode(aos),
                hex::encode(soa)
            );
        }
    }

    #[test]
    fn test_soa_empty_batch() {
        let gpu = gpu_or_skip!();
        let result = gpu.batch_node_hash_soa(&[], &[], &[]);
        assert!(result.is_empty());
    }

    #[test]
    fn test_soa_single_job() {
        let gpu = gpu_or_skip!();
        let level = 5u8;
        let left = [0xAA; 32];
        let right = [0xBB; 32];

        let expected = cpu_hash2(level, &left, &right);
        let result = gpu.batch_node_hash_soa(&[level], &[left], &[right]);

        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], expected,
            "Single SoA job mismatch: got={} expected={}",
            hex::encode(result[0]),
            hex::encode(expected)
        );
    }

    #[test]
    fn test_soa_all_zeros() {
        let gpu = gpu_or_skip!();
        let expected = cpu_hash2(0, &[0; 32], &[0; 32]);
        let result = gpu.batch_node_hash_soa(&[0], &[[0; 32]], &[[0; 32]]);
        assert_eq!(result[0], expected);
    }

    #[test]
    fn test_soa_all_ones() {
        let gpu = gpu_or_skip!();
        let expected = cpu_hash2(255, &[0xFF; 32], &[0xFF; 32]);
        let result = gpu.batch_node_hash_soa(&[255], &[[0xFF; 32]], &[[0xFF; 32]]);
        assert_eq!(result[0], expected);
    }

    #[test]
    fn test_soa_all_levels_0_to_63() {
        let gpu = gpu_or_skip!();
        let n = 64;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);

        for i in 0..n {
            levels.push(i as u8);
            lefts.push([(i as u8).wrapping_mul(3); 32]);
            rights.push([(i as u8).wrapping_mul(7); 32]);
        }

        let soa_results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        for i in 0..n {
            let expected = cpu_hash2(levels[i], &lefts[i], &rights[i]);
            assert_eq!(soa_results[i], expected, "Level {} mismatch", i);
        }
    }

    #[test]
    fn test_soa_into_matches_soa() {
        let gpu = gpu_or_skip!();
        let n = 50;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);

        for i in 0..n {
            levels.push((i % 12) as u8);
            lefts.push([(i as u8).wrapping_mul(5); 32]);
            rights.push([(i as u8).wrapping_mul(11); 32]);
        }

        let soa_results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        let mut into_results = vec![[0u8; 32]; n];
        gpu.batch_node_hash_soa_into(&levels, &lefts, &rights, &mut into_results);

        assert_eq!(soa_results, into_results);
    }

    #[test]
    fn test_soa_deterministic() {
        let gpu = gpu_or_skip!();
        let levels = vec![3u8; 100];
        let lefts = vec![[0x42; 32]; 100];
        let rights = vec![[0x84; 32]; 100];

        let r1 = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        let r2 = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        assert_eq!(r1, r2, "SoA results should be deterministic");
    }

    #[test]
    fn test_soa_level_affects_output() {
        let gpu = gpu_or_skip!();
        let left = [0x11; 32];
        let right = [0x22; 32];

        let r0 = gpu.batch_node_hash_soa(&[0], &[left], &[right]);
        let r1 = gpu.batch_node_hash_soa(&[1], &[left], &[right]);

        assert_ne!(
            r0[0], r1[0],
            "Different levels with same left/right should produce different hashes"
        );
    }

    #[test]
    fn test_soa_left_right_swap_differs() {
        let gpu = gpu_or_skip!();
        let a = [0xAA; 32];
        let b = [0xBB; 32];

        let r1 = gpu.batch_node_hash_soa(&[5], &[a], &[b]);
        let r2 = gpu.batch_node_hash_soa(&[5], &[b], &[a]);

        assert_ne!(
            r1[0], r2[0],
            "Swapping left/right should produce different hashes"
        );
    }

    #[test]
    fn test_soa_batch_256_boundary() {
        let gpu = gpu_or_skip!();
        // Exactly one CUDA block (256 threads)
        let n = 256;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);
        let mut jobs = Vec::with_capacity(n);

        for i in 0..n {
            let level = (i % 64) as u8;
            let left = [(i & 0xFF) as u8; 32];
            let right = [((i * 3) & 0xFF) as u8; 32];
            levels.push(level);
            lefts.push(left);
            rights.push(right);
            jobs.push(NodeHashJob { level, left, right });
        }

        let aos = gpu.batch_node_hash(&jobs);
        let soa = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        assert_eq!(aos, soa);
    }

    #[test]
    fn test_soa_batch_257_boundary() {
        let gpu = gpu_or_skip!();
        // One full block + 1 thread (tests grid boundary)
        let n = 257;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);
        let mut jobs = Vec::with_capacity(n);

        for i in 0..n {
            let level = (i % 64) as u8;
            let left = [(i & 0xFF) as u8; 32];
            let right = [((i * 3) & 0xFF) as u8; 32];
            levels.push(level);
            lefts.push(left);
            rights.push(right);
            jobs.push(NodeHashJob { level, left, right });
        }

        let aos = gpu.batch_node_hash(&jobs);
        let soa = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        assert_eq!(aos, soa);
    }

    #[test]
    fn test_soa_sequential_bytes() {
        let gpu = gpu_or_skip!();
        let n = 100;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);

        for i in 0..n {
            levels.push(i as u8);
            let mut left = [0u8; 32];
            let mut right = [0u8; 32];
            for j in 0..32 {
                left[j] = (i + j) as u8;
                right[j] = (i * 2 + j) as u8;
            }
            lefts.push(left);
            rights.push(right);
        }

        let soa = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        for i in 0..n {
            let expected = cpu_hash2(levels[i], &lefts[i], &rights[i]);
            assert_eq!(soa[i], expected, "Sequential bytes mismatch at {}", i);
        }
    }

    #[test]
    fn test_soa_collision_resistance() {
        let gpu = gpu_or_skip!();
        let n = 1000;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);

        for i in 0..n {
            levels.push((i % 256) as u8);
            let seed = i as u64;
            lefts.push(pseudo_random_hash(seed * 2));
            rights.push(pseudo_random_hash(seed * 2 + 1));
        }

        let results = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
        let unique: HashSet<[u8; 32]> = results.iter().copied().collect();
        assert_eq!(
            unique.len(),
            n,
            "Expected {} unique hashes, got {}",
            n,
            unique.len()
        );
    }

    #[test]
    fn test_soa_repeated_calls_same_data() {
        let gpu = gpu_or_skip!();
        let levels = vec![7u8; 50];
        let lefts = vec![[0x33; 32]; 50];
        let rights = vec![[0x66; 32]; 50];

        for _ in 0..10 {
            let result = gpu.batch_node_hash_soa(&levels, &lefts, &rights);
            let expected = cpu_hash2(7, &[0x33; 32], &[0x66; 32]);
            for hash in &result {
                assert_eq!(hash, &expected);
            }
        }
    }

    #[test]
    fn test_soa_multi_gpu_hasher() {
        let mgpu = match crate::gpu::MultiGpuHasher::new(10000) {
            Ok(g) => g,
            Err(e) => {
                eprintln!("Skipping multi-GPU test: {}", e);
                return;
            }
        };

        let levels = vec![0u8, 1, 2, 3, 4];
        let lefts: Vec<[u8; 32]> = (0..5).map(|i| [i as u8; 32]).collect();
        let rights: Vec<[u8; 32]> = (0..5).map(|i| [(i * 3) as u8; 32]).collect();

        let result = mgpu.batch_node_hash_soa(0, &levels, &lefts, &rights);
        assert_eq!(result.len(), 5);

        for i in 0..5 {
            let expected = cpu_hash2(levels[i], &lefts[i], &rights[i]);
            assert_eq!(result[i], expected);
        }
    }

    #[test]
    fn test_soa_stress_100k() {
        let gpu = gpu_or_skip!(200_000);
        let n = 100_000;
        let mut levels = Vec::with_capacity(n);
        let mut lefts = Vec::with_capacity(n);
        let mut rights = Vec::with_capacity(n);
        let mut jobs = Vec::with_capacity(n);

        for i in 0..n {
            let level = (i % 64) as u8;
            let left = pseudo_random_hash(i as u64);
            let right = pseudo_random_hash(i as u64 + n as u64);
            levels.push(level);
            lefts.push(left);
            rights.push(right);
            jobs.push(NodeHashJob { level, left, right });
        }

        let aos = gpu.batch_node_hash(&jobs);
        let soa = gpu.batch_node_hash_soa(&levels, &lefts, &rights);

        for i in 0..n {
            assert_eq!(
                aos[i], soa[i],
                "Stress 100k: mismatch at {}: AoS={} SoA={}",
                i,
                hex::encode(aos[i]),
                hex::encode(soa[i])
            );
        }
    }

    #[test]
    fn test_soa_avalanche_single_bit() {
        let gpu = gpu_or_skip!();
        let level = 3u8;
        let left = [0u8; 32];
        let right = [0u8; 32];

        let base = gpu.batch_node_hash_soa(&[level], &[left], &[right]);

        // Flip one bit in the left array
        let mut left_flipped = left;
        left_flipped[0] ^= 1;
        let flipped = gpu.batch_node_hash_soa(&[level], &[left_flipped], &[right]);

        assert_ne!(base[0], flipped[0], "Flipping one bit should change the hash");

        // Count differing bits (avalanche effect)
        let mut diff_bits = 0u32;
        for j in 0..32 {
            diff_bits += (base[0][j] ^ flipped[0][j]).count_ones();
        }
        // Expect roughly 128 bits to differ (50% of 256)
        assert!(
            diff_bits > 80 && diff_bits < 180,
            "Avalanche: expected ~128 diff bits, got {}",
            diff_bits
        );
    }
}