vyre-self-substrate 0.6.2

use super::state::{
    adaptive_four_russians_layout_hash, adaptive_traversal_layout_hash, AdaptiveTraversalPlanCache,
};
use super::*;
use crate::graph::csr_frontier_queue_scratch::STRIDED_FORWARD_MIN_ROW_DEGREE;
use crate::optimizer::dispatcher::{DispatchError, OptimizerDispatcher};
use crate::optimizer::dispatcher::{ResidentDispatchStep, ResidentReadRange};
use std::cell::{Cell, RefCell};
use vyre_foundation::ir::Program;

#[derive(Default)]
struct RecordingResidentDispatcher {
    next_handle: Cell<u64>,
    alloc_count: Cell<usize>,
    alloc_lengths: RefCell<Vec<usize>>,
    resident_uploads: RefCell<Vec<(u64, usize)>>,
    upload_handles: RefCell<Vec<Vec<u64>>>,
    step_handles: RefCell<Vec<Vec<Vec<u64>>>>,
    step_grids: RefCell<Vec<Vec<Option<[u32; 3]>>>>,
    freed: RefCell<Vec<u64>>,
}

impl RecordingResidentDispatcher {
    fn last_upload_handles(&self) -> Vec<u64> {
        self.upload_handles
            .borrow()
            .last()
            .cloned()
            .expect("Fix: test dispatcher expected at least one resident upload sequence")
    }

    fn last_step_handles(&self) -> Vec<Vec<u64>> {
        self.step_handles
            .borrow()
            .last()
            .cloned()
            .expect("Fix: test dispatcher expected at least one resident dispatch sequence")
    }

    fn last_step_grids(&self) -> Vec<Option<[u32; 3]>> {
        self.step_grids
            .borrow()
            .last()
            .cloned()
            .expect("Fix: test dispatcher expected at least one resident dispatch sequence")
    }

    fn resident_alloc_lengths(&self) -> Vec<usize> {
        self.alloc_lengths.borrow().clone()
    }

    fn resident_upload_lengths(&self) -> Vec<usize> {
        self.resident_uploads
            .borrow()
            .iter()
            .map(|(_, bytes)| *bytes)
            .collect()
    }

    fn assert_no_resident_work(&self) {
        assert_eq!(
            self.alloc_count.get(),
            0,
            "zero-frontier fast paths must not allocate resident scratch"
        );
        assert!(
            self.upload_handles.borrow().is_empty(),
            "zero-frontier fast paths must not upload resident inputs"
        );
        assert!(
            self.step_handles.borrow().is_empty(),
            "zero-frontier fast paths must not dispatch resident kernels"
        );
    }
}

impl OptimizerDispatcher for RecordingResidentDispatcher {
    fn dispatch(
        &self,
        _program: &Program,
        _inputs: &[Vec<u8>],
        _grid_override: Option<[u32; 3]>,
    ) -> Result<Vec<Vec<u8>>, DispatchError> {
        Err(DispatchError::Rejected(
            "Fix: recording dispatcher only supports resident sequence tests.".to_string(),
        ))
    }

    fn supports_persistent(&self) -> bool {
        true
    }

    fn alloc_resident(&self, byte_len: usize) -> Result<u64, DispatchError> {
        self.alloc_count.set(self.alloc_count.get() + 1);
        self.alloc_lengths.borrow_mut().push(byte_len);
        let handle = self.next_handle.get() + 1;
        self.next_handle.set(handle);
        Ok(handle)
    }

    fn free_resident(&self, handle: u64) -> Result<(), DispatchError> {
        self.freed.borrow_mut().push(handle);
        Ok(())
    }

    fn upload_resident(&self, handle: u64, bytes: &[u8]) -> Result<(), DispatchError> {
        self.resident_uploads
            .borrow_mut()
            .push((handle, bytes.len()));
        Ok(())
    }

    fn upload_resident_many_sequence_read_ranges_into(
        &self,
        uploads: &[(u64, &[u8])],
        steps: &[ResidentDispatchStep<'_>],
        read_ranges: &[ResidentReadRange],
        outputs: &mut Vec<Vec<u8>>,
    ) -> Result<(), DispatchError> {
        self.upload_handles
            .borrow_mut()
            .push(uploads.iter().map(|(handle, _)| *handle).collect());
        self.step_handles
            .borrow_mut()
            .push(steps.iter().map(|step| step.handle_ids.to_vec()).collect());
        self.step_grids
            .borrow_mut()
            .push(steps.iter().map(|step| step.grid_override).collect());
        outputs.clear();
        outputs.extend(read_ranges.iter().map(|range| vec![0u8; range.byte_len]));
        Ok(())
    }
}

#[test]
fn selector_uses_queue_for_tiny_sparse_frontier() {
    assert_eq!(
        select_adaptive_traversal_mode(1_000, 10_000, 1, 25),
        AdaptiveTraversalMode::SparseQueue
    );
}

#[test]
fn selector_uses_sparse_dense_at_dense_cutover() {
    assert_eq!(
        select_adaptive_traversal_mode(1_000, 10_000, 260, 25),
        AdaptiveTraversalMode::SparseDense
    );
}

#[test]
fn selector_exports_four_russians_dense_kernel_choice() {
    assert_eq!(
        select_dense_traversal_kernel(1_024, 900, 2),
        DenseTraversalKernel::FourRussiansByteTile
    );
}

#[test]
fn layout_hash_distinguishes_dense_rows() {
    let offsets = [0, 0];
    let targets = [];
    let masks = [];
    let a = adaptive_traversal_layout_hash(1, &offsets, &targets, &masks, &[1]);
    let b = adaptive_traversal_layout_hash(1, &offsets, &targets, &masks, &[2]);
    assert_ne!(a, b);
}

#[test]
fn four_russians_layout_hash_distinguishes_dense_rows() {
    let a = adaptive_four_russians_layout_hash(8, &[0b0000_0001, 0, 0, 0, 0, 0, 0, 0]);
    let b = adaptive_four_russians_layout_hash(8, &[0b0000_0010, 0, 0, 0, 0, 0, 0, 0]);
    assert_ne!(a, b);
}

#[test]
fn matches_primitive_directly_by_wiring_release_programs() {
    let upload_source = include_str!("upload.rs");
    let resident_source = include_str!("resident_steps.rs");
    let release_path = format!("{upload_source}\n{resident_source}");

    for primitive_call in [
        "primitive_adaptive_sparse_dense_step(",
        "primitive_adaptive_four_russians_dense_step(",
        "primitive_four_russians_dense_lut_from_adj_rows(",
        "primitive_frontier_queue_len_init(",
        "primitive_frontier_words_to_queue_clear_out(",
        "primitive_frontier_word_counts(",
        "primitive_frontier_word_block_offsets(",
        "primitive_frontier_word_block_offsets_queue(",
        "primitive_frontier_word_prefix_queue(",
        "primitive_csr_queue_forward_traverse(",
        "primitive_csr_queue_split_low_forward_traverse(",
    ] {
        assert!(
            release_path.contains(primitive_call),
            "adaptive traversal release path must call primitive output wiring {primitive_call}"
        );
    }
}

#[test]
fn release_resident_paths_do_not_call_cpu_or_local_saturating_helpers() {
    let upload_source = include_str!("upload.rs");
    let resident_source = include_str!("resident_steps.rs");
    let release_path = format!("{upload_source}\n{resident_source}");

    assert!(!release_path.contains("reference_adaptive_sparse_dense_step("));
    assert!(!release_path.contains("cpu_sparse_dense_step("));
    assert!(!release_path.contains("saturating_mul"));
    assert!(!release_path.contains(concat!("checked_mul", "(std::mem::size_of::<u32>())")));
    assert!(release_path.contains("u32_word_bytes("));
    assert!(!release_path.contains(".div_ceil(256)"));
    assert!(release_path.contains("plan_adaptive_resident_frontier_step"));
    assert!(release_path.contains("plan_adaptive_resident_sparse_queue_step"));
    assert!(release_path.contains("plan_adaptive_resident_auto_step"));
}

#[test]
fn sparse_dense_zero_frontier_returns_zero_without_resident_work_or_cache() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = vec![9, 9, 9];

    adaptive_traverse_resident_graph_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[0, 0],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: zero sparse/dense frontier should complete on host");

    assert_eq!(frontier_out, vec![0, 0]);
    assert!(scratch.handles.is_none());
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot::default()
    );
    dispatcher.assert_no_resident_work();
}

#[test]
fn four_russians_zero_frontier_returns_zero_without_resident_work_or_cache() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveFourRussiansDenseGraph {
        node_count: 33,
        words: 2,
        layout_hash: 7,
        lut_handle: 201,
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = vec![9, 9, 9];

    adaptive_traverse_resident_graph_four_russians_dense_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[0, 0],
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: zero Four-Russians frontier should complete on host");

    assert_eq!(frontier_out, vec![0, 0]);
    assert!(scratch.handles.is_none());
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot::default()
    );
    dispatcher.assert_no_resident_work();
}

#[test]
fn sparse_queue_zero_frontier_returns_zero_without_queue_allocation_or_cache() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = vec![9, 9, 9];

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[0, 0],
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: zero sparse-queue frontier should complete on host");

    assert_eq!(frontier_out, vec![0, 0]);
    assert!(scratch.handles.is_none());
    assert!(scratch.queue_handle.is_none());
    assert!(scratch.word_partials_handle.is_none());
    assert!(scratch.word_block_totals_handle.is_none());
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot::default()
    );
    dispatcher.assert_no_resident_work();
}

#[test]
fn sparse_queue_graph_upload_skips_dense_adjacency_rows() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 4u32;
    let edge_offsets = [0, 1, 1, 1, 1];
    let edge_targets = [2];
    let edge_kind_mask = [1];

    let graph = upload_resident_adaptive_sparse_queue_graph(
        &dispatcher,
        node_count,
        &edge_offsets,
        &edge_targets,
        &edge_kind_mask,
    )
    .expect("Fix: CSR-only adaptive sparse-queue graph upload should accept canonical CSR");

    assert_eq!(graph.node_count(), node_count);
    assert_eq!(graph.edge_count(), 1);
    assert_eq!(graph.words(), 1);
    assert_eq!(dispatcher.alloc_count.get(), 3);
    assert_eq!(
        dispatcher.resident_upload_lengths(),
        vec![
            edge_offsets.len() * std::mem::size_of::<u32>(),
            edge_targets.len() * std::mem::size_of::<u32>(),
            edge_kind_mask.len() * std::mem::size_of::<u32>(),
        ],
        "CSR-only sparse-queue upload must not allocate or upload dense adjacency rows"
    );
}

#[test]
fn adaptive_upload_records_exact_high_degree_source_count() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 5u32;
    let degrees = [
        STRIDED_FORWARD_MIN_ROW_DEGREE,
        STRIDED_FORWARD_MIN_ROW_DEGREE - 1,
        STRIDED_FORWARD_MIN_ROW_DEGREE + 11,
        0,
        3,
    ];
    let mut edge_offsets = Vec::with_capacity(degrees.len() + 1);
    let mut edge_targets = Vec::new();
    let mut edge_kind_mask = Vec::new();
    edge_offsets.push(0);
    for degree in degrees {
        edge_targets.extend((0..degree).map(|edge| edge % node_count));
        edge_kind_mask.extend(std::iter::repeat(1).take(degree as usize));
        edge_offsets.push(edge_targets.len() as u32);
    }
    let adj_rows_dense = vec![0u32; node_count as usize];

    let full_graph = upload_resident_adaptive_traversal_graph(
        &dispatcher,
        node_count,
        &edge_offsets,
        &edge_targets,
        &edge_kind_mask,
        &adj_rows_dense,
    )
    .expect("Fix: full adaptive resident upload should accept high-degree CSR");
    let sparse_graph = upload_resident_adaptive_sparse_queue_graph(
        &dispatcher,
        node_count,
        &edge_offsets,
        &edge_targets,
        &edge_kind_mask,
    )
    .expect("Fix: CSR-only adaptive resident upload should accept high-degree CSR");

    assert_eq!(
        full_graph.high_degree_source_count(),
        2,
        "full adaptive graph metadata must count high-degree rows exactly"
    );
    assert_eq!(
        sparse_graph.high_degree_source_count(),
        2,
        "CSR-only adaptive graph metadata must count high-degree rows exactly"
    );
}

#[test]
fn sparse_queue_step_accepts_csr_only_resident_graph() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 4u32;
    let edge_offsets = [0, 1, 1, 1, 1];
    let edge_targets = [2];
    let edge_kind_mask = [1];
    let graph = upload_resident_adaptive_sparse_queue_graph(
        &dispatcher,
        node_count,
        &edge_offsets,
        &edge_targets,
        &edge_kind_mask,
    )
    .expect("Fix: CSR-only adaptive sparse-queue graph upload should accept canonical CSR");
    let graph_handles = graph.handles();
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let frontier_in = [1u32];
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        1,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: CSR-only adaptive sparse-queue resident step should dispatch");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse queue step should allocate frontier scratch");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse queue step should allocate active queue");
    let steps = dispatcher.last_step_handles();
    assert_eq!(steps.len(), 3);
    assert_eq!(
        steps[2],
        vec![
            queue_handle,
            scratch_handles[2],
            graph_handles[0],
            graph_handles[1],
            graph_handles[2],
            scratch_handles[1],
        ],
        "CSR-only sparse queue traversal must bind only CSR graph handles"
    );
    assert_eq!(frontier_out, vec![0]);
}

#[test]
fn sparse_queue_step_sizes_active_queue_from_frontier_popcount() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 8_000u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_in = vec![0u32; words];
    frontier_in[0] = 1;
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete sparse queue traversal");

    assert_eq!(
        scratch.queue_bytes,
        std::mem::size_of::<u32>(),
        "single-source frontier must not allocate a graph-sized active queue"
    );
    assert_eq!(
        dispatcher.resident_alloc_lengths().last().copied(),
        Some(std::mem::size_of::<u32>()),
        "active queue allocation should be sized from frontier popcount"
    );
    assert_eq!(frontier_out, vec![0; words]);
}

#[test]
fn sparse_queue_step_reuses_larger_queue_scratch_for_smaller_frontier() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 4096u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 11,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut larger_frontier = vec![0u32; words];
    for node in 0..300u32 {
        larger_frontier[(node / 32) as usize] |= 1 << (node % 32);
    }
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &larger_frontier,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete larger sparse queue traversal");

    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse queue step must allocate active queue");
    assert_eq!(scratch.queue_bytes, 512 * std::mem::size_of::<u32>());
    let allocs_after_large = dispatcher.alloc_count.get();
    let mut single_frontier = vec![0u32; words];
    single_frontier[0] = 1;

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &single_frontier,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete smaller sparse queue traversal");

    assert_eq!(scratch.queue_handle, Some(queue_handle));
    assert_eq!(
        scratch.queue_bytes,
        512 * std::mem::size_of::<u32>(),
        "scratch should keep the larger reusable queue buffer instead of shrinking"
    );
    assert_eq!(
        dispatcher.alloc_count.get(),
        allocs_after_large,
        "smaller frontiers should reuse the existing resident queue allocation"
    );
    assert!(
        dispatcher.freed.borrow().is_empty(),
        "smaller frontiers should not free and reallocate resident queue scratch"
    );
}

#[test]
fn skewed_high_degree_sparse_queue_step_uses_bounded_split_queue() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 2048u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: crate::graph::csr_frontier_queue_scratch::STRIDED_FORWARD_MIN_ROW_DEGREE,
        max_row_degree: crate::graph::csr_frontier_queue_scratch::STRIDED_FORWARD_MIN_ROW_DEGREE,
        high_degree_source_count: 1,
        words,
        layout_hash: 13,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_in = vec![0u32; words];
    for node in 0..9u32 {
        frontier_in[(node / 32) as usize] |= 1 << (node % 32);
    }
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete high-degree sparse queue traversal");

    assert_eq!(scratch.queue_bytes, 16 * std::mem::size_of::<u32>());
    assert_eq!(
        scratch.high_queue_bytes,
        std::mem::size_of::<u32>(),
        "a graph with one possible hub must not launch a strided team for every active source"
    );
    let high_queue = scratch
        .high_queue_handle
        .expect("Fix: mixed split traversal should allocate a high-degree queue");
    let high_len = scratch
        .high_len_handle
        .expect("Fix: mixed split traversal should allocate a high-degree queue length");
    let scratch_handles = scratch
        .handles
        .expect("Fix: mixed split traversal should allocate frontier scratch");
    let active_queue = scratch
        .queue_handle
        .expect("Fix: mixed split traversal should allocate active queue scratch");
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        5,
        "skewed sparse queue traversal should materialize active sources, clear high_len, split low/high rows, then traverse only bounded high rows"
    );
    assert_eq!(
        steps[3],
        vec![
            active_queue,
            scratch_handles[2],
            graph.handles[0],
            graph.handles[1],
            graph.handles[2],
            scratch_handles[1],
            high_queue,
            high_len,
        ],
        "split-low pass must read the active queue and write only bounded high-row scratch"
    );
    assert_eq!(
        steps[4],
        vec![
            high_queue,
            high_len,
            graph.handles[0],
            graph.handles[1],
            graph.handles[2],
            scratch_handles[1],
        ],
        "strided follow-up must consume the bounded high-row queue, not the whole active queue"
    );
    assert_eq!(
        dispatcher.last_step_grids()[4],
        Some(vyre_primitives::graph::csr_queue_strided::csr_queue_strided_forward_dispatch_grid(1)),
        "skewed high-degree sparse queue traversal must launch row-strided teams only for the graph-wide high-row bound"
    );
}

#[test]
fn single_superhub_csr_only_sparse_queue_sizes_split_queue_from_high_row_count() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 2048u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let hub_degree = STRIDED_FORWARD_MIN_ROW_DEGREE * 9;
    let mut edge_offsets = vec![hub_degree; node_count as usize + 1];
    edge_offsets[0] = 0;
    let edge_targets = vec![1u32; hub_degree as usize];
    let edge_kind_mask = vec![1u32; hub_degree as usize];
    let graph = upload_resident_adaptive_sparse_queue_graph(
        &dispatcher,
        node_count,
        &edge_offsets,
        &edge_targets,
        &edge_kind_mask,
    )
    .expect("Fix: CSR-only adaptive sparse queue graph should accept a one-superhub graph");
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_in = vec![0u32; words];
    for node in 0..9u32 {
        frontier_in[(node / 32) as usize] |= 1 << (node % 32);
    }
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: CSR-only adaptive sparse queue step should complete one-superhub traversal");

    assert_eq!(graph.high_degree_source_count(), 1);
    assert_eq!(scratch.queue_bytes, 16 * std::mem::size_of::<u32>());
    assert_eq!(
        scratch.high_queue_bytes,
        std::mem::size_of::<u32>(),
        "one enormous row should allocate one adaptive high-row slot, not edge_count / threshold slots"
    );
    assert_eq!(
        dispatcher.last_step_grids()[4],
        Some(vyre_primitives::graph::csr_queue_strided::csr_queue_strided_forward_dispatch_grid(1)),
        "one-superhub adaptive traversal must launch only one row-strided team"
    );
}

#[test]
fn uniformly_high_degree_sparse_queue_step_keeps_global_strided_consumer() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 2048u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let queue_slots = 16u32;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: crate::graph::csr_frontier_queue_scratch::STRIDED_FORWARD_MIN_ROW_DEGREE
            * queue_slots,
        max_row_degree: crate::graph::csr_frontier_queue_scratch::STRIDED_FORWARD_MIN_ROW_DEGREE,
        high_degree_source_count: queue_slots,
        words,
        layout_hash: 17,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_in = vec![0u32; words];
    for node in 0..9u32 {
        frontier_in[(node / 32) as usize] |= 1 << (node % 32);
    }
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete uniformly high-degree traversal");

    assert!(scratch.high_queue_handle.is_none());
    assert!(scratch.high_len_handle.is_none());
    assert_eq!(
        scratch.queue_bytes,
        queue_slots as usize * std::mem::size_of::<u32>()
    );
    assert_eq!(dispatcher.last_step_handles().len(), 3);
    assert_eq!(
        dispatcher.last_step_grids()[2],
        Some(
            vyre_primitives::graph::csr_queue_strided::csr_queue_strided_forward_dispatch_grid(
                queue_slots
            )
        ),
        "uniformly high-degree sparse queue traversal should keep the single row-strided consumer"
    );
}

#[test]
fn auto_zero_frontier_returns_sparse_queue_without_resident_work_or_cache() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 128,
        max_row_degree: 8,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = vec![9, 9, 9];

    let mode = adaptive_traverse_resident_graph_auto_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[0, 0],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: zero auto frontier should complete on host");

    assert_eq!(mode, AdaptiveTraversalMode::SparseQueue);
    assert_eq!(frontier_out, vec![0, 0]);
    assert!(scratch.handles.is_none());
    assert!(scratch.queue_handle.is_none());
    assert!(scratch.word_partials_handle.is_none());
    assert!(scratch.word_block_totals_handle.is_none());
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot::default()
    );
    dispatcher.assert_no_resident_work();
}

#[test]
fn sparse_dense_resident_step_does_not_upload_popcount_zero_seed() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 1,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words: 1,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[1],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete resident sparse/dense sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse/dense resident step must allocate frontier/popcount handles");
    assert_eq!(
        dispatcher.last_upload_handles(),
        vec![scratch_handles[0]],
        "sparse/dense traversal must upload only frontier input; output and popcount are initialized on device"
    );
    assert_eq!(frontier_out, vec![0]);
}

#[test]
fn sparse_dense_resident_program_cache_reuses_same_shape_graphs() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph_a = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 8,
        max_row_degree: 2,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let graph_b = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 8,
        max_row_degree: 2,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 99,
        handles: [201, 202, 203, 204],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_step_with_scratch_into(
        &dispatcher,
        &graph_a,
        &[1, 0],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: first adaptive resident step should dispatch");
    adaptive_traverse_resident_graph_step_with_scratch_into(
        &dispatcher,
        &graph_b,
        &[1, 0],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: second adaptive resident step should dispatch");

    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot {
            entries: 3,
            hits: 3,
            misses: 3,
        },
        "adaptive resident programs must be cached by shape/options, not resident graph contents"
    );
}

#[test]
fn sparse_queue_resident_step_initializes_queue_len_on_device() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 1,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words: 1,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[1],
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete resident sparse-queue sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse-queue resident step must allocate frontier/queue-len handles");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse-queue resident step must allocate active queue");
    assert_eq!(
        dispatcher.last_upload_handles(),
        vec![scratch_handles[0]],
        "sparse-queue traversal must upload only frontier input"
    );
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        3,
        "sparse-queue traversal should initialize queue_len, compact packed frontier words while clearing output, then traverse"
    );
    assert_eq!(
        steps[0],
        vec![scratch_handles[2]],
        "first sparse-queue resident step must initialize queue_len on device"
    );
    assert_eq!(
        steps[1],
        vec![
            scratch_handles[0],
            queue_handle,
            scratch_handles[2],
            scratch_handles[1],
        ],
        "second sparse-queue resident step must compact packed words while clearing frontier_out"
    );
    assert_eq!(frontier_out, vec![0]);
}

#[test]
fn large_single_word_sparse_queue_resident_step_uses_atomic_materializer() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 8_193u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_in = vec![0u32; words];
    frontier_in[0] = 1;
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete large resident sparse-queue sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse-queue resident step must allocate frontier/queue-len handles");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse-queue resident step must allocate active queue");
    assert!(scratch.word_partials_handle.is_none());
    assert!(scratch.word_block_totals_handle.is_none());
    assert_eq!(dispatcher.last_upload_handles(), vec![scratch_handles[0]]);
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        3,
        "wide graph with one nonzero frontier word should keep the single-pass atomic word materializer"
    );
    assert_eq!(
        steps[0],
        vec![scratch_handles[2]],
        "first sparse-queue resident step must initialize queue_len on device"
    );
    assert_eq!(
        steps[1],
        vec![
            scratch_handles[0],
            queue_handle,
            scratch_handles[2],
            scratch_handles[1],
        ],
        "single-word sparse queue traversal must compact packed words while clearing frontier_out"
    );
    assert_eq!(frontier_out, vec![0; words]);
}

#[test]
fn large_dense_sparse_queue_resident_step_uses_word_prefix_materializer() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 8_193u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let frontier_in = vec![u32::MAX; words];
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete dense resident sparse-queue sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse-queue resident step must allocate frontier/queue-len handles");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse-queue resident step must allocate active queue");
    let word_partials = scratch
        .word_partials_handle
        .expect("Fix: dense sparse-queue step must allocate word partials");
    let block_totals = scratch
        .word_block_totals_handle
        .expect("Fix: dense sparse-queue step must allocate block totals");
    assert_eq!(dispatcher.last_upload_handles(), vec![scratch_handles[0]]);
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        4,
        "large dense sparse-queue traversal should clear output, scan words, scatter queue, then traverse"
    );
    assert_eq!(steps[0], vec![scratch_handles[1]]);
    assert_eq!(
        steps[1],
        vec![scratch_handles[0], word_partials, block_totals]
    );
    assert_eq!(
        steps[2],
        vec![
            scratch_handles[0],
            word_partials,
            block_totals,
            queue_handle,
            scratch_handles[2],
        ],
        "large dense sparse-queue traversal must use deterministic word-prefix queue scatter"
    );
    assert_eq!(frontier_out, vec![0; words]);
}

#[test]
fn small_multiblock_sparse_queue_resident_step_inlines_block_offsets() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 32_897u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 11,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let frontier_in = vec![u32::MAX; words];
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete multiblock resident sparse-queue sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse-queue resident step must allocate frontier/queue-len handles");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse-queue resident step must allocate active queue");
    let word_partials = scratch
        .word_partials_handle
        .expect("Fix: multiblock sparse-queue step must allocate word partials");
    let block_totals = scratch
        .word_block_totals_handle
        .expect("Fix: multiblock sparse-queue step must allocate block totals");
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        4,
        "small multiblock sparse-queue traversal should clear, count words, scatter with inline block offsets, then traverse"
    );
    assert_eq!(steps[0], vec![scratch_handles[1]]);
    assert_eq!(
        steps[1],
        vec![scratch_handles[0], word_partials, block_totals]
    );
    assert_eq!(
        steps[2],
        vec![
            scratch_handles[0],
            word_partials,
            block_totals,
            queue_handle,
            scratch_handles[2],
        ],
        "small multiblock sparse-queue traversal must scatter with inline block offsets"
    );
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot {
            entries: 4,
            hits: 0,
            misses: 4,
        }
    );
    assert_eq!(frontier_out, vec![0; words]);
}

#[test]
fn many_block_sparse_queue_resident_step_scans_block_offsets_once() {
    let dispatcher = RecordingResidentDispatcher::default();
    let node_count = 262_177u32;
    let words = vyre_primitives::bitset::bitset_words(node_count) as usize;
    let graph = ResidentAdaptiveTraversalGraph {
        node_count,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words,
        layout_hash: 11,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let frontier_in = vec![u32::MAX; words];
    let mut frontier_out = Vec::new();

    adaptive_traverse_resident_graph_sparse_queue_step_with_scratch_into(
        &dispatcher,
        &graph,
        &frontier_in,
        u32::MAX,
        &mut scratch,
        &mut frontier_out,
    )
    .expect("Fix: recording dispatcher should complete many-block resident sparse-queue sequence");

    let scratch_handles = scratch
        .handles
        .expect("Fix: sparse-queue resident step must allocate frontier/queue-len handles");
    let queue_handle = scratch
        .queue_handle
        .expect("Fix: sparse-queue resident step must allocate active queue");
    let word_partials = scratch
        .word_partials_handle
        .expect("Fix: many-block sparse-queue step must allocate word partials");
    let block_totals = scratch
        .word_block_totals_handle
        .expect("Fix: many-block sparse-queue step must allocate block totals");
    let steps = dispatcher.last_step_handles();
    assert_eq!(
        steps.len(),
        5,
        "many-block sparse-queue traversal should clear, count words, scan block offsets once, scatter, then traverse"
    );
    assert_eq!(steps[0], vec![scratch_handles[1]]);
    assert_eq!(
        steps[1],
        vec![scratch_handles[0], word_partials, block_totals]
    );
    assert_eq!(
        steps[2],
        vec![block_totals],
        "many-block sparse-queue traversal must convert block totals into offsets once"
    );
    assert_eq!(
        steps[3],
        vec![
            scratch_handles[0],
            word_partials,
            block_totals,
            queue_handle,
            scratch_handles[2],
        ],
        "many-block sparse-queue traversal must scatter with precomputed block offsets"
    );
    assert_eq!(
        scratch.plan_cache_snapshot(),
        AdaptiveTraversalPlanCacheSnapshot {
            entries: 5,
            hits: 0,
            misses: 5,
        }
    );
    assert_eq!(frontier_out, vec![0; words]);
}

#[test]
fn generated_adaptive_resident_free_releases_each_handle_once_in_first_seen_order() {
    for seed in 0..4096_u64 {
        let dispatcher = RecordingResidentDispatcher::default();
        let base = 20_000 + seed * 16;
        let graph = ResidentAdaptiveTraversalGraph {
            node_count: 4,
            edge_count: 3,
            max_row_degree: 1,
            high_degree_source_count: 0,
            words: 1,
            layout_hash: seed,
            handles: [base, base + 1, base + 2, base],
        };
        graph.free(&dispatcher).expect("Fix: graph free dedup");
        assert_eq!(
            dispatcher.freed.borrow().as_slice(),
            &[base, base + 1, base + 2]
        );

        dispatcher.freed.borrow_mut().clear();
        let mut scratch = AdaptiveTraversalResidentScratch {
            handles: Some([base + 3, base + 4, base + 3]),
            queue_handle: Some(base + 4),
            high_queue_handle: Some(base + 6),
            high_len_handle: Some(base + 6),
            word_partials_handle: Some(base + 5),
            word_block_totals_handle: Some(base + 5),
            frontier_bytes: 4,
            queue_bytes: 4,
            high_queue_bytes: 4,
            word_partials_bytes: 4,
            word_block_totals_bytes: 4,
            frontier_in_bytes: Vec::new(),
            readbacks: Vec::new(),
            plan_cache: AdaptiveTraversalPlanCache::default(),
        };
        scratch.free(&dispatcher).expect("Fix: scratch free dedup");
        assert_eq!(
            dispatcher.freed.borrow().as_slice(),
            &[base + 3, base + 4, base + 6, base + 5]
        );
    }
}

#[test]

fn auto_step_rejects_bad_frontier_before_resident_allocation() {
    let dispatcher = RecordingResidentDispatcher::default();
    let graph = ResidentAdaptiveTraversalGraph {
        node_count: 33,
        edge_count: 0,
        max_row_degree: 0,
        high_degree_source_count: 0,
        words: 2,
        layout_hash: 7,
        handles: [101, 102, 103, 104],
    };
    let mut scratch = AdaptiveTraversalResidentScratch::default();
    let mut frontier_out = vec![123];

    let err = adaptive_traverse_resident_graph_auto_step_with_scratch_into(
        &dispatcher,
        &graph,
        &[1],
        u32::MAX,
        25,
        &mut scratch,
        &mut frontier_out,
    )
    .expect_err("Fix: malformed frontier must be rejected before mode dispatch");

    assert!(
        err.to_string().contains("expected 2 word(s)"),
        "unexpected frontier validation error: {err}"
    );
    assert_eq!(
        dispatcher.alloc_count.get(),
        0,
        "auto mode must validate frontier shape before allocating resident scratch"
    );
    assert_eq!(
        frontier_out,
        vec![123],
        "failed validation must not mutate caller output storage"
    );
}