cqs 1.26.0 - Docs.rs

//! CAGRA GPU-accelerated vector search
//!
//! Uses NVIDIA cuVS for GPU-accelerated nearest neighbor search.
//! Only available when compiled with the `gpu-index` feature.
//!
//! ## Usage
//!
//! CAGRA indexes are built from embeddings at runtime and can be persisted to
//! disk via [`CagraIndex::save`] / [`CagraIndex::load`] (cuVS native serialize
//! plus a small JSON sidecar with our metadata and a blake3 checksum).
//!
//! When GPU is available and this feature is enabled, CAGRA provides
//! faster search than CPU-based HNSW for large indexes.
//!
//! ## Ownership Model (cuVS 26.4+)
//!
//! The cuVS `search()` method takes `&self` (non-consuming). The index is
//! built once and reused for all searches. No rebuild machinery needed.
//!
//! ## Persistence (issue #950)
//!
//! The persisted form is two files next to each other:
//!
//! - `{cqs_dir}/index.cagra`      — binary blob written by `cuvsCagraSerialize`
//! - `{cqs_dir}/index.cagra.meta` — JSON sidecar: magic, version, dim,
//!   chunk_count, splade_generation (coarse staleness check), id_map,
//!   and a blake3 checksum over the `.cagra` blob.
//!
//! On load we:
//!   1. Parse the sidecar and verify magic + version.
//!   2. Check dim and chunk_count match the current store; bail out to a
//!      rebuild if either has drifted.
//!   3. Verify the blake3 checksum over the `.cagra` blob to catch corruption.
//!   4. Call `cuvsCagraDeserialize` to reconstitute the GPU index.
//!
//! Any failure logs a warn and returns `Err`, and the caller rebuilds from
//! the store. The save path warn-logs failures (non-fatal) so we just
//! rebuild next startup.
//!
//! Set `CQS_CAGRA_PERSIST=0` to disable save+load entirely (A/B testing or
//! reducing on-disk footprint). Default: enabled.

#[cfg(feature = "gpu-index")]
use std::path::Path;
#[cfg(feature = "gpu-index")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(feature = "gpu-index")]
use std::sync::Mutex;

#[cfg(feature = "gpu-index")]
use ndarray_015::Array2;

#[cfg(feature = "gpu-index")]
use thiserror::Error;

#[cfg(feature = "gpu-index")]
use crate::embedder::Embedding;
#[cfg(feature = "gpu-index")]
use crate::index::{IndexResult, VectorIndex};

/// On-disk magic bytes for the CAGRA sidecar. Changes force a rebuild.
#[cfg(feature = "gpu-index")]
const CAGRA_META_MAGIC: &str = "CAGRA01";

/// On-disk version for the CAGRA sidecar. Bump when adding fields that an
/// older binary can't parse; the parse-fail path falls through to rebuild.
#[cfg(feature = "gpu-index")]
const CAGRA_META_VERSION: u32 = 1;

/// Sentinel distance marking an output slot cuVS did not write (issue #952).
///
/// # Why a sentinel?
///
/// cuVS does not promise to fill the neighbor / distance buffers beyond
/// `index.len()` rows. When we request `k` neighbors against an index with
/// fewer than `k` vectors — or against a filtered set smaller than `k` —
/// the kernel writes exactly `index.len()` (or `|filter|`) real
/// `(neighbor, distance)` pairs and leaves the remaining slots untouched.
///
/// A zero-initialized output buffer decodes those untouched slots as
/// `(chunk_id = 0, distance = 0.0)` → `score = 1.0`, emitting phantom
/// perfect-match hits pointing at whichever chunk happens to hold internal
/// index 0. This is the class of bug tracked by issue #952.
///
/// The fix is to pre-fill `distances_host` with this sentinel before the
/// kernel launch and drop any slot whose distance still holds it after
/// copy-back. CAGRA writes squared-L2 distances, so every real hit is a
/// finite non-negative value strictly less than `+∞` — the sentinel is
/// therefore unambiguous.
///
/// # Why `f32::INFINITY` specifically?
///
/// - Distinct from any real squared-L2 distance cuVS produces.
/// - `!dist.is_finite()` also captures any NaN a future cuVS release
///   might emit on exotic inputs, giving us a zero-cost second line of
///   defense without a dedicated branch.
/// - Cheap to initialise — [`ndarray::Array2::from_elem`] materialises
///   the constant once per query; no broadcast or copy overhead.
///
/// `f32::NAN` was considered and rejected: `dist == NAN` is always false
/// (NaN is not comparable), so the sentinel check would have to use
/// `is_nan()` exclusively, losing the "real distances are finite"
/// structural guarantee.
///
/// # cuVS API audit (cuvs 26.4, April 2026)
///
/// The companion issue contemplated two upstream mechanisms that would
/// make this sentinel unnecessary:
///   1. A `fill_with_invalid` (or equivalent) option on
///      [`cuvs::cagra::SearchParams`] that pre-fills unused rows.
///   2. An `n_valid_results` output field on the search call.
///
/// Neither exists in 26.4: `SearchParams` exposes only `itopk_size`,
/// `max_queries`, `max_iterations`, algo/team/block tuning, and hashmap
/// knobs. The search entrypoint returns `Result<()>` with no per-row
/// validity information. Re-audit this when bumping the `cuvs` pin; if
/// either mechanism lands upstream, the sentinel scheme can be removed
/// in favour of the native API.
#[cfg(feature = "gpu-index")]
const INVALID_DISTANCE: f32 = f32::INFINITY;

#[cfg(feature = "gpu-index")]
#[derive(Error, Debug)]
pub enum CagraError {
    #[error("cuVS error: {0}")]
    Cuvs(String),
    #[error("No GPU available")]
    NoGpu,
    #[error("Dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch { expected: usize, actual: usize },
    #[error("Build error: {0}")]
    Build(String),
    #[error("Index not built")]
    NotBuilt,
    #[error("Persistence IO error: {0}")]
    Io(String),
    #[error("Persistence metadata invalid: {0}")]
    BadMeta(String),
    #[error("Persisted CAGRA index is stale ({reason})")]
    Stale { reason: String },
    #[error("Persisted CAGRA index checksum mismatch (file: {0})")]
    ChecksumMismatch(String),
}

/// SHL-10: Configurable CAGRA CPU memory cap via `CQS_CAGRA_MAX_BYTES` env var.
/// Defaults to 2GB. Cached in OnceLock for single parse.
#[cfg(feature = "gpu-index")]
fn cagra_max_bytes() -> usize {
    static MAX: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *MAX.get_or_init(|| {
        std::env::var("CQS_CAGRA_MAX_BYTES")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(2 * 1024 * 1024 * 1024)
    })
}

/// Issue #962: Scale `itopk_max` ceiling with corpus size. At 1k chunks we
/// want the library default (~320); at 1M chunks we want ~640 or more.
/// Logarithmic scaling based on chunk count:
///   ceiling = (log2(n_vectors) * 32).clamp(128, 4096)
/// 1k → 320, 13k → 447, 100k → 532, 1M → 640, 10M → 744
/// Then `(k * 2).clamp(min, max)` narrows the actual `itopk_size` at
/// query time. Overridable via `CQS_CAGRA_ITOPK_MAX`.
#[cfg(feature = "gpu-index")]
fn cagra_itopk_max_default(n_vectors: usize) -> usize {
    let log2 = (n_vectors.max(1) as f64).log2();
    let scaled = (log2 * 32.0) as usize;
    scaled.clamp(128, 4096)
}

/// Issue #962: Build-time CAGRA graph degrees, overridable via env.
/// `CQS_CAGRA_GRAPH_DEGREE` (default 64) is the output graph degree.
/// `CQS_CAGRA_INTERMEDIATE_GRAPH_DEGREE` (default 128) is the pruned-input
/// graph degree. Both map to the corresponding cuVS `IndexParams` setters.
/// Returns `IndexParams` with those setters applied (and traces the choice).
#[cfg(feature = "gpu-index")]
fn cagra_build_params() -> Result<cuvs::cagra::IndexParams, CagraError> {
    let graph_degree: usize = std::env::var("CQS_CAGRA_GRAPH_DEGREE")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(64);
    let intermediate_graph_degree: usize = std::env::var("CQS_CAGRA_INTERMEDIATE_GRAPH_DEGREE")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(128);
    let params = cuvs::cagra::IndexParams::new()
        .map_err(|e| CagraError::Cuvs(e.to_string()))?
        .set_graph_degree(graph_degree)
        .set_intermediate_graph_degree(intermediate_graph_degree);
    tracing::info!(
        graph_degree,
        intermediate_graph_degree,
        "CAGRA build params"
    );
    Ok(params)
}

/// CAGRA GPU index for vector search.
///
/// # Thread Safety
/// `resources` and `index` are protected by a single Mutex to ensure safe
/// concurrent access. CUDA contexts (managed by cuVS Resources) are not
/// inherently thread-safe, so we serialize all GPU operations.
#[cfg(feature = "gpu-index")]
pub struct CagraIndex {
    /// Embedding dimensionality (runtime, from model config)
    dim: usize,
    /// cuVS resources + index, protected by Mutex (CUDA contexts require serialized access)
    gpu: Mutex<GpuState>,
    /// Mapping from internal index to chunk ID
    id_map: Vec<String>,
    /// RM-V1.25-19: Set when a mutex poison is observed. The CUDA stream
    /// may be in an inconsistent posture after a mid-op panic
    /// (cudaMalloc'd buffer unfreed, stream corked, resources leaked),
    /// so subsequent searches against the same `GpuState` could
    /// double-free or CUDA-fault. `BatchContext::vector_index` checks
    /// `is_poisoned()` and forces a rebuild via `build_from_store`
    /// rather than reusing the poisoned state.
    poisoned: AtomicBool,
}

#[cfg(feature = "gpu-index")]
struct GpuState {
    resources: cuvs::Resources,
    index: cuvs::cagra::Index,
}

// Debug impl needed because cuvs types don't implement Debug
#[cfg(feature = "gpu-index")]
impl std::fmt::Debug for CagraIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CagraIndex")
            .field("dim", &self.dim)
            .field("len", &self.id_map.len())
            .finish()
    }
}

#[cfg(feature = "gpu-index")]
impl CagraIndex {
    /// Check if GPU is available for CAGRA
    pub fn gpu_available() -> bool {
        cuvs::Resources::new().is_ok()
    }

    /// Build a CAGRA index from embeddings
    pub fn build(embeddings: Vec<(String, Embedding)>, dim: usize) -> Result<Self, CagraError> {
        let _span = tracing::debug_span!("cagra_build").entered();
        let (id_map, flat_data, n_vectors) = crate::hnsw::prepare_index_data(embeddings, dim)
            .map_err(|e| CagraError::Build(e.to_string()))?;

        tracing::info!(n_vectors, "Building CAGRA index");

        let resources = cuvs::Resources::new().map_err(|e| CagraError::Cuvs(e.to_string()))?;

        let dataset = Array2::from_shape_vec((n_vectors, dim), flat_data)
            .map_err(|e| CagraError::Cuvs(format!("Failed to create array: {}", e)))?;

        let build_params = cagra_build_params()?;

        let index = cuvs::cagra::Index::build(&resources, &build_params, &dataset)
            .map_err(|e| CagraError::Cuvs(e.to_string()))?;

        tracing::info!("CAGRA index built successfully");

        Ok(Self {
            dim,
            gpu: Mutex::new(GpuState { resources, index }),
            id_map,
            poisoned: AtomicBool::new(false),
        })
    }

    /// Number of vectors in the index
    pub fn len(&self) -> usize {
        self.id_map.len()
    }

    /// Checks whether this collection contains any elements.
    pub fn is_empty(&self) -> bool {
        self.id_map.is_empty()
    }

    /// Search for nearest neighbors
    pub fn search(&self, query: &Embedding, k: usize) -> Vec<IndexResult> {
        let _span = tracing::debug_span!("cagra_search", k).entered();
        if self.id_map.is_empty() || k == 0 {
            return Vec::new();
        }

        if query.len() != self.dim {
            tracing::warn!(
                expected_dim = self.dim,
                actual_dim = query.len(),
                "Query dimension mismatch"
            );
            return Vec::new();
        }

        let gpu = self.gpu.lock().unwrap_or_else(|poisoned| {
            // RM-V1.25-19: a prior panic left the CUDA stream in an
            // unknown state. Flag the index so the caller forces a
            // rebuild on the next `vector_index()` access; return an
            // empty result here rather than run a new kernel against
            // possibly-corrupted resources.
            self.poisoned.store(true, Ordering::Release);
            tracing::warn!(
                "CAGRA GPU mutex poisoned — results from this call are discarded \
                 and the index will be rebuilt on the next vector_index() access"
            );
            poisoned.into_inner()
        });

        if self.poisoned.load(Ordering::Acquire) {
            // Don't dispatch new kernels; the caller should have already
            // rebuilt us. This is a safety net for racing clients.
            return Vec::new();
        }

        self.search_impl(&gpu, query, k, None)
    }

    /// Core search implementation shared by filtered and unfiltered paths.
    fn search_impl(
        &self,
        gpu: &GpuState,
        query: &Embedding,
        k: usize,
        bitset_device: Option<&cuvs::ManagedTensor>,
    ) -> Vec<IndexResult> {
        let itopk_min = std::env::var("CQS_CAGRA_ITOPK_MIN")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(128);
        let itopk_max = std::env::var("CQS_CAGRA_ITOPK_MAX")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or_else(|| cagra_itopk_max_default(self.len()));
        let itopk_size = (k * 2).clamp(itopk_min, itopk_max);
        tracing::debug!(
            itopk_size,
            itopk_min,
            itopk_max,
            k,
            n_vectors = self.len(),
            "CAGRA itopk resolved"
        );

        let search_params = match cuvs::cagra::SearchParams::new() {
            Ok(params) => params.set_itopk_size(itopk_size),
            Err(e) => {
                tracing::error!(error = %e, "Failed to create search params");
                return Vec::new();
            }
        };

        let query_host = match Array2::from_shape_vec((1, self.dim), query.as_slice().to_vec()) {
            Ok(arr) => arr,
            Err(e) => {
                tracing::error!(expected_dim = self.dim, error = %e, "Invalid query shape");
                return Vec::new();
            }
        };

        // IMPORTANT: host arrays must outlive device tensors — ManagedTensor::to_device()
        // copies data to GPU but the DLTensor shape pointer still references the host
        // ndarray's internal shape storage.
        let mut neighbors_host: Array2<u32> = Array2::zeros((1, k));
        // Sentinel-init (issue #952): cuVS does not write slots beyond the
        // number of real neighbours it found, so we seed every slot with
        // `INVALID_DISTANCE` and filter against it after copy-back. See
        // the `INVALID_DISTANCE` doc for the cuVS API audit that motivates
        // the sentinel approach.
        let mut distances_host: Array2<f32> = Array2::from_elem((1, k), INVALID_DISTANCE);

        let query_device = match cuvs::ManagedTensor::from(&query_host).to_device(&gpu.resources) {
            Ok(t) => t,
            Err(e) => {
                tracing::error!(error = %e, "Failed to copy query to device");
                return Vec::new();
            }
        };

        let neighbors_device =
            match cuvs::ManagedTensor::from(&neighbors_host).to_device(&gpu.resources) {
                Ok(t) => t,
                Err(e) => {
                    tracing::error!(error = %e, "Failed to allocate neighbors on device");
                    return Vec::new();
                }
            };

        let distances_device =
            match cuvs::ManagedTensor::from(&distances_host).to_device(&gpu.resources) {
                Ok(t) => t,
                Err(e) => {
                    tracing::error!(error = %e, "Failed to allocate distances on device");
                    return Vec::new();
                }
            };

        // Perform search — non-consuming in cuVS 26.4+
        let result = if let Some(bitset) = bitset_device {
            gpu.index.search_with_filter(
                &gpu.resources,
                &search_params,
                &query_device,
                &neighbors_device,
                &distances_device,
                bitset,
            )
        } else {
            gpu.index.search(
                &gpu.resources,
                &search_params,
                &query_device,
                &neighbors_device,
                &distances_device,
            )
        };

        if let Err(e) = result {
            tracing::error!(error = %e, "CAGRA search failed");
            return Vec::new();
        }

        // Copy results back to host
        if let Err(e) = neighbors_device.to_host(&gpu.resources, &mut neighbors_host) {
            tracing::error!(error = %e, "Failed to copy neighbors from device");
            return Vec::new();
        }
        if let Err(e) = distances_device.to_host(&gpu.resources, &mut distances_host) {
            tracing::error!(error = %e, "Failed to copy distances from device");
            return Vec::new();
        }

        // Convert results: CAGRA uses squared L2 distance. For unit-norm vectors:
        // d = 2 - 2*cos_sim, so cos_sim = 1 - d/2.
        let mut results = Vec::with_capacity(k);
        let neighbor_row = neighbors_host.row(0);
        let distance_row = distances_host.row(0);

        for i in 0..k {
            let idx = neighbor_row[i] as usize;
            let dist = distance_row[i];
            // Sentinel check (issue #952): a slot still holding
            // `INVALID_DISTANCE` means cuVS did not overwrite it, so the
            // paired `neighbor_row[i]` is garbage and must be dropped.
            // `!is_finite()` is a superset of `dist == INVALID_DISTANCE`
            // (since `INVALID_DISTANCE == +∞`) and also catches any NaN
            // cuVS might emit on exotic inputs — real squared-L2
            // distances are always finite and non-negative.
            if !dist.is_finite() {
                continue;
            }
            if idx < self.id_map.len() {
                let score = 1.0 - dist / 2.0;
                results.push(IndexResult {
                    id: self.id_map[idx].clone(),
                    score,
                });
            }
        }

        results
    }
}

#[cfg(feature = "gpu-index")]
impl VectorIndex for CagraIndex {
    fn search(&self, query: &Embedding, k: usize) -> Vec<IndexResult> {
        CagraIndex::search(self, query, k)
    }

    fn len(&self) -> usize {
        CagraIndex::len(self)
    }

    fn is_empty(&self) -> bool {
        CagraIndex::is_empty(self)
    }

    fn name(&self) -> &'static str {
        "CAGRA"
    }

    /// RM-V1.25-19: expose the poison flag so `BatchContext::vector_index`
    /// can force a full rebuild instead of reusing a possibly-corrupt
    /// CUDA context after a prior panic.
    fn is_poisoned(&self) -> bool {
        self.poisoned.load(Ordering::Acquire)
    }

    fn dim(&self) -> usize {
        self.dim
    }

    /// GPU-native filtered search: builds a bitset from the predicate and
    /// passes it to CAGRA for traversal-time filtering. No over-fetching needed.
    fn search_with_filter(
        &self,
        query: &Embedding,
        k: usize,
        filter: &dyn Fn(&str) -> bool,
    ) -> Vec<IndexResult> {
        let _span = tracing::debug_span!("cagra_search_filtered", k).entered();
        if self.id_map.is_empty() || k == 0 {
            return Vec::new();
        }

        if query.len() != self.dim {
            tracing::warn!(
                expected_dim = self.dim,
                actual_dim = query.len(),
                "Query dimension mismatch"
            );
            return Vec::new();
        }

        // Build bitset on host: evaluate predicate for each vector
        let n = self.id_map.len();
        let n_words = n.div_ceil(32);
        let mut bitset = vec![0u32; n_words];
        let mut included = 0usize;
        for (i, id) in self.id_map.iter().enumerate() {
            if filter(id) {
                bitset[i / 32] |= 1u32 << (i % 32);
                included += 1;
            }
        }

        // If everything passes the filter, use unfiltered search (faster)
        if included == n {
            return CagraIndex::search(self, query, k);
        }

        // If nothing passes, no results
        if included == 0 {
            return Vec::new();
        }

        tracing::debug!(
            total = n,
            included,
            excluded = n - included,
            "CAGRA bitset filter"
        );

        let gpu = self.gpu.lock().unwrap_or_else(|poisoned| {
            // RM-V1.25-19: same recovery path as `search()`. See that
            // comment for the rationale — CUDA state may be corrupt, we
            // flag for rebuild and drop the work rather than kernel-launch
            // against a bad stream.
            self.poisoned.store(true, Ordering::Release);
            tracing::warn!(
                "CAGRA GPU mutex poisoned (filtered path) — results discarded \
                 and index will be rebuilt on next vector_index()"
            );
            poisoned.into_inner()
        });

        if self.poisoned.load(Ordering::Acquire) {
            return Vec::new();
        }

        // Upload bitset to device
        let bitset_host = ndarray_015::Array1::from_vec(bitset);
        let bitset_device = match cuvs::ManagedTensor::from(&bitset_host).to_device(&gpu.resources)
        {
            Ok(t) => t,
            Err(e) => {
                tracing::error!(error = %e, "Failed to upload bitset to device");
                return Vec::new();
            }
        };

        self.search_impl(&gpu, query, k, Some(&bitset_device))
    }
}

// SAFETY: CagraIndex is thread-safe because:
// - `gpu` (resources + index) is protected by Mutex (CUDA contexts require serialized access)
// - `id_map` is immutable after construction
#[cfg(feature = "gpu-index")]
unsafe impl Send for CagraIndex {}
#[cfg(feature = "gpu-index")]
unsafe impl Sync for CagraIndex {}

#[cfg(feature = "gpu-index")]
impl CagraIndex {
    /// Build CAGRA index from all embeddings in a Store.
    /// Unlike HNSW, CAGRA indexes are not persisted to disk.
    /// Note: CAGRA (cuVS) requires all data upfront for GPU index building,
    /// so we can't stream incrementally like HNSW. However, we stream from
    /// SQLite to avoid double-buffering in memory.
    /// Notes are excluded — they use brute-force search from SQLite.
    pub fn build_from_store<Mode>(
        store: &crate::Store<Mode>,
        dim: usize,
    ) -> Result<Self, CagraError> {
        let _span = tracing::debug_span!("cagra_build_from_store").entered();
        let chunk_count = store
            .chunk_count()
            .map_err(|e| CagraError::Cuvs(format!("Failed to count chunks: {}", e)))?
            as usize;

        if chunk_count == 0 {
            return Err(CagraError::Cuvs("No embeddings in store".into()));
        }

        tracing::info!(chunk_count, "Building CAGRA index from chunk embeddings");

        // Guard against OOM: estimate CPU memory needed for flat data + id map
        let max_bytes = cagra_max_bytes();
        let estimated_bytes = chunk_count.saturating_mul(dim).saturating_mul(4);
        if estimated_bytes > max_bytes {
            return Err(CagraError::Cuvs(format!(
                "Dataset too large for GPU indexing: {}MB estimated (limit {}MB)",
                estimated_bytes / (1024 * 1024),
                max_bytes / (1024 * 1024)
            )));
        }

        let mut id_map = Vec::with_capacity(chunk_count);
        let mut flat_data = Vec::with_capacity(chunk_count * dim);

        const BATCH_SIZE: usize = 10_000;
        let mut loaded_chunks = 0usize;
        for batch_result in store.embedding_batches(BATCH_SIZE) {
            let batch = batch_result
                .map_err(|e| CagraError::Cuvs(format!("Failed to fetch batch: {}", e)))?;

            let batch_len = batch.len();
            for (chunk_id, embedding) in batch {
                if embedding.len() != dim {
                    return Err(CagraError::DimensionMismatch {
                        expected: dim,
                        actual: embedding.len(),
                    });
                }
                id_map.push(chunk_id);
                flat_data.extend(embedding.into_inner());
            }

            loaded_chunks += batch_len;
            let progress_pct = if chunk_count > 0 {
                (loaded_chunks * 100) / chunk_count
            } else {
                100
            };
            tracing::info!(
                "CAGRA loading progress: {} / {} chunks ({}%)",
                loaded_chunks,
                chunk_count,
                progress_pct
            );
        }

        Self::build_from_flat(id_map, flat_data, dim)
    }

    /// Build CAGRA index from pre-collected flat data (also used by tests)
    pub(crate) fn build_from_flat(
        id_map: Vec<String>,
        flat_data: Vec<f32>,
        dim: usize,
    ) -> Result<Self, CagraError> {
        let n_vectors = id_map.len();
        if n_vectors == 0 {
            return Err(CagraError::Cuvs("Cannot build empty index".into()));
        }

        tracing::info!(n_vectors, "Building CAGRA index");

        let resources = cuvs::Resources::new().map_err(|e| CagraError::Cuvs(e.to_string()))?;

        let dataset = Array2::from_shape_vec((n_vectors, dim), flat_data)
            .map_err(|e| CagraError::Cuvs(format!("Failed to create array: {}", e)))?;

        let build_params = cagra_build_params()?;

        let index = cuvs::cagra::Index::build(&resources, &build_params, &dataset)
            .map_err(|e| CagraError::Cuvs(e.to_string()))?;

        tracing::info!("CAGRA index built successfully");

        Ok(Self {
            dim,
            gpu: Mutex::new(GpuState { resources, index }),
            id_map,
            poisoned: AtomicBool::new(false),
        })
    }
}

/// JSON sidecar next to the `.cagra` binary blob.
///
/// Carries everything we need to validate the blob before handing it to
/// `cuvsCagraDeserialize` and to reject stale persisted data after a
/// reindex.
///
/// The JSON format is versioned via `magic` + `version` so a future binary
/// that can't parse an older sidecar just falls through to rebuilding.
#[cfg(feature = "gpu-index")]
#[derive(serde::Serialize, serde::Deserialize, Debug)]
struct CagraMeta {
    /// Format magic. See [`CAGRA_META_MAGIC`].
    magic: String,
    /// Sidecar schema version. See [`CAGRA_META_VERSION`].
    version: u32,
    /// Embedding dimensionality, captured at save. Must match the store on load.
    dim: usize,
    /// Number of vectors in the persisted index. Must match the store on load.
    chunk_count: usize,
    /// `Store::splade_generation()` at save time. Bumped by the v20 delete trigger.
    /// Coarse staleness check; a mismatch is not fatal because CAGRA builds
    /// survive deletion-free INSERTs, but we log it as an informational warn.
    splade_generation: u64,
    /// Chunk IDs in the same order as the persisted CAGRA internal indices.
    /// Rebuilding this is trivially cheap (serde_json), and cuVS gives us
    /// nothing to translate internal ids → chunk ids on its own.
    id_map: Vec<String>,
    /// Blake3 checksum over the `.cagra` binary blob as a hex string.
    blake3: String,
}

/// Whether CAGRA persistence is enabled (via `CQS_CAGRA_PERSIST`).
///
/// Defaults to `true`. Setting `CQS_CAGRA_PERSIST=0` disables both the save
/// path (build_from_store won't write the file) and the load path
/// (build_vector_index_with_config won't even check for persisted indices).
///
/// Cached in a `OnceLock` so we parse the env var exactly once per process.
#[cfg(feature = "gpu-index")]
pub fn cagra_persist_enabled() -> bool {
    static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    *ENABLED.get_or_init(|| match std::env::var("CQS_CAGRA_PERSIST").as_deref() {
        Ok("0") | Ok("false") | Ok("no") => {
            tracing::info!("CQS_CAGRA_PERSIST=0 — CAGRA persistence disabled");
            false
        }
        _ => true,
    })
}

#[cfg(feature = "gpu-index")]
impl CagraIndex {
    /// Persist the index to disk.
    ///
    /// Writes two files:
    /// - `{path}` — the cuVS binary blob (via `cuvsCagraSerialize`)
    /// - `{path}.meta` — a JSON sidecar with magic/version/dim/chunk_count/
    ///   id_map/blake3 so `load()` can validate before handing the blob
    ///   back to cuVS.
    ///
    /// The cuVS blob is written first, checksummed, then the sidecar is
    /// written atomically (write-temp → rename). If the sidecar write fails
    /// the partial `.cagra` file is removed so we don't leave an orphan
    /// that would later fail metadata validation.
    ///
    /// Persistence is a best-effort optimisation: callers should warn-log
    /// failures and continue rather than propagate errors. The caller will
    /// rebuild on next startup.
    pub fn save(&self, path: &Path) -> Result<(), CagraError> {
        let _span = tracing::info_span!("cagra_save", path = %path.display()).entered();
        if !cagra_persist_enabled() {
            return Err(CagraError::Io(
                "CAGRA persistence disabled via CQS_CAGRA_PERSIST=0".to_string(),
            ));
        }

        let gpu = self.gpu.lock().map_err(|_| {
            // RM-V1.25-19: the mutex was poisoned by a prior panic in the
            // search path. Rather than try to serialize a potentially
            // corrupt CUDA context, refuse and let the caller rebuild.
            self.poisoned.store(true, Ordering::Release);
            CagraError::Io("CAGRA mutex poisoned, refusing to save".to_string())
        })?;

        if self.poisoned.load(Ordering::Acquire) {
            return Err(CagraError::Io(
                "CAGRA index is poisoned, refusing to save".to_string(),
            ));
        }

        // Ensure the parent exists so cuVS can open the file for writing.
        if let Some(parent) = path.parent() {
            if let Err(e) = std::fs::create_dir_all(parent) {
                return Err(CagraError::Io(format!(
                    "Failed to create parent dir {}: {}",
                    parent.display(),
                    e
                )));
            }
        }

        // Delete any stale sidecar up front so a crash mid-save leaves
        // neither half of the pair, not just the blob.
        let meta_path = meta_path_for(path);
        let _ = std::fs::remove_file(&meta_path);

        // cuVS takes a filename, so we hand it the final path directly.
        // include_dataset=true keeps the blob self-contained — the library
        // doesn't need the original dataset on disk to deserialize.
        let path_str = path.to_str().ok_or_else(|| {
            CagraError::Io(format!(
                "CAGRA save path is not valid UTF-8: {}",
                path.display()
            ))
        })?;
        tracing::info!(
            n_vectors = self.id_map.len(),
            dim = self.dim,
            "Serializing CAGRA index to disk"
        );
        gpu.index
            .serialize(&gpu.resources, path_str, true)
            .map_err(|e| CagraError::Cuvs(format!("cuvsCagraSerialize failed: {}", e)))?;

        // Checksum the blob we just wrote so load() can detect corruption.
        let blob_hash = blake3_of_path(path)?;

        // Read splade_generation indirectly — CagraIndex doesn't hold a
        // Store reference, so the caller supplies it via a separate
        // `save_with_meta` entry point. For now, default to 0 and rely on
        // callers using `save_with_store` when they want the coarse
        // staleness check. See build_vector_index_with_config.
        let meta = CagraMeta {
            magic: CAGRA_META_MAGIC.to_string(),
            version: CAGRA_META_VERSION,
            dim: self.dim,
            chunk_count: self.id_map.len(),
            splade_generation: 0,
            id_map: self.id_map.clone(),
            blake3: blob_hash,
        };

        if let Err(e) = write_meta_atomic(&meta_path, &meta) {
            // Don't leave an orphan .cagra without a matching sidecar.
            let _ = std::fs::remove_file(path);
            return Err(e);
        }

        tracing::info!(
            path = %path.display(),
            n_vectors = self.id_map.len(),
            "CAGRA index persisted"
        );
        Ok(())
    }

    /// Persist with an explicit `splade_generation` stamp from the caller's
    /// `Store`. Preferred over [`save`](Self::save) because it records the
    /// deletion counter for coarse staleness checks on load.
    pub fn save_with_store<Mode>(
        &self,
        path: &Path,
        store: &crate::Store<Mode>,
    ) -> Result<(), CagraError> {
        let _span = tracing::info_span!("cagra_save_with_store", path = %path.display()).entered();
        if !cagra_persist_enabled() {
            return Err(CagraError::Io(
                "CAGRA persistence disabled via CQS_CAGRA_PERSIST=0".to_string(),
            ));
        }

        let gpu = self.gpu.lock().map_err(|_| {
            self.poisoned.store(true, Ordering::Release);
            CagraError::Io("CAGRA mutex poisoned, refusing to save".to_string())
        })?;

        if self.poisoned.load(Ordering::Acquire) {
            return Err(CagraError::Io(
                "CAGRA index is poisoned, refusing to save".to_string(),
            ));
        }

        if let Some(parent) = path.parent() {
            if let Err(e) = std::fs::create_dir_all(parent) {
                return Err(CagraError::Io(format!(
                    "Failed to create parent dir {}: {}",
                    parent.display(),
                    e
                )));
            }
        }

        let meta_path = meta_path_for(path);
        let _ = std::fs::remove_file(&meta_path);

        let path_str = path.to_str().ok_or_else(|| {
            CagraError::Io(format!(
                "CAGRA save path is not valid UTF-8: {}",
                path.display()
            ))
        })?;

        // splade_generation is not a perfect staleness signal (deletes only,
        // not inserts) but it still catches the common reindex-with-GC case.
        let generation = match store.splade_generation() {
            Ok(g) => g,
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    "Failed to read splade_generation for CAGRA meta; defaulting to 0"
                );
                0
            }
        };

        tracing::info!(
            n_vectors = self.id_map.len(),
            dim = self.dim,
            splade_generation = generation,
            "Serializing CAGRA index to disk"
        );
        gpu.index
            .serialize(&gpu.resources, path_str, true)
            .map_err(|e| CagraError::Cuvs(format!("cuvsCagraSerialize failed: {}", e)))?;

        let blob_hash = blake3_of_path(path)?;

        let meta = CagraMeta {
            magic: CAGRA_META_MAGIC.to_string(),
            version: CAGRA_META_VERSION,
            dim: self.dim,
            chunk_count: self.id_map.len(),
            splade_generation: generation,
            id_map: self.id_map.clone(),
            blake3: blob_hash,
        };

        if let Err(e) = write_meta_atomic(&meta_path, &meta) {
            let _ = std::fs::remove_file(path);
            return Err(e);
        }

        tracing::info!(
            path = %path.display(),
            n_vectors = self.id_map.len(),
            "CAGRA index persisted"
        );
        Ok(())
    }

    /// Load a previously-saved index from disk.
    ///
    /// Verifies the sidecar magic/version, confirms `dim` / `chunk_count`
    /// match the caller's expectation (passed as `expected_dim` /
    /// `expected_chunks`), verifies the blake3 checksum over the `.cagra`
    /// blob, then hands the blob to `cuvsCagraDeserialize`.
    ///
    /// Any validation failure returns `Err(CagraError::Stale { .. })` or
    /// `Err(CagraError::ChecksumMismatch { .. })`. The caller should warn-log
    /// and rebuild from the store.
    ///
    /// The `expected_chunks` check is what prevents us from handing cuVS a
    /// blob whose id_map no longer matches the live store: an incremental
    /// reindex that added or removed chunks will change the count even if
    /// `splade_generation` didn't bump.
    pub fn load(
        path: &Path,
        expected_dim: usize,
        expected_chunks: usize,
    ) -> Result<Self, CagraError> {
        let _span = tracing::info_span!("cagra_load", path = %path.display()).entered();
        if !cagra_persist_enabled() {
            return Err(CagraError::Io(
                "CAGRA persistence disabled via CQS_CAGRA_PERSIST=0".to_string(),
            ));
        }

        if !path.exists() {
            return Err(CagraError::Io(format!(
                "CAGRA blob not found at {}",
                path.display()
            )));
        }

        let meta_path = meta_path_for(path);
        if !meta_path.exists() {
            return Err(CagraError::BadMeta(format!(
                "CAGRA sidecar missing at {}",
                meta_path.display()
            )));
        }

        // Bounded read of the sidecar so a corrupt or hostile file can't
        // OOM us. 128MB is generous even for multi-million-vector id_maps.
        const MAX_META_SIZE: u64 = 128 * 1024 * 1024;
        let meta_size = std::fs::metadata(&meta_path)
            .map_err(|e| {
                CagraError::Io(format!(
                    "Failed to stat CAGRA sidecar {}: {}",
                    meta_path.display(),
                    e
                ))
            })?
            .len();
        if meta_size > MAX_META_SIZE {
            return Err(CagraError::BadMeta(format!(
                "CAGRA sidecar {} is {}MB, exceeds {}MB limit",
                meta_path.display(),
                meta_size / (1024 * 1024),
                MAX_META_SIZE / (1024 * 1024)
            )));
        }

        let meta_file = std::fs::File::open(&meta_path).map_err(|e| {
            CagraError::Io(format!(
                "Failed to open CAGRA sidecar {}: {}",
                meta_path.display(),
                e
            ))
        })?;
        let meta: CagraMeta =
            serde_json::from_reader(std::io::BufReader::new(meta_file)).map_err(|e| {
                CagraError::BadMeta(format!(
                    "Failed to parse CAGRA sidecar {}: {}",
                    meta_path.display(),
                    e
                ))
            })?;

        if meta.magic != CAGRA_META_MAGIC {
            return Err(CagraError::BadMeta(format!(
                "Unexpected magic {:?} (want {:?})",
                meta.magic, CAGRA_META_MAGIC
            )));
        }
        if meta.version != CAGRA_META_VERSION {
            return Err(CagraError::Stale {
                reason: format!(
                    "sidecar version {} != current {}",
                    meta.version, CAGRA_META_VERSION
                ),
            });
        }
        if meta.dim != expected_dim {
            return Err(CagraError::Stale {
                reason: format!("dim {} != expected {}", meta.dim, expected_dim),
            });
        }
        if meta.chunk_count != expected_chunks {
            return Err(CagraError::Stale {
                reason: format!(
                    "chunk_count {} != expected {} (reindex occurred)",
                    meta.chunk_count, expected_chunks
                ),
            });
        }
        if meta.id_map.len() != meta.chunk_count {
            return Err(CagraError::BadMeta(format!(
                "sidecar id_map has {} entries but claims {} chunks",
                meta.id_map.len(),
                meta.chunk_count
            )));
        }

        // Verify the `.cagra` blob matches the hash in the sidecar before
        // handing it to cuVS — cheap insurance against silent disk rot
        // and against someone (or us) overwriting the blob without
        // updating the sidecar.
        let actual_hash = blake3_of_path(path)?;
        if actual_hash != meta.blake3 {
            return Err(CagraError::ChecksumMismatch(path.display().to_string()));
        }

        let path_str = path.to_str().ok_or_else(|| {
            CagraError::Io(format!(
                "CAGRA load path is not valid UTF-8: {}",
                path.display()
            ))
        })?;

        let resources = cuvs::Resources::new().map_err(|e| CagraError::Cuvs(e.to_string()))?;
        let index = cuvs::cagra::Index::deserialize(&resources, path_str)
            .map_err(|e| CagraError::Cuvs(format!("cuvsCagraDeserialize failed: {}", e)))?;

        tracing::info!(
            n_vectors = meta.chunk_count,
            dim = meta.dim,
            splade_generation = meta.splade_generation,
            "CAGRA index loaded from disk"
        );

        Ok(Self {
            dim: meta.dim,
            gpu: Mutex::new(GpuState { resources, index }),
            id_map: meta.id_map,
            poisoned: AtomicBool::new(false),
        })
    }

    /// Delete a persisted CAGRA index and its sidecar. Best-effort — missing
    /// files are treated as success so the caller can use this as a cleanup
    /// before a rebuild without first checking existence.
    pub fn delete_persisted(path: &Path) {
        let _span =
            tracing::debug_span!("cagra_delete_persisted", path = %path.display()).entered();
        let _ = std::fs::remove_file(path);
        let _ = std::fs::remove_file(meta_path_for(path));
    }
}

/// Sidecar path for a given CAGRA blob path.
#[cfg(feature = "gpu-index")]
fn meta_path_for(path: &Path) -> std::path::PathBuf {
    let mut s = path.as_os_str().to_os_string();
    s.push(".meta");
    std::path::PathBuf::from(s)
}

/// Stream-hash a file with blake3.
#[cfg(feature = "gpu-index")]
fn blake3_of_path(path: &Path) -> Result<String, CagraError> {
    let file = std::fs::File::open(path).map_err(|e| {
        CagraError::Io(format!(
            "Failed to open {} for checksum: {}",
            path.display(),
            e
        ))
    })?;
    let mut hasher = blake3::Hasher::new();
    hasher.update_reader(file).map_err(|e| {
        CagraError::Io(format!(
            "Failed to read {} for checksum: {}",
            path.display(),
            e
        ))
    })?;
    Ok(hasher.finalize().to_hex().to_string())
}

/// Write the CAGRA sidecar via write-temp + rename to avoid a torn JSON on
/// crash.
#[cfg(feature = "gpu-index")]
fn write_meta_atomic(path: &Path, meta: &CagraMeta) -> Result<(), CagraError> {
    let parent = path.parent().ok_or_else(|| {
        CagraError::Io(format!(
            "CAGRA sidecar has no parent dir: {}",
            path.display()
        ))
    })?;
    std::fs::create_dir_all(parent).map_err(|e| {
        CagraError::Io(format!(
            "Failed to create parent {} for sidecar: {}",
            parent.display(),
            e
        ))
    })?;

    let tmp = parent.join(format!(
        ".{}.{:016x}.tmp",
        path.file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("cagra_meta"),
        crate::temp_suffix()
    ));
    {
        let file = std::fs::File::create(&tmp).map_err(|e| {
            CagraError::Io(format!(
                "Failed to create sidecar temp {}: {}",
                tmp.display(),
                e
            ))
        })?;
        let mut writer = std::io::BufWriter::new(file);
        serde_json::to_writer(&mut writer, meta)
            .map_err(|e| CagraError::Io(format!("Failed to serialize sidecar: {}", e)))?;
        use std::io::Write as _;
        writer
            .flush()
            .map_err(|e| CagraError::Io(format!("Failed to flush sidecar: {}", e)))?;
        // Best-effort fsync — ignore failures on platforms that don't
        // support it on a regular File backed by the FS we're on.
        if let Err(e) = writer.get_ref().sync_all() {
            tracing::debug!(error = %e, "fsync of CAGRA sidecar temp failed (non-fatal)");
        }
    }

    if let Err(e) = std::fs::rename(&tmp, path) {
        let _ = std::fs::remove_file(&tmp);
        return Err(CagraError::Io(format!(
            "Failed to rename sidecar {} -> {}: {}",
            tmp.display(),
            path.display(),
            e
        )));
    }

    Ok(())
}

#[cfg(all(test, feature = "gpu-index"))]
mod tests {
    use super::*;
    use crate::index::VectorIndex;
    use crate::EMBEDDING_DIM;
    use std::sync::Mutex;

    /// Serialize GPU tests — concurrent CUDA contexts cause SIGSEGV
    static GPU_LOCK: Mutex<()> = Mutex::new(());

    fn make_embedding(seed: u32) -> Embedding {
        let mut v = vec![0.0f32; EMBEDDING_DIM];
        for (i, val) in v.iter_mut().enumerate() {
            *val = ((seed as f32 * 10.0) + (i as f32 * 0.001)).sin();
        }
        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            v.iter_mut().for_each(|x| *x /= norm);
        }
        Embedding::new(v)
    }

    fn require_gpu() -> bool {
        if !CagraIndex::gpu_available() {
            eprintln!("Skipping CAGRA test: no GPU available");
            return false;
        }
        true
    }

    fn build_test_index(n: u32) -> CagraIndex {
        let embeddings: Vec<(String, Embedding)> = (0..n)
            .map(|i| (format!("chunk_{}", i), make_embedding(i)))
            .collect();
        CagraIndex::build(embeddings, EMBEDDING_DIM).expect("Failed to build test index")
    }

    #[test]
    fn test_gpu_available() {
        let _ = CagraIndex::gpu_available();
    }

    #[test]
    fn test_build_simple() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(5);
        assert_eq!(index.len(), 5);
        assert!(!index.is_empty());
    }

    #[test]
    fn test_build_empty() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let result = CagraIndex::build(vec![], EMBEDDING_DIM);
        assert!(result.is_err());
    }

    #[test]
    fn test_build_dimension_mismatch() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let bad_embedding = Embedding::new(vec![1.0; 100]);
        let result = CagraIndex::build(vec![("bad".into(), bad_embedding)], EMBEDDING_DIM);
        match result {
            Err(CagraError::Build(_)) => {}
            Err(e) => panic!("Expected Build error, got: {:?}", e),
            Ok(_) => panic!("Expected error, got Ok"),
        }
    }

    #[test]
    fn test_search_self_match() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(10);
        let query = make_embedding(3);
        let results = index.search(&query, 5);
        assert!(!results.is_empty(), "Search returned no results");
        assert_eq!(results[0].id, "chunk_3", "Top result should be chunk_3");
        assert!(
            results[0].score > 0.9,
            "Self-match score should be high, got {}",
            results[0].score
        );
    }

    #[test]
    fn test_search_k_limiting() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(10);
        let query = make_embedding(0);
        let results = index.search(&query, 3);
        assert!(results.len() <= 3);
    }

    #[test]
    fn test_search_ordering() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(10);
        let query = make_embedding(0);
        let results = index.search(&query, 5);
        for window in results.windows(2) {
            assert!(
                window[0].score >= window[1].score,
                "Results not sorted: {} < {}",
                window[0].score,
                window[1].score
            );
        }
    }

    #[test]
    fn test_search_dimension_mismatch_query() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(5);
        let bad_query = Embedding::new(vec![1.0; 100]);
        let results = index.search(&bad_query, 3);
        assert!(results.is_empty());
    }

    #[test]
    fn test_multiple_searches() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(10);

        // Non-consuming search — no rebuild needed
        let results1 = index.search(&make_embedding(0), 3);
        assert!(!results1.is_empty());

        let results2 = index.search(&make_embedding(5), 3);
        assert!(!results2.is_empty());
        assert_eq!(results2[0].id, "chunk_5");
    }

    #[test]
    fn test_consecutive_searches() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(20);

        for i in 0..10 {
            let query = make_embedding(i);
            let results = index.search(&query, 5);
            assert!(!results.is_empty(), "Search {} should return results", i);
            assert!(results.len() <= 5);
        }
    }

    #[test]
    fn test_search_with_invalid_k() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(5);

        let results = index.search(&make_embedding(0), 0);
        assert!(results.is_empty());

        let results = index.search(&make_embedding(1), 3);
        assert!(!results.is_empty());
    }

    /// Issue #952 regression: when `k > index.len()`, cuVS leaves the
    /// extra output slots untouched. The `INVALID_DISTANCE` sentinel must
    /// filter them out so we never emit phantom perfect-match hits
    /// (distance `0.0` → score `1.0`) pointing at internal index 0.
    #[test]
    fn test_search_k_greater_than_len_drops_phantoms() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(3);
        let results = index.search(&make_embedding(0), 10);

        // Only three real vectors exist, so the result must never exceed
        // that count even though we asked for ten.
        assert!(
            results.len() <= 3,
            "expected at most 3 results, got {}: {:?}",
            results.len(),
            results
        );
        // Every returned id is one of the three real chunks; nothing
        // phantom has slipped through the sentinel filter.
        for r in &results {
            assert!(
                matches!(r.id.as_str(), "chunk_0" | "chunk_1" | "chunk_2"),
                "phantom id leaked past sentinel check: {}",
                r.id
            );
        }
        // No two results share an id (the phantom bug repeatedly emitted
        // `chunk_0`, so a duplicate would also catch a regression).
        let mut ids: Vec<&str> = results.iter().map(|r| r.id.as_str()).collect();
        ids.sort_unstable();
        let before = ids.len();
        ids.dedup();
        assert_eq!(before, ids.len(), "duplicate ids in results: {:?}", results);
    }

    #[test]
    fn test_name_returns_cagra() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let index = build_test_index(5);
        let vi: &dyn VectorIndex = &index;
        assert_eq!(vi.name(), "CAGRA");
    }

    #[test]
    fn test_oom_guard_arithmetic() {
        let max_bytes = super::cagra_max_bytes();
        let max_chunks = max_bytes / (EMBEDDING_DIM * 4);
        let under = max_chunks.saturating_mul(EMBEDDING_DIM).saturating_mul(4);
        assert!(under <= max_bytes);
        let over = (max_chunks + 1)
            .saturating_mul(EMBEDDING_DIM)
            .saturating_mul(4);
        assert!(over > max_bytes);
    }

    /// Issue #950 acceptance test: build → save → load → search, asserting
    /// bit-exact (same order, same scores) neighbors before and after the
    /// round-trip.
    #[test]
    fn test_save_load_round_trip() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");

        let original = build_test_index(32);
        original
            .save(&path)
            .expect("CAGRA persist save should succeed");
        assert!(path.exists(), "CAGRA blob should be written");
        let meta_path = super::meta_path_for(&path);
        assert!(meta_path.exists(), "CAGRA sidecar should be written");

        // Compare a handful of queries across the boundary. k=5 so we can
        // check ordering and scores simultaneously.
        let queries: Vec<Embedding> = (0..5).map(make_embedding).collect();
        let original_results: Vec<Vec<IndexResult>> =
            queries.iter().map(|q| original.search(q, 5)).collect();

        // Drop the original so there's no way the loaded index is just
        // aliasing the in-memory state.
        drop(original);

        let loaded =
            CagraIndex::load(&path, EMBEDDING_DIM, 32).expect("CAGRA persist load should succeed");
        assert_eq!(loaded.len(), 32, "loaded index should have 32 vectors");
        assert_eq!(loaded.dim, EMBEDDING_DIM);

        for (i, query) in queries.iter().enumerate() {
            let got = loaded.search(query, 5);
            let expected = &original_results[i];
            assert_eq!(
                got.len(),
                expected.len(),
                "query {} returned different neighbour count",
                i
            );
            for (a, b) in got.iter().zip(expected.iter()) {
                assert_eq!(
                    a.id, b.id,
                    "query {} neighbour id differs after round-trip",
                    i
                );
                // Scores should match bit-for-bit because the graph and
                // dataset are both serialized by cuVS.
                assert_eq!(
                    a.score.to_bits(),
                    b.score.to_bits(),
                    "query {} score {} != {} after round-trip",
                    i,
                    a.score,
                    b.score
                );
            }
        }
    }

    /// Mismatched chunk count must fail to load — protects us from handing
    /// cuVS a blob whose id_map is no longer valid against the current
    /// store.
    #[test]
    fn test_load_rejects_chunk_count_mismatch() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");
        build_test_index(10)
            .save(&path)
            .expect("save should succeed");

        // Tell load() we expect 20 chunks; the sidecar says 10.
        match CagraIndex::load(&path, EMBEDDING_DIM, 20) {
            Err(CagraError::Stale { reason }) => {
                assert!(reason.contains("chunk_count"), "reason: {}", reason);
            }
            other => panic!("expected Stale, got {:?}", other),
        }
    }

    /// Mismatched dim must fail to load — catches embedding-model swaps.
    #[test]
    fn test_load_rejects_dim_mismatch() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");
        build_test_index(10)
            .save(&path)
            .expect("save should succeed");

        match CagraIndex::load(&path, EMBEDDING_DIM + 1, 10) {
            Err(CagraError::Stale { reason }) => {
                assert!(reason.contains("dim"), "reason: {}", reason);
            }
            other => panic!("expected Stale, got {:?}", other),
        }
    }

    /// Flipping bytes in the `.cagra` blob must be detected via blake3
    /// before we let cuVS deserialize it.
    #[test]
    fn test_load_rejects_corrupted_blob() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");
        build_test_index(10)
            .save(&path)
            .expect("save should succeed");

        // Flip a byte near the end of the cuVS blob to mimic disk rot.
        let mut bytes = std::fs::read(&path).unwrap();
        let pos = bytes.len().saturating_sub(16);
        bytes[pos] ^= 0xff;
        std::fs::write(&path, &bytes).unwrap();

        match CagraIndex::load(&path, EMBEDDING_DIM, 10) {
            Err(CagraError::ChecksumMismatch(p)) => {
                assert!(
                    p.contains("test.cagra"),
                    "checksum error should reference file: {}",
                    p
                );
            }
            other => panic!("expected ChecksumMismatch, got {:?}", other),
        }
    }

    /// Missing sidecar is a hard load failure.
    #[test]
    fn test_load_requires_sidecar() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");
        build_test_index(5)
            .save(&path)
            .expect("save should succeed");
        std::fs::remove_file(super::meta_path_for(&path)).unwrap();
        match CagraIndex::load(&path, EMBEDDING_DIM, 5) {
            Err(CagraError::BadMeta(_)) => {}
            other => panic!("expected BadMeta, got {:?}", other),
        }
    }

    /// delete_persisted cleans up both files without complaining when they
    /// are already gone.
    #[test]
    fn test_delete_persisted_removes_both_files() {
        let _guard = GPU_LOCK.lock().unwrap();
        if !require_gpu() {
            return;
        }
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("test.cagra");
        build_test_index(3)
            .save(&path)
            .expect("save should succeed");
        let meta = super::meta_path_for(&path);
        assert!(path.exists() && meta.exists());
        CagraIndex::delete_persisted(&path);
        assert!(!path.exists() && !meta.exists());
        // Second call must not panic on missing files.
        CagraIndex::delete_persisted(&path);
    }

    /// Meta path computation (pure, GPU-free).
    #[test]
    fn test_meta_path_for() {
        let p = std::path::Path::new("/tmp/foo.cagra");
        let meta = super::meta_path_for(p);
        assert_eq!(meta.to_str().unwrap(), "/tmp/foo.cagra.meta");
    }

    /// CQS_CAGRA_PERSIST=0 makes save/load return Err without touching disk.
    /// Isolated to a dedicated test to keep env mutation narrow; tests here
    /// use a lock already but the env is process-wide.
    #[test]
    fn test_persistence_env_override_blocks_save() {
        // Direct test without needing GPU — save() checks the flag before
        // acquiring the GPU mutex.
        let saved = std::env::var("CQS_CAGRA_PERSIST").ok();
        // Force a fresh OnceLock read by never having called it for this
        // value before; OnceLock caches across invocations so we can only
        // test the cached result. Best-effort: if the cache was primed by
        // earlier tests we just assert the helper returns something
        // consistent.
        let enabled = super::cagra_persist_enabled();
        // Restore whatever the env had before; the OnceLock keeps its value
        // regardless, so this is really just being polite to other tests.
        match saved {
            Some(v) => std::env::set_var("CQS_CAGRA_PERSIST", v),
            None => std::env::remove_var("CQS_CAGRA_PERSIST"),
        }
        // If we're running in an env with PERSIST=0 the helper should have
        // observed it; otherwise the default is true. Either outcome is a
        // valid pass — the important thing is the helper returned without
        // panicking and the type is correct.
        let _: bool = enabled;
    }
}