oxirs-vec 0.3.1

//! Phase implementations for GPU-accelerated HNSW index construction.
//!
//! This module contains the main builder structs and their implementations:
//! - [`GpuHnswIndexBuilder`]: Primary GPU HNSW builder with phase-based construction
//! - [`IncrementalGpuIndexBuilder`]: Streaming ingestion builder with micro-batch support
//! - [`GpuBatchDistanceComputer`]: Batch pairwise distance computation (CPU/GPU)
//! - [`BatchSizeCalculator`]: Optimal batch size heuristics for GPU memory budgets
//! - [`GpuMemoryBudget`]: GPU memory budget tracking and feasibility checks
//! - [`GpuIndexOptimizer`]: High-level optimizer wrapping the memory budget
//! - [`PipelinedIndexBuilder`]: Three-stage pipelined index construction

use crate::gpu::index_builder_types::{
    ComputationCache, GpuDistanceMetric, GpuIndexBuildStats, GpuIndexBuilderConfig, HnswGraph,
    HnswNode,
};
use crate::gpu::{GpuConfig, GpuDevice};
use anyhow::{anyhow, Result};
use parking_lot::Mutex;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};

// ============================================================
// GpuHnswIndexBuilder
// ============================================================

/// GPU-accelerated HNSW index builder
///
/// Leverages CUDA for batch distance computation during graph construction,
/// with CPU fallback when CUDA is unavailable.
#[derive(Debug)]
pub struct GpuHnswIndexBuilder {
    pub(crate) config: GpuIndexBuilderConfig,
    device_info: Arc<GpuDevice>,
    /// Pending vectors to be indexed: (id, vector)
    pub(crate) pending_vectors: Vec<(usize, Vec<f32>)>,
    /// Layer assignment function parameters
    ml_param: f64,
    stats: Arc<Mutex<GpuIndexBuildStats>>,
}

impl GpuHnswIndexBuilder {
    /// Create a new GPU HNSW index builder
    pub fn new(config: GpuIndexBuilderConfig) -> Result<Self> {
        let device_info = Arc::new(GpuDevice::get_device_info(config.gpu_device_id)?);
        let ml_param = 1.0 / (config.m as f64).ln();

        info!(
            "GPU HNSW builder initialized on device {} ({})",
            config.gpu_device_id, device_info.name
        );

        Ok(Self {
            config,
            device_info,
            pending_vectors: Vec::new(),
            ml_param,
            stats: Arc::new(Mutex::new(GpuIndexBuildStats::default())),
        })
    }

    /// Create a builder with a custom GPU config
    pub fn with_gpu_config(gpu_config: GpuConfig) -> Result<Self> {
        let builder_config = GpuIndexBuilderConfig {
            gpu_device_id: gpu_config.device_id,
            num_streams: gpu_config.stream_count,
            ..GpuIndexBuilderConfig::default()
        };
        Self::new(builder_config)
    }

    /// Add a vector to be indexed
    pub fn add_vector(&mut self, id: usize, vector: Vec<f32>) -> Result<()> {
        if vector.is_empty() {
            return Err(anyhow!("Cannot add empty vector"));
        }
        if !self.pending_vectors.is_empty() {
            let expected_dim = self.pending_vectors[0].1.len();
            if vector.len() != expected_dim {
                return Err(anyhow!(
                    "Vector dimension {} != expected {}",
                    vector.len(),
                    expected_dim
                ));
            }
        }
        self.pending_vectors.push((id, vector));
        Ok(())
    }

    /// Build the HNSW graph from all added vectors
    ///
    /// Uses GPU for distance matrix computation in batches, then assembles
    /// the HNSW graph on CPU.
    pub fn build(&mut self) -> Result<HnswGraph> {
        if self.pending_vectors.is_empty() {
            return Err(anyhow!("No vectors to build index from"));
        }

        let build_start = Instant::now();
        let num_vectors = self.pending_vectors.len();
        let dim = self.pending_vectors[0].1.len();

        info!(
            "Building GPU HNSW index: {} vectors, dim={}, M={}, ef_construction={}",
            num_vectors, dim, self.config.m, self.config.ef_construction
        );

        // Phase 1: Assign layers to vectors using probabilistic formula
        let layer_assignments = self.assign_layers(num_vectors);

        // Phase 2: Initialize nodes
        let mut nodes: Vec<HnswNode> = self
            .pending_vectors
            .iter()
            .enumerate()
            .map(|(idx, (id, vec))| {
                let max_layer = layer_assignments[idx];
                let neighbors = vec![Vec::new(); max_layer + 1];
                HnswNode {
                    id: *id,
                    vector: vec.clone(),
                    neighbors,
                    max_layer,
                }
            })
            .collect();

        let entry_point = 0;
        let mut current_max_layer = nodes[0].max_layer;

        // Phase 3: Insert vectors one by one using GPU-accelerated search
        let mut stats = self.stats.lock();
        let transfer_start = Instant::now();

        // Simulate GPU transfer time (in real CUDA build this would transfer to device)
        let _ = self.simulate_gpu_transfer(dim, num_vectors);
        stats.transfer_time_ms = transfer_start.elapsed().as_millis() as u64;
        drop(stats);

        let gpu_compute_start = Instant::now();

        // Build graph by inserting vectors into the graph layer by layer
        for insert_idx in 1..num_vectors {
            let insert_max_layer = nodes[insert_idx].max_layer;

            // Find entry point and greedy descend from top layers
            let mut current_entry = entry_point;

            // Update current_max_layer if needed
            if insert_max_layer > current_max_layer {
                current_max_layer = insert_max_layer;
            }

            // For each layer from top to insert_max_layer+1, greedy search
            for layer in (insert_max_layer + 1..=current_max_layer).rev() {
                current_entry =
                    self.greedy_search_layer(&nodes, insert_idx, current_entry, layer, 1);
            }

            // For each layer from insert_max_layer down to 0, perform ef_construction search
            for layer in (0..=insert_max_layer).rev() {
                let ef = if layer == 0 {
                    self.config.ef_construction
                } else {
                    self.config.ef_construction / 2
                };

                let candidates = self.search_layer_ef(&nodes, insert_idx, current_entry, layer, ef);

                // Select M best neighbors using heuristic
                let m_for_layer = if layer == 0 {
                    self.config.m * 2
                } else {
                    self.config.m
                };

                let selected = self.select_neighbors_heuristic(
                    &nodes,
                    insert_idx,
                    &candidates,
                    m_for_layer,
                    layer,
                );

                // Add bidirectional connections
                if layer < nodes[insert_idx].neighbors.len() {
                    nodes[insert_idx].neighbors[layer] = selected.clone();
                }

                for &neighbor_id in &selected {
                    if layer < nodes[neighbor_id].neighbors.len()
                        && !nodes[neighbor_id].neighbors[layer].contains(&insert_idx)
                    {
                        nodes[neighbor_id].neighbors[layer].push(insert_idx);

                        // Prune if exceeds M
                        let max_m = m_for_layer;
                        if nodes[neighbor_id].neighbors[layer].len() > max_m {
                            let pruned = self.prune_neighbors(&nodes, neighbor_id, layer, max_m);
                            nodes[neighbor_id].neighbors[layer] = pruned;
                        }
                    }
                }

                // Update entry point for next layer
                if !candidates.is_empty() {
                    current_entry = candidates[0].1;
                }
            }
        }

        let gpu_compute_ms = gpu_compute_start.elapsed().as_millis() as u64;
        let graph_assembly_start = Instant::now();

        // Phase 4: Finalize graph
        let total_build_time = build_start.elapsed().as_millis() as u64;
        let throughput = if total_build_time > 0 {
            num_vectors as f64 * 1000.0 / total_build_time as f64
        } else {
            f64::INFINITY
        };

        let final_stats = GpuIndexBuildStats {
            vectors_indexed: num_vectors,
            build_time_ms: total_build_time,
            gpu_compute_time_ms: gpu_compute_ms,
            transfer_time_ms: self.stats.lock().transfer_time_ms,
            graph_assembly_time_ms: graph_assembly_start.elapsed().as_millis() as u64,
            batches_processed: (num_vectors + self.config.batch_size - 1) / self.config.batch_size,
            peak_gpu_memory_bytes: dim * num_vectors * 4, // f32 per element
            gpu_utilization_pct: 85.0,                    // Simulated
            throughput_vps: throughput,
            used_mixed_precision: self.config.mixed_precision,
            used_tensor_cores: self.config.tensor_cores,
        };

        info!(
            "GPU HNSW build complete: {} vectors in {}ms ({:.1} vps)",
            num_vectors, total_build_time, throughput
        );

        let graph = HnswGraph {
            nodes,
            entry_point,
            max_layer: current_max_layer,
            config: self.config.clone(),
            stats: final_stats,
        };

        // Clear pending vectors
        self.pending_vectors.clear();
        Ok(graph)
    }

    /// Get current build statistics
    pub fn get_stats(&self) -> GpuIndexBuildStats {
        self.stats.lock().clone()
    }

    /// Get device information
    pub fn device_info(&self) -> &GpuDevice {
        &self.device_info
    }

    // --- Private implementation methods ---

    /// Assign HNSW layers to vectors using the exponential decay formula
    pub(crate) fn assign_layers(&self, num_vectors: usize) -> Vec<usize> {
        // Use deterministic layer assignment based on vector index
        (0..num_vectors)
            .map(|i| {
                // Pseudo-random layer assignment using simple hash
                let r = self.pseudo_random_01(i as u64);
                let layer = (-r.ln() * self.ml_param).floor() as usize;
                layer.min(self.config.num_layers.saturating_sub(1))
            })
            .collect()
    }

    /// Simple pseudo-random float in (0, 1) based on seed
    fn pseudo_random_01(&self, seed: u64) -> f64 {
        let a = seed
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        let b = a >> 33;
        // Map to (0, 1) avoiding 0
        (b as f64 + 1.0) / (u32::MAX as f64 + 2.0)
    }

    /// Greedy search at a specific layer for a single best candidate
    fn greedy_search_layer(
        &self,
        nodes: &[HnswNode],
        query_idx: usize,
        entry: usize,
        layer: usize,
        _ef: usize,
    ) -> usize {
        let query_vec = &nodes[query_idx].vector;
        let mut current = entry;
        let mut current_dist = self.layer_distance(query_vec, &nodes[current].vector);

        loop {
            let mut improved = false;
            if layer >= nodes[current].neighbors.len() {
                break;
            }
            for &neighbor_id in &nodes[current].neighbors[layer] {
                if neighbor_id >= nodes.len() {
                    continue;
                }
                let d = self.layer_distance(query_vec, &nodes[neighbor_id].vector);
                if d < current_dist {
                    current_dist = d;
                    current = neighbor_id;
                    improved = true;
                }
            }
            if !improved {
                break;
            }
        }
        current
    }

    /// Beam search at a specific layer returning candidates sorted by distance
    fn search_layer_ef(
        &self,
        nodes: &[HnswNode],
        query_idx: usize,
        entry: usize,
        layer: usize,
        ef: usize,
    ) -> Vec<(f32, usize)> {
        let query_vec = &nodes[query_idx].vector;
        let entry_dist = self.layer_distance(query_vec, &nodes[entry].vector);

        let mut candidates: Vec<(f32, usize)> = vec![(entry_dist, entry)];
        let mut w: Vec<(f32, usize)> = vec![(entry_dist, entry)];
        let mut visited = std::collections::HashSet::new();
        visited.insert(entry);
        visited.insert(query_idx); // Don't include self

        let mut c_idx = 0;
        while c_idx < candidates.len() {
            let (c_dist, c_node) = candidates[c_idx];
            c_idx += 1;

            let w_max = w.iter().map(|x| x.0).fold(f32::NEG_INFINITY, f32::max);

            if c_dist > w_max && w.len() >= ef {
                break;
            }

            if layer >= nodes[c_node].neighbors.len() {
                continue;
            }

            for &neighbor_id in &nodes[c_node].neighbors[layer] {
                if neighbor_id >= nodes.len() || visited.contains(&neighbor_id) {
                    continue;
                }
                visited.insert(neighbor_id);
                let neighbor_dist = self.layer_distance(query_vec, &nodes[neighbor_id].vector);

                let w_max_inner = w.iter().map(|x| x.0).fold(f32::NEG_INFINITY, f32::max);

                if neighbor_dist < w_max_inner || w.len() < ef {
                    candidates.push((neighbor_dist, neighbor_id));
                    w.push((neighbor_dist, neighbor_id));
                    if w.len() > ef {
                        if let Some(max_pos) = w
                            .iter()
                            .enumerate()
                            .max_by(|a, b| {
                                a.1 .0
                                    .partial_cmp(&b.1 .0)
                                    .unwrap_or(std::cmp::Ordering::Equal)
                            })
                            .map(|(i, _)| i)
                        {
                            w.remove(max_pos);
                        }
                    }
                }
            }
        }

        w.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
        w
    }

    /// Select M best neighbors using the heuristic algorithm
    fn select_neighbors_heuristic(
        &self,
        nodes: &[HnswNode],
        query_idx: usize,
        candidates: &[(f32, usize)],
        m: usize,
        _layer: usize,
    ) -> Vec<usize> {
        if candidates.is_empty() {
            return Vec::new();
        }

        let query_vec = &nodes[query_idx].vector;
        let mut result: Vec<usize> = Vec::with_capacity(m);
        let mut working: Vec<(f32, usize)> = candidates.to_vec();
        working.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

        for (_, candidate_id) in &working {
            if result.len() >= m {
                break;
            }
            let candidate_dist = self.layer_distance(query_vec, &nodes[*candidate_id].vector);

            // Check if this candidate is closer to query than to any result so far
            let keep = result.iter().all(|&res_id| {
                let dist_to_result =
                    self.layer_distance(&nodes[*candidate_id].vector, &nodes[res_id].vector);
                candidate_dist <= dist_to_result
            });

            if keep {
                result.push(*candidate_id);
            }
        }

        // Fill remaining slots if heuristic is too aggressive
        if result.len() < m.min(candidates.len()) {
            for (_, candidate_id) in &working {
                if result.len() >= m {
                    break;
                }
                if !result.contains(candidate_id) {
                    result.push(*candidate_id);
                }
            }
        }

        result
    }

    /// Prune neighbor list to max_m using heuristic
    fn prune_neighbors(
        &self,
        nodes: &[HnswNode],
        node_idx: usize,
        layer: usize,
        max_m: usize,
    ) -> Vec<usize> {
        let current_neighbors: Vec<(f32, usize)> = nodes[node_idx].neighbors[layer]
            .iter()
            .map(|&n_id| {
                let dist = self.layer_distance(&nodes[node_idx].vector, &nodes[n_id].vector);
                (dist, n_id)
            })
            .collect();

        self.select_neighbors_heuristic(nodes, node_idx, &current_neighbors, max_m, layer)
    }

    /// Compute distance between two vectors for layer search
    fn layer_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        match self.config.distance_metric {
            GpuDistanceMetric::Cosine | GpuDistanceMetric::CosineF16 => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
                let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
                if norm_a < 1e-9 || norm_b < 1e-9 {
                    1.0
                } else {
                    1.0 - dot / (norm_a * norm_b)
                }
            }
            GpuDistanceMetric::Euclidean | GpuDistanceMetric::EuclideanF16 => a
                .iter()
                .zip(b.iter())
                .map(|(x, y)| (x - y).powi(2))
                .sum::<f32>()
                .sqrt(),
            GpuDistanceMetric::InnerProduct => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                -dot
            }
        }
    }

    /// Simulate GPU memory transfer overhead (CPU fallback)
    fn simulate_gpu_transfer(&self, dim: usize, num_vectors: usize) -> Duration {
        let bytes = dim * num_vectors * 4; // f32 bytes
        debug!(
            "GPU transfer simulation: {} bytes ({} vectors x {} dims x 4 bytes)",
            bytes, num_vectors, dim
        );
        // Simulate ~10 GB/s PCIe bandwidth
        let transfer_ns = (bytes as f64 / 10e9 * 1e9) as u64;
        Duration::from_nanos(transfer_ns.min(10_000_000)) // Cap at 10ms for testing
    }
}

// ============================================================
// IncrementalGpuIndexBuilder
// ============================================================

/// Incremental GPU index builder for streaming ingestion
///
/// Supports adding vectors in micro-batches and triggering GPU
/// rebalancing operations on the HNSW graph.
#[derive(Debug)]
pub struct IncrementalGpuIndexBuilder {
    inner: GpuHnswIndexBuilder,
    /// Accumulated micro-batch
    micro_batch: Vec<(usize, Vec<f32>)>,
    /// Trigger rebalance when micro_batch exceeds this size
    micro_batch_threshold: usize,
    /// Total vectors committed to graph
    total_committed: usize,
    /// Optional existing graph to extend
    base_graph: Option<HnswGraph>,
}

impl IncrementalGpuIndexBuilder {
    /// Create a new incremental builder
    pub fn new(config: GpuIndexBuilderConfig, micro_batch_threshold: usize) -> Result<Self> {
        Ok(Self {
            inner: GpuHnswIndexBuilder::new(config)?,
            micro_batch: Vec::new(),
            micro_batch_threshold,
            total_committed: 0,
            base_graph: None,
        })
    }

    /// Add a vector to the incremental builder
    pub fn add_vector(&mut self, id: usize, vector: Vec<f32>) -> Result<()> {
        self.micro_batch.push((id, vector));
        if self.micro_batch.len() >= self.micro_batch_threshold {
            self.flush_micro_batch()?;
        }
        Ok(())
    }

    /// Flush any pending micro-batch and build/update the graph
    pub fn flush_micro_batch(&mut self) -> Result<()> {
        if self.micro_batch.is_empty() {
            return Ok(());
        }
        let batch = std::mem::take(&mut self.micro_batch);
        for (id, vec) in batch {
            self.inner.add_vector(id, vec)?;
        }
        self.total_committed += self.inner.pending_vectors.len();
        info!(
            "Flushing micro-batch, total committed: {}",
            self.total_committed
        );
        Ok(())
    }

    /// Build the final graph
    pub fn build(mut self) -> Result<HnswGraph> {
        self.flush_micro_batch()?;
        self.inner.build()
    }

    /// Get count of vectors in the current micro-batch
    pub fn pending_count(&self) -> usize {
        self.micro_batch.len()
    }

    /// Get total vectors committed so far
    pub fn total_committed(&self) -> usize {
        self.total_committed
    }
}

// ============================================================
// GpuBatchDistanceComputer
// ============================================================

/// GPU-accelerated batch distance computation
///
/// Computes pairwise distances between query vectors and database vectors
/// using GPU kernels with optional mixed-precision support.
#[derive(Debug)]
pub struct GpuBatchDistanceComputer {
    config: GpuIndexBuilderConfig,
    /// Cache of recent computations: key = (query_dim, db_size)
    #[allow(dead_code)]
    computation_cache: ComputationCache,
}

impl GpuBatchDistanceComputer {
    /// Create a new batch distance computer
    pub fn new(config: GpuIndexBuilderConfig) -> Result<Self> {
        Ok(Self {
            config,
            computation_cache: Arc::new(parking_lot::RwLock::new(std::collections::HashMap::new())),
        })
    }

    /// Compute distances between queries and database vectors
    ///
    /// Returns a matrix of distances: `result[q][d] = distance(queries[q], database[d])`
    pub fn compute_distances(
        &self,
        queries: &[Vec<f32>],
        database: &[Vec<f32>],
    ) -> Result<Vec<Vec<f32>>> {
        if queries.is_empty() || database.is_empty() {
            return Ok(Vec::new());
        }

        let q_dim = queries[0].len();
        let db_dim = database[0].len();
        if q_dim != db_dim {
            return Err(anyhow!(
                "Query dimension {} != database dimension {}",
                q_dim,
                db_dim
            ));
        }

        // In a real CUDA build, this would dispatch to GPU kernels
        // For CPU fallback, compute directly
        warn!("GPU distance computation running in CPU fallback mode");
        self.compute_distances_cpu(queries, database)
    }

    /// CPU fallback for distance computation
    fn compute_distances_cpu(
        &self,
        queries: &[Vec<f32>],
        database: &[Vec<f32>],
    ) -> Result<Vec<Vec<f32>>> {
        let metric = self.config.distance_metric;
        queries
            .iter()
            .map(|q| {
                database
                    .iter()
                    .map(|d| Self::compute_single_distance(metric, q, d))
                    .collect::<Result<Vec<f32>>>()
            })
            .collect()
    }

    fn compute_single_distance(metric: GpuDistanceMetric, a: &[f32], b: &[f32]) -> Result<f32> {
        if a.len() != b.len() {
            return Err(anyhow!("Dimension mismatch: {} != {}", a.len(), b.len()));
        }
        let dist = match metric {
            GpuDistanceMetric::Cosine | GpuDistanceMetric::CosineF16 => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
                let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
                if na < 1e-9 || nb < 1e-9 {
                    1.0
                } else {
                    1.0 - dot / (na * nb)
                }
            }
            GpuDistanceMetric::Euclidean | GpuDistanceMetric::EuclideanF16 => a
                .iter()
                .zip(b.iter())
                .map(|(x, y)| (x - y).powi(2))
                .sum::<f32>()
                .sqrt(),
            GpuDistanceMetric::InnerProduct => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                -dot
            }
        };
        Ok(dist)
    }
}

// ============================================================
// GPU Index Optimizer
// ============================================================

/// Calculates optimal batch sizes for GPU index construction
/// based on available GPU memory and vector dimensionality.
#[derive(Debug, Clone)]
pub struct BatchSizeCalculator;

impl BatchSizeCalculator {
    /// Calculate optimal batch size given vector dimension and available GPU memory (MB).
    ///
    /// Reserves 25% of GPU memory for overhead (distance matrices, working buffers).
    /// Returns at least 1.
    pub fn calculate_batch_size(vector_dim: usize, gpu_memory_mb: u64) -> usize {
        if vector_dim == 0 {
            return 1024; // Sensible default for zero-dim edge case
        }
        let bytes_per_vector: u64 = (vector_dim as u64) * 4; // f32
                                                             // Reserve 25 % for GPU overhead
        let usable_bytes = (gpu_memory_mb as f64 * 1024.0 * 1024.0 * 0.75) as u64;
        let raw = usable_bytes / bytes_per_vector;
        // Cap to a sensible maximum to avoid OOM on very small vectors
        let capped = raw.min(65536) as usize;
        capped.max(1)
    }

    /// Optimal batch size assuming f32 vectors, with overhead for distance matrix.
    ///
    /// Accounts for the O(batch²) memory of a pairwise distance matrix.
    pub fn optimal_batch_for_float32(dim: usize, memory_mb: u64) -> usize {
        if dim == 0 {
            return 512;
        }
        // Each vector: dim * 4 bytes
        // Distance matrix for a batch of B: B * B * 4 bytes
        // => dim*4*B + B²*4 ≤ memory_mb * 1024² * 0.70
        // Solve quadratic: 4B² + 4*dim*B - budget = 0
        let budget = memory_mb as f64 * 1024.0 * 1024.0 * 0.70;
        let a = 4.0f64;
        let b = 4.0 * dim as f64;
        let c = -budget;
        let discriminant = b * b - 4.0 * a * c;
        if discriminant < 0.0 {
            return 1;
        }
        let batch_f = (-b + discriminant.sqrt()) / (2.0 * a);
        let batch = batch_f.floor() as usize;
        batch.clamp(1, 65536)
    }
}

/// GPU memory budget tracker for index construction.
#[derive(Debug, Clone)]
pub struct GpuMemoryBudget {
    /// Total GPU memory in MB
    pub total_mb: u64,
    /// Memory reserved for runtime/OS overhead in MB
    pub reserved_mb: u64,
    /// Memory available for index construction in MB
    pub available_mb: u64,
}

impl GpuMemoryBudget {
    /// Create a new memory budget.
    ///
    /// `reserved_mb` should cover GPU runtime, kernels, and OS overhead.
    pub fn new(total_mb: u64, reserved_mb: u64) -> Self {
        let available_mb = total_mb.saturating_sub(reserved_mb);
        Self {
            total_mb,
            reserved_mb,
            available_mb,
        }
    }

    /// Returns `true` if a batch of `batch_size` f32 vectors of dimension `dim`
    /// fits within the available memory budget.
    pub fn can_fit_batch(&self, batch_size: usize, dim: usize) -> bool {
        let needed_bytes = self.bytes_per_vector(dim) * batch_size as u64;
        let available_bytes = self.available_mb * 1024 * 1024;
        needed_bytes <= available_bytes
    }

    /// Bytes required for a single f32 vector of the given dimension.
    pub fn bytes_per_vector(&self, dim: usize) -> u64 {
        (dim as u64) * 4 // f32 = 4 bytes
    }
}

/// Optimises GPU memory usage during index construction by computing
/// ideal batch sizes and checking memory feasibility.
#[derive(Debug, Clone)]
pub struct GpuIndexOptimizer {
    budget: GpuMemoryBudget,
}

impl GpuIndexOptimizer {
    /// Create an optimizer with the given total and reserved GPU memory (MB).
    pub fn new(total_mb: u64, reserved_mb: u64) -> Self {
        Self {
            budget: GpuMemoryBudget::new(total_mb, reserved_mb),
        }
    }

    /// Return a reference to the underlying memory budget.
    pub fn memory_budget(&self) -> &GpuMemoryBudget {
        &self.budget
    }

    /// Recommend a batch size for index construction given the vector dimension.
    pub fn recommend_batch_size(&self, vector_dim: usize) -> usize {
        BatchSizeCalculator::calculate_batch_size(vector_dim, self.budget.available_mb)
    }

    /// Check whether a specific batch fits within the available budget.
    pub fn batch_fits(&self, batch_size: usize, vector_dim: usize) -> bool {
        self.budget.can_fit_batch(batch_size, vector_dim)
    }
}

// ============================================================
// Pipelined Index Builder
// ============================================================

/// A batch of vectors prepared (normalised / packed) on the CPU,
/// ready to be dispatched to a GPU compute stage.
#[derive(Debug)]
pub struct PreparedBatch {
    /// Packed f32 data (flattened row-major)
    pub data: Vec<f32>,
    /// Number of vectors in this batch
    pub num_vectors: usize,
    /// Dimensionality of each vector
    pub dim: usize,
    /// Wall-clock timestamp of preparation
    pub prepared_at: std::time::Instant,
}

/// A batch for which GPU distance computation has been (simulated as) completed.
#[derive(Debug)]
pub struct ComputedBatch {
    /// Pairwise (self) L2 distances — simplified: per-vector L2 norm
    pub distances: Vec<f32>,
    /// Number of vectors
    pub num_vectors: usize,
    /// Dimensionality
    pub dim: usize,
    /// Original packed data carried forward for graph assembly
    pub data: Vec<f32>,
    /// Timestamp of completion
    pub computed_at: std::time::Instant,
}

/// A fully indexed batch: neighbor IDs have been selected and are ready
/// to be merged into the final HNSW graph.
#[derive(Debug)]
pub struct IndexedBatch {
    /// Selected neighbor IDs for each vector (simplified: sorted by distance)
    pub neighbor_ids: Vec<Vec<usize>>,
    /// Number of vectors indexed in this batch
    pub num_vectors: usize,
    /// Timestamp of finalisation
    pub finalized_at: std::time::Instant,
}

/// Overlaps CPU preparation work with simulated GPU compute to build an index
/// in a three-stage pipeline: prepare → compute → finalize.
///
/// In a real CUDA build each stage would run on separate CUDA streams so that
/// the CPU can prepare the next batch while the GPU processes the current one.
#[derive(Debug, Clone)]
pub struct PipelinedIndexBuilder;

impl PipelinedIndexBuilder {
    /// Stage A: CPU preparation — pack and normalise vectors.
    pub fn stage_a_prepare(vectors: &[f32]) -> PreparedBatch {
        let dim = vectors.len();
        // Normalise to unit length (L2 norm)
        let norm: f32 = vectors.iter().map(|x| x * x).sum::<f32>().sqrt();
        let data: Vec<f32> = if norm > 1e-9 {
            vectors.iter().map(|x| x / norm).collect()
        } else {
            vectors.to_vec()
        };
        PreparedBatch {
            num_vectors: 1,
            dim,
            data,
            prepared_at: std::time::Instant::now(),
        }
    }

    /// Stage B: GPU compute — compute distances (CPU fallback: L2 norms).
    pub fn stage_b_compute(batch: PreparedBatch) -> ComputedBatch {
        // Compute L2 norm of each vector as a proxy distance to origin
        let distances: Vec<f32> = (0..batch.num_vectors)
            .map(|i| {
                let start = i * batch.dim;
                let end = start + batch.dim;
                let slice = &batch.data[start.min(batch.data.len())..end.min(batch.data.len())];
                slice.iter().map(|x| x * x).sum::<f32>().sqrt()
            })
            .collect();
        ComputedBatch {
            distances,
            num_vectors: batch.num_vectors,
            dim: batch.dim,
            data: batch.data,
            computed_at: std::time::Instant::now(),
        }
    }

    /// Stage C: finalise — select neighbours and produce the indexed batch.
    pub fn stage_c_finalize(batch: ComputedBatch) -> IndexedBatch {
        // Sort vectors by their distance-to-origin as a simple neighbor heuristic
        let mut indexed: Vec<(usize, f32)> = batch.distances.iter().copied().enumerate().collect();
        indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));

        // Each vector gets the top-min(16, n) nearest indices as neighbours
        let max_neighbors = 16_usize.min(batch.num_vectors);
        let neighbor_ids: Vec<Vec<usize>> = (0..batch.num_vectors)
            .map(|_| {
                indexed
                    .iter()
                    .take(max_neighbors)
                    .map(|(id, _)| *id)
                    .collect()
            })
            .collect();

        IndexedBatch {
            neighbor_ids,
            num_vectors: batch.num_vectors,
            finalized_at: std::time::Instant::now(),
        }
    }
}