aprender-serve 0.51.0

impl OwnedQuantizedModel {

    /// PMAT-880: Fail-closed guard for GQA KV-cache dimension consistency.
    ///
    /// The GQA cached-attention kernels index the KV cache as
    /// `k_cache[pos * kv_dim + kv_head * head_dim ..][..head_dim]` and the
    /// current-position K/V as `current_k[kv_head * head_dim ..][..head_dim]`,
    /// where `kv_dim == num_kv_heads * head_dim`. If a model/config carries
    /// inconsistent KV dims (e.g. a KV cache that was not allocated for the
    /// same `kv_dim`, or a `current_k`/`current_v`/`q` shorter than required),
    /// those indices silently read the WRONG memory → garbage attention →
    /// incoherent output, or run past the slice → out-of-bounds.
    ///
    /// The kernels also size the attention output by `q_dim` while indexing it
    /// per Q head, and split the Q heads into `num_heads / num_kv_heads` heads
    /// per KV head. A `q_dim` that is not `num_heads * head_dim`, or a
    /// `num_heads` that is not a whole multiple of `num_kv_heads`, therefore
    /// either indexes past the output or leaves trailing Q heads uncomputed
    /// (zero output for those heads). Both are rejected here as well.
    ///
    /// llama.cpp validates KV-cache shape before attention. This is the same
    /// FAIL-CLOSED class as the shipped garbage/extreme-magnitude beats
    /// (PMAT-744 / PMAT-732): `apr` REJECTS a broken model with a clear error
    /// where llama.cpp/Ollama silently produce garbage. It relates to the
    /// PMAT-749 GQA cache fix — this adds the previously-missing guard.
    ///
    /// The check is O(1) and leaves the happy path byte-identical: a valid GQA
    /// (or MHA) model satisfies every invariant and proceeds unchanged.
    ///
    /// # Errors
    /// Returns [`RealizarError::InvalidConfiguration`] when:
    /// - `head_dim == 0` or `num_kv_heads == 0` (degenerate KV geometry), or
    /// - `kv_dim != num_kv_heads * head_dim` (config invariant violated), or
    /// - `k_cache`/`v_cache` length is not a whole multiple of `kv_dim`, or the
    ///   two caches imply different sequence lengths, or
    /// - `current_k`/`current_v` is shorter than `kv_dim`, or
    /// - `q` is shorter than `q_dim` (`num_heads * head_dim`), or
    /// - `q_dim != num_heads * head_dim` (output size and per-head indexing
    ///   disagree), or
    /// - `num_heads == 0` or `num_heads` is not a whole multiple of
    ///   `num_kv_heads` (Q heads cannot be grouped evenly onto KV heads).
    pub fn validate_gqa_kv_dims(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
    ) -> Result<()> {
        let num_heads = self.config.num_heads;
        let num_kv_heads = self.config.num_kv_heads;
        let head_dim = self.config.head_dim();
        let q_dim = self.config.q_dim();
        let kv_dim = self.config.kv_dim();

        // Degenerate geometry: kv_dim==0 would make the cache_len division
        // (k_cache.len() / kv_dim) panic and every KV index meaningless.
        if head_dim == 0 || num_kv_heads == 0 {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: GQA KV geometry is degenerate (head_dim={head_dim}, \
                 num_kv_heads={num_kv_heads}); kv_dim would be 0 and the KV-cache \
                 stride is undefined"
            )));
        }

        // Core invariant: kv_dim must equal num_kv_heads * head_dim, otherwise the
        // per-position stride used to index the cache is inconsistent with the
        // per-head layout and reads the wrong memory.
        let expected_kv_dim = num_kv_heads * head_dim;
        if kv_dim != expected_kv_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: inconsistent KV dimensions — kv_dim ({kv_dim}) != \
                 num_kv_heads * head_dim ({num_kv_heads} * {head_dim} = {expected_kv_dim}); \
                 the KV cache stride does not match the per-head layout"
            )));
        }

        // The KV cache is laid out as [seq, kv_dim] row-major; its length must be
        // a whole multiple of kv_dim, and K and V must describe the same seq len.
        if k_cache.len() % kv_dim != 0 {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: k_cache length ({}) is not a multiple of kv_dim ({kv_dim}); \
                 the KV cache was not allocated for these dimensions",
                k_cache.len()
            )));
        }
        if v_cache.len() % kv_dim != 0 {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: v_cache length ({}) is not a multiple of kv_dim ({kv_dim}); \
                 the KV cache was not allocated for these dimensions",
                v_cache.len()
            )));
        }
        if k_cache.len() != v_cache.len() {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: k_cache length ({}) != v_cache length ({}); \
                 the K and V caches imply different sequence lengths",
                k_cache.len(),
                v_cache.len()
            )));
        }

        // The current position's K/V are indexed up to kv_dim; they must be long
        // enough or the per-head slice runs out of bounds.
        if current_k.len() < kv_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: current_k length ({}) < kv_dim ({kv_dim}); \
                 the current-position key does not cover all KV heads",
                current_k.len()
            )));
        }
        if current_v.len() < kv_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: current_v length ({}) < kv_dim ({kv_dim}); \
                 the current-position value does not cover all KV heads",
                current_v.len()
            )));
        }

        // The query is indexed up to q_dim = num_heads * head_dim.
        if q.len() < q_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: q length ({}) < q_dim ({q_dim} = {num_heads} * {head_dim}); \
                 the query does not cover all attention heads",
                q.len()
            )));
        }

        // The two Q-side checks below run last on purpose: any input that an
        // earlier check rejects keeps reporting exactly the same error as before.

        // The kernels allocate/zero `q_dim` output elements but address them as
        // `q_head * head_dim ..`, so q_dim must really be num_heads * head_dim.
        // `checked_mul` makes an overflowing product compare unequal instead of
        // panicking in debug builds.
        if num_heads.checked_mul(head_dim) != Some(q_dim) {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: inconsistent Q dimensions — q_dim ({q_dim}) != \
                 num_heads * head_dim ({num_heads} * {head_dim}); \
                 the attention output is sized by q_dim but indexed per head"
            )));
        }

        // The kernels process `num_heads / num_kv_heads` Q heads per KV head. With
        // a remainder (or more KV heads than Q heads) the trailing Q heads are
        // never computed and their output silently stays zero.
        // num_kv_heads != 0 was established by the degenerate-geometry check.
        if num_heads == 0 || num_heads % num_kv_heads != 0 {
            return Err(RealizarError::InvalidConfiguration(format!(
                "PMAT-880: num_heads ({num_heads}) is not a positive multiple of \
                 num_kv_heads ({num_kv_heads}); Q heads cannot be grouped evenly onto KV heads"
            )));
        }

        Ok(())
    }

    /// Compute attention with Grouped Query Attention (GQA) support (IMP-105)
    ///
    /// GQA uses fewer KV heads than Q heads, with multiple Q heads sharing each KV head.
    /// This reduces memory bandwidth and KV cache size for large models.
    ///
    /// # Arguments
    /// * `q` - Query vector for current position [hidden_dim] (num_heads Q heads)
    /// * `k_cache` - Cached keys [cache_len, kv_dim] (num_kv_heads KV heads)
    /// * `v_cache` - Cached values [cache_len, kv_dim] (num_kv_heads KV heads)
    /// * `current_k` - Key for current position [kv_dim]
    /// * `current_v` - Value for current position [kv_dim]
    ///
    /// # Returns
    /// Attention output [hidden_dim]
    ///
    /// # GQA Mapping
    /// Q head i uses KV head (i * num_kv_heads / num_heads)
    /// Example: 8 Q heads, 2 KV heads → Q heads 0-3 use KV head 0, Q heads 4-7 use KV head 1
    pub fn attention_with_cache_gqa(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
    ) -> Vec<f32> {
        let num_heads = self.config.num_heads;
        let num_kv_heads = self.config.num_kv_heads;
        // GH-479: Use config methods (Qwen3 head_dim != hidden/heads)
        let head_dim = self.config.head_dim();
        let q_dim = self.config.q_dim();
        let kv_dim = self.config.kv_dim();
        // PMAT-810: Gemma2 scales by 1/sqrt(query_pre_attn_scalar); every other
        // arch (and gemma-2-2b, key absent) → 1/sqrt(head_dim), byte-identical.
        let scale = self.config.attn_scale();
        // PMAT-810: Gemma2 caps attention logits with `cap*tanh(scores/cap)`
        // (cap=50) BEFORE softmax. `None` for every other arch → no-op.
        let attn_softcap = self.config.attn_logit_softcap();

        // Number of Q heads that share each KV head
        let q_per_kv = num_heads / num_kv_heads;

        // Total sequence length = cached + 1 (current)
        let cache_len = if kv_dim > 0 {
            k_cache.len() / kv_dim
        } else {
            0
        };
        let total_len = cache_len + 1;

        let mut output = vec![0.0f32; q_dim];

        // Score buffer for the current group.
        // Size: q_per_kv * total_len.
        // We reuse this buffer for each KV group to minimize allocation.
        let mut group_scores = vec![0.0f32; q_per_kv * total_len];

        // Process each KV head group (OPTIMIZATION: Scan KV cache once per group)
        for kv_head in 0..num_kv_heads {
            let kv_head_offset = kv_head * head_dim;

            // 1. Compute Scores (Scan K Cache Once)
            for pos in 0..cache_len {
                let k_start = pos * kv_dim + kv_head_offset;
                let cached_key = &k_cache[k_start..k_start + head_dim];

                // For each Q head in this group
                for i in 0..q_per_kv {
                    let q_head_idx = kv_head * q_per_kv + i;
                    let q_head_offset = q_head_idx * head_dim;
                    let q_head_data = &q[q_head_offset..q_head_offset + head_dim];

                    let score = Self::simd_dot_f32(q_head_data, cached_key) * scale;
                    group_scores[i * total_len + pos] = score;
                }
            }

            // Handle current position K
            let curr_key = &current_k[kv_head_offset..kv_head_offset + head_dim];
            for i in 0..q_per_kv {
                let q_head_idx = kv_head * q_per_kv + i;
                let q_head_offset = q_head_idx * head_dim;
                let q_head_data = &q[q_head_offset..q_head_offset + head_dim];

                let score = Self::simd_dot_f32(q_head_data, curr_key) * scale;
                group_scores[i * total_len + cache_len] = score;
            }

            // 2. Softmax (Per Q Head). PMAT-810: Gemma2 softcaps the scores first.
            for i in 0..q_per_kv {
                let start = i * total_len;
                let end = start + total_len;
                if let Some(cap) = attn_softcap {
                    crate::gguf::ops::softcap(&mut group_scores[start..end], cap);
                }
                crate::quantize::softmax_simd(&mut group_scores[start..end]);
            }

            // 3. Accumulate Values (Scan V Cache Once)
            for pos in 0..cache_len {
                let v_start = pos * kv_dim + kv_head_offset;
                let cached_val = &v_cache[v_start..v_start + head_dim];

                for i in 0..q_per_kv {
                    let weight = group_scores[i * total_len + pos];
                    let q_head_idx = kv_head * q_per_kv + i;
                    let out_offset = q_head_idx * head_dim;
                    let out_head = &mut output[out_offset..out_offset + head_dim];

                    Self::simd_axpy_f32(out_head, weight, cached_val);
                }
            }

            // Handle current position V
            let curr_val = &current_v[kv_head_offset..kv_head_offset + head_dim];
            for i in 0..q_per_kv {
                let weight = group_scores[i * total_len + cache_len];
                let q_head_idx = kv_head * q_per_kv + i;
                let out_offset = q_head_idx * head_dim;
                let out_head = &mut output[out_offset..out_offset + head_dim];

                Self::simd_axpy_f32(out_head, weight, curr_val);
            }
        }

        output
    }

    /// GQA attention over the KV cache plus the current position, written into
    /// a caller-supplied buffer (IMP-131).
    ///
    /// The Q heads are split into `num_heads / num_kv_heads` consecutive heads
    /// per KV head. For every KV head the cached keys are scanned once to score
    /// all Q heads of that group, the scores are (optionally soft-capped and)
    /// soft-maxed per Q head, and the cached values are scanned once to
    /// accumulate the weighted sum.
    ///
    /// # Arguments
    /// * `q` - Query for the current position, read per head as `[head * head_dim ..][..head_dim]`
    /// * `k_cache` / `v_cache` - Cached keys/values, row-major `[cache_len, kv_dim]`
    /// * `current_k` / `current_v` - Key/value of the current position `[kv_dim]`
    /// * `output` - Destination; its first `q_dim` elements are reset to zero and
    ///   then accumulated into
    ///
    /// # Panics
    /// Panics if `num_kv_heads == 0`, if `output` is shorter than `q_dim`, or if
    /// any slice is too short for the per-head indexing described above.
    pub fn attention_with_cache_gqa_into(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
        output: &mut [f32],
    ) {
        let cfg = &self.config;
        let n_q_heads = cfg.num_heads;
        let n_kv_heads = cfg.num_kv_heads;
        // GH-479: dimensions come from the config helpers (Qwen3 head_dim != hidden/heads).
        let hd = cfg.head_dim();
        let q_width = cfg.q_dim();
        let kv_width = cfg.kv_dim();
        // PMAT-810: Gemma2 uses 1/sqrt(query_pre_attn_scalar); all other archs
        // (and gemma-2-2b, key absent) get 1/sqrt(head_dim), byte-identical.
        let score_scale = cfg.attn_scale();
        // PMAT-810: Gemma2 logit softcap; `None` everywhere else → no-op.
        let softcap = cfg.attn_logit_softcap();

        // How many Q heads share one KV head.
        let group = n_q_heads / n_kv_heads;

        // Number of cached positions; the current position adds one more column.
        let cached = match kv_width {
            0 => 0,
            width => k_cache.len() / width,
        };
        let row = cached + 1;

        // GH-479: reset exactly q_dim elements (may differ from hidden_dim for Qwen3).
        output[..q_width].fill(0.0);

        // Scratch scores for one KV group, laid out as `group` rows of `row`
        // entries; reused across groups so there is a single allocation per call.
        let mut scores = vec![0.0f32; group * row];

        for kv_idx in 0..n_kv_heads {
            let kv_off = kv_idx * hd;
            // Index of the first Q head served by this KV head.
            let first_q = kv_idx * group;

            // Step 1a: score every Q head of the group against each cached key.
            for pos in 0..cached {
                let base = pos * kv_width + kv_off;
                let key = &k_cache[base..base + hd];

                for g in 0..group {
                    let q_off = (first_q + g) * hd;
                    let query = &q[q_off..q_off + hd];
                    let dot = Self::simd_dot_f32(query, key) * score_scale;
                    scores[g * row + pos] = dot;
                }
            }

            // Step 1b: score against the current position's key (last column).
            let key_now = &current_k[kv_off..kv_off + hd];
            for g in 0..group {
                let q_off = (first_q + g) * hd;
                let query = &q[q_off..q_off + hd];
                let dot = Self::simd_dot_f32(query, key_now) * score_scale;
                scores[g * row + cached] = dot;
            }

            // Step 2: per-Q-head softmax; PMAT-810 softcap is applied first when set.
            for head_scores in scores.chunks_exact_mut(row) {
                if let Some(cap) = softcap {
                    crate::gguf::ops::softcap(&mut head_scores[..], cap);
                }
                crate::quantize::softmax_simd(&mut head_scores[..]);
            }

            // Step 3a: accumulate cached values, scanning the V cache once per group.
            for pos in 0..cached {
                let base = pos * kv_width + kv_off;
                let value = &v_cache[base..base + hd];

                for g in 0..group {
                    let w = scores[g * row + pos];
                    let out_off = (first_q + g) * hd;
                    Self::simd_axpy_f32(&mut output[out_off..out_off + hd], w, value);
                }
            }

            // Step 3b: accumulate the current position's value.
            let value_now = &current_v[kv_off..kv_off + hd];
            for g in 0..group {
                let w = scores[g * row + cached];
                let out_off = (first_q + g) * hd;
                Self::simd_axpy_f32(&mut output[out_off..out_off + hd], w, value_now);
            }
        }
    }

    /// Adaptive attention with KV cache - auto-selects CPU or GPU backend (IMP-122)
    ///
    /// GQA models (`num_kv_heads < num_heads`) are validated and always run on
    /// the CPU GQA kernel. For MHA models, short cache lengths (< 64) use the
    /// CPU implementation and long cache lengths (>= 64) use GPU-accelerated
    /// computation.
    ///
    /// # Arguments
    /// * `q` - Query vector for current position [q_dim]
    /// * `k_cache` - Cached keys [cache_len, q_dim] for MHA, [cache_len, kv_dim] for GQA
    /// * `v_cache` - Cached values, same layout as `k_cache`
    /// * `current_k` - Key for current position ([q_dim] for MHA, [kv_dim] for GQA)
    /// * `current_v` - Value for current position, same layout as `current_k`
    ///
    /// # Returns
    /// Result containing attention output [q_dim]
    ///
    /// # Errors
    /// Returns error if the GQA dimension guard (`validate_gqa_kv_dims`) rejects
    /// the inputs, or if GPU operations fail (for GPU path)
    #[cfg(feature = "gpu")]
    pub fn adaptive_attention_with_cache(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
    ) -> Result<Vec<f32>> {
        // PMAT-749: the MHA cache-attention path (gpu_attention_with_cache /
        // attention_with_cache) strides the KV cache by q_dim/hidden_dim and indexes
        // current_k/current_v by head*head_dim over num_heads — correct ONLY for MHA
        // (num_kv_heads == num_heads). For GQA models the cache is [seq, kv_dim], so at
        // head >= num_kv_heads the current-K/V slice runs past kv_dim → panic/garbage
        // once a sequence crosses the >=64 GPU threshold. Route GQA to the GQA-correct
        // function (it maps each q-head to its kv-head and strides by kv_dim; it also
        // handles MHA as q_per_kv==1, but we keep the GPU path for MHA to avoid a perf
        // regression). Every prior test of this path was MHA, so GQA was uncovered.
        if self.config.num_kv_heads < self.config.num_heads {
            // PMAT-880: fail closed on inconsistent KV-cache dims before indexing.
            self.validate_gqa_kv_dims(q, k_cache, v_cache, current_k, current_v)?;
            return Ok(self.attention_with_cache_gqa(q, k_cache, v_cache, current_k, current_v));
        }

        // GH-479: count cached positions with the same row width the GPU path uses
        // (`k_cache.len() / q_dim`). Dividing by hidden_dim mis-counts the cache, and
        // therefore mis-dispatches, whenever q_dim != hidden_dim.
        let q_dim = self.config.q_dim();
        let cache_len = if q_dim > 0 {
            k_cache.len() / q_dim
        } else {
            0
        };

        // Threshold for GPU dispatch (matches IMP-119)
        const GPU_CACHE_LEN_THRESHOLD: usize = 64;

        if cache_len >= GPU_CACHE_LEN_THRESHOLD {
            // GPU path for long sequences
            self.gpu_attention_with_cache(q, k_cache, v_cache, current_k, current_v)
        } else {
            // CPU path for short sequences - use existing implementation
            Ok(self.attention_with_cache(q, k_cache, v_cache, current_k, current_v))
        }
    }

    /// Adaptive attention for builds without the `gpu` feature: always CPU.
    ///
    /// MHA models (`num_kv_heads >= num_heads`) go straight to
    /// `attention_with_cache`. GQA models are first checked by
    /// `validate_gqa_kv_dims` and then run on `attention_with_cache_gqa`.
    ///
    /// # Errors
    /// Propagates the error from `validate_gqa_kv_dims` on the GQA route.
    #[cfg(not(feature = "gpu"))]
    pub fn adaptive_attention_with_cache(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
    ) -> Result<Vec<f32>> {
        let cfg = &self.config;
        let uses_gqa = cfg.num_kv_heads < cfg.num_heads;

        if !uses_gqa {
            return Ok(self.attention_with_cache(q, k_cache, v_cache, current_k, current_v));
        }

        // PMAT-749: attention_with_cache is MHA-only and panics when
        // num_kv_heads < num_heads; GQA needs the kv_dim-strided kernel.
        // PMAT-880: reject inconsistent KV-cache dims before any indexing happens.
        self.validate_gqa_kv_dims(q, k_cache, v_cache, current_k, current_v)?;
        Ok(self.attention_with_cache_gqa(q, k_cache, v_cache, current_k, current_v))
    }

    /// GPU-accelerated attention with KV cache (IMP-122)
    ///
    /// Uses GPU for Q@K^T computation when cache is large enough.
    ///
    /// MHA layout only: the cache is read as `[cache_len, q_dim]` and
    /// `current_k`/`current_v` are indexed per Q head, so callers must route
    /// GQA models elsewhere. For each head the scores are computed as
    /// `Q[1, head_dim] @ K^T[head_dim, total_len]` via the scheduler, scaled,
    /// optionally soft-capped, soft-maxed, and used to weight the values.
    ///
    /// # Errors
    /// - [`RealizarError::InvalidConfiguration`] when `q_dim == 0`, when
    ///   `num_heads * head_dim > q_dim`, when `q`/`current_k`/`current_v` are
    ///   shorter than `num_heads * head_dim`, or when `v_cache` holds fewer
    ///   rows than `k_cache` (all of these used to panic).
    /// - [`RealizarError::UnsupportedOperation`] when the scheduler cannot be
    ///   created or the GPU matmul fails.
    #[cfg(feature = "gpu")]
    fn gpu_attention_with_cache(
        &self,
        q: &[f32],
        k_cache: &[f32],
        v_cache: &[f32],
        current_k: &[f32],
        current_v: &[f32],
    ) -> Result<Vec<f32>> {
        use crate::gpu::HybridScheduler;

        let num_heads = self.config.num_heads;
        // GH-479: Use config methods (Qwen3 head_dim != hidden/heads)
        let head_dim = self.config.head_dim();
        let q_dim = self.config.q_dim();
        // PMAT-810: use the same scale and logit softcap as the CPU GQA kernels
        // instead of a hard-coded 1/sqrt(head_dim). Per PMAT-810 these are
        // 1/sqrt(head_dim) and `None` for every arch without the Gemma2 keys,
        // so the result is unchanged for those models.
        let scale = self.config.attn_scale();
        let attn_softcap = self.config.attn_logit_softcap();

        // Fail closed instead of panicking: q_dim == 0 would divide by zero
        // below, and heads wider than q_dim would index past `output`.
        let heads_width = num_heads * head_dim;
        if q_dim == 0 || heads_width > q_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "gpu_attention_with_cache: inconsistent attention geometry \
                 (q_dim={q_dim}, num_heads={num_heads}, head_dim={head_dim})"
            )));
        }
        if q.len() < heads_width || current_k.len() < heads_width || current_v.len() < heads_width
        {
            return Err(RealizarError::InvalidConfiguration(format!(
                "gpu_attention_with_cache: q/current_k/current_v lengths ({}, {}, {}) \
                 must each be at least num_heads * head_dim ({heads_width})",
                q.len(),
                current_k.len(),
                current_v.len()
            )));
        }

        // Total sequence length = cached + 1 (current)
        let cache_len = k_cache.len() / q_dim;
        let total_len = cache_len + 1;

        // V is read at the same [pos, q_dim] offsets as K, so it must cover
        // every cached position that K implies.
        if v_cache.len() < cache_len * q_dim {
            return Err(RealizarError::InvalidConfiguration(format!(
                "gpu_attention_with_cache: v_cache length ({}) does not cover the \
                 {cache_len} cached positions implied by k_cache (q_dim={q_dim})",
                v_cache.len()
            )));
        }

        let mut output = vec![0.0f32; q_dim];

        // Create scheduler for GPU operations
        let mut scheduler = HybridScheduler::with_threshold(1000).map_err(|e| {
            RealizarError::UnsupportedOperation {
                operation: "gpu_attention_with_cache".to_string(),
                reason: format!("Failed to create scheduler: {e}"),
            }
        })?;

        // Process each head
        for head in 0..num_heads {
            let head_offset = head * head_dim;
            let q_head = &q[head_offset..head_offset + head_dim];

            // Build K^T for this head directly as [head_dim, total_len]: column
            // `pos` is the cached key at `pos`, the last column is the current key.
            let mut k_t = vec![0.0f32; head_dim * total_len];
            for pos in 0..total_len {
                let key = if pos < cache_len {
                    let k_start = pos * q_dim + head_offset;
                    &k_cache[k_start..k_start + head_dim]
                } else {
                    &current_k[head_offset..head_offset + head_dim]
                };
                for (d, &k) in key.iter().enumerate() {
                    k_t[d * total_len + pos] = k;
                }
            }

            // GPU matmul: Q[1, head_dim] @ K_T[head_dim, total_len] -> [1, total_len]
            let scores_raw = scheduler
                .matmul(q_head, &k_t, 1, head_dim, total_len)
                .map_err(|e| RealizarError::UnsupportedOperation {
                    operation: "gpu_attention_with_cache".to_string(),
                    reason: format!("GPU matmul failed: {e}"),
                })?;

            // Scale scores
            let mut scores: Vec<f32> = scores_raw.iter().map(|&s| s * scale).collect();

            // PMAT-810: softcap the scaled logits before softmax, as the GQA kernels do.
            if let Some(cap) = attn_softcap {
                crate::gguf::ops::softcap(&mut scores[..], cap);
            }

            // Softmax (SIMD-optimized)
            crate::quantize::softmax_simd(&mut scores);

            // Weighted sum of values
            let out_head = &mut output[head_offset..head_offset + head_dim];

            // Cached values
            for (pos, &weight) in scores.iter().enumerate().take(cache_len) {
                let v_start = pos * q_dim + head_offset;
                let cached_val = &v_cache[v_start..v_start + head_dim];
                for (out, &v) in out_head.iter_mut().zip(cached_val) {
                    *out += weight * v;
                }
            }

            // Current value: its weight is the last score column.
            let curr_val = &current_v[head_offset..head_offset + head_dim];
            let current_weight = scores[cache_len];
            for (out, &v) in out_head.iter_mut().zip(curr_val) {
                *out += current_weight * v;
            }
        }

        Ok(output)
    }
}