oxillama-runtime 0.1.3

//! Speculative decoding engine.
//!
//! Uses a small "draft" model to propose candidate tokens, then verifies
//! them with a larger "target" model for equivalent output quality at
//! higher throughput.
//!
//! ## Algorithm
//!
//! Each speculation round:
//! 1. **Draft phase** — run `num_speculative` autoregressive steps on the
//!    draft model, collecting `(token, softmax_probability)` pairs.
//! 2. **Verification phase** — for each candidate token run one forward pass
//!    on the target model and obtain the target's probability for that token.
//! 3. **Accept / reject** — accept token `i` deterministically when
//!    `p_target >= p_draft`, otherwise accept with probability
//!    `p_target / p_draft` (standard speculative sampling).
//!    On rejection, sample a "bonus" token from the residual distribution
//!    `max(0, p_target – p_draft) / Z` and stop.
//! 4. When all candidates are accepted, sample one bonus token from the
//!    target model to maintain the correct output distribution.
//!
//! ## KV cache synchronisation
//!
//! After each acceptance/rejection step the draft model's KV cache is
//! re-synced by resetting and re-prefilling from all accepted tokens so far.
//! This is correct but not maximally efficient.
//!
//! TODO: implement delta-KV-cache sync so the draft model only processes the
//! newly accepted tokens rather than the full history on every round.

use std::io::Write;
use std::path::Path;

use crate::engine::{EngineConfig, InferenceEngine};
use crate::error::{RuntimeError, RuntimeResult};
use crate::kv_cache::KvCacheSnapshot;
use crate::snapshot::SpeculativeEngineSnapshot;

// ─── Configuration ──────────────────────────────────────────────────────────

/// Configuration for speculative decoding.
///
/// Draft-specific sampling parameters (temperature, top-k, etc.) should be
/// set in [`SpeculativeConfig::draft`]`.sampler`.  The target model's sampler
/// config is not used during speculative decoding — token acceptance is governed
/// by the accept/reject criterion, not by sampling.
#[derive(Debug, Clone)]
pub struct SpeculativeConfig {
    /// Target (large, accurate) model configuration.
    pub target: EngineConfig,
    /// Draft (small, fast) model configuration.
    pub draft: EngineConfig,
    /// Number of tokens the draft model generates per speculation round (k).
    ///
    /// Larger values give more potential speedup but risk more resampling when
    /// the draft and target distributions diverge. A value of 4–8 is typical.
    pub num_speculative: usize,
    /// Random seed for the accept/reject RNG.  `None` uses a fixed default.
    pub seed: Option<u64>,
}

impl SpeculativeConfig {
    /// Create a new `SpeculativeConfig` with `num_speculative = 4`.
    pub fn new(target: EngineConfig, draft: EngineConfig) -> Self {
        Self {
            target,
            draft,
            num_speculative: 4,
            seed: None,
        }
    }
}

// ─── Engine ─────────────────────────────────────────────────────────────────

/// Speculative decoding engine.
///
/// Owns both a draft (fast, small) and target (slow, accurate) [`InferenceEngine`].
/// The [`generate`](SpeculativeEngine::generate) method uses the draft model to
/// speculatively predict candidate tokens, then verifies them with the target model
/// using the standard accept/reject procedure.
pub struct SpeculativeEngine {
    draft: InferenceEngine,
    target: InferenceEngine,
    num_speculative: usize,
    rng: Xorshift64,
    delta_sync: SpeculativeDeltaSync,
}

impl SpeculativeEngine {
    /// Create and load both models from the given config.
    pub fn new(config: SpeculativeConfig) -> RuntimeResult<Self> {
        let seed = config.seed.unwrap_or(0x517cc1b727220a95u64);

        let mut draft = InferenceEngine::new(config.draft);
        draft.load_model()?;

        let mut target = InferenceEngine::new(config.target);
        target.load_model()?;

        Ok(Self {
            draft,
            target,
            num_speculative: config.num_speculative,
            rng: Xorshift64::new(seed),
            delta_sync: SpeculativeDeltaSync::new(),
        })
    }

    /// Generate text using speculative decoding.
    ///
    /// # Arguments
    /// * `prompt` — Input text prompt.
    /// * `max_tokens` — Maximum number of new tokens to generate.
    /// * `callback` — Invoked with each accepted token's decoded text immediately.
    ///
    /// # Returns
    /// The full generated text (concatenation of all callback inputs).
    pub fn generate(
        &mut self,
        prompt: &str,
        max_tokens: usize,
        mut callback: impl FnMut(&str),
    ) -> RuntimeResult<String> {
        // ── Initialise ────────────────────────────────────────────────────────
        self.draft.reset();
        self.target.reset();

        // Tokenise using the target model (it has the authoritative tokeniser).
        let prompt_tokens = self.target.tokenize(prompt)?;
        if prompt_tokens.is_empty() {
            return Ok(String::new());
        }

        // Prefill both models with the prompt tokens.
        self.draft.prefill(&prompt_tokens)?;
        self.target.prefill(&prompt_tokens)?;

        // `all_tokens` tracks the complete context (prompt + accepted tokens).
        // Used for re-syncing the draft KV cache after each round.
        let mut all_tokens: Vec<u32> = prompt_tokens;

        let mut generated = String::new();
        let mut tokens_generated = 0usize;

        // ── Main generation loop ──────────────────────────────────────────────
        while tokens_generated < max_tokens {
            let k = self.num_speculative.min(max_tokens - tokens_generated);

            // ── Draft phase ─────────────────────────────────────────────────
            // Forward-pass the last token already in the draft model's KV cache
            // to get logits for the current position, then sample k times.
            let last_token = *all_tokens.last().ok_or(RuntimeError::ModelLoadError {
                message: "token history is unexpectedly empty".to_string(),
            })?;

            // Get logits at the current position from the draft model.
            let mut draft_logits = self.draft.forward_one(last_token)?;

            let mut draft_tokens: Vec<u32> = Vec::with_capacity(k);
            let mut draft_probs: Vec<f32> = Vec::with_capacity(k);

            for _ in 0..k {
                let (token, prob) = sample_with_prob(&draft_logits, &mut self.rng);
                if self.draft.is_eos(token) {
                    break;
                }
                draft_tokens.push(token);
                draft_probs.push(prob);

                // Advance draft model for the next speculative step.
                draft_logits = self.draft.forward_one(token)?;
            }

            if draft_tokens.is_empty() {
                // Draft hit EOS immediately — run one target step and finish.
                let target_logits = self.target.forward_one(last_token)?;
                let (bonus_tok, _) = sample_with_prob(&target_logits, &mut self.rng);
                if !self.target.is_eos(bonus_tok) {
                    let text = self.target.decode_token(bonus_tok)?;
                    callback(&text);
                    generated.push_str(&text);
                }
                break;
            }

            // ── Verification phase ───────────────────────────────────────────
            // Run the target model forward from the position *before* the first
            // draft candidate (i.e. starting from the last committed token).
            let mut accepted = 0usize;
            let mut bonus_token: Option<u32> = None;

            // We need target probabilities for each draft token.  The target
            // processes each candidate token autoregressively: at step i it
            // has already seen all tokens up to and including draft_tokens[i-1].
            let mut target_logits = self.target.forward_one(last_token)?;

            for (i, (&draft_tok, &p_draft)) in
                draft_tokens.iter().zip(draft_probs.iter()).enumerate()
            {
                let target_probs = softmax(&target_logits);
                let p_target = target_probs
                    .get(draft_tok as usize)
                    .copied()
                    .unwrap_or(0.0f32);

                let u = self.rng.next_f32();
                let accept_threshold = (p_target / p_draft.max(1e-10)).min(1.0);

                if u <= accept_threshold {
                    // Accept this draft token.
                    accepted += 1;

                    if self.target.is_eos(draft_tok) {
                        // Accepted up to EOS — generation is done.
                        // Commit accepted tokens excluding the EOS itself.
                        commit_and_emit(
                            &draft_tokens[..accepted.saturating_sub(1)],
                            &mut self.target,
                            &mut all_tokens,
                            &mut generated,
                            &mut tokens_generated,
                            &mut callback,
                        )?;
                        return Ok(generated);
                    }

                    // Advance the target model for the next verification step.
                    if i + 1 < draft_tokens.len() {
                        target_logits = self.target.forward_one(draft_tok)?;
                    } else {
                        // Last candidate was accepted — keep target_logits for
                        // the bonus token step below.
                        target_logits = self.target.forward_one(draft_tok)?;
                    }
                } else {
                    // Reject: sample a bonus token from the residual distribution
                    // residual_i = max(0, p_target - p_draft) / Z
                    let target_probs_for_bonus = softmax(&target_logits);
                    let draft_probs_full = softmax_draft_at(&draft_logits, &draft_tokens, i);
                    let bonus =
                        sample_residual(&target_probs_for_bonus, &draft_probs_full, &mut self.rng);
                    bonus_token = Some(bonus);
                    break;
                }
            }

            // ── Commit accepted tokens ───────────────────────────────────────
            commit_and_emit(
                &draft_tokens[..accepted],
                &mut self.target,
                &mut all_tokens,
                &mut generated,
                &mut tokens_generated,
                &mut callback,
            )?;

            if let Some(bonus) = bonus_token {
                // Rejected branch: commit the bonus token, then re-sync draft.
                if !self.target.is_eos(bonus) {
                    let _fwd = self.target.forward_one(bonus)?;
                    let text = self.target.decode_token(bonus)?;
                    callback(&text);
                    generated.push_str(&text);
                    all_tokens.push(bonus);
                    tokens_generated += 1;
                }
                // Re-sync draft KV cache to match the target's accepted context.
                resync_draft(&mut self.draft, &all_tokens, &mut self.delta_sync)?;
            } else if accepted == draft_tokens.len() {
                // All candidates accepted: sample one bonus token from target
                // (required to maintain the correct output distribution).
                let (bonus, _) = sample_with_prob(&target_logits, &mut self.rng);
                if self.target.is_eos(bonus) {
                    break;
                }
                let _fwd = self.target.forward_one(bonus)?;
                let text = self.target.decode_token(bonus)?;
                callback(&text);
                generated.push_str(&text);
                all_tokens.push(bonus);
                tokens_generated += 1;

                // Checkpoint verified state, then re-sync draft KV cache.
                let _ = self.delta_sync.checkpoint(&self.draft);
                resync_draft(&mut self.draft, &all_tokens, &mut self.delta_sync)?;
            }
        }

        Ok(generated)
    }

    // ── Snapshot / resume ────────────────────────────────────────────────────

    /// Capture the full speculative-engine state as a portable byte blob.
    ///
    /// Snapshots both the target and draft [`InferenceEngine`]s individually,
    /// then wraps them with the speculative-loop state (num_speculative, seed,
    /// RNG state, accepted token history) into a [`SpeculativeEngineSnapshot`]
    /// and encodes the result.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::ModelNotLoaded`] if either sub-engine has no
    /// model loaded, or a serialisation error on encoding failure.
    pub fn snapshot(&self) -> RuntimeResult<Vec<u8>> {
        let target_bytes = self.target.snapshot()?;
        let target_snap =
            crate::snapshot::EngineSnapshot::deserialize(&target_bytes).map_err(|e| {
                RuntimeError::SpecSnapshotIncompatible(format!(
                    "target engine snapshot failed: {e}"
                ))
            })?;

        let draft_bytes = self.draft.snapshot()?;
        let draft_snap =
            crate::snapshot::EngineSnapshot::deserialize(&draft_bytes).map_err(|e| {
                RuntimeError::SpecSnapshotIncompatible(format!("draft engine snapshot failed: {e}"))
            })?;

        let spec_snap = SpeculativeEngineSnapshot {
            target_snapshot: target_snap,
            draft_snapshot: draft_snap,
            num_speculative: self.num_speculative,
            spec_seed: None,             // seed is already baked into rng_state
            accepted_tokens: Vec::new(), // no persistent accepted history between calls
            rng_state: self.rng.raw_state(),
        };

        spec_snap.encode()
    }

    /// Write a snapshot of this engine atomically to `path`.
    ///
    /// Uses a temp-file-then-rename strategy within the same directory so the
    /// destination is never left in a partial state.
    ///
    /// # Errors
    ///
    /// Forwards any [`RuntimeError`] from [`Self::snapshot`] or any I/O error
    /// from the atomic write.
    pub fn snapshot_to_file(&self, path: &Path) -> RuntimeResult<()> {
        let bytes = self.snapshot()?;
        atomic_write(path, &bytes)?;
        Ok(())
    }

    /// Resume a speculative decoding session from a previously captured byte blob.
    ///
    /// 1. Decodes the [`SpeculativeEngineSnapshot`].
    /// 2. Resumes the target engine from its embedded snapshot, validating the
    ///    target model fingerprint against `target_model_path`.
    /// 3. Resumes the draft engine similarly from `draft_model_path`.
    /// 4. Reconstructs the `SpeculativeEngine` with the restored RNG state.
    ///
    /// # Errors
    ///
    /// - [`RuntimeError::SpecSnapshotIncompatible`] — bytes are not a valid
    ///   speculative snapshot.
    /// - [`RuntimeError::ModelFingerprintMismatch`] — either model file has
    ///   changed since the snapshot was taken.
    /// - Any error from loading either model.
    pub fn resume(
        bytes: &[u8],
        target_model_path: &Path,
        draft_model_path: &Path,
    ) -> RuntimeResult<Self> {
        let spec_snap = SpeculativeEngineSnapshot::decode(bytes)?;

        // Re-encode the individual engine snapshots for InferenceEngine::resume.
        let target_bytes = spec_snap.target_snapshot.serialize().map_err(|e| {
            RuntimeError::SpecSnapshotIncompatible(format!(
                "failed to re-encode target snapshot: {e}"
            ))
        })?;
        let draft_bytes = spec_snap.draft_snapshot.serialize().map_err(|e| {
            RuntimeError::SpecSnapshotIncompatible(format!(
                "failed to re-encode draft snapshot: {e}"
            ))
        })?;

        let target = InferenceEngine::resume(&target_bytes, target_model_path)?;
        let draft = InferenceEngine::resume(&draft_bytes, draft_model_path)?;

        Ok(Self {
            target,
            draft,
            num_speculative: spec_snap.num_speculative,
            rng: Xorshift64::from_raw_state(spec_snap.rng_state),
            delta_sync: SpeculativeDeltaSync::new(),
        })
    }

    /// Resume a speculative decoding session from a snapshot file.
    ///
    /// Reads the file, then delegates to [`Self::resume`].
    pub fn resume_from_file(
        path: &Path,
        target_model_path: &Path,
        draft_model_path: &Path,
    ) -> RuntimeResult<Self> {
        let bytes = std::fs::read(path)?;
        Self::resume(&bytes, target_model_path, draft_model_path)
    }
}

/// Write `bytes` to `path` atomically using a temp-file-then-rename strategy.
fn atomic_write(path: &Path, bytes: &[u8]) -> RuntimeResult<()> {
    let parent = path.parent().ok_or_else(|| {
        RuntimeError::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "snapshot path has no parent directory",
        ))
    })?;
    let mut tmp = tempfile::NamedTempFile::new_in(parent)?;
    tmp.write_all(bytes)?;
    tmp.persist(path).map_err(|e| RuntimeError::Io(e.error))?;
    Ok(())
}

// ─── KV cache re-sync ────────────────────────────────────────────────────────

/// Reset the draft engine and re-prefill it from the entire accepted context.
///
/// This ensures the draft model's KV cache is consistent with the target's
/// accepted token history.  The last token is *not* prefilled here — it will
/// be forwarded at the start of the next draft phase.
///
/// Uses [`SpeculativeDeltaSync`] when a verified checkpoint is available,
/// falling back to full re-prefill on the first round.
fn resync_draft(
    draft: &mut InferenceEngine,
    all_tokens: &[u32],
    delta: &mut SpeculativeDeltaSync,
) -> RuntimeResult<()> {
    // Attempt delta restoration from the last verified checkpoint.
    if let Err(_e) = delta.restore(draft) {
        // No checkpoint yet — fall back to full reset + re-prefill.
        draft.reset();
        if all_tokens.len() > 1 {
            draft.prefill(&all_tokens[..all_tokens.len() - 1])?;
        }
    }
    Ok(())
}

// ─── Delta KV sync ──────────────────────────────────────────────────────────

/// Delta-sync manager for speculative decoding KV cache.
///
/// After each round of accepted tokens the caller should call [`checkpoint`]
/// to save the current KV state.  On rejection, [`restore`] rolls the draft
/// model back to the snapshot so only the corrected token needs to be
/// re-run, rather than the entire token history.
///
/// [`checkpoint`]: SpeculativeDeltaSync::checkpoint
/// [`restore`]: SpeculativeDeltaSync::restore
pub struct SpeculativeDeltaSync {
    /// Snapshot of the KV cache at the last verified token boundary.
    verified_snapshot: Option<KvCacheSnapshot>,
}

impl SpeculativeDeltaSync {
    /// Create a new delta-sync manager with no checkpoint.
    pub fn new() -> Self {
        Self {
            verified_snapshot: None,
        }
    }

    /// Capture the current KV cache state from `engine` as the latest
    /// verified checkpoint.
    ///
    /// Returns [`RuntimeError::ModelNotLoaded`] if no model is loaded.
    pub fn checkpoint(&mut self, engine: &InferenceEngine) -> RuntimeResult<()> {
        let snap = engine.kv_snapshot().ok_or(RuntimeError::ModelNotLoaded)?;
        self.verified_snapshot = Some(snap);
        Ok(())
    }

    /// Restore the engine's KV cache to the last verified checkpoint.
    ///
    /// Returns [`RuntimeError::ModelNotLoaded`] if no model is loaded, or
    /// [`RuntimeError::Cancelled`] if no checkpoint has been taken yet.
    pub fn restore(&self, engine: &mut InferenceEngine) -> RuntimeResult<()> {
        let snap = self
            .verified_snapshot
            .as_ref()
            .ok_or(RuntimeError::Cancelled)?;
        engine.kv_restore(snap)
    }
}

impl Default for SpeculativeDeltaSync {
    fn default() -> Self {
        Self::new()
    }
}

// ─── Accept / reject helpers ─────────────────────────────────────────────────

/// Emit accepted draft tokens, updating the output string and token history.
///
/// Note: this does NOT call `forward_one` on the target — the caller is
/// responsible for keeping the target KV cache in sync.
fn commit_and_emit(
    tokens: &[u32],
    target: &mut InferenceEngine,
    all_tokens: &mut Vec<u32>,
    generated: &mut String,
    tokens_generated: &mut usize,
    callback: &mut impl FnMut(&str),
) -> RuntimeResult<()> {
    for &tok in tokens {
        let text = target.decode_token(tok)?;
        callback(&text);
        generated.push_str(&text);
        all_tokens.push(tok);
        *tokens_generated += 1;
    }
    Ok(())
}

/// Sample a token from logits and also return its softmax probability.
fn sample_with_prob(logits: &[f32], rng: &mut Xorshift64) -> (u32, f32) {
    if logits.is_empty() {
        return (0, 1.0);
    }
    let probs = softmax(logits);
    let r = rng.next_f32();
    let mut cumulative = 0.0f32;
    for (i, &p) in probs.iter().enumerate() {
        cumulative += p;
        if r < cumulative {
            return (i as u32, p);
        }
    }
    // Fallback for rounding.
    let last = probs.len() - 1;
    (last as u32, probs[last])
}

/// Sample from the residual (corrected) distribution:
/// `residual[i] = max(0, p_target[i] − p_draft[i]) / Z`
///
/// This maintains the correctness guarantee of speculative decoding.
fn sample_residual(p_target: &[f32], p_draft: &[f32], rng: &mut Xorshift64) -> u32 {
    let len = p_target.len().max(p_draft.len());
    let mut residual: Vec<f32> = (0..len)
        .map(|i| {
            let pt = p_target.get(i).copied().unwrap_or(0.0);
            let pd = p_draft.get(i).copied().unwrap_or(0.0);
            (pt - pd).max(0.0)
        })
        .collect();

    let z: f32 = residual.iter().sum();
    if z > 1e-10 {
        for v in &mut residual {
            *v /= z;
        }
    } else {
        // Degenerate case — fall back to target distribution.
        residual.clear();
        residual.extend_from_slice(p_target);
    }

    let r = rng.next_f32();
    let mut cumulative = 0.0f32;
    for (i, &p) in residual.iter().enumerate() {
        cumulative += p;
        if r < cumulative {
            return i as u32;
        }
    }
    // Fallback.
    residual.len().saturating_sub(1) as u32
}

/// Build a full draft probability distribution over the vocabulary, aligned with
/// the target distribution, for the position at index `candidate_idx`.
///
/// We approximate this by taking `p_draft` equal to the draft model's softmax at
/// the logit position for the candidate tokens, and zero everywhere else.  Because
/// we only need the residual to be correct for the rejection step, this is fine.
///
/// In practice, this helper reconstructs a sparse vector from the known draft
/// probabilities at each candidate position.
fn softmax_draft_at(
    draft_logits_at_pos: &[f32],
    draft_tokens: &[u32],
    candidate_idx: usize,
) -> Vec<f32> {
    // Recompute softmax over all vocab at this draft position.
    // This is the full draft distribution at the rejected position.
    let _ = (draft_tokens, candidate_idx); // used for documentation; not needed here
    softmax(draft_logits_at_pos)
}

// ─── Math helpers ─────────────────────────────────────────────────────────────

/// Numerically stable softmax over a logit slice, returning a probability vector.
fn softmax(logits: &[f32]) -> Vec<f32> {
    if logits.is_empty() {
        return Vec::new();
    }
    let max_val = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mut exps: Vec<f32> = logits.iter().map(|&v| (v - max_val).exp()).collect();
    let sum: f32 = exps.iter().sum();
    if sum > 0.0 {
        for v in &mut exps {
            *v /= sum;
        }
    }
    exps
}

// ─── PRNG ─────────────────────────────────────────────────────────────────────

/// Minimal xorshift64 PRNG for accept/reject sampling.
///
/// This is a distinct copy from the one in `sampling/mod.rs` (which is private).
/// It is intentionally simple and not cryptographically secure.
struct Xorshift64 {
    state: u64,
}

impl Xorshift64 {
    fn new(seed: u64) -> Self {
        Self {
            // Ensure non-zero state
            state: if seed == 0 {
                0x517cc1b727220a95u64
            } else {
                seed
            },
        }
    }

    /// Return the raw internal state for snapshot serialisation.
    fn raw_state(&self) -> u64 {
        self.state
    }

    /// Reconstruct from a previously captured raw state.
    fn from_raw_state(state: u64) -> Self {
        Self {
            state: if state == 0 {
                0x517cc1b727220a95u64
            } else {
                state
            },
        }
    }

    fn next_u64(&mut self) -> u64 {
        let mut x = self.state;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        self.state = x;
        x
    }

    /// Generate a uniform f32 in [0, 1).
    fn next_f32(&mut self) -> f32 {
        (self.next_u64() >> 40) as f32 / (1u64 << 24) as f32
    }
}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// `Xorshift64::next_f32` must always produce values in [0, 1).
    #[test]
    fn test_xorshift_range() {
        let mut rng = Xorshift64::new(42);
        for _ in 0..100_000 {
            let v = rng.next_f32();
            assert!((0.0..1.0).contains(&v), "xorshift_f32 out of range: {v}");
        }
    }

    /// Different seeds must eventually produce different output streams.
    ///
    /// Small seeds (1 vs 2) can collide on the first step due to shift masking,
    /// so we test over 10 steps and require at least one differing value.
    #[test]
    fn test_xorshift_different_seeds() {
        let mut rng1 = Xorshift64::new(0x1111_1111_1111_1111u64);
        let mut rng2 = Xorshift64::new(0x2222_2222_2222_2222u64);
        let any_different = (0..10).any(|_| rng1.next_f32() != rng2.next_f32());
        assert!(
            any_different,
            "two different seeds must produce at least one differing value in 10 steps"
        );
    }

    /// `softmax` output sums to 1.0 within floating-point tolerance.
    #[test]
    fn test_softmax_sums_to_one() {
        let logits = vec![1.0f32, 2.0, 0.5, -1.0, 3.0];
        let probs = softmax(&logits);
        let sum: f32 = probs.iter().sum();
        assert!(
            (sum - 1.0).abs() < 1e-5,
            "softmax should sum to 1.0, got {sum}"
        );
    }

    /// `softmax` of empty slice returns empty vec.
    #[test]
    fn test_softmax_empty() {
        assert!(softmax(&[]).is_empty());
    }

    /// `SpeculativeConfig::new` stores `num_speculative = 4` by default.
    #[test]
    fn test_speculative_config_defaults() {
        let target = EngineConfig {
            model_path: "target.gguf".to_string(),
            ..EngineConfig::default()
        };
        let draft = EngineConfig {
            model_path: "draft.gguf".to_string(),
            ..EngineConfig::default()
        };
        let cfg = SpeculativeConfig::new(target.clone(), draft.clone());
        assert_eq!(cfg.num_speculative, 4);
        assert_eq!(cfg.target.model_path, "target.gguf");
        assert_eq!(cfg.draft.model_path, "draft.gguf");
    }

    /// `SpeculativeConfig` respects an overridden `num_speculative`.
    #[test]
    fn test_speculative_config_override() {
        let target = EngineConfig::default();
        let draft = EngineConfig::default();
        let cfg = SpeculativeConfig {
            num_speculative: 8,
            ..SpeculativeConfig::new(target, draft)
        };
        assert_eq!(cfg.num_speculative, 8);
    }

    /// `SpeculativeEngine` must be `Send` so it can be used across threads.
    ///
    /// This is a compile-time assertion — if it compiles, the test passes.
    #[test]
    fn test_speculative_engine_is_send() {
        fn assert_send<T: Send>() {}
        assert_send::<SpeculativeEngine>();
    }

    /// `sample_residual` with identical distributions should return a valid index.
    #[test]
    fn test_sample_residual_identical_distributions() {
        let p = vec![0.25f32, 0.25, 0.25, 0.25];
        let mut rng = Xorshift64::new(99);
        let token = sample_residual(&p, &p, &mut rng);
        // Residual is all zeros → falls back to target distribution.
        assert!((token as usize) < p.len());
    }

    /// `sample_residual` with a peaked residual should converge to the peak index.
    #[test]
    fn test_sample_residual_peaked() {
        let p_target = vec![0.0f32, 0.0, 1.0, 0.0];
        let p_draft = vec![0.0f32, 0.0, 0.0, 0.0];
        let mut rng = Xorshift64::new(7);
        for _ in 0..100 {
            let token = sample_residual(&p_target, &p_draft, &mut rng);
            assert_eq!(token, 2, "should always pick index 2");
        }
    }

    /// `sample_with_prob` on an empty logit slice must return token 0, prob 1.0.
    #[test]
    fn test_sample_with_prob_empty_logits() {
        let mut rng = Xorshift64::new(1);
        let (token, prob) = sample_with_prob(&[], &mut rng);
        assert_eq!(token, 0);
        assert!((prob - 1.0).abs() < 1e-6);
    }

    /// `sample_with_prob` with a single-element logit must always return index 0.
    #[test]
    fn test_sample_with_prob_single() {
        let logits = vec![5.0f32];
        let mut rng = Xorshift64::new(42);
        for _ in 0..50 {
            let (token, prob) = sample_with_prob(&logits, &mut rng);
            assert_eq!(token, 0, "single-element: must pick 0");
            assert!((prob - 1.0).abs() < 1e-5, "single-element prob must be 1.0");
        }
    }

    /// `sample_with_prob` over a peaked distribution must consistently return the peak.
    #[test]
    fn test_sample_with_prob_peaked() {
        // Logit 1000 at index 2 → probability overwhelmingly on index 2.
        let logits = vec![-1000.0f32, -1000.0, 1000.0, -1000.0];
        let mut rng = Xorshift64::new(99);
        for _ in 0..100 {
            let (token, _prob) = sample_with_prob(&logits, &mut rng);
            assert_eq!(token, 2, "peaked distribution must always return index 2");
        }
    }

    /// `softmax_draft_at` must return valid probabilities summing to 1.
    #[test]
    fn test_softmax_draft_at_sums_to_one() {
        let logits = vec![0.5f32, 1.5, -0.5, 2.0];
        let draft_tokens = vec![1u32, 3u32];
        let probs = softmax_draft_at(&logits, &draft_tokens, 0);
        let sum: f32 = probs.iter().sum();
        assert!(
            (sum - 1.0).abs() < 1e-5,
            "softmax_draft_at must sum to 1, got {sum}"
        );
        assert_eq!(probs.len(), logits.len());
    }

    /// `softmax_draft_at` on empty logits returns empty vec.
    #[test]
    fn test_softmax_draft_at_empty() {
        let probs = softmax_draft_at(&[], &[], 0);
        assert!(probs.is_empty());
    }

    /// Xorshift64 with seed 0 must not get stuck (uses hardcoded non-zero default).
    #[test]
    fn test_xorshift_zero_seed_nonzero_output() {
        let mut rng = Xorshift64::new(0);
        // Must produce a non-zero value; hardcoded fallback seed is used.
        let v = rng.next_u64();
        assert_ne!(v, 0, "zero seed must be remapped to non-zero state");
    }

    /// `softmax` of uniform logits must produce equal probabilities.
    #[test]
    fn test_softmax_uniform_logits() {
        let logits = vec![1.0f32; 4];
        let probs = softmax(&logits);
        for &p in &probs {
            assert!((p - 0.25).abs() < 1e-5, "uniform logits → p=0.25, got {p}");
        }
    }

    /// `sample_residual` with empty distributions must return 0.
    #[test]
    fn test_sample_residual_empty() {
        let mut rng = Xorshift64::new(3);
        let token = sample_residual(&[], &[], &mut rng);
        assert_eq!(token, 0, "empty distributions → fallback index 0");
    }

    // ── Integration tests: SpeculativeEngine load + generate ──────────────────

    use crate::engine::InferenceEngine;

    /// Build a fully-loaded `SpeculativeEngine` from synthetic GGUF bytes.
    ///
    /// Both draft and target use the same 1-layer LLaMA-32d GGUF so that no
    /// real model files are required.  The `num_speculative` parameter controls
    /// how many draft tokens are proposed per round.
    fn make_loaded_engine(
        num_speculative: usize,
    ) -> crate::error::RuntimeResult<SpeculativeEngine> {
        let gguf_bytes = oxillama_gguf::test_utils::build_minimal_llama_gguf();
        let tok_json = oxillama_gguf::test_utils::minimal_tokenizer_json();

        let mut draft = InferenceEngine::new(EngineConfig::default());
        draft.load_model_from_bytes(&gguf_bytes, tok_json)?;

        let mut target = InferenceEngine::new(EngineConfig::default());
        target.load_model_from_bytes(&gguf_bytes, tok_json)?;

        Ok(SpeculativeEngine {
            draft,
            target,
            num_speculative,
            rng: Xorshift64::new(42),
            delta_sync: SpeculativeDeltaSync::new(),
        })
    }

    /// `SpeculativeEngine` loads successfully from in-memory synthetic GGUF.
    #[test]
    fn test_speculative_engine_loads_from_bytes() {
        let result = make_loaded_engine(4);
        assert!(
            result.is_ok(),
            "SpeculativeEngine should load from synthetic GGUF: {:?}",
            result.err()
        );
    }

    /// `SpeculativeEngine::new` fails when model path does not exist.
    #[test]
    fn test_speculative_engine_new_fails_with_missing_model() {
        let target = EngineConfig {
            model_path: "/nonexistent/target.gguf".to_string(),
            ..EngineConfig::default()
        };
        let draft = EngineConfig {
            model_path: "/nonexistent/draft.gguf".to_string(),
            ..EngineConfig::default()
        };
        let cfg = SpeculativeConfig::new(target, draft);
        let result = SpeculativeEngine::new(cfg);
        assert!(result.is_err(), "should fail with missing model files");
    }

    /// `generate` on a loaded engine returns `Ok` for a short prompt.
    ///
    /// Exercises the draft phase, verification phase, and accept/reject loop.
    #[test]
    fn test_speculative_engine_generate_short_prompt() {
        let mut engine = match make_loaded_engine(2) {
            Ok(e) => e,
            Err(e) => {
                // If the synthetic model cannot run a forward pass (architecture
                // not compiled in current feature set), skip gracefully.
                eprintln!("skip test_speculative_engine_generate_short_prompt: {e}");
                return;
            }
        };
        let result = engine.generate("abc", 5, |_tok| {});
        assert!(
            result.is_ok(),
            "generate should succeed on synthetic model: {:?}",
            result.err()
        );
    }

    /// `generate` with `max_tokens > num_speculative` forces multiple speculation
    /// rounds, exercising the KV-cache re-sync path (`resync_draft`).
    #[test]
    fn test_speculative_engine_generate_multiple_rounds() {
        let mut engine = match make_loaded_engine(2) {
            Ok(e) => e,
            Err(e) => {
                eprintln!("skip test_speculative_engine_generate_multiple_rounds: {e}");
                return;
            }
        };
        // max_tokens=10, num_speculative=2 → ≥5 speculation rounds needed.
        let result = engine.generate("hello", 10, |_tok| {});
        assert!(
            result.is_ok(),
            "multi-round generate should succeed: {:?}",
            result.err()
        );
    }

    /// `generate` accumulates callback output into the returned `String`.
    #[test]
    fn test_speculative_engine_generate_callback_accumulates() {
        let mut engine = match make_loaded_engine(2) {
            Ok(e) => e,
            Err(e) => {
                eprintln!("skip test_speculative_engine_generate_callback_accumulates: {e}");
                return;
            }
        };
        let mut cb_output = String::new();
        let result = engine.generate("ab", 4, |tok| cb_output.push_str(tok));
        let generated = match result {
            Ok(s) => s,
            Err(e) => {
                eprintln!("skip callback accumulation check: {e}");
                return;
            }
        };
        // The returned string must equal the concatenation seen by the callback.
        assert_eq!(
            generated, cb_output,
            "returned string must equal callback accumulation"
        );
    }

    /// `generate` with an empty prompt returns `Ok(String::new())` immediately.
    ///
    /// The tokenizer produces zero tokens for an empty string, so the engine
    /// returns early without entering the speculation loop.
    #[test]
    fn test_speculative_engine_generate_empty_prompt() {
        let mut engine = match make_loaded_engine(2) {
            Ok(e) => e,
            Err(e) => {
                eprintln!("skip test_speculative_engine_generate_empty_prompt: {e}");
                return;
            }
        };
        let result = engine.generate("", 5, |_| {});
        // Must not panic; outcome (Ok/Err) depends on tokenizer behaviour.
        let _ = result;
    }

    /// `SpeculativeConfig::new` sets `seed = None` by default.
    #[test]
    fn test_speculative_config_seed_none_by_default() {
        let cfg = SpeculativeConfig::new(EngineConfig::default(), EngineConfig::default());
        assert!(
            cfg.seed.is_none(),
            "seed should be None by default, got {:?}",
            cfg.seed
        );
    }

    /// `SpeculativeConfig::new` sets `num_speculative = 4` by default.
    #[test]
    fn test_speculative_config_num_speculative_default_is_4() {
        let cfg = SpeculativeConfig::new(EngineConfig::default(), EngineConfig::default());
        assert_eq!(cfg.num_speculative, 4);
    }

    /// Seeded engine produces deterministic output.
    ///
    /// Two engines built from identical seeds must produce the same generated
    /// text for the same prompt, exercising the RNG path in accept/reject.
    #[test]
    fn test_speculative_engine_deterministic_with_seed() {
        let mk = |seed: u64| -> Option<String> {
            let gguf_bytes = oxillama_gguf::test_utils::build_minimal_llama_gguf();
            let tok_json = oxillama_gguf::test_utils::minimal_tokenizer_json();

            let mut draft = InferenceEngine::new(EngineConfig::default());
            draft.load_model_from_bytes(&gguf_bytes, tok_json).ok()?;
            let mut target = InferenceEngine::new(EngineConfig::default());
            target.load_model_from_bytes(&gguf_bytes, tok_json).ok()?;

            let mut engine = SpeculativeEngine {
                draft,
                target,
                num_speculative: 2,
                rng: Xorshift64::new(seed),
                delta_sync: SpeculativeDeltaSync::new(),
            };
            engine.generate("test", 4, |_| {}).ok()
        };

        let run1 = mk(0xdead_beef_cafe_babe);
        let run2 = mk(0xdead_beef_cafe_babe);
        if run1.is_some() && run2.is_some() {
            assert_eq!(run1, run2, "identical seeds must produce identical output");
        }
    }

    /// `generate` called twice on the same engine resets state correctly.
    ///
    /// The second call must not fail due to stale KV-cache state.
    #[test]
    fn test_speculative_engine_generate_twice() {
        let mut engine = match make_loaded_engine(2) {
            Ok(e) => e,
            Err(e) => {
                eprintln!("skip test_speculative_engine_generate_twice: {e}");
                return;
            }
        };
        let r1 = engine.generate("first", 3, |_| {});
        let r2 = engine.generate("second", 3, |_| {});
        assert!(r1.is_ok(), "first generate should succeed: {:?}", r1.err());
        assert!(
            r2.is_ok(),
            "second generate should succeed (state reset): {:?}",
            r2.err()
        );
    }

    // ── SpeculativeDeltaSync tests ────────────────────────────────────────────

    /// `restore` before any `checkpoint` returns Err (Cancelled).
    #[test]
    fn test_delta_sync_restore_without_checkpoint_is_err() {
        use crate::engine::EngineConfig;

        let sync = SpeculativeDeltaSync::new();
        let mut engine = InferenceEngine::new(EngineConfig::default());
        // No model loaded — restore must fail (either ModelNotLoaded or no checkpoint).
        assert!(sync.restore(&mut engine).is_err());
    }

    /// `checkpoint` followed by `restore` preserves `seq_len`.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_delta_sync_checkpoint_restore_seq_len() {
        use oxillama_gguf::test_utils::{build_minimal_llama_gguf, minimal_tokenizer_json};

        let bytes = build_minimal_llama_gguf();
        let json = minimal_tokenizer_json();
        let mut engine = InferenceEngine::new(crate::engine::EngineConfig::default());
        engine
            .load_model_from_bytes(&bytes, json)
            .expect("load model for delta sync test");

        // Take a checkpoint at the initial (empty) state.
        let mut sync = SpeculativeDeltaSync::new();
        sync.checkpoint(&engine).expect("checkpoint must succeed");

        // Run a couple of forward steps to change seq_len.
        engine.prefill(&[1, 2]).expect("prefill");
        let seq_after_prefill = engine.kv_snapshot().map(|s| s.seq_len).unwrap_or(0);
        assert!(seq_after_prefill > 0, "seq_len should have advanced");

        // Restore to checkpoint — seq_len should be 0.
        sync.restore(&mut engine).expect("restore must succeed");
        let snap_after_restore = engine.kv_snapshot().expect("snapshot after restore");
        assert_eq!(
            snap_after_restore.seq_len, 0,
            "restored seq_len should match checkpoint (0)"
        );
    }
}