llama-cpp-v3-agent-sdk 0.1.7

Agentic tool-use loop on top of llama-cpp-v3 — local LLM agents with built-in tools
//! Shared inference engine — load a model once, share it across agents.
//!
//! This module follows the same pattern as `vnai::ai::TextGeneration`:
//! the heavy resources (`LlamaBackend`, `LlamaModel`) are wrapped in `Arc`
//! so they can be cloned cheaply and shared between multiple `Agent` instances.
//! Each agent creates its own `LlamaContext` (KV cache), so agents don't
//! interfere with each other.
//!
//! ## Concurrency
//!
//! - **Without a scheduler**: Agents run truly in parallel (safe, but GPU-heavy).
//! - **With `InferenceScheduler`**: A semaphore limits how many agents can
//!   run inference concurrently. `max_concurrent = 1` serializes all inference.
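//!
//! ## Example
//!
//! A minimal end-to-end sketch (the model path and concurrency limit are
//! illustrative):
//!
//! ```no_run
//! use llama_cpp_v3_agent_sdk::inference::{InferenceConfig, InferenceEngine};
//! use llama_cpp_v3_agent_sdk::InferenceScheduler;
//! use std::sync::Arc;
//!
//! // Load the heavy resources (DLL + model weights) exactly once.
//! let engine = Arc::new(InferenceEngine::load(InferenceConfig {
//!     model_path: "model.gguf".into(),
//!     ..Default::default()
//! }).expect("failed to load model"));
//!
//! // Gate inference behind a scheduler shared by all agents.
//! let scheduler = Arc::new(InferenceScheduler::new(2));
//! scheduler.init_pool(&engine, None).expect("failed to pre-create contexts");
//! ```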

use crate::error::AgentError;
use llama_cpp_v3::{LlamaBackend, LlamaContext, LlamaModel, LoadOptions};
use std::path::{Path, PathBuf};
use std::sync::{Arc, Condvar, Mutex};

/// Configuration for loading a model.
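///
/// # Example
/// ```
/// use llama_cpp_v3_agent_sdk::inference::InferenceConfig;
/// use llama_cpp_v3::backend::Backend;
///
/// // Override only what differs from the defaults (values here are illustrative).
/// let config = InferenceConfig {
///     backend: Backend::Vulkan,
///     model_path: "model.gguf".into(),
///     n_gpu_layers: -1, // offload all layers
///     n_ctx: 4096,
///     ..Default::default()
/// };
/// assert_eq!(config.n_ctx, 4096);
/// ```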
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Compute backend.
    pub backend: llama_cpp_v3::backend::Backend,
    /// Path to the GGUF model file.
    pub model_path: String,
    /// Number of layers to offload to GPU (-1 = all).
    pub n_gpu_layers: i32,
    /// Context window size in tokens (default for contexts created from this engine).
    pub n_ctx: u32,
    /// Application name (used for DLL cache directory).
    pub app_name: String,
    /// Explicit DLL path (skips auto-download).
    pub explicit_dll_path: Option<PathBuf>,
    /// DLL version tag to download.
    pub dll_version: Option<String>,
    /// DLL cache directory.
    pub cache_dir: Option<PathBuf>,
    /// Optional chat template (Jinja). If not provided, uses model metadata.
    pub chat_template: Option<String>,
}

impl Default for InferenceConfig {
    fn default() -> Self {
        Self {
            backend: llama_cpp_v3::backend::Backend::Cpu,
            model_path: String::new(),
            n_gpu_layers: 0,
            n_ctx: 8192,
            app_name: "llama-cpp-v3-agent-sdk".to_string(),
            explicit_dll_path: None,
            dll_version: None,
            cache_dir: None,
            chat_template: None,
        }
    }
}

/// Common chat templates for models that lack them.
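///
/// These can be passed via [`InferenceConfig::chat_template`] when the GGUF
/// metadata does not include a usable template (whether a given model needs
/// this depends on the file; the path below is illustrative):
///
/// ```no_run
/// use llama_cpp_v3_agent_sdk::inference::{templates, InferenceConfig, InferenceEngine};
///
/// let _engine = InferenceEngine::load(InferenceConfig {
///     model_path: "qwen.gguf".into(),
///     chat_template: Some(templates::CHATML.to_string()),
///     ..Default::default()
/// }).expect("failed to load model");
/// ```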
pub mod templates {
    /// Llama-3 / Llama-3.1 chat template.
    pub const LLAMA_3: &str = "{% set loop_messages = messages %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% endif %}";

    /// ChatML template (used by Qwen, Yi, Hermes, etc.).
    pub const CHATML: &str = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\\n'}}{% endif %}";

    /// Llama-2 template.
    pub const LLAMA_2: &str = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% else %}{{ message['content'] }}{% endif %}{% endfor %}";
}

/// Shared inference engine that holds the backend + model in `Arc`s.
///
/// This is the resource-heavy object: it loads the DLL and the model weights.
/// Multiple agents can share the same `InferenceEngine` — each agent just
/// creates its own lightweight `LlamaContext`.
///
/// # Example
/// ```no_run
/// use llama_cpp_v3_agent_sdk::inference::{InferenceEngine, InferenceConfig};
/// use llama_cpp_v3::backend::Backend;
/// use std::sync::Arc;
///
/// let engine = Arc::new(InferenceEngine::load(InferenceConfig {
///     backend: Backend::Vulkan,
///     model_path: "model.gguf".into(),
///     n_gpu_layers: 99,
///     ..Default::default()
/// }).expect("Failed to load model"));
///
/// // Share with multiple agents:
/// let agent1 = llama_cpp_v3_agent_sdk::AgentBuilder::new()
///     .engine(engine.clone())
///     .build().unwrap();
/// let agent2 = llama_cpp_v3_agent_sdk::AgentBuilder::new()
///     .engine(engine.clone())
///     .system_prompt("You are a different agent.")
///     .build().unwrap();
/// ```
#[derive(Clone)]
pub struct InferenceEngine {
    pub backend: Arc<LlamaBackend>,
    pub model: Arc<LlamaModel>,
    pub config: InferenceConfig,
}

impl InferenceEngine {
    /// Load a model from the given configuration.
    ///
    /// This performs the expensive operations (DLL loading, model weight loading)
    /// exactly once. The returned engine can be wrapped in `Arc` and shared.
    pub fn load(config: InferenceConfig) -> Result<Self, AgentError> {
        let path = Path::new(&config.model_path);
        if !path.exists() {
            return Err(AgentError::Other(format!(
                "Model file not found: {}",
                config.model_path
            )));
        }

        let options = LoadOptions {
            backend: config.backend,
            app_name: &config.app_name,
            version: config.dll_version.as_deref(),
            explicit_path: config.explicit_dll_path.as_deref(),
            cache_dir: config.cache_dir.clone(),
        };

        let backend = LlamaBackend::load(options)?;

        let mut model_params = LlamaModel::default_params(&backend);
        model_params.n_gpu_layers = config.n_gpu_layers;

        let path_str = config.model_path.replace('\\', "/");
        let model = LlamaModel::load_from_file(&backend, &path_str, model_params)?;

        Ok(Self {
            backend: Arc::new(backend),
            model: Arc::new(model),
            config,
        })
    }

    /// Create a new `LlamaContext` from this engine.
    ///
    /// Each agent should have its own context (it holds the KV cache).
    /// The `n_ctx` override lets callers use a different context size
    /// than the engine default.
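    ///
    /// # Example
    /// ```no_run
    /// # use llama_cpp_v3_agent_sdk::inference::{InferenceConfig, InferenceEngine};
    /// # let engine = InferenceEngine::load(InferenceConfig::default()).unwrap();
    /// // One context per agent; override the window size where needed (4096 is illustrative).
    /// let ctx = engine.create_context(Some(4096)).expect("context creation failed");
    /// ```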
    pub fn create_context(&self, n_ctx_override: Option<u32>) -> Result<LlamaContext, AgentError> {
        let mut ctx_params = LlamaContext::default_params(&self.model);
        ctx_params.n_ctx = n_ctx_override.unwrap_or(self.config.n_ctx);
        let ctx = LlamaContext::new(&self.model, ctx_params)?;
        Ok(ctx)
    }

    /// Access the raw `LlamaModel`.
    pub fn model(&self) -> &LlamaModel {
        &self.model
    }

    /// Access the raw `LlamaBackend`.
    pub fn backend(&self) -> &LlamaBackend {
        &self.backend
    }

    /// Get the `Arc<LlamaLib>` for creating samplers and batches.
    pub fn lib(&self) -> Arc<llama_cpp_sys_v3::LlamaLib> {
        self.backend.lib.clone()
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Inference Scheduler
// ─────────────────────────────────────────────────────────────────────────────

/// Controls how many agents can perform inference at the same time.
///
/// This is a simple counting semaphore: agents call `acquire()` before running
/// their inference loop, and the returned [`InferencePermit`] releases the slot
/// when it is dropped. If all `max_concurrent` slots are already in use,
/// `acquire()` blocks until one is freed.
///
/// # Why?
///
/// Each agent has its own `LlamaContext` (KV cache) which is independent and
/// thread-safe. But all contexts share the same GPU for compute. Running too
/// many inferences in parallel can:
/// - Exhaust GPU VRAM (multiple KV caches)
/// - Thrash the GPU scheduler (context switches)
/// - Cause OOM errors on smaller GPUs
///
/// A scheduler with `max_concurrent = 1` serializes all inference (like the
/// worker-thread pattern in `vnai::ai`), while higher values allow controlled
/// parallelism.
///
/// # Example
/// ```
/// use llama_cpp_v3_agent_sdk::InferenceScheduler;
/// use std::sync::Arc;
///
/// // Allow at most 2 agents to infer concurrently:
/// let scheduler = Arc::new(InferenceScheduler::new(2));
///
/// // Use with AgentBuilder:
/// // AgentBuilder::new()
/// //     .engine(engine.clone())
/// //     .scheduler(scheduler.clone())
/// //     .build()?;
/// ```
pub struct InferenceScheduler {
    state: Mutex<SchedulerState>,
    cond: Condvar,
    pool: Mutex<Vec<LlamaContext>>,
}

struct SchedulerState {
    max_concurrent: usize,
    active: usize,
}

/// RAII guard — releases the scheduler slot and returns the context to the pool on drop.
pub struct InferencePermit<'a> {
    scheduler: &'a InferenceScheduler,
    context: Option<LlamaContext>,
}

impl<'a> InferencePermit<'a> {
    /// Access the leased context.
    ///
    /// Returns `None` if the pool was empty when this permit was acquired
    /// (for example, when [`InferenceScheduler::init_pool`] was never called).
    pub fn context_mut(&mut self) -> Option<&mut LlamaContext> {
        self.context.as_mut()
    }
}

impl<'a> Drop for InferencePermit<'a> {
    fn drop(&mut self) {
        let ctx = self.context.take();
        self.scheduler.release(ctx);
    }
}

impl InferenceScheduler {
    /// Create a new scheduler with the given concurrency limit.
    ///
    /// - `max_concurrent = 1` → fully serialized (one agent at a time)
    /// - `max_concurrent = N` → up to N agents run inference in parallel
    pub fn new(max_concurrent: usize) -> Self {
        assert!(max_concurrent > 0, "max_concurrent must be at least 1");
        Self {
            state: Mutex::new(SchedulerState {
                max_concurrent,
                active: 0,
            }),
            cond: Condvar::new(),
            pool: Mutex::new(Vec::with_capacity(max_concurrent)),
        }
    }

    /// Pre-initialize the context pool with the given engine.
    ///
    /// Creates one `LlamaContext` per concurrency slot up front, so permits
    /// handed out by `acquire()` already carry a context instead of one being
    /// allocated lazily during the first inference runs.
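    ///
    /// # Example
    /// ```no_run
    /// # use llama_cpp_v3_agent_sdk::inference::{InferenceConfig, InferenceEngine};
    /// # use llama_cpp_v3_agent_sdk::InferenceScheduler;
    /// # let engine = InferenceEngine::load(InferenceConfig::default()).unwrap();
    /// let scheduler = InferenceScheduler::new(2);
    /// // Create one context per slot up front (context size here is illustrative).
    /// scheduler.init_pool(&engine, Some(4096)).expect("failed to pre-create contexts");
    /// ```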
    pub fn init_pool(
        &self,
        engine: &InferenceEngine,
        n_ctx: Option<u32>,
    ) -> Result<(), AgentError> {
        let mut pool = self.pool.lock().unwrap();
        let count = self.max_concurrent();
        for _ in 0..count {
            pool.push(engine.create_context(n_ctx)?);
        }
        Ok(())
    }

    /// Acquire a permit (and a pooled context, if one is available). Blocks if
    /// all slots are in use.
    ///
    /// Returns an RAII guard that automatically releases the slot on drop.
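    ///
    /// # Example
    /// ```
    /// use llama_cpp_v3_agent_sdk::InferenceScheduler;
    ///
    /// let scheduler = InferenceScheduler::new(1);
    /// {
    ///     let mut permit = scheduler.acquire();
    ///     // No context pool was initialized here, so the permit carries no context.
    ///     assert!(permit.context_mut().is_none());
    ///     assert_eq!(scheduler.active_count(), 1);
    /// } // dropping the permit frees the slot
    /// assert_eq!(scheduler.active_count(), 0);
    /// ```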
    pub fn acquire(&self) -> InferencePermit<'_> {
        let mut state = self.state.lock().unwrap();
        while state.active >= state.max_concurrent {
            state = self.cond.wait(state).unwrap();
        }
        state.active += 1;

        let context = self.pool.lock().unwrap().pop();
        InferencePermit {
            scheduler: self,
            context,
        }
    }

    /// Try to acquire a permit without blocking.
    ///
    /// Returns `None` if all slots are in use.
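    ///
    /// # Example
    /// ```
    /// use llama_cpp_v3_agent_sdk::InferenceScheduler;
    ///
    /// let scheduler = InferenceScheduler::new(1);
    /// let first = scheduler.try_acquire();
    /// assert!(first.is_some());
    /// // The only slot is taken, so a second attempt fails without blocking.
    /// assert!(scheduler.try_acquire().is_none());
    /// ```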
    pub fn try_acquire(&self) -> Option<InferencePermit<'_>> {
        let mut state = self.state.lock().unwrap();
        if state.active < state.max_concurrent {
            state.active += 1;
            let context = self.pool.lock().unwrap().pop();
            Some(InferencePermit {
                scheduler: self,
                context,
            })
        } else {
            None
        }
    }

    /// Release a slot and return the context to the pool.
    fn release(&self, context: Option<LlamaContext>) {
        if let Some(mut ctx) = context {
            ctx.kv_cache_clear();
            self.pool.lock().unwrap().push(ctx);
        }

        let mut state = self.state.lock().unwrap();
        state.active -= 1;
        self.cond.notify_one();
    }

    /// Number of currently active inferences.
    pub fn active_count(&self) -> usize {
        self.state.lock().unwrap().active
    }

    /// Maximum allowed concurrent inferences.
    pub fn max_concurrent(&self) -> usize {
        self.state.lock().unwrap().max_concurrent
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::thread;

    #[test]
    fn test_scheduler_serialized() {
        let scheduler = Arc::new(InferenceScheduler::new(1));
        let counter = Arc::new(AtomicUsize::new(0));
        let max_seen = Arc::new(AtomicUsize::new(0));

        let mut handles = Vec::new();

        for _ in 0..4 {
            let sched = scheduler.clone();
            let cnt = counter.clone();
            let max = max_seen.clone();

            handles.push(thread::spawn(move || {
                let _permit = sched.acquire();
                let current = cnt.fetch_add(1, Ordering::SeqCst) + 1;
                // Record the max concurrency we observe
                max.fetch_max(current, Ordering::SeqCst);
                thread::sleep(std::time::Duration::from_millis(10));
                cnt.fetch_sub(1, Ordering::SeqCst);
            }));
        }

        for h in handles {
            h.join().unwrap();
        }

        // With max_concurrent=1, we should never have seen > 1 active
        assert_eq!(max_seen.load(Ordering::SeqCst), 1);
    }

    #[test]
    fn test_scheduler_parallel() {
        let scheduler = Arc::new(InferenceScheduler::new(3));
        let counter = Arc::new(AtomicUsize::new(0));
        let max_seen = Arc::new(AtomicUsize::new(0));

        let mut handles = Vec::new();

        for _ in 0..6 {
            let sched = scheduler.clone();
            let cnt = counter.clone();
            let max = max_seen.clone();

            handles.push(thread::spawn(move || {
                let _permit = sched.acquire();
                let current = cnt.fetch_add(1, Ordering::SeqCst) + 1;
                max.fetch_max(current, Ordering::SeqCst);
                thread::sleep(std::time::Duration::from_millis(50));
                cnt.fetch_sub(1, Ordering::SeqCst);
            }));
        }

        for h in handles {
            h.join().unwrap();
        }

        // With max_concurrent=3, should have seen > 1 active (likely 3)
        assert!(max_seen.load(Ordering::SeqCst) > 1);
        // And never more than 3
        assert!(max_seen.load(Ordering::SeqCst) <= 3);
    }

    #[test]
    fn test_try_acquire() {
        let scheduler = InferenceScheduler::new(1);
        let _permit = scheduler.acquire();
        assert!(scheduler.try_acquire().is_none());
        assert_eq!(scheduler.active_count(), 1);
    }
}