//! Agent construction and execution for the `llama_cpp_v3_agent_sdk` crate.
//!
//! File: `llama_cpp_v3_agent_sdk/agent.rs` — defines [`AgentBuilder`] (fluent
//! configuration) and [`Agent`] (a configured agent running tool-use loops).
1use crate::agent_loop::{AgentEvent, AgentLoopConfig, KvCacheState, run_agent_loop};
2use crate::agents_md::AgentsMdRegistry;
3use crate::conversation::Conversation;
4use crate::error::AgentError;
5use crate::inference::{InferenceConfig, InferenceEngine, InferenceScheduler};
6use crate::permission::{PermissionMode, PermissionTracker};
7use crate::skills::SkillRegistry;
8use crate::tool::{Tool, ToolRegistry};
9use crate::tools;
10use llama_cpp_v3::LlamaContext;
11use std::path::PathBuf;
12use std::sync::Arc;
13
/// Builder for constructing an `Agent` with custom configuration.
///
/// Supports two modes:
///
/// 1. **Standalone** – the builder loads the model itself (simple, one agent):
/// ```no_run
/// # use llama_cpp_v3_agent_sdk::AgentBuilder;
/// # use llama_cpp_v3::backend::Backend;
/// let agent = AgentBuilder::new()
///     .backend(Backend::Cpu)
///     .model_path("model.gguf")
///     .build()
///     .expect("Failed to build agent");
/// ```
///
/// 2. **Shared engine** – multiple agents share one model (no redundant loading):
/// ```no_run
/// # use llama_cpp_v3_agent_sdk::{AgentBuilder, InferenceEngine, InferenceConfig};
/// # use llama_cpp_v3::backend::Backend;
/// # use std::sync::Arc;
/// let engine = Arc::new(InferenceEngine::load(InferenceConfig {
///     backend: Backend::Vulkan,
///     model_path: "model.gguf".into(),
///     n_gpu_layers: 99,
///     ..Default::default()
/// }).unwrap());
///
/// let agent_a = AgentBuilder::new()
///     .engine(engine.clone())
///     .system_prompt("You are agent A.")
///     .build().unwrap();
///
/// let agent_b = AgentBuilder::new()
///     .engine(engine.clone())
///     .system_prompt("You are agent B.")
///     .build().unwrap();
/// ```
pub struct AgentBuilder {
    // ── Inference source (mutually exclusive) ───────────────────────────
    /// Shared engine (takes priority over standalone fields).
    shared_engine: Option<Arc<InferenceEngine>>,

    // ── Standalone model-loading fields (used when no shared engine) ─────
    /// Compute backend for a standalone engine (CPU by default).
    backend_type: llama_cpp_v3::backend::Backend,
    /// Path to the GGUF model file; required when no shared engine is set.
    model_path: Option<String>,
    /// Layers offloaded to GPU (-1 = all, 0 = none).
    n_gpu_layers: i32,
    /// Application name, used when deriving the cache directory.
    app_name: String,
    /// Directory for caching downloaded DLLs (None = derived default).
    cache_dir: Option<PathBuf>,
    /// Explicit llama.cpp DLL path; bypasses download when set.
    explicit_dll_path: Option<PathBuf>,
    /// DLL version tag to download when no explicit path is given.
    dll_version: Option<String>,
    /// Custom Jinja chat template overriding the model's built-in one.
    chat_template: Option<String>,

    // ── Per-agent fields (always used) ──────────────────────────────────
    /// Base system prompt; skills/AGENTS.md/tools sections are appended in `build()`.
    system_prompt: String,
    /// Context window size in tokens.
    n_ctx: u32,
    /// Sampling and loop-limit parameters passed to the agent loop.
    loop_config: AgentLoopConfig,
    /// How tool calls are approved (auto-approve or callback).
    permission_mode: PermissionMode,
    /// Caller-supplied tools registered in addition to (or instead of) built-ins.
    custom_tools: Vec<Box<dyn Tool>>,
    /// When true, built-in tools (bash, read, write, edit, glob) are not registered.
    skip_builtin_tools: bool,
    // Skills
    /// When false, skill discovery is disabled entirely.
    enable_skills: bool,
    /// Additional directories searched for skills beyond the defaults.
    extra_skills_paths: Vec<PathBuf>,
    /// Skill names to load explicitly; empty means "load all discovered".
    activated_skills: Vec<String>,
    // AGENTS.md
    /// When false, AGENTS.md discovery is disabled entirely.
    enable_agents_md: bool,
    // Scheduler
    /// Optional scheduler limiting concurrent inferences; when set, the agent
    /// uses pooled contexts instead of owning one.
    scheduler: Option<Arc<InferenceScheduler>>,
}
82
83impl AgentBuilder {
84    pub fn new() -> Self {
85        Self {
86            shared_engine: None,
87            backend_type: llama_cpp_v3::backend::Backend::Cpu,
88            model_path: None,
89            n_gpu_layers: 0,
90            n_ctx: 8192,
91            app_name: "llama-cpp-v3-agent-sdk".to_string(),
92            cache_dir: None,
93            explicit_dll_path: None,
94            dll_version: None,
95            chat_template: None,
96            system_prompt: DEFAULT_SYSTEM_PROMPT.to_string(),
97            loop_config: AgentLoopConfig::default(),
98            permission_mode: PermissionMode::AutoApprove,
99            custom_tools: Vec::new(),
100            skip_builtin_tools: false,
101            enable_skills: true,
102            extra_skills_paths: Vec::new(),
103            activated_skills: Vec::new(),
104            enable_agents_md: true,
105            scheduler: None,
106        }
107    }
108
109    // ── Shared engine ───────────────────────────────────────────────────
110
111    /// Use a shared `InferenceEngine` instead of loading a new model.
112    ///
113    /// When set, `backend()`, `model_path()`, `n_gpu_layers()`, `dll_*()`,
114    /// `cache_dir()`, and `app_name()` are ignored — the engine already
115    /// has those configured.
116    pub fn engine(mut self, engine: Arc<InferenceEngine>) -> Self {
117        self.shared_engine = Some(engine);
118        self
119    }
120
121    // ── Standalone model-loading (used when no shared engine) ───────────
122
123    /// Set the compute backend (CPU, CUDA, Vulkan, etc.)
124    pub fn backend(mut self, backend: llama_cpp_v3::backend::Backend) -> Self {
125        self.backend_type = backend;
126        self
127    }
128
129    /// Path to the GGUF model file.
130    pub fn model_path(mut self, path: &str) -> Self {
131        self.model_path = Some(path.to_string());
132        self
133    }
134
135    /// Number of layers to offload to GPU (-1 = all, 0 = none).
136    pub fn n_gpu_layers(mut self, n: i32) -> Self {
137        self.n_gpu_layers = n;
138        self
139    }
140
141    /// Application name (used for cache directory).
142    pub fn app_name(mut self, name: &str) -> Self {
143        self.app_name = name.to_string();
144        self
145    }
146
147    /// Directory for caching downloaded DLLs.
148    pub fn cache_dir(mut self, dir: PathBuf) -> Self {
149        self.cache_dir = Some(dir);
150        self
151    }
152
153    /// Explicit path to the llama.cpp DLL (bypasses download).
154    pub fn explicit_dll_path(mut self, path: PathBuf) -> Self {
155        self.explicit_dll_path = Some(path);
156        self
157    }
158
159    /// DLL version tag to download.
160    pub fn dll_version(mut self, version: &str) -> Self {
161        self.dll_version = Some(version.to_string());
162        self
163    }
164 
165    /// Set a custom chat template (Jinja).
166    pub fn chat_template(mut self, template: &str) -> Self {
167        self.chat_template = Some(template.to_string());
168        self
169    }
170
171    // ── Per-agent configuration ─────────────────────────────────────────
172
173    /// System prompt that instructs the model on its role and tool usage.
174    pub fn system_prompt(mut self, prompt: &str) -> Self {
175        self.system_prompt = prompt.to_string();
176        self
177    }
178
179    /// Context window size in tokens.
180    pub fn n_ctx(mut self, n: u32) -> Self {
181        self.n_ctx = n;
182        self
183    }
184
185    /// Maximum agent loop iterations (0 = unlimited).
186    pub fn max_iterations(mut self, n: usize) -> Self {
187        self.loop_config.max_iterations = n;
188        self
189    }
190
191    /// Maximum tokens per model completion.
192    pub fn max_tokens_per_completion(mut self, n: usize) -> Self {
193        self.loop_config.max_tokens_per_completion = n;
194        self
195    }
196
197    /// Sampling temperature.
198    pub fn temperature(mut self, temp: f32) -> Self {
199        self.loop_config.temperature = temp;
200        self
201    }
202
203    /// Top-K sampling parameter.
204    pub fn top_k(mut self, k: i32) -> Self {
205        self.loop_config.top_k = k;
206        self
207    }
208
209    /// Min-P sampling parameter.
210    pub fn min_p(mut self, p: f32) -> Self {
211        self.loop_config.min_p = p;
212        self
213    }
214
215    /// Repetition penalty.
216    pub fn repeat_penalty(mut self, p: f32) -> Self {
217        self.loop_config.repeat_penalty = p;
218        self
219    }
220
221    /// Add a stop sequence.
222    pub fn stop_sequence(mut self, stop: &str) -> Self {
223        self.loop_config.stop_sequences.push(stop.to_string());
224        self
225    }
226
227    /// Auto-approve all tool calls (YOLO mode — dangerous!).
228    pub fn auto_approve(mut self) -> Self {
229        self.permission_mode = PermissionMode::AutoApprove;
230        self
231    }
232
233    /// Set a permission callback for interactive approval.
234    pub fn permission_callback(
235        mut self,
236        cb: impl Fn(&crate::permission::PermissionRequest) -> crate::permission::PermissionDecision
237            + Send
238            + Sync
239            + 'static,
240    ) -> Self {
241        self.permission_mode = PermissionMode::Callback(Box::new(cb));
242        self
243    }
244
245    /// Add a custom tool.
246    pub fn tool(mut self, tool: Box<dyn Tool>) -> Self {
247        self.custom_tools.push(tool);
248        self
249    }
250
251    /// Skip registering built-in tools (bash, read, write, edit, glob).
252    pub fn skip_builtin_tools(mut self) -> Self {
253        self.skip_builtin_tools = true;
254        self
255    }
256
257    /// Disable skill discovery entirely.
258    pub fn no_skills(mut self) -> Self {
259        self.enable_skills = false;
260        self
261    }
262
263    /// Add an extra directory to search for skills.
264    pub fn skills_path(mut self, path: PathBuf) -> Self {
265        self.extra_skills_paths.push(path);
266        self
267    }
268
269    /// Explicitly activate a skill by name.
270    pub fn activate_skill(mut self, name: &str) -> Self {
271        self.activated_skills.push(name.to_string());
272        self
273    }
274
275    /// Disable AGENTS.md discovery entirely.
276    pub fn no_agents_md(mut self) -> Self {
277        self.enable_agents_md = false;
278        self
279    }
280
281    /// Set an inference scheduler to limit concurrent inferences.
282    ///
283    /// Use `InferenceScheduler::new(1)` to serialize all inference (one
284    /// agent at a time), or a higher value for controlled parallelism.
285    /// Without a scheduler, agents run truly parallel (safe, but GPU-heavy).
286    pub fn scheduler(mut self, scheduler: Arc<InferenceScheduler>) -> Self {
287        self.scheduler = Some(scheduler);
288        self
289    }
290
291    /// Build the agent.
292    ///
293    /// If a shared engine was provided via `.engine()`, it is reused.
294    /// Otherwise, a new engine is created from the standalone fields.
295    pub fn build(self) -> Result<Agent, AgentError> {
296        // ── Resolve the inference engine ─────────────────────────────────
297        let engine = if let Some(engine) = self.shared_engine {
298            engine
299        } else {
300            let model_path = self
301                .model_path
302                .ok_or_else(|| AgentError::Other(
303                    "No model path specified. Use .model_path() or .engine().".to_string(),
304                ))?;
305
306            let config = InferenceConfig {
307                backend: self.backend_type,
308                model_path,
309                n_gpu_layers: self.n_gpu_layers,
310                n_ctx: self.n_ctx,
311                app_name: self.app_name,
312                explicit_dll_path: self.explicit_dll_path,
313                dll_version: self.dll_version,
314                cache_dir: self.cache_dir,
315                chat_template: self.chat_template,
316            };
317
318            Arc::new(InferenceEngine::load(config)?)
319        };
320
321        // ── Resolve the context ─────────────────────────────────────────
322        // If we have a scheduler, we skip creating a dedicated context for this agent
323        // and instead rely on the pooled contexts during inference.
324        let ctx = if self.scheduler.is_some() {
325            None
326        } else {
327            Some(engine.create_context(Some(self.n_ctx))?)
328        };
329
330        // ── Build tool registry ─────────────────────────────────────────
331        let mut tool_registry = ToolRegistry::new();
332        if !self.skip_builtin_tools {
333            tools::register_builtin_tools(&mut tool_registry);
334        }
335        for tool in self.custom_tools {
336            tool_registry.register(tool);
337        }
338
339        // ── Skills ──────────────────────────────────────────────────────
340        let mut skill_registry = SkillRegistry::new();
341        if self.enable_skills {
342            skill_registry.add_default_paths();
343            for path in &self.extra_skills_paths {
344                skill_registry.add_search_path(path.clone());
345            }
346            skill_registry.discover();
347
348            if self.activated_skills.is_empty() {
349                skill_registry.load_all();
350            } else {
351                for name in &self.activated_skills {
352                    skill_registry.load(name);
353                }
354            }
355        }
356
357        // ── AGENTS.md ───────────────────────────────────────────────────
358        let mut agents_md_registry = AgentsMdRegistry::new();
359        if self.enable_agents_md {
360            agents_md_registry.discover();
361        }
362
363        // ── Assemble system prompt ──────────────────────────────────────
364        let tools_prompt = tool_registry.tools_prompt();
365        let skills_prompt = if self.enable_skills {
366            let summary = skill_registry.skills_summary_prompt();
367            let loaded = skill_registry.loaded_skills_prompt();
368            if summary.is_empty() && loaded.is_empty() {
369                String::new()
370            } else {
371                format!("{}\n{}", summary, loaded)
372            }
373        } else {
374            String::new()
375        };
376        let agents_md_prompt = agents_md_registry.agents_md_prompt();
377
378        let mut full_system_prompt = self.system_prompt.clone();
379        if !agents_md_prompt.is_empty() {
380            full_system_prompt.push_str("\n\n");
381            full_system_prompt.push_str(&agents_md_prompt);
382        }
383        if !skills_prompt.is_empty() {
384            full_system_prompt.push_str("\n\n");
385            full_system_prompt.push_str(&skills_prompt);
386        }
387        full_system_prompt.push_str("\n\n");
388        full_system_prompt.push_str(&tools_prompt);
389
390        let conversation = Conversation::with_system_prompt(&full_system_prompt);
391
392        Ok(Agent {
393            engine,
394            ctx,
395            conversation,
396            tool_registry,
397            permissions: PermissionTracker::new(self.permission_mode),
398            loop_config: self.loop_config,
399            skill_registry,
400            agents_md_registry,
401            scheduler: self.scheduler,
402            kv_cache: KvCacheState::new(),
403        })
404    }
405}
406
407impl Default for AgentBuilder {
408    fn default() -> Self {
409        Self::new()
410    }
411}
412
/// A configured agent ready to accept user messages and execute tool-use loops.
///
/// The agent holds a shared reference to an `InferenceEngine` (model + backend)
/// and its own `LlamaContext` (KV cache). Multiple agents can share the same
/// engine without loading the model multiple times.
pub struct Agent {
    /// Shared model + backend; may be shared across multiple agents.
    engine: Arc<InferenceEngine>,
    /// Dedicated context (KV cache); `None` when a scheduler supplies
    /// pooled contexts instead.
    ctx: Option<LlamaContext>,
    /// Full message history, starting with the assembled system prompt.
    conversation: Conversation,
    /// Tools the model may invoke during the agent loop.
    tool_registry: ToolRegistry,
    /// Tracks tool-call approval decisions per the configured mode.
    permissions: PermissionTracker,
    /// Sampling and loop-limit parameters for each run.
    loop_config: AgentLoopConfig,
    /// Discovered/loaded skills contributing to the system prompt.
    skill_registry: SkillRegistry,
    /// Discovered AGENTS.md content contributing to the system prompt.
    agents_md_registry: AgentsMdRegistry,
    /// Optional limiter for concurrent inference; provides pooled contexts.
    scheduler: Option<Arc<InferenceScheduler>>,
    /// Prefix-cache bookkeeping; invalidated when history changes abruptly.
    kv_cache: KvCacheState,
}
430
431impl Agent {
432    /// Create an `AgentBuilder` for step-by-step configuration.
433    pub fn builder() -> AgentBuilder {
434        AgentBuilder::new()
435    }
436
437    /// Send a user message and run the agent loop until completion.
438    ///
439    /// The `on_event` callback receives streaming events as the agent generates
440    /// text and executes tools.
441    pub fn chat(
442        &mut self,
443        user_message: &str,
444        on_event: impl FnMut(AgentEvent),
445    ) -> Result<(), AgentError> {
446        self.conversation.add_user(user_message);
447
448        // Acquire a scheduler permit if configured.
449        // The permit is held for the entire agent loop and released on drop.
450        let mut permit = self.scheduler.as_ref().map(|s| s.acquire());
451
452        // Use the pooled context from the permit if available, otherwise fall back to the agent's own context.
453        let ctx = if let Some(p) = &mut permit {
454            p.context_mut()
455                .or(self.ctx.as_mut())
456                .ok_or_else(|| AgentError::Other("No context available for inference (no pool and no owned context)".to_string()))?
457        } else {
458            self.ctx.as_mut().ok_or_else(|| AgentError::Other("Agent has no owned context and no scheduler was provided".to_string()))?
459        };
460
461        run_agent_loop(
462            &self.engine,
463            ctx,
464            &mut self.conversation,
465            &self.tool_registry,
466            &mut self.permissions,
467            &self.loop_config,
468            &mut self.kv_cache,
469            on_event,
470        )
471    }
472
473    /// Send a user message and collect the full response text.
474    ///
475    /// Convenience method that runs the agent loop and returns the final
476    /// concatenated response text.
477    pub fn chat_simple(&mut self, user_message: &str) -> Result<String, AgentError> {
478        let mut response = String::new();
479
480        self.chat(user_message, |event| match event {
481            AgentEvent::TextDelta(text) => response.push_str(&text),
482            _ => {}
483        })?;
484
485        Ok(response)
486    }
487
488    /// Access the shared inference engine.
489    pub fn engine(&self) -> &Arc<InferenceEngine> {
490        &self.engine
491    }
492
493    /// Access the conversation history.
494    pub fn conversation(&self) -> &Conversation {
495        &self.conversation
496    }
497
498    /// Mutable access to the conversation history.
499    pub fn conversation_mut(&mut self) -> &mut Conversation {
500        &mut self.conversation
501    }
502
503    /// Access the tool registry.
504    pub fn tools(&self) -> &ToolRegistry {
505        &self.tool_registry
506    }
507
508    /// Register an additional tool at runtime.
509    pub fn register_tool(&mut self, tool: Box<dyn Tool>) {
510        self.tool_registry.register(tool);
511    }
512
513    /// Access the skill registry.
514    pub fn skills(&self) -> &SkillRegistry {
515        &self.skill_registry
516    }
517
518    /// Access the AGENTS.md registry.
519    pub fn agents_md(&self) -> &AgentsMdRegistry {
520        &self.agents_md_registry
521    }
522
523    /// Clear the conversation history (keeping the system prompt).
524    pub fn clear_history(&mut self) {
525        // Preserve the system prompt
526        let msgs = self.conversation.messages().to_vec();
527        self.conversation.clear();
528        if let Some(sys) = msgs.first() {
529            if sys.role == crate::conversation::Role::System {
530                self.conversation.add_system(&sys.content);
531            }
532        }
533        // Invalidate KV cache — the conversation changed dramatically
534        self.kv_cache.invalidate();
535    }
536}
537
/// Default system prompt installed by `AgentBuilder::new`; callers override
/// it via `AgentBuilder::system_prompt`. The trailing `\` continuations keep
/// long sentences on one logical line without embedding newlines.
const DEFAULT_SYSTEM_PROMPT: &str = "\
You are a helpful AI coding assistant. You can interact with the user's codebase \
and system using the tools available to you.

When the user asks you to perform a task:
1. Think through the steps needed
2. Use tools to gather information and make changes
3. Verify your work when appropriate
4. Explain what you did

Be precise and careful with file edits. Always verify file contents before editing.";