Skip to main content

provider_agent/backend/
llamacpp.rs

1//! llama.cpp `server` adapter. OpenAI-compatible.
2//!
3//! - Default port: 8080
4//! - Health: `GET /health`
5//! - Models: `GET /v1/models` (often a single entry — the loaded GGUF)
6//! - Execute: `POST /v1/chat/completions`
7
8use async_trait::async_trait;
9
10use super::http::{
11    build_client, get_json, parse_openai_models, probe, stream_chat_completions, trim_url,
12};
13use super::{Backend, BackendHealth, BackendModel, BackendResult, Job, JobResult, JobSink};
14
15pub struct LlamaCppBackend {
16    id: String,
17    base_url: String,
18    client: reqwest::Client,
19}
20
21impl LlamaCppBackend {
22    pub fn new(url: &str) -> Self {
23        let base_url = trim_url(url).to_string();
24        Self {
25            id: format!("llamacpp:{base_url}"),
26            base_url,
27            client: build_client(),
28        }
29    }
30}
31
32#[async_trait]
33impl Backend for LlamaCppBackend {
34    fn kind(&self) -> &'static str {
35        "llamacpp"
36    }
37
38    fn id(&self) -> &str {
39        &self.id
40    }
41
42    async fn list_models(&self) -> BackendResult<Vec<BackendModel>> {
43        let url = format!("{}/v1/models", self.base_url);
44        let v = get_json(&self.client, &url, None).await?;
45        // llama.cpp returns OpenAI-shaped `data: [...]`, but some older builds
46        // return a bare object. Try both.
47        let mut models = parse_openai_models(&v, true);
48        if models.is_empty() {
49            if let Some(id) = v.get("id").and_then(|s| s.as_str()) {
50                models.push(BackendModel {
51                    model_id: id.to_string(),
52                    context_window: None,
53                    native: true,
54                });
55            }
56        }
57        Ok(models)
58    }
59
60    async fn health(&self) -> BackendResult<BackendHealth> {
61        let url = format!("{}/health", self.base_url);
62        match probe(&self.client, &url, None).await {
63            Ok(latency_ms) => Ok(BackendHealth {
64                reachable: true,
65                latency_ms: Some(latency_ms),
66                last_error: None,
67            }),
68            Err(e) => Ok(BackendHealth {
69                reachable: false,
70                latency_ms: None,
71                last_error: Some(e.to_string()),
72            }),
73        }
74    }
75
76    async fn execute(&self, job: &Job, sink: &mut dyn JobSink) -> BackendResult<JobResult> {
77        let endpoint = format!("{}/v1/chat/completions", self.base_url);
78        stream_chat_completions(&self.client, &endpoint, None, job, sink).await
79    }
80}