Skip to main content

provider_agent/backend/
vllm.rs

1//! vLLM backend adapter. See `research/inference-engines.md` for details.
2//!
3//! - Default port: 8000
4//! - Health: `GET /health`
5//! - Models: `GET /v1/models`
6//! - Execute: `POST /v1/chat/completions` (OpenAI-compatible SSE stream)
7
8use async_trait::async_trait;
9
10use super::http::{
11    build_client, get_json, parse_openai_models, probe, stream_chat_completions, trim_url,
12};
13use super::{Backend, BackendHealth, BackendModel, BackendResult, Job, JobResult, JobSink};
14
15pub struct VllmBackend {
16    id: String,
17    base_url: String,
18    client: reqwest::Client,
19}
20
21impl VllmBackend {
22    pub fn new(url: &str) -> Self {
23        let base_url = trim_url(url).to_string();
24        Self {
25            id: format!("vllm:{base_url}"),
26            base_url,
27            client: build_client(),
28        }
29    }
30}
31
32#[async_trait]
33impl Backend for VllmBackend {
34    fn kind(&self) -> &'static str {
35        "vllm"
36    }
37
38    fn id(&self) -> &str {
39        &self.id
40    }
41
42    async fn list_models(&self) -> BackendResult<Vec<BackendModel>> {
43        let url = format!("{}/v1/models", self.base_url);
44        let v = get_json(&self.client, &url, None).await?;
45        Ok(parse_openai_models(&v, true))
46    }
47
48    async fn health(&self) -> BackendResult<BackendHealth> {
49        let url = format!("{}/health", self.base_url);
50        match probe(&self.client, &url, None).await {
51            Ok(latency_ms) => Ok(BackendHealth {
52                reachable: true,
53                latency_ms: Some(latency_ms),
54                last_error: None,
55            }),
56            Err(e) => Ok(BackendHealth {
57                reachable: false,
58                latency_ms: None,
59                last_error: Some(e.to_string()),
60            }),
61        }
62    }
63
64    async fn execute(&self, job: &Job, sink: &mut dyn JobSink) -> BackendResult<JobResult> {
65        let endpoint = format!("{}/v1/chat/completions", self.base_url);
66        stream_chat_completions(&self.client, &endpoint, None, job, sink).await
67    }
68}