// offline_intelligence/model_runtime/ggml_runtime.rs

//! GGML Runtime Adapter (legacy format)
//! Similar to GGUF but for older llama.cpp GGML models

use async_trait::async_trait;

use super::gguf_runtime::GGUFRuntime;
use super::runtime_trait::*;

8/// GGML runtime - reuses GGUF runtime implementation since llama-server supports both
9pub struct GGMLRuntime {
10    inner: GGUFRuntime,
11}
12
13impl GGMLRuntime {
14    pub fn new() -> Self {
15        Self {
16            inner: GGUFRuntime::new(),
17        }
18    }
19}
20
21impl Default for GGMLRuntime {
22    fn default() -> Self {
23        Self::new()
24    }
25}
26
27#[async_trait]
28impl ModelRuntime for GGMLRuntime {
29    fn supported_format(&self) -> ModelFormat {
30        ModelFormat::GGML
31    }
32
33    async fn initialize(&mut self, mut config: RuntimeConfig) -> anyhow::Result<()> {
34        // GGML uses the same llama-server as GGUF
35        config.format = ModelFormat::GGUF; // Internal override
36        self.inner.initialize(config).await
37    }
38
39    async fn is_ready(&self) -> bool {
40        self.inner.is_ready().await
41    }
42
43    async fn health_check(&self) -> anyhow::Result<String> {
44        self.inner.health_check().await
45    }
46
47    fn base_url(&self) -> String {
48        self.inner.base_url()
49    }
50
51    async fn generate(&self, request: InferenceRequest) -> anyhow::Result<InferenceResponse> {
52        self.inner.generate(request).await
53    }
54
55    async fn generate_stream(
56        &self,
57        request: InferenceRequest,
58    ) -> anyhow::Result<Box<dyn futures_util::Stream<Item = Result<String, anyhow::Error>> + Send + Unpin>> {
59        self.inner.generate_stream(request).await
60    }
61
62    async fn shutdown(&mut self) -> anyhow::Result<()> {
63        self.inner.shutdown().await
64    }
65
66    fn metadata(&self) -> RuntimeMetadata {
67        RuntimeMetadata {
68            format: ModelFormat::GGML,
69            runtime_name: "llama.cpp (llama-server)".to_string(),
70            version: "latest".to_string(),
71            supports_gpu: true,
72            supports_streaming: true,
73        }
74    }
75}