inferd_engine/backend.rs
1//! `Backend` trait and shared types.
2
3use async_trait::async_trait;
4use inferd_proto::embed::{EmbedResolved, EmbedUsage};
5use inferd_proto::v2::{ResolvedV2, StopReasonV2, ToolCallId, ToolUseInput, UsageV2};
6use inferd_proto::{Resolved, StopReason, Usage};
7use std::pin::Pin;
8use tokio_stream::Stream;
9
10/// One event in a generation stream.
11///
12/// A successful generation produces zero or more `Token` events terminated by
13/// exactly one `Done`. A failed generation produces zero or more `Token`
14/// events followed by no further events; the adapter returns the failure as
15/// a `GenerateError` from `generate()` (pre-stream) or terminates the stream
16/// without a `Done` (mid-stream) — see ADR 0007 for the failure-semantics
17/// contract.
18#[derive(Debug, Clone, PartialEq, Eq)]
19pub enum TokenEvent {
20 /// One incremental generated token.
21 Token(String),
22 /// Final event for a successful generation.
23 Done {
24 /// Reason generation stopped.
25 stop_reason: StopReason,
26 /// Token-count usage.
27 usage: Usage,
28 },
29}
30
31/// Stream of `TokenEvent` values produced by a backend during generation.
32///
33/// Dropping the stream cancels the in-flight generation. Adapters must wire
34/// drop to their underlying cancellation primitive (e.g. a `CancellationToken`
35/// or by aborting the spawned task).
36pub type TokenStream = Pin<Box<dyn Stream<Item = TokenEvent> + Send>>;
37
38/// One event in a v2 generation stream — typed-content-block surface
39/// per ADR 0015.
40///
41/// v2 separates user-visible text (`Text`) from reasoning trace
42/// (`Thinking`) and emits complete tool-call requests (`ToolUse`) as
43/// their own variant rather than raw tokens. Backends that don't
44/// distinguish thinking content (any non-Gemma-4 backend) emit only
45/// `Text` events.
46#[derive(Debug, Clone, PartialEq)]
47pub enum TokenEventV2 {
48 /// Incremental user-visible text.
49 Text(String),
50 /// Incremental reasoning trace (Gemma 4 `<|think|>` content).
51 Thinking(String),
52 /// Complete tool-call request emitted by the model.
53 ToolUse {
54 /// Identifier paired with the consumer's eventual ToolResult.
55 tool_call_id: ToolCallId,
56 /// Tool name from the request's `tools[]` table.
57 name: String,
58 /// JSON arguments emitted by the model.
59 input: ToolUseInput,
60 },
61 /// Final event for a successful generation.
62 Done {
63 /// Reason generation stopped.
64 stop_reason: StopReasonV2,
65 /// Token-count usage.
66 usage: UsageV2,
67 },
68}
69
70/// Stream of `TokenEventV2` values produced by a backend during a v2
71/// generation. Dropping the stream cancels the in-flight generation.
72pub type TokenStreamV2 = Pin<Box<dyn Stream<Item = TokenEventV2> + Send>>;
73
/// Hardware-acceleration backend the engine adapter is built and
/// running with. Reflects compile-time GGML feature flags. Pure CPU
/// builds (no `cuda` / `metal` / `vulkan` / `rocm` features) report
/// `Cpu`. A build *with* support but where `n_gpu_layers == 0` also
/// effectively uses CPU at runtime — see [`AcceleratorInfo::gpu_layers`].
///
/// New variants may be added in future patch releases (NPU, etc.);
/// older subscribers should treat unknown variants as `Cpu` (the
/// fallback that's always safe).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum AcceleratorKind {
    /// Built without any GPU/accelerator support.
    #[default]
    Cpu,
    /// Built with CUDA — NVIDIA GPU offload available.
    Cuda,
    /// Built with Metal — Apple GPU offload available.
    Metal,
    /// Built with Vulkan — cross-vendor GPU offload available.
    Vulkan,
    /// Built with HIP/ROCm — AMD GPU offload available.
    Rocm,
}

impl AcceleratorKind {
    /// Stable wire-form name: `"cpu"`, `"cuda"`, `"metal"`, `"vulkan"`,
    /// `"rocm"`. Used in admin status frames and `inferd doctor`.
    ///
    /// This is a `const fn`, so the name can also be evaluated in
    /// `const` / `static` contexts. The match is deliberately exhaustive
    /// (no `_` arm) so that adding a variant forces a wire name to be
    /// chosen for it at compile time.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Cpu => "cpu",
            Self::Cuda => "cuda",
            Self::Metal => "metal",
            Self::Vulkan => "vulkan",
            Self::Rocm => "rocm",
        }
    }
}
111
112/// Snapshot of the active hardware-acceleration configuration.
113///
114/// `kind` is the compile-time GGML backend this engine was built with;
115/// `gpu_layers` is the runtime configuration the adapter was
116/// constructed with. A backend that compiled with `cuda` but was
117/// configured with `n_gpu_layers = 0` reports `kind = Cuda,
118/// gpu_layers = 0` — i.e. CUDA-capable but currently CPU-bound. The
119/// distinction is useful: it tells consumers the daemon *could*
120/// accelerate if reconfigured, vs. it can never accelerate without a
121/// rebuild.
122#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
123pub struct AcceleratorInfo {
124 /// Compile-time GGML backend.
125 pub kind: AcceleratorKind,
126 /// Layers offloaded to the accelerator at construction time. 0
127 /// means CPU-only at runtime regardless of `kind`.
128 pub gpu_layers: u32,
129}
130
131/// Per-backend capability advertisement. The daemon consults this on
132/// boot to decide whether v2 multimodal / tool-use requests can be
133/// dispatched, and reports the advertised set on the admin status
134/// surface so middleware authors can introspect what the running
135/// daemon can do without trial-and-error.
136///
137/// Per the v0.2 plan: until cloud adapters land, the only adapters
138/// shipped are `mock` and `llamacpp`. Both opt-in selectively —
139/// `mock` for tests, `llamacpp` once Phase 3+ wires mtmd / tool
140/// parsing.
141#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
142pub struct BackendCapabilities {
143 /// `true` if the backend implements `generate_v2` (typed
144 /// content blocks, tool definitions). When `false` the daemon's
145 /// v2 dispatch falls back to `Error{Internal,
146 /// "v2 not supported by this backend"}`.
147 pub v2: bool,
148 /// `true` if the backend can ingest image attachments. Reported
149 /// to consumers; requests with image content blocks against a
150 /// non-image backend get `Error{AttachmentUnsupported,...}`.
151 pub vision: bool,
152 /// `true` if the backend can ingest audio attachments.
153 pub audio: bool,
154 /// `true` if the backend can ingest video attachments. (Reserved.)
155 pub video: bool,
156 /// `true` if the backend natively supports tool-use round-tripping
157 /// (parses `<|tool_call>` from token stream, accepts `tool_result`
158 /// blocks in the next request, etc.).
159 pub tools: bool,
160 /// `true` if the backend separates `<|think|>` reasoning trace
161 /// from user-visible output.
162 pub thinking: bool,
163 /// `true` if the backend implements `embed` (per ADR 0017). When
164 /// `false` the daemon does not bind the embed socket; if it
165 /// somehow gets bound and a request arrives, dispatch returns
166 /// `Error{EmbedUnsupported}`.
167 pub embed: bool,
168 /// Hardware-acceleration snapshot. `Cpu / 0` for the default
169 /// trait impl; `mock` keeps the default; `llamacpp` reports the
170 /// compile-time GGML backend + the configured `n_gpu_layers`.
171 /// Reported on admin `status: capabilities` frames and in
172 /// `inferd doctor` (#77).
173 pub accelerator: AcceleratorInfo,
174}
175
176/// Result of a successful `Backend::embed()` call.
177///
178/// Embedding requests produce a single complete result, not a stream:
179/// one vector per input string in the same order as the request's
180/// `input`. `dimensions` is the actual length of each inner vector
181/// after any MRL truncation.
182#[derive(Debug, Clone, PartialEq)]
183pub struct EmbedResult {
184 /// One vector per input string, in input order. All inner vectors
185 /// share the same length (`dimensions`).
186 pub embeddings: Vec<Vec<f32>>,
187 /// Actual length of each inner vector after any MRL truncation.
188 pub dimensions: u32,
189 /// Backend-reported model name (e.g. `"embeddinggemma-300m"`).
190 pub model: String,
191 /// Token-count usage.
192 pub usage: EmbedUsage,
193}
194
195/// Errors returned by `Backend::embed()`.
196///
197/// Distinct from `GenerateError` because the embed surface has a
198/// different error taxonomy (no streaming → no mid-stream concept;
199/// adds `Unsupported` for the not-an-embed-backend case).
200#[derive(Debug, thiserror::Error)]
201pub enum EmbedError {
202 /// Backend was not ready when `embed()` was called.
203 #[error("backend not ready")]
204 NotReady,
205 /// Backend doesn't expose embedding capability.
206 #[error("embed not supported by this backend")]
207 Unsupported,
208 /// Backend rejected the request as malformed (dimensions out of
209 /// range for the model, input too long for the context, etc.).
210 #[error("invalid request: {0}")]
211 InvalidRequest(String),
212 /// Backend tried to embed and failed (model not loaded, remote
213 /// API errored, etc.).
214 #[error("backend unavailable: {0}")]
215 Unavailable(String),
216 /// Anything else.
217 #[error("internal: {0}")]
218 Internal(String),
219}
220
221/// Errors returned by `Backend::generate()` *before* any tokens have streamed.
222///
223/// Mid-stream failures terminate the stream silently (no `Done` event); the
224/// caller observes the absence of a terminal event and translates that to
225/// `Response::Error` with `code: backend_unavailable` per `docs/protocol-v1.md`.
226#[derive(Debug, thiserror::Error)]
227pub enum GenerateError {
228 /// Backend was not ready when `generate()` was called.
229 #[error("backend not ready")]
230 NotReady,
231 /// Backend rejected the request as malformed (sampling out of range, etc.).
232 #[error("invalid request: {0}")]
233 InvalidRequest(String),
234 /// Backend tried to start generation and failed (model not loaded,
235 /// remote API errored, etc.).
236 #[error("backend unavailable: {0}")]
237 Unavailable(String),
238 /// Anything else.
239 #[error("internal: {0}")]
240 Internal(String),
241}
242
243/// An inference backend.
244///
245/// Implementations are owned by the daemon and shared across requests through
246/// `Arc<dyn Backend>`. Methods take `&self`; concurrent invocations of
247/// `generate()` are serialised by the daemon's admission queue, not by the
248/// trait.
249#[async_trait]
250pub trait Backend: Send + Sync {
251 /// Stable identifier for the backend, e.g. `"mock"`, `"llamacpp"`,
252 /// `"anthropic"`. Echoed in `Response::Done::backend` for diagnostic
253 /// purposes (ADR 0007).
254 fn name(&self) -> &str;
255
256 /// Whether the backend has finished its boot sequence and can serve
257 /// requests. The daemon does not create its inference listener until
258 /// every registered backend reports `true` (see `THREAT_MODEL.md` F-13).
259 fn ready(&self) -> bool;
260
261 /// Capabilities the backend advertises to the daemon and (via
262 /// the admin status surface) to consumers. Default: text-only v1
263 /// backend, no v2, no multimodal, no tools — matches the v0.1
264 /// `mock` and `llamacpp` shape so existing implementors compile
265 /// unchanged.
266 fn capabilities(&self) -> BackendCapabilities {
267 BackendCapabilities::default()
268 }
269
270 /// Begin a generation and return a stream of `TokenEvent` values.
271 ///
272 /// Errors returned here surface as `Response::Error` *before* any tokens
273 /// reach the client. Errors that occur after the first token has streamed
274 /// terminate the stream without a `Done`.
275 async fn generate(&self, req: Resolved) -> Result<TokenStream, GenerateError>;
276
277 /// Begin a v2 generation and return a stream of `TokenEventV2`
278 /// values. Default impl returns `GenerateError::Internal("v2 not
279 /// supported by this backend")` — adapters opt in by overriding.
280 /// The daemon checks `capabilities().v2` before calling this on
281 /// the v2 path; the default `false` capability prevents dispatch
282 /// from reaching here for non-v2 backends.
283 async fn generate_v2(&self, _req: ResolvedV2) -> Result<TokenStreamV2, GenerateError> {
284 Err(GenerateError::Internal(
285 "v2 not supported by this backend".into(),
286 ))
287 }
288
289 /// Compute embeddings for the request's input strings (per
290 /// ADR 0017). Default impl returns `EmbedError::Unsupported` —
291 /// adapters opt in by overriding and setting
292 /// `capabilities().embed = true`. The daemon binds the embed
293 /// socket only when the active backend's capability is `true`,
294 /// so reaching this default impl in production is a fail-safe
295 /// for misconfiguration.
296 async fn embed(&self, _req: EmbedResolved) -> Result<EmbedResult, EmbedError> {
297 Err(EmbedError::Unsupported)
298 }
299
300 /// Best-effort graceful shutdown. The daemon calls this on stop; the
301 /// adapter should release model memory, terminate worker threads, and
302 /// any other long-lived resources within the deadline.
303 async fn stop(&self, _timeout: std::time::Duration) -> Result<(), GenerateError> {
304 Ok(())
305 }
306}