Skip to main content

inferd_engine/
backend.rs

1//! `Backend` trait and shared types.
2
3use async_trait::async_trait;
4use inferd_proto::embed::{EmbedResolved, EmbedUsage};
5use inferd_proto::v2::{ResolvedV2, StopReasonV2, ToolCallId, ToolUseInput, UsageV2};
6use inferd_proto::{Resolved, StopReason, Usage};
7use std::pin::Pin;
8use tokio_stream::Stream;
9
10/// One event in a generation stream.
11///
12/// A successful generation produces zero or more `Token` events terminated by
13/// exactly one `Done`. A failed generation produces zero or more `Token`
14/// events followed by no further events; the adapter returns the failure as
15/// a `GenerateError` from `generate()` (pre-stream) or terminates the stream
16/// without a `Done` (mid-stream) — see ADR 0007 for the failure-semantics
17/// contract.
18#[derive(Debug, Clone, PartialEq, Eq)]
19pub enum TokenEvent {
20    /// One incremental generated token.
21    Token(String),
22    /// Final event for a successful generation.
23    Done {
24        /// Reason generation stopped.
25        stop_reason: StopReason,
26        /// Token-count usage.
27        usage: Usage,
28    },
29}
30
31/// Stream of `TokenEvent` values produced by a backend during generation.
32///
33/// Dropping the stream cancels the in-flight generation. Adapters must wire
34/// drop to their underlying cancellation primitive (e.g. a `CancellationToken`
35/// or by aborting the spawned task).
36pub type TokenStream = Pin<Box<dyn Stream<Item = TokenEvent> + Send>>;
37
38/// One event in a v2 generation stream — typed-content-block surface
39/// per ADR 0015.
40///
41/// v2 separates user-visible text (`Text`) from reasoning trace
42/// (`Thinking`) and emits complete tool-call requests (`ToolUse`) as
43/// their own variant rather than raw tokens. Backends that don't
44/// distinguish thinking content (any non-Gemma-4 backend) emit only
45/// `Text` events.
46#[derive(Debug, Clone, PartialEq)]
47pub enum TokenEventV2 {
48    /// Incremental user-visible text.
49    Text(String),
50    /// Incremental reasoning trace (Gemma 4 `<|think|>` content).
51    Thinking(String),
52    /// Complete tool-call request emitted by the model.
53    ToolUse {
54        /// Identifier paired with the consumer's eventual ToolResult.
55        tool_call_id: ToolCallId,
56        /// Tool name from the request's `tools[]` table.
57        name: String,
58        /// JSON arguments emitted by the model.
59        input: ToolUseInput,
60    },
61    /// Final event for a successful generation.
62    Done {
63        /// Reason generation stopped.
64        stop_reason: StopReasonV2,
65        /// Token-count usage.
66        usage: UsageV2,
67    },
68}
69
70/// Stream of `TokenEventV2` values produced by a backend during a v2
71/// generation. Dropping the stream cancels the in-flight generation.
72pub type TokenStreamV2 = Pin<Box<dyn Stream<Item = TokenEventV2> + Send>>;
73
/// The hardware-acceleration backend the engine adapter was built with and
/// is running on.
///
/// The value mirrors the compile-time GGML feature flags. A pure CPU build
/// (none of the `cuda` / `metal` / `vulkan` / `rocm` features) reports
/// `Cpu`. Note that a build *with* accelerator support still effectively
/// runs on the CPU when `n_gpu_layers == 0` — see
/// [`AcceleratorInfo::gpu_layers`].
///
/// Future patch releases may add variants (NPU, etc.); older subscribers
/// should treat an unknown variant as `Cpu`, the fallback that is always
/// safe.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum AcceleratorKind {
    /// Built without any GPU/accelerator support.
    #[default]
    Cpu,
    /// Built with CUDA — NVIDIA GPU offload available.
    Cuda,
    /// Built with Metal — Apple GPU offload available.
    Metal,
    /// Built with Vulkan — cross-vendor GPU offload available.
    Vulkan,
    /// Built with HIP/ROCm — AMD GPU offload available.
    Rocm,
}

impl AcceleratorKind {
    /// Returns the stable wire-form name of this accelerator kind: `"cpu"`,
    /// `"cuda"`, `"metal"`, `"vulkan"` or `"rocm"`.
    ///
    /// The name is used in admin status frames and `inferdctl doctor`.
    ///
    /// This is a `const fn`, so the name can also be evaluated in constant
    /// contexts (e.g. to initialise a `const` or `static`). The match is
    /// deliberately exhaustive with no catch-all arm: adding a variant
    /// without giving it a wire name is a compile error.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Cpu => "cpu",
            Self::Cuda => "cuda",
            Self::Metal => "metal",
            Self::Vulkan => "vulkan",
            Self::Rocm => "rocm",
        }
    }
}
111
112/// Snapshot of the active hardware-acceleration configuration.
113///
114/// `kind` is the compile-time GGML backend this engine was built with;
115/// `gpu_layers` is the runtime configuration the adapter was
116/// constructed with. A backend that compiled with `cuda` but was
117/// configured with `n_gpu_layers = 0` reports `kind = Cuda,
118/// gpu_layers = 0` — i.e. CUDA-capable but currently CPU-bound. The
119/// distinction is useful: it tells consumers the daemon *could*
120/// accelerate if reconfigured, vs. it can never accelerate without a
121/// rebuild.
122#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
123pub struct AcceleratorInfo {
124    /// Compile-time GGML backend.
125    pub kind: AcceleratorKind,
126    /// Layers offloaded to the accelerator at construction time. 0
127    /// means CPU-only at runtime regardless of `kind`.
128    pub gpu_layers: u32,
129}
130
131/// Per-backend capability advertisement. The daemon consults this on
132/// boot to decide whether v2 multimodal / tool-use requests can be
133/// dispatched, and reports the advertised set on the admin status
134/// surface so middleware authors can introspect what the running
135/// daemon can do without trial-and-error.
136///
137/// Per the v0.2 plan: until cloud adapters land, the only adapters
138/// shipped are `mock` and `llamacpp`. Both opt-in selectively —
139/// `mock` for tests, `llamacpp` once Phase 3+ wires mtmd / tool
140/// parsing.
141#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
142pub struct BackendCapabilities {
143    /// `true` if the backend implements `generate_v2` (typed
144    /// content blocks, tool definitions). When `false` the daemon's
145    /// v2 dispatch falls back to `Error{Internal,
146    /// "v2 not supported by this backend"}`.
147    pub v2: bool,
148    /// `true` if the backend can ingest image attachments. Reported
149    /// to consumers; requests with image content blocks against a
150    /// non-image backend get `Error{AttachmentUnsupported,...}`.
151    pub vision: bool,
152    /// `true` if the backend can ingest audio attachments.
153    pub audio: bool,
154    /// `true` if the backend can ingest video attachments. (Reserved.)
155    pub video: bool,
156    /// `true` if the backend natively supports tool-use round-tripping
157    /// (parses `<|tool_call>` from token stream, accepts `tool_result`
158    /// blocks in the next request, etc.).
159    pub tools: bool,
160    /// `true` if the backend separates `<|think|>` reasoning trace
161    /// from user-visible output.
162    pub thinking: bool,
163    /// `true` if the backend implements `embed` (per ADR 0017). When
164    /// `false` the daemon does not bind the embed socket; if it
165    /// somehow gets bound and a request arrives, dispatch returns
166    /// `Error{EmbedUnsupported}`.
167    pub embed: bool,
168    /// Hardware-acceleration snapshot. `Cpu / 0` for the default
169    /// trait impl; `mock` keeps the default; `llamacpp` reports the
170    /// compile-time GGML backend + the configured `n_gpu_layers`.
171    /// Reported on admin `status: capabilities` frames and in
172    /// `inferdctl doctor` (#77).
173    pub accelerator: AcceleratorInfo,
174}
175
176/// Result of a successful `Backend::embed()` call.
177///
178/// Embedding requests produce a single complete result, not a stream:
179/// one vector per input string in the same order as the request's
180/// `input`. `dimensions` is the actual length of each inner vector
181/// after any MRL truncation.
182#[derive(Debug, Clone, PartialEq)]
183pub struct EmbedResult {
184    /// One vector per input string, in input order. All inner vectors
185    /// share the same length (`dimensions`).
186    pub embeddings: Vec<Vec<f32>>,
187    /// Actual length of each inner vector after any MRL truncation.
188    pub dimensions: u32,
189    /// Backend-reported model name (e.g. `"embeddinggemma-300m"`).
190    pub model: String,
191    /// Token-count usage.
192    pub usage: EmbedUsage,
193}
194
195/// Errors returned by `Backend::embed()`.
196///
197/// Distinct from `GenerateError` because the embed surface has a
198/// different error taxonomy (no streaming → no mid-stream concept;
199/// adds `Unsupported` for the not-an-embed-backend case).
200#[derive(Debug, thiserror::Error)]
201pub enum EmbedError {
202    /// Backend was not ready when `embed()` was called.
203    #[error("backend not ready")]
204    NotReady,
205    /// Backend doesn't expose embedding capability.
206    #[error("embed not supported by this backend")]
207    Unsupported,
208    /// Backend rejected the request as malformed (dimensions out of
209    /// range for the model, input too long for the context, etc.).
210    #[error("invalid request: {0}")]
211    InvalidRequest(String),
212    /// Backend tried to embed and failed (model not loaded, remote
213    /// API errored, etc.).
214    #[error("backend unavailable: {0}")]
215    Unavailable(String),
216    /// Anything else.
217    #[error("internal: {0}")]
218    Internal(String),
219}
220
221/// Errors returned by `Backend::generate()` *before* any tokens have streamed.
222///
223/// Mid-stream failures terminate the stream silently (no `Done` event); the
224/// caller observes the absence of a terminal event and translates that to
225/// `Response::Error` with `code: backend_unavailable` per `docs/protocol-v1.md`.
226#[derive(Debug, thiserror::Error)]
227pub enum GenerateError {
228    /// Backend was not ready when `generate()` was called.
229    #[error("backend not ready")]
230    NotReady,
231    /// Backend rejected the request as malformed (sampling out of range, etc.).
232    #[error("invalid request: {0}")]
233    InvalidRequest(String),
234    /// Backend tried to start generation and failed (model not loaded,
235    /// remote API errored, etc.).
236    #[error("backend unavailable: {0}")]
237    Unavailable(String),
238    /// Anything else.
239    #[error("internal: {0}")]
240    Internal(String),
241}
242
243/// An inference backend.
244///
245/// Implementations are owned by the daemon and shared across requests through
246/// `Arc<dyn Backend>`. Methods take `&self`; concurrent invocations of
247/// `generate()` are serialised by the daemon's admission queue, not by the
248/// trait.
249#[async_trait]
250pub trait Backend: Send + Sync {
251    /// Stable identifier for the backend, e.g. `"mock"`, `"llamacpp"`,
252    /// `"anthropic"`. Echoed in `Response::Done::backend` for diagnostic
253    /// purposes (ADR 0007).
254    fn name(&self) -> &str;
255
256    /// Whether the backend has finished its boot sequence and can serve
257    /// requests. The daemon does not create its inference listener until
258    /// every registered backend reports `true` (see `THREAT_MODEL.md` F-13).
259    fn ready(&self) -> bool;
260
261    /// Capabilities the backend advertises to the daemon and (via
262    /// the admin status surface) to consumers. Default: text-only v1
263    /// backend, no v2, no multimodal, no tools — matches the v0.1
264    /// `mock` and `llamacpp` shape so existing implementors compile
265    /// unchanged.
266    fn capabilities(&self) -> BackendCapabilities {
267        BackendCapabilities::default()
268    }
269
270    /// Begin a generation and return a stream of `TokenEvent` values.
271    ///
272    /// Errors returned here surface as `Response::Error` *before* any tokens
273    /// reach the client. Errors that occur after the first token has streamed
274    /// terminate the stream without a `Done`.
275    async fn generate(&self, req: Resolved) -> Result<TokenStream, GenerateError>;
276
277    /// Begin a v2 generation and return a stream of `TokenEventV2`
278    /// values. Default impl returns `GenerateError::Internal("v2 not
279    /// supported by this backend")` — adapters opt in by overriding.
280    /// The daemon checks `capabilities().v2` before calling this on
281    /// the v2 path; the default `false` capability prevents dispatch
282    /// from reaching here for non-v2 backends.
283    async fn generate_v2(&self, _req: ResolvedV2) -> Result<TokenStreamV2, GenerateError> {
284        Err(GenerateError::Internal(
285            "v2 not supported by this backend".into(),
286        ))
287    }
288
289    /// Compute embeddings for the request's input strings (per
290    /// ADR 0017). Default impl returns `EmbedError::Unsupported` —
291    /// adapters opt in by overriding and setting
292    /// `capabilities().embed = true`. The daemon binds the embed
293    /// socket only when the active backend's capability is `true`,
294    /// so reaching this default impl in production is a fail-safe
295    /// for misconfiguration.
296    async fn embed(&self, _req: EmbedResolved) -> Result<EmbedResult, EmbedError> {
297        Err(EmbedError::Unsupported)
298    }
299
300    /// Best-effort graceful shutdown. The daemon calls this on stop; the
301    /// adapter should release model memory, terminate worker threads, and
302    /// any other long-lived resources within the deadline.
303    async fn stop(&self, _timeout: std::time::Duration) -> Result<(), GenerateError> {
304        Ok(())
305    }
306}