inferd_proto/v2/response.rs
1//! v2 response frame schema.
2//!
3//! Per ADR 0015 §"v2 Response frames". Three frame variants:
4//!
5//! - `frame` — streaming partial output. Carries one `ResponseBlock`,
6//! which is either a `text` delta (incremental token text), a
7//! `thinking` delta (reasoning trace, separated from user-visible
8//! output per `docs/thinking.mode.in.gemma.md`), or a complete
9//! `tool_use` block (the daemon parses Gemma 4's
10//! `<|tool_call>...<tool_call|>` sequence in full before emitting).
11//!
//! - `done` — terminal success frame. Carries usage and the v2
//!   stop-reason taxonomy (`end_turn`, `max_tokens`, `tool_use`,
//!   `stop_sequence`, `cancelled`, `error`).
15//!
16//! - `error` — terminal failure frame. Carries the v2 error-code
17//! taxonomy (v1 codes plus `attachment_unsupported` and
18//! `tool_call_malformed`).
19
20use crate::v2::tool::{ToolCallId, ToolUseInput};
21use serde::{Deserialize, Serialize};
22
23/// Why a v2 generation ended. Carried on `done` frames.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum StopReasonV2 {
27 /// Model emitted the end-of-turn token cleanly. Equivalent to v1's `end`.
28 EndTurn,
29 /// `max_tokens` reached.
30 MaxTokens,
31 /// Model emitted a complete `tool_use` block; consumer must
32 /// execute the tool and send a follow-up request with a
33 /// `tool_result` content block.
34 ToolUse,
35 /// Generation hit a configured stop sequence (reserved for v2.x;
36 /// v2.0 daemons may not emit this).
37 StopSequence,
38 /// Caller disconnected or otherwise cancelled.
39 Cancelled,
40 /// Generation aborted; partial output was already emitted.
41 Error,
42}
43
44/// Token-count usage report carried on v2 `done` frames.
45///
46/// Field names (`input_tokens` / `output_tokens`) match Anthropic's
47/// shape; v1's `prompt_tokens` / `completion_tokens` matched
48/// llama.cpp's terminology. Both refer to the same underlying counts.
49#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
50pub struct UsageV2 {
51 /// Tokens consumed by the prompt (including templated chat structure).
52 pub input_tokens: u32,
53 /// Tokens generated in the response.
54 pub output_tokens: u32,
55}
56
57/// One streaming-output payload carried inside a `frame` response.
58///
59/// Variants:
60/// - `Text { delta }` — incremental token text. Multiple `Text` frames
61/// per request are normal; concatenating their `delta`s yields the
62/// full response text.
63/// - `Thinking { delta }` — incremental reasoning-trace text. The
64/// daemon separates content emitted between Gemma 4's `<|think|>`
65/// and `<|/think|>` tokens into this variant so middleware can
66/// choose to display, hide, or log it independently.
67/// - `ToolUse { ... }` — a complete tool-call request from the model.
68/// Arrives whole, not streamed. Following a `ToolUse` block, the
69/// stream typically terminates with `Done { stop_reason: ToolUse }`.
70#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
71#[serde(tag = "type", rename_all = "snake_case")]
72pub enum ResponseBlock {
73 /// Incremental user-visible text.
74 Text {
75 /// Newly-generated text since the last `Text` frame.
76 delta: String,
77 },
78 /// Incremental reasoning-trace text.
79 Thinking {
80 /// Newly-generated reasoning trace since the last `Thinking` frame.
81 delta: String,
82 },
83 /// Complete model-emitted tool-call request.
84 ToolUse {
85 /// Pairs this invocation with the consumer's eventual
86 /// `ToolResult` block in the next request.
87 tool_call_id: ToolCallId,
88 /// Tool name; matches a `Tool::name` from the request's
89 /// `tools[]` table.
90 name: String,
91 /// JSON arguments emitted by the model.
92 input: ToolUseInput,
93 },
94}
95
96/// v2 error-code taxonomy. Superset of v1's `ErrorCode` (kept
97/// independent so the v1 enum stays frozen per ADR 0008).
98#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
99#[serde(rename_all = "snake_case")]
100pub enum ErrorCodeV2 {
101 /// Admission queue full at submit time.
102 QueueFull,
103 /// Selected backend errored before or during generation.
104 BackendUnavailable,
105 /// Request failed validation (bad shape, dangling attachment id, etc.).
106 InvalidRequest,
107 /// Frame exceeded the 64 MiB cap.
108 FrameTooLarge,
109 /// Daemon-side bug or unexpected condition.
110 Internal,
111 /// Backend cannot handle the requested attachment kind / MIME.
112 AttachmentUnsupported,
113 /// Model emitted a tool-call sequence the daemon couldn't parse.
114 ToolCallMalformed,
115}
116
117/// One frame on the v2 response NDJSON stream.
118#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
119#[serde(tag = "type", rename_all = "snake_case")]
120pub enum ResponseV2 {
121 /// One streaming-output payload.
122 Frame {
123 /// Request id.
124 id: String,
125 /// The payload — text delta, thinking delta, or complete tool_use.
126 block: ResponseBlock,
127 },
128 /// Terminal frame for a successful generation.
129 Done {
130 /// Request id.
131 id: String,
132 /// Token-count usage.
133 usage: UsageV2,
134 /// Why generation stopped.
135 stop_reason: StopReasonV2,
136 /// `Backend::name()` of the adapter that served this request.
137 ///
138 /// Diagnostic only — apps must not branch on this (ADR 0007).
139 backend: String,
140 },
141 /// Terminal frame for a failed generation.
142 Error {
143 /// Request id.
144 id: String,
145 /// Machine-readable classification.
146 code: ErrorCodeV2,
147 /// Human-readable description.
148 message: String,
149 },
150}
151
152impl ResponseV2 {
153 /// Correlation id of the frame regardless of variant.
154 pub fn id(&self) -> &str {
155 match self {
156 ResponseV2::Frame { id, .. }
157 | ResponseV2::Done { id, .. }
158 | ResponseV2::Error { id, .. } => id,
159 }
160 }
161
162 /// `true` if this frame ends a request stream.
163 pub fn is_terminal(&self) -> bool {
164 matches!(self, ResponseV2::Done { .. } | ResponseV2::Error { .. })
165 }
166}