inferd_proto/v2/attachment.rs
1//! v2 attachment table — binary payloads referenced by content blocks.
2//!
3//! Per ADR 0015 §"v2 Attachment", as amended by ADR 0016 (consumer
4//! decodes media before sending). Attachments are sent once at the
5//! request envelope's top level and referenced by `id` from any
6//! number of `image` / `audio` / `video` content blocks across the
7//! request's `messages[]`. This indirection matches the Anthropic
8//! shape and lets a multi-image conversation avoid duplicating bytes.
9//!
10//! ## Decode posture (ADR 0013 + ADR 0016)
11//!
12//! The wire carries **already-decoded** binary payloads — raw RGB
13//! interleaved bytes for images, float32 PCM samples for audio.
14//! The daemon does *not* link image/audio codec libraries; consumer
15//! middleware decodes before sending. This matches ADR 0013's
16//! gateway framing ("middleware owns the bytes") and matches what
17//! libmtmd's C API expects (`mtmd_bitmap_init` takes `nx * ny * 3`
18//! interleaved RGB; `mtmd_bitmap_init_from_audio` takes a float32
19//! PCM slice).
20//!
21//! Each attachment kind carries the metadata it needs:
22//! - `Image`: `width`, `height` (the daemon recomputes nothing).
23//! - `Audio`: `sample_rate` (Hz; the daemon doesn't resample).
24//! - `Video`: reserved; the actual shape is TBD when a video-
25//! capable adapter lands.
26
27use serde::{Deserialize, Serialize};
28
29/// One binary attachment in the request's top-level `attachments[]` table.
30///
31/// Tagged-enum shape: each variant carries exactly the metadata libmtmd
32/// (and other engines' multimodal interfaces) need for that modality.
33/// Unknown variants deserialise as [`Attachment::Unknown`] so v2.0
34/// clients don't reject newer payloads at parse time; resolve()
35/// rejects them only when they reach validation.
36///
37/// `id` must be unique within a single request; content blocks
38/// reference attachments by exactly this string.
39///
40/// `bytes` is standard-base64-encoded (RFC 4648, with `+/` and `=`
41/// padding). After ~1.33× inflation the raw payload must still leave
42/// room within the 64 MiB per-frame cap.
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44#[serde(tag = "kind", rename_all = "lowercase")]
45pub enum Attachment {
46 /// Decoded RGB image. `bytes` is `width * height * 3` interleaved
47 /// RGB octets (no alpha channel; consumer drops alpha or
48 /// composites against a known background before sending).
49 Image {
50 /// Caller-chosen identifier; unique within the request.
51 id: String,
52 /// Image width in pixels.
53 width: u32,
54 /// Image height in pixels.
55 height: u32,
56 /// Base64 of `width * height * 3` interleaved RGB bytes.
57 bytes: String,
58 },
59 /// Decoded audio PCM. `bytes` is base64 of `n_samples *
60 /// sizeof(f32)` little-endian float32 samples at the named
61 /// sample rate.
62 Audio {
63 /// Caller-chosen identifier; unique within the request.
64 id: String,
65 /// Sample rate in Hz (e.g. 16000 for Whisper-class encoders;
66 /// Gemma 4 audio uses its own rate which the daemon learns at
67 /// adapter init time and reports via
68 /// `BackendCapabilities`).
69 sample_rate: u32,
70 /// Base64 of float32 PCM samples (little-endian).
71 bytes: String,
72 },
73 /// Reserved. Engine support is a separate concern; v2.0 daemons
74 /// reject video attachments with `attachment_unsupported` until
75 /// a video-capable adapter ships. Wire shape is intentionally
76 /// kept stub-thin; future revisions add fields without breaking
77 /// v2.0 clients (forward-compat: serde will accept extra fields
78 /// silently).
79 Video {
80 /// Caller-chosen identifier; unique within the request.
81 id: String,
82 /// Base64 of decoded video frames; precise format TBD.
83 bytes: String,
84 },
85 /// Forward-compat escape hatch — any `kind` value the local build
86 /// doesn't recognise lands here so older clients/daemons don't
87 /// reject newer payloads at parse time. `resolve()` rejects them
88 /// only when they reach validation.
89 #[serde(other)]
90 Unknown,
91}
92
93impl Attachment {
94 /// The attachment's id (independent of variant).
95 ///
96 /// Returns an empty string for `Unknown` since unknown variants
97 /// don't carry an id field reliably.
98 pub fn id(&self) -> &str {
99 match self {
100 Attachment::Image { id, .. }
101 | Attachment::Audio { id, .. }
102 | Attachment::Video { id, .. } => id,
103 Attachment::Unknown => "",
104 }
105 }
106
107 /// `true` if this attachment is an image.
108 pub fn is_image(&self) -> bool {
109 matches!(self, Attachment::Image { .. })
110 }
111
112 /// `true` if this attachment is audio.
113 pub fn is_audio(&self) -> bool {
114 matches!(self, Attachment::Audio { .. })
115 }
116
117 /// `true` if this attachment is video.
118 pub fn is_video(&self) -> bool {
119 matches!(self, Attachment::Video { .. })
120 }
121}