Skip to main content

inferd_proto/v2/
attachment.rs

//! v2 attachment table — binary payloads referenced by content blocks.
//!
//! Per ADR 0015 §"v2 Attachment", as amended by ADR 0016 (consumer
//! decodes media before sending). Attachments are sent once at the
//! request envelope's top level and referenced by `id` from any
//! number of `image` / `audio` / `video` content blocks across the
//! request's `messages[]`. This indirection matches the Anthropic
//! shape and lets a multi-image conversation avoid duplicating bytes.
//!
//! ## Decode posture (ADR 0013 + ADR 0016)
//!
//! The wire carries **already-decoded** binary payloads — raw RGB
//! interleaved bytes for images, float32 PCM samples for audio.
//! The daemon does *not* link image/audio codec libraries; consumer
//! middleware decodes before sending. This matches ADR 0013's
//! gateway framing ("middleware owns the bytes") and matches what
//! libmtmd's C API expects (`mtmd_bitmap_init` takes `nx * ny * 3`
//! interleaved RGB; `mtmd_bitmap_init_from_audio` takes a float32
//! PCM slice).
//!
//! Each attachment kind carries the metadata it needs:
//!   - `Image`: `width`, `height` (the daemon recomputes nothing).
//!   - `Audio`: `sample_rate` (Hz; the daemon doesn't resample).
//!   - `Video`: reserved; the actual shape is TBD when a video-
//!     capable adapter lands.
26
27use serde::{Deserialize, Serialize};
28
29/// One binary attachment in the request's top-level `attachments[]` table.
30///
31/// Tagged-enum shape: each variant carries exactly the metadata libmtmd
32/// (and other engines' multimodal interfaces) need for that modality.
33/// Unknown variants deserialise as [`Attachment::Unknown`] so v2.0
34/// clients don't reject newer payloads at parse time; resolve()
35/// rejects them only when they reach validation.
36///
37/// `id` must be unique within a single request; content blocks
38/// reference attachments by exactly this string.
39///
40/// `bytes` is standard-base64-encoded (RFC 4648, with `+/` and `=`
41/// padding). After ~1.33× inflation the raw payload must still leave
42/// room within the 64 MiB per-frame cap.
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44#[serde(tag = "kind", rename_all = "lowercase")]
45pub enum Attachment {
46    /// Decoded RGB image. `bytes` is `width * height * 3` interleaved
47    /// RGB octets (no alpha channel; consumer drops alpha or
48    /// composites against a known background before sending).
49    Image {
50        /// Caller-chosen identifier; unique within the request.
51        id: String,
52        /// Image width in pixels.
53        width: u32,
54        /// Image height in pixels.
55        height: u32,
56        /// Base64 of `width * height * 3` interleaved RGB bytes.
57        bytes: String,
58    },
59    /// Decoded audio PCM. `bytes` is base64 of `n_samples *
60    /// sizeof(f32)` little-endian float32 samples at the named
61    /// sample rate.
62    Audio {
63        /// Caller-chosen identifier; unique within the request.
64        id: String,
65        /// Sample rate in Hz (e.g. 16000 for Whisper-class encoders;
66        /// Gemma 4 audio uses its own rate which the daemon learns at
67        /// adapter init time and reports via
68        /// `BackendCapabilities`).
69        sample_rate: u32,
70        /// Base64 of float32 PCM samples (little-endian).
71        bytes: String,
72    },
73    /// Reserved. Engine support is a separate concern; v2.0 daemons
74    /// reject video attachments with `attachment_unsupported` until
75    /// a video-capable adapter ships. Wire shape is intentionally
76    /// kept stub-thin; future revisions add fields without breaking
77    /// v2.0 clients (forward-compat: serde will accept extra fields
78    /// silently).
79    Video {
80        /// Caller-chosen identifier; unique within the request.
81        id: String,
82        /// Base64 of decoded video frames; precise format TBD.
83        bytes: String,
84    },
85    /// Forward-compat escape hatch — any `kind` value the local build
86    /// doesn't recognise lands here so older clients/daemons don't
87    /// reject newer payloads at parse time. `resolve()` rejects them
88    /// only when they reach validation.
89    #[serde(other)]
90    Unknown,
91}
92
93impl Attachment {
94    /// The attachment's id (independent of variant).
95    ///
96    /// Returns an empty string for `Unknown` since unknown variants
97    /// don't carry an id field reliably.
98    pub fn id(&self) -> &str {
99        match self {
100            Attachment::Image { id, .. }
101            | Attachment::Audio { id, .. }
102            | Attachment::Video { id, .. } => id,
103            Attachment::Unknown => "",
104        }
105    }
106
107    /// `true` if this attachment is an image.
108    pub fn is_image(&self) -> bool {
109        matches!(self, Attachment::Image { .. })
110    }
111
112    /// `true` if this attachment is audio.
113    pub fn is_audio(&self) -> bool {
114        matches!(self, Attachment::Audio { .. })
115    }
116
117    /// `true` if this attachment is video.
118    pub fn is_video(&self) -> bool {
119        matches!(self, Attachment::Video { .. })
120    }
121}