Skip to main content

gemini_live/types/
client_message.rs

//! Client → Server message types.
//!
//! The Gemini Live protocol defines **4 client message kinds**, each carrying
//! exactly one top-level field:
//!
//! | Variant          | Wire field        | When to send                     |
//! |------------------|-------------------|----------------------------------|
//! | `Setup`          | `setup`           | First message only               |
//! | `ClientContent`  | `clientContent`   | Conversation history / turns     |
//! | `RealtimeInput`  | `realtimeInput`   | Streaming audio / video / text   |
//! | `ToolResponse`   | `toolResponse`    | Replies to server `toolCall`     |
//!
//! [`ClientMessage`] is serialised as a serde externally-tagged enum, which
//! naturally produces `{"setup": {...}}` etc.
15
16use serde::Serialize;
17
18use super::common::{Blob, Content, EmptyObject};
19use super::config::*;
20
21/// A message sent from client to server.
22///
23/// The protocol requires each message to carry **exactly one** top-level field.
24/// Serde's externally-tagged enum representation satisfies this constraint
25/// directly — `ClientMessage::Setup(cfg)` serialises to `{"setup": { ... }}`.
26#[derive(Debug, Clone, Serialize)]
27pub enum ClientMessage {
28    #[serde(rename = "setup")]
29    Setup(SetupConfig),
30    #[serde(rename = "clientContent")]
31    ClientContent(ClientContent),
32    #[serde(rename = "realtimeInput")]
33    RealtimeInput(RealtimeInput),
34    #[serde(rename = "toolResponse")]
35    ToolResponse(ToolResponseMessage),
36}
37
38// ── Setup ────────────────────────────────────────────────────────────────────
39
40/// The first (and only) `setup` message, configuring the session.
41///
42/// `model` is the only required field. All others have sensible server
43/// defaults when omitted.
44///
45/// This is the canonical home for setup-field semantics in the crate. Keep
46/// model-family caveats and wire-format notes on these fields rather than
47/// restating them in standalone docs.
48#[derive(Debug, Clone, Default, Serialize)]
49#[serde(rename_all = "camelCase")]
50pub struct SetupConfig {
51    /// Model resource name, e.g. `"models/gemini-3.1-flash-live-preview"`.
52    pub model: String,
53    /// Generation-time settings such as response modalities, voice, thinking,
54    /// and sampling controls.
55    #[serde(skip_serializing_if = "Option::is_none")]
56    pub generation_config: Option<GenerationConfig>,
57    /// System prompt or instruction content applied at session setup time.
58    #[serde(skip_serializing_if = "Option::is_none")]
59    pub system_instruction: Option<Content>,
60    /// Tool definitions available to the model for this session.
61    #[serde(skip_serializing_if = "Option::is_none")]
62    pub tools: Option<Vec<Tool>>,
63    /// Real-time audio/video interpretation settings including VAD behaviour.
64    #[serde(skip_serializing_if = "Option::is_none")]
65    pub realtime_input_config: Option<RealtimeInputConfig>,
66    /// Opts the session into server-issued resume handles.
67    ///
68    /// The session layer patches `handle` automatically during reconnects.
69    #[serde(skip_serializing_if = "Option::is_none")]
70    pub session_resumption: Option<SessionResumptionConfig>,
71    /// Server-managed context compression settings.
72    #[serde(skip_serializing_if = "Option::is_none")]
73    pub context_window_compression: Option<ContextWindowCompressionConfig>,
74    /// Presence-activated input speech transcription (`{}` to enable).
75    #[serde(skip_serializing_if = "Option::is_none")]
76    pub input_audio_transcription: Option<AudioTranscriptionConfig>,
77    /// Presence-activated output speech transcription (`{}` to enable).
78    #[serde(skip_serializing_if = "Option::is_none")]
79    pub output_audio_transcription: Option<AudioTranscriptionConfig>,
80    /// Proactive audio (v1alpha, Gemini 2.5 only).
81    #[serde(skip_serializing_if = "Option::is_none")]
82    pub proactivity: Option<ProactivityConfig>,
83    /// History bootstrapping (Gemini 3.1). This only affects how initial
84    /// `clientContent` may be sent before the first `realtimeInput`.
85    #[serde(skip_serializing_if = "Option::is_none")]
86    pub history_config: Option<HistoryConfig>,
87}
88
89// ── ClientContent ────────────────────────────────────────────────────────────
90
91/// Conversation history or incremental content.
92///
93/// On Gemini 2.5 this can be sent at any time during the session.
94/// On Gemini 3.1 it can only be sent as initial history (before the first
95/// `realtimeInput`), and requires `historyConfig.initialHistoryInClientContent = true`.
96#[derive(Debug, Clone, Serialize)]
97#[serde(rename_all = "camelCase")]
98pub struct ClientContent {
99    #[serde(skip_serializing_if = "Option::is_none")]
100    pub turns: Option<Vec<Content>>,
101    #[serde(skip_serializing_if = "Option::is_none")]
102    pub turn_complete: Option<bool>,
103}
104
105// ── RealtimeInput ────────────────────────────────────────────────────────────
106
107/// Streaming real-time input — audio, video, text, or VAD control signals.
108///
109/// Each message should carry only **one** of these fields.
110///
111/// # Audio format
112/// 16-bit signed little-endian PCM, recommended 16 kHz sample rate.
113/// Chunk size: 100–250 ms (3,200–8,000 bytes raw).
114///
115/// # Video format
116/// JPEG or PNG, max 1 fps, recommended < 200 KB per frame.
117#[derive(Debug, Clone, Default, Serialize)]
118#[serde(rename_all = "camelCase")]
119pub struct RealtimeInput {
120    #[serde(skip_serializing_if = "Option::is_none")]
121    pub audio: Option<Blob>,
122    #[serde(skip_serializing_if = "Option::is_none")]
123    pub video: Option<Blob>,
124    #[serde(skip_serializing_if = "Option::is_none")]
125    pub text: Option<String>,
126    /// Manual VAD: signal that user activity has started.
127    /// Requires `automaticActivityDetection.disabled = true`.
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub activity_start: Option<EmptyObject>,
130    /// Manual VAD: signal that user activity has ended.
131    #[serde(skip_serializing_if = "Option::is_none")]
132    pub activity_end: Option<EmptyObject>,
133    /// Auto VAD: notify server that the mic has been muted / stream ended.
134    #[serde(skip_serializing_if = "Option::is_none")]
135    pub audio_stream_end: Option<bool>,
136}
137
138// ── ToolResponse ─────────────────────────────────────────────────────────────
139
140/// Response to one or more server-initiated function calls.
141#[derive(Debug, Clone, Serialize)]
142#[serde(rename_all = "camelCase")]
143pub struct ToolResponseMessage {
144    pub function_responses: Vec<FunctionResponse>,
145}
146
147/// A single function call result, keyed by the server-assigned `id`.
148#[derive(Debug, Clone, Serialize)]
149#[serde(rename_all = "camelCase")]
150pub struct FunctionResponse {
151    /// Must match the `id` from the corresponding [`FunctionCallRequest`](super::server_message::FunctionCallRequest).
152    pub id: String,
153    pub name: String,
154    /// Arbitrary JSON result returned to the model.
155    ///
156    /// Keep the payload flexible: current Live API docs place some Gemini 2.5
157    /// tool-response scheduling knobs inside this JSON object rather than as a
158    /// top-level Rust field.
159    pub response: serde_json::Value,
160}