gemini_live/types/client_message.rs
1//! Client → Server message types.
2//!
3//! The Gemini Live protocol defines **4 client message kinds**, each carrying
4//! exactly one top-level field:
5//!
6//! | Variant | Wire field | When to send |
7//! |------------------|-------------------|----------------------------------|
8//! | `Setup` | `setup` | First message only |
9//! | `ClientContent` | `clientContent` | Conversation history / turns |
10//! | `RealtimeInput` | `realtimeInput` | Streaming audio / video / text |
11//! | `ToolResponse` | `toolResponse` | Replies to server `toolCall` |
12//!
13//! [`ClientMessage`] is serialised as a serde externally-tagged enum, which
14//! naturally produces `{"setup": {...}}` etc.
15
16use serde::Serialize;
17
18use super::common::{Blob, Content, EmptyObject};
19use super::config::*;
20
21/// A message sent from client to server.
22///
23/// The protocol requires each message to carry **exactly one** top-level field.
24/// Serde's externally-tagged enum representation satisfies this constraint
25/// directly — `ClientMessage::Setup(cfg)` serialises to `{"setup": { ... }}`.
26#[derive(Debug, Clone, Serialize)]
27pub enum ClientMessage {
28 #[serde(rename = "setup")]
29 Setup(SetupConfig),
30 #[serde(rename = "clientContent")]
31 ClientContent(ClientContent),
32 #[serde(rename = "realtimeInput")]
33 RealtimeInput(RealtimeInput),
34 #[serde(rename = "toolResponse")]
35 ToolResponse(ToolResponseMessage),
36}
37
38// ── Setup ────────────────────────────────────────────────────────────────────
39
40/// The first (and only) `setup` message, configuring the session.
41///
42/// `model` is the only required field. All others have sensible server
43/// defaults when omitted.
44#[derive(Debug, Clone, Default, Serialize)]
45#[serde(rename_all = "camelCase")]
46pub struct SetupConfig {
47 /// Model resource name, e.g. `"models/gemini-3.1-flash-live-preview"`.
48 pub model: String,
49 #[serde(skip_serializing_if = "Option::is_none")]
50 pub generation_config: Option<GenerationConfig>,
51 #[serde(skip_serializing_if = "Option::is_none")]
52 pub system_instruction: Option<Content>,
53 #[serde(skip_serializing_if = "Option::is_none")]
54 pub tools: Option<Vec<Tool>>,
55 #[serde(skip_serializing_if = "Option::is_none")]
56 pub realtime_input_config: Option<RealtimeInputConfig>,
57 #[serde(skip_serializing_if = "Option::is_none")]
58 pub session_resumption: Option<SessionResumptionConfig>,
59 #[serde(skip_serializing_if = "Option::is_none")]
60 pub context_window_compression: Option<ContextWindowCompressionConfig>,
61 #[serde(skip_serializing_if = "Option::is_none")]
62 pub input_audio_transcription: Option<AudioTranscriptionConfig>,
63 #[serde(skip_serializing_if = "Option::is_none")]
64 pub output_audio_transcription: Option<AudioTranscriptionConfig>,
65 /// Proactive audio (v1alpha, Gemini 2.5 only).
66 #[serde(skip_serializing_if = "Option::is_none")]
67 pub proactivity: Option<ProactivityConfig>,
68 /// History bootstrapping (Gemini 3.1).
69 #[serde(skip_serializing_if = "Option::is_none")]
70 pub history_config: Option<HistoryConfig>,
71}
72
73// ── ClientContent ────────────────────────────────────────────────────────────
74
75/// Conversation history or incremental content.
76///
77/// On Gemini 2.5 this can be sent at any time during the session.
78/// On Gemini 3.1 it can only be sent as initial history (before the first
79/// `realtimeInput`), and requires `historyConfig.initialHistoryInClientContent = true`.
80#[derive(Debug, Clone, Serialize)]
81#[serde(rename_all = "camelCase")]
82pub struct ClientContent {
83 #[serde(skip_serializing_if = "Option::is_none")]
84 pub turns: Option<Vec<Content>>,
85 #[serde(skip_serializing_if = "Option::is_none")]
86 pub turn_complete: Option<bool>,
87}
88
89// ── RealtimeInput ────────────────────────────────────────────────────────────
90
91/// Streaming real-time input — audio, video, text, or VAD control signals.
92///
93/// Each message should carry only **one** of these fields.
94///
95/// # Audio format
96/// 16-bit signed little-endian PCM, recommended 16 kHz sample rate.
97/// Chunk size: 100–250 ms (3,200–8,000 bytes raw).
98///
99/// # Video format
100/// JPEG or PNG, max 1 fps, recommended < 200 KB per frame.
101#[derive(Debug, Clone, Serialize)]
102#[serde(rename_all = "camelCase")]
103pub struct RealtimeInput {
104 #[serde(skip_serializing_if = "Option::is_none")]
105 pub audio: Option<Blob>,
106 #[serde(skip_serializing_if = "Option::is_none")]
107 pub video: Option<Blob>,
108 #[serde(skip_serializing_if = "Option::is_none")]
109 pub text: Option<String>,
110 /// Manual VAD: signal that user activity has started.
111 /// Requires `automaticActivityDetection.disabled = true`.
112 #[serde(skip_serializing_if = "Option::is_none")]
113 pub activity_start: Option<EmptyObject>,
114 /// Manual VAD: signal that user activity has ended.
115 #[serde(skip_serializing_if = "Option::is_none")]
116 pub activity_end: Option<EmptyObject>,
117 /// Auto VAD: notify server that the mic has been muted / stream ended.
118 #[serde(skip_serializing_if = "Option::is_none")]
119 pub audio_stream_end: Option<bool>,
120}
121
122// ── ToolResponse ─────────────────────────────────────────────────────────────
123
124/// Response to one or more server-initiated function calls.
125#[derive(Debug, Clone, Serialize)]
126#[serde(rename_all = "camelCase")]
127pub struct ToolResponseMessage {
128 pub function_responses: Vec<FunctionResponse>,
129}
130
131/// A single function call result, keyed by the server-assigned `id`.
132#[derive(Debug, Clone, Serialize)]
133#[serde(rename_all = "camelCase")]
134pub struct FunctionResponse {
135 /// Must match the `id` from the corresponding [`FunctionCallRequest`](super::server_message::FunctionCallRequest).
136 pub id: String,
137 pub name: String,
138 /// Arbitrary JSON result returned to the model.
139 pub response: serde_json::Value,
140}