Skip to main content

gemini_live/types/
config.rs

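//! Configuration types used within the `setup` message.
//!
//! These control model behaviour, audio/video input handling, VAD, session
//! resumption, context compression, and more.  Structs whose fields are all
//! optional derive [`Default`], so callers can use the `..Default::default()`
//! pattern for partial init; structs with required fields (e.g.
//! [`SpeechConfig`], [`Tool`], [`FunctionDeclaration`]) do not derive it and
//! must be constructed explicitly.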
6
7use serde::{Deserialize, Serialize};
8
9// ── Generation config ────────────────────────────────────────────────────────
10
11/// Controls how the model generates responses.
12#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
13#[serde(rename_all = "camelCase")]
14pub struct GenerationConfig {
15    /// Which modalities the model should produce (`AUDIO`, `TEXT`, or both).
16    #[serde(skip_serializing_if = "Option::is_none")]
17    pub response_modalities: Option<Vec<Modality>>,
18    #[serde(skip_serializing_if = "Option::is_none")]
19    pub speech_config: Option<SpeechConfig>,
20    #[serde(skip_serializing_if = "Option::is_none")]
21    pub thinking_config: Option<ThinkingConfig>,
22    /// Image resolution hint sent to the model.
23    #[serde(skip_serializing_if = "Option::is_none")]
24    pub media_resolution: Option<MediaResolution>,
25    #[serde(skip_serializing_if = "Option::is_none")]
26    pub temperature: Option<f32>,
27    #[serde(skip_serializing_if = "Option::is_none")]
28    pub top_p: Option<f32>,
29    #[serde(skip_serializing_if = "Option::is_none")]
30    pub top_k: Option<u32>,
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub max_output_tokens: Option<u32>,
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub candidate_count: Option<u32>,
35}
36
37/// Output modality requested from the model.
38#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
39#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
40pub enum Modality {
41    Audio,
42    Text,
43}
44
45// ── Speech / Voice ───────────────────────────────────────────────────────────
46
47#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
48#[serde(rename_all = "camelCase")]
49pub struct SpeechConfig {
50    pub voice_config: VoiceConfig,
51}
52
53#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
54#[serde(rename_all = "camelCase")]
55pub struct VoiceConfig {
56    pub prebuilt_voice_config: PrebuiltVoiceConfig,
57}
58
59#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
60#[serde(rename_all = "camelCase")]
61pub struct PrebuiltVoiceConfig {
62    /// Voice name, e.g. `"Kore"`, `"Puck"`, `"Charon"`, etc.
63    pub voice_name: String,
64}
65
66// ── Thinking ─────────────────────────────────────────────────────────────────
67
68/// Thinking / reasoning configuration.
69///
70/// Gemini 3.1 uses `thinking_level` (enum), while Gemini 2.5 uses
71/// `thinking_budget` (token count).  Both may be set; the model ignores
72/// the field it doesn't understand.
73#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
74#[serde(rename_all = "camelCase")]
75pub struct ThinkingConfig {
76    /// Gemini 3.1: discrete level.
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub thinking_level: Option<ThinkingLevel>,
79    /// Gemini 2.5: token budget.
80    #[serde(skip_serializing_if = "Option::is_none")]
81    pub thinking_budget: Option<u32>,
82    #[serde(skip_serializing_if = "Option::is_none")]
83    pub include_thoughts: Option<bool>,
84}
85
86#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
87#[serde(rename_all = "lowercase")]
88pub enum ThinkingLevel {
89    Minimal,
90    Low,
91    Medium,
92    High,
93}
94
95// ── Media resolution ─────────────────────────────────────────────────────────
96
97#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
98#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
99pub enum MediaResolution {
100    MediaResolutionLow,
101    MediaResolutionHigh,
102}
103
104// ── Realtime input config ────────────────────────────────────────────────────
105
106/// Controls how the server interprets real-time audio/video input.
107#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
108#[serde(rename_all = "camelCase")]
109pub struct RealtimeInputConfig {
110    #[serde(skip_serializing_if = "Option::is_none")]
111    pub automatic_activity_detection: Option<AutomaticActivityDetection>,
112    /// What happens when user activity is detected while the model is speaking.
113    #[serde(skip_serializing_if = "Option::is_none")]
114    pub activity_handling: Option<ActivityHandling>,
115    /// What audio is included in the user's turn.
116    #[serde(skip_serializing_if = "Option::is_none")]
117    pub turn_coverage: Option<TurnCoverage>,
118}
119
120/// Server-side Voice Activity Detection parameters.
121///
122/// When `disabled` is `true`, the client must send `activityStart` /
123/// `activityEnd` signals manually.
124#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
125#[serde(rename_all = "camelCase")]
126pub struct AutomaticActivityDetection {
127    #[serde(skip_serializing_if = "Option::is_none")]
128    pub disabled: Option<bool>,
129    #[serde(skip_serializing_if = "Option::is_none")]
130    pub start_of_speech_sensitivity: Option<StartSensitivity>,
131    /// Milliseconds of audio to retain before the detected speech onset.
132    #[serde(skip_serializing_if = "Option::is_none")]
133    pub prefix_padding_ms: Option<u32>,
134    #[serde(skip_serializing_if = "Option::is_none")]
135    pub end_of_speech_sensitivity: Option<EndSensitivity>,
136    /// Milliseconds of silence required to mark speech as ended.
137    #[serde(skip_serializing_if = "Option::is_none")]
138    pub silence_duration_ms: Option<u32>,
139}
140
141#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
142#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
143pub enum StartSensitivity {
144    StartSensitivityHigh,
145    StartSensitivityLow,
146}
147
148#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
149#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
150pub enum EndSensitivity {
151    EndSensitivityHigh,
152    EndSensitivityLow,
153}
154
155/// What happens when user activity (speech) is detected while the model is
156/// generating a response.
157#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
158#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
159pub enum ActivityHandling {
160    /// User speech interrupts the model (default).
161    StartOfActivityInterrupts,
162    /// Model continues uninterrupted.
163    NoInterruption,
164}
165
166/// Which portions of the audio stream are included in the user's turn.
167#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
168#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
169pub enum TurnCoverage {
170    /// Only detected speech activity (default on Gemini 2.5).
171    TurnIncludesOnlyActivity,
172    /// All audio including silence.
173    TurnIncludesAllInput,
174    /// Speech activity + all video frames (default on Gemini 3.1).
175    TurnIncludesAudioActivityAndAllVideo,
176}
177
178// ── Session resumption ───────────────────────────────────────────────────────
179
180/// Enables session resumption.  Include an empty struct to opt in; pass a
181/// previous `handle` to resume a disconnected session.
182///
183/// Handles are valid for **2 hours** after disconnect; sessions can be
184/// resumed within **24 hours**.
185#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
186#[serde(rename_all = "camelCase")]
187pub struct SessionResumptionConfig {
188    #[serde(skip_serializing_if = "Option::is_none")]
189    pub handle: Option<String>,
190}
191
192// ── Context window compression ───────────────────────────────────────────────
193
194/// Server-side context compression.  When the context grows past
195/// `trigger_tokens`, the server compresses it down to
196/// `sliding_window.target_tokens`.
197#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
198#[serde(rename_all = "camelCase")]
199pub struct ContextWindowCompressionConfig {
200    #[serde(skip_serializing_if = "Option::is_none")]
201    pub sliding_window: Option<SlidingWindow>,
202    /// Token count that triggers compression (default ≈ 80% of context limit).
203    #[serde(skip_serializing_if = "Option::is_none")]
204    pub trigger_tokens: Option<u64>,
205}
206
207#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
208#[serde(rename_all = "camelCase")]
209pub struct SlidingWindow {
210    #[serde(skip_serializing_if = "Option::is_none")]
211    pub target_tokens: Option<u64>,
212}
213
214// ── Transcription ────────────────────────────────────────────────────────────
215
216/// Presence-activated config — include an empty `{}` to enable transcription
217/// for the corresponding direction (input or output).
218#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
219pub struct AudioTranscriptionConfig {}
220
221// ── Proactivity (v1alpha, Gemini 2.5) ────────────────────────────────────────
222
223#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
224#[serde(rename_all = "camelCase")]
225pub struct ProactivityConfig {
226    #[serde(skip_serializing_if = "Option::is_none")]
227    pub proactive_audio: Option<bool>,
228}
229
230// ── History (Gemini 3.1) ─────────────────────────────────────────────────────
231
232/// Controls how conversation history is bootstrapped.
233///
234/// On Gemini 3.1, `clientContent` can only be sent as initial history
235/// (before the first `realtimeInput`).  Set `initial_history_in_client_content`
236/// to `true` to enable this.
237#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
238#[serde(rename_all = "camelCase")]
239pub struct HistoryConfig {
240    #[serde(skip_serializing_if = "Option::is_none")]
241    pub initial_history_in_client_content: Option<bool>,
242}
243
244// ── Tool definitions ─────────────────────────────────────────────────────────
245
246#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
247#[serde(rename_all = "camelCase")]
248pub struct Tool {
249    pub function_declarations: Vec<FunctionDeclaration>,
250}
251
252#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
253#[serde(rename_all = "camelCase")]
254pub struct FunctionDeclaration {
255    pub name: String,
256    pub description: String,
257    /// JSON Schema object describing the function's parameters.
258    pub parameters: serde_json::Value,
259    /// Gemini 2.5: when to trigger this function relative to model output.
260    #[serde(skip_serializing_if = "Option::is_none")]
261    pub scheduling: Option<FunctionScheduling>,
262    /// Gemini 2.5: whether the function blocks model generation.
263    #[serde(skip_serializing_if = "Option::is_none")]
264    pub behavior: Option<FunctionBehavior>,
265}
266
267/// When the function call is dispatched relative to model output.
268#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
269#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
270pub enum FunctionScheduling {
271    /// Immediately interrupt model output (default).
272    Interrupt,
273    /// Wait until the model is idle.
274    WhenIdle,
275    /// Run silently without interrupting.
276    Silent,
277}
278
279/// Whether the function response blocks continued model generation.
280#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
281#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
282pub enum FunctionBehavior {
283    /// Model continues generating while awaiting the response.
284    NonBlocking,
285}