Skip to main content

gemini_live/types/
config.rs

1//! Configuration types used within the `setup` message.
2//!
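//! These control model behaviour, audio/video input handling, VAD, session
//! resumption, context compression, and more.  Most structs derive [`Default`]
//! so callers can use the `..Default::default()` pattern for partial init;
//! the exceptions are types whose fields are all required, such as
//! [`SpeechConfig`] and [`FunctionDeclaration`].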
6//!
7//! When upstream model families differ, prefer documenting the difference on
8//! the exact field or type that carries it.
9
10use std::fmt;
11use std::str::FromStr;
12
13use serde::{Deserialize, Serialize};
14
15// ── Generation config ────────────────────────────────────────────────────────
16
/// Controls how the model generates responses.
///
/// Keys are serialized in camelCase.  Every field is optional and is left out
/// of the serialized output when it is `None`, so a default value serializes
/// with no fields at all.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct GenerationConfig {
    /// Which modalities the model should produce (`AUDIO`, `TEXT`, or both).
    ///
    /// For voice-first clients this is typically `[AUDIO]` or
    /// `[AUDIO, TEXT]`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_modalities: Option<Vec<Modality>>,
    /// Voice selection for spoken output; see [`SpeechConfig`].
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speech_config: Option<SpeechConfig>,
    /// Thinking / reasoning settings; see [`ThinkingConfig`] for the
    /// model-family-specific fields.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking_config: Option<ThinkingConfig>,
    /// Image resolution hint sent to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub media_resolution: Option<MediaResolution>,
    /// `temperature` generation parameter, passed through unchanged.
    ///
    /// NOTE(review): the accepted range is not defined in this file — confirm
    /// against the API reference.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// `topP` generation parameter, passed through unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,
    /// `topK` generation parameter, passed through unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,
    /// `maxOutputTokens` generation parameter, passed through unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<u32>,
    /// `candidateCount` generation parameter, passed through unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub candidate_count: Option<u32>,
}
45
46/// Output modality requested from the model.
47#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
48#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
49pub enum Modality {
50    Audio,
51    Text,
52}
53
54// ── Speech / Voice ───────────────────────────────────────────────────────────
55
/// Speech output settings carried in [`GenerationConfig::speech_config`].
///
/// Serializes as `{"voiceConfig": {...}}`.  This type does not derive
/// `Default`; callers must supply the field.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechConfig {
    /// Voice selection; always serialized.
    pub voice_config: VoiceConfig,
}
61
/// Voice selection wrapper nested inside [`SpeechConfig`].
///
/// Serializes as `{"prebuiltVoiceConfig": {...}}`.  This type does not derive
/// `Default`; callers must supply the field.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct VoiceConfig {
    /// The prebuilt voice to use; always serialized.
    pub prebuilt_voice_config: PrebuiltVoiceConfig,
}
67
/// Names one of the prebuilt voices.
///
/// Serializes as `{"voiceName": "..."}`.  The name is stored as a free-form
/// string; this type performs no validation of it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct PrebuiltVoiceConfig {
    /// Voice name, e.g. `"Kore"`, `"Puck"`, `"Charon"`, etc.
    pub voice_name: String,
}
74
75// ── Thinking ─────────────────────────────────────────────────────────────────
76
/// Thinking / reasoning configuration.
///
/// Gemini 3.1 uses `thinking_level` (enum), while Gemini 2.5 uses
/// `thinking_budget` (token count).  Both may be set; the model ignores
/// the field it doesn't understand.
///
/// Keys are serialized in camelCase and any field left as `None` is omitted.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ThinkingConfig {
    /// Gemini 3.1: discrete level.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking_level: Option<ThinkingLevel>,
    /// Gemini 2.5: token budget.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thinking_budget: Option<u32>,
    /// Serialized as `includeThoughts`.
    ///
    /// NOTE(review): presumably asks the server to return the model's
    /// thoughts alongside its output — confirm against the API reference.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_thoughts: Option<bool>,
}
94
/// Discrete thinking level used by [`ThinkingConfig::thinking_level`].
///
/// Serialized as the lowercase variant name (`"minimal"`, `"low"`,
/// `"medium"`, `"high"`), which are the same strings produced by
/// [`ThinkingLevel::as_str`] and accepted by the `FromStr` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ThinkingLevel {
    /// Serialized as `"minimal"`.
    Minimal,
    /// Serialized as `"low"`.
    Low,
    /// Serialized as `"medium"`.
    Medium,
    /// Serialized as `"high"`.
    High,
}
103
104impl ThinkingLevel {
105    pub const fn as_str(self) -> &'static str {
106        match self {
107            Self::Minimal => "minimal",
108            Self::Low => "low",
109            Self::Medium => "medium",
110            Self::High => "high",
111        }
112    }
113}
114
115impl fmt::Display for ThinkingLevel {
116    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
117        f.write_str(self.as_str())
118    }
119}
120
/// Error produced when a string does not name a [`ThinkingLevel`].
///
/// Carries the rejected input so callers can report it back to the user.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseThinkingLevelError {
    raw: String,
}

impl ParseThinkingLevelError {
    /// The input text that failed to parse, as stored in this error.
    pub fn raw(&self) -> &str {
        self.raw.as_str()
    }
}

impl fmt::Display for ParseThinkingLevelError {
    /// Writes a one-line message naming the bad input and the accepted values.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `{:?}` quotes and escapes the input so stray whitespace or control
        // characters stay visible in the message.
        f.write_fmt(format_args!(
            "unsupported thinking level {:?}; expected one of: minimal, low, medium, high",
            self.raw
        ))
    }
}

// No underlying cause is wrapped, so the default `Error` methods suffice.
impl std::error::Error for ParseThinkingLevelError {}
143
144impl FromStr for ThinkingLevel {
145    type Err = ParseThinkingLevelError;
146
147    fn from_str(raw: &str) -> Result<Self, Self::Err> {
148        match raw.trim().to_ascii_lowercase().as_str() {
149            "minimal" => Ok(Self::Minimal),
150            "low" => Ok(Self::Low),
151            "medium" => Ok(Self::Medium),
152            "high" => Ok(Self::High),
153            _ => Err(ParseThinkingLevelError {
154                raw: raw.to_string(),
155            }),
156        }
157    }
158}
159
160// ── Media resolution ─────────────────────────────────────────────────────────
161
162#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
163#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
164pub enum MediaResolution {
165    MediaResolutionLow,
166    MediaResolutionHigh,
167}
168
169// ── Realtime input config ────────────────────────────────────────────────────
170
/// Controls how the server interprets real-time audio/video input.
///
/// Keys are serialized in camelCase and any field left as `None` is omitted.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RealtimeInputConfig {
    /// Server-side voice activity detection settings; see
    /// [`AutomaticActivityDetection`].
    #[serde(skip_serializing_if = "Option::is_none")]
    pub automatic_activity_detection: Option<AutomaticActivityDetection>,
    /// What happens when user activity is detected while the model is speaking.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub activity_handling: Option<ActivityHandling>,
    /// What audio is included in the user's turn.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub turn_coverage: Option<TurnCoverage>,
}
184
/// Server-side Voice Activity Detection parameters.
///
/// When `disabled` is `true`, the client must send `activityStart` /
/// `activityEnd` signals manually.
///
/// Keys are serialized in camelCase and any field left as `None` is omitted.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct AutomaticActivityDetection {
    /// Set to `true` to turn server-side detection off (see the type-level
    /// note about manual activity signals).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub disabled: Option<bool>,
    /// Sensitivity setting for detecting the start of speech.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub start_of_speech_sensitivity: Option<StartSensitivity>,
    /// Milliseconds of audio to retain before the detected speech onset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefix_padding_ms: Option<u32>,
    /// Sensitivity setting for detecting the end of speech.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub end_of_speech_sensitivity: Option<EndSensitivity>,
    /// Milliseconds of silence required to mark speech as ended.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub silence_duration_ms: Option<u32>,
}
205
206#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
207#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
208pub enum StartSensitivity {
209    StartSensitivityHigh,
210    StartSensitivityLow,
211}
212
213#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
214#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
215pub enum EndSensitivity {
216    EndSensitivityHigh,
217    EndSensitivityLow,
218}
219
220/// What happens when user activity (speech) is detected while the model is
221/// generating a response.
222#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
223#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
224pub enum ActivityHandling {
225    /// User speech interrupts the model (default).
226    StartOfActivityInterrupts,
227    /// Model continues uninterrupted.
228    NoInterruption,
229}
230
231/// Which portions of the audio stream are included in the user's turn.
232#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
233#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
234pub enum TurnCoverage {
235    /// Only detected speech activity (default on Gemini 2.5).
236    TurnIncludesOnlyActivity,
237    /// All audio including silence.
238    TurnIncludesAllInput,
239    /// Speech activity + all video frames (default on Gemini 3.1).
240    TurnIncludesAudioActivityAndAllVideo,
241}
242
243// ── Session resumption ───────────────────────────────────────────────────────
244
/// Enables session resumption.  Include an empty struct to opt in; pass a
/// previous `handle` to resume a disconnected session.
///
/// The Live API currently documents resumption tokens as valid for **2 hours
/// after the last session termination**.
///
/// The default value has no handle and therefore serializes with no fields.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SessionResumptionConfig {
    /// Handle of the session to resume; leave as `None` to opt in without
    /// resuming.  Omitted from the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub handle: Option<String>,
}
256
257// ── Context window compression ───────────────────────────────────────────────
258
/// Server-side context compression.  When the context grows past
/// `trigger_tokens`, the server compresses it down to
/// `sliding_window.target_tokens`.
///
/// This is not a presence-activated empty object. The Live API expects a
/// compression mechanism to be selected, so callers should normally send at
/// least `sliding_window: Some(SlidingWindow::default())`, which serializes as
/// `{"slidingWindow": {}}`.
///
/// Note that `Default::default()` leaves `sliding_window` as `None`, i.e. no
/// mechanism selected.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ContextWindowCompressionConfig {
    /// Sliding-window compression mechanism; see [`SlidingWindow`].
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sliding_window: Option<SlidingWindow>,
    /// Token count that triggers compression (default ≈ 80% of context limit).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub trigger_tokens: Option<u64>,
}
276
/// Sliding-window mechanism for [`ContextWindowCompressionConfig`].
///
/// The default value serializes as an empty object (`{}`).
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SlidingWindow {
    /// Token count the server compresses the context down to.  Omitted from
    /// the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub target_tokens: Option<u64>,
}
283
284// ── Transcription ────────────────────────────────────────────────────────────
285
/// Presence-activated config — include an empty `{}` to enable transcription
/// for the corresponding direction (input or output).
///
/// The server treats the field's presence as the signal; callers should send
/// `Some(AudioTranscriptionConfig {})`, not a boolean.
///
/// The struct is declared with empty braces (not as a unit struct) so that it
/// serializes as an empty object rather than `null`.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct AudioTranscriptionConfig {}
293
294// ── Proactivity (v1alpha, Gemini 2.5) ────────────────────────────────────────
295
/// Gemini 2.5-only proactive audio settings (`v1alpha`).
///
/// New model families may ignore this entirely, so callers should treat it as
/// model-specific rather than universally available.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProactivityConfig {
    /// Serialized as `proactiveAudio`; omitted when `None`.
    ///
    /// NOTE(review): the exact server-side effect of this flag is not
    /// described in this file — confirm against the `v1alpha` API reference.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub proactive_audio: Option<bool>,
}
306
307// ── History (Gemini 3.1) ─────────────────────────────────────────────────────
308
/// Controls how conversation history is bootstrapped.
///
/// On Gemini 3.1, `clientContent` can only be sent as initial history
/// (before the first `realtimeInput`).  Set `initial_history_in_client_content`
/// to `true` to enable this.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct HistoryConfig {
    /// Serialized as `initialHistoryInClientContent`; omitted when `None`.
    /// See the type-level docs for when to set it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub initial_history_in_client_content: Option<bool>,
}
320
321// ── Tool definitions ─────────────────────────────────────────────────────────
322
/// Tool declarations made available during `setup`.
///
/// Each entry maps to exactly one top-level tool field on the wire, for
/// example `{"googleSearch": {}}` or
/// `{"functionDeclarations": [{...}, {...}]}`.
///
/// Keep built-in Live tools as first-class enum variants here instead of
/// forcing callers through raw JSON.
///
/// The one-key-per-value shape comes from serde's default externally tagged
/// enum representation combined with the per-variant `rename` attributes
/// below.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Tool {
    /// Custom client-side functions that the model may call.
    #[serde(rename = "functionDeclarations")]
    FunctionDeclarations(Vec<FunctionDeclaration>),
    /// Google-managed web search executed on the server side.
    #[serde(rename = "googleSearch")]
    GoogleSearch(GoogleSearchTool),
}
340
/// Enable the built-in Google Search tool.
///
/// This is a presence-activated empty object on the wire:
/// `{"googleSearch": {}}`.
///
/// The struct has no fields; construct it with `GoogleSearchTool {}` or
/// `GoogleSearchTool::default()`.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct GoogleSearchTool {}
347
/// A custom function the model may call during the session.
///
/// Keep this aligned with the official Live API tool docs. Gemini 3.1 and 2.5
/// differ in important ways, especially around asynchronous tool execution.
///
/// `name`, `description` and `parameters` are always serialized and are
/// required when deserializing; this type does not derive `Default`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct FunctionDeclaration {
    /// Function name.
    pub name: String,
    /// Description of the function.
    pub description: String,
    /// JSON Schema object describing the function's parameters.
    pub parameters: serde_json::Value,
    /// Crate-level scheduling field retained for Gemini 2.5 compatibility
    /// work.
    ///
    /// Official current Live API docs place response scheduling inside the
    /// tool-response payload, not here. Treat this field as under audit until
    /// roadmap item `F-9` is completed.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scheduling: Option<FunctionScheduling>,
    /// Gemini 2.5: whether the function blocks model generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub behavior: Option<FunctionBehavior>,
}
371
372/// Scheduling values historically associated with Gemini 2.5 tool execution.
373///
374/// See [`FunctionDeclaration::scheduling`] for the current crate caveat.
375#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
376#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
377pub enum FunctionScheduling {
378    /// Immediately interrupt model output (default).
379    Interrupt,
380    /// Wait until the model is idle.
381    WhenIdle,
382    /// Run silently without interrupting.
383    Silent,
384}
385
386/// Whether the function response blocks continued model generation.
387#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
388#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
389pub enum FunctionBehavior {
390    /// Model continues generating while awaiting the response.
391    NonBlocking,
392}