Skip to main content

adk_realtime/
config.rs

1//! Configuration types for realtime sessions.
2
3use crate::audio::AudioEncoding;
4use serde::{Deserialize, Serialize};
5use serde_json::Value;
6use std::ops::{Deref, DerefMut};
7
8/// Controls how the realtime session handles user interruptions during agent
9/// audio output.
10///
11/// When set to [`Automatic`](InterruptionDetection::Automatic), the session
12/// uses voice activity detection to detect user speech onset and immediately
13/// cancels the current agent audio output, enabling natural conversational
14/// turn-taking.
15///
16/// When set to [`Manual`](InterruptionDetection::Manual) (the default), the
17/// session relies on explicit API calls (e.g. `response.cancel`) to signal
18/// that the user is interrupting. This gives the application full control
19/// over interruption behavior.
20#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
21#[serde(rename_all = "snake_case")]
22pub enum InterruptionDetection {
23    /// Rely on explicit API calls to signal interruptions.
24    ///
25    /// The application is responsible for detecting user speech and calling
26    /// the appropriate cancellation method on the session. No automatic
27    /// voice activity detection is performed for interruption purposes.
28    #[default]
29    Manual,
30    /// Detect user speech onset and cancel the current agent audio output.
31    ///
32    /// The session monitors incoming audio for voice activity. When user
33    /// speech is detected while the agent is producing audio, the agent's
34    /// audio output is automatically cancelled, allowing the user to
35    /// take the conversational turn.
36    Automatic,
37}
38
39/// Voice Activity Detection mode.
40#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
41#[serde(rename_all = "snake_case")]
42pub enum VadMode {
43    /// Server-side VAD (default for most providers).
44    #[default]
45    ServerVad,
46    /// Semantic VAD (OpenAI-specific).
47    SemanticVad,
48    /// No automatic VAD - manual turn management.
49    None,
50}
51
52/// VAD configuration options.
53#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
54pub struct VadConfig {
55    /// VAD mode to use.
56    #[serde(rename = "type")]
57    pub mode: VadMode,
58    /// Silence duration (ms) before considering speech ended.
59    #[serde(skip_serializing_if = "Option::is_none")]
60    pub silence_duration_ms: Option<u32>,
61    /// Detection threshold (0.0 - 1.0).
62    #[serde(skip_serializing_if = "Option::is_none")]
63    pub threshold: Option<f32>,
64    /// Prefix padding (ms) to include before detected speech.
65    #[serde(skip_serializing_if = "Option::is_none")]
66    pub prefix_padding_ms: Option<u32>,
67    /// Whether to interrupt the model when user starts speaking.
68    #[serde(skip_serializing_if = "Option::is_none")]
69    pub interrupt_response: Option<bool>,
70    /// Eagerness of turn detection (OpenAI-specific).
71    #[serde(skip_serializing_if = "Option::is_none")]
72    pub eagerness: Option<String>,
73}
74
75impl Default for VadConfig {
76    fn default() -> Self {
77        Self {
78            mode: VadMode::ServerVad,
79            silence_duration_ms: Some(500),
80            threshold: None,
81            prefix_padding_ms: None,
82            interrupt_response: Some(true),
83            eagerness: None,
84        }
85    }
86}
87
88impl VadConfig {
89    /// Create a server VAD config with default settings.
90    pub fn server_vad() -> Self {
91        Self::default()
92    }
93
94    /// Create a semantic VAD config (OpenAI).
95    pub fn semantic_vad() -> Self {
96        Self { mode: VadMode::SemanticVad, ..Default::default() }
97    }
98
99    /// Create a config with VAD disabled.
100    pub fn disabled() -> Self {
101        Self { mode: VadMode::None, ..Default::default() }
102    }
103
104    /// Set silence duration threshold.
105    pub fn with_silence_duration(mut self, ms: u32) -> Self {
106        self.silence_duration_ms = Some(ms);
107        self
108    }
109
110    /// Set whether to interrupt on user speech.
111    pub fn with_interrupt(mut self, interrupt: bool) -> Self {
112        self.interrupt_response = Some(interrupt);
113        self
114    }
115}
116
117/// Tool/function definition for realtime sessions.
118#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
119pub struct ToolDefinition {
120    /// Tool name.
121    pub name: String,
122    /// Tool description.
123    #[serde(skip_serializing_if = "Option::is_none")]
124    pub description: Option<String>,
125    /// JSON Schema for parameters.
126    #[serde(skip_serializing_if = "Option::is_none")]
127    pub parameters: Option<Value>,
128}
129
130impl ToolDefinition {
131    /// Create a new tool definition.
132    pub fn new(name: impl Into<String>) -> Self {
133        Self { name: name.into(), description: None, parameters: None }
134    }
135
136    /// Set the tool description.
137    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
138        self.description = Some(desc.into());
139        self
140    }
141
142    /// Set the parameters schema.
143    pub fn with_parameters(mut self, schema: Value) -> Self {
144        self.parameters = Some(schema);
145        self
146    }
147}
148
149/// Configuration for a realtime session.
150#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
151pub struct RealtimeConfig {
152    /// Model to use (provider-specific).
153    #[serde(skip_serializing_if = "Option::is_none")]
154    pub model: Option<String>,
155
156    /// System instruction for the agent.
157    #[serde(skip_serializing_if = "Option::is_none")]
158    pub instruction: Option<String>,
159
160    /// Voice to use for audio output.
161    #[serde(skip_serializing_if = "Option::is_none")]
162    pub voice: Option<String>,
163
164    /// Output modalities: ["text"], ["audio"], or ["text", "audio"].
165    #[serde(skip_serializing_if = "Option::is_none")]
166    pub modalities: Option<Vec<String>>,
167
168    /// Input audio format.
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub input_audio_format: Option<AudioEncoding>,
171
172    /// Output audio format.
173    #[serde(skip_serializing_if = "Option::is_none")]
174    pub output_audio_format: Option<AudioEncoding>,
175
176    /// VAD configuration.
177    #[serde(skip_serializing_if = "Option::is_none")]
178    pub turn_detection: Option<VadConfig>,
179
180    /// Available tools/functions.
181    #[serde(skip_serializing_if = "Option::is_none")]
182    pub tools: Option<Vec<ToolDefinition>>,
183
184    /// Tool selection mode: "auto", "none", "required".
185    #[serde(skip_serializing_if = "Option::is_none")]
186    pub tool_choice: Option<String>,
187
188    /// Whether to include input audio transcription.
189    #[serde(skip_serializing_if = "Option::is_none")]
190    pub input_audio_transcription: Option<TranscriptionConfig>,
191
192    /// Temperature for response generation.
193    #[serde(skip_serializing_if = "Option::is_none")]
194    pub temperature: Option<f32>,
195
196    /// Maximum output tokens.
197    #[serde(skip_serializing_if = "Option::is_none")]
198    pub max_response_output_tokens: Option<u32>,
199
200    /// Cached content resource name (e.g. `cachedContents/1234`).
201    #[serde(skip_serializing_if = "Option::is_none")]
202    pub cached_content: Option<String>,
203
204    /// Interruption detection mode for voice sessions.
205    ///
206    /// Controls whether the session automatically detects user speech to
207    /// cancel agent audio output, or relies on explicit API calls.
208    /// Defaults to [`Manual`](InterruptionDetection::Manual) when `None`.
209    #[serde(skip_serializing_if = "Option::is_none")]
210    pub interruption_detection: Option<InterruptionDetection>,
211
212    /// Provider-specific options.
213    #[serde(skip_serializing_if = "Option::is_none")]
214    pub extra: Option<Value>,
215}
216
217/// A delta payload for safely updating an active realtime session.
218///
219/// Wraps `RealtimeConfig` to prevent struct duplication. Since all fields are
220/// `Option<T>` and skip serialization if `None`, omitting fields preserves
221/// the server's active state.
222///
223/// **⚠️ WARNING:** You must construct a *fresh* configuration containing **only**
224/// the fields to modify. Wrapping your original startup config will resend
225/// immutable fields (like `model`), causing the provider to reject the update.
226///
227/// This is the idiomatic mechanism for dynamic Finite State Machine (FSM) state
228/// transitions, allowing seamless "persona shifts" or tool swaps without
229/// dropping the audio connection.
230///
231/// # Example
232///
233/// ```rust
234/// use adk_realtime::config::{SessionUpdateConfig, RealtimeConfig};
235///
236/// // Update *only* the instruction mid-session.
237/// let delta = SessionUpdateConfig(
238///     RealtimeConfig::default().with_instruction("You are now a travel agent.")
239/// );
240/// ```
241#[derive(Debug, Clone, Default, Serialize, Deserialize)]
242#[serde(transparent)]
243pub struct SessionUpdateConfig(pub RealtimeConfig);
244
245impl Deref for SessionUpdateConfig {
246    type Target = RealtimeConfig;
247
248    fn deref(&self) -> &Self::Target {
249        &self.0
250    }
251}
252
253impl DerefMut for SessionUpdateConfig {
254    fn deref_mut(&mut self) -> &mut Self::Target {
255        &mut self.0
256    }
257}
258
259impl From<RealtimeConfig> for SessionUpdateConfig {
260    fn from(config: RealtimeConfig) -> Self {
261        Self(config)
262    }
263}
264
265/// Transcription configuration.
266#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
267pub struct TranscriptionConfig {
268    /// Transcription model to use.
269    pub model: String,
270}
271
272impl TranscriptionConfig {
273    /// Use whisper-1 for transcription.
274    pub fn whisper() -> Self {
275        Self { model: "whisper-1".to_string() }
276    }
277}
278
279impl RealtimeConfig {
280    /// Create a new empty configuration.
281    pub fn new() -> Self {
282        Self::default()
283    }
284
285    /// Create a builder for RealtimeConfig.
286    pub fn builder() -> RealtimeConfigBuilder {
287        RealtimeConfigBuilder::new()
288    }
289
290    /// Set the model.
291    pub fn with_model(mut self, model: impl Into<String>) -> Self {
292        self.model = Some(model.into());
293        self
294    }
295
296    /// Set the system instruction.
297    pub fn with_instruction(mut self, instruction: impl Into<String>) -> Self {
298        self.instruction = Some(instruction.into());
299        self
300    }
301
302    /// Set the voice.
303    pub fn with_voice(mut self, voice: impl Into<String>) -> Self {
304        self.voice = Some(voice.into());
305        self
306    }
307
308    /// Set output modalities.
309    pub fn with_modalities(mut self, modalities: Vec<String>) -> Self {
310        self.modalities = Some(modalities);
311        self
312    }
313
314    /// Enable text and audio output.
315    pub fn with_text_and_audio(mut self) -> Self {
316        self.modalities = Some(vec!["text".to_string(), "audio".to_string()]);
317        self
318    }
319
320    /// Enable audio-only output.
321    pub fn with_audio_only(mut self) -> Self {
322        self.modalities = Some(vec!["audio".to_string()]);
323        self
324    }
325
326    /// Set VAD configuration.
327    pub fn with_vad(mut self, vad: VadConfig) -> Self {
328        self.turn_detection = Some(vad);
329        self
330    }
331
332    /// Enable server-side VAD with default settings.
333    pub fn with_server_vad(self) -> Self {
334        self.with_vad(VadConfig::server_vad())
335    }
336
337    /// Disable VAD (manual turn management).
338    pub fn without_vad(mut self) -> Self {
339        self.turn_detection = Some(VadConfig::disabled());
340        self
341    }
342
343    /// Add a tool definition.
344    pub fn with_tool(mut self, tool: ToolDefinition) -> Self {
345        self.tools.get_or_insert_with(Vec::new).push(tool);
346        self
347    }
348
349    /// Set multiple tools.
350    pub fn with_tools(mut self, tools: Vec<ToolDefinition>) -> Self {
351        self.tools = Some(tools);
352        self
353    }
354
355    /// Enable input audio transcription.
356    pub fn with_transcription(mut self) -> Self {
357        self.input_audio_transcription = Some(TranscriptionConfig::whisper());
358        self
359    }
360
361    /// Set temperature.
362    pub fn with_temperature(mut self, temp: f32) -> Self {
363        self.temperature = Some(temp);
364        self
365    }
366
367    /// Set cached content resource.
368    pub fn with_cached_content(mut self, content: impl Into<String>) -> Self {
369        self.cached_content = Some(content.into());
370        self
371    }
372
373    /// Set the interruption detection mode.
374    ///
375    /// See [`InterruptionDetection`] for details on each variant.
376    pub fn with_interruption_detection(mut self, mode: InterruptionDetection) -> Self {
377        self.interruption_detection = Some(mode);
378        self
379    }
380
381    /// Enable automatic interruption detection.
382    ///
383    /// The session will detect user speech onset and cancel the current
384    /// agent audio output automatically.
385    pub fn with_automatic_interruption(self) -> Self {
386        self.with_interruption_detection(InterruptionDetection::Automatic)
387    }
388}
389
390/// Builder for RealtimeConfig.
391#[derive(Debug, Clone, Default)]
392pub struct RealtimeConfigBuilder {
393    config: RealtimeConfig,
394}
395
396impl RealtimeConfigBuilder {
397    /// Create a new builder.
398    pub fn new() -> Self {
399        Self::default()
400    }
401
402    /// Set the model.
403    pub fn model(mut self, model: impl Into<String>) -> Self {
404        self.config.model = Some(model.into());
405        self
406    }
407
408    /// Set the system instruction.
409    pub fn instruction(mut self, instruction: impl Into<String>) -> Self {
410        self.config.instruction = Some(instruction.into());
411        self
412    }
413
414    /// Set the voice.
415    pub fn voice(mut self, voice: impl Into<String>) -> Self {
416        self.config.voice = Some(voice.into());
417        self
418    }
419
420    /// Enable VAD.
421    pub fn vad_enabled(mut self, enabled: bool) -> Self {
422        if enabled {
423            self.config.turn_detection = Some(VadConfig::server_vad());
424        } else {
425            self.config.turn_detection = Some(VadConfig::disabled());
426        }
427        self
428    }
429
430    /// Set VAD configuration.
431    pub fn vad(mut self, vad: VadConfig) -> Self {
432        self.config.turn_detection = Some(vad);
433        self
434    }
435
436    /// Add a tool.
437    pub fn tool(mut self, tool: ToolDefinition) -> Self {
438        self.config.tools.get_or_insert_with(Vec::new).push(tool);
439        self
440    }
441
442    /// Set temperature.
443    pub fn temperature(mut self, temp: f32) -> Self {
444        self.config.temperature = Some(temp);
445        self
446    }
447
448    /// Set cached content resource.
449    pub fn cached_content(mut self, content: impl Into<String>) -> Self {
450        self.config.cached_content = Some(content.into());
451        self
452    }
453
454    /// Set the interruption detection mode.
455    pub fn interruption_detection(mut self, mode: InterruptionDetection) -> Self {
456        self.config.interruption_detection = Some(mode);
457        self
458    }
459
460    /// Build the configuration.
461    pub fn build(self) -> RealtimeConfig {
462        self.config
463    }
464}