openai_rust_sdk/models/realtime_audio/
session_config.rs

1//! Session configuration types and enums for real-time audio
2//!
3//! Contains data structures for configuring real-time audio sessions,
4//! including audio formats, voice options, detection settings, and tools.
5
6use crate::{De, Ser};
7use chrono::{DateTime, Utc};
8use serde::{self, Deserialize, Serialize};
9
10/// Request for creating a real-time audio session
11#[derive(Debug, Clone, Ser, De)]
12pub struct RealtimeSessionRequest {
13    /// The model to use for real-time audio
14    pub model: String,
15
16    /// Configuration for the audio session
17    #[serde(skip_serializing_if = "Option::is_none")]
18    pub config: Option<RealtimeSessionConfig>,
19
20    /// Instructions for the AI assistant
21    #[serde(skip_serializing_if = "Option::is_none")]
22    pub instructions: Option<String>,
23
24    /// Voice to use for responses
25    #[serde(skip_serializing_if = "Option::is_none")]
26    pub voice: Option<RealtimeVoice>,
27
28    /// Temperature for response generation
29    #[serde(skip_serializing_if = "Option::is_none")]
30    pub temperature: Option<f32>,
31
32    /// Maximum response tokens
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub max_response_output_tokens: Option<u32>,
35}
36
37/// Configuration for real-time audio session
38#[derive(Debug, Clone, Ser, De)]
39pub struct RealtimeSessionConfig {
40    /// Audio input configuration
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub input_audio_format: Option<RealtimeAudioFormat>,
43
44    /// Audio output configuration
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub output_audio_format: Option<RealtimeAudioFormat>,
47
48    /// Voice activity detection settings
49    #[serde(skip_serializing_if = "Option::is_none")]
50    pub voice_activity_detection: Option<VoiceActivityDetectionConfig>,
51
52    /// Turn detection settings
53    #[serde(skip_serializing_if = "Option::is_none")]
54    pub turn_detection: Option<TurnDetectionConfig>,
55
56    /// Tools available to the assistant
57    #[serde(skip_serializing_if = "Option::is_none")]
58    pub tools: Option<Vec<RealtimeTool>>,
59
60    /// Tool choice configuration
61    #[serde(skip_serializing_if = "Option::is_none")]
62    pub tool_choice: Option<String>,
63
64    /// Modalities supported in the session
65    #[serde(skip_serializing_if = "Option::is_none")]
66    pub modalities: Option<Vec<RealtimeModality>>,
67}
68
69/// Audio formats supported for real-time streaming
70#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
71#[serde(rename_all = "lowercase")]
72pub enum RealtimeAudioFormat {
73    /// Raw PCM 16-bit 24kHz mono
74    #[serde(rename = "pcm16")]
75    Pcm16,
76    /// G.711 ยต-law encoding
77    #[serde(rename = "g711_ulaw")]
78    G711Ulaw,
79    /// G.711 A-law encoding
80    #[serde(rename = "g711_alaw")]
81    G711Alaw,
82}
83
84/// Voice options for real-time audio responses
85#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
86#[serde(rename_all = "lowercase")]
87pub enum RealtimeVoice {
88    /// Alloy voice - balanced and natural
89    Alloy,
90    /// Echo voice - deep and resonant
91    Echo,
92    /// Fable voice - expressive and storytelling
93    Fable,
94    /// Onyx voice - authoritative and deep
95    Onyx,
96    /// Nova voice - bright and energetic
97    Nova,
98    /// Shimmer voice - warm and friendly
99    Shimmer,
100}
101
102/// Voice activity detection configuration
103#[derive(Debug, Clone, Ser, De)]
104pub struct VoiceActivityDetectionConfig {
105    /// Threshold for voice activity detection
106    pub threshold: f32,
107
108    /// Prefix padding in milliseconds
109    pub prefix_padding_ms: u32,
110
111    /// Silence duration to detect end of speech
112    pub silence_duration_ms: u32,
113}
114
115/// Turn detection configuration
116#[derive(Debug, Clone, Ser, De)]
117pub struct TurnDetectionConfig {
118    /// Type of turn detection
119    #[serde(rename = "type")]
120    pub detection_type: TurnDetectionType,
121
122    /// Threshold for turn detection
123    #[serde(skip_serializing_if = "Option::is_none")]
124    pub threshold: Option<f32>,
125
126    /// Prefix padding in milliseconds
127    #[serde(skip_serializing_if = "Option::is_none")]
128    pub prefix_padding_ms: Option<u32>,
129
130    /// Silence duration in milliseconds
131    #[serde(skip_serializing_if = "Option::is_none")]
132    pub silence_duration_ms: Option<u32>,
133}
134
135/// Types of turn detection
136#[derive(Debug, Clone, Ser, De)]
137#[serde(rename_all = "snake_case")]
138pub enum TurnDetectionType {
139    /// Server-side voice activity detection
140    ServerVad,
141    /// No turn detection
142    None,
143}
144
145/// Modalities supported in real-time sessions
146#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
147#[serde(rename_all = "lowercase")]
148pub enum RealtimeModality {
149    /// Text modality
150    Text,
151    /// Audio modality
152    Audio,
153}
154
155/// Tool definition for real-time sessions
156#[derive(Debug, Clone, Ser, De)]
157pub struct RealtimeTool {
158    /// Type of the tool
159    #[serde(rename = "type")]
160    pub tool_type: String,
161
162    /// Name of the function
163    pub name: String,
164
165    /// Description of the function
166    pub description: String,
167
168    /// Parameters schema
169    pub parameters: serde_json::Value,
170}
171
172/// Response from session creation
173#[derive(Debug, Clone, Ser, De)]
174pub struct RealtimeSessionResponse {
175    /// Unique session identifier
176    pub id: String,
177
178    /// Session object type
179    pub object: String,
180
181    /// Current session status
182    pub status: SessionStatus,
183
184    /// Ephemeral API key for WebRTC connection
185    pub ephemeral_key: String,
186
187    /// WebRTC connection URL
188    pub webrtc_url: String,
189
190    /// Session configuration
191    pub config: RealtimeSessionConfig,
192
193    /// Expiration time of the session
194    pub expires_at: DateTime<Utc>,
195
196    /// Creation timestamp
197    pub created_at: DateTime<Utc>,
198}
199
200/// Session status
201#[derive(Debug, Clone, Ser, De)]
202#[serde(rename_all = "lowercase")]
203pub enum SessionStatus {
204    /// Session is active and ready
205    Active,
206    /// Session is connecting
207    Connecting,
208    /// Session is disconnected
209    Disconnected,
210    /// Session has expired
211    Expired,
212    /// Session encountered an error
213    Error,
214}
215
216/// Response configuration
217#[derive(Debug, Clone, Ser, De)]
218pub struct ResponseConfig {
219    /// Modalities for the response
220    #[serde(skip_serializing_if = "Option::is_none")]
221    pub modalities: Option<Vec<RealtimeModality>>,
222
223    /// Instructions for the response
224    #[serde(skip_serializing_if = "Option::is_none")]
225    pub instructions: Option<String>,
226
227    /// Voice to use for audio responses
228    #[serde(skip_serializing_if = "Option::is_none")]
229    pub voice: Option<RealtimeVoice>,
230
231    /// Output audio format
232    #[serde(skip_serializing_if = "Option::is_none")]
233    pub output_audio_format: Option<RealtimeAudioFormat>,
234
235    /// Tools available for the response
236    #[serde(skip_serializing_if = "Option::is_none")]
237    pub tools: Option<Vec<RealtimeTool>>,
238
239    /// Tool choice configuration
240    #[serde(skip_serializing_if = "Option::is_none")]
241    pub tool_choice: Option<String>,
242
243    /// Temperature for response generation
244    #[serde(skip_serializing_if = "Option::is_none")]
245    pub temperature: Option<f32>,
246
247    /// Maximum response output tokens
248    #[serde(skip_serializing_if = "Option::is_none")]
249    pub max_response_output_tokens: Option<u32>,
250}
251
252/// Input audio transcription configuration
253#[derive(Debug, Clone, Ser, De)]
254pub struct InputAudioTranscriptionConfig {
255    /// Whether transcription is enabled
256    pub enabled: bool,
257
258    /// Model to use for transcription
259    pub model: String,
260}
261
/// Real-time audio models.
///
/// A unit struct used purely as a namespace for the model identifier
/// strings; it carries no data.
pub struct RealtimeAudioModels;

impl RealtimeAudioModels {
    // Associated consts are implicitly `'static`, so the lifetime is elided.

    /// GPT-4o real-time preview model
    pub const GPT_4O_REALTIME_PREVIEW: &str = "gpt-4o-realtime-preview";

    /// GPT-4o mini real-time preview model
    pub const GPT_4O_MINI_REALTIME_PREVIEW: &str = "gpt-4o-mini-realtime-preview";
}
272
273impl Default for RealtimeSessionConfig {
274    fn default() -> Self {
275        Self {
276            input_audio_format: Some(RealtimeAudioFormat::Pcm16),
277            output_audio_format: Some(RealtimeAudioFormat::Pcm16),
278            voice_activity_detection: Some(VoiceActivityDetectionConfig::default()),
279            turn_detection: Some(TurnDetectionConfig {
280                detection_type: TurnDetectionType::ServerVad,
281                threshold: Some(0.5),
282                prefix_padding_ms: Some(300),
283                silence_duration_ms: Some(200),
284            }),
285            tools: None,
286            tool_choice: Some("auto".to_string()),
287            modalities: Some(vec![RealtimeModality::Text, RealtimeModality::Audio]),
288        }
289    }
290}
291
292impl Default for VoiceActivityDetectionConfig {
293    fn default() -> Self {
294        Self {
295            threshold: 0.5,
296            prefix_padding_ms: 300,
297            silence_duration_ms: 200,
298        }
299    }
300}
301
#[cfg(test)]
mod tests {
    use super::*;

    /// A fully populated request keeps the values it was built with.
    #[test]
    fn test_session_request_creation() {
        let model = RealtimeAudioModels::GPT_4O_REALTIME_PREVIEW.to_string();
        let request = RealtimeSessionRequest {
            model,
            config: Some(RealtimeSessionConfig::default()),
            instructions: Some(String::from("You are a helpful assistant.")),
            voice: Some(RealtimeVoice::Alloy),
            temperature: Some(0.8),
            max_response_output_tokens: Some(4096),
        };

        assert_eq!(request.model, "gpt-4o-realtime-preview");
        assert!(request.config.is_some());
        assert_eq!(request.voice, Some(RealtimeVoice::Alloy));
    }

    /// The default voice activity detection settings carry the expected values.
    #[test]
    fn test_voice_activity_detection_config() {
        let VoiceActivityDetectionConfig {
            threshold,
            prefix_padding_ms,
            silence_duration_ms,
        } = VoiceActivityDetectionConfig::default();

        assert_eq!(threshold, 0.5);
        assert_eq!(prefix_padding_ms, 300);
        assert_eq!(silence_duration_ms, 200);
    }

    /// Audio formats serialize to their explicitly renamed JSON strings.
    #[test]
    fn test_audio_format_serialization() {
        let cases = [
            (RealtimeAudioFormat::Pcm16, "\"pcm16\""),
            (RealtimeAudioFormat::G711Ulaw, "\"g711_ulaw\""),
        ];

        for (format, expected_json) in cases.iter() {
            let json = serde_json::to_string(format).unwrap();
            assert_eq!(json, *expected_json);
        }
    }

    /// Voices serialize to their lowercase names.
    #[test]
    fn test_voice_serialization() {
        let cases = [
            (RealtimeVoice::Alloy, "\"alloy\""),
            (RealtimeVoice::Nova, "\"nova\""),
        ];

        for (voice, expected_json) in cases.iter() {
            let json = serde_json::to_string(voice).unwrap();
            assert_eq!(json, *expected_json);
        }
    }

    /// The default session configuration populates the audio formats, both
    /// detection settings, and the `"auto"` tool choice.
    #[test]
    fn test_session_config_default() {
        let RealtimeSessionConfig {
            input_audio_format,
            output_audio_format,
            voice_activity_detection,
            turn_detection,
            tool_choice,
            ..
        } = RealtimeSessionConfig::default();

        assert!(input_audio_format.is_some());
        assert!(output_audio_format.is_some());
        assert!(voice_activity_detection.is_some());
        assert!(turn_detection.is_some());
        assert_eq!(tool_choice, Some("auto".to_string()));
    }
}
363}