openai_rust_sdk/models/realtime_audio/
session_config.rs1use crate::{De, Ser};
7use chrono::{DateTime, Utc};
8use serde::{self, Deserialize, Serialize};
9
10#[derive(Debug, Clone, Ser, De)]
12pub struct RealtimeSessionRequest {
13 pub model: String,
15
16 #[serde(skip_serializing_if = "Option::is_none")]
18 pub config: Option<RealtimeSessionConfig>,
19
20 #[serde(skip_serializing_if = "Option::is_none")]
22 pub instructions: Option<String>,
23
24 #[serde(skip_serializing_if = "Option::is_none")]
26 pub voice: Option<RealtimeVoice>,
27
28 #[serde(skip_serializing_if = "Option::is_none")]
30 pub temperature: Option<f32>,
31
32 #[serde(skip_serializing_if = "Option::is_none")]
34 pub max_response_output_tokens: Option<u32>,
35}
36
37#[derive(Debug, Clone, Ser, De)]
39pub struct RealtimeSessionConfig {
40 #[serde(skip_serializing_if = "Option::is_none")]
42 pub input_audio_format: Option<RealtimeAudioFormat>,
43
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub output_audio_format: Option<RealtimeAudioFormat>,
47
48 #[serde(skip_serializing_if = "Option::is_none")]
50 pub voice_activity_detection: Option<VoiceActivityDetectionConfig>,
51
52 #[serde(skip_serializing_if = "Option::is_none")]
54 pub turn_detection: Option<TurnDetectionConfig>,
55
56 #[serde(skip_serializing_if = "Option::is_none")]
58 pub tools: Option<Vec<RealtimeTool>>,
59
60 #[serde(skip_serializing_if = "Option::is_none")]
62 pub tool_choice: Option<String>,
63
64 #[serde(skip_serializing_if = "Option::is_none")]
66 pub modalities: Option<Vec<RealtimeModality>>,
67}
68
69#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
71#[serde(rename_all = "lowercase")]
72pub enum RealtimeAudioFormat {
73 #[serde(rename = "pcm16")]
75 Pcm16,
76 #[serde(rename = "g711_ulaw")]
78 G711Ulaw,
79 #[serde(rename = "g711_alaw")]
81 G711Alaw,
82}
83
84#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
86#[serde(rename_all = "lowercase")]
87pub enum RealtimeVoice {
88 Alloy,
90 Echo,
92 Fable,
94 Onyx,
96 Nova,
98 Shimmer,
100}
101
102#[derive(Debug, Clone, Ser, De)]
104pub struct VoiceActivityDetectionConfig {
105 pub threshold: f32,
107
108 pub prefix_padding_ms: u32,
110
111 pub silence_duration_ms: u32,
113}
114
115#[derive(Debug, Clone, Ser, De)]
117pub struct TurnDetectionConfig {
118 #[serde(rename = "type")]
120 pub detection_type: TurnDetectionType,
121
122 #[serde(skip_serializing_if = "Option::is_none")]
124 pub threshold: Option<f32>,
125
126 #[serde(skip_serializing_if = "Option::is_none")]
128 pub prefix_padding_ms: Option<u32>,
129
130 #[serde(skip_serializing_if = "Option::is_none")]
132 pub silence_duration_ms: Option<u32>,
133}
134
135#[derive(Debug, Clone, Ser, De)]
137#[serde(rename_all = "snake_case")]
138pub enum TurnDetectionType {
139 ServerVad,
141 None,
143}
144
145#[derive(Debug, Clone, Ser, De, PartialEq, Eq)]
147#[serde(rename_all = "lowercase")]
148pub enum RealtimeModality {
149 Text,
151 Audio,
153}
154
155#[derive(Debug, Clone, Ser, De)]
157pub struct RealtimeTool {
158 #[serde(rename = "type")]
160 pub tool_type: String,
161
162 pub name: String,
164
165 pub description: String,
167
168 pub parameters: serde_json::Value,
170}
171
172#[derive(Debug, Clone, Ser, De)]
174pub struct RealtimeSessionResponse {
175 pub id: String,
177
178 pub object: String,
180
181 pub status: SessionStatus,
183
184 pub ephemeral_key: String,
186
187 pub webrtc_url: String,
189
190 pub config: RealtimeSessionConfig,
192
193 pub expires_at: DateTime<Utc>,
195
196 pub created_at: DateTime<Utc>,
198}
199
200#[derive(Debug, Clone, Ser, De)]
202#[serde(rename_all = "lowercase")]
203pub enum SessionStatus {
204 Active,
206 Connecting,
208 Disconnected,
210 Expired,
212 Error,
214}
215
216#[derive(Debug, Clone, Ser, De)]
218pub struct ResponseConfig {
219 #[serde(skip_serializing_if = "Option::is_none")]
221 pub modalities: Option<Vec<RealtimeModality>>,
222
223 #[serde(skip_serializing_if = "Option::is_none")]
225 pub instructions: Option<String>,
226
227 #[serde(skip_serializing_if = "Option::is_none")]
229 pub voice: Option<RealtimeVoice>,
230
231 #[serde(skip_serializing_if = "Option::is_none")]
233 pub output_audio_format: Option<RealtimeAudioFormat>,
234
235 #[serde(skip_serializing_if = "Option::is_none")]
237 pub tools: Option<Vec<RealtimeTool>>,
238
239 #[serde(skip_serializing_if = "Option::is_none")]
241 pub tool_choice: Option<String>,
242
243 #[serde(skip_serializing_if = "Option::is_none")]
245 pub temperature: Option<f32>,
246
247 #[serde(skip_serializing_if = "Option::is_none")]
249 pub max_response_output_tokens: Option<u32>,
250}
251
252#[derive(Debug, Clone, Ser, De)]
254pub struct InputAudioTranscriptionConfig {
255 pub enabled: bool,
257
258 pub model: String,
260}
261
/// Namespace type holding the realtime model identifiers as associated
/// constants. It has no fields and carries no state.
#[derive(Debug, Clone, Copy)]
pub struct RealtimeAudioModels;

impl RealtimeAudioModels {
    // The `'static` lifetime is spelled out on purpose: eliding lifetimes in
    // associated constants is not uniformly accepted across compiler versions.

    /// Model identifier `gpt-4o-realtime-preview`.
    pub const GPT_4O_REALTIME_PREVIEW: &'static str = "gpt-4o-realtime-preview";

    /// Model identifier `gpt-4o-mini-realtime-preview`.
    pub const GPT_4O_MINI_REALTIME_PREVIEW: &'static str = "gpt-4o-mini-realtime-preview";
}
272
273impl Default for RealtimeSessionConfig {
274 fn default() -> Self {
275 Self {
276 input_audio_format: Some(RealtimeAudioFormat::Pcm16),
277 output_audio_format: Some(RealtimeAudioFormat::Pcm16),
278 voice_activity_detection: Some(VoiceActivityDetectionConfig::default()),
279 turn_detection: Some(TurnDetectionConfig {
280 detection_type: TurnDetectionType::ServerVad,
281 threshold: Some(0.5),
282 prefix_padding_ms: Some(300),
283 silence_duration_ms: Some(200),
284 }),
285 tools: None,
286 tool_choice: Some("auto".to_string()),
287 modalities: Some(vec![RealtimeModality::Text, RealtimeModality::Audio]),
288 }
289 }
290}
291
292impl Default for VoiceActivityDetectionConfig {
293 fn default() -> Self {
294 Self {
295 threshold: 0.5,
296 prefix_padding_ms: 300,
297 silence_duration_ms: 200,
298 }
299 }
300}
301
#[cfg(test)]
mod tests {
    use super::*;

    /// A fully populated request keeps the values it was constructed with.
    #[test]
    fn test_session_request_creation() {
        let request = RealtimeSessionRequest {
            model: RealtimeAudioModels::GPT_4O_REALTIME_PREVIEW.to_string(),
            config: Some(RealtimeSessionConfig::default()),
            instructions: Some("You are a helpful assistant.".to_string()),
            voice: Some(RealtimeVoice::Alloy),
            temperature: Some(0.8),
            max_response_output_tokens: Some(4096),
        };

        assert_eq!(request.model, "gpt-4o-realtime-preview");
        assert!(request.config.is_some());
        assert_eq!(request.voice, Some(RealtimeVoice::Alloy));
    }

    /// The default VAD parameters are 0.5 / 300 ms / 200 ms.
    #[test]
    fn test_voice_activity_detection_config() {
        let config = VoiceActivityDetectionConfig::default();
        // 0.5 is exactly representable as an f32, so exact comparison is safe.
        assert_eq!(config.threshold, 0.5);
        assert_eq!(config.prefix_padding_ms, 300);
        assert_eq!(config.silence_duration_ms, 200);
    }

    /// Audio formats serialize to their explicit wire names, underscores included.
    #[test]
    fn test_audio_format_serialization() {
        assert_eq!(
            serde_json::to_string(&RealtimeAudioFormat::Pcm16).unwrap(),
            "\"pcm16\""
        );
        assert_eq!(
            serde_json::to_string(&RealtimeAudioFormat::G711Ulaw).unwrap(),
            "\"g711_ulaw\""
        );
        assert_eq!(
            serde_json::to_string(&RealtimeAudioFormat::G711Alaw).unwrap(),
            "\"g711_alaw\""
        );
    }

    /// Voices serialize as their lowercase variant names.
    #[test]
    fn test_voice_serialization() {
        assert_eq!(
            serde_json::to_string(&RealtimeVoice::Alloy).unwrap(),
            "\"alloy\""
        );
        assert_eq!(
            serde_json::to_string(&RealtimeVoice::Nova).unwrap(),
            "\"nova\""
        );
    }

    /// The default session configuration populates formats, VAD, turn
    /// detection and the tool choice.
    #[test]
    fn test_session_config_default() {
        let config = RealtimeSessionConfig::default();
        assert!(config.input_audio_format.is_some());
        assert!(config.output_audio_format.is_some());
        assert!(config.voice_activity_detection.is_some());
        assert!(config.turn_detection.is_some());
        assert_eq!(config.tool_choice, Some("auto".to_string()));
    }
}