1use serde::{Deserialize, Serialize};
4
5use super::tool::Tool;
6
7#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
9#[serde(rename_all = "lowercase")]
10pub enum Voice {
11 #[default]
13 Ara,
14 Rex,
16 Sal,
18 Eve,
20 Leo,
22}
23
24#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
26#[serde(rename_all = "lowercase")]
27pub enum AudioFormat {
28 #[default]
30 Pcm16,
31 #[serde(rename = "g711_ulaw")]
33 G711Ulaw,
34 #[serde(rename = "g711_alaw")]
36 G711Alaw,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct SessionConfig {
42 pub model: String,
44 #[serde(default)]
46 pub voice: Voice,
47 #[serde(default)]
49 pub input_audio_format: AudioFormat,
50 #[serde(default)]
52 pub output_audio_format: AudioFormat,
53 #[serde(skip_serializing_if = "Option::is_none")]
55 pub instructions: Option<String>,
56 #[serde(skip_serializing_if = "Option::is_none")]
58 pub tools: Option<Vec<Tool>>,
59 #[serde(skip_serializing_if = "Option::is_none")]
61 pub input_audio_transcription: Option<AudioTranscriptionConfig>,
62 #[serde(skip_serializing_if = "Option::is_none")]
64 pub turn_detection: Option<TurnDetectionConfig>,
65}
66
67impl SessionConfig {
68 pub fn new(model: impl Into<String>) -> Self {
70 Self {
71 model: model.into(),
72 voice: Voice::default(),
73 input_audio_format: AudioFormat::default(),
74 output_audio_format: AudioFormat::default(),
75 instructions: None,
76 tools: None,
77 input_audio_transcription: None,
78 turn_detection: None,
79 }
80 }
81
82 pub fn voice(mut self, voice: Voice) -> Self {
84 self.voice = voice;
85 self
86 }
87
88 pub fn input_format(mut self, format: AudioFormat) -> Self {
90 self.input_audio_format = format;
91 self
92 }
93
94 pub fn output_format(mut self, format: AudioFormat) -> Self {
96 self.output_audio_format = format;
97 self
98 }
99
100 pub fn instructions(mut self, instructions: impl Into<String>) -> Self {
102 self.instructions = Some(instructions.into());
103 self
104 }
105
106 pub fn tools(mut self, tools: Vec<Tool>) -> Self {
108 self.tools = Some(tools);
109 self
110 }
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct AudioTranscriptionConfig {
116 #[serde(default)]
118 pub enabled: bool,
119}
120
121#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct TurnDetectionConfig {
124 #[serde(rename = "type")]
126 pub detection_type: String,
127 #[serde(skip_serializing_if = "Option::is_none")]
129 pub silence_duration_ms: Option<u32>,
130}
131
132#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
134#[serde(rename_all = "snake_case")]
135pub enum ConversationItemType {
136 Message,
138 FunctionCall,
140 FunctionCallOutput,
142}
143
144#[derive(Debug, Clone, Serialize, Deserialize)]
146pub struct ConversationItem {
147 #[serde(skip_serializing_if = "Option::is_none")]
149 pub id: Option<String>,
150 #[serde(rename = "type")]
152 pub item_type: ConversationItemType,
153 #[serde(skip_serializing_if = "Option::is_none")]
155 pub role: Option<String>,
156 #[serde(skip_serializing_if = "Option::is_none")]
158 pub content: Option<Vec<ConversationContent>>,
159}
160
161#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
163#[serde(rename_all = "snake_case")]
164pub enum ConversationContentType {
165 Text,
167 Audio,
169 InputText,
171 InputAudio,
173}
174
175#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct ConversationContent {
178 #[serde(rename = "type")]
180 pub content_type: ConversationContentType,
181 #[serde(skip_serializing_if = "Option::is_none")]
183 pub text: Option<String>,
184 #[serde(skip_serializing_if = "Option::is_none")]
186 pub audio: Option<String>,
187 #[serde(skip_serializing_if = "Option::is_none")]
189 pub transcript: Option<String>,
190}
191
192#[derive(Debug, Clone, Serialize)]
194#[serde(tag = "type", rename_all = "snake_case")]
195pub enum RealtimeClientMessage {
196 SessionUpdate {
198 session: SessionConfig,
200 },
201 InputAudioBufferAppend {
203 audio: String,
205 },
206 InputAudioBufferCommit {},
208 InputAudioBufferClear {},
210 ConversationItemCreate {
212 item: ConversationItem,
214 },
215 ResponseCreate {
217 #[serde(skip_serializing_if = "Option::is_none")]
219 response: Option<ResponseConfig>,
220 },
221 ResponseCancel {},
223}
224
225#[derive(Debug, Clone, Serialize, Deserialize)]
227pub struct ResponseConfig {
228 #[serde(skip_serializing_if = "Option::is_none")]
230 pub modalities: Option<Vec<String>>,
231 #[serde(skip_serializing_if = "Option::is_none")]
233 pub instructions: Option<String>,
234}
235
236#[derive(Debug, Clone, Deserialize)]
238#[serde(tag = "type", rename_all = "snake_case")]
239pub enum RealtimeServerMessage {
240 SessionCreated {
242 session: SessionConfig,
244 },
245 SessionUpdated {
247 session: SessionConfig,
249 },
250 ConversationItemCreated {
252 item: ConversationItem,
254 },
255 InputAudioBufferCommitted {
257 item_id: String,
259 },
260 InputAudioBufferCleared {},
262 InputAudioBufferSpeechStarted {
264 audio_start_ms: u32,
266 },
267 InputAudioBufferSpeechStopped {
269 audio_end_ms: u32,
271 },
272 ResponseCreated {
274 response: RealtimeResponse,
276 },
277 ResponseAudioDelta {
279 response_id: String,
281 item_id: String,
283 delta: String,
285 },
286 ResponseAudioTranscriptDelta {
288 response_id: String,
290 item_id: String,
292 delta: String,
294 },
295 ResponseTextDelta {
297 response_id: String,
299 item_id: String,
301 delta: String,
303 },
304 ResponseDone {
306 response: RealtimeResponse,
308 },
309 Error {
311 error: RealtimeError,
313 },
314 RateLimitsUpdated {
316 rate_limits: Vec<RateLimit>,
318 },
319}
320
321#[derive(Debug, Clone, Deserialize)]
323pub struct RealtimeResponse {
324 pub id: String,
326 #[serde(default)]
328 pub status: String,
329 #[serde(default)]
331 pub output: Vec<ConversationItem>,
332}
333
334#[derive(Debug, Clone, Deserialize)]
336pub struct RealtimeError {
337 #[serde(rename = "type")]
339 pub error_type: String,
340 #[serde(default)]
342 pub code: Option<String>,
343 pub message: String,
345}
346
347impl std::fmt::Display for RealtimeError {
348 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
349 write!(f, "{}: {}", self.error_type, self.message)
350 }
351}
352
353#[derive(Debug, Clone, Deserialize)]
355pub struct RateLimit {
356 pub name: String,
358 pub limit: u32,
360 pub remaining: u32,
362 #[serde(default)]
364 pub reset_seconds: Option<f64>,
365}
366
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // ---- ConversationItemType ----

    /// Every item type maps to its snake_case wire name and parses back.
    #[test]
    fn conversation_item_type_roundtrip_all() {
        let cases = [
            (ConversationItemType::Message, "message"),
            (ConversationItemType::FunctionCall, "function_call"),
            (ConversationItemType::FunctionCallOutput, "function_call_output"),
        ];
        for (variant, wire) in cases {
            let encoded = serde_json::to_value(variant).unwrap();
            assert_eq!(encoded, json!(wire));

            let decoded: ConversationItemType = serde_json::from_value(encoded).unwrap();
            assert_eq!(decoded, variant);
        }
    }

    /// An unrecognised item type string is a deserialization error.
    #[test]
    fn conversation_item_type_rejects_unknown() {
        let parsed = serde_json::from_str::<ConversationItemType>(r#""unknown_type""#);
        assert!(parsed.is_err());
    }

    // ---- ConversationContentType ----

    /// Every content type maps to its snake_case wire name and parses back.
    #[test]
    fn conversation_content_type_roundtrip_all() {
        let cases = [
            (ConversationContentType::Text, "text"),
            (ConversationContentType::Audio, "audio"),
            (ConversationContentType::InputText, "input_text"),
            (ConversationContentType::InputAudio, "input_audio"),
        ];
        for (variant, wire) in cases {
            let encoded = serde_json::to_value(variant).unwrap();
            assert_eq!(encoded, json!(wire));

            let decoded: ConversationContentType = serde_json::from_value(encoded).unwrap();
            assert_eq!(decoded, variant);
        }
    }

    /// An unrecognised content type string is a deserialization error.
    #[test]
    fn conversation_content_type_rejects_unknown() {
        let parsed = serde_json::from_str::<ConversationContentType>(r#""video""#);
        assert!(parsed.is_err());
    }

    // ---- ConversationItem / ConversationContent ----

    /// A message item with one text part survives a JSON round trip.
    #[test]
    fn conversation_item_message_roundtrip() {
        let greeting = ConversationContent {
            content_type: ConversationContentType::InputText,
            text: Some(String::from("Hello")),
            audio: None,
            transcript: None,
        };
        let item = ConversationItem {
            id: Some(String::from("item_1")),
            item_type: ConversationItemType::Message,
            role: Some(String::from("user")),
            content: Some(vec![greeting]),
        };

        let encoded = serde_json::to_value(&item).unwrap();
        assert_eq!(encoded["type"], "message");
        assert_eq!(encoded["role"], "user");
        assert_eq!(encoded["content"][0]["type"], "input_text");
        assert_eq!(encoded["content"][0]["text"], "Hello");

        let decoded: ConversationItem = serde_json::from_value(encoded).unwrap();
        assert_eq!(decoded.item_type, ConversationItemType::Message);
        assert_eq!(decoded.id.as_deref(), Some("item_1"));
    }

    /// Unset optional fields are absent from the serialized object.
    #[test]
    fn conversation_item_function_call_roundtrip() {
        let item = ConversationItem {
            id: Some(String::from("fc_1")),
            item_type: ConversationItemType::FunctionCall,
            role: None,
            content: None,
        };

        let encoded = serde_json::to_value(&item).unwrap();
        assert_eq!(encoded["type"], "function_call");
        assert!(encoded.get("role").is_none());
        assert!(encoded.get("content").is_none());

        let decoded: ConversationItem = serde_json::from_value(encoded).unwrap();
        assert_eq!(decoded.item_type, ConversationItemType::FunctionCall);
    }

    /// An audio part keeps its audio and transcript across a round trip.
    #[test]
    fn conversation_content_audio_roundtrip() {
        let part = ConversationContent {
            content_type: ConversationContentType::Audio,
            text: None,
            audio: Some(String::from("base64data")),
            transcript: Some(String::from("transcribed text")),
        };

        let encoded = serde_json::to_value(&part).unwrap();
        assert_eq!(encoded["type"], "audio");
        assert_eq!(encoded["audio"], "base64data");
        assert_eq!(encoded["transcript"], "transcribed text");

        let decoded: ConversationContent = serde_json::from_value(encoded).unwrap();
        assert_eq!(decoded.content_type, ConversationContentType::Audio);
        assert_eq!(decoded.audio.as_deref(), Some("base64data"));
    }

    // ---- Voice ----

    /// Every voice maps to its lowercase wire name and parses back.
    #[test]
    fn voice_roundtrip_all() {
        let cases = [
            (Voice::Ara, "ara"),
            (Voice::Rex, "rex"),
            (Voice::Sal, "sal"),
            (Voice::Eve, "eve"),
            (Voice::Leo, "leo"),
        ];
        for (variant, wire) in cases {
            let encoded = serde_json::to_value(variant).unwrap();
            assert_eq!(encoded, json!(wire));

            let decoded: Voice = serde_json::from_value(encoded).unwrap();
            assert_eq!(decoded, variant);
        }
    }

    #[test]
    fn voice_default_is_ara() {
        assert_eq!(Voice::default(), Voice::Ara);
    }

    // ---- AudioFormat ----

    /// Every audio format maps to its wire name and parses back.
    #[test]
    fn audio_format_roundtrip_all() {
        let cases = [
            (AudioFormat::Pcm16, "pcm16"),
            (AudioFormat::G711Ulaw, "g711_ulaw"),
            (AudioFormat::G711Alaw, "g711_alaw"),
        ];
        for (variant, wire) in cases {
            let encoded = serde_json::to_value(variant).unwrap();
            assert_eq!(encoded, json!(wire));

            let decoded: AudioFormat = serde_json::from_value(encoded).unwrap();
            assert_eq!(decoded, variant);
        }
    }

    #[test]
    fn audio_format_default_is_pcm16() {
        assert_eq!(AudioFormat::default(), AudioFormat::Pcm16);
    }

    // ---- SessionConfig ----

    /// Chained setters land in the matching fields.
    #[test]
    fn session_config_builder_pattern() {
        let built = SessionConfig::new("grok-4")
            .voice(Voice::Eve)
            .input_format(AudioFormat::G711Ulaw)
            .output_format(AudioFormat::G711Alaw)
            .instructions("Be helpful");

        assert_eq!(built.model, "grok-4");
        assert_eq!(built.voice, Voice::Eve);
        assert_eq!(built.input_audio_format, AudioFormat::G711Ulaw);
        assert_eq!(built.output_audio_format, AudioFormat::G711Alaw);
        assert_eq!(built.instructions.as_deref(), Some("Be helpful"));
    }

    /// A built configuration serializes and parses back.
    #[test]
    fn session_config_roundtrip() {
        let original = SessionConfig::new("grok-4")
            .voice(Voice::Rex)
            .instructions("Test instructions");

        let encoded = serde_json::to_value(&original).unwrap();
        assert_eq!(encoded["model"], "grok-4");
        assert_eq!(encoded["voice"], "rex");
        assert_eq!(encoded["instructions"], "Test instructions");

        let decoded: SessionConfig = serde_json::from_value(encoded).unwrap();
        assert_eq!(decoded.model, "grok-4");
        assert_eq!(decoded.voice, Voice::Rex);
    }

    // ---- RealtimeClientMessage ----

    #[test]
    fn realtime_client_message_session_update_serialize() {
        let session = SessionConfig::new("grok-4");
        let outgoing = RealtimeClientMessage::SessionUpdate { session };

        let encoded = serde_json::to_value(&outgoing).unwrap();
        assert_eq!(encoded["type"], "session_update");
        assert_eq!(encoded["session"]["model"], "grok-4");
    }

    #[test]
    fn realtime_client_message_audio_append_serialize() {
        let outgoing = RealtimeClientMessage::InputAudioBufferAppend {
            audio: String::from("base64data"),
        };

        let encoded = serde_json::to_value(&outgoing).unwrap();
        assert_eq!(encoded["type"], "input_audio_buffer_append");
        assert_eq!(encoded["audio"], "base64data");
    }

    #[test]
    fn realtime_client_message_response_create_serialize() {
        let outgoing = RealtimeClientMessage::ResponseCreate { response: None };

        let encoded = serde_json::to_value(&outgoing).unwrap();
        assert_eq!(encoded["type"], "response_create");
    }

    // ---- RealtimeServerMessage ----

    #[test]
    fn realtime_server_message_session_created() {
        let incoming = json!({
            "type": "session_created",
            "session": {
                "model": "grok-4",
                "voice": "ara",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16"
            }
        });

        let parsed: RealtimeServerMessage = serde_json::from_value(incoming).unwrap();
        assert!(matches!(parsed, RealtimeServerMessage::SessionCreated { .. }));
    }

    #[test]
    fn realtime_server_message_error() {
        let incoming = json!({
            "type": "error",
            "error": {
                "type": "invalid_request",
                "message": "Bad request"
            }
        });

        let parsed: RealtimeServerMessage = serde_json::from_value(incoming).unwrap();
        match parsed {
            RealtimeServerMessage::Error { error } => {
                assert_eq!(error.error_type, "invalid_request");
                assert_eq!(error.message, "Bad request");
                assert_eq!(error.to_string(), "invalid_request: Bad request");
            }
            _ => panic!("Expected Error variant"),
        }
    }

    #[test]
    fn realtime_server_message_response_audio_delta() {
        let incoming = json!({
            "type": "response_audio_delta",
            "response_id": "resp_1",
            "item_id": "item_1",
            "delta": "YXVkaW8="
        });

        let parsed: RealtimeServerMessage = serde_json::from_value(incoming).unwrap();
        match parsed {
            RealtimeServerMessage::ResponseAudioDelta { delta, .. } => {
                assert_eq!(delta, "YXVkaW8=");
            }
            _ => panic!("Expected ResponseAudioDelta variant"),
        }
    }
}