
openai_tools/realtime/audio.rs

//! Audio types for the Realtime API.

use serde::{Deserialize, Serialize};

/// Audio formats supported by the Realtime API.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum AudioFormat {
    /// PCM 16-bit linear encoding (24 kHz, mono).
    #[default]
    Pcm16,
    /// G.711 mu-law encoding.
    G711Ulaw,
    /// G.711 A-law encoding.
    G711Alaw,
}
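
// Illustrative sketch (not part of the original file): with
// `#[serde(rename_all = "snake_case")]`, these variants are expected to
// serialize as "pcm16", "g711_ulaw", and "g711_alaw". Assumes `serde_json`
// is available as a dev-dependency.
#[cfg(test)]
mod audio_format_example {
    use super::*;

    #[test]
    fn audio_format_serializes_to_snake_case() {
        assert_eq!(serde_json::to_string(&AudioFormat::Pcm16).unwrap(), "\"pcm16\"");
        assert_eq!(serde_json::to_string(&AudioFormat::G711Ulaw).unwrap(), "\"g711_ulaw\"");
        assert_eq!(serde_json::to_string(&AudioFormat::G711Alaw).unwrap(), "\"g711_alaw\"");
    }
}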

/// Voice options for text-to-speech output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Sage,
    Shimmer,
    Verse,
}

/// Transcription model options for input audio.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum TranscriptionModel {
    #[serde(rename = "whisper-1")]
    #[default]
    Whisper1,
    #[serde(rename = "gpt-4o-transcribe")]
    Gpt4oTranscribe,
    #[serde(rename = "gpt-4o-mini-transcribe")]
    Gpt4oMiniTranscribe,
    #[serde(rename = "gpt-4o-transcribe-diarize")]
    Gpt4oTranscribeDiarize,
}
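
// Illustrative sketch (not part of the original file): the explicit
// `#[serde(rename = "...")]` attributes map each variant to its wire-format
// model name. Assumes `serde_json` is available as a dev-dependency.
#[cfg(test)]
mod transcription_model_example {
    use super::*;

    #[test]
    fn transcription_model_uses_renamed_identifiers() {
        assert_eq!(serde_json::to_string(&TranscriptionModel::Whisper1).unwrap(), "\"whisper-1\"");
        assert_eq!(
            serde_json::to_string(&TranscriptionModel::Gpt4oMiniTranscribe).unwrap(),
            "\"gpt-4o-mini-transcribe\""
        );
    }
}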

/// Input audio transcription configuration.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct InputAudioTranscription {
    /// The transcription model to use.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<TranscriptionModel>,

    /// Language hint for transcription (ISO 639-1 code).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Optional prompt to guide transcription.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
}

impl InputAudioTranscription {
    /// Create a new transcription configuration with the specified model.
    pub fn new(model: TranscriptionModel) -> Self {
        Self { model: Some(model), language: None, prompt: None }
    }

    /// Set the language hint.
    pub fn with_language(mut self, language: impl Into<String>) -> Self {
        self.language = Some(language.into());
        self
    }

    /// Set the transcription prompt.
    pub fn with_prompt(mut self, prompt: impl Into<String>) -> Self {
        self.prompt = Some(prompt.into());
        self
    }
}
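
// Illustrative sketch (not part of the original file): the builder methods
// above chain onto `new`, and unset optional fields stay `None` so they are
// skipped during serialization.
#[cfg(test)]
mod input_audio_transcription_example {
    use super::*;

    #[test]
    fn builder_chains_optional_fields() {
        let config = InputAudioTranscription::new(TranscriptionModel::Whisper1)
            .with_language("en")
            .with_prompt("Expect audio-engineering vocabulary.");
        assert_eq!(config.model, Some(TranscriptionModel::Whisper1));
        assert_eq!(config.language.as_deref(), Some("en"));
        assert_eq!(config.prompt.as_deref(), Some("Expect audio-engineering vocabulary."));
    }
}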

/// Input audio noise reduction configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InputAudioNoiseReduction {
    /// Type of noise reduction to apply.
    #[serde(rename = "type")]
    pub noise_type: NoiseReductionType,
}

/// Noise reduction type options.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum NoiseReductionType {
    /// Optimized for near-field audio (close microphone).
    NearField,
    /// Optimized for far-field audio (distant microphone).
    FarField,
}
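
// Illustrative sketch (not part of the original file): the `noise_type`
// field is renamed to "type" on the wire, and the variants serialize in
// snake_case. Assumes `serde_json` is available as a dev-dependency.
#[cfg(test)]
mod noise_reduction_example {
    use super::*;

    #[test]
    fn noise_reduction_serializes_type_field() {
        let config = InputAudioNoiseReduction { noise_type: NoiseReductionType::NearField };
        assert_eq!(
            serde_json::to_string(&config).unwrap(),
            r#"{"type":"near_field"}"#
        );
    }
}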