async_openai/types/realtime/translation.rs
1use serde::{Deserialize, Serialize};
2
3use crate::types::realtime::{NoiseReductionType, RealtimeServerEventError};
4
5/// Optional source-language transcription configuration for a translation session.
6/// When configured, the server emits `session.input_transcript.delta` events.
7/// Translation itself still runs from the input audio stream.
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct RealtimeTranslationInputTranscription {
10 /// The transcription model used for source transcript deltas.
11 pub model: String,
12}
13
14/// Optional input noise reduction for a translation session.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct RealtimeTranslationNoiseReduction {
17 pub r#type: NoiseReductionType,
18}
19
20/// Configuration for translation input audio.
21#[derive(Debug, Default, Clone, Serialize, Deserialize)]
22pub struct RealtimeTranslationInputAudio {
23 // DONT add because it can be 'null': #[serde(skip_serializing_if = "Option::is_none")]
24 /// Optional source-language transcription. Set to `null` to disable.
25 pub transcription: Option<RealtimeTranslationInputTranscription>,
26 // DONT add because it can be 'null': #[serde(skip_serializing_if = "Option::is_none")]
27 /// Optional input noise reduction. Set to `null` to disable.
28 pub noise_reduction: Option<RealtimeTranslationNoiseReduction>,
29}
30
31/// Configuration for translation output audio.
32#[derive(Debug, Default, Clone, Serialize, Deserialize)]
33pub struct RealtimeTranslationOutputAudio {
34 /// Target language for translated output audio and transcript deltas.
35 #[serde(skip_serializing_if = "Option::is_none")]
36 pub language: Option<String>,
37}
38
39/// Configuration for translation input and output audio.
40#[derive(Debug, Default, Clone, Serialize, Deserialize)]
41pub struct RealtimeTranslationAudio {
42 #[serde(skip_serializing_if = "Option::is_none")]
43 pub input: Option<RealtimeTranslationInputAudio>,
44 #[serde(skip_serializing_if = "Option::is_none")]
45 pub output: Option<RealtimeTranslationOutputAudio>,
46}
47
48/// A Realtime translation session. Translation sessions continuously translate input
49/// audio into the configured output language.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct RealtimeTranslationSession {
52 /// Unique identifier for the session that looks like `sess_1234567890abcdef`.
53 pub id: String,
54 /// The session type. Always `translation` for Realtime translation sessions.
55 pub r#type: String,
56 /// Expiration timestamp for the session, in seconds since epoch.
57 pub expires_at: u64,
58 /// The Realtime translation model used for this session.
59 pub model: String,
60 /// Configuration for translation input and output audio.
61 pub audio: RealtimeTranslationAudio,
62}
63
64/// Realtime translation session configuration. Translation sessions stream
65/// source audio in and translated audio plus transcript deltas out continuously.
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct RealtimeTranslationSessionCreateRequest {
68 /// The Realtime translation model used for this session.
69 pub model: String,
70 /// Configuration for translation input and output audio.
71 #[serde(skip_serializing_if = "Option::is_none")]
72 pub audio: Option<RealtimeTranslationAudio>,
73}
74
75/// Realtime translation session fields that can be updated with `session.update`.
76#[derive(Debug, Default, Clone, Serialize, Deserialize)]
77pub struct RealtimeTranslationSessionUpdateRequest {
78 /// Configuration for translation input and output audio.
79 #[serde(skip_serializing_if = "Option::is_none")]
80 pub audio: Option<RealtimeTranslationAudio>,
81}
82
83/// The anchor point for the translation client secret expiration.
84#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
85#[serde(rename_all = "snake_case")]
86pub enum RealtimeTranslationClientSecretExpiresAnchor {
87 #[default]
88 CreatedAt,
89}
90
91/// Configuration for the client secret expiration. Expiration refers to
92/// the time after which a client secret will no longer be valid for creating sessions.
93/// The session itself may continue after that time once started. A secret can be used to
94/// create multiple sessions until it expires.
95#[derive(Debug, Default, Clone, Serialize, Deserialize)]
96pub struct RealtimeTranslationClientSecretExpiresAfter {
97 /// The anchor point for the client secret expiration, meaning that
98 /// `seconds` will be added to the `created_at` time of the client
99 /// secret to produce an expiration timestamp. Only `created_at` is currently supported.
100 #[serde(skip_serializing_if = "Option::is_none")]
101 pub anchor: Option<RealtimeTranslationClientSecretExpiresAnchor>,
102 /// The number of seconds from the anchor point to the expiration. Select a value between
103 /// `10` and `7200` (2 hours). Defaults to 600 seconds (10 minutes) if not specified.
104 #[serde(skip_serializing_if = "Option::is_none")]
105 pub seconds: Option<u32>,
106}
107
108/// Create a translation session and client secret for the Realtime API.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct RealtimeTranslationClientSecretCreateRequest {
111 #[serde(skip_serializing_if = "Option::is_none")]
112 pub expires_after: Option<RealtimeTranslationClientSecretExpiresAfter>,
113 pub session: RealtimeTranslationSessionCreateRequest,
114}
115
116/// Response from creating a translation session and client secret for the Realtime API.
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct RealtimeTranslationClientSecretCreateResponse {
119 /// The generated client secret value.
120 pub value: String,
121 /// Expiration timestamp for the client secret, in seconds since epoch.
122 pub expires_at: u64,
123 /// The translation session.
124 pub session: RealtimeTranslationSession,
125}
126
127/// Send this event to update the translation session configuration.
128/// Translation sessions support updates to `audio.output.language`,
129/// `audio.input.transcription`, and `audio.input.noise_reduction`.
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct RealtimeTranslationClientEventSessionUpdate {
132 /// Optional client-generated ID used to identify this event.
133 #[serde(skip_serializing_if = "Option::is_none")]
134 pub event_id: Option<String>,
135 /// Translation session fields to update. The session `type` and `model` are set
136 /// at creation and cannot be changed with `session.update`.
137 pub session: RealtimeTranslationSessionUpdateRequest,
138}
139
140/// Send this event to append audio bytes to the translation session input audio buffer.
141///
142/// WebSocket translation sessions accept base64-encoded 24 kHz PCM16 mono
143/// little-endian raw audio bytes. Unsupported websocket audio formats
144/// return a validation error because lower-quality audio materially degrades translation
145/// quality.
146///
147/// Translation consumes 200 ms engine frames. For best realtime behavior,
148/// append audio in 200 ms chunks. If a chunk is shorter, the server buffers it
149/// until it has enough audio for one frame. If a chunk is longer, the server splits
150/// it into 200 ms frames and enqueues them back-to-back.
151///
152/// Keep appending silence while the session is active. If a client stops
153/// sending audio and later resumes, model time treats the resumed audio as
154/// contiguous with the previous audio rather than as a real-world pause.
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub struct RealtimeTranslationClientEventInputAudioBufferAppend {
157 /// Optional client-generated ID used to identify this event.
158 #[serde(skip_serializing_if = "Option::is_none")]
159 pub event_id: Option<String>,
160 /// Base64-encoded 24 kHz PCM16 mono audio bytes.
161 pub audio: String,
162}
163
164/// Gracefully close the realtime translation session. The server flushes pending
165/// input audio and emits any remaining translated output before closing the session.
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub struct RealtimeTranslationClientEventSessionClose {
168 /// Optional client-generated ID used to identify this event.
169 #[serde(skip_serializing_if = "Option::is_none")]
170 pub event_id: Option<String>,
171}
172
173/// A Realtime translation client event.
174#[derive(Debug, Clone, Serialize, Deserialize)]
175#[serde(tag = "type")]
176pub enum RealtimeTranslationClientEvent {
177 #[serde(rename = "session.update")]
178 SessionUpdate(RealtimeTranslationClientEventSessionUpdate),
179 #[serde(rename = "session.input_audio_buffer.append")]
180 InputAudioBufferAppend(RealtimeTranslationClientEventInputAudioBufferAppend),
181 #[serde(rename = "session.close")]
182 SessionClose(RealtimeTranslationClientEventSessionClose),
183}
184
185/// Audio encoding for the translated audio delta.
186#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
187#[serde(rename_all = "lowercase")]
188pub enum RealtimeTranslationAudioFormat {
189 PCM16,
190}
191
192/// Returned when a translation session is created. Emitted automatically when a
193/// new connection is established as the first server event. This event contains
194/// the default translation session configuration.
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct RealtimeTranslationServerEventSessionCreated {
197 /// The unique ID of the server event.
198 pub event_id: String,
199 /// The translation session configuration.
200 pub session: RealtimeTranslationSession,
201}
202
203/// Returned when a translation session is updated with a `session.update` event,
204/// unless there is an error.
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct RealtimeTranslationServerEventSessionUpdated {
207 /// The unique ID of the server event.
208 pub event_id: String,
209 /// The translation session configuration.
210 pub session: RealtimeTranslationSession,
211}
212
213/// Returned when a realtime translation session is closed.
214#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct RealtimeTranslationServerEventSessionClosed {
216 /// The unique ID of the server event.
217 pub event_id: String,
218}
219
220/// Returned when optional source-language transcript text is available. This event
221/// is emitted only when `audio.input.transcription` is configured.
222///
223/// Transcript deltas are append-only text fragments. Clients should not insert
224/// unconditional spaces between deltas.
225#[derive(Debug, Clone, Serialize, Deserialize)]
226pub struct RealtimeTranslationServerEventSessionInputTranscriptDelta {
227 /// The unique ID of the server event.
228 pub event_id: String,
229 /// Append-only source-language transcript text.
230 pub delta: String,
231 /// Timing metadata for stream alignment, derived from the translation frame
232 /// when available. It advances in 200 ms increments, but multiple transcript
233 /// deltas may share the same `elapsed_ms`. Treat it as alignment metadata,
234 /// not a unique transcript-delta identifier.
235 #[serde(default, skip_serializing_if = "Option::is_none")]
236 pub elapsed_ms: Option<u64>,
237}
238
239/// Returned when translated transcript text is available.
240///
241/// Transcript deltas are append-only text fragments. Clients should not insert
242/// unconditional spaces between deltas.
243#[derive(Debug, Clone, Serialize, Deserialize)]
244pub struct RealtimeTranslationServerEventSessionOutputTranscriptDelta {
245 /// The unique ID of the server event.
246 pub event_id: String,
247 /// Append-only transcript text for the translated output audio.
248 pub delta: String,
249 /// Timing metadata for stream alignment, derived from the translation frame
250 /// when available. It advances in 200 ms increments, but multiple transcript
251 /// deltas may share the same `elapsed_ms`. Treat it as alignment metadata,
252 /// not a unique transcript-delta identifier.
253 #[serde(default, skip_serializing_if = "Option::is_none")]
254 pub elapsed_ms: Option<u64>,
255}
256
257/// Returned when translated output audio is available. Output audio deltas are
258/// 200 ms frames of PCM16 audio.
259#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct RealtimeTranslationServerEventSessionOutputAudioDelta {
261 /// The unique ID of the server event.
262 pub event_id: String,
263 /// Base64-encoded translated audio data.
264 pub delta: String,
265 /// Sample rate of the audio delta. Defaults to 24000.
266 #[serde(default, skip_serializing_if = "Option::is_none")]
267 pub sample_rate: Option<u32>,
268 /// Number of audio channels. Defaults to 1.
269 #[serde(default, skip_serializing_if = "Option::is_none")]
270 pub channels: Option<u32>,
271 /// Audio encoding for `delta`.
272 #[serde(default, skip_serializing_if = "Option::is_none")]
273 pub format: Option<RealtimeTranslationAudioFormat>,
274 /// Timing metadata for stream alignment, derived from the translation frame
275 /// when available. Treat `elapsed_ms` as alignment metadata, not a unique
276 /// event identifier.
277 #[serde(default, skip_serializing_if = "Option::is_none")]
278 pub elapsed_ms: Option<u64>,
279}
280
281/// A Realtime translation server event.
282#[derive(Debug, Clone, Serialize, Deserialize)]
283#[serde(tag = "type")]
284pub enum RealtimeTranslationServerEvent {
285 #[serde(rename = "error")]
286 Error(RealtimeServerEventError),
287 #[serde(rename = "session.created")]
288 SessionCreated(RealtimeTranslationServerEventSessionCreated),
289 #[serde(rename = "session.updated")]
290 SessionUpdated(RealtimeTranslationServerEventSessionUpdated),
291 #[serde(rename = "session.closed")]
292 SessionClosed(RealtimeTranslationServerEventSessionClosed),
293 #[serde(rename = "session.input_transcript.delta")]
294 SessionInputTranscriptDelta(RealtimeTranslationServerEventSessionInputTranscriptDelta),
295 #[serde(rename = "session.output_transcript.delta")]
296 SessionOutputTranscriptDelta(RealtimeTranslationServerEventSessionOutputTranscriptDelta),
297 #[serde(rename = "session.output_audio.delta")]
298 SessionOutputAudioDelta(RealtimeTranslationServerEventSessionOutputAudioDelta),
299}