Skip to main content

async_openai/types/realtime/
translation.rs

1use serde::{Deserialize, Serialize};
2
3use crate::types::realtime::{NoiseReductionType, RealtimeServerEventError};
4
5/// Optional source-language transcription configuration for a translation session.
6/// When configured, the server emits `session.input_transcript.delta` events.
7/// Translation itself still runs from the input audio stream.
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct RealtimeTranslationInputTranscription {
10    /// The transcription model used for source transcript deltas.
11    pub model: String,
12}
13
14/// Optional input noise reduction for a translation session.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct RealtimeTranslationNoiseReduction {
17    pub r#type: NoiseReductionType,
18}
19
20/// Configuration for translation input audio.
21#[derive(Debug, Default, Clone, Serialize, Deserialize)]
22pub struct RealtimeTranslationInputAudio {
23    // DONT add because it can be 'null': #[serde(skip_serializing_if = "Option::is_none")]
24    /// Optional source-language transcription. Set to `null` to disable.
25    pub transcription: Option<RealtimeTranslationInputTranscription>,
26    // DONT add because it can be 'null': #[serde(skip_serializing_if = "Option::is_none")]
27    /// Optional input noise reduction. Set to `null` to disable.
28    pub noise_reduction: Option<RealtimeTranslationNoiseReduction>,
29}
30
31/// Configuration for translation output audio.
32#[derive(Debug, Default, Clone, Serialize, Deserialize)]
33pub struct RealtimeTranslationOutputAudio {
34    /// Target language for translated output audio and transcript deltas.
35    #[serde(skip_serializing_if = "Option::is_none")]
36    pub language: Option<String>,
37}
38
39/// Configuration for translation input and output audio.
40#[derive(Debug, Default, Clone, Serialize, Deserialize)]
41pub struct RealtimeTranslationAudio {
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub input: Option<RealtimeTranslationInputAudio>,
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub output: Option<RealtimeTranslationOutputAudio>,
46}
47
48/// A Realtime translation session. Translation sessions continuously translate input
49/// audio into the configured output language.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct RealtimeTranslationSession {
52    /// Unique identifier for the session that looks like `sess_1234567890abcdef`.
53    pub id: String,
54    /// The session type. Always `translation` for Realtime translation sessions.
55    pub r#type: String,
56    /// Expiration timestamp for the session, in seconds since epoch.
57    pub expires_at: u64,
58    /// The Realtime translation model used for this session.
59    pub model: String,
60    /// Configuration for translation input and output audio.
61    pub audio: RealtimeTranslationAudio,
62}
63
64/// Realtime translation session configuration. Translation sessions stream
65/// source audio in and translated audio plus transcript deltas out continuously.
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct RealtimeTranslationSessionCreateRequest {
68    /// The Realtime translation model used for this session.
69    pub model: String,
70    /// Configuration for translation input and output audio.
71    #[serde(skip_serializing_if = "Option::is_none")]
72    pub audio: Option<RealtimeTranslationAudio>,
73}
74
75/// Realtime translation session fields that can be updated with `session.update`.
76#[derive(Debug, Default, Clone, Serialize, Deserialize)]
77pub struct RealtimeTranslationSessionUpdateRequest {
78    /// Configuration for translation input and output audio.
79    #[serde(skip_serializing_if = "Option::is_none")]
80    pub audio: Option<RealtimeTranslationAudio>,
81}
82
83/// The anchor point for the translation client secret expiration.
84#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
85#[serde(rename_all = "snake_case")]
86pub enum RealtimeTranslationClientSecretExpiresAnchor {
87    #[default]
88    CreatedAt,
89}
90
91/// Configuration for the client secret expiration. Expiration refers to
92/// the time after which a client secret will no longer be valid for creating sessions.
93/// The session itself may continue after that time once started. A secret can be used to
94/// create multiple sessions until it expires.
95#[derive(Debug, Default, Clone, Serialize, Deserialize)]
96pub struct RealtimeTranslationClientSecretExpiresAfter {
97    /// The anchor point for the client secret expiration, meaning that
98    /// `seconds` will be added to the `created_at` time of the client
99    /// secret to produce an expiration timestamp. Only `created_at` is currently supported.
100    #[serde(skip_serializing_if = "Option::is_none")]
101    pub anchor: Option<RealtimeTranslationClientSecretExpiresAnchor>,
102    /// The number of seconds from the anchor point to the expiration. Select a value between
103    /// `10` and `7200` (2 hours). Defaults to 600 seconds (10 minutes) if not specified.
104    #[serde(skip_serializing_if = "Option::is_none")]
105    pub seconds: Option<u32>,
106}
107
108/// Create a translation session and client secret for the Realtime API.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct RealtimeTranslationClientSecretCreateRequest {
111    #[serde(skip_serializing_if = "Option::is_none")]
112    pub expires_after: Option<RealtimeTranslationClientSecretExpiresAfter>,
113    pub session: RealtimeTranslationSessionCreateRequest,
114}
115
116/// Response from creating a translation session and client secret for the Realtime API.
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct RealtimeTranslationClientSecretCreateResponse {
119    /// The generated client secret value.
120    pub value: String,
121    /// Expiration timestamp for the client secret, in seconds since epoch.
122    pub expires_at: u64,
123    /// The translation session.
124    pub session: RealtimeTranslationSession,
125}
126
127/// Send this event to update the translation session configuration.
128/// Translation sessions support updates to `audio.output.language`,
129/// `audio.input.transcription`, and `audio.input.noise_reduction`.
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct RealtimeTranslationClientEventSessionUpdate {
132    /// Optional client-generated ID used to identify this event.
133    #[serde(skip_serializing_if = "Option::is_none")]
134    pub event_id: Option<String>,
135    /// Translation session fields to update. The session `type` and `model` are set
136    /// at creation and cannot be changed with `session.update`.
137    pub session: RealtimeTranslationSessionUpdateRequest,
138}
139
140/// Send this event to append audio bytes to the translation session input audio buffer.
141///
142/// WebSocket translation sessions accept base64-encoded 24 kHz PCM16 mono
143/// little-endian raw audio bytes. Unsupported websocket audio formats
144/// return a validation error because lower-quality audio materially degrades translation
145/// quality.
146///
147/// Translation consumes 200 ms engine frames. For best realtime behavior,
148/// append audio in 200 ms chunks. If a chunk is shorter, the server buffers it
149/// until it has enough audio for one frame. If a chunk is longer, the server splits
150/// it into 200 ms frames and enqueues them back-to-back.
151///
152/// Keep appending silence while the session is active. If a client stops
153/// sending audio and later resumes, model time treats the resumed audio as
154/// contiguous with the previous audio rather than as a real-world pause.
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub struct RealtimeTranslationClientEventInputAudioBufferAppend {
157    /// Optional client-generated ID used to identify this event.
158    #[serde(skip_serializing_if = "Option::is_none")]
159    pub event_id: Option<String>,
160    /// Base64-encoded 24 kHz PCM16 mono audio bytes.
161    pub audio: String,
162}
163
164/// Gracefully close the realtime translation session. The server flushes pending
165/// input audio and emits any remaining translated output before closing the session.
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub struct RealtimeTranslationClientEventSessionClose {
168    /// Optional client-generated ID used to identify this event.
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub event_id: Option<String>,
171}
172
173/// A Realtime translation client event.
174#[derive(Debug, Clone, Serialize, Deserialize)]
175#[serde(tag = "type")]
176pub enum RealtimeTranslationClientEvent {
177    #[serde(rename = "session.update")]
178    SessionUpdate(RealtimeTranslationClientEventSessionUpdate),
179    #[serde(rename = "session.input_audio_buffer.append")]
180    InputAudioBufferAppend(RealtimeTranslationClientEventInputAudioBufferAppend),
181    #[serde(rename = "session.close")]
182    SessionClose(RealtimeTranslationClientEventSessionClose),
183}
184
185/// Audio encoding for the translated audio delta.
186#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
187#[serde(rename_all = "lowercase")]
188pub enum RealtimeTranslationAudioFormat {
189    PCM16,
190}
191
192/// Returned when a translation session is created. Emitted automatically when a
193/// new connection is established as the first server event. This event contains
194/// the default translation session configuration.
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct RealtimeTranslationServerEventSessionCreated {
197    /// The unique ID of the server event.
198    pub event_id: String,
199    /// The translation session configuration.
200    pub session: RealtimeTranslationSession,
201}
202
203/// Returned when a translation session is updated with a `session.update` event,
204/// unless there is an error.
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct RealtimeTranslationServerEventSessionUpdated {
207    /// The unique ID of the server event.
208    pub event_id: String,
209    /// The translation session configuration.
210    pub session: RealtimeTranslationSession,
211}
212
213/// Returned when a realtime translation session is closed.
214#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct RealtimeTranslationServerEventSessionClosed {
216    /// The unique ID of the server event.
217    pub event_id: String,
218}
219
220/// Returned when optional source-language transcript text is available. This event
221/// is emitted only when `audio.input.transcription` is configured.
222///
223/// Transcript deltas are append-only text fragments. Clients should not insert
224/// unconditional spaces between deltas.
225#[derive(Debug, Clone, Serialize, Deserialize)]
226pub struct RealtimeTranslationServerEventSessionInputTranscriptDelta {
227    /// The unique ID of the server event.
228    pub event_id: String,
229    /// Append-only source-language transcript text.
230    pub delta: String,
231    /// Timing metadata for stream alignment, derived from the translation frame
232    /// when available. It advances in 200 ms increments, but multiple transcript
233    /// deltas may share the same `elapsed_ms`. Treat it as alignment metadata,
234    /// not a unique transcript-delta identifier.
235    #[serde(default, skip_serializing_if = "Option::is_none")]
236    pub elapsed_ms: Option<u64>,
237}
238
239/// Returned when translated transcript text is available.
240///
241/// Transcript deltas are append-only text fragments. Clients should not insert
242/// unconditional spaces between deltas.
243#[derive(Debug, Clone, Serialize, Deserialize)]
244pub struct RealtimeTranslationServerEventSessionOutputTranscriptDelta {
245    /// The unique ID of the server event.
246    pub event_id: String,
247    /// Append-only transcript text for the translated output audio.
248    pub delta: String,
249    /// Timing metadata for stream alignment, derived from the translation frame
250    /// when available. It advances in 200 ms increments, but multiple transcript
251    /// deltas may share the same `elapsed_ms`. Treat it as alignment metadata,
252    /// not a unique transcript-delta identifier.
253    #[serde(default, skip_serializing_if = "Option::is_none")]
254    pub elapsed_ms: Option<u64>,
255}
256
257/// Returned when translated output audio is available. Output audio deltas are
258/// 200 ms frames of PCM16 audio.
259#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct RealtimeTranslationServerEventSessionOutputAudioDelta {
261    /// The unique ID of the server event.
262    pub event_id: String,
263    /// Base64-encoded translated audio data.
264    pub delta: String,
265    /// Sample rate of the audio delta. Defaults to 24000.
266    #[serde(default, skip_serializing_if = "Option::is_none")]
267    pub sample_rate: Option<u32>,
268    /// Number of audio channels. Defaults to 1.
269    #[serde(default, skip_serializing_if = "Option::is_none")]
270    pub channels: Option<u32>,
271    /// Audio encoding for `delta`.
272    #[serde(default, skip_serializing_if = "Option::is_none")]
273    pub format: Option<RealtimeTranslationAudioFormat>,
274    /// Timing metadata for stream alignment, derived from the translation frame
275    /// when available. Treat `elapsed_ms` as alignment metadata, not a unique
276    /// event identifier.
277    #[serde(default, skip_serializing_if = "Option::is_none")]
278    pub elapsed_ms: Option<u64>,
279}
280
281/// A Realtime translation server event.
282#[derive(Debug, Clone, Serialize, Deserialize)]
283#[serde(tag = "type")]
284pub enum RealtimeTranslationServerEvent {
285    #[serde(rename = "error")]
286    Error(RealtimeServerEventError),
287    #[serde(rename = "session.created")]
288    SessionCreated(RealtimeTranslationServerEventSessionCreated),
289    #[serde(rename = "session.updated")]
290    SessionUpdated(RealtimeTranslationServerEventSessionUpdated),
291    #[serde(rename = "session.closed")]
292    SessionClosed(RealtimeTranslationServerEventSessionClosed),
293    #[serde(rename = "session.input_transcript.delta")]
294    SessionInputTranscriptDelta(RealtimeTranslationServerEventSessionInputTranscriptDelta),
295    #[serde(rename = "session.output_transcript.delta")]
296    SessionOutputTranscriptDelta(RealtimeTranslationServerEventSessionOutputTranscriptDelta),
297    #[serde(rename = "session.output_audio.delta")]
298    SessionOutputAudioDelta(RealtimeTranslationServerEventSessionOutputAudioDelta),
299}