async_openai/types/realtime/session.rs
1use serde::{Deserialize, Serialize};
2
3use crate::types::{
4 mcp::MCPTool,
5 responses::{Prompt, ToolChoiceFunction, ToolChoiceMCP, ToolChoiceOptions},
6};
7
8/// Controls how long the model waits before emitting transcription text.
9#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
10#[serde(rename_all = "lowercase")]
11pub enum AudioTranscriptionDelay {
12 Minimal,
13 Low,
14 Medium,
15 High,
16 #[serde(rename = "xhigh")]
17 XHigh,
18}
19
20#[derive(Debug, Default, Serialize, Deserialize, Clone)]
21pub struct AudioTranscription {
22 /// The language of the input audio. Supplying the input language in
23 /// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format will improve accuracy and latency.
24 #[serde(skip_serializing_if = "Option::is_none")]
25 pub language: Option<String>,
26 /// The model to use for transcription. Current options are `whisper-1`,
27 /// `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`,
28 /// `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`.
29 /// Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
30 #[serde(skip_serializing_if = "Option::is_none")]
31 pub model: Option<String>,
32 /// An optional text to guide the model's style or continue a previous audio segment.
33 /// For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt-4o-transcribe` models
34 /// (excluding gpt-4o-transcribe-diarize), the prompt is a free text string, for example
35 /// "expect words related to technology".
36 /// Prompt is not supported with `gpt-realtime-whisper` in GA Realtime sessions.
37 #[serde(skip_serializing_if = "Option::is_none")]
38 pub prompt: Option<String>,
39 /// Controls how long the model waits before emitting transcription text.
40 /// Higher values can improve transcription accuracy at the cost of latency.
41 /// Only supported with `gpt-realtime-whisper` in GA Realtime sessions.
42 #[serde(skip_serializing_if = "Option::is_none")]
43 pub delay: Option<AudioTranscriptionDelay>,
44}
45
46/// Configuration of the transcription model returned by the server.
47#[derive(Debug, Default, Serialize, Deserialize, Clone)]
48pub struct AudioTranscriptionResponse {
49 /// The language of the input audio.
50 #[serde(skip_serializing_if = "Option::is_none")]
51 pub language: Option<String>,
52 /// The model used for transcription.
53 #[serde(skip_serializing_if = "Option::is_none")]
54 pub model: Option<String>,
55 /// The prompt configured for input audio transcription, when present.
56 #[serde(skip_serializing_if = "Option::is_none")]
57 pub prompt: Option<String>,
58}
59
60#[derive(Debug, Serialize, Deserialize, Clone)]
61#[serde(tag = "type")]
62pub enum RealtimeTurnDetection {
63 /// Server-side voice activity detection (VAD) which flips on when user speech is detected
64 /// and off after a period of silence.
65 #[serde(rename = "server_vad")]
66 ServerVAD {
67 /// Whether or not to automatically generate a response when a VAD stop event occurs. If
68 /// `interrupt_response` is set to `false` this may fail to create a response if the model is
69 /// already responding.
70 ///
71 /// If both `create_response` and `interrupt_response` are set to `false`, the model will
72 /// never respond automatically but VAD events will still be emitted.
73 #[serde(skip_serializing_if = "Option::is_none")]
74 create_response: Option<bool>,
75
76 /// Optional timeout after which a model response will be triggered automatically.
77 /// This is useful for situations in which a long pause from the user is unexpected,
78 /// such as a phone call. The model will effectively prompt the user to continue the
79 /// conversation based on the current context.
80 ///
81 /// The timeout value will be applied after the last model response's audio has finished
82 /// playing, i.e. it's set to the response.done time plus audio playback duration.
83 ///
84 /// An input_audio_buffer.timeout_triggered event (plus events associated with the Response)
85 /// will be emitted when the timeout is reached. Idle timeout is currently only supported
86 /// for server_vad mode.
87 #[serde(skip_serializing_if = "Option::is_none")]
88 idle_timeout_ms: Option<u32>,
89
90 /// Whether or not to automatically interrupt (cancel) any ongoing response with output to the
91 /// default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. If `true` then
92 /// the response will be cancelled, otherwise it will continue until complete.
93 ///
94 /// If both `create_response` and `interrupt_response` are set to `false`, the model will
95 /// never respond automatically but VAD events will still be emitted.
96 #[serde(skip_serializing_if = "Option::is_none")]
97 interrupt_response: Option<bool>,
98
99 /// Used only for server_vad mode. Amount of audio to include before the VAD detected speech
100 /// (in milliseconds). Defaults to 300ms.
101 prefix_padding_ms: u32,
102 /// Used only for server_vad mode. Duration of silence to detect speech stop
103 /// (in milliseconds). Defaults to 500ms. With shorter values the model will respond
104 /// more quickly, but may jump in on short pauses from the user.
105 silence_duration_ms: u32,
106
107 /// Used only for server_vad mode. Activation threshold for VAD (0.0 to 1.0),
108 /// this defaults to 0.5. A higher threshold will require louder audio to activate
109 /// the model, and thus might perform better in noisy environments.
110 threshold: f32,
111 },
112
113 /// Server-side semantic turn detection which uses a model to determine when the user has
114 /// finished speaking.
115 #[serde(rename = "semantic_vad")]
116 SemanticVAD {
117 /// Whether or not to automatically generate a response when a VAD stop event occurs.
118 #[serde(skip_serializing_if = "Option::is_none", default)]
119 create_response: Option<bool>,
120
121 /// Used only for `semantic_vad` mode. The eagerness of the model to respond.
122 /// `low` will wait longer for the user to continue speaking, `high` will respond more
123 /// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, and `high`
124 /// have max timeouts of 8s, 4s, and 2s respectively.
125 eagerness: String,
126
127 /// Whether or not to automatically interrupt any ongoing response with output to
128 /// the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
129 #[serde(skip_serializing_if = "Option::is_none", default)]
130 interrupt_response: Option<bool>,
131 },
132}
133
134#[derive(Debug, Serialize, Deserialize, Clone)]
135pub enum MaxOutputTokens {
136 #[serde(rename = "inf")]
137 Inf,
138 #[serde(untagged)]
139 Num(u16),
140}
141
142#[derive(Debug, Serialize, Deserialize, Clone)]
143pub struct RealtimeFunctionTool {
144 /// The name of the function.
145 pub name: String,
146 /// The description of the function, including guidance on when and how to call it,
147 /// and guidance about what to tell the user when calling (if anything).
148 pub description: String,
149 /// Parameters of the function in JSON Schema.
150 pub parameters: serde_json::Value,
151}
152
153#[derive(Debug, Serialize, Deserialize, Clone)]
154#[serde(tag = "type")]
155pub enum RealtimeTool {
156 #[serde(rename = "function")]
157 Function(RealtimeFunctionTool),
158 /// Give the model access to additional tools via remote Model Context Protocol (MCP) servers.
159 /// [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
160 #[serde(rename = "mcp")]
161 MCP(MCPTool),
162}
163
164#[derive(Debug, Serialize, Deserialize, Clone)]
165#[serde(rename_all = "lowercase")]
166pub enum FunctionType {
167 Function,
168}
169
170#[derive(Debug, Serialize, Deserialize, Clone)]
171#[serde(tag = "type", rename_all = "snake_case")]
172pub enum ToolChoice {
173 /// Use this option to force the model to call a specific function.
174 Function(ToolChoiceFunction),
175 /// Use this option to force the model to call a specific tool on a remote MCP server.
176 Mcp(ToolChoiceMCP),
177
178 #[serde(untagged)]
179 Mode(ToolChoiceOptions),
180}
181
182#[derive(Debug, Serialize, Deserialize, Clone)]
183#[serde(rename_all = "lowercase")]
184pub enum RealtimeVoice {
185 Alloy,
186 Ash,
187 Ballad,
188 Coral,
189 Echo,
190 Sage,
191 Shimmer,
192 Verse,
193 Marin,
194 Cedar,
195 #[serde(untagged)]
196 Other(String),
197}
198
199#[derive(Debug, Serialize, Deserialize, Clone)]
200#[serde(tag = "type")]
201pub enum RealtimeAudioFormats {
202 /// The PCM audio format. Only a 24kHz sample rate is supported.
203 #[serde(rename = "audio/pcm")]
204 PCMAudioFormat {
205 /// The sample rate of the audio. Always 24000.
206 rate: u32,
207 },
208 /// The G.711 μ-law format.
209 #[serde(rename = "audio/pcmu")]
210 PCMUAudioFormat,
211 /// The G.711 A-law format.
212 #[serde(rename = "audio/pcma")]
213 PCMAAudioFormat,
214}
215
216#[derive(Debug, Serialize, Deserialize, Clone, Default)]
217pub struct G711ULAWAudioFormat {
218 pub sample_rate: u32,
219 pub channels: u32,
220}
221
222#[derive(Debug, Serialize, Deserialize, Clone)]
223pub struct AudioInput {
224 /// The format of the input audio.
225 pub format: RealtimeAudioFormats,
226 /// Configuration for input audio noise reduction. This can be set to null to turn off.
227 /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD
228 /// and the model. Filtering the audio can improve VAD and turn detection accuracy
229 /// (reducing false positives) and model performance by improving perception of the
230 /// input audio.
231 pub noise_reduction: Option<NoiseReductionType>,
232 /// Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on.
233 /// Input audio transcription is not native to the model, since the model consumes audio directly.
234 /// Transcription runs asynchronously through [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
235 /// and should be treated as guidance of input audio content rather than precisely what the model
236 /// heard. The client can optionally set the language and prompt for transcription,
237 /// these offer additional guidance to the transcription service.
238 pub transcription: Option<AudioTranscription>,
239
240 /// Configuration for turn detection, ether Server VAD or Semantic VAD. This can
241 /// be set to null to turn off, in which case the client must manually trigger model response.
242 ///
243 /// Server VAD means that the model will detect the start and end of speech
244 /// based on audio volume and respond at the end of user speech.
245 ///
246 /// Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD)
247 /// to semantically estimate whether the user has finished speaking, then dynamically sets
248 /// a timeout based on this probability. For example, if user audio trails off with "uhhm",
249 /// the model will score a low probability of turn end and wait longer for the user to
250 /// continue speaking. This can be useful for more natural conversations, but may have a
251 /// higher latency.
252 pub turn_detection: Option<RealtimeTurnDetection>,
253}
254
255#[derive(Debug, Serialize, Deserialize, Clone)]
256pub struct AudioOutput {
257 /// The format of the output audio.
258 pub format: RealtimeAudioFormats,
259 /// The speed of the model's spoken response as a multiple of the original speed.
260 /// 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
261 /// This value can only be changed in between model turns, not while a response
262 /// is in progress.
263 ///
264 /// This parameter is a post-processing adjustment to the audio after it is generated,
265 /// it's also possible to prompt the model to speak faster or slower.
266 pub speed: f32,
267 /// The voice the model uses to respond. Supported built-in voices are `alloy`, `ash`, `ballad`,
268 /// `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during
269 /// the session once the model has responded with audio at least once. We recommend `marin` and `cedar`
270 /// for best quality.
271 pub voice: RealtimeVoice,
272}
273
274#[derive(Debug, Serialize, Deserialize, Clone)]
275pub struct Audio {
276 pub input: AudioInput,
277 pub output: AudioOutput,
278}
279
280#[derive(Debug, Serialize, Deserialize, Clone)]
281#[serde(rename_all = "lowercase")]
282pub enum Tracing {
283 /// Enables tracing and sets default values for tracing configuration options. Always `auto`.
284 Auto,
285
286 #[serde(untagged)]
287 Configuration(TracingConfiguration),
288}
289
290#[derive(Debug, Serialize, Deserialize, Clone)]
291pub struct TracingConfiguration {
292 /// The group id to attach to this trace to enable filtering and grouping in the Traces Dashboard.
293 pub group_id: String,
294 /// The arbitrary metadata to attach to this trace to enable filtering in the Traces Dashboard.
295 pub metadata: serde_json::Value,
296 /// The name of the workflow to attach to this trace. This is used to name the trace in the Traces Dashboard.
297 pub workflow_name: String,
298}
299
300/// The truncation strategy to use for the session.
301#[derive(Debug, Serialize, Deserialize, Clone)]
302#[serde(rename_all = "lowercase")]
303pub enum RealtimeTruncation {
304 /// `auto` is the default truncation strategy.
305 Auto,
306 /// `disabled` will disable truncation and emit errors when the conversation exceeds the input
307 /// token limit.
308 Disabled,
309
310 /// Retain a fraction of the conversation tokens when the conversation exceeds the input token
311 /// limit. This allows you to amortize truncations across multiple turns, which can help improve
312 /// cached token usage.
313 #[serde(untagged)]
314 RetentionRatio(RetentionRatioTruncation),
315}
316
317#[derive(Debug, Serialize, Deserialize, Clone)]
318pub struct RetentionRatioTruncation {
319 /// Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the conversation
320 /// exceeds the input token limit. Setting this to 0.8 means that messages will be dropped
321 /// until 80% of the maximum allowed tokens are used. This helps reduce the frequency of
322 /// truncations and improve cache rates.
323 pub retention_ratio: f32,
324
325 /// Use retention ratio truncation.
326 pub r#type: String,
327
328 /// Optional custom token limits for this truncation strategy. If not provided, the model's
329 /// default token limits will be used.
330 #[serde(skip_serializing_if = "Option::is_none")]
331 pub token_limits: Option<TokenLimits>,
332}
333
334#[derive(Debug, Serialize, Deserialize, Clone)]
335pub struct TokenLimits {
336 /// Maximum tokens allowed in the conversation after instructions (which including tool
337 /// definitions). For example, setting this to 5,000 would mean that truncation would occur
338 /// when the conversation exceeds 5,000 tokens after instructions. This cannot be higher
339 /// than the model's context window size minus the maximum output tokens.
340 pub post_instructions: u32,
341}
342
343#[derive(Debug, Serialize, Deserialize, Clone)]
344#[serde(tag = "type")]
345pub enum Session {
346 // Boxed as per clippy suggestion:
347 // https://rust-lang.github.io/rust -clippy/rust-1.91.0/index.html#large_enum_variant
348 // the largest variant contains at least 600 bytes, the second-largest variant contains at least 144 bytes
349 /// The type of session to create. Always `realtime` for the Realtime API.
350 #[serde(rename = "realtime")]
351 RealtimeSession(Box<RealtimeSession>),
352 /// The type of session to create. Always `transcription` for transcription sessions.
353 #[serde(rename = "transcription")]
354 RealtimeTranscriptionSession(RealtimeTranscriptionSession),
355}
356
357#[derive(Debug, Serialize, Deserialize, Clone)]
358#[serde(tag = "type")]
359pub enum RealtimeSessionConfiguration {
360 Realtime(RealtimeSession),
361}
362
363impl Default for RealtimeSessionConfiguration {
364 fn default() -> Self {
365 Self::Realtime(RealtimeSession::default())
366 }
367}
368
369/// Constrains effort on reasoning for reasoning-capable Realtime models such as `gpt-realtime-2`.
370#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Default)]
371#[serde(rename_all = "lowercase")]
372pub enum RealtimeReasoningEffort {
373 Minimal,
374 #[default]
375 Low,
376 Medium,
377 High,
378 #[serde(rename = "xhigh")]
379 XHigh,
380}
381
382/// Configuration for reasoning-capable Realtime models such as `gpt-realtime-2`.
383#[derive(Debug, Default, Serialize, Deserialize, Clone)]
384pub struct RealtimeReasoning {
385 #[serde(skip_serializing_if = "Option::is_none")]
386 pub effort: Option<RealtimeReasoningEffort>,
387}
388
389/// Realtime session object configuration.
390/// openapi spec type: RealtimeSessionCreateRequestGA
391#[derive(Debug, Serialize, Deserialize, Clone, Default)]
392pub struct RealtimeSession {
393 #[serde(skip_serializing_if = "Option::is_none")]
394 pub id: Option<String>,
395 #[serde(skip_serializing_if = "Option::is_none")]
396 pub expires_at: Option<u64>,
397 #[serde(skip_serializing_if = "Option::is_none")]
398 pub audio: Option<Audio>,
399
400 /// Additional fields to include in server outputs.
401 ///
402 /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio transcription.
403 #[serde(skip_serializing_if = "Option::is_none")]
404 pub include: Option<Vec<String>>,
405
406 /// The default system instructions (i.e. system message) prepended to model calls.
407 /// This field allows the client to guide the model on desired responses.
408 /// The model can be instructed on response content and format,
409 /// (e.g. "be extremely succinct", "act friendly", "here are examples of good responses")
410 /// and on audio behavior (e.g. "talk quickly", "inject emotion into your voice",
411 /// "laugh frequently"). The instructions are not guaranteed to be followed by the model, but
412 /// they provide guidance to the model on the desired behavior.
413 ///
414 /// Note that the server sets default instructions which will be used if this field is not set
415 /// and are visible in the `session.created` event at the start of the session.
416 #[serde(skip_serializing_if = "Option::is_none")]
417 pub instructions: Option<String>,
418
419 /// Maximum number of output tokens for a single assistant response,
420 /// inclusive of tool calls. Provide an integer between 1 and 4096 to limit output tokens,
421 /// or `inf` for the maximum available tokens for a given model. Defaults to `inf`.
422 #[serde(skip_serializing_if = "Option::is_none")]
423 pub max_output_tokens: Option<MaxOutputTokens>,
424
425 /// The Realtime model used for this session.
426 #[serde(skip_serializing_if = "Option::is_none")]
427 pub model: Option<String>,
428
429 /// The set of modalities the model can respond with. It defaults to
430 /// `["audio"]`, indicating that the model will respond with audio plus a transcript. `["text"]`
431 /// can be used to make the model respond with text only. It is not possible to request both
432 /// `text` and `audio` at the same time.
433 #[serde(skip_serializing_if = "Option::is_none")]
434 pub output_modalities: Option<Vec<String>>,
435
436 /// Reference to a prompt template and its variables.
437 /// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
438 #[serde(skip_serializing_if = "Option::is_none")]
439 pub prompt: Option<Prompt>,
440
441 /// How the model chooses tools. Provide one of the string modes or force a specific
442 /// function/MCP tool.
443 #[serde(skip_serializing_if = "Option::is_none")]
444 pub tool_choice: Option<ToolChoice>,
445
446 /// Tools available to the model.
447 #[serde(skip_serializing_if = "Option::is_none")]
448 pub tools: Option<Vec<RealtimeTool>>,
449
450 /// Realtime API can write session traces to the [Traces Dashboard](https://platform.openai.com/logs?api=traces).
451 /// Set to null to disable tracing. Once tracing is enabled for a session, the configuration cannot be modified.
452 ///
453 /// `auto` will create a trace for the session with default values for the workflow name,
454 /// group id, and metadata.
455 #[serde(skip_serializing_if = "Option::is_none")]
456 pub tracing: Option<Tracing>,
457
458 /// When the number of tokens in a conversation exceeds the model's input token limit,
459 /// the conversation be truncated, meaning messages (starting from the oldest) will not be
460 /// included in the model's context. A 32k context model with 4,096 max output tokens can
461 /// only include 28,224 tokens in the context before truncation occurs. Clients can configure
462 /// truncation behavior to truncate with a lower max token limit, which is an effective way to
463 /// control token usage and cost. Truncation will reduce the number of cached tokens on the next
464 /// turn (busting the cache), since messages are dropped from the beginning of the context.
465 /// However, clients can also configure truncation to retain messages up to a fraction of the
466 /// maximum context size, which will reduce the need for future truncations and thus improve
467 /// the cache rate. Truncation can be disabled entirely, which means the server will never
468 /// truncate but would instead return an error if the conversation exceeds the model's input
469 /// token limit.
470 #[serde(skip_serializing_if = "Option::is_none")]
471 pub truncation: Option<RealtimeTruncation>,
472
473 /// Whether the model may call multiple tools in parallel.
474 #[serde(skip_serializing_if = "Option::is_none")]
475 pub parallel_tool_calls: Option<bool>,
476
477 /// Configuration for reasoning-capable Realtime models such as `gpt-realtime-2`.
478 #[serde(skip_serializing_if = "Option::is_none")]
479 pub reasoning: Option<RealtimeReasoning>,
480}
481
482/// Type of noise reduction. `near_field` is for close-talking microphones such as
483/// headphones, `far_field` is for far-field microphones such as laptop or conference
484/// room microphones.
485#[derive(Debug, Serialize, Deserialize, Clone)]
486#[serde(tag = "type", rename_all = "snake_case")]
487pub enum NoiseReductionType {
488 NearField,
489 FarField,
490}
491
492#[derive(Debug, Serialize, Deserialize, Clone)]
493pub struct TranscriptionAudio {
494 pub input: AudioInput,
495}
496
497/// Realtime transcription session object configuration.
498/// openapi spec type: RealtimeTranscriptionSessionCreateRequestGA
499#[derive(Debug, Serialize, Deserialize, Clone)]
500pub struct RealtimeTranscriptionSession {
501 #[serde(skip_serializing_if = "Option::is_none")]
502 pub id: Option<String>,
503 #[serde(skip_serializing_if = "Option::is_none")]
504 pub expires_at: Option<u64>,
505 /// Configuration for input and output audio.
506 pub audio: TranscriptionAudio,
507
508 /// Additional fields to include in server outputs.
509 ///
510 /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio transcription.
511 #[serde(skip_serializing_if = "Option::is_none")]
512 pub include: Option<Vec<String>>,
513}