async_openai/types/realtime/session.rs
use serde::{Deserialize, Serialize};

use crate::types::{
    mcp::MCPTool,
    responses::{Prompt, ToolChoiceFunction, ToolChoiceMCP, ToolChoiceOptions},
};

#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct AudioTranscription {
    /// The language of the input audio. Supplying the input language in
    /// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format will improve accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// The model to use for transcription. Current options are `whisper-1`,
    /// `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
    /// Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional text to guide the model's style or continue a previous audio segment.
    /// For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
    /// For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is
    /// a free text string, for example "expect words related to technology".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
}
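
// A minimal usage sketch (not part of the upstream file): `None` fields are
// dropped from the serialized JSON by `skip_serializing_if`.
#[cfg(test)]
mod audio_transcription_example {
    use super::AudioTranscription;

    #[test]
    fn unset_fields_are_skipped() {
        let config = AudioTranscription {
            language: Some("en".to_string()),
            model: Some("gpt-4o-transcribe".to_string()),
            prompt: None,
        };
        // `prompt` is `None`, so it does not appear in the output at all.
        assert_eq!(
            serde_json::to_value(&config).unwrap(),
            serde_json::json!({ "language": "en", "model": "gpt-4o-transcribe" })
        );
    }
}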

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeTurnDetection {
    /// Server-side voice activity detection (VAD) which flips on when user speech is detected
    /// and off after a period of silence.
    #[serde(rename = "server_vad")]
    ServerVAD {
        /// Whether or not to automatically generate a response when a VAD stop event occurs.
        /// If `interrupt_response` is set to `false` this may fail to create a response if the
        /// model is already responding.
        ///
        /// If both `create_response` and `interrupt_response` are set to `false`, the model
        /// will never respond automatically but VAD events will still be emitted.
        #[serde(skip_serializing_if = "Option::is_none")]
        create_response: Option<bool>,

        /// Optional timeout after which a model response will be triggered automatically.
        /// This is useful for situations in which a long pause from the user is unexpected,
        /// such as a phone call. The model will effectively prompt the user to continue the
        /// conversation based on the current context.
        ///
        /// The timeout value will be applied after the last model response's audio has
        /// finished playing, i.e. it's set to the `response.done` time plus audio playback
        /// duration.
        ///
        /// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
        /// Response) will be emitted when the timeout is reached. Idle timeout is currently
        /// only supported for `server_vad` mode.
        #[serde(skip_serializing_if = "Option::is_none")]
        idle_timeout_ms: Option<u32>,

        /// Whether or not to automatically interrupt (cancel) any ongoing response with
        /// output to the default conversation (i.e. `conversation` of `auto`) when a VAD
        /// start event occurs. If `true` then the response will be cancelled, otherwise it
        /// will continue until complete.
        ///
        /// If both `create_response` and `interrupt_response` are set to `false`, the model
        /// will never respond automatically but VAD events will still be emitted.
        #[serde(skip_serializing_if = "Option::is_none")]
        interrupt_response: Option<bool>,

        /// Used only for `server_vad` mode. Amount of audio to include before the VAD
        /// detected speech (in milliseconds). Defaults to 300ms.
        prefix_padding_ms: u32,
        /// Used only for `server_vad` mode. Duration of silence to detect speech stop
        /// (in milliseconds). Defaults to 500ms. With shorter values the model will respond
        /// more quickly, but may jump in on short pauses from the user.
        silence_duration_ms: u32,

        /// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0);
        /// this defaults to 0.5. A higher threshold will require louder audio to activate
        /// the model, and thus might perform better in noisy environments.
        threshold: f32,
    },

    /// Server-side semantic turn detection which uses a model to determine when the user has
    /// finished speaking.
    #[serde(rename = "semantic_vad")]
    SemanticVAD {
        /// Whether or not to automatically generate a response when a VAD stop event occurs.
        #[serde(skip_serializing_if = "Option::is_none", default)]
        create_response: Option<bool>,

        /// Used only for `semantic_vad` mode. The eagerness of the model to respond.
        /// `low` will wait longer for the user to continue speaking, `high` will respond more
        /// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, and
        /// `high` have max timeouts of 8s, 4s, and 2s respectively.
        eagerness: String,

        /// Whether or not to automatically interrupt any ongoing response with output to
        /// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
        /// occurs.
        #[serde(skip_serializing_if = "Option::is_none", default)]
        interrupt_response: Option<bool>,
    },
}
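
// A sketch (not in the upstream file): the internally tagged enum serializes
// the variant name into a `"type"` field next to the variant's own fields.
// The values below mirror the documented defaults.
#[cfg(test)]
mod turn_detection_example {
    use super::RealtimeTurnDetection;

    #[test]
    fn server_vad_carries_a_type_tag() {
        let vad = RealtimeTurnDetection::ServerVAD {
            create_response: Some(true),
            idle_timeout_ms: None,
            interrupt_response: Some(true),
            prefix_padding_ms: 300,
            silence_duration_ms: 500,
            threshold: 0.5,
        };
        let json = serde_json::to_value(&vad).unwrap();
        assert_eq!(json["type"], "server_vad");
        assert_eq!(json["prefix_padding_ms"], 300);
        // `None` fields are omitted thanks to `skip_serializing_if`.
        assert!(json.get("idle_timeout_ms").is_none());
    }
}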

/// Maximum number of output tokens for a single assistant response: either an
/// integer cap or `inf` for the maximum available tokens for a given model.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum MaxOutputTokens {
    #[serde(rename = "inf")]
    Inf,
    #[serde(untagged)]
    Num(u16),
}
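
// A sketch (not in the upstream file): `Inf` serializes to the literal string
// "inf", while the untagged `Num` variant serializes to a bare integer.
#[cfg(test)]
mod max_output_tokens_example {
    use super::MaxOutputTokens;

    #[test]
    fn inf_and_numbers_serialize_differently() {
        assert_eq!(
            serde_json::to_value(MaxOutputTokens::Inf).unwrap(),
            serde_json::json!("inf")
        );
        assert_eq!(
            serde_json::to_value(MaxOutputTokens::Num(4096)).unwrap(),
            serde_json::json!(4096)
        );
    }
}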

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeFunctionTool {
    /// The name of the function.
    pub name: String,
    /// The description of the function, including guidance on when and how to call it,
    /// and guidance about what to tell the user when calling (if anything).
    pub description: String,
    /// Parameters of the function in JSON Schema.
    pub parameters: serde_json::Value,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeTool {
    #[serde(rename = "function")]
    Function(RealtimeFunctionTool),
    /// Give the model access to additional tools via remote Model Context Protocol (MCP)
    /// servers. [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
    #[serde(rename = "mcp")]
    MCP(MCPTool),
}
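
// A hedged sketch (not in the upstream file): defining a function tool whose
// `parameters` field carries an illustrative JSON Schema.
#[cfg(test)]
mod realtime_tool_example {
    use super::{RealtimeFunctionTool, RealtimeTool};

    #[test]
    fn function_tools_flatten_under_the_type_tag() {
        let tool = RealtimeTool::Function(RealtimeFunctionTool {
            name: "get_weather".to_string(),
            description: "Look up the current weather for a city.".to_string(),
            parameters: serde_json::json!({
                "type": "object",
                "properties": { "city": { "type": "string" } },
                "required": ["city"]
            }),
        });
        let json = serde_json::to_value(&tool).unwrap();
        // The inner struct's fields sit next to the `type` tag.
        assert_eq!(json["type"], "function");
        assert_eq!(json["name"], "get_weather");
    }
}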

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum FunctionType {
    Function,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ToolChoice {
    /// Use this option to force the model to call a specific function.
    Function(ToolChoiceFunction),
    /// Use this option to force the model to call a specific tool on a remote MCP server.
    Mcp(ToolChoiceMCP),

    #[serde(untagged)]
    Mode(ToolChoiceOptions),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeVoice {
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Sage,
    Shimmer,
    Verse,
    Marin,
    Cedar,
    #[serde(untagged)]
    Other(String),
}
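
// A sketch (not in the upstream file): the untagged `Other` variant lets
// voice names unknown to this crate round-trip instead of failing to parse.
#[cfg(test)]
mod realtime_voice_example {
    use super::RealtimeVoice;

    #[test]
    fn unknown_voices_fall_through_to_other() {
        let known: RealtimeVoice = serde_json::from_str("\"marin\"").unwrap();
        assert!(matches!(known, RealtimeVoice::Marin));

        let unknown: RealtimeVoice = serde_json::from_str("\"brand_new\"").unwrap();
        assert!(matches!(unknown, RealtimeVoice::Other(ref name) if name == "brand_new"));
    }
}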

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeAudioFormats {
    /// The PCM audio format. Only a 24kHz sample rate is supported.
    #[serde(rename = "audio/pcm")]
    PCMAudioFormat {
        /// The sample rate of the audio. Always 24000.
        rate: u32,
    },
    /// The G.711 μ-law format.
    #[serde(rename = "audio/pcmu")]
    PCMUAudioFormat,
    /// The G.711 A-law format.
    #[serde(rename = "audio/pcma")]
    PCMAAudioFormat,
}
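
// A sketch (not in the upstream file): the MIME-style variant names become
// the `"type"` tag; only the PCM variant carries a `rate` field.
#[cfg(test)]
mod audio_format_example {
    use super::RealtimeAudioFormats;

    #[test]
    fn formats_serialize_with_mime_style_tags() {
        assert_eq!(
            serde_json::to_value(RealtimeAudioFormats::PCMAudioFormat { rate: 24000 }).unwrap(),
            serde_json::json!({ "type": "audio/pcm", "rate": 24000 })
        );
        assert_eq!(
            serde_json::to_value(RealtimeAudioFormats::PCMUAudioFormat).unwrap(),
            serde_json::json!({ "type": "audio/pcmu" })
        );
    }
}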

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct G711ULAWAudioFormat {
    pub sample_rate: u32,
    pub channels: u32,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AudioInput {
    /// The format of the input audio.
    pub format: RealtimeAudioFormats,
    /// Configuration for input audio noise reduction. This can be set to null to turn off.
    /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD
    /// and the model. Filtering the audio can improve VAD and turn detection accuracy
    /// (reducing false positives) and model performance by improving perception of the
    /// input audio.
    pub noise_reduction: Option<NoiseReductionType>,
    /// Configuration for input audio transcription. Defaults to off and can be set to `null`
    /// to turn off once on. Input audio transcription is not native to the model, since the
    /// model consumes audio directly. Transcription runs asynchronously through
    /// [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
    /// and should be treated as guidance of input audio content rather than precisely what
    /// the model heard. The client can optionally set the language and prompt for
    /// transcription; these offer additional guidance to the transcription service.
    pub transcription: Option<AudioTranscription>,

    /// Configuration for turn detection, either Server VAD or Semantic VAD. This can
    /// be set to null to turn off, in which case the client must manually trigger model
    /// response.
    ///
    /// Server VAD means that the model will detect the start and end of speech
    /// based on audio volume and respond at the end of user speech.
    ///
    /// Semantic VAD is more advanced and uses a turn detection model (in conjunction with
    /// VAD) to semantically estimate whether the user has finished speaking, then dynamically
    /// sets a timeout based on this probability. For example, if user audio trails off with
    /// "uhhm", the model will score a low probability of turn end and wait longer for the
    /// user to continue speaking. This can be useful for more natural conversations, but may
    /// have a higher latency.
    pub turn_detection: RealtimeTurnDetection,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AudioOutput {
    /// The format of the output audio.
    pub format: RealtimeAudioFormats,
    /// The speed of the model's spoken response as a multiple of the original speed.
    /// 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
    /// This value can only be changed in between model turns, not while a response
    /// is in progress.
    ///
    /// This parameter is a post-processing adjustment to the audio after it is generated;
    /// it's also possible to prompt the model to speak faster or slower.
    pub speed: f32,
    /// The voice the model uses to respond. Voice cannot be changed during the session once
    /// the model has responded with audio at least once. Current voice options are
    /// `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and
    /// `cedar`. We recommend `marin` and `cedar` for best quality.
    pub voice: RealtimeVoice,
}

/// Configuration for input and output audio.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Audio {
    pub input: AudioInput,
    pub output: AudioOutput,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum Tracing {
    /// Enables tracing and sets default values for tracing configuration options.
    /// Always `auto`.
    Auto,

    #[serde(untagged)]
    Configuration(TracingConfiguration),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TracingConfiguration {
    /// The group id to attach to this trace to enable filtering and grouping in the
    /// Traces Dashboard.
    pub group_id: String,
    /// The arbitrary metadata to attach to this trace to enable filtering in the
    /// Traces Dashboard.
    pub metadata: serde_json::Value,
    /// The name of the workflow to attach to this trace. This is used to name the trace in
    /// the Traces Dashboard.
    pub workflow_name: String,
}
266
267/// The truncation strategy to use for the session.
268#[derive(Debug, Serialize, Deserialize, Clone)]
269#[serde(rename_all = "lowercase")]
270pub enum RealtimeTruncation {
271 /// `auto` is the default truncation strategy.
272 Auto,
273 /// `disabled` will disable truncation and emit errors when the conversation exceeds the input
274 /// token limit.
275 Disabled,
276
277 /// Retain a fraction of the conversation tokens when the conversation exceeds the input token
278 /// limit. This allows you to amortize truncations across multiple turns, which can help improve
279 /// cached token usage.
280 #[serde(untagged)]
281 RetentionRatio(RetentionRatioTruncation),
282}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RetentionRatioTruncation {
    /// Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
    /// conversation exceeds the input token limit. Setting this to 0.8 means that messages
    /// will be dropped until 80% of the maximum allowed tokens are used. This helps reduce
    /// the frequency of truncations and improve cache rates.
    pub retention_ratio: f32,

    /// Use retention ratio truncation. Always `retention_ratio`.
    pub r#type: String,

    /// Optional custom token limits for this truncation strategy. If not provided, the
    /// model's default token limits will be used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_limits: Option<TokenLimits>,
}
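
// A hedged sketch (not in the upstream file): a retention-ratio truncation
// keeping 80% of post-instruction tokens. The `type` string is spelled out
// because the field is a plain `String` rather than an enum here.
#[cfg(test)]
mod truncation_example {
    use super::{RealtimeTruncation, RetentionRatioTruncation};

    #[test]
    fn retention_ratio_serializes_untagged() {
        let truncation = RealtimeTruncation::RetentionRatio(RetentionRatioTruncation {
            retention_ratio: 0.8,
            r#type: "retention_ratio".to_string(),
            token_limits: None,
        });
        let json = serde_json::to_value(&truncation).unwrap();
        // The untagged variant serializes as the inner struct's fields only.
        assert_eq!(json["type"], "retention_ratio");
        assert!((json["retention_ratio"].as_f64().unwrap() - 0.8).abs() < 1e-6);
    }
}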

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TokenLimits {
    /// Maximum tokens allowed in the conversation after instructions (which include tool
    /// definitions). For example, setting this to 5,000 would mean that truncation would
    /// occur when the conversation exceeds 5,000 tokens after instructions. This cannot be
    /// higher than the model's context window size minus the maximum output tokens.
    pub post_instructions: u32,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum Session {
    // Boxed as per clippy suggestion:
    // https://rust-lang.github.io/rust-clippy/rust-1.91.0/index.html#large_enum_variant
    // The largest variant contains at least 600 bytes, the second-largest at least 144 bytes.
    /// The type of session to create. Always `realtime` for the Realtime API.
    #[serde(rename = "realtime")]
    RealtimeSession(Box<RealtimeSession>),
    /// The type of session to create. Always `transcription` for transcription sessions.
    #[serde(rename = "transcription")]
    RealtimeTranscriptionSession(RealtimeTranscriptionSession),
}
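
// A sketch (not in the upstream file): the `type` tag selects the variant on
// deserialization; every `RealtimeSession` field is optional, so a bare tag
// is enough.
#[cfg(test)]
mod session_example {
    use super::Session;

    #[test]
    fn the_type_tag_selects_the_variant() {
        let session: Session = serde_json::from_str(r#"{ "type": "realtime" }"#).unwrap();
        assert!(matches!(session, Session::RealtimeSession(_)));
    }
}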

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeSessionConfiguration {
    Realtime(RealtimeSession),
}

impl Default for RealtimeSessionConfiguration {
    fn default() -> Self {
        Self::Realtime(RealtimeSession::default())
    }
}

/// Realtime session object configuration.
/// openapi spec type: RealtimeSessionCreateRequestGA
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct RealtimeSession {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio: Option<Audio>,

    /// Additional fields to include in server outputs.
    ///
    /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio
    /// transcription.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include: Option<Vec<String>>,

    /// The default system instructions (i.e. system message) prepended to model calls.
    /// This field allows the client to guide the model on desired responses.
    /// The model can be instructed on response content and format
    /// (e.g. "be extremely succinct", "act friendly", "here are examples of good responses")
    /// and on audio behavior (e.g. "talk quickly", "inject emotion into your voice",
    /// "laugh frequently"). The instructions are not guaranteed to be followed by the model,
    /// but they provide guidance to the model on the desired behavior.
    ///
    /// Note that the server sets default instructions which will be used if this field is
    /// not set and are visible in the `session.created` event at the start of the session.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// Maximum number of output tokens for a single assistant response,
    /// inclusive of tool calls. Provide an integer between 1 and 4096 to limit output
    /// tokens, or `inf` for the maximum available tokens for a given model. Defaults to
    /// `inf`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<MaxOutputTokens>,

    /// The Realtime model used for this session.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// The set of modalities the model can respond with. It defaults to
    /// `["audio"]`, indicating that the model will respond with audio plus a transcript.
    /// `["text"]` can be used to make the model respond with text only. It is not possible
    /// to request both `text` and `audio` at the same time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_modalities: Option<Vec<String>>,

    /// Reference to a prompt template and its variables.
    /// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<Prompt>,

    /// How the model chooses tools. Provide one of the string modes or force a specific
    /// function/MCP tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Tools available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<RealtimeTool>>,

    /// Realtime API can write session traces to the [Traces Dashboard](https://platform.openai.com/logs?api=traces).
    /// Set to null to disable tracing. Once tracing is enabled for a session, the
    /// configuration cannot be modified.
    ///
    /// `auto` will create a trace for the session with default values for the workflow
    /// name, group id, and metadata.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tracing: Option<Tracing>,

    /// When the number of tokens in a conversation exceeds the model's input token limit,
    /// the conversation will be truncated, meaning messages (starting from the oldest) will
    /// not be included in the model's context. A 32k context model with 4,096 max output
    /// tokens can only include 28,224 tokens in the context before truncation occurs.
    /// Clients can configure truncation behavior to truncate with a lower max token limit,
    /// which is an effective way to control token usage and cost. Truncation will reduce the
    /// number of cached tokens on the next turn (busting the cache), since messages are
    /// dropped from the beginning of the context. However, clients can also configure
    /// truncation to retain messages up to a fraction of the maximum context size, which
    /// will reduce the need for future truncations and thus improve the cache rate.
    /// Truncation can be disabled entirely, which means the server will never truncate but
    /// would instead return an error if the conversation exceeds the model's input token
    /// limit.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub truncation: Option<RealtimeTruncation>,
}
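
// A hedged sketch (not in the upstream file): building a minimal session
// update from `Default` and setting only the fields a caller cares about.
// The model name is illustrative.
#[cfg(test)]
mod realtime_session_example {
    use super::{MaxOutputTokens, RealtimeSession};

    #[test]
    fn unset_fields_stay_off_the_wire() {
        let session = RealtimeSession {
            model: Some("gpt-realtime".to_string()),
            instructions: Some("Be succinct and friendly.".to_string()),
            max_output_tokens: Some(MaxOutputTokens::Inf),
            ..Default::default()
        };
        let json = serde_json::to_value(&session).unwrap();
        assert_eq!(json["model"], "gpt-realtime");
        assert_eq!(json["max_output_tokens"], "inf");
        // Every field left at `None` is skipped by `skip_serializing_if`.
        assert!(json.get("audio").is_none());
    }
}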

/// Type of noise reduction. `near_field` is for close-talking microphones such as
/// headphones, `far_field` is for far-field microphones such as laptop or conference
/// room microphones.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum NoiseReductionType {
    NearField,
    FarField,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TranscriptionAudio {
    pub input: AudioInput,
}

/// Realtime transcription session object configuration.
/// openapi spec type: RealtimeTranscriptionSessionCreateRequestGA
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeTranscriptionSession {
    /// Configuration for input and output audio.
    pub audio: TranscriptionAudio,

    /// Additional fields to include in server outputs.
    ///
    /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio
    /// transcription.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include: Option<Vec<String>>,
}
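
// A hedged sketch (not in the upstream file): assembling a transcription-only
// session. `turn_detection` is non-optional on `AudioInput`, so a VAD
// configuration must be supplied even when no responses are generated.
#[cfg(test)]
mod transcription_session_example {
    use super::{
        AudioInput, AudioTranscription, RealtimeAudioFormats, RealtimeTranscriptionSession,
        RealtimeTurnDetection, TranscriptionAudio,
    };

    #[test]
    fn builds_a_transcription_session() {
        let session = RealtimeTranscriptionSession {
            audio: TranscriptionAudio {
                input: AudioInput {
                    format: RealtimeAudioFormats::PCMAudioFormat { rate: 24000 },
                    noise_reduction: None,
                    transcription: Some(AudioTranscription {
                        language: Some("en".to_string()),
                        model: Some("gpt-4o-transcribe".to_string()),
                        prompt: None,
                    }),
                    turn_detection: RealtimeTurnDetection::ServerVAD {
                        create_response: Some(false),
                        idle_timeout_ms: None,
                        interrupt_response: Some(false),
                        prefix_padding_ms: 300,
                        silence_duration_ms: 500,
                        threshold: 0.5,
                    },
                },
            },
            include: Some(vec!["item.input_audio_transcription.logprobs".to_string()]),
        };
        let json = serde_json::to_value(&session).unwrap();
        assert_eq!(json["audio"]["input"]["turn_detection"]["type"], "server_vad");
    }
}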