async_openai/types/realtime/session.rs

use serde::{Deserialize, Serialize};

use crate::types::{
    responses::{Prompt, ToolChoiceFunction, ToolChoiceMCP, ToolChoiceOptions},
    MCPTool,
};

#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct AudioTranscription {
    /// The language of the input audio. Supplying the input language in
    /// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format will improve accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// The model to use for transcription. Current options are `whisper-1`,
    /// `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
    /// Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional text to guide the model's style or continue a previous audio segment.
    /// For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt-4o-transcribe` models
    /// (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example
    /// "expect words related to technology".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
}
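
// A minimal usage sketch, not part of the original file: constructing an
// `AudioTranscription`. The model, language, and prompt values are
// illustrative choices taken from the doc comments above.
#[cfg(test)]
#[allow(dead_code)]
fn example_transcription() -> AudioTranscription {
    AudioTranscription {
        language: Some("en".to_string()),
        model: Some("gpt-4o-transcribe".to_string()),
        prompt: Some("expect words related to technology".to_string()),
    }
}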

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeTurnDetection {
    /// Server-side voice activity detection (VAD) which flips on when user speech is detected
    /// and off after a period of silence.
    #[serde(rename = "server_vad")]
    ServerVAD {
        /// Whether or not to automatically generate a response when a VAD stop event occurs.
        #[serde(skip_serializing_if = "Option::is_none")]
        create_response: Option<bool>,

        /// Optional timeout after which a model response will be triggered automatically.
        /// This is useful for situations in which a long pause from the user is unexpected,
        /// such as a phone call. The model will effectively prompt the user to continue the
        /// conversation based on the current context.
        ///
        /// The timeout value will be applied after the last model response's audio has finished
        /// playing, i.e. it's set to the response.done time plus audio playback duration.
        ///
        /// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
        /// Response) will be emitted when the timeout is reached. Idle timeout is currently
        /// only supported for `server_vad` mode.
        #[serde(skip_serializing_if = "Option::is_none")]
        idle_timeout_ms: Option<u32>,

        /// Whether or not to automatically interrupt any ongoing response with output to
        /// the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
        #[serde(skip_serializing_if = "Option::is_none")]
        interrupt_response: Option<bool>,

        /// Used only for `server_vad` mode. Amount of audio to include before the VAD detected
        /// speech (in milliseconds). Defaults to 300ms.
        prefix_padding_ms: u32,

        /// Used only for `server_vad` mode. Duration of silence to detect speech stop
        /// (in milliseconds). Defaults to 500ms. With shorter values the model will respond
        /// more quickly, but may jump in on short pauses from the user.
        silence_duration_ms: u32,

        /// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0),
        /// this defaults to 0.5. A higher threshold will require louder audio to activate
        /// the model, and thus might perform better in noisy environments.
        threshold: f32,
    },

    /// Server-side semantic turn detection which uses a model to determine when the user has
    /// finished speaking.
    #[serde(rename = "semantic_vad")]
    SemanticVAD {
        /// Whether or not to automatically generate a response when a VAD stop event occurs.
        #[serde(skip_serializing_if = "Option::is_none", default)]
        create_response: Option<bool>,

        /// Used only for `semantic_vad` mode. The eagerness of the model to respond.
        /// `low` will wait longer for the user to continue speaking, `high` will respond more
        /// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, and
        /// `high` have max timeouts of 8s, 4s, and 2s respectively.
        eagerness: String,

        /// Whether or not to automatically interrupt any ongoing response with output to
        /// the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
        #[serde(skip_serializing_if = "Option::is_none", default)]
        interrupt_response: Option<bool>,
    },
}
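
// A minimal usage sketch, not part of the original file: a server-VAD
// configuration using the defaults documented above (300 ms prefix padding,
// 500 ms silence, 0.5 threshold).
#[cfg(test)]
#[allow(dead_code)]
fn example_server_vad() -> RealtimeTurnDetection {
    RealtimeTurnDetection::ServerVAD {
        create_response: Some(true),
        idle_timeout_ms: None,
        interrupt_response: Some(true),
        prefix_padding_ms: 300,
        silence_duration_ms: 500,
        threshold: 0.5,
    }
}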

#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum MaxOutputTokens {
    #[serde(rename = "inf")]
    Inf,
    #[serde(untagged)]
    Num(u16),
}
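
// Sketch test, not part of the original file: `Inf` serializes to the string
// "inf", while the untagged `Num` variant serializes as a bare integer.
#[test]
fn max_output_tokens_serialization_shape() {
    let inf = serde_json::to_value(MaxOutputTokens::Inf).unwrap();
    assert_eq!(inf, serde_json::json!("inf"));
    let num = serde_json::to_value(MaxOutputTokens::Num(4096)).unwrap();
    assert_eq!(num, serde_json::json!(4096));
}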

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeFunctionTool {
    /// The name of the function.
    pub name: String,
    /// The description of the function, including guidance on when and how to call it,
    /// and guidance about what to tell the user when calling (if anything).
    pub description: String,
    /// Parameters of the function in JSON Schema.
    pub parameters: serde_json::Value,
}
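
// A minimal usage sketch, not part of the original file: a function tool
// whose `parameters` field carries an inline JSON Schema. The weather
// function is purely illustrative.
#[cfg(test)]
#[allow(dead_code)]
fn example_function_tool() -> RealtimeFunctionTool {
    RealtimeFunctionTool {
        name: "get_weather".to_string(),
        description: "Look up the current weather. Tell the user you are checking first."
            .to_string(),
        parameters: serde_json::json!({
            "type": "object",
            "properties": {
                "location": { "type": "string" }
            },
            "required": ["location"]
        }),
    }
}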

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeTool {
    #[serde(rename = "function")]
    Function(RealtimeFunctionTool),
    /// Give the model access to additional tools via remote Model Context Protocol (MCP) servers.
    /// [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
    #[serde(rename = "mcp")]
    MCP(MCPTool),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum FunctionType {
    Function,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ToolChoice {
    /// Use this option to force the model to call a specific function.
    Function(ToolChoiceFunction),
    /// Use this option to force the model to call a specific tool on a remote MCP server.
    Mcp(ToolChoiceMCP),

    #[serde(untagged)]
    Mode(ToolChoiceOptions),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeVoice {
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Sage,
    Shimmer,
    Verse,
    Marin,
    Cedar,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum RealtimeAudioFormats {
    /// The PCM audio format. Only a 24kHz sample rate is supported.
    #[serde(rename = "audio/pcm")]
    PCMAudioFormat {
        /// The sample rate of the audio. Always 24000.
        rate: u32,
    },
    /// The G.711 μ-law format.
    #[serde(rename = "audio/pcmu")]
    PCMUAudioFormat,
    /// The G.711 A-law format.
    #[serde(rename = "audio/pcma")]
    PCMAAudioFormat,
}
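
// Sketch test, not part of the original file: with `#[serde(tag = "type")]`,
// the PCM variant serializes as {"type": "audio/pcm", "rate": 24000} and the
// G.711 variants carry only the tag.
#[test]
fn audio_format_serialization_shape() {
    let pcm = RealtimeAudioFormats::PCMAudioFormat { rate: 24000 };
    assert_eq!(
        serde_json::to_value(&pcm).unwrap(),
        serde_json::json!({ "type": "audio/pcm", "rate": 24000 })
    );
    assert_eq!(
        serde_json::to_value(&RealtimeAudioFormats::PCMUAudioFormat).unwrap(),
        serde_json::json!({ "type": "audio/pcmu" })
    );
}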

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct G711ULAWAudioFormat {
    pub sample_rate: u32,
    pub channels: u32,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AudioInput {
    /// The format of the input audio.
    pub format: RealtimeAudioFormats,
    /// Configuration for input audio noise reduction. This can be set to null to turn off.
    /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD
    /// and the model. Filtering the audio can improve VAD and turn detection accuracy
    /// (reducing false positives) and model performance by improving perception of the
    /// input audio.
    pub noise_reduction: Option<NoiseReductionType>,
    /// Configuration for input audio transcription. Defaults to off and can be set to `null`
    /// to turn off once on. Input audio transcription is not native to the model, since the
    /// model consumes audio directly. Transcription runs asynchronously through
    /// [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
    /// and should be treated as guidance of input audio content rather than precisely what the
    /// model heard. The client can optionally set the language and prompt for transcription;
    /// these offer additional guidance to the transcription service.
    pub transcription: Option<AudioTranscription>,

    /// Configuration for turn detection, either Server VAD or Semantic VAD. This can be set
    /// to null to turn off, in which case the client must manually trigger model response.
    ///
    /// Server VAD means that the model will detect the start and end of speech
    /// based on audio volume and respond at the end of user speech.
    ///
    /// Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD)
    /// to semantically estimate whether the user has finished speaking, then dynamically sets
    /// a timeout based on this probability. For example, if user audio trails off with "uhhm",
    /// the model will score a low probability of turn end and wait longer for the user to
    /// continue speaking. This can be useful for more natural conversations, but may have
    /// higher latency.
    pub turn_detection: RealtimeTurnDetection,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AudioOutput {
    /// The format of the output audio.
    pub format: RealtimeAudioFormats,
    /// The speed of the model's spoken response as a multiple of the original speed.
    /// 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
    /// This value can only be changed in between model turns, not while a response
    /// is in progress.
    ///
    /// This parameter is a post-processing adjustment to the audio after it is generated;
    /// it's also possible to prompt the model to speak faster or slower.
    pub speed: f32,
    /// The voice the model uses to respond. Voice cannot be changed during the session once
    /// the model has responded with audio at least once. Current voice options are
    /// `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
    /// We recommend `marin` and `cedar` for best quality.
    pub voice: RealtimeVoice,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Audio {
    pub input: AudioInput,
    pub output: AudioOutput,
}
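
// A minimal usage sketch, not part of the original file: assembling a full
// `Audio` configuration from the helpers above. The 24 kHz PCM rate matches
// the format's doc comment; the voice choice is illustrative.
#[cfg(test)]
#[allow(dead_code)]
fn example_audio() -> Audio {
    Audio {
        input: AudioInput {
            format: RealtimeAudioFormats::PCMAudioFormat { rate: 24000 },
            noise_reduction: Some(NoiseReductionType::NearField),
            transcription: Some(example_transcription()),
            turn_detection: example_server_vad(),
        },
        output: AudioOutput {
            format: RealtimeAudioFormats::PCMAudioFormat { rate: 24000 },
            speed: 1.0,
            voice: RealtimeVoice::Marin,
        },
    }
}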

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum Tracing {
    /// Enables tracing and sets default values for tracing configuration options. Always `auto`.
    Auto,

    #[serde(untagged)]
    Configuration(TracingConfiguration),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TracingConfiguration {
    /// The group id to attach to this trace to enable filtering and grouping in the Traces Dashboard.
    pub group_id: String,
    /// The arbitrary metadata to attach to this trace to enable filtering in the Traces Dashboard.
    pub metadata: serde_json::Value,
    /// The name of the workflow to attach to this trace. This is used to name the trace in the Traces Dashboard.
    pub workflow_name: String,
}
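
// Sketch test, not part of the original file: `Tracing::Auto` serializes to
// the string "auto", while the untagged `Configuration` variant serializes
// as the bare `TracingConfiguration` object. Field values are illustrative.
#[test]
fn tracing_serialization_shape() {
    assert_eq!(
        serde_json::to_value(&Tracing::Auto).unwrap(),
        serde_json::json!("auto")
    );
    let configured = Tracing::Configuration(TracingConfiguration {
        group_id: "my-group".to_string(),
        metadata: serde_json::json!({ "env": "dev" }),
        workflow_name: "support-call".to_string(),
    });
    assert_eq!(
        serde_json::to_value(&configured).unwrap(),
        serde_json::json!({
            "group_id": "my-group",
            "metadata": { "env": "dev" },
            "workflow_name": "support-call"
        })
    );
}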

/// The truncation strategy to use for the session.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeTruncation {
    /// `auto` is the default truncation strategy.
    Auto,
    /// `disabled` will disable truncation and emit errors when the conversation exceeds the input
    /// token limit.
    Disabled,

    /// Retain a fraction of the conversation tokens when the conversation exceeds the input token
    /// limit. This allows you to amortize truncations across multiple turns, which can help improve
    /// cached token usage.
    #[serde(untagged)]
    RetentionRatio(RetentionRatioTruncation),
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RetentionRatioTruncation {
    /// Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the conversation
    /// exceeds the input token limit. Setting this to 0.8 means that messages will be dropped
    /// until 80% of the maximum allowed tokens are used. This helps reduce the frequency of
    /// truncations and improve cache rates.
    pub retention_ratio: f32,

    /// The type of truncation strategy. Always `retention_ratio`.
    pub r#type: String,

    /// Optional custom token limits for this truncation strategy. If not provided, the model's
    /// default token limits will be used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_limits: Option<TokenLimits>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TokenLimits {
    /// Maximum tokens allowed in the conversation after instructions (which include tool
    /// definitions). For example, setting this to 5,000 would mean that truncation would occur
    /// when the conversation exceeds 5,000 tokens after instructions. This cannot be higher
    /// than the model's context window size minus the maximum output tokens.
    pub post_instructions: u32,
}
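
// A minimal usage sketch, not part of the original file: retain 80% of
// post-instruction tokens with a custom 5,000-token limit, per the doc
// comments above. The "retention_ratio" type string is an assumption based
// on the variant's documented wire format.
#[cfg(test)]
#[allow(dead_code)]
fn example_truncation() -> RealtimeTruncation {
    RealtimeTruncation::RetentionRatio(RetentionRatioTruncation {
        retention_ratio: 0.8,
        r#type: "retention_ratio".to_string(),
        token_limits: Some(TokenLimits {
            post_instructions: 5000,
        }),
    })
}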

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type")]
pub enum Session {
    /// The type of session to create. Always `realtime` for the Realtime API.
    #[serde(rename = "realtime")]
    RealtimeSession(RealtimeSession),
    /// The type of session to create. Always `transcription` for transcription sessions.
    #[serde(rename = "transcription")]
    RealtimeTranscriptionSession(RealtimeTranscriptionSession),
}

/// Realtime session object configuration.
/// openapi spec type: RealtimeSessionCreateRequestGA
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeSession {
    pub audio: Audio,

    /// Additional fields to include in server outputs.
    ///
    /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio transcription.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include: Option<Vec<String>>,

    /// The default system instructions (i.e. system message) prepended to model calls.
    /// This field allows the client to guide the model on desired responses.
    /// The model can be instructed on response content and format
    /// (e.g. "be extremely succinct", "act friendly", "here are examples of good responses")
    /// and on audio behavior (e.g. "talk quickly", "inject emotion into your voice",
    /// "laugh frequently"). The instructions are not guaranteed to be followed by the model, but
    /// they provide guidance to the model on the desired behavior.
    ///
    /// Note that the server sets default instructions which will be used if this field is not set
    /// and are visible in the `session.created` event at the start of the session.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// Maximum number of output tokens for a single assistant response,
    /// inclusive of tool calls. Provide an integer between 1 and 4096 to limit output tokens,
    /// or `inf` for the maximum available tokens for a given model. Defaults to `inf`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<MaxOutputTokens>,

    /// The Realtime model used for this session.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// The set of modalities the model can respond with. It defaults to
    /// `["audio"]`, indicating that the model will respond with audio plus a transcript. `["text"]`
    /// can be used to make the model respond with text only. It is not possible to request both
    /// `text` and `audio` at the same time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_modalities: Option<Vec<String>>,

    /// Reference to a prompt template and its variables.
    /// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<Prompt>,

    /// How the model chooses tools. Provide one of the string modes or force a specific
    /// function/MCP tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Tools available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<RealtimeTool>>,

    /// Realtime API can write session traces to the [Traces Dashboard](https://platform.openai.com/logs?api=traces).
    /// Set to null to disable tracing. Once tracing is enabled for a session, the configuration cannot be modified.
    ///
    /// `auto` will create a trace for the session with default values for the workflow name,
    /// group id, and metadata.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tracing: Option<Tracing>,

    /// When the number of tokens in a conversation exceeds the model's input token limit,
    /// the conversation will be truncated, meaning messages (starting from the oldest) will not
    /// be included in the model's context. A 32k context model with 4,096 max output tokens can
    /// only include 28,672 tokens in the context before truncation occurs. Clients can configure
    /// truncation behavior to truncate with a lower max token limit, which is an effective way
    /// to control token usage and cost. Truncation will reduce the number of cached tokens on
    /// the next turn (busting the cache), since messages are dropped from the beginning of the
    /// context. However, clients can also configure truncation to retain messages up to a
    /// fraction of the maximum context size, which will reduce the need for future truncations
    /// and thus improve the cache rate. Truncation can be disabled entirely, which means the
    /// server will never truncate but would instead return an error if the conversation exceeds
    /// the model's input token limit.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub truncation: Option<RealtimeTruncation>,
}
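
// Sketch test, not part of the original file: a minimal realtime session
// assembled from the helpers above. The model name is illustrative; the
// assertions only check the `type` tag that `#[serde(tag = "type")]` adds.
#[test]
fn realtime_session_serialization_shape() {
    let session = Session::RealtimeSession(RealtimeSession {
        audio: example_audio(),
        include: None,
        instructions: Some("Be extremely succinct.".to_string()),
        max_output_tokens: Some(MaxOutputTokens::Inf),
        model: Some("gpt-realtime".to_string()),
        output_modalities: Some(vec!["audio".to_string()]),
        prompt: None,
        tool_choice: None,
        tools: Some(vec![RealtimeTool::Function(example_function_tool())]),
        tracing: None,
        truncation: Some(example_truncation()),
    });
    let value = serde_json::to_value(&session).unwrap();
    assert_eq!(value["type"], "realtime");
    assert_eq!(value["model"], "gpt-realtime");
}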

/// Type of noise reduction. `near_field` is for close-talking microphones such as
/// headphones, `far_field` is for far-field microphones such as laptop or conference
/// room microphones.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum NoiseReductionType {
    NearField,
    FarField,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TranscriptionAudio {
    pub input: AudioInput,
}

/// Realtime transcription session object configuration.
/// openapi spec type: RealtimeTranscriptionSessionCreateRequestGA
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeTranscriptionSession {
    /// Configuration for input audio.
    pub audio: TranscriptionAudio,

    /// Additional fields to include in server outputs.
    ///
    /// `item.input_audio_transcription.logprobs`: Include logprobs for input audio transcription.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include: Option<Vec<String>>,
}
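
// A minimal usage sketch, not part of the original file: a transcription-only
// session reusing the input half of the audio helper and requesting logprobs
// via `include`.
#[cfg(test)]
#[allow(dead_code)]
fn example_transcription_session() -> Session {
    Session::RealtimeTranscriptionSession(RealtimeTranscriptionSession {
        audio: TranscriptionAudio {
            input: example_audio().input,
        },
        include: Some(vec![
            "item.input_audio_transcription.logprobs".to_string(),
        ]),
    })
}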