Skip to main content

openai_protocol/
realtime_session.rs

// OpenAI Realtime Session API types
// https://platform.openai.com/docs/api-reference/realtime
3
4use std::collections::HashMap;
5
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8
9use crate::common::{Redacted, ResponsePrompt, ToolReference};
10
11// ============================================================================
12// Session Configuration
13// ============================================================================
14
15#[serde_with::skip_serializing_none]
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct RealtimeSessionCreateRequest {
18    #[serde(rename = "type")]
19    pub r#type: RealtimeSessionType,
20    pub audio: Option<RealtimeAudioConfig>,
21    pub include: Option<Vec<RealtimeIncludeOption>>,
22    pub instructions: Option<String>,
23    pub max_output_tokens: Option<MaxOutputTokens>,
24    pub model: Option<String>,
25    #[serde(default = "audio")]
26    pub output_modalities: Option<Vec<OutputModality>>,
27    pub prompt: Option<ResponsePrompt>,
28    pub tool_choice: Option<RealtimeToolChoiceConfig>,
29    pub tools: Option<RealtimeToolsConfig>,
30    pub tracing: Option<RealtimeTracingConfig>,
31    pub truncation: Option<RealtimeTruncation>,
32}
33
34// ============================================================================
35// Session Object
36// ============================================================================
37
38#[serde_with::skip_serializing_none]
39#[derive(Debug, Clone, Serialize, Deserialize)]
40pub struct RealtimeSessionCreateResponse {
41    pub client_secret: RealtimeSessionClientSecret,
42    #[serde(rename = "type")]
43    pub r#type: RealtimeSessionType,
44    pub audio: Option<RealtimeAudioConfig>,
45    pub include: Option<Vec<RealtimeIncludeOption>>,
46    pub instructions: Option<String>,
47    pub max_output_tokens: Option<MaxOutputTokens>,
48    pub model: Option<String>,
49    #[serde(default = "audio")]
50    pub output_modalities: Option<Vec<OutputModality>>,
51    pub prompt: Option<ResponsePrompt>,
52    pub tool_choice: Option<RealtimeToolChoiceConfig>,
53    pub tools: Option<Vec<RealtimeToolsConfig>>,
54    pub tracing: Option<RealtimeTracingConfig>,
55    pub truncation: Option<RealtimeTruncation>,
56}
57
58// ============================================================================
59// Transcription Session Configuration
60// ============================================================================
61
62#[serde_with::skip_serializing_none]
63#[derive(Debug, Clone, Serialize, Deserialize)]
64pub struct RealtimeTranscriptionSessionCreateRequest {
65    #[serde(rename = "type")]
66    pub r#type: RealtimeTranscriptionSessionType,
67    pub audio: Option<RealtimeTranscriptionSessionAudio>,
68    pub include: Option<Vec<RealtimeIncludeOption>>,
69}
70
71// ============================================================================
72// Transcription Session Object from Create Response
73// ============================================================================
74
75#[serde_with::skip_serializing_none]
76#[derive(Debug, Clone, Serialize, Deserialize)]
77pub struct RealtimeTranscriptionSessionCreateResponse {
78    pub id: String,
79    pub object: String,
80    #[serde(rename = "type")]
81    pub r#type: RealtimeTranscriptionSessionType,
82    pub audio: Option<RealtimeTranscriptionSessionResponseAudio>,
83    pub expires_at: Option<i64>,
84    pub include: Option<Vec<RealtimeIncludeOption>>,
85}
86
87// ============================================================================
88// Audio Formats
89// ============================================================================
90
91#[derive(Debug, Clone, Serialize, Deserialize)]
92#[serde(tag = "type")]
93pub enum RealtimeAudioFormats {
94    #[serde(rename = "audio/pcm")]
95    Pcm {
96        /// Sample rate. Only 24000 is supported.
97        #[serde(skip_serializing_if = "Option::is_none")]
98        rate: Option<u32>,
99    },
100    #[serde(rename = "audio/pcmu")]
101    Pcmu,
102    #[serde(rename = "audio/pcma")]
103    Pcma,
104}
105
106// ============================================================================
107// Audio Transcription
108// ============================================================================
109
110#[serde_with::skip_serializing_none]
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct AudioTranscription {
113    pub language: Option<String>,
114    pub model: Option<String>,
115    pub prompt: Option<String>,
116}
117
118// ============================================================================
119// Noise Reduction
120// ============================================================================
121
122#[derive(Debug, Clone, Serialize, Deserialize)]
123#[serde(rename_all = "snake_case")]
124pub enum NoiseReductionType {
125    NearField,
126    FarField,
127}
128
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct NoiseReduction {
131    #[serde(rename = "type")]
132    pub r#type: NoiseReductionType,
133}
134
135// ============================================================================
136// Turn Detection
137// ============================================================================
138/// Used only for `semantic_vad` mode. The eagerness of the model to respond.
139/// `low` will wait longer for the user to continue speaking, `high` will respond more quickly.
140/// `auto` is the default and is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, 4s, and 2s respectively.
141#[derive(Debug, Clone, Serialize, Deserialize, Default)]
142#[serde(rename_all = "snake_case")]
143pub enum SemanticVadEagerness {
144    Low,
145    Medium,
146    High,
147    #[default]
148    Auto,
149}
150
151#[serde_with::skip_serializing_none]
152#[derive(Debug, Clone, Serialize, Deserialize)]
153#[serde(tag = "type")]
154pub enum TurnDetection {
155    #[serde(rename = "server_vad")]
156    ServerVad {
157        create_response: Option<bool>,
158        idle_timeout_ms: Option<u32>,
159        interrupt_response: Option<bool>,
160        prefix_padding_ms: Option<u32>,
161        silence_duration_ms: Option<u32>,
162        threshold: Option<f64>,
163    },
164    #[serde(rename = "semantic_vad")]
165    SemanticVad {
166        create_response: Option<bool>,
167        eagerness: Option<SemanticVadEagerness>,
168        interrupt_response: Option<bool>,
169    },
170}
171
172/// Turn detection for transcription sessions. Only `server_vad` is currently supported.
173#[serde_with::skip_serializing_none]
174#[derive(Debug, Clone, Serialize, Deserialize)]
175#[serde(tag = "type")]
176pub enum RealtimeTranscriptionSessionTurnDetection {
177    #[serde(rename = "server_vad")]
178    ServerVad {
179        prefix_padding_ms: Option<u32>,
180        silence_duration_ms: Option<u32>,
181        threshold: Option<f64>,
182    },
183}
184
185// ============================================================================
186// Voice
187// ============================================================================
188
189/// Built-in voice name (e.g. "alloy", "ash") or custom voice reference.
190///
191/// Variant order matters for `#[serde(untagged)]`: a bare JSON string (e.g. `"alloy"`)
192/// always deserializes as `BuiltIn`. A JSON object `{"id": "..."}` fails `BuiltIn`
193/// and falls through to `Custom`. The two forms are structurally distinct (string vs
194/// object) per the OpenAI spec, so there is no ambiguity.
195#[derive(Debug, Clone, Serialize, Deserialize)]
196#[serde(untagged)]
197pub enum Voice {
198    VoiceIDsShared(String),
199    Custom { id: String },
200}
201
202// ============================================================================
203// Output Modality
204// ============================================================================
205
206#[derive(Debug, Clone, Serialize, Deserialize)]
207#[serde(rename_all = "snake_case")]
208pub enum OutputModality {
209    Text,
210    Audio,
211}
212
213#[expect(
214    clippy::unnecessary_wraps,
215    reason = "must return Option to match serde default field type"
216)]
217fn audio() -> Option<Vec<OutputModality>> {
218    Some(vec![OutputModality::Audio])
219}
220
221// ============================================================================
222// Tracing
223// ============================================================================
224
225#[serde_with::skip_serializing_none]
226#[derive(Debug, Clone, Serialize, Deserialize)]
227pub struct TracingConfig {
228    pub group_id: Option<String>,
229    pub metadata: Option<Value>,
230    pub workflow_name: Option<String>,
231}
232
233/// The tracing mode. Always `"auto"`.
234#[derive(Debug, Clone, Serialize, Deserialize)]
235pub enum TracingMode {
236    #[serde(rename = "auto")]
237    Auto,
238}
239
240/// Either the string `"auto"` or a granular tracing configuration.
241#[derive(Debug, Clone, Serialize, Deserialize)]
242#[serde(untagged)]
243pub enum RealtimeTracingConfig {
244    Mode(TracingMode),
245    Config(TracingConfig),
246}
247
248// ============================================================================
249// Connector ID
250// ============================================================================
251
252#[derive(Debug, Clone, Serialize, Deserialize)]
253#[serde(rename_all = "snake_case")]
254#[expect(
255    clippy::enum_variant_names,
256    reason = "variant names match OpenAI Realtime API spec"
257)]
258pub enum ConnectorId {
259    ConnectorDropbox,
260    ConnectorGmail,
261    ConnectorGooglecalendar,
262    ConnectorGoogledrive,
263    ConnectorMicrosoftteams,
264    ConnectorOutlookcalendar,
265    ConnectorOutlookemail,
266    ConnectorSharepoint,
267}
268
269// ============================================================================
270// Tools
271// ============================================================================
272
273#[serde_with::skip_serializing_none]
274#[derive(Debug, Clone, Serialize, Deserialize)]
275#[serde(tag = "type")]
276pub enum RealtimeToolsConfig {
277    #[serde(rename = "function")]
278    RealtimeFunctionTool {
279        description: Option<String>,
280        name: Option<String>,
281        parameters: Option<Value>,
282    },
283    #[serde(rename = "mcp")]
284    McpTool {
285        server_label: String,
286        allowed_tools: Option<McpAllowedTools>,
287        authorization: Option<Redacted>,
288        connector_id: Option<ConnectorId>,
289        headers: Option<HashMap<String, Redacted>>,
290        require_approval: Option<McpToolApproval>,
291        server_description: Option<String>,
292        server_url: Option<String>,
293    },
294}
295
296// ============================================================================
297// MCP Tool Filter
298// ============================================================================
299
300/// List of allowed tool names or a filter object.
301///
302/// Variant order matters for `#[serde(untagged)]`: serde tries `List` first
303/// (JSON array). A JSON object falls through to `Filter`.
304#[derive(Debug, Clone, Serialize, Deserialize)]
305#[serde(untagged)]
306pub enum McpAllowedTools {
307    List(Vec<String>),
308    Filter(McpToolFilter),
309}
310
311/// A filter object to specify which tools are allowed.
312#[serde_with::skip_serializing_none]
313#[derive(Debug, Clone, Serialize, Deserialize)]
314pub struct McpToolFilter {
315    pub read_only: Option<bool>,
316    pub tool_names: Option<Vec<String>>,
317}
318
319/// Approval policy for MCP tools: a filter object or `"always"`/`"never"`.
320///
321/// Variant order matters for `#[serde(untagged)]`: serde tries `Setting` first
322/// (plain string). A JSON object falls through to `Filter`.
323#[derive(Debug, Clone, Serialize, Deserialize)]
324#[serde(untagged)]
325pub enum McpToolApproval {
326    Setting(McpToolApprovalSetting),
327    Filter(McpToolApprovalFilter),
328}
329
330/// Single approval policy for all tools.
331#[derive(Debug, Clone, Serialize, Deserialize)]
332#[serde(rename_all = "snake_case")]
333pub enum McpToolApprovalSetting {
334    Always,
335    Never,
336}
337
338/// Granular approval filter specifying which tools always/never require approval.
339#[serde_with::skip_serializing_none]
340#[derive(Debug, Clone, Serialize, Deserialize)]
341pub struct McpToolApprovalFilter {
342    pub always: Option<McpToolFilter>,
343    pub never: Option<McpToolFilter>,
344}
345
346// ============================================================================
347// Tool Choice
348// ============================================================================
349
350/// `"none"`, `"auto"`, `"required"`, or a specific function/MCP tool reference.
351///
352/// Variant order matters for `#[serde(untagged)]`: serde tries `Options` first
353/// (plain string). A JSON object fails and falls through to `Reference`.
354/// Reuses [`ToolReference`] from `common` for the tagged object forms.
355#[derive(Debug, Clone, Serialize, Deserialize)]
356#[serde(untagged)]
357pub enum RealtimeToolChoiceConfig {
358    Options(ToolChoiceOptions),
359    Reference(ToolReference),
360}
361
362/// Controls which (if any) tool is called by the model.
363#[derive(Debug, Clone, Serialize, Deserialize)]
364#[serde(rename_all = "snake_case")]
365pub enum ToolChoiceOptions {
366    None,
367    Auto,
368    Required,
369}
370
371// ============================================================================
372// Max Output Tokens
373// ============================================================================
374
375/// Integer token limit (1–4096) or `"inf"` for the maximum available tokens.
376/// Defaults to `"inf"`.
377///
378/// Variant order matters for `#[serde(untagged)]`: serde tries `Integer` first.
379/// A JSON number succeeds immediately; the string `"inf"` fails `Integer` and
380/// falls through to `Inf`.
381#[derive(Debug, Clone, Serialize, Deserialize)]
382#[serde(untagged)]
383pub enum MaxOutputTokens {
384    /// An integer between 1 and 4096.
385    Integer(u32),
386    Inf(InfMarker),
387}
388
389impl Default for MaxOutputTokens {
390    fn default() -> Self {
391        Self::Inf(InfMarker::Inf)
392    }
393}
394
395/// The literal string `"inf"`. Used by [`MaxOutputTokens::Inf`].
396#[derive(Debug, Clone, Serialize, Deserialize)]
397pub enum InfMarker {
398    #[serde(rename = "inf")]
399    Inf,
400}
401
402// ============================================================================
403// Truncation
404// ============================================================================
405
406#[serde_with::skip_serializing_none]
407#[derive(Debug, Clone, Serialize, Deserialize)]
408pub struct TruncationTokenLimits {
409    pub post_instructions: Option<u32>,
410}
411
412/// The retention ratio truncation type. Always `"retention_ratio"`.
413#[derive(Debug, Clone, Serialize, Deserialize)]
414pub enum RetentionRatioTruncationType {
415    #[serde(rename = "retention_ratio")]
416    RetentionRatio,
417}
418
419#[serde_with::skip_serializing_none]
420#[derive(Debug, Clone, Serialize, Deserialize)]
421pub struct RetentionRatioTruncation {
422    pub retention_ratio: f64,
423    #[serde(rename = "type")]
424    pub r#type: RetentionRatioTruncationType,
425    pub token_limits: Option<TruncationTokenLimits>,
426}
427
428/// The truncation mode.
429#[derive(Debug, Clone, Serialize, Deserialize, Default)]
430#[serde(rename_all = "snake_case")]
431pub enum TruncationMode {
432    #[default]
433    Auto,
434    Disabled,
435}
436
437/// `"auto"`, `"disabled"`, or a retention ratio configuration.
438#[derive(Debug, Clone, Serialize, Deserialize)]
439#[serde(untagged)]
440pub enum RealtimeTruncation {
441    Mode(TruncationMode),
442    RetentionRatio(RetentionRatioTruncation),
443}
444
445// ============================================================================
446// Client Secret
447// ============================================================================
448
449#[derive(Debug, Clone, Serialize, Deserialize)]
450pub struct RealtimeSessionClientSecret {
451    pub expires_at: i64,
452    pub value: Redacted,
453}
454
455// ============================================================================
456// Audio Configuration
457// ============================================================================
458
459#[serde_with::skip_serializing_none]
460#[derive(Debug, Clone, Serialize, Deserialize)]
461pub struct RealtimeAudioConfigInput {
462    pub format: Option<RealtimeAudioFormats>,
463    pub noise_reduction: Option<NoiseReduction>,
464    pub transcription: Option<AudioTranscription>,
465    pub turn_detection: Option<TurnDetection>,
466}
467
468#[serde_with::skip_serializing_none]
469#[derive(Debug, Clone, Serialize, Deserialize)]
470pub struct RealtimeAudioConfigOutput {
471    pub format: Option<RealtimeAudioFormats>,
472    pub speed: Option<f64>,
473    pub voice: Option<Voice>,
474}
475
476#[serde_with::skip_serializing_none]
477#[derive(Debug, Clone, Serialize, Deserialize)]
478pub struct RealtimeAudioConfig {
479    pub input: Option<RealtimeAudioConfigInput>,
480    pub output: Option<RealtimeAudioConfigOutput>,
481}
482
483#[serde_with::skip_serializing_none]
484#[derive(Debug, Clone, Serialize, Deserialize)]
485pub struct RealtimeTranscriptionSessionAudio {
486    pub input: Option<RealtimeAudioConfigInput>,
487}
488
489#[serde_with::skip_serializing_none]
490#[derive(Debug, Clone, Serialize, Deserialize)]
491pub struct RealtimeTranscriptionSessionResponseAudio {
492    pub input: Option<RealtimeTranscriptionSessionResponseAudioConfigInput>,
493}
494
495#[serde_with::skip_serializing_none]
496#[derive(Debug, Clone, Serialize, Deserialize)]
497pub struct RealtimeTranscriptionSessionResponseAudioConfigInput {
498    pub format: Option<RealtimeAudioFormats>,
499    pub noise_reduction: Option<NoiseReduction>,
500    pub transcription: Option<AudioTranscription>,
501    pub turn_detection: Option<RealtimeTranscriptionSessionTurnDetection>,
502}
503
504// ============================================================================
505// Include Options
506// ============================================================================
507
508#[derive(Debug, Clone, Serialize, Deserialize)]
509pub enum RealtimeIncludeOption {
510    #[serde(rename = "item.input_audio_transcription.logprobs")]
511    InputAudioTranscriptionLogprobs,
512}
513
514// ============================================================================
515// Session Type
516// ============================================================================
517
518/// The type of session. Always `"realtime"` for the Realtime API.
519#[derive(Debug, Clone, Serialize, Deserialize)]
520pub enum RealtimeSessionType {
521    #[serde(rename = "realtime")]
522    Realtime,
523}
524
525// ============================================================================
526// TranscriptionSession Type
527// ============================================================================
528
529/// The type of session. Always `"transcription"` for the Realtime API.
530#[derive(Debug, Clone, Serialize, Deserialize)]
531pub enum RealtimeTranscriptionSessionType {
532    #[serde(rename = "transcription")]
533    Transcription,
534}