Skip to main content

openai_protocol/
realtime_session.rs

1// OpenAI Realtime Session API types
2// https://platform.openai.com/docs/api-reference/realtime
3
4use std::collections::HashMap;
5
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8use validator::{Validate, ValidationError};
9
10use crate::{
11    common::{Redacted, ResponsePrompt, ToolReference},
12    validated::Normalizable,
13};
14
15// ============================================================================
16// Session Configuration
17// ============================================================================
18
19#[serde_with::skip_serializing_none]
20#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
21#[validate(schema(function = "validate_session_create_request"))]
22pub struct RealtimeSessionCreateRequest {
23    #[serde(rename = "type")]
24    pub r#type: RealtimeSessionType,
25    pub audio: Option<RealtimeAudioConfig>,
26    pub include: Option<Vec<RealtimeIncludeOption>>,
27    pub instructions: Option<String>,
28    pub max_output_tokens: Option<MaxOutputTokens>,
29    pub model: Option<String>,
30    #[serde(default = "audio")]
31    pub output_modalities: Option<Vec<OutputModality>>,
32    pub prompt: Option<ResponsePrompt>,
33    pub tool_choice: Option<RealtimeToolChoiceConfig>,
34    pub tools: Option<RealtimeToolsConfig>,
35    pub tracing: Option<RealtimeTracingConfig>,
36    pub truncation: Option<RealtimeTruncation>,
37}
38
39impl Normalizable for RealtimeSessionCreateRequest {}
40
41fn validate_session_create_request(
42    req: &RealtimeSessionCreateRequest,
43) -> Result<(), ValidationError> {
44    let has_model = req.model.as_deref().is_some_and(|m| !m.trim().is_empty());
45    if !has_model {
46        return Err(ValidationError::new("model is required"));
47    }
48    Ok(())
49}
50
51// ============================================================================
52// Session Object
53// ============================================================================
54
55#[serde_with::skip_serializing_none]
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct RealtimeSessionCreateResponse {
58    pub client_secret: RealtimeSessionClientSecret,
59    #[serde(rename = "type")]
60    pub r#type: RealtimeSessionType,
61    pub audio: Option<RealtimeAudioConfig>,
62    pub include: Option<Vec<RealtimeIncludeOption>>,
63    pub instructions: Option<String>,
64    pub max_output_tokens: Option<MaxOutputTokens>,
65    pub model: Option<String>,
66    #[serde(default = "audio")]
67    pub output_modalities: Option<Vec<OutputModality>>,
68    pub prompt: Option<ResponsePrompt>,
69    pub tool_choice: Option<RealtimeToolChoiceConfig>,
70    pub tools: Option<Vec<RealtimeToolsConfig>>,
71    pub tracing: Option<RealtimeTracingConfig>,
72    pub truncation: Option<RealtimeTruncation>,
73}
74
75// ============================================================================
76// Transcription Session Configuration
77// ============================================================================
78
79#[serde_with::skip_serializing_none]
80#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
81#[validate(schema(function = "validate_transcription_session_create_request"))]
82pub struct RealtimeTranscriptionSessionCreateRequest {
83    #[serde(rename = "type")]
84    pub r#type: RealtimeTranscriptionSessionType,
85    pub audio: Option<RealtimeTranscriptionSessionAudio>,
86    pub include: Option<Vec<RealtimeIncludeOption>>,
87    pub model: Option<String>,
88    pub language: Option<String>,
89    pub prompt: Option<String>,
90}
91
92impl Normalizable for RealtimeTranscriptionSessionCreateRequest {
93    // Use default no-op implementation
94}
95
96fn validate_transcription_session_create_request(
97    req: &RealtimeTranscriptionSessionCreateRequest,
98) -> Result<(), ValidationError> {
99    let has_model = req.model.as_deref().is_some_and(|m| !m.trim().is_empty());
100    if !has_model {
101        return Err(ValidationError::new("model is required"));
102    }
103    Ok(())
104}
105
106// ============================================================================
107// Transcription Session Object from Create Response
108// ============================================================================
109
110#[serde_with::skip_serializing_none]
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct RealtimeTranscriptionSessionCreateResponse {
113    pub id: String,
114    pub object: String,
115    #[serde(rename = "type")]
116    pub r#type: RealtimeTranscriptionSessionType,
117    pub audio: Option<RealtimeTranscriptionSessionResponseAudio>,
118    pub expires_at: Option<i64>,
119    pub include: Option<Vec<RealtimeIncludeOption>>,
120}
121
122// ============================================================================
123// Audio Formats
124// ============================================================================
125
126#[derive(Debug, Clone, Serialize, Deserialize)]
127#[serde(tag = "type")]
128pub enum RealtimeAudioFormats {
129    #[serde(rename = "audio/pcm")]
130    Pcm {
131        /// Sample rate. Only 24000 is supported.
132        #[serde(skip_serializing_if = "Option::is_none")]
133        rate: Option<u32>,
134    },
135    #[serde(rename = "audio/pcmu")]
136    Pcmu,
137    #[serde(rename = "audio/pcma")]
138    Pcma,
139}
140
141// ============================================================================
142// Audio Transcription
143// ============================================================================
144
145#[serde_with::skip_serializing_none]
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct AudioTranscription {
148    pub language: Option<String>,
149    pub model: Option<String>,
150    pub prompt: Option<String>,
151}
152
153// ============================================================================
154// Noise Reduction
155// ============================================================================
156
157#[derive(Debug, Clone, Serialize, Deserialize)]
158#[serde(rename_all = "snake_case")]
159pub enum NoiseReductionType {
160    NearField,
161    FarField,
162}
163
164#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct NoiseReduction {
166    #[serde(rename = "type")]
167    pub r#type: NoiseReductionType,
168}
169
170// ============================================================================
171// Turn Detection
172// ============================================================================
173/// Used only for `semantic_vad` mode. The eagerness of the model to respond.
174/// `low` will wait longer for the user to continue speaking, `high` will respond more quickly.
175/// `auto` is the default and is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, 4s, and 2s respectively.
176#[derive(Debug, Clone, Serialize, Deserialize, Default)]
177#[serde(rename_all = "snake_case")]
178pub enum SemanticVadEagerness {
179    Low,
180    Medium,
181    High,
182    #[default]
183    Auto,
184}
185
186#[serde_with::skip_serializing_none]
187#[derive(Debug, Clone, Serialize, Deserialize)]
188#[serde(tag = "type")]
189pub enum TurnDetection {
190    #[serde(rename = "server_vad")]
191    ServerVad {
192        create_response: Option<bool>,
193        idle_timeout_ms: Option<u32>,
194        interrupt_response: Option<bool>,
195        prefix_padding_ms: Option<u32>,
196        silence_duration_ms: Option<u32>,
197        threshold: Option<f64>,
198    },
199    #[serde(rename = "semantic_vad")]
200    SemanticVad {
201        create_response: Option<bool>,
202        eagerness: Option<SemanticVadEagerness>,
203        interrupt_response: Option<bool>,
204    },
205}
206
207/// Turn detection for transcription sessions. Only `server_vad` is currently supported.
208#[serde_with::skip_serializing_none]
209#[derive(Debug, Clone, Serialize, Deserialize)]
210#[serde(tag = "type")]
211pub enum RealtimeTranscriptionSessionTurnDetection {
212    #[serde(rename = "server_vad")]
213    ServerVad {
214        prefix_padding_ms: Option<u32>,
215        silence_duration_ms: Option<u32>,
216        threshold: Option<f64>,
217    },
218}
219
220// ============================================================================
221// Voice
222// ============================================================================
223
224/// Built-in voice name (e.g. "alloy", "ash") or custom voice reference.
225///
226/// Variant order matters for `#[serde(untagged)]`: a bare JSON string (e.g. `"alloy"`)
227/// always deserializes as `BuiltIn`. A JSON object `{"id": "..."}` fails `BuiltIn`
228/// and falls through to `Custom`. The two forms are structurally distinct (string vs
229/// object) per the OpenAI spec, so there is no ambiguity.
230#[derive(Debug, Clone, Serialize, Deserialize)]
231#[serde(untagged)]
232pub enum Voice {
233    VoiceIDsShared(String),
234    Custom { id: String },
235}
236
237// ============================================================================
238// Output Modality
239// ============================================================================
240
241#[derive(Debug, Clone, Serialize, Deserialize)]
242#[serde(rename_all = "snake_case")]
243pub enum OutputModality {
244    Text,
245    Audio,
246}
247
248#[expect(
249    clippy::unnecessary_wraps,
250    reason = "must return Option to match serde default field type"
251)]
252fn audio() -> Option<Vec<OutputModality>> {
253    Some(vec![OutputModality::Audio])
254}
255
256// ============================================================================
257// Tracing
258// ============================================================================
259
260#[serde_with::skip_serializing_none]
261#[derive(Debug, Clone, Serialize, Deserialize)]
262pub struct TracingConfig {
263    pub group_id: Option<String>,
264    pub metadata: Option<Value>,
265    pub workflow_name: Option<String>,
266}
267
268/// The tracing mode. Always `"auto"`.
269#[derive(Debug, Clone, Serialize, Deserialize)]
270pub enum TracingMode {
271    #[serde(rename = "auto")]
272    Auto,
273}
274
275/// Either the string `"auto"` or a granular tracing configuration.
276#[derive(Debug, Clone, Serialize, Deserialize)]
277#[serde(untagged)]
278pub enum RealtimeTracingConfig {
279    Mode(TracingMode),
280    Config(TracingConfig),
281}
282
283// ============================================================================
284// Connector ID
285// ============================================================================
286
287#[derive(Debug, Clone, Serialize, Deserialize)]
288#[serde(rename_all = "snake_case")]
289#[expect(
290    clippy::enum_variant_names,
291    reason = "variant names match OpenAI Realtime API spec"
292)]
293pub enum ConnectorId {
294    ConnectorDropbox,
295    ConnectorGmail,
296    ConnectorGooglecalendar,
297    ConnectorGoogledrive,
298    ConnectorMicrosoftteams,
299    ConnectorOutlookcalendar,
300    ConnectorOutlookemail,
301    ConnectorSharepoint,
302}
303
304// ============================================================================
305// Tools
306// ============================================================================
307
308#[serde_with::skip_serializing_none]
309#[derive(Debug, Clone, Serialize, Deserialize)]
310#[serde(tag = "type")]
311pub enum RealtimeToolsConfig {
312    #[serde(rename = "function")]
313    RealtimeFunctionTool {
314        description: Option<String>,
315        name: Option<String>,
316        parameters: Option<Value>,
317    },
318    #[serde(rename = "mcp")]
319    McpTool {
320        server_label: String,
321        allowed_tools: Option<McpAllowedTools>,
322        authorization: Option<Redacted>,
323        connector_id: Option<ConnectorId>,
324        headers: Option<HashMap<String, Redacted>>,
325        require_approval: Option<McpToolApproval>,
326        server_description: Option<String>,
327        server_url: Option<String>,
328    },
329}
330
331// ============================================================================
332// MCP Tool Filter
333// ============================================================================
334
335/// List of allowed tool names or a filter object.
336///
337/// Variant order matters for `#[serde(untagged)]`: serde tries `List` first
338/// (JSON array). A JSON object falls through to `Filter`.
339#[derive(Debug, Clone, Serialize, Deserialize)]
340#[serde(untagged)]
341pub enum McpAllowedTools {
342    List(Vec<String>),
343    Filter(McpToolFilter),
344}
345
346/// A filter object to specify which tools are allowed.
347#[serde_with::skip_serializing_none]
348#[derive(Debug, Clone, Serialize, Deserialize)]
349pub struct McpToolFilter {
350    pub read_only: Option<bool>,
351    pub tool_names: Option<Vec<String>>,
352}
353
354/// Approval policy for MCP tools: a filter object or `"always"`/`"never"`.
355///
356/// Variant order matters for `#[serde(untagged)]`: serde tries `Setting` first
357/// (plain string). A JSON object falls through to `Filter`.
358#[derive(Debug, Clone, Serialize, Deserialize)]
359#[serde(untagged)]
360pub enum McpToolApproval {
361    Setting(McpToolApprovalSetting),
362    Filter(McpToolApprovalFilter),
363}
364
365/// Single approval policy for all tools.
366#[derive(Debug, Clone, Serialize, Deserialize)]
367#[serde(rename_all = "snake_case")]
368pub enum McpToolApprovalSetting {
369    Always,
370    Never,
371}
372
373/// Granular approval filter specifying which tools always/never require approval.
374#[serde_with::skip_serializing_none]
375#[derive(Debug, Clone, Serialize, Deserialize)]
376pub struct McpToolApprovalFilter {
377    pub always: Option<McpToolFilter>,
378    pub never: Option<McpToolFilter>,
379}
380
381// ============================================================================
382// Tool Choice
383// ============================================================================
384
385/// `"none"`, `"auto"`, `"required"`, or a specific function/MCP tool reference.
386///
387/// Variant order matters for `#[serde(untagged)]`: serde tries `Options` first
388/// (plain string). A JSON object fails and falls through to `Reference`.
389/// Reuses [`ToolReference`] from `common` for the tagged object forms.
390#[derive(Debug, Clone, Serialize, Deserialize)]
391#[serde(untagged)]
392pub enum RealtimeToolChoiceConfig {
393    Options(ToolChoiceOptions),
394    Reference(ToolReference),
395}
396
397/// Controls which (if any) tool is called by the model.
398#[derive(Debug, Clone, Serialize, Deserialize)]
399#[serde(rename_all = "snake_case")]
400pub enum ToolChoiceOptions {
401    None,
402    Auto,
403    Required,
404}
405
406// ============================================================================
407// Max Output Tokens
408// ============================================================================
409
410/// Integer token limit (1–4096) or `"inf"` for the maximum available tokens.
411/// Defaults to `"inf"`.
412///
413/// Variant order matters for `#[serde(untagged)]`: serde tries `Integer` first.
414/// A JSON number succeeds immediately; the string `"inf"` fails `Integer` and
415/// falls through to `Inf`.
416#[derive(Debug, Clone, Serialize, Deserialize)]
417#[serde(untagged)]
418pub enum MaxOutputTokens {
419    /// An integer between 1 and 4096.
420    Integer(u32),
421    Inf(InfMarker),
422}
423
424impl Default for MaxOutputTokens {
425    fn default() -> Self {
426        Self::Inf(InfMarker::Inf)
427    }
428}
429
430/// The literal string `"inf"`. Used by [`MaxOutputTokens::Inf`].
431#[derive(Debug, Clone, Serialize, Deserialize)]
432pub enum InfMarker {
433    #[serde(rename = "inf")]
434    Inf,
435}
436
437// ============================================================================
438// Truncation
439// ============================================================================
440
441#[serde_with::skip_serializing_none]
442#[derive(Debug, Clone, Serialize, Deserialize)]
443pub struct TruncationTokenLimits {
444    pub post_instructions: Option<u32>,
445}
446
447/// The retention ratio truncation type. Always `"retention_ratio"`.
448#[derive(Debug, Clone, Serialize, Deserialize)]
449pub enum RetentionRatioTruncationType {
450    #[serde(rename = "retention_ratio")]
451    RetentionRatio,
452}
453
454#[serde_with::skip_serializing_none]
455#[derive(Debug, Clone, Serialize, Deserialize)]
456pub struct RetentionRatioTruncation {
457    pub retention_ratio: f64,
458    #[serde(rename = "type")]
459    pub r#type: RetentionRatioTruncationType,
460    pub token_limits: Option<TruncationTokenLimits>,
461}
462
463/// The truncation mode.
464#[derive(Debug, Clone, Serialize, Deserialize, Default)]
465#[serde(rename_all = "snake_case")]
466pub enum TruncationMode {
467    #[default]
468    Auto,
469    Disabled,
470}
471
472/// `"auto"`, `"disabled"`, or a retention ratio configuration.
473#[derive(Debug, Clone, Serialize, Deserialize)]
474#[serde(untagged)]
475pub enum RealtimeTruncation {
476    Mode(TruncationMode),
477    RetentionRatio(RetentionRatioTruncation),
478}
479
480// ============================================================================
481// Client Secret
482// ============================================================================
483
484#[serde_with::skip_serializing_none]
485#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
486#[validate(schema(function = "validate_client_secret_create_request"))]
487pub struct RealtimeClientSecretCreateRequest {
488    pub session: RealtimeSessionCreateRequest,
489}
490
491impl Normalizable for RealtimeClientSecretCreateRequest {}
492
493fn validate_client_secret_create_request(
494    req: &RealtimeClientSecretCreateRequest,
495) -> Result<(), ValidationError> {
496    let has_model = req
497        .session
498        .model
499        .as_deref()
500        .is_some_and(|m| !m.trim().is_empty());
501    if !has_model {
502        return Err(ValidationError::new("session.model is required"));
503    }
504    Ok(())
505}
506
507#[derive(Debug, Clone, Serialize, Deserialize)]
508pub struct RealtimeSessionClientSecret {
509    pub expires_at: i64,
510    pub value: Redacted,
511}
512
513// ============================================================================
514// Audio Configuration
515// ============================================================================
516
517#[serde_with::skip_serializing_none]
518#[derive(Debug, Clone, Serialize, Deserialize)]
519pub struct RealtimeAudioConfigInput {
520    pub format: Option<RealtimeAudioFormats>,
521    pub noise_reduction: Option<NoiseReduction>,
522    pub transcription: Option<AudioTranscription>,
523    pub turn_detection: Option<TurnDetection>,
524}
525
526#[serde_with::skip_serializing_none]
527#[derive(Debug, Clone, Serialize, Deserialize)]
528pub struct RealtimeAudioConfigOutput {
529    pub format: Option<RealtimeAudioFormats>,
530    pub speed: Option<f64>,
531    pub voice: Option<Voice>,
532}
533
534#[serde_with::skip_serializing_none]
535#[derive(Debug, Clone, Serialize, Deserialize)]
536pub struct RealtimeAudioConfig {
537    pub input: Option<RealtimeAudioConfigInput>,
538    pub output: Option<RealtimeAudioConfigOutput>,
539}
540
541#[serde_with::skip_serializing_none]
542#[derive(Debug, Clone, Serialize, Deserialize)]
543pub struct RealtimeTranscriptionSessionAudio {
544    pub input: Option<RealtimeAudioConfigInput>,
545}
546
547#[serde_with::skip_serializing_none]
548#[derive(Debug, Clone, Serialize, Deserialize)]
549pub struct RealtimeTranscriptionSessionResponseAudio {
550    pub input: Option<RealtimeTranscriptionSessionResponseAudioConfigInput>,
551}
552
553#[serde_with::skip_serializing_none]
554#[derive(Debug, Clone, Serialize, Deserialize)]
555pub struct RealtimeTranscriptionSessionResponseAudioConfigInput {
556    pub format: Option<RealtimeAudioFormats>,
557    pub noise_reduction: Option<NoiseReduction>,
558    pub transcription: Option<AudioTranscription>,
559    pub turn_detection: Option<RealtimeTranscriptionSessionTurnDetection>,
560}
561
562// ============================================================================
563// Include Options
564// ============================================================================
565
566#[derive(Debug, Clone, Serialize, Deserialize)]
567pub enum RealtimeIncludeOption {
568    #[serde(rename = "item.input_audio_transcription.logprobs")]
569    InputAudioTranscriptionLogprobs,
570}
571
572// ============================================================================
573// Session Type
574// ============================================================================
575
576/// The type of session. Always `"realtime"` for the Realtime API.
577#[derive(Debug, Clone, Serialize, Deserialize)]
578pub enum RealtimeSessionType {
579    #[serde(rename = "realtime")]
580    Realtime,
581}
582
583// ============================================================================
584// TranscriptionSession Type
585// ============================================================================
586
587/// The type of session. Always `"transcription"` for the Realtime API.
588#[derive(Debug, Clone, Serialize, Deserialize)]
589pub enum RealtimeTranscriptionSessionType {
590    #[serde(rename = "transcription")]
591    Transcription,
592}