outfox_openai/spec/realtime/
session_resource.rs

1use serde::{Deserialize, Serialize};
2
3#[derive(Debug, Serialize, Deserialize, Clone)]
4pub enum AudioFormat {
5    #[serde(rename = "pcm16")]
6    PCM16,
7    #[serde(rename = "g711_law")]
8    G711ULAW,
9    #[serde(rename = "g711_alaw")]
10    G711ALAW,
11}
12
13#[derive(Debug, Default, Serialize, Deserialize, Clone)]
14pub struct AudioTranscription {
15    /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
16    #[serde(skip_serializing_if = "Option::is_none")]
17    pub language: Option<String>,
18    /// The model to use for transcription, current options are gpt-4o-transcribe, gpt-4o-mini-transcribe, and whisper-1.
19    #[serde(skip_serializing_if = "Option::is_none")]
20    pub model: Option<String>,
21    /// An optional text to guide the model's style or continue a previous audio segment.
22    /// For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models,
23    /// the prompt is a free text string, for example "expect words related to technology".
24    #[serde(skip_serializing_if = "Option::is_none")]
25    pub prompt: Option<String>,
26}
27
28#[derive(Debug, Serialize, Deserialize, Clone)]
29#[serde(tag = "type")]
30pub enum TurnDetection {
31    /// Type of turn detection, only "server_vad" is currently supported.
32    #[serde(rename = "server_vad")]
33    ServerVAD {
34        /// Activation threshold for VAD (0.0 to 1.0).
35        threshold: f32,
36        /// Amount of audio to include before speech starts (in milliseconds).
37        prefix_padding_ms: u32,
38        /// Duration of silence to detect speech stop (in milliseconds).
39        silence_duration_ms: u32,
40
41        /// Whether or not to automatically generate a response when a VAD stop event occurs.
42        #[serde(skip_serializing_if = "Option::is_none")]
43        create_response: Option<bool>,
44
45        /// Whether or not to automatically interrupt any ongoing response with output to
46        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
47        #[serde(skip_serializing_if = "Option::is_none")]
48        interrupt_response: Option<bool>,
49    },
50
51    #[serde(rename = "semantic_vad")]
52    SemanticVAD {
53        /// The eagerness of the model to respond.
54        /// `low` will wait longer for the user to continue speaking,
55        /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium`
56        eagerness: String,
57
58        /// Whether or not to automatically generate a response when a VAD stop event occurs.
59        #[serde(skip_serializing_if = "Option::is_none", default)]
60        create_response: Option<bool>,
61
62        /// Whether or not to automatically interrupt any ongoing response with output to
63        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
64        #[serde(skip_serializing_if = "Option::is_none", default)]
65        interrupt_response: Option<bool>,
66    },
67}
68
69#[derive(Debug, Serialize, Deserialize, Clone)]
70pub enum MaxResponseOutputTokens {
71    #[serde(rename = "inf")]
72    Inf,
73    #[serde(untagged)]
74    Num(u16),
75}
76
77#[derive(Debug, Serialize, Deserialize, Clone)]
78#[serde(tag = "type")]
79pub enum ToolDefinition {
80    #[serde(rename = "function")]
81    Function {
82        /// The name of the function.
83        name: String,
84        /// The description of the function.
85        description: String,
86        /// Parameters of the function in JSON Schema.
87        parameters: serde_json::Value,
88    },
89}
90
91#[derive(Debug, Serialize, Deserialize, Clone)]
92#[serde(rename_all = "lowercase")]
93pub enum FunctionType {
94    Function,
95}
96
97#[derive(Debug, Serialize, Deserialize, Clone)]
98#[serde(rename_all = "lowercase")]
99pub enum ToolChoice {
100    Auto,
101    None,
102    Required,
103    #[serde(untagged)]
104    Function {
105    #[serde(rename = "type")]
106        kind: FunctionType,
107        name: String,
108    },
109}
110
111#[derive(Debug, Serialize, Deserialize, Clone)]
112#[serde(rename_all = "lowercase")]
113pub enum RealtimeVoice {
114    Alloy,
115    Ash,
116    Ballad,
117    Coral,
118    Echo,
119    Fable,
120    Onyx,
121    Nova,
122    Shimmer,
123    Verse,
124}
125
126#[derive(Debug, Serialize, Deserialize, Clone, Default)]
127pub struct SessionResource {
128    /// The default model used for this session.
129    #[serde(skip_serializing_if = "Option::is_none")]
130    pub model: Option<String>,
131
132    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
133    #[serde(skip_serializing_if = "Option::is_none")]
134    pub modalities: Option<Vec<String>>,
135
136    //// The default system instructions prepended to model calls.
137    #[serde(skip_serializing_if = "Option::is_none")]
138    pub instructions: Option<String>,
139
140    /// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
141    #[serde(skip_serializing_if = "Option::is_none")]
142    pub voice: Option<RealtimeVoice>,
143
144    /// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
145    #[serde(skip_serializing_if = "Option::is_none")]
146    pub input_audio_format: Option<AudioFormat>,
147
148    /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
149    #[serde(skip_serializing_if = "Option::is_none")]
150    pub output_audio_format: Option<AudioFormat>,
151
152    /// Configuration for input audio transcription. Can be set to null to turn off.
153    #[serde(skip_serializing_if = "Option::is_none")]
154    pub input_audio_transcription: Option<AudioTranscription>,
155
156    /// Configuration for turn detection. Can be set to null to turn off.
157    #[serde(skip_serializing_if = "Option::is_none")]
158    pub turn_detection: Option<TurnDetection>,
159
160    /// Tools (functions) available to the model.
161    #[serde(skip_serializing_if = "Option::is_none")]
162    pub tools: Option<Vec<ToolDefinition>>,
163
164    #[serde(skip_serializing_if = "Option::is_none")]
165    /// How the model chooses tools.
166    pub tool_choice: Option<ToolChoice>,
167
168    #[serde(skip_serializing_if = "Option::is_none")]
169    /// Sampling temperature for the model.
170    pub temperature: Option<f32>,
171
172    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
173    /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
174    /// Defaults to "inf".
175    #[serde(skip_serializing_if = "Option::is_none")]
176    pub max_response_output_tokens: Option<MaxResponseOutputTokens>,
177}