1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
use serde::{Deserialize, Serialize};
/// Audio encodings selectable for a session's input and output audio.
///
/// Each variant is (de)serialized as the wire name given in its `rename`
/// attribute.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum AudioFormat {
    /// Serialized as `"pcm16"`.
    #[serde(rename = "pcm16")]
    PCM16,
    /// Serialized as `"g711_ulaw"`.
    ///
    /// Earlier versions of this type wrote the misspelled name `"g711_law"`;
    /// that spelling is still accepted when deserializing so previously
    /// stored values keep loading, but it is never emitted.
    #[serde(rename = "g711_ulaw", alias = "g711_law")]
    G711ULAW,
    /// Serialized as `"g711_alaw"`.
    #[serde(rename = "g711_alaw")]
    G711ALAW,
}
/// Settings for transcribing input audio.
///
/// Every field is optional. A field left as `None` is omitted from the
/// serialized output, and `Default` yields a value with all fields unset.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct AudioTranscription {
    /// Language of the input audio, as an ISO-639-1 code (for example `en`).
    /// Providing it improves transcription accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Name of the transcription model. Current options are
    /// `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// Optional text that guides the model's style or continues a previous
    /// audio segment. With `whisper-1` the prompt is a list of keywords; with
    /// the `gpt-4o-transcribe` models it is free text, for example
    /// "expect words related to technology".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
}
/// Turn-detection configuration.
///
/// The enum is internally tagged: the serialized object carries a `"type"`
/// key whose value (`"server_vad"` or `"semantic_vad"`) selects the variant,
/// alongside that variant's own fields.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
pub enum TurnDetection {
    /// Threshold-based VAD settings, tagged `"type": "server_vad"`.
    #[serde(rename = "server_vad")]
    ServerVAD {
        /// Activation threshold for VAD (0.0 to 1.0).
        threshold: f32,

        /// How much audio to include before speech starts, in milliseconds.
        prefix_padding_ms: u32,

        /// How long a silence must last to count as the end of speech, in
        /// milliseconds.
        silence_duration_ms: u32,

        /// Whether a response is generated automatically when a VAD stop
        /// event occurs. Omitted from the output when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        create_response: Option<bool>,

        /// Whether an ongoing response with output to the default
        /// conversation (i.e. conversation of `auto`) is interrupted
        /// automatically when a VAD start event occurs. Omitted from the
        /// output when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        interrupt_response: Option<bool>,
    },

    /// Eagerness-based VAD settings, tagged `"type": "semantic_vad"`.
    #[serde(rename = "semantic_vad")]
    SemanticVAD {
        /// How eager the model is to respond. `low` waits longer for the
        /// user to continue speaking, `high` responds more quickly, and
        /// `auto` is the default and is equivalent to `medium`.
        eagerness: String,

        /// Whether a response is generated automatically when a VAD stop
        /// event occurs. Omitted from the output when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        create_response: Option<bool>,

        /// Whether an ongoing response with output to the default
        /// conversation (i.e. conversation of `auto`) is interrupted
        /// automatically when a VAD start event occurs. Omitted from the
        /// output when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        interrupt_response: Option<bool>,
    },
}
/// Limit on the number of output tokens for a response.
///
/// On the wire this is either the string `"inf"` or a bare integer.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub enum MaxResponseOutputTokens {
    /// No explicit limit; serialized as the string `"inf"`.
    #[serde(rename = "inf")]
    Inf,

    /// An explicit limit; serialized as a plain number because the variant
    /// is marked `untagged`.
    #[serde(untagged)]
    Num(u16),
}
/// Definition of a tool that can be offered to the model.
///
/// Internally tagged: the serialized object carries a `"type"` key naming
/// the variant next to the variant's fields.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
pub enum ToolDefinition {
    /// A callable function, tagged `"type": "function"`.
    #[serde(rename = "function")]
    Function {
        /// Name of the function.
        name: String,

        /// Description of the function.
        description: String,

        /// The function's parameters, expressed as JSON Schema.
        parameters: serde_json::Value,
    },
}
/// Marker for the `"type"` value of a function-style tool choice.
///
/// Variant names are lowercased on the wire.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum FunctionType {
    /// Serialized as `"function"`.
    Function,
}
/// How the model chooses tools.
///
/// The unit variants serialize as their lowercased names. The `Function`
/// variant is untagged, so it serializes as a plain object holding `type`
/// and `name`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum ToolChoice {
    /// Serialized as `"auto"`.
    Auto,

    /// Serialized as `"none"`.
    None,

    /// Serialized as `"required"`.
    Required,

    /// A specific function identified by name.
    #[serde(untagged)]
    Function {
        /// Written under the key `type`.
        #[serde(rename = "type")]
        kind: FunctionType,

        /// Name of the function.
        name: String,
    },
}
/// Voices that can be selected for a session.
///
/// Each variant is (de)serialized as its lowercased name, e.g. `"alloy"`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeVoice {
    /// Serialized as `"alloy"`.
    Alloy,
    /// Serialized as `"ash"`.
    Ash,
    /// Serialized as `"ballad"`.
    Ballad,
    /// Serialized as `"coral"`.
    Coral,
    /// Serialized as `"echo"`.
    Echo,
    /// Serialized as `"fable"`.
    Fable,
    /// Serialized as `"onyx"`.
    Onyx,
    /// Serialized as `"nova"`.
    Nova,
    /// Serialized as `"shimmer"`.
    Shimmer,
    /// Serialized as `"verse"`.
    Verse,
}
/// Session configuration.
///
/// Every field is optional. A field left as `None` is skipped during
/// serialization (it is not written as `null`), and `Default` yields a value
/// with all fields unset.
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct SessionResource {
    /// The default model used for this session.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// The set of modalities the model can respond with. To disable audio,
    /// set this to `["text"]`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub modalities: Option<Vec<String>>,

    /// The default system instructions prepended to model calls.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The voice the model uses to respond. Cannot be changed once the model
    /// has responded with audio at least once.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<RealtimeVoice>,

    /// The format of input audio. Options are `"pcm16"`, `"g711_ulaw"`, or
    /// `"g711_alaw"`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_audio_format: Option<AudioFormat>,

    /// The format of output audio. Options are `"pcm16"`, `"g711_ulaw"`, or
    /// `"g711_alaw"`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_audio_format: Option<AudioFormat>,

    /// Configuration for input audio transcription.
    ///
    /// `None` leaves the key out of the serialized payload entirely; this
    /// type cannot emit an explicit `null`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_audio_transcription: Option<AudioTranscription>,

    /// Configuration for turn detection.
    ///
    /// `None` leaves the key out of the serialized payload entirely; this
    /// type cannot emit an explicit `null`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub turn_detection: Option<TurnDetection>,

    /// Tools (functions) available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ToolDefinition>>,

    /// How the model chooses tools.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Sampling temperature for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Maximum number of output tokens for a single assistant response,
    /// inclusive of tool calls. Provide an integer between 1 and 4096 to
    /// limit output tokens, or `"inf"` for the maximum available tokens for a
    /// given model. Defaults to `"inf"`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_response_output_tokens: Option<MaxResponseOutputTokens>,
}