1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
//! Session configuration and management structures for the Realtime API.
/// Private namespace holding the session-related types; the items made public are listed in the `mod_interface!` block at the end of this file.
mod private
{
// Use full paths from crate root for components
use crate::components::common::VoiceIdsShared;
use crate::components::tools::Tool;
// Serde imports
use serde::{ Serialize, Deserialize };
use serde_json::Value; // Keep if needed for Value type
/// Configuration for input audio transcription in a Realtime session.
///
/// Every field is optional. A field left as `None` is omitted from the serialized
/// output, because each one carries `skip_serializing_if = "Option::is_none"`.
///
/// # Used By
/// - `RealtimeSession`
/// - `RealtimeSessionCreateResponse`
/// - `RealtimeSessionCreateRequest`
/// - `RealtimeTranscriptionSessionCreateResponse`
/// - `RealtimeTranscriptionSessionCreateRequest`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, former::Former ) ]
pub struct RealtimeSessionInputAudioTranscription
{
/// The model to use for transcription (e.g., "whisper-1").
/// Omitted from the serialized output when `None`.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub model : Option< String >,
/// The language of the input audio (ISO-639-1 format).
/// Stored as a free-form string; no validation is performed by this type.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub language : Option< String >,
/// An optional text prompt to guide the transcription model.
/// Omitted from the serialized output when `None`.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub prompt : Option< String >,
}
/// Configuration for turn detection (VAD) in a Realtime session.
///
/// # Serialization notes
/// - `r#type` is always serialized; when the key is missing on input it is filled
///   in by `default_turn_detection_type` (`"server_vad"`).
/// - `eagerness`, `threshold`, `prefix_padding_ms` and `silence_duration_ms` are
///   omitted from the output when `None`.
/// - `create_response` and `interrupt_response` behave differently from the other
///   optional fields: a missing key deserializes to `Some( true )` (via `default_true`),
///   and only `Some( true )` is skipped on output (via `is_true`). `Some( false )` and
///   `None` are both written out.
///
/// # Used By
/// - `RealtimeSession`
/// - `RealtimeSessionCreateResponse`
/// - `RealtimeSessionCreateRequest`
/// - `RealtimeTranscriptionSessionCreateResponse`
/// - `RealtimeTranscriptionSessionCreateRequest`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, former::Former ) ]
pub struct RealtimeSessionTurnDetection
{
/// Type of turn detection (`server_vad` or `semantic_vad`). Defaults to `server_vad`.
///
/// Written as a raw identifier because `type` is a Rust keyword. The value is a
/// free-form string; this type does not restrict it to the two documented variants.
#[ serde( default = "default_turn_detection_type" ) ]
pub r#type : String,
/// Eagerness for `semantic_vad` (`low`, `medium`, `high`, `auto`). Defaults to `auto`.
/// Omitted from the serialized output when `None`.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub eagerness : Option< String >,
/// Activation threshold for `server_vad` (0.0 to 1.0). Defaults to 0.5.
/// The range is not enforced by this type.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub threshold : Option< f64 >,
/// Audio padding before VAD start for `server_vad` (ms). Defaults to 300ms.
/// Omitted from the serialized output when `None`.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub prefix_padding_ms : Option< i32 >,
/// Silence duration to detect speech stop for `server_vad` (ms). Defaults to 500ms.
/// Omitted from the serialized output when `None`.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub silence_duration_ms : Option< i32 >,
/// Whether to automatically create a response on VAD stop. Defaults to true.
///
/// A missing key deserializes to `Some( true )`. On serialization only `Some( true )`
/// is skipped; `Some( false )` and `None` are written out.
#[ serde( default = "default_true", skip_serializing_if = "is_true" ) ]
pub create_response : Option< bool >,
/// Whether to automatically interrupt ongoing response on VAD start. Defaults to true. Not applicable for transcription sessions.
///
/// Uses the same default / skip rules as `create_response`.
#[ serde( default = "default_true", skip_serializing_if = "is_true" ) ]
pub interrupt_response : Option< bool >,
}
// Serde helper functions (field defaults and a skip predicate) for RealtimeSessionTurnDetection
/// Serde `default` provider for the turn-detection `type` field.
///
/// Returns the owned string `"server_vad"`, used when the key is absent from the
/// deserialized input.
fn default_turn_detection_type() -> String
{
  String::from( "server_vad" )
}
/// Serde `default` provider for optional boolean flags whose default is `true`.
///
/// Always returns `Some( true )`.
// The `Option` wrapper cannot be dropped: a serde `default = "..."` function has to
// return exactly the field type, which is `Option< bool >` for the fields using it.
#[ allow( clippy::unnecessary_wraps ) ]
fn default_true() -> Option< bool >
{
  Option::from( true )
}
/// Serde `skip_serializing_if` predicate: reports whether the flag is exactly `Some( true )`.
///
/// Returns `false` for both `Some( false )` and `None`, so those values are not skipped.
// The by-reference `&Option< bool >` parameter is what `skip_serializing_if` calls with,
// which is why the by-value clippy suggestions are silenced here.
#[ allow( clippy::ref_option, clippy::trivially_copy_pass_by_ref ) ]
fn is_true( val : &Option< bool > ) -> bool
{
  matches!( *val, Some( true ) )
}
/// Configuration for input audio noise reduction.
///
/// Holds a single required field, so it is always present in the serialized output
/// and must be present when deserializing.
///
/// # Used By
/// - `RealtimeSession`
/// - `RealtimeSessionCreateResponse`
/// - `RealtimeSessionCreateRequest`
/// - `RealtimeTranscriptionSessionCreateResponse`
/// - `RealtimeTranscriptionSessionCreateRequest`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, former::Former ) ]
pub struct RealtimeSessionInputAudioNoiseReduction
{
/// Type of noise reduction (`near_field` or `far_field`).
///
/// Written as a raw identifier because `type` is a Rust keyword. The value is a
/// free-form string; this type does not restrict it to the two documented variants.
pub r#type : String,
}
/// Represents the configuration of an active Realtime session.
///
/// `id` and `model` are required. Every other field is optional and is omitted from
/// the serialized output when `None`.
///
/// # Used By
/// - `RealtimeServerEventSessionCreated`
/// - `RealtimeServerEventSessionUpdated`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq ) ]
pub struct RealtimeSession
{
/// Unique identifier for the session.
pub id : String,
/// The set of modalities the model can respond with.
/// Each modality is held as a free-form string.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub modalities : Option< Vec< String > >,
/// The Realtime model used for this session.
pub model : String,
/// The default system instructions for the session.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub instructions : Option< String >,
/// The voice the model uses to respond.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub voice : Option< VoiceIdsShared >,
/// The format of input audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_format : Option< String >,
/// The format of output audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub output_audio_format : Option< String >,
/// Configuration for input audio transcription.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_transcription : Option< RealtimeSessionInputAudioTranscription >,
/// Configuration for turn detection.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub turn_detection : Option< RealtimeSessionTurnDetection >,
/// Configuration for input audio noise reduction.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_noise_reduction : Option< RealtimeSessionInputAudioNoiseReduction >,
/// Tools (functions) available to the model.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tools : Option< Vec< Tool > >,
/// How the model chooses tools.
/// Held as a plain string in this type.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tool_choice : Option< String >,
/// Sampling temperature for the model.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub temperature : Option< f64 >,
/// Maximum number of output tokens for a single assistant response.
/// Kept as an untyped JSON `Value` because it can be an integer or the string "inf".
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub max_response_output_tokens : Option< Value >, // Can be integer or "inf"
}
/// Represents an ephemeral client secret for authenticating Realtime API connections.
///
/// Both fields are required: they are always serialized and must be present when
/// deserializing.
///
/// # Used By
/// - `RealtimeSessionCreateResponse`
/// - `RealtimeTranscriptionSessionCreateResponse`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq ) ]
pub struct RealtimeClientSecret
{
/// Ephemeral key usable in client environments.
pub value : String,
/// Timestamp for when the token expires (Unix seconds).
pub expires_at : i64,
}
/// Response object returned when creating a Realtime session via REST API.
///
/// `client_secret`, `id`, `object` and `model` are required. Every other field is
/// optional and is omitted from the serialized output when `None`.
///
/// # Used By
/// - `/realtime/sessions` (POST)
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq ) ]
pub struct RealtimeSessionCreateResponse
{
/// Ephemeral key for client authentication.
pub client_secret : RealtimeClientSecret,
/// Unique identifier for the session.
pub id : String,
/// The object type, always "realtime.session".
/// Stored as a plain string; the value is not checked by this type.
pub object : String,
/// The set of modalities the model can respond with.
/// Each modality is held as a free-form string.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub modalities : Option< Vec< String > >,
/// The Realtime model used for this session.
pub model : String,
/// The default system instructions for the session.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub instructions : Option< String >,
/// The voice the model uses to respond.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub voice : Option< VoiceIdsShared >,
/// The format of input audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_format : Option< String >,
/// The format of output audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub output_audio_format : Option< String >,
/// Configuration for input audio transcription.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_transcription : Option< RealtimeSessionInputAudioTranscription >,
/// Configuration for turn detection.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub turn_detection : Option< RealtimeSessionTurnDetection >,
/// Configuration for input audio noise reduction.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_noise_reduction : Option< RealtimeSessionInputAudioNoiseReduction >,
/// Tools (functions) available to the model.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tools : Option< Vec< Tool > >,
/// How the model chooses tools.
/// Held as a plain string in this type.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tool_choice : Option< String >,
/// Sampling temperature for the model.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub temperature : Option< f64 >,
/// Maximum number of output tokens for a single assistant response.
/// Kept as an untyped JSON `Value` because it can be an integer or the string "inf".
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub max_response_output_tokens : Option< Value >, // Can be integer or "inf"
}
/// Represents the request body for creating or updating a Realtime session via REST or WebSocket.
///
/// Every field is optional and is omitted from the serialized output when `None`, so
/// only the settings that were explicitly set are sent. The derived `Default` yields
/// a request with every field `None`.
///
/// NOTE(review): `former::Former` is presumably a builder derive — confirm against the
/// `former` crate documentation.
///
/// # Used By
/// - `/realtime/sessions` (POST)
/// - `RealtimeClientEventSessionUpdate`
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, Default, former::Former ) ]
pub struct RealtimeSessionCreateRequest
{
/// The set of modalities the model can respond with.
/// Each modality is held as a free-form string.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub modalities : Option< Vec< String > >,
/// The Realtime model ID.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub model : Option< String >,
/// Default system instructions.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub instructions : Option< String >,
/// Default voice for audio output.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub voice : Option< VoiceIdsShared >,
/// Format of input audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_format : Option< String >,
/// Format of output audio.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub output_audio_format : Option< String >,
/// Input audio transcription configuration.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_transcription : Option< RealtimeSessionInputAudioTranscription >,
/// Turn detection configuration.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub turn_detection : Option< RealtimeSessionTurnDetection >,
/// Input audio noise reduction configuration.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub input_audio_noise_reduction : Option< RealtimeSessionInputAudioNoiseReduction >,
/// Default tools available to the model.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tools : Option< Vec< Tool > >,
/// Default tool choice strategy.
/// Held as a plain string for now.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub tool_choice : Option< String >, // Should ideally be ToolChoice enum
/// Default sampling temperature.
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub temperature : Option< f64 >,
/// Default maximum output tokens.
/// Kept as an untyped JSON `Value` because it can be an integer or the string "inf".
#[ serde( skip_serializing_if = "Option::is_none" ) ]
pub max_response_output_tokens : Option< Value >, // Can be integer or "inf"
}
} // end mod private
// Export list for this module, handed to the crate's `mod_interface!` macro.
// Each of the seven public structs defined in `private` is named under `exposed use`.
// NOTE(review): the exact visibility layers that `exposed use` generates are decided by
// the macro and are not visible in this file — confirm against the `mod_interface` docs.
crate ::mod_interface!
{
exposed use
{
RealtimeSessionInputAudioTranscription,
RealtimeSessionTurnDetection,
RealtimeSessionInputAudioNoiseReduction,
RealtimeSession,
RealtimeClientSecret,
RealtimeSessionCreateResponse,
RealtimeSessionCreateRequest,
};
}