1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//! Protocol definitions for Gradium API communication.
//!
//! This module contains the types and structures used for communicating with the
//! Gradium API via WebSocket connections.
/// Audio encoding exchanged with the API.
///
/// Used as the TTS output format (`tts::Setup::output_format`) and as the STT
/// input format (`stt::Setup::input_format`).
///
/// Derives `PartialEq`, `Eq` and `Hash` so formats can be compared directly and
/// used as map/set keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum AudioFormat {
    /// Raw PCM audio data
    Pcm,
    /// WAV file format
    Wav,
    /// Opus compressed audio
    Opus,
    /// Any other format name, stored verbatim.
    Other(String),
}
impl<'de> serde::Deserialize<'de> for AudioFormat {
    /// Reads a string and maps it to an [`AudioFormat`].
    ///
    /// `"pcm"`, `"wav"` and `"opus"` map to their dedicated variants (matching is
    /// case-sensitive); any other string is kept verbatim in
    /// [`AudioFormat::Other`].
    ///
    /// # Errors
    ///
    /// Returns the deserializer's error when the input is not a string.
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Fully qualified so the call does not depend on `serde::Deserialize`
        // being imported into the surrounding module.
        let name = <String as serde::Deserialize>::deserialize(deserializer)?;
        Ok(match name.as_str() {
            "pcm" => AudioFormat::Pcm,
            "wav" => AudioFormat::Wav,
            "opus" => AudioFormat::Opus,
            // Reuse the already-owned string instead of allocating a copy.
            _ => AudioFormat::Other(name),
        })
    }
}
impl serde::Serialize for AudioFormat {
    /// Writes the format as a plain string.
    ///
    /// The named variants become `"pcm"`, `"wav"` and `"opus"`;
    /// [`AudioFormat::Other`] is written as the string it holds.
    ///
    /// # Errors
    ///
    /// Returns whatever error the serializer reports for `serialize_str`.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Borrow the name rather than allocating a fresh `String` per call.
        let name: &str = match self {
            AudioFormat::Pcm => "pcm",
            AudioFormat::Wav => "wav",
            AudioFormat::Opus => "opus",
            AudioFormat::Other(custom) => custom.as_str(),
        };
        serializer.serialize_str(name)
    }
}
/// Text-to-speech protocol types.
///
/// This module contains request and response types for the TTS WebSocket API.
pub mod tts {
/// Configuration for initializing a TTS session.
///
/// Fields annotated with `skip_serializing_if = "Option::is_none"` are left out
/// of the serialized message when they are `None` and default to `None` when
/// absent from the input. `voice` and `voice_id` carry no such attribute, so
/// they are always written by the serializer.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Setup {
/// The name of the TTS model to use (e.g., "default")
pub model_name: String,
/// Optional voice name to use
pub voice: Option<String>,
/// Optional voice ID for custom voices
pub voice_id: Option<String>,
/// The desired audio output format
pub output_format: super::AudioFormat,
/// Optional pronunciation dictionary ID to customize word pronunciation
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pronunciation_id: Option<String>,
/// Custom config for the TTS model: a JSON document carried as a string
#[serde(default, skip_serializing_if = "Option::is_none")]
pub json_config: Option<String>,
/// Optional client request ID for multiplexing multiple requests over a single WebSocket
#[serde(default, skip_serializing_if = "Option::is_none")]
pub client_req_id: Option<String>,
/// When set to `false`, the server will NOT close the WebSocket after EndOfStream,
/// allowing additional requests on the same connection (multiplexing).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub close_ws_on_eos: Option<bool>,
}
impl Default for Setup {
fn default() -> Self {
Self {
model_name: "default".to_string(),
voice: None,
voice_id: None,
output_format: super::AudioFormat::Pcm,
pronunciation_id: None,
json_config: None,
client_req_id: None,
close_ws_on_eos: None,
}
}
}
impl Setup {
    /// Creates a `Setup` for the given voice ID, leaving every other field at
    /// its [`Default`] value.
    pub fn new(voice_id: &str) -> Self {
        Self {
            voice_id: Some(voice_id.to_owned()),
            ..Self::default()
        }
    }

    /// Returns the setup with `model_name` replaced.
    pub fn with_model_name(self, model_name: &str) -> Self {
        Self {
            model_name: model_name.to_owned(),
            ..self
        }
    }

    /// Returns the setup with `voice_id` set to the given ID.
    pub fn with_voice_id(self, voice_id: &str) -> Self {
        Self {
            voice_id: Some(voice_id.to_owned()),
            ..self
        }
    }

    /// Returns the setup with `output_format` replaced.
    pub fn with_output_format(self, output_format: super::AudioFormat) -> Self {
        Self {
            output_format,
            ..self
        }
    }

    /// Returns the setup with `pronunciation_id` set to the given ID.
    pub fn with_pronunciation_id(self, pronunciation_id: &str) -> Self {
        Self {
            pronunciation_id: Some(pronunciation_id.to_owned()),
            ..self
        }
    }

    /// Returns the setup with `json_config` set to the string rendering
    /// (`to_string()`) of the given JSON value.
    pub fn with_json_config(self, json_config: &serde_json::Value) -> Self {
        let rendered = json_config.to_string();
        Self {
            json_config: Some(rendered),
            ..self
        }
    }

    /// Returns the setup with the client request ID used for multiplexing.
    pub fn with_client_req_id(self, client_req_id: &str) -> Self {
        Self {
            client_req_id: Some(client_req_id.to_owned()),
            ..self
        }
    }

    /// Returns the setup with `close_ws_on_eos` set. When `false`, the server
    /// keeps the WebSocket open after EndOfStream, enabling multiplexing.
    pub fn with_close_ws_on_eos(self, close: bool) -> Self {
        Self {
            close_ws_on_eos: Some(close),
            ..self
        }
    }
}
/// Text to be synthesized into speech.
///
/// Payload of the `Request::Text` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Text {
/// The text content to synthesize
pub text: String,
/// Optional client request ID for multiplexing; omitted from the serialized
/// message when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
pub client_req_id: Option<String>,
}
/// Client-to-server request messages.
///
/// Serialized as an internally tagged object: a `type` field holds the
/// snake_case variant name (`setup`, `text`, `end_of_stream`) next to the
/// variant's own fields.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Request {
/// Initialize the TTS session with configuration
Setup(Setup),
/// Send text to be synthesized
Text(Text),
/// Signal end of input stream
EndOfStream {
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
client_req_id: Option<String>,
},
}
/// Server response indicating the session is ready.
///
/// Payload of the `Response::Ready` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Ready {
/// The model name being used
pub model_name: String,
/// Audio sample rate in Hz
pub sample_rate: u32,
/// Audio frame size in samples
pub frame_size: u32,
/// Names of available audio streams
pub audio_stream_names: Vec<String>,
/// Names of available text streams
pub text_stream_names: Vec<String>,
/// Request ID for tracking (a required field: deserialization fails without it)
pub request_id: String,
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
pub client_req_id: Option<String>,
}
/// Audio data response from the server.
///
/// Payload of the `Response::Audio` message. The audio bytes travel as a
/// base64 string; `raw_audio` decodes them.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Audio {
/// Base64-encoded audio data
pub audio: String,
/// Start time of this audio segment in seconds
pub start_s: f64,
/// End time of this audio segment in seconds
pub stop_s: f64,
/// Stream identifier
pub stream_id: u32,
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
pub client_req_id: Option<String>,
}
impl Audio {
/// Decodes the base64-encoded audio data into raw bytes.
///
/// # Returns
///
/// The decoded audio data as a `Vec<u8>`
///
/// # Errors
///
/// Returns an error if the base64 decoding fails
pub fn raw_audio(&self) -> anyhow::Result<Vec<u8>> {
use base64::prelude::*;
Ok(base64::engine::general_purpose::STANDARD.decode(&self.audio)?)
}
}
/// Text response with timing information.
///
/// Payload of the `Response::Text` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextResponse {
/// The text content
pub text: String,
/// Start time in seconds
pub start_s: f64,
/// End time in seconds
pub stop_s: f64,
/// Stream identifier
pub stream_id: u32,
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
pub client_req_id: Option<String>,
}
/// Server-to-client response messages.
///
/// Deserialized from an internally tagged object: the `type` field selects the
/// variant by its snake_case name (`ready`, `audio`, `text`, `error`,
/// `end_of_stream`).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Response {
/// Session is ready for text input
Ready(Ready),
/// Audio data chunk
Audio(Audio),
/// Text with timing information
Text(TextResponse),
/// Error occurred during processing
Error {
/// Error code, if one was provided
code: Option<i64>,
/// Error message
message: String,
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
client_req_id: Option<String>,
},
/// End of output stream
EndOfStream {
/// Optional client request ID for multiplexing; omitted when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
client_req_id: Option<String>,
},
}
}
/// Speech-to-text protocol types.
///
/// This module contains request and response types for the STT (ASR) WebSocket API.
pub mod stt {
/// Configuration for initializing an STT session.
///
/// Payload of the `Request::Setup` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Setup {
/// The name of the STT model to use (e.g., "default")
pub model_name: String,
/// The desired audio input format (how the client will send audio)
pub input_format: super::AudioFormat,
/// Custom config for the STT model: a JSON document carried as a string;
/// omitted from the serialized message when `None`
#[serde(default, skip_serializing_if = "Option::is_none")]
pub json_config: Option<String>,
}
impl Default for Setup {
fn default() -> Self {
Self {
model_name: "default".to_string(),
input_format: super::AudioFormat::Pcm,
json_config: None,
}
}
}
impl Setup {
    /// Creates a `Setup` populated with the [`Default`] values.
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Returns the setup with `model_name` replaced.
    pub fn with_model_name(self, model_name: &str) -> Self {
        Self {
            model_name: model_name.to_owned(),
            ..self
        }
    }

    /// Returns the setup with `input_format` replaced.
    pub fn with_input_format(self, input_format: super::AudioFormat) -> Self {
        Self {
            input_format,
            ..self
        }
    }

    /// Returns the setup with `json_config` set to the string rendering
    /// (`to_string()`) of the given JSON value.
    pub fn with_json_config(self, json_config: &serde_json::Value) -> Self {
        let rendered = json_config.to_string();
        Self {
            json_config: Some(rendered),
            ..self
        }
    }
}
/// Server response indicating the STT session is ready.
///
/// Payload of the `Response::Ready` message. Every field is required when
/// deserializing.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Ready {
/// The model name being used
pub model_name: String,
/// Audio sample rate in Hz (expected input rate)
pub sample_rate: u32,
/// Audio frame size in samples (recommended chunk size)
pub frame_size: u32,
/// Delay in frames before transcription begins (a float, so it may be fractional)
pub delay_in_frames: f64,
/// Names of available text streams
pub text_stream_names: Vec<String>,
/// Request ID for tracking this session
pub request_id: String,
}
/// Audio data sent from client to server for transcription.
///
/// Payload of the `Request::Audio` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Audio {
/// Raw audio data bytes, base64 encoded.
/// NOTE(review): presumably encoded in the `input_format` announced in `Setup` — confirm.
pub audio: String,
}
/// Client-to-server request messages.
///
/// Serialized as an internally tagged object: a `type` field holds the
/// snake_case variant name (`setup`, `audio`, `end_of_stream`) next to the
/// variant's own fields.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Request {
/// Initialize the STT session with configuration
Setup(Setup),
/// Send audio data for transcription
Audio(Audio),
/// Signal end of audio input stream (carries no fields besides the tag)
EndOfStream,
}
/// One entry of the `vad` list in a [`Vad`] message.
///
/// NOTE(review): the field meanings below are inferred from their names only —
/// confirm against the API documentation.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct VadPrediction {
/// Presumably the prediction horizon, in seconds (per the `_s` suffix) — TODO confirm
pub horizon_s: f64,
/// Presumably the predicted probability of inactivity for that horizon — TODO confirm
pub inactivity_prob: f64,
}
/// Voice Activity Detection information.
///
/// Payload of the `Response::Vad` message, which is tagged `step` on the wire.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Vad {
/// Current step index
pub step_idx: u64,
/// Duration of this step in seconds
pub step_duration_s: f64,
/// Total duration processed in seconds
pub total_duration_s: f64,
/// VAD predictions attached to this step
pub vad: Vec<VadPrediction>,
}
/// Transcribed text with timing information.
///
/// Payload of the `Response::Text` message. Only the start time is carried
/// here; the stop time arrives separately in [`EndText`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Text {
/// The transcribed text content
pub text: String,
/// Start time of this text segment in seconds
pub start_s: f64,
/// Stream identifier
pub stream_id: u32,
}
/// Marks the end of a text segment with final timing.
///
/// Payload of the `Response::EndText` message.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EndText {
/// End time of the text segment in seconds
pub stop_s: f64,
/// Stream identifier
pub stream_id: u32,
}
/// Server-to-client response messages.
///
/// Deserialized from an internally tagged object: the `type` field selects the
/// variant. Tags are the snake_case variant names (`ready`, `text`,
/// `end_text`, `error`, `end_of_stream`), except `Vad`, which is renamed to
/// `step`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum Response {
/// Session is ready for audio input
Ready(Ready),
/// Voice activity detection update (tagged `step` on the wire)
#[serde(rename = "step")]
Vad(Vad),
/// Transcribed text segment
Text(Text),
/// End of a text segment with final timing
EndText(EndText),
/// Error occurred during processing
Error {
/// Error code (required here, unlike the optional code in `tts::Response::Error`)
code: i64,
/// Error message
message: String,
},
/// End of output stream
EndOfStream,
}
}
/// Response payload carrying a credit balance.
///
/// NOTE(review): presumably returned by a credits/billing endpoint — the caller
/// is not visible here; confirm.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CreditsResponse {
/// Remaining credits (signed integer)
pub remaining_credits: i64,
}
/// Response payload summarizing usage counters.
///
/// NOTE(review): field meanings are inferred from their names; the units of the
/// duration and size fields are not visible here (presumably seconds and
/// characters or bytes) — confirm against the API documentation.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct UsageResponse {
/// Credits consumed
pub consumed_credits: i64,
/// Duration of outgoing (egress) audio
pub egress_audio_duration: f64,
/// Number of outgoing (egress) messages
pub egress_messages: i64,
/// Size of outgoing (egress) text
pub egress_text_size: i64,
/// Duration of incoming (ingress) audio
pub ingress_audio_duration: f64,
/// Number of incoming (ingress) messages
pub ingress_messages: i64,
/// Size of incoming (ingress) text
pub ingress_text_size: i64,
/// Number of sessions
pub sessions: i64,
}