Skip to main content

roder_protocol/
speech.rs

1use std::collections::BTreeMap;
2
3use roder_api::inference::ProviderAuthType;
4use roder_api::speech::{
5    SpeechCapabilities, SpeechModelDescriptor, SpeechSegment, SpeechSynthesisCapabilities,
6    SpeechSynthesisModelDescriptor,
7};
8use serde::{Deserialize, Serialize};
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
11#[serde(rename_all = "camelCase")]
12pub struct SpeechProviderDescriptor {
13    pub id: String,
14    pub name: String,
15    pub description: Option<String>,
16    pub auth_type: ProviderAuthType,
17    pub auth_label: Option<String>,
18    pub authenticated: bool,
19    pub auth_detail: Option<String>,
20    pub recommended: bool,
21    pub sort_order: i32,
22    pub capabilities: SpeechCapabilities,
23    pub models: Vec<SpeechModelDescriptor>,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27#[serde(rename_all = "camelCase")]
28pub struct SpeechProvidersListResult {
29    pub providers: Vec<SpeechProviderDescriptor>,
30}
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
33#[serde(rename_all = "camelCase")]
34pub struct SpeechSynthesisProviderDescriptor {
35    pub id: String,
36    pub name: String,
37    pub description: Option<String>,
38    pub auth_type: ProviderAuthType,
39    pub auth_label: Option<String>,
40    pub authenticated: bool,
41    pub auth_detail: Option<String>,
42    pub recommended: bool,
43    pub sort_order: i32,
44    pub capabilities: SpeechSynthesisCapabilities,
45    pub models: Vec<SpeechSynthesisModelDescriptor>,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
49#[serde(rename_all = "camelCase")]
50pub struct SpeechSynthesisProvidersListResult {
51    pub providers: Vec<SpeechSynthesisProviderDescriptor>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
55#[serde(rename_all = "camelCase")]
56pub struct SpeechAudioPayload {
57    pub bytes_base64: String,
58    pub mime_type: String,
59    pub filename: Option<String>,
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize)]
63#[serde(rename_all = "camelCase")]
64pub struct SpeechTranscribeParams {
65    #[serde(default, skip_serializing_if = "Option::is_none")]
66    pub provider: Option<String>,
67    #[serde(default, skip_serializing_if = "Option::is_none")]
68    pub model: Option<String>,
69    pub audio: SpeechAudioPayload,
70    #[serde(default, skip_serializing_if = "Option::is_none")]
71    pub language: Option<String>,
72    #[serde(default, skip_serializing_if = "Option::is_none")]
73    pub prompt: Option<String>,
74    #[serde(default)]
75    pub diarization: bool,
76    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
77    pub metadata: BTreeMap<String, serde_json::Value>,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize)]
81#[serde(rename_all = "camelCase")]
82pub struct SpeechTranscribeResult {
83    pub provider: String,
84    pub model: String,
85    pub text: String,
86    pub language: Option<String>,
87    pub duration_millis: Option<u64>,
88    pub segments: Vec<SpeechSegment>,
89    pub provider_response_id: Option<String>,
90    #[serde(default)]
91    pub metadata: serde_json::Value,
92}
93
94#[derive(Debug, Clone, Serialize, Deserialize)]
95#[serde(rename_all = "camelCase")]
96pub struct SpeechSynthesizeParams {
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub provider: Option<String>,
99    #[serde(default, skip_serializing_if = "Option::is_none")]
100    pub model: Option<String>,
101    pub text: String,
102    #[serde(default, skip_serializing_if = "Option::is_none")]
103    pub voice: Option<String>,
104    #[serde(default, skip_serializing_if = "Option::is_none")]
105    pub audio_format: Option<String>,
106    #[serde(default, skip_serializing_if = "Option::is_none")]
107    pub prompt: Option<String>,
108    #[serde(default, skip_serializing_if = "Option::is_none")]
109    pub voice_sample: Option<SpeechAudioPayload>,
110    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
111    pub metadata: BTreeMap<String, serde_json::Value>,
112}
113
114#[derive(Debug, Clone, Serialize, Deserialize)]
115#[serde(rename_all = "camelCase")]
116pub struct SpeechSynthesizeResult {
117    pub provider: String,
118    pub model: String,
119    pub audio: SpeechAudioPayload,
120    pub duration_millis: Option<u64>,
121    pub provider_response_id: Option<String>,
122    #[serde(default)]
123    pub metadata: serde_json::Value,
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// The nested audio payload of transcription params must be read from camelCase keys.
    #[test]
    fn transcribe_params_use_camel_case_audio_bytes() {
        let request = serde_json::json!({
            "provider": "openai-speech",
            "model": "gpt-4o-mini-transcribe",
            "audio": {
                "bytesBase64": "YXVkaW8=",
                "mimeType": "audio/wav",
                "filename": "clip.wav"
            },
            "language": "en",
            "diarization": false
        });

        let params: SpeechTranscribeParams = serde_json::from_value(request).unwrap();
        let audio = &params.audio;

        assert_eq!(audio.bytes_base64, "YXVkaW8=");
        assert_eq!(audio.mime_type, "audio/wav");
        assert_eq!(audio.filename.as_deref(), Some("clip.wav"));
    }

    /// Synthesis params must read `audioFormat` and `voiceSample` from camelCase keys.
    #[test]
    fn synthesize_params_use_camel_case_audio_format() {
        let request = serde_json::json!({
            "provider": "xiaomi-mimo",
            "model": "mimo-v2.5-tts",
            "text": "hello",
            "audioFormat": "wav",
            "voiceSample": {
                "bytesBase64": "dm9pY2U=",
                "mimeType": "audio/wav",
                "filename": "voice.wav"
            }
        });

        let params: SpeechSynthesizeParams = serde_json::from_value(request).unwrap();

        assert_eq!(params.audio_format.as_deref(), Some("wav"));

        // Unwrapping doubles as the assertion that the sample was present in the input.
        let sample = params.voice_sample.unwrap();
        assert_eq!(sample.bytes_base64.as_str(), "dm9pY2U=");
    }
}