1use std::collections::BTreeMap;
2
3use roder_api::inference::ProviderAuthType;
4use roder_api::speech::{
5 SpeechCapabilities, SpeechModelDescriptor, SpeechSegment, SpeechSynthesisCapabilities,
6 SpeechSynthesisModelDescriptor,
7};
8use serde::{Deserialize, Serialize};
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
11#[serde(rename_all = "camelCase")]
12pub struct SpeechProviderDescriptor {
13 pub id: String,
14 pub name: String,
15 pub description: Option<String>,
16 pub auth_type: ProviderAuthType,
17 pub auth_label: Option<String>,
18 pub authenticated: bool,
19 pub auth_detail: Option<String>,
20 pub recommended: bool,
21 pub sort_order: i32,
22 pub capabilities: SpeechCapabilities,
23 pub models: Vec<SpeechModelDescriptor>,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27#[serde(rename_all = "camelCase")]
28pub struct SpeechProvidersListResult {
29 pub providers: Vec<SpeechProviderDescriptor>,
30}
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
33#[serde(rename_all = "camelCase")]
34pub struct SpeechSynthesisProviderDescriptor {
35 pub id: String,
36 pub name: String,
37 pub description: Option<String>,
38 pub auth_type: ProviderAuthType,
39 pub auth_label: Option<String>,
40 pub authenticated: bool,
41 pub auth_detail: Option<String>,
42 pub recommended: bool,
43 pub sort_order: i32,
44 pub capabilities: SpeechSynthesisCapabilities,
45 pub models: Vec<SpeechSynthesisModelDescriptor>,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
49#[serde(rename_all = "camelCase")]
50pub struct SpeechSynthesisProvidersListResult {
51 pub providers: Vec<SpeechSynthesisProviderDescriptor>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
55#[serde(rename_all = "camelCase")]
56pub struct SpeechAudioPayload {
57 pub bytes_base64: String,
58 pub mime_type: String,
59 pub filename: Option<String>,
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize)]
63#[serde(rename_all = "camelCase")]
64pub struct SpeechTranscribeParams {
65 #[serde(default, skip_serializing_if = "Option::is_none")]
66 pub provider: Option<String>,
67 #[serde(default, skip_serializing_if = "Option::is_none")]
68 pub model: Option<String>,
69 pub audio: SpeechAudioPayload,
70 #[serde(default, skip_serializing_if = "Option::is_none")]
71 pub language: Option<String>,
72 #[serde(default, skip_serializing_if = "Option::is_none")]
73 pub prompt: Option<String>,
74 #[serde(default)]
75 pub diarization: bool,
76 #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
77 pub metadata: BTreeMap<String, serde_json::Value>,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize)]
81#[serde(rename_all = "camelCase")]
82pub struct SpeechTranscribeResult {
83 pub provider: String,
84 pub model: String,
85 pub text: String,
86 pub language: Option<String>,
87 pub duration_millis: Option<u64>,
88 pub segments: Vec<SpeechSegment>,
89 pub provider_response_id: Option<String>,
90 #[serde(default)]
91 pub metadata: serde_json::Value,
92}
93
94#[derive(Debug, Clone, Serialize, Deserialize)]
95#[serde(rename_all = "camelCase")]
96pub struct SpeechSynthesizeParams {
97 #[serde(default, skip_serializing_if = "Option::is_none")]
98 pub provider: Option<String>,
99 #[serde(default, skip_serializing_if = "Option::is_none")]
100 pub model: Option<String>,
101 pub text: String,
102 #[serde(default, skip_serializing_if = "Option::is_none")]
103 pub voice: Option<String>,
104 #[serde(default, skip_serializing_if = "Option::is_none")]
105 pub audio_format: Option<String>,
106 #[serde(default, skip_serializing_if = "Option::is_none")]
107 pub prompt: Option<String>,
108 #[serde(default, skip_serializing_if = "Option::is_none")]
109 pub voice_sample: Option<SpeechAudioPayload>,
110 #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
111 pub metadata: BTreeMap<String, serde_json::Value>,
112}
113
114#[derive(Debug, Clone, Serialize, Deserialize)]
115#[serde(rename_all = "camelCase")]
116pub struct SpeechSynthesizeResult {
117 pub provider: String,
118 pub model: String,
119 pub audio: SpeechAudioPayload,
120 pub duration_millis: Option<u64>,
121 pub provider_response_id: Option<String>,
122 #[serde(default)]
123 pub metadata: serde_json::Value,
124}
125
/// Wire-format tests for the request parameter types: camelCase key mapping,
/// defaults for omitted fields, and omission of unset fields on output.
#[cfg(test)]
mod tests {
    use super::*;

    /// A fully populated transcription request maps its camelCase keys onto
    /// the snake_case fields.
    #[test]
    fn transcribe_params_use_camel_case_audio_bytes() {
        let params: SpeechTranscribeParams = serde_json::from_value(serde_json::json!({
            "provider": "openai-speech",
            "model": "gpt-4o-mini-transcribe",
            "audio": {
                "bytesBase64": "YXVkaW8=",
                "mimeType": "audio/wav",
                "filename": "clip.wav"
            },
            "language": "en",
            "diarization": false
        }))
        .unwrap();

        assert_eq!(params.provider.as_deref(), Some("openai-speech"));
        assert_eq!(params.model.as_deref(), Some("gpt-4o-mini-transcribe"));
        assert_eq!(params.audio.bytes_base64, "YXVkaW8=");
        assert_eq!(params.audio.mime_type, "audio/wav");
        assert_eq!(params.audio.filename.as_deref(), Some("clip.wav"));
        assert_eq!(params.language.as_deref(), Some("en"));
        assert!(!params.diarization);
    }

    /// Everything except `audio` may be left out of a transcription request;
    /// the omitted fields take their defaults.
    #[test]
    fn transcribe_params_default_omitted_optional_fields() {
        let params: SpeechTranscribeParams = serde_json::from_value(serde_json::json!({
            "audio": {
                "bytesBase64": "YXVkaW8=",
                "mimeType": "audio/wav"
            }
        }))
        .unwrap();

        assert!(params.provider.is_none());
        assert!(params.model.is_none());
        assert!(params.language.is_none());
        assert!(params.prompt.is_none());
        assert!(!params.diarization);
        assert!(params.metadata.is_empty());
        assert!(params.audio.filename.is_none());
    }

    /// A synthesis request maps `audioFormat` and `voiceSample` onto the
    /// snake_case fields, including the nested audio payload.
    #[test]
    fn synthesize_params_use_camel_case_audio_format() {
        let params: SpeechSynthesizeParams = serde_json::from_value(serde_json::json!({
            "provider": "xiaomi-mimo",
            "model": "mimo-v2.5-tts",
            "text": "hello",
            "audioFormat": "wav",
            "voiceSample": {
                "bytesBase64": "dm9pY2U=",
                "mimeType": "audio/wav",
                "filename": "voice.wav"
            }
        }))
        .unwrap();

        assert_eq!(params.text, "hello");
        assert_eq!(params.audio_format.as_deref(), Some("wav"));

        let sample = params
            .voice_sample
            .expect("voiceSample was present in the input");
        assert_eq!(sample.bytes_base64, "dm9pY2U=");
        assert_eq!(sample.mime_type, "audio/wav");
        assert_eq!(sample.filename.as_deref(), Some("voice.wav"));
    }

    /// Unset optional fields and an empty metadata map are left out of the
    /// serialized request, so only `text` remains.
    #[test]
    fn synthesize_params_skip_unset_fields_when_serialized() {
        let params = SpeechSynthesizeParams {
            provider: None,
            model: None,
            text: "hello".to_owned(),
            voice: None,
            audio_format: None,
            prompt: None,
            voice_sample: None,
            metadata: Default::default(),
        };

        assert_eq!(
            serde_json::to_value(&params).unwrap(),
            serde_json::json!({ "text": "hello" })
        );
    }
}