Skip to main content

roder_api/
speech.rs

1use std::collections::BTreeMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::extension::{SpeechSynthesizerId, SpeechTranscriberId};
6use crate::inference::ProviderAuthType;
7
8#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "camelCase")]
10pub struct SpeechCapabilities {
11    pub batch: bool,
12    pub streaming: bool,
13    pub diarization: bool,
14    pub timestamps: bool,
15    pub language_hints: bool,
16    pub prompt: bool,
17}
18
19#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
20#[serde(rename_all = "camelCase")]
21pub struct SpeechSynthesisCapabilities {
22    pub batch: bool,
23    pub streaming: bool,
24    pub builtin_voices: bool,
25    pub voice_design: bool,
26    pub voice_clone: bool,
27    pub prompt: bool,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
31pub struct SpeechProviderMetadata {
32    pub name: String,
33    pub description: Option<String>,
34    pub auth_type: ProviderAuthType,
35    pub auth_label: Option<String>,
36    pub auth_configured: Option<bool>,
37    pub recommended: bool,
38    pub sort_order: i32,
39}
40
41impl SpeechProviderMetadata {
42    pub fn local(name: impl Into<String>) -> Self {
43        Self {
44            name: name.into(),
45            description: None,
46            auth_type: ProviderAuthType::None,
47            auth_label: None,
48            auth_configured: Some(true),
49            recommended: false,
50            sort_order: 100,
51        }
52    }
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
56pub struct SpeechModelDescriptor {
57    pub id: String,
58    pub name: String,
59    pub description: Option<String>,
60    pub capabilities: SpeechCapabilities,
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
64pub struct SpeechSynthesisModelDescriptor {
65    pub id: String,
66    pub name: String,
67    pub description: Option<String>,
68    pub capabilities: SpeechSynthesisCapabilities,
69}
70
71#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
72pub struct SpeechAudio {
73    pub bytes: Vec<u8>,
74    pub mime_type: String,
75    pub filename: Option<String>,
76}
77
78#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
79pub struct SpeechTranscriptionRequest {
80    pub model: String,
81    pub audio: SpeechAudio,
82    pub language: Option<String>,
83    pub prompt: Option<String>,
84    pub diarization: bool,
85    #[serde(default)]
86    pub metadata: BTreeMap<String, serde_json::Value>,
87}
88
89#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
90pub struct SpeechSegment {
91    pub text: String,
92    pub start_millis: Option<u64>,
93    pub end_millis: Option<u64>,
94    pub speaker: Option<String>,
95    pub confidence: Option<f32>,
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
99pub struct SpeechTranscriptionResult {
100    pub text: String,
101    pub language: Option<String>,
102    pub duration_millis: Option<u64>,
103    pub segments: Vec<SpeechSegment>,
104    pub provider_response_id: Option<String>,
105    #[serde(default)]
106    pub metadata: serde_json::Value,
107}
108
109#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
110pub struct SpeechSynthesisRequest {
111    pub model: String,
112    pub text: String,
113    pub voice: Option<String>,
114    pub audio_format: Option<String>,
115    pub prompt: Option<String>,
116    pub voice_sample: Option<SpeechAudio>,
117    #[serde(default)]
118    pub metadata: BTreeMap<String, serde_json::Value>,
119}
120
121#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
122pub struct SpeechSynthesisResult {
123    pub audio: SpeechAudio,
124    pub duration_millis: Option<u64>,
125    pub provider_response_id: Option<String>,
126    #[serde(default)]
127    pub metadata: serde_json::Value,
128}
129
/// Per-call context handed to the async methods of [`SpeechTranscriber`] and
/// [`SpeechSynthesizer`].
///
/// The struct only borrows its data, so it is `Copy` and is passed by value.
#[derive(Clone, Copy, Debug)]
pub struct SpeechProviderContext<'a> {
    /// Borrowed provider identifier, valid for `'a`.
    pub provider_id: &'a str,
}
134
135#[async_trait::async_trait]
136pub trait SpeechTranscriber: Send + Sync + 'static {
137    fn id(&self) -> SpeechTranscriberId;
138
139    fn capabilities(&self) -> SpeechCapabilities;
140
141    fn metadata(&self) -> SpeechProviderMetadata;
142
143    async fn list_models(
144        &self,
145        ctx: SpeechProviderContext<'_>,
146    ) -> anyhow::Result<Vec<SpeechModelDescriptor>>;
147
148    async fn transcribe(
149        &self,
150        ctx: SpeechProviderContext<'_>,
151        request: SpeechTranscriptionRequest,
152    ) -> anyhow::Result<SpeechTranscriptionResult>;
153}
154
155#[async_trait::async_trait]
156pub trait SpeechSynthesizer: Send + Sync + 'static {
157    fn id(&self) -> SpeechSynthesizerId;
158
159    fn capabilities(&self) -> SpeechSynthesisCapabilities;
160
161    fn metadata(&self) -> SpeechProviderMetadata;
162
163    async fn list_models(
164        &self,
165        ctx: SpeechProviderContext<'_>,
166    ) -> anyhow::Result<Vec<SpeechSynthesisModelDescriptor>>;
167
168    async fn synthesize(
169        &self,
170        ctx: SpeechProviderContext<'_>,
171        request: SpeechSynthesisRequest,
172    ) -> anyhow::Result<SpeechSynthesisResult>;
173}