1use std::collections::BTreeMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::extension::{SpeechSynthesizerId, SpeechTranscriberId};
6use crate::inference::ProviderAuthType;
7
8#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "camelCase")]
10pub struct SpeechCapabilities {
11 pub batch: bool,
12 pub streaming: bool,
13 pub diarization: bool,
14 pub timestamps: bool,
15 pub language_hints: bool,
16 pub prompt: bool,
17}
18
19#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
20#[serde(rename_all = "camelCase")]
21pub struct SpeechSynthesisCapabilities {
22 pub batch: bool,
23 pub streaming: bool,
24 pub builtin_voices: bool,
25 pub voice_design: bool,
26 pub voice_clone: bool,
27 pub prompt: bool,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
31pub struct SpeechProviderMetadata {
32 pub name: String,
33 pub description: Option<String>,
34 pub auth_type: ProviderAuthType,
35 pub auth_label: Option<String>,
36 pub auth_configured: Option<bool>,
37 pub recommended: bool,
38 pub sort_order: i32,
39}
40
41impl SpeechProviderMetadata {
42 pub fn local(name: impl Into<String>) -> Self {
43 Self {
44 name: name.into(),
45 description: None,
46 auth_type: ProviderAuthType::None,
47 auth_label: None,
48 auth_configured: Some(true),
49 recommended: false,
50 sort_order: 100,
51 }
52 }
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
56pub struct SpeechModelDescriptor {
57 pub id: String,
58 pub name: String,
59 pub description: Option<String>,
60 pub capabilities: SpeechCapabilities,
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
64pub struct SpeechSynthesisModelDescriptor {
65 pub id: String,
66 pub name: String,
67 pub description: Option<String>,
68 pub capabilities: SpeechSynthesisCapabilities,
69}
70
71#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
72pub struct SpeechAudio {
73 pub bytes: Vec<u8>,
74 pub mime_type: String,
75 pub filename: Option<String>,
76}
77
78#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
79pub struct SpeechTranscriptionRequest {
80 pub model: String,
81 pub audio: SpeechAudio,
82 pub language: Option<String>,
83 pub prompt: Option<String>,
84 pub diarization: bool,
85 #[serde(default)]
86 pub metadata: BTreeMap<String, serde_json::Value>,
87}
88
89#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
90pub struct SpeechSegment {
91 pub text: String,
92 pub start_millis: Option<u64>,
93 pub end_millis: Option<u64>,
94 pub speaker: Option<String>,
95 pub confidence: Option<f32>,
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
99pub struct SpeechTranscriptionResult {
100 pub text: String,
101 pub language: Option<String>,
102 pub duration_millis: Option<u64>,
103 pub segments: Vec<SpeechSegment>,
104 pub provider_response_id: Option<String>,
105 #[serde(default)]
106 pub metadata: serde_json::Value,
107}
108
109#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
110pub struct SpeechSynthesisRequest {
111 pub model: String,
112 pub text: String,
113 pub voice: Option<String>,
114 pub audio_format: Option<String>,
115 pub prompt: Option<String>,
116 pub voice_sample: Option<SpeechAudio>,
117 #[serde(default)]
118 pub metadata: BTreeMap<String, serde_json::Value>,
119}
120
121#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
122pub struct SpeechSynthesisResult {
123 pub audio: SpeechAudio,
124 pub duration_millis: Option<u64>,
125 pub provider_response_id: Option<String>,
126 #[serde(default)]
127 pub metadata: serde_json::Value,
128}
129
/// Borrowed, per-call context handed to every async provider method.
///
/// The struct is `Copy`, so it can be passed by value freely.
#[derive(Debug, Clone, Copy)]
pub struct SpeechProviderContext<'a> {
    /// Identifier of the provider the call is made for.
    pub provider_id: &'a str,
}
134
135#[async_trait::async_trait]
136pub trait SpeechTranscriber: Send + Sync + 'static {
137 fn id(&self) -> SpeechTranscriberId;
138
139 fn capabilities(&self) -> SpeechCapabilities;
140
141 fn metadata(&self) -> SpeechProviderMetadata;
142
143 async fn list_models(
144 &self,
145 ctx: SpeechProviderContext<'_>,
146 ) -> anyhow::Result<Vec<SpeechModelDescriptor>>;
147
148 async fn transcribe(
149 &self,
150 ctx: SpeechProviderContext<'_>,
151 request: SpeechTranscriptionRequest,
152 ) -> anyhow::Result<SpeechTranscriptionResult>;
153}
154
155#[async_trait::async_trait]
156pub trait SpeechSynthesizer: Send + Sync + 'static {
157 fn id(&self) -> SpeechSynthesizerId;
158
159 fn capabilities(&self) -> SpeechSynthesisCapabilities;
160
161 fn metadata(&self) -> SpeechProviderMetadata;
162
163 async fn list_models(
164 &self,
165 ctx: SpeechProviderContext<'_>,
166 ) -> anyhow::Result<Vec<SpeechSynthesisModelDescriptor>>;
167
168 async fn synthesize(
169 &self,
170 ctx: SpeechProviderContext<'_>,
171 request: SpeechSynthesisRequest,
172 ) -> anyhow::Result<SpeechSynthesisResult>;
173}