voice_engine/synthesis/
mod.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use bytes::Bytes;
4use futures::stream::BoxStream;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use tokio::sync::mpsc;
8mod aliyun;
9mod deepgram;
10mod tencent_cloud;
11mod tencent_cloud_basic;
12mod voiceapi;
13pub use aliyun::AliyunTtsClient;
14pub use deepgram::DeepegramTtsClient;
15pub use tencent_cloud::TencentCloudTtsClient;
16pub use tencent_cloud_basic::TencentCloudTtsBasicClient;
17pub use voiceapi::VoiceApiTtsClient;
18
/// A single text-to-speech request, sent through [`SynthesisCommandSender`] /
/// [`SynthesisCommandReceiver`].
///
/// The consumers of this struct are not visible in this module, so the
/// per-field notes below that go beyond the declared types are marked as
/// assumptions to be confirmed.
#[derive(Clone, Default)]
pub struct SynthesisCommand {
    /// Text payload of the command.
    pub text: String,
    /// Optional speaker for this command (presumably a voice identifier that
    /// takes precedence over `option.speaker` — TODO confirm with the consumer).
    pub speaker: Option<String>,
    /// Optional playback identifier (NOTE(review): presumably correlates the
    /// produced audio with a play request — confirm).
    pub play_id: Option<String>,
    /// Streaming-mode flag (NOTE(review): presumably marks `text` as one
    /// fragment of a longer stream — confirm).
    pub streaming: bool,
    /// End-of-stream flag (NOTE(review): presumably only meaningful together
    /// with `streaming` — confirm).
    pub end_of_stream: bool,
    /// Synthesis options carried with this command.
    pub option: SynthesisOption,
    /// Base64 flag (NOTE(review): not determinable from this file whether it
    /// refers to the input text or the produced audio — confirm).
    pub base64: bool,
}
/// Sending half of an unbounded Tokio mpsc channel carrying [`SynthesisCommand`]s.
pub type SynthesisCommandSender = mpsc::UnboundedSender<SynthesisCommand>;
/// Receiving half of an unbounded Tokio mpsc channel carrying [`SynthesisCommand`]s.
pub type SynthesisCommandReceiver = mpsc::UnboundedReceiver<SynthesisCommand>;
31pub use self::tencent_cloud::strip_emoji_chars;
32
/// Identifies which text-to-speech backend to use.
///
/// The unit variants serialize to the lowercase names given in their
/// `#[serde(rename = ...)]` attributes; the same names are produced by the
/// `Display` impl and accepted by the hand-written `Deserialize` impl.
///
/// NOTE(review): `Serialize` is derived while `Deserialize` is hand-written.
/// With serde's default (externally tagged) enum representation the derived
/// serializer emits `Other(name)` as a variant named `"other"` wrapping the
/// string, whereas deserialization expects a plain string. An `Other` value
/// therefore does not appear to round-trip through serialize + deserialize —
/// confirm whether that asymmetry is intended.
#[derive(Debug, Clone, Serialize, Hash, Eq, PartialEq)]
pub enum SynthesisType {
    /// Tencent Cloud, named `"tencent"`.
    #[serde(rename = "tencent")]
    TencentCloud,
    /// VoiceAPI, named `"voiceapi"`.
    #[serde(rename = "voiceapi")]
    VoiceApi,
    /// Aliyun, named `"aliyun"`.
    #[serde(rename = "aliyun")]
    Aliyun,
    /// Deepgram, named `"deepgram"`.
    #[serde(rename = "deepgram")]
    Deepgram,
    /// Any provider name that is not one of the built-in ones; the payload is
    /// the provider name as given.
    #[serde(rename = "other")]
    Other(String),
}
46
47impl std::fmt::Display for SynthesisType {
48    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
49        match self {
50            SynthesisType::TencentCloud => write!(f, "tencent"),
51            SynthesisType::VoiceApi => write!(f, "voiceapi"),
52            SynthesisType::Aliyun => write!(f, "aliyun"),
53            SynthesisType::Deepgram => write!(f, "deepgram"),
54            SynthesisType::Other(provider) => write!(f, "{}", provider),
55        }
56    }
57}
58
59impl<'de> Deserialize<'de> for SynthesisType {
60    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
61    where
62        D: serde::Deserializer<'de>,
63    {
64        let value = String::deserialize(deserializer)?;
65        match value.as_str() {
66            "tencent" => Ok(SynthesisType::TencentCloud),
67            "voiceapi" => Ok(SynthesisType::VoiceApi),
68            "aliyun" => Ok(SynthesisType::Aliyun),
69            "deepgram" => Ok(SynthesisType::Deepgram),
70            _ => Ok(SynthesisType::Other(value)),
71        }
72    }
73}
74
75#[cfg(test)]
76mod tests;
/// Tunable parameters for speech synthesis.
///
/// Every field is optional. For (de)serialization the keys are camelCase
/// (`rename_all`), and because of `#[serde(default)]` any key missing from
/// the input takes its value from `SynthesisOption::default()`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
#[serde(default)]
pub struct SynthesisOption {
    /// Audio sample rate (presumably in Hz); the `Default` impl uses 16000.
    pub samplerate: Option<i32>,
    /// Synthesis backend; also selects which environment variables
    /// `check_default` consults.
    pub provider: Option<SynthesisType>,
    /// Speech speed; the `Default` impl uses 1.0 (presumably a multiplier —
    /// TODO confirm per provider).
    pub speed: Option<f32>,
    /// Application id; for the Tencent providers `check_default` falls back
    /// to the `TENCENT_APPID` environment variable.
    pub app_id: Option<String>,
    /// Secret id; for the Tencent providers `check_default` falls back to
    /// `TENCENT_SECRET_ID`.
    pub secret_id: Option<String>,
    /// Secret / API key; `check_default` falls back to `TENCENT_SECRET_KEY`
    /// (Tencent), `DASHSCOPE_API_KEY` (Aliyun) or `DEEPGRAM_API_KEY`
    /// (Deepgram).
    pub secret_key: Option<String>,
    /// Output volume; the `Default` impl uses 5 and annotates the range as
    /// 0-10.
    pub volume: Option<i32>,
    /// Speaker / voice identifier; for VoiceAPI `check_default` falls back to
    /// `VOICEAPI_SPEAKER_ID` and then to `"0"`.
    pub speaker: Option<String>,
    /// Audio codec name; the `Default` impl uses `"pcm"`.
    pub codec: Option<String>,
    /// Subtitle switch (NOTE(review): presumably requests
    /// `SynthesisEvent::Subtitles` from the backend — confirm).
    pub subtitle: Option<bool>,
    /// Emotion tag. Listed values: neutral, sad, happy, angry, fear, news,
    /// story, radio, poetry, call, sajiao, disgusted, amaze, peaceful,
    /// exciting, aojiao, jieshuo.
    /// NOTE(review): which providers honor this field is not visible here.
    pub emotion: Option<String>,
    /// Service endpoint; for VoiceAPI `check_default` falls back to
    /// `VOICEAPI_ENDPOINT` and then to `"http://localhost:8000"`.
    pub endpoint: Option<String>,
    /// Free-form extra key/value settings (NOTE(review): consumers are not
    /// visible in this module).
    pub extra: Option<HashMap<String, String>>,
    /// Limit on concurrent tasks (NOTE(review): presumably concurrent
    /// synthesis tasks per client — confirm).
    pub max_concurrent_tasks: Option<usize>,
}
98
99impl SynthesisOption {
100    pub fn merge_with(&self, option: Option<SynthesisOption>) -> Self {
101        if let Some(other) = option {
102            Self {
103                samplerate: other.samplerate.or(self.samplerate),
104                provider: other.provider.or(self.provider.clone()),
105                speed: other.speed.or(self.speed),
106                app_id: other.app_id.or(self.app_id.clone()),
107                secret_id: other.secret_id.or(self.secret_id.clone()),
108                secret_key: other.secret_key.or(self.secret_key.clone()),
109                volume: other.volume.or(self.volume),
110                speaker: other.speaker.or(self.speaker.clone()),
111                codec: other.codec.or(self.codec.clone()),
112                subtitle: other.subtitle.or(self.subtitle),
113                emotion: other.emotion.or(self.emotion.clone()),
114                endpoint: other.endpoint.or(self.endpoint.clone()),
115                extra: other.extra.or(self.extra.clone()),
116                max_concurrent_tasks: other.max_concurrent_tasks.or(self.max_concurrent_tasks),
117            }
118        } else {
119            self.clone()
120        }
121    }
122}
123
/// Events produced by a synthesis stream (the item payload of the stream
/// returned by `SynthesisClient::start`).
#[derive(Debug)]
pub enum SynthesisEvent {
    /// Raw audio data chunk
    AudioChunk(Bytes),
    /// A batch of [`Subtitle`] entries.
    /// NOTE(review): presumably describes the timing of the synthesized text
    /// — confirm with the provider implementations.
    Subtitles(Vec<Subtitle>),
    /// Completion marker (NOTE(review): presumably signals that synthesis of
    /// the current request has finished — confirm).
    Finished,
}
132
/// A piece of text together with a time range and an index range.
///
/// NOTE(review): units and index semantics are not established in this
/// module; the per-field notes are assumptions to confirm.
#[derive(Debug, Clone)]
pub struct Subtitle {
    /// The subtitle text.
    pub text: String,
    /// Start of the time range (presumably milliseconds — TODO confirm).
    pub begin_time: u32,
    /// End of the time range (presumably milliseconds — TODO confirm).
    pub end_time: u32,
    /// Start of the index range (presumably an offset into the source text —
    /// TODO confirm whether bytes or characters).
    pub begin_index: u32,
    /// End of the index range (presumably an offset into the source text —
    /// TODO confirm whether inclusive).
    pub end_index: u32,
}
141
142impl Subtitle {
143    pub fn new(
144        text: String,
145        begin_time: u32,
146        end_time: u32,
147        begin_index: u32,
148        end_index: u32,
149    ) -> Self {
150        Self {
151            text,
152            begin_time,
153            end_time,
154            begin_index,
155            end_index,
156        }
157    }
158}
159
/// Converts an audio payload size into a duration in milliseconds.
///
/// The result is `bytes * 500 / sample_rate`, rounded down. The factor 500 is
/// 1000 ms divided by 2 bytes per sample frame, so the formula corresponds to
/// 16-bit mono audio (NOTE(review): inferred from the constant — confirm
/// against the callers' audio format).
///
/// The computation is done in integer arithmetic so large payloads keep full
/// precision (an `f32` cannot represent every byte count above 2^24).
///
/// Edge cases:
/// * `sample_rate == 0` yields `0`, since no duration can be derived.
/// * Results that do not fit in a `u32` saturate at `u32::MAX`.
pub fn bytes_size_to_duration(bytes: usize, sample_rate: u32) -> u32 {
    if sample_rate == 0 {
        return 0;
    }
    // u128 cannot overflow here: usize::MAX * 500 is far below u128::MAX.
    let millis = bytes as u128 * 500 / u128::from(sample_rate);
    millis.min(u128::from(u32::MAX)) as u32
}
164
/// Interface shared by the text-to-speech provider clients.
#[async_trait]
pub trait SynthesisClient: Send {
    /// Returns the provider of this synthesis client.
    fn provider(&self) -> SynthesisType;

    /// Connects to the synthesis service and returns the stream of results.
    ///
    /// Each stream item is `(cmd_seq, result)`, where `cmd_seq` is the value
    /// that was passed to `synthesize`.
    async fn start(
        &mut self,
    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>>;

    /// Sends text to the synthesis service.
    ///
    /// `cmd_seq` and `option` are used for non-streaming mode; in streaming
    /// mode both are `None`.
    async fn synthesize(
        &mut self,
        text: &str,
        cmd_seq: Option<usize>,
        option: Option<SynthesisOption>,
    ) -> Result<()>;

    /// Stops the client.
    /// NOTE(review): the exact shutdown semantics are defined by each
    /// implementation and are not visible in this module.
    async fn stop(&mut self) -> Result<()>;
}
188
189impl Default for SynthesisOption {
190    fn default() -> Self {
191        Self {
192            samplerate: Some(16000),
193            provider: None,
194            speed: Some(1.0),
195            app_id: None,
196            secret_id: None,
197            secret_key: None,
198            volume: Some(5), // 0-10
199            speaker: None,
200            codec: Some("pcm".to_string()),
201            subtitle: None,
202            emotion: None,
203            endpoint: None,
204            extra: None,
205            max_concurrent_tasks: None,
206        }
207    }
208}
209
210impl SynthesisOption {
211    pub fn check_default(&mut self) {
212        if let Some(provider) = &self.provider {
213            match provider.to_string().as_str() {
214                "tencent" | "tencent_basic" => {
215                    if self.app_id.is_none() {
216                        self.app_id = std::env::var("TENCENT_APPID").ok();
217                    }
218                    if self.secret_id.is_none() {
219                        self.secret_id = std::env::var("TENCENT_SECRET_ID").ok();
220                    }
221                    if self.secret_key.is_none() {
222                        self.secret_key = std::env::var("TENCENT_SECRET_KEY").ok();
223                    }
224                }
225                "voiceapi" => {
226                    // Set the endpoint from environment variable if not already set
227                    if self.endpoint.is_none() {
228                        self.endpoint = std::env::var("VOICEAPI_ENDPOINT")
229                            .ok()
230                            .or_else(|| Some("http://localhost:8000".to_string()));
231                    }
232                    // Set speaker ID from environment variable if not already set
233                    if self.speaker.is_none() {
234                        self.speaker = std::env::var("VOICEAPI_SPEAKER_ID")
235                            .ok()
236                            .or_else(|| Some("0".to_string()));
237                    }
238                }
239                "aliyun" => {
240                    if self.secret_key.is_none() {
241                        self.secret_key = std::env::var("DASHSCOPE_API_KEY").ok();
242                    }
243                }
244                "deepgram" => {
245                    if self.secret_key.is_none() {
246                        self.secret_key = std::env::var("DEEPGRAM_API_KEY").ok();
247                    }
248                }
249                _ => {}
250            }
251        }
252    }
253}