active_call/synthesis/
mod.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use bytes::Bytes;
4use futures::stream::BoxStream;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use tokio::sync::mpsc;
8mod aliyun;
9mod deepgram;
10mod tencent_cloud;
11mod tencent_cloud_basic;
12mod voiceapi;
13pub use aliyun::AliyunTtsClient;
14pub use deepgram::DeepegramTtsClient;
15pub use tencent_cloud::TencentCloudTtsClient;
16pub use tencent_cloud_basic::TencentCloudTtsBasicClient;
17pub use voiceapi::VoiceApiTtsClient;
18
19#[derive(Clone, Default)]
20pub struct SynthesisCommand {
21    pub text: String,
22    pub speaker: Option<String>,
23    pub play_id: Option<String>,
24    pub streaming: bool,
25    pub end_of_stream: bool,
26    pub option: SynthesisOption,
27    pub base64: bool,
28}
29pub type SynthesisCommandSender = mpsc::UnboundedSender<SynthesisCommand>;
30pub type SynthesisCommandReceiver = mpsc::UnboundedReceiver<SynthesisCommand>;
31
32#[derive(Debug, Clone, Serialize, Hash, Eq, PartialEq)]
33pub enum SynthesisType {
34    #[serde(rename = "tencent")]
35    TencentCloud,
36    #[serde(rename = "voiceapi")]
37    VoiceApi,
38    #[serde(rename = "aliyun")]
39    Aliyun,
40    #[serde(rename = "deepgram")]
41    Deepgram,
42    #[serde(rename = "other")]
43    Other(String),
44}
45
46impl std::fmt::Display for SynthesisType {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        match self {
49            SynthesisType::TencentCloud => write!(f, "tencent"),
50            SynthesisType::VoiceApi => write!(f, "voiceapi"),
51            SynthesisType::Aliyun => write!(f, "aliyun"),
52            SynthesisType::Deepgram => write!(f, "deepgram"),
53            SynthesisType::Other(provider) => write!(f, "{}", provider),
54        }
55    }
56}
57
58impl<'de> Deserialize<'de> for SynthesisType {
59    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
60    where
61        D: serde::Deserializer<'de>,
62    {
63        let value = String::deserialize(deserializer)?;
64        match value.as_str() {
65            "tencent" => Ok(SynthesisType::TencentCloud),
66            "voiceapi" => Ok(SynthesisType::VoiceApi),
67            "aliyun" => Ok(SynthesisType::Aliyun),
68            "deepgram" => Ok(SynthesisType::Deepgram),
69            _ => Ok(SynthesisType::Other(value)),
70        }
71    }
72}
73
74#[cfg(test)]
75mod tests;
76#[derive(Debug, Clone, Deserialize, Serialize)]
77#[serde(rename_all = "camelCase")]
78#[serde(default)]
79pub struct SynthesisOption {
80    pub samplerate: Option<i32>,
81    pub provider: Option<SynthesisType>,
82    pub speed: Option<f32>,
83    pub app_id: Option<String>,
84    pub secret_id: Option<String>,
85    pub secret_key: Option<String>,
86    pub volume: Option<i32>,
87    pub speaker: Option<String>,
88    pub codec: Option<String>,
89    pub subtitle: Option<bool>,
90    pub model: Option<String>,
91    /// emotion: neutral、sad、happy、angry、fear、news、story、radio、poetry、
92    /// call、sajiao、disgusted、amaze、peaceful、exciting、aojiao、jieshuo
93    pub emotion: Option<String>,
94    pub endpoint: Option<String>,
95    pub extra: Option<HashMap<String, String>>,
96    pub max_concurrent_tasks: Option<usize>,
97}
98
99impl SynthesisOption {
100    pub fn merge_with(&self, option: Option<SynthesisOption>) -> Self {
101        if let Some(other) = option {
102            Self {
103                samplerate: other.samplerate.or(self.samplerate),
104                provider: other.provider.or(self.provider.clone()),
105                speed: other.speed.or(self.speed),
106                app_id: other.app_id.or(self.app_id.clone()),
107                secret_id: other.secret_id.or(self.secret_id.clone()),
108                secret_key: other.secret_key.or(self.secret_key.clone()),
109                volume: other.volume.or(self.volume),
110                speaker: other.speaker.or(self.speaker.clone()),
111                codec: other.codec.or(self.codec.clone()),
112                subtitle: other.subtitle.or(self.subtitle),
113                model: other.model.or(self.model.clone()),
114                emotion: other.emotion.or(self.emotion.clone()),
115                endpoint: other.endpoint.or(self.endpoint.clone()),
116                extra: other.extra.or(self.extra.clone()),
117                max_concurrent_tasks: other.max_concurrent_tasks.or(self.max_concurrent_tasks),
118            }
119        } else {
120            self.clone()
121        }
122    }
123}
124
125#[derive(Debug)]
126pub enum SynthesisEvent {
127    /// Raw audio data chunk
128    AudioChunk(Bytes),
129    /// Progress information including completion status
130    Subtitles(Vec<Subtitle>),
131    Finished,
132}
133
/// A piece of subtitle text together with a time range and an index range.
///
/// NOTE(review): units are not fixed by this module — times are presumably
/// milliseconds and indices presumably offsets into the synthesized text;
/// confirm with the provider clients that build these values.
#[derive(Clone, Debug)]
pub struct Subtitle {
    /// Subtitle text.
    pub text: String,
    /// Start of the time range.
    pub begin_time: u32,
    /// End of the time range.
    pub end_time: u32,
    /// Start of the index range.
    pub begin_index: u32,
    /// End of the index range.
    pub end_index: u32,
}

impl Subtitle {
    /// Builds a [`Subtitle`] from its five parts, stored exactly as given.
    pub fn new(
        text: String,
        begin_time: u32,
        end_time: u32,
        begin_index: u32,
        end_index: u32,
    ) -> Self {
        Subtitle {
            text,
            begin_time,
            end_time,
            begin_index,
            end_index,
        }
    }
}
160
/// Converts an audio byte count into a duration in milliseconds.
///
/// The result is `bytes * 500 / sample_rate`, rounded down. The factor 500
/// (= 1000 ms / 2) means the formula assumes two bytes per sample frame —
/// presumably 16-bit mono PCM; confirm against the callers.
///
/// The computation uses integer arithmetic, so it stays exact for buffers of
/// any size. Results that do not fit in `u32` saturate at `u32::MAX`.
/// A `sample_rate` of zero yields `0` for an empty buffer and `u32::MAX`
/// otherwise.
pub fn bytes_size_to_duration(bytes: usize, sample_rate: u32) -> u32 {
    if sample_rate == 0 {
        // There is no meaningful duration without a sample rate; avoid the
        // division by zero and report "nothing" or "unbounded".
        return if bytes == 0 { 0 } else { u32::MAX };
    }
    // u128 cannot overflow here: usize::MAX * 500 is far below u128::MAX.
    let millis = bytes as u128 * 500 / u128::from(sample_rate);
    millis.min(u128::from(u32::MAX)) as u32
}
165
166#[async_trait]
167pub trait SynthesisClient: Send {
168    // provider of the synthesis client.
169    fn provider(&self) -> SynthesisType;
170
171    // connect to the synthesis service.
172    // (cmd_seq, result), return the cmd_seq that passed from `synthesize`
173    async fn start(
174        &mut self,
175    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>>;
176
177    // send text to the synthesis service.
178    // `cmd_seq` and `option` are used for non streaming mode
179    // for streaming mode, `cmd_seq` and `option` are None
180    async fn synthesize(
181        &mut self,
182        text: &str,
183        cmd_seq: Option<usize>,
184        option: Option<SynthesisOption>,
185    ) -> Result<()>;
186
187    async fn stop(&mut self) -> Result<()>;
188}
189
190impl Default for SynthesisOption {
191    fn default() -> Self {
192        Self {
193            samplerate: Some(16000),
194            provider: None,
195            speed: Some(1.0),
196            app_id: None,
197            secret_id: None,
198            secret_key: None,
199            volume: Some(5), // 0-10
200            speaker: None,
201            codec: Some("pcm".to_string()),
202            subtitle: None,
203            model: None,
204            emotion: None,
205            endpoint: None,
206            extra: None,
207            max_concurrent_tasks: None,
208        }
209    }
210}
211
212impl SynthesisOption {
213    pub fn check_default(&mut self) {
214        if let Some(provider) = &self.provider {
215            match provider.to_string().as_str() {
216                "tencent" | "tencent_basic" => {
217                    if self.app_id.is_none() {
218                        self.app_id = std::env::var("TENCENT_APPID").ok();
219                    }
220                    if self.secret_id.is_none() {
221                        self.secret_id = std::env::var("TENCENT_SECRET_ID").ok();
222                    }
223                    if self.secret_key.is_none() {
224                        self.secret_key = std::env::var("TENCENT_SECRET_KEY").ok();
225                    }
226                }
227                "voiceapi" => {
228                    // Set the endpoint from environment variable if not already set
229                    if self.endpoint.is_none() {
230                        self.endpoint = std::env::var("VOICEAPI_ENDPOINT")
231                            .ok()
232                            .or_else(|| Some("http://localhost:8000".to_string()));
233                    }
234                    // Set speaker ID from environment variable if not already set
235                    if self.speaker.is_none() {
236                        self.speaker = std::env::var("VOICEAPI_SPEAKER_ID")
237                            .ok()
238                            .or_else(|| Some("0".to_string()));
239                    }
240                }
241                "aliyun" => {
242                    if self.secret_key.is_none() {
243                        self.secret_key = std::env::var("DASHSCOPE_API_KEY").ok();
244                    }
245                }
246                "deepgram" => {
247                    if self.secret_key.is_none() {
248                        self.secret_key = std::env::var("DEEPGRAM_API_KEY").ok();
249                    }
250                }
251                _ => {}
252            }
253        }
254    }
255}