active_call/synthesis/
mod.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use bytes::Bytes;
4use futures::stream::BoxStream;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use tokio::sync::mpsc;
8
9mod aliyun;
10mod deepgram;
11mod msedge;
12mod tencent_cloud;
13mod tencent_cloud_basic;
14
15#[cfg(feature = "offline")]
16mod supertonic;
17
18pub use aliyun::AliyunTtsClient;
19pub use deepgram::DeepegramTtsClient;
20pub use msedge::MsEdgeTtsClient;
21pub use tencent_cloud::TencentCloudTtsClient;
22pub use tencent_cloud_basic::TencentCloudTtsBasicClient;
23
24#[cfg(feature = "offline")]
25pub use supertonic::SupertonicTtsClient;
26
27#[derive(Clone, Default)]
28pub struct SynthesisCommand {
29    pub text: String,
30    pub speaker: Option<String>,
31    pub play_id: Option<String>,
32    pub streaming: bool,
33    pub end_of_stream: bool,
34    pub option: SynthesisOption,
35    pub base64: bool,
36}
37pub type SynthesisCommandSender = mpsc::UnboundedSender<SynthesisCommand>;
38pub type SynthesisCommandReceiver = mpsc::UnboundedReceiver<SynthesisCommand>;
39
40#[derive(Debug, Clone, Serialize, Hash, Eq, PartialEq)]
41pub enum SynthesisType {
42    #[serde(rename = "tencent")]
43    TencentCloud,
44    #[serde(rename = "aliyun")]
45    Aliyun,
46    #[serde(rename = "deepgram")]
47    Deepgram,
48    #[serde(rename = "msedge")]
49    MsEdge,
50    #[cfg(feature = "offline")]
51    #[serde(rename = "supertonic")]
52    Supertonic,
53    #[serde(rename = "other")]
54    Other(String),
55}
56
57impl std::fmt::Display for SynthesisType {
58    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59        match self {
60            SynthesisType::TencentCloud => write!(f, "tencent"),
61            SynthesisType::Aliyun => write!(f, "aliyun"),
62            SynthesisType::Deepgram => write!(f, "deepgram"),
63            SynthesisType::MsEdge => write!(f, "msedge"),
64            #[cfg(feature = "offline")]
65            SynthesisType::Supertonic => write!(f, "supertonic"),
66            SynthesisType::Other(provider) => write!(f, "{}", provider),
67        }
68    }
69}
70
71impl<'de> Deserialize<'de> for SynthesisType {
72    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
73    where
74        D: serde::Deserializer<'de>,
75    {
76        let value = String::deserialize(deserializer)?;
77        match value.as_str() {
78            "tencent" => Ok(SynthesisType::TencentCloud),
79            "aliyun" => Ok(SynthesisType::Aliyun),
80            "deepgram" => Ok(SynthesisType::Deepgram),
81            "msedge" => Ok(SynthesisType::MsEdge),
82            #[cfg(feature = "offline")]
83            "supertonic" => Ok(SynthesisType::Supertonic),
84            _ => Ok(SynthesisType::Other(value)),
85        }
86    }
87}
88
89#[cfg(test)]
90mod tests;
91#[derive(Debug, Clone, Deserialize, Serialize)]
92#[serde(rename_all = "camelCase")]
93#[serde(default)]
94pub struct SynthesisOption {
95    pub samplerate: Option<i32>,
96    pub provider: Option<SynthesisType>,
97    pub speed: Option<f32>,
98    pub app_id: Option<String>,
99    pub secret_id: Option<String>,
100    pub secret_key: Option<String>,
101    pub volume: Option<i32>,
102    pub speaker: Option<String>,
103    pub codec: Option<String>,
104    pub subtitle: Option<bool>,
105    pub model: Option<String>,
106    pub language: Option<String>,
107    /// emotion: neutral、sad、happy、angry、fear、news、story、radio、poetry、
108    /// call、sajiao、disgusted、amaze、peaceful、exciting、aojiao、jieshuo
109    pub emotion: Option<String>,
110    pub endpoint: Option<String>,
111    pub extra: Option<HashMap<String, String>>,
112    pub max_concurrent_tasks: Option<usize>,
113}
114
115impl SynthesisOption {
116    pub fn merge_with(&self, option: Option<SynthesisOption>) -> Self {
117        if let Some(other) = option {
118            Self {
119                samplerate: other.samplerate.or(self.samplerate),
120                provider: other.provider.or(self.provider.clone()),
121                speed: other.speed.or(self.speed),
122                app_id: other.app_id.or(self.app_id.clone()),
123                secret_id: other.secret_id.or(self.secret_id.clone()),
124                secret_key: other.secret_key.or(self.secret_key.clone()),
125                volume: other.volume.or(self.volume),
126                speaker: other.speaker.or(self.speaker.clone()),
127                codec: other.codec.or(self.codec.clone()),
128                subtitle: other.subtitle.or(self.subtitle),
129                model: other.model.or(self.model.clone()),
130                language: other.language.or(self.language.clone()),
131                emotion: other.emotion.or(self.emotion.clone()),
132                endpoint: other.endpoint.or(self.endpoint.clone()),
133                extra: other.extra.or(self.extra.clone()),
134                max_concurrent_tasks: other.max_concurrent_tasks.or(self.max_concurrent_tasks),
135            }
136        } else {
137            self.clone()
138        }
139    }
140}
141
142#[derive(Debug)]
143pub enum SynthesisEvent {
144    /// Raw audio data chunk
145    AudioChunk(Bytes),
146    /// Progress information including completion status
147    Subtitles(Vec<Subtitle>),
148    Finished,
149}
150
/// One subtitle entry: a piece of text together with its time span and index
/// span.
///
/// NOTE(review): the unit of the time fields (presumably milliseconds) and the
/// unit of the index fields (bytes vs. characters of the source text) are not
/// visible here — confirm against the providers that build these values.
#[derive(Debug, Clone)]
pub struct Subtitle {
    /// Text covered by this entry.
    pub text: String,
    /// Start of the time span.
    pub begin_time: u32,
    /// End of the time span.
    pub end_time: u32,
    /// Start of the index span.
    pub begin_index: u32,
    /// End of the index span.
    pub end_index: u32,
}

impl Subtitle {
    /// Builds a [`Subtitle`], storing every argument in the field of the same
    /// name without any validation.
    pub fn new(
        text: String,
        begin_time: u32,
        end_time: u32,
        begin_index: u32,
        end_index: u32,
    ) -> Self {
        Subtitle {
            text,
            begin_time,
            end_time,
            begin_index,
            end_index,
        }
    }
}
177
/// Calculates an audio duration from a byte count and a sample rate.
///
/// The result is `bytes * 500 / sample_rate`, truncated toward zero. That is
/// the duration in milliseconds when each sample occupies two bytes on a
/// single channel.
///
/// The computation uses integer arithmetic so the result is exact; values that
/// do not fit in a `u32` saturate at `u32::MAX`.
///
/// A `sample_rate` of zero does not panic: it yields `0` when `bytes` is also
/// zero and `u32::MAX` otherwise.
pub fn bytes_size_to_duration(bytes: usize, sample_rate: u32) -> u32 {
    if sample_rate == 0 {
        // Same results a float division by zero produces once cast to `u32`:
        // NaN (0/0) casts to 0, +infinity saturates.
        return if bytes == 0 { 0 } else { u32::MAX };
    }
    // `u128` cannot overflow here: `usize::MAX * 500` is far below `u128::MAX`.
    let millis = bytes as u128 * 500 / u128::from(sample_rate);
    millis.min(u128::from(u32::MAX)) as u32
}
182
183#[async_trait]
184pub trait SynthesisClient: Send {
185    // provider of the synthesis client.
186    fn provider(&self) -> SynthesisType;
187
188    // connect to the synthesis service.
189    // (cmd_seq, result), return the cmd_seq that passed from `synthesize`
190    async fn start(
191        &mut self,
192    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>>;
193
194    // send text to the synthesis service.
195    // `cmd_seq` and `option` are used for non streaming mode
196    // for streaming mode, `cmd_seq` and `option` are None
197    async fn synthesize(
198        &mut self,
199        text: &str,
200        cmd_seq: Option<usize>,
201        option: Option<SynthesisOption>,
202    ) -> Result<()>;
203
204    async fn stop(&mut self) -> Result<()>;
205}
206
207impl Default for SynthesisOption {
208    fn default() -> Self {
209        Self {
210            samplerate: Some(16000),
211            provider: None,
212            speed: Some(1.0),
213            app_id: None,
214            secret_id: None,
215            secret_key: None,
216            volume: Some(5), // 0-10
217            speaker: None,
218            codec: Some("pcm".to_string()),
219            subtitle: None,
220            model: None,
221            language: None,
222            emotion: None,
223            endpoint: None,
224            extra: None,
225            max_concurrent_tasks: None,
226        }
227    }
228}
229
230impl SynthesisOption {
231    pub fn check_default(&mut self) {
232        if let Some(provider) = &self.provider {
233            match provider.to_string().as_str() {
234                "tencent" | "tencent_basic" => {
235                    if self.app_id.is_none() {
236                        self.app_id = std::env::var("TENCENT_APPID").ok();
237                    }
238                    if self.secret_id.is_none() {
239                        self.secret_id = std::env::var("TENCENT_SECRET_ID").ok();
240                    }
241                    if self.secret_key.is_none() {
242                        self.secret_key = std::env::var("TENCENT_SECRET_KEY").ok();
243                    }
244                }
245                "voiceapi" => {
246                    // Set the endpoint from environment variable if not already set
247                    if self.endpoint.is_none() {
248                        self.endpoint = std::env::var("VOICEAPI_ENDPOINT")
249                            .ok()
250                            .or_else(|| Some("http://localhost:8000".to_string()));
251                    }
252                    // Set speaker ID from environment variable if not already set
253                    if self.speaker.is_none() {
254                        self.speaker = std::env::var("VOICEAPI_SPEAKER_ID")
255                            .ok()
256                            .or_else(|| Some("0".to_string()));
257                    }
258                }
259                "aliyun" => {
260                    if self.secret_key.is_none() {
261                        self.secret_key = std::env::var("DASHSCOPE_API_KEY").ok();
262                    }
263                }
264                "deepgram" => {
265                    if self.secret_key.is_none() {
266                        self.secret_key = std::env::var("DEEPGRAM_API_KEY").ok();
267                    }
268                }
269                _ => {}
270            }
271        }
272    }
273}