active_call/synthesis/
mod.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use bytes::Bytes;
4use futures::stream::BoxStream;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use tokio::sync::mpsc;
8
9mod aliyun;
10mod deepgram;
11mod tencent_cloud;
12mod tencent_cloud_basic;
13
14#[cfg(feature = "offline")]
15mod supertonic;
16
17pub use aliyun::AliyunTtsClient;
18pub use deepgram::DeepegramTtsClient;
19pub use tencent_cloud::TencentCloudTtsClient;
20pub use tencent_cloud_basic::TencentCloudTtsBasicClient;
21
22#[cfg(feature = "offline")]
23pub use supertonic::SupertonicTtsClient;
24
25#[derive(Clone, Default)]
26pub struct SynthesisCommand {
27    pub text: String,
28    pub speaker: Option<String>,
29    pub play_id: Option<String>,
30    pub streaming: bool,
31    pub end_of_stream: bool,
32    pub option: SynthesisOption,
33    pub base64: bool,
34}
35pub type SynthesisCommandSender = mpsc::UnboundedSender<SynthesisCommand>;
36pub type SynthesisCommandReceiver = mpsc::UnboundedReceiver<SynthesisCommand>;
37
38#[derive(Debug, Clone, Serialize, Hash, Eq, PartialEq)]
39pub enum SynthesisType {
40    #[serde(rename = "tencent")]
41    TencentCloud,
42    #[serde(rename = "aliyun")]
43    Aliyun,
44    #[serde(rename = "deepgram")]
45    Deepgram,
46    #[cfg(feature = "offline")]
47    #[serde(rename = "supertonic")]
48    Supertonic,
49    #[serde(rename = "other")]
50    Other(String),
51}
52
53impl std::fmt::Display for SynthesisType {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        match self {
56            SynthesisType::TencentCloud => write!(f, "tencent"),
57            SynthesisType::Aliyun => write!(f, "aliyun"),
58            SynthesisType::Deepgram => write!(f, "deepgram"),
59            #[cfg(feature = "offline")]
60            SynthesisType::Supertonic => write!(f, "supertonic"),
61            SynthesisType::Other(provider) => write!(f, "{}", provider),
62        }
63    }
64}
65
66impl<'de> Deserialize<'de> for SynthesisType {
67    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
68    where
69        D: serde::Deserializer<'de>,
70    {
71        let value = String::deserialize(deserializer)?;
72        match value.as_str() {
73            "tencent" => Ok(SynthesisType::TencentCloud),
74            "aliyun" => Ok(SynthesisType::Aliyun),
75            "deepgram" => Ok(SynthesisType::Deepgram),
76            #[cfg(feature = "offline")]
77            "supertonic" => Ok(SynthesisType::Supertonic),
78            _ => Ok(SynthesisType::Other(value)),
79        }
80    }
81}
82
83#[cfg(test)]
84mod tests;
85#[derive(Debug, Clone, Deserialize, Serialize)]
86#[serde(rename_all = "camelCase")]
87#[serde(default)]
88pub struct SynthesisOption {
89    pub samplerate: Option<i32>,
90    pub provider: Option<SynthesisType>,
91    pub speed: Option<f32>,
92    pub app_id: Option<String>,
93    pub secret_id: Option<String>,
94    pub secret_key: Option<String>,
95    pub volume: Option<i32>,
96    pub speaker: Option<String>,
97    pub codec: Option<String>,
98    pub subtitle: Option<bool>,
99    pub model: Option<String>,
100    pub language: Option<String>,
101    /// emotion: neutral、sad、happy、angry、fear、news、story、radio、poetry、
102    /// call、sajiao、disgusted、amaze、peaceful、exciting、aojiao、jieshuo
103    pub emotion: Option<String>,
104    pub endpoint: Option<String>,
105    pub extra: Option<HashMap<String, String>>,
106    pub max_concurrent_tasks: Option<usize>,
107}
108
109impl SynthesisOption {
110    pub fn merge_with(&self, option: Option<SynthesisOption>) -> Self {
111        if let Some(other) = option {
112            Self {
113                samplerate: other.samplerate.or(self.samplerate),
114                provider: other.provider.or(self.provider.clone()),
115                speed: other.speed.or(self.speed),
116                app_id: other.app_id.or(self.app_id.clone()),
117                secret_id: other.secret_id.or(self.secret_id.clone()),
118                secret_key: other.secret_key.or(self.secret_key.clone()),
119                volume: other.volume.or(self.volume),
120                speaker: other.speaker.or(self.speaker.clone()),
121                codec: other.codec.or(self.codec.clone()),
122                subtitle: other.subtitle.or(self.subtitle),
123                model: other.model.or(self.model.clone()),
124                language: other.language.or(self.language.clone()),
125                emotion: other.emotion.or(self.emotion.clone()),
126                endpoint: other.endpoint.or(self.endpoint.clone()),
127                extra: other.extra.or(self.extra.clone()),
128                max_concurrent_tasks: other.max_concurrent_tasks.or(self.max_concurrent_tasks),
129            }
130        } else {
131            self.clone()
132        }
133    }
134}
135
136#[derive(Debug)]
137pub enum SynthesisEvent {
138    /// Raw audio data chunk
139    AudioChunk(Bytes),
140    /// Progress information including completion status
141    Subtitles(Vec<Subtitle>),
142    Finished,
143}
144
/// A piece of text paired with a begin/end time and a begin/end index.
///
/// NOTE(review): units are not established in this module — the times are
/// presumably milliseconds and the indices presumably offsets into the
/// synthesized text; confirm with the code that produces these values.
#[derive(Debug, Clone)]
pub struct Subtitle {
    /// Text covered by this entry.
    pub text: String,
    /// Time at which the entry begins.
    pub begin_time: u32,
    /// Time at which the entry ends.
    pub end_time: u32,
    /// Index at which the entry begins.
    pub begin_index: u32,
    /// Index at which the entry ends.
    pub end_index: u32,
}

impl Subtitle {
    /// Builds a `Subtitle` from its five fields, stored exactly as given.
    pub fn new(
        text: String,
        begin_time: u32,
        end_time: u32,
        begin_index: u32,
        end_index: u32,
    ) -> Self {
        Subtitle {
            text,
            begin_time,
            end_time,
            begin_index,
            end_index,
        }
    }
}
171
/// Calculates an audio duration from a byte count and a sample rate.
///
/// The result is `500 * bytes / sample_rate`, truncated toward zero. Since
/// 500 = 1000 / 2, this is a duration in milliseconds when the audio uses two
/// bytes per sample frame (e.g. 16-bit mono PCM); the function itself does
/// not check the audio format.
///
/// The computation uses integer arithmetic so that large buffers are not
/// rounded (an `f32` cannot represent every byte count above 2^24).
///
/// * A `sample_rate` of 0 yields 0 instead of dividing by zero.
/// * Results that do not fit in `u32` saturate at `u32::MAX`.
pub fn bytes_size_to_duration(bytes: usize, sample_rate: u32) -> u32 {
    if sample_rate == 0 {
        // No meaningful duration exists without a sample rate.
        return 0;
    }
    // u128 cannot overflow here: usize::MAX * 500 is far below u128::MAX.
    let millis = (bytes as u128) * 500 / u128::from(sample_rate);
    millis.min(u128::from(u32::MAX)) as u32
}
176
177#[async_trait]
178pub trait SynthesisClient: Send {
179    // provider of the synthesis client.
180    fn provider(&self) -> SynthesisType;
181
182    // connect to the synthesis service.
183    // (cmd_seq, result), return the cmd_seq that passed from `synthesize`
184    async fn start(
185        &mut self,
186    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>>;
187
188    // send text to the synthesis service.
189    // `cmd_seq` and `option` are used for non streaming mode
190    // for streaming mode, `cmd_seq` and `option` are None
191    async fn synthesize(
192        &mut self,
193        text: &str,
194        cmd_seq: Option<usize>,
195        option: Option<SynthesisOption>,
196    ) -> Result<()>;
197
198    async fn stop(&mut self) -> Result<()>;
199}
200
201impl Default for SynthesisOption {
202    fn default() -> Self {
203        Self {
204            samplerate: Some(16000),
205            provider: None,
206            speed: Some(1.0),
207            app_id: None,
208            secret_id: None,
209            secret_key: None,
210            volume: Some(5), // 0-10
211            speaker: None,
212            codec: Some("pcm".to_string()),
213            subtitle: None,
214            model: None,
215            language: None,
216            emotion: None,
217            endpoint: None,
218            extra: None,
219            max_concurrent_tasks: None,
220        }
221    }
222}
223
224impl SynthesisOption {
225    pub fn check_default(&mut self) {
226        if let Some(provider) = &self.provider {
227            match provider.to_string().as_str() {
228                "tencent" | "tencent_basic" => {
229                    if self.app_id.is_none() {
230                        self.app_id = std::env::var("TENCENT_APPID").ok();
231                    }
232                    if self.secret_id.is_none() {
233                        self.secret_id = std::env::var("TENCENT_SECRET_ID").ok();
234                    }
235                    if self.secret_key.is_none() {
236                        self.secret_key = std::env::var("TENCENT_SECRET_KEY").ok();
237                    }
238                }
239                "voiceapi" => {
240                    // Set the endpoint from environment variable if not already set
241                    if self.endpoint.is_none() {
242                        self.endpoint = std::env::var("VOICEAPI_ENDPOINT")
243                            .ok()
244                            .or_else(|| Some("http://localhost:8000".to_string()));
245                    }
246                    // Set speaker ID from environment variable if not already set
247                    if self.speaker.is_none() {
248                        self.speaker = std::env::var("VOICEAPI_SPEAKER_ID")
249                            .ok()
250                            .or_else(|| Some("0".to_string()));
251                    }
252                }
253                "aliyun" => {
254                    if self.secret_key.is_none() {
255                        self.secret_key = std::env::var("DASHSCOPE_API_KEY").ok();
256                    }
257                }
258                "deepgram" => {
259                    if self.secret_key.is_none() {
260                        self.secret_key = std::env::var("DEEPGRAM_API_KEY").ok();
261                    }
262                }
263                _ => {}
264            }
265        }
266    }
267}