Skip to main content

active_call/synthesis/
mod.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use bytes::Bytes;
4use futures::stream::BoxStream;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use tokio::sync::mpsc;
8
9mod aliyun;
10mod deepgram;
11mod tencent_cloud;
12mod tencent_cloud_basic;
13
14#[cfg(feature = "offline")]
15mod supertonic;
16
17pub use aliyun::AliyunTtsClient;
18pub use deepgram::DeepegramTtsClient;
19pub use tencent_cloud::TencentCloudTtsClient;
20pub use tencent_cloud_basic::TencentCloudTtsBasicClient;
21
22#[cfg(feature = "offline")]
23pub use supertonic::SupertonicTtsClient;
24
/// One unit of work handed to the text-to-speech pipeline.
///
/// Derives `Default`, so `option` starts from `SynthesisOption::default()`,
/// the flags start as `false`, `text` starts empty and the optional fields
/// start as `None`.
#[derive(Clone, Default)]
pub struct SynthesisCommand {
    /// Payload of the command (presumably the text to speak; see `base64`).
    pub text: String,
    /// Optional speaker for this command.
    /// NOTE(review): how this interacts with `option.speaker` is not visible here.
    pub speaker: Option<String>,
    /// Optional playback identifier (presumably used to correlate playback events; TODO confirm).
    pub play_id: Option<String>,
    /// Streaming-mode flag; its exact effect is defined by the consumer of the command.
    pub streaming: bool,
    /// End-of-stream flag (presumably marks the last command of a streaming session; TODO confirm).
    pub end_of_stream: bool,
    /// Synthesis options that accompany this command.
    pub option: SynthesisOption,
    /// NOTE(review): presumably signals that `text` carries base64-encoded data; confirm with the consumer.
    pub base64: bool,
    /// Optional cache key (presumably used to look up or store synthesized audio; TODO confirm).
    pub cache_key: Option<String>,
}
/// Sending half of an unbounded Tokio mpsc channel carrying [`SynthesisCommand`]s.
pub type SynthesisCommandSender = mpsc::UnboundedSender<SynthesisCommand>;
/// Receiving half of an unbounded Tokio mpsc channel carrying [`SynthesisCommand`]s.
pub type SynthesisCommandReceiver = mpsc::UnboundedReceiver<SynthesisCommand>;
38
/// Identifies the text-to-speech provider behind a synthesis client.
///
/// Built-in providers are unit variants; any other provider name is carried
/// verbatim in [`SynthesisType::Other`].
///
/// NOTE(review): the derived `Serialize` writes unit variants as their renamed
/// string, but writes `Other` as an externally tagged newtype variant
/// (e.g. `{"other": "name"}` in JSON), whereas the hand-written `Deserialize`
/// impl in this file only accepts a plain string. Confirm whether `Other` is
/// expected to round-trip.
#[derive(Debug, Clone, Serialize, Hash, Eq, PartialEq)]
pub enum SynthesisType {
    /// Serialized and displayed as `"tencent"`.
    #[serde(rename = "tencent")]
    TencentCloud,
    /// Serialized and displayed as `"aliyun"`.
    #[serde(rename = "aliyun")]
    Aliyun,
    /// Serialized and displayed as `"deepgram"`.
    #[serde(rename = "deepgram")]
    Deepgram,
    /// Serialized and displayed as `"supertonic"`; only compiled with the
    /// `offline` feature.
    #[cfg(feature = "offline")]
    #[serde(rename = "supertonic")]
    Supertonic,
    /// Any provider that has no dedicated variant; holds the provider name.
    #[serde(rename = "other")]
    Other(String),
}
53
54impl std::fmt::Display for SynthesisType {
55    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56        match self {
57            SynthesisType::TencentCloud => write!(f, "tencent"),
58            SynthesisType::Aliyun => write!(f, "aliyun"),
59            SynthesisType::Deepgram => write!(f, "deepgram"),
60            #[cfg(feature = "offline")]
61            SynthesisType::Supertonic => write!(f, "supertonic"),
62            SynthesisType::Other(provider) => write!(f, "{}", provider),
63        }
64    }
65}
66
67impl<'de> Deserialize<'de> for SynthesisType {
68    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
69    where
70        D: serde::Deserializer<'de>,
71    {
72        let value = String::deserialize(deserializer)?;
73        match value.as_str() {
74            "tencent" => Ok(SynthesisType::TencentCloud),
75            "aliyun" => Ok(SynthesisType::Aliyun),
76            "deepgram" => Ok(SynthesisType::Deepgram),
77            #[cfg(feature = "offline")]
78            "supertonic" => Ok(SynthesisType::Supertonic),
79            _ => Ok(SynthesisType::Other(value)),
80        }
81    }
82}
83
84#[cfg(test)]
85mod tests;
/// Tunable parameters for a synthesis request.
///
/// Serialized with camelCase keys (e.g. `appId`, `maxConcurrentTasks`).
/// Because of the container-level `#[serde(default)]`, keys missing from the
/// input take their value from this type's `Default` impl rather than from
/// each field type's own default.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
#[serde(default)]
pub struct SynthesisOption {
    /// Output sample rate; the `Default` impl uses 16000 (presumably Hz).
    pub samplerate: Option<i32>,
    /// Provider to use; `check_default` keys its environment lookups off this.
    pub provider: Option<SynthesisType>,
    /// Speaking speed; the `Default` impl uses 1.0.
    pub speed: Option<f32>,
    /// Provider application id; for the Tencent providers `check_default`
    /// falls back to the `TENCENT_APPID` environment variable.
    pub app_id: Option<String>,
    /// Provider secret id; for the Tencent providers `check_default` falls
    /// back to `TENCENT_SECRET_ID`.
    pub secret_id: Option<String>,
    /// Provider secret / API key; `check_default` falls back to
    /// `TENCENT_SECRET_KEY`, `DASHSCOPE_API_KEY` or `DEEPGRAM_API_KEY`
    /// depending on the provider.
    pub secret_key: Option<String>,
    /// Output volume; the `Default` impl uses 5 and notes a 0-10 range.
    pub volume: Option<i32>,
    /// Speaker / voice identifier; interpretation is provider-specific.
    pub speaker: Option<String>,
    /// Audio codec name; the `Default` impl uses `"pcm"`.
    pub codec: Option<String>,
    /// Subtitle switch (presumably requests subtitle events; TODO confirm).
    pub subtitle: Option<bool>,
    /// Provider model name, if any.
    pub model: Option<String>,
    /// Language to synthesize in; accepted values are provider-specific.
    pub language: Option<String>,
    /// emotion: neutral, sad, happy, angry, fear, news, story, radio, poetry,
    /// call, sajiao, disgusted, amaze, peaceful, exciting, aojiao, jieshuo
    pub emotion: Option<String>,
    /// Service endpoint override; for the `voiceapi` provider `check_default`
    /// falls back to `VOICEAPI_ENDPOINT`, then `http://localhost:8000`.
    pub endpoint: Option<String>,
    /// Free-form extra key/value parameters; consumers are not visible here.
    pub extra: Option<HashMap<String, String>>,
    /// Upper bound on concurrent tasks (presumably per client; TODO confirm).
    pub max_concurrent_tasks: Option<usize>,
    /// Session identifier, if any; its use is not visible in this file.
    pub session_id: Option<String>,
}
110
111impl SynthesisOption {
112    pub fn merge_with(&self, option: Option<SynthesisOption>) -> Self {
113        if let Some(other) = option {
114            Self {
115                samplerate: other.samplerate.or(self.samplerate),
116                provider: other.provider.or(self.provider.clone()),
117                speed: other.speed.or(self.speed),
118                app_id: other.app_id.or(self.app_id.clone()),
119                secret_id: other.secret_id.or(self.secret_id.clone()),
120                secret_key: other.secret_key.or(self.secret_key.clone()),
121                volume: other.volume.or(self.volume),
122                speaker: other.speaker.or(self.speaker.clone()),
123                codec: other.codec.or(self.codec.clone()),
124                subtitle: other.subtitle.or(self.subtitle),
125                model: other.model.or(self.model.clone()),
126                language: other.language.or(self.language.clone()),
127                emotion: other.emotion.or(self.emotion.clone()),
128                endpoint: other.endpoint.or(self.endpoint.clone()),
129                extra: other.extra.or(self.extra.clone()),
130                max_concurrent_tasks: other.max_concurrent_tasks.or(self.max_concurrent_tasks),
131                session_id: other.session_id.or(self.session_id.clone()),
132            }
133        } else {
134            self.clone()
135        }
136    }
137}
138
/// Item produced by a synthesis client while it is running.
#[derive(Debug)]
pub enum SynthesisEvent {
    /// Raw audio data chunk
    AudioChunk(Bytes),
    /// A batch of [`Subtitle`] entries.
    /// NOTE(review): the previous comment described this as progress
    /// information including completion status; confirm the intended meaning.
    Subtitles(Vec<Subtitle>),
    /// Completion marker (presumably emitted once per finished synthesis; TODO confirm).
    Finished,
}
147
/// A piece of text together with a time span and an index span.
///
/// NOTE(review): the units of the time fields and what the index fields
/// index into are not visible in this file; confirm against the providers.
#[derive(Debug, Clone)]
pub struct Subtitle {
    /// The subtitle text.
    pub text: String,
    /// Start of the time span.
    pub begin_time: u32,
    /// End of the time span.
    pub end_time: u32,
    /// Start of the index span.
    pub begin_index: u32,
    /// End of the index span.
    pub end_index: u32,
}

impl Subtitle {
    /// Builds a `Subtitle` from its five parts, stored unchanged.
    pub fn new(
        text: String,
        begin_time: u32,
        end_time: u32,
        begin_index: u32,
        end_index: u32,
    ) -> Self {
        Subtitle {
            text: text,
            begin_time: begin_time,
            end_time: end_time,
            begin_index: begin_index,
            end_index: end_index,
        }
    }
}
174
/// Converts an audio buffer size into a duration.
///
/// The result is `500 * bytes / sample_rate`, rounded down. The factor 500
/// presumably corresponds to 1000 ms divided by 2 bytes per sample, i.e.
/// 16-bit mono PCM with the result in milliseconds; confirm against callers.
///
/// * `bytes` — size of the audio data in bytes.
/// * `sample_rate` — samples per second.
///
/// The computation uses integer arithmetic, so it stays exact for buffers
/// whose size does not fit the 24-bit mantissa of an `f32`. Results that do
/// not fit in `u32` saturate to `u32::MAX`. A `sample_rate` of 0 yields 0 for
/// an empty buffer and `u32::MAX` otherwise, matching the saturating
/// behaviour of the earlier floating-point implementation.
pub fn bytes_size_to_duration(bytes: usize, sample_rate: u32) -> u32 {
    if sample_rate == 0 {
        // No meaningful duration exists; avoid an integer division by zero.
        return if bytes == 0 { 0 } else { u32::MAX };
    }
    // u128 cannot overflow here, even for `usize::MAX * 500`.
    let duration = (bytes as u128) * 500 / u128::from(sample_rate);
    duration.min(u128::from(u32::MAX)) as u32
}
179
/// Interface implemented by every text-to-speech backend.
#[async_trait]
pub trait SynthesisClient: Send {
    /// Returns the provider of the synthesis client.
    fn provider(&self) -> SynthesisType;

    /// Connects to the synthesis service and returns the stream of results.
    ///
    /// Each stream item is `(cmd_seq, result)`, where `cmd_seq` is the value
    /// that was passed to `synthesize` and `result` is either a
    /// [`SynthesisEvent`] or the error that occurred.
    async fn start(
        &mut self,
    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>>;

    /// Sends text to the synthesis service.
    ///
    /// `cmd_seq` and `option` are used in non-streaming mode; in streaming
    /// mode both are `None`.
    async fn synthesize(
        &mut self,
        text: &str,
        cmd_seq: Option<usize>,
        option: Option<SynthesisOption>,
    ) -> Result<()>;

    /// Stops the client.
    /// NOTE(review): whether a stopped client may be started again is not
    /// visible here; confirm against the implementations.
    async fn stop(&mut self) -> Result<()>;
}
203
204impl Default for SynthesisOption {
205    fn default() -> Self {
206        Self {
207            samplerate: Some(16000),
208            provider: None,
209            speed: Some(1.0),
210            app_id: None,
211            secret_id: None,
212            secret_key: None,
213            volume: Some(5), // 0-10
214            speaker: None,
215            codec: Some("pcm".to_string()),
216            subtitle: None,
217            model: None,
218            language: None,
219            emotion: None,
220            endpoint: None,
221            extra: None,
222            max_concurrent_tasks: None,
223            session_id: None,
224        }
225    }
226}
227
228impl SynthesisOption {
229    pub fn check_default(&mut self) {
230        if let Some(provider) = &self.provider {
231            match provider.to_string().as_str() {
232                "tencent" | "tencent_basic" => {
233                    if self.app_id.is_none() {
234                        self.app_id = std::env::var("TENCENT_APPID").ok();
235                    }
236                    if self.secret_id.is_none() {
237                        self.secret_id = std::env::var("TENCENT_SECRET_ID").ok();
238                    }
239                    if self.secret_key.is_none() {
240                        self.secret_key = std::env::var("TENCENT_SECRET_KEY").ok();
241                    }
242                }
243                "voiceapi" => {
244                    // Set the endpoint from environment variable if not already set
245                    if self.endpoint.is_none() {
246                        self.endpoint = std::env::var("VOICEAPI_ENDPOINT")
247                            .ok()
248                            .or_else(|| Some("http://localhost:8000".to_string()));
249                    }
250                    // Set speaker ID from environment variable if not already set
251                    if self.speaker.is_none() {
252                        self.speaker = std::env::var("VOICEAPI_SPEAKER_ID")
253                            .ok()
254                            .or_else(|| Some("0".to_string()));
255                    }
256                }
257                "aliyun" => {
258                    if self.secret_key.is_none() {
259                        self.secret_key = std::env::var("DASHSCOPE_API_KEY").ok();
260                    }
261                }
262                "deepgram" => {
263                    if self.secret_key.is_none() {
264                        self.secret_key = std::env::var("DEEPGRAM_API_KEY").ok();
265                    }
266                }
267                _ => {}
268            }
269        }
270    }
271}