msedge_tts/tts/
mod.rs

1//! TTS Client and Stream, SpeechConfig, Response Type.
2
3pub mod client;
4pub mod stream;
5
6mod proxy;
7use crate::error::{Error, ProxyError, Result};
8use proxy::{
9    http_proxy, http_proxy_async, socks4_proxy, socks4_proxy_async, socks5_proxy,
10    socks5_proxy_asnyc, ProxyAsyncStream, ProxyStream,
11};
12
13use sha2::Digest;
14
15/// Synthesis Config
16#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
17pub struct SpeechConfig {
18    pub voice_name: String,
19    /// should be one of Streaming or NonStreaming audio output formats.
20    ///
21    /// Streaming audio output formats:
22    /// + amr-wb-16000hz
23    /// + audio-16khz-16bit-32kbps-mono-opus
24    /// + audio-16khz-32kbitrate-mono-mp3
25    /// + audio-16khz-64kbitrate-mono-mp3
26    /// + audio-16khz-128kbitrate-mono-mp3
27    /// + audio-24khz-16bit-24kbps-mono-opus
28    /// + audio-24khz-16bit-48kbps-mono-opus
29    /// + audio-24khz-48kbitrate-mono-mp3
30    /// + audio-24khz-96kbitrate-mono-mp3
31    /// + audio-24khz-160kbitrate-mono-mp3
32    /// + audio-48khz-96kbitrate-mono-mp3
33    /// + audio-48khz-192kbitrate-mono-mp3
34    /// + ogg-16khz-16bit-mono-opus
35    /// + ogg-24khz-16bit-mono-opus
36    /// + ogg-48khz-16bit-mono-opus
37    /// + raw-8khz-8bit-mono-alaw
38    /// + raw-8khz-8bit-mono-mulaw
39    /// + raw-8khz-16bit-mono-pcm
40    /// + raw-16khz-16bit-mono-pcm
41    /// + raw-16khz-16bit-mono-truesilk
42    /// + raw-22050hz-16bit-mono-pcm
43    /// + raw-24khz-16bit-mono-pcm
44    /// + raw-24khz-16bit-mono-truesilk
45    /// + raw-44100hz-16bit-mono-pcm
46    /// + raw-48khz-16bit-mono-pcm
47    /// + webm-16khz-16bit-mono-opus
48    /// + webm-24khz-16bit-24kbps-mono-opus
49    /// + webm-24khz-16bit-mono-opus
50    ///
51    /// NonStreaming audio output formats:
52    /// + riff-8khz-8bit-mono-alaw
53    /// + riff-8khz-8bit-mono-mulaw
54    /// + riff-8khz-16bit-mono-pcm
55    /// + riff-22050hz-16bit-mono-pcm
56    /// + riff-24khz-16bit-mono-pcm
57    /// + riff-44100hz-16bit-mono-pcm
58    /// + riff-48khz-16bit-mono-pcm
59    pub audio_format: String,
60    pub pitch: i32,
61    pub rate: i32,
62    pub volume: i32,
63}
64
65impl From<&super::voice::Voice> for SpeechConfig {
66    fn from(voice: &super::voice::Voice) -> Self {
67        let audio_output_format = if let Some(ref output_format) = voice.suggested_codec {
68            output_format.clone()
69        } else {
70            "audio-24khz-48kbitrate-mono-mp3".to_string()
71        };
72        Self {
73            voice_name: voice.name.clone(),
74            audio_format: audio_output_format,
75            pitch: 0,
76            rate: 0,
77            volume: 0,
78        }
79    }
80}
81
/// One entry of the `Metadata` array carried by an `audio.metadata` message.
///
/// Derives `Clone`, `PartialEq` and `Eq` so callers can keep copies of entries and compare
/// them directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AudioMetadata {
    /// Value of the entry's `Type` field, if it is a string.
    pub metadata_type: Option<String>,
    /// Value of `Data.Offset`; `0` when absent or not an unsigned integer.
    /// NOTE(review): unit is not established in this module — confirm against the service docs.
    pub offset: u64,
    /// Value of `Data.Duration`; `0` when absent or not an unsigned integer.
    /// NOTE(review): unit is not established in this module — confirm against the service docs.
    pub duration: u64,
    /// Value of `Data.text.Text`, if it is a string.
    pub text: Option<String>,
    /// Value of `Data.text.Length`; `0` when absent or not an unsigned integer.
    pub length: u64,
    /// Value of `Data.text.BoundaryType`, if it is a string.
    pub boundary_type: Option<String>,
}
92
93impl AudioMetadata {
94    fn from_str(text: &str) -> Result<Vec<Self>> {
95        let value: serde_json::Value = serde_json::from_str(text)?;
96        if let Some(items) = value["Metadata"].as_array() {
97            let mut audio_metadata = Vec::new();
98            for item in items {
99                let metadata_type = item["Type"].as_str().map(|x| x.to_owned());
100                let offset = item["Data"]["Offset"].as_u64().unwrap_or(0);
101                let duration = item["Data"]["Duration"].as_u64().unwrap_or(0);
102                let text = item["Data"]["text"]["Text"].as_str().map(|x| x.to_owned());
103                let length = item["Data"]["text"]["Length"].as_u64().unwrap_or(0);
104                let boundary_type = item["Data"]["text"]["BoundaryType"]
105                    .as_str()
106                    .map(|x| x.to_owned());
107                audio_metadata.push(AudioMetadata {
108                    metadata_type,
109                    offset,
110                    duration,
111                    text,
112                    length,
113                    boundary_type,
114                });
115            }
116            Ok(audio_metadata)
117        } else {
118            Err(Error::UnexpectedMessage(format!(
119                "unexpected json text: {}",
120                text
121            )))
122        }
123    }
124}
125
126enum ProcessedMessage {
127    AudioBytes((tungstenite::Bytes, usize)),
128    AudioMetadata(Vec<AudioMetadata>),
129}
130
131fn process_message(
132    message: tungstenite::Message,
133    turn_start: &mut bool,
134    response: &mut bool,
135    turn_end: &mut bool,
136) -> Result<Option<ProcessedMessage>> {
137    match message {
138        tungstenite::Message::Text(text) => {
139            if text.contains("audio.metadata") {
140                if let Some(index) = text.find("\r\n\r\n") {
141                    let metadata = AudioMetadata::from_str(&text[index + 4..])?;
142                    Ok(Some(ProcessedMessage::AudioMetadata(metadata)))
143                } else {
144                    Ok(None)
145                }
146            } else if text.contains("turn.start") {
147                *turn_start = true;
148                Ok(None)
149            } else if text.contains("response") {
150                *response = true;
151                Ok(None)
152            } else if text.contains("turn.end") {
153                *turn_end = true;
154                Ok(None)
155            } else {
156                Err(Error::UnexpectedMessage(format!(
157                    "unexpected text message: {}",
158                    text
159                )))
160            }
161        }
162        tungstenite::Message::Binary(bytes) => {
163            if *turn_start || *response {
164                let header_len = u16::from_be_bytes([bytes[0], bytes[1]]) as usize;
165                Ok(Some(ProcessedMessage::AudioBytes((bytes, header_len + 2))))
166            } else {
167                Ok(None)
168            }
169        }
170        tungstenite::Message::Close(_) => {
171            *turn_end = true;
172            Ok(None)
173        }
174        _ => Err(Error::UnexpectedMessage(format!(
175            "unexpected message: {}",
176            message
177        ))),
178    }
179}
180
181// try to fix china mainland 403 forbidden issue
182// solution from:
183// https://github.com/rany2/edge-tts/issues/290#issuecomment-2464956570
184fn gen_sec_ms_gec() -> String {
185    // UTC time from 1601-01-01
186    let duration = std::time::SystemTime::now()
187        .duration_since(std::time::UNIX_EPOCH)
188        .unwrap()
189        + std::time::Duration::from_secs(11644473600);
190    let ticks = duration.as_nanos() / 100;
191    let ticks = ticks - ticks % 3_000_000_000;
192
193    let mut hasher = sha2::Sha256::new();
194    hasher.update(format!("{ticks}6A5AA1D4EAFF4E9FB37E23D68491D6F4"));
195    let hash_code = hasher.finalize();
196    let mut hex_str = String::new();
197    for &byte in hash_code.iter() {
198        hex_str.push_str(&format!("{:02X}", byte));
199    }
200    hex_str
201}
202
203fn build_websocket_request() -> Result<tungstenite::handshake::client::Request> {
204    use super::constants;
205    use tungstenite::client::IntoClientRequest;
206    use tungstenite::http::header;
207
208    let uuid = uuid::Uuid::new_v4().simple().to_string();
209    let sec_ms_gec = gen_sec_ms_gec();
210    let sec_ms_gec_version = "1-130.0.2849.68";
211    let mut request = format!(
212        "{}{}&Sec-MS-GEC={}&Sec-MS-GEC-Version={}",
213        constants::WSS_URL,
214        uuid,
215        sec_ms_gec,
216        sec_ms_gec_version
217    )
218    .into_client_request()?;
219    let headers = request.headers_mut();
220    headers.insert(
221        header::PRAGMA,
222        "no-cache"
223            .parse()
224            .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
225    );
226    headers.insert(
227        header::CACHE_CONTROL,
228        "no-cache"
229            .parse()
230            .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
231    );
232    headers.insert(
233        header::USER_AGENT,
234        constants::USER_AGENT
235            .parse()
236            .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
237    );
238    headers.insert(
239        header::ORIGIN,
240        constants::ORIGIN
241            .parse()
242            .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
243    );
244    Ok(request)
245}
246
247fn build_config_message(config: &SpeechConfig) -> tungstenite::Message {
248    static SPEECH_CONFIG_HEAD: &str = r#"{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":""#;
249    static SPEECH_CONFIG_TAIL: &str = r#""}}}}"#;
250    let speech_config_message = format!(
251        "X-Timestamp:{}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{}{}{}",
252        chrono::Local::now().to_rfc2822(),
253        SPEECH_CONFIG_HEAD,
254        config.audio_format,
255        SPEECH_CONFIG_TAIL
256    );
257    tungstenite::Message::Text(speech_config_message.into())
258}
259
260fn build_ssml_message(text: &str, config: &SpeechConfig) -> tungstenite::Message {
261    let ssml = format!(
262        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='{}'><prosody pitch='{:+}Hz' rate='{:+}%' volume='{:+}%'>{}</prosody></voice></speak>",
263        config.voice_name,
264        config.pitch,
265        config.rate,
266        config.volume,
267        text,
268    );
269    let ssml_message = format!(
270        "X-RequestId:{}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:{}\r\nPath:ssml\r\n\r\n{}",
271        uuid::Uuid::new_v4().simple(),
272        chrono::Local::now().to_rfc2822(),
273        ssml,
274    );
275    tungstenite::Message::Text(ssml_message.into())
276}
277
278type WebSocketStream<T> = tungstenite::WebSocket<tungstenite::stream::MaybeTlsStream<T>>;
279
280fn websocket_connect() -> Result<WebSocketStream<std::net::TcpStream>> {
281    let request = build_websocket_request()?;
282    let (websocket, _) = tungstenite::connect(request)?;
283    Ok(websocket)
284}
285
286fn websocket_connect_proxy(
287    proxy: http::Uri,
288    username: Option<&str>,
289    password: Option<&str>,
290) -> Result<WebSocketStream<ProxyStream>> {
291    use tungstenite::handshake::HandshakeError;
292
293    let request = build_websocket_request()?;
294    let stream: std::result::Result<ProxyStream, ProxyError> = match proxy.scheme_str() {
295        Some(scheme) => match scheme.to_lowercase().as_str() {
296            "socks4" | "socks4a" => {
297                socks4_proxy(request.uri().host().unwrap(), proxy, username).map_err(|e| e.into())
298            }
299            "socks5" | "socks5h" => {
300                socks5_proxy(request.uri().host().unwrap(), proxy, username, password)
301                    .map_err(|e| e.into())
302            }
303            "http" | "https" => {
304                http_proxy(request.uri().host().unwrap(), proxy, username, password)
305                    .map_err(|e| e.into())
306            }
307            _ => Err(ProxyError::NotSupportedScheme(proxy)),
308        },
309        None => http_proxy(request.uri().host().unwrap(), proxy, username, password)
310            .map_err(|e| e.into()),
311    };
312    let (websocket, _) = tungstenite::client_tls(request, stream?).map_err(|e| match e {
313        HandshakeError::Failure(e) => e,
314        HandshakeError::Interrupted(_) => panic!("Bug: blocking handshake not blocked"),
315    })?;
316    Ok(websocket)
317}
318
319type WebSocketStreamAsync<T> =
320    async_tungstenite::WebSocketStream<async_tungstenite::async_std::ClientStream<T>>;
321
322async fn websocket_connect_async() -> Result<WebSocketStreamAsync<async_std::net::TcpStream>> {
323    let request = build_websocket_request()?;
324    let (websocket, _) = async_tungstenite::async_std::connect_async(request).await?;
325    Ok(websocket)
326}
327
328async fn websocket_connect_proxy_async(
329    proxy: http::Uri,
330    username: Option<&str>,
331    password: Option<&str>,
332) -> Result<WebSocketStreamAsync<ProxyAsyncStream>> {
333    let request = build_websocket_request()?;
334    let stream: std::result::Result<ProxyAsyncStream, ProxyError> = match proxy.scheme_str() {
335        Some(scheme) => match scheme.to_lowercase().as_str() {
336            "socks4" | "socks4a" => {
337                socks4_proxy_async(request.uri().host().unwrap(), proxy, username)
338                    .await
339                    .map_err(|e| e.into())
340            }
341            "socks5" | "socks5h" => {
342                socks5_proxy_asnyc(request.uri().host().unwrap(), proxy, username, password)
343                    .await
344                    .map_err(|e| e.into())
345            }
346            "http" | "https" => {
347                http_proxy_async(request.uri().host().unwrap(), proxy, username, password)
348                    .await
349                    .map_err(|e| e.into())
350            }
351            _ => Err(ProxyError::NotSupportedScheme(proxy)),
352        },
353        None => http_proxy_async(request.uri().host().unwrap(), proxy, username, password)
354            .await
355            .map_err(|e| e.into()),
356    };
357    let (websocket, _) = async_tungstenite::async_std::client_async_tls(request, stream?).await?;
358    Ok(websocket)
359}