1pub mod client;
4pub mod stream;
5
6mod proxy;
7use crate::error::{Error, ProxyError, Result};
8use proxy::{
9 http_proxy, http_proxy_async, socks4_proxy, socks4_proxy_async, socks5_proxy,
10 socks5_proxy_asnyc, ProxyAsyncStream, ProxyStream,
11};
12
13use sha2::Digest;
14
/// Synthesis settings used when building the `speech.config` and SSML messages.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub struct SpeechConfig {
    /// Voice name, written into the `name` attribute of the SSML `<voice>` element.
    pub voice_name: String,
    /// Audio output format, sent as `outputFormat` in the `speech.config` message.
    pub audio_format: String,
    /// Pitch adjustment, rendered with an explicit sign and an `Hz` suffix in `<prosody>`.
    pub pitch: i32,
    /// Rate adjustment, rendered with an explicit sign and a `%` suffix in `<prosody>`.
    pub rate: i32,
    /// Volume adjustment, rendered with an explicit sign and a `%` suffix in `<prosody>`.
    pub volume: i32,
}
64
65impl From<&super::voice::Voice> for SpeechConfig {
66 fn from(voice: &super::voice::Voice) -> Self {
67 let audio_output_format = if let Some(ref output_format) = voice.suggested_codec {
68 output_format.clone()
69 } else {
70 "audio-24khz-48kbitrate-mono-mp3".to_string()
71 };
72 Self {
73 voice_name: voice.name.clone(),
74 audio_format: audio_output_format,
75 pitch: 0,
76 rate: 0,
77 volume: 0,
78 }
79 }
80}
81
/// One entry of the `Metadata` array carried by an `audio.metadata` message.
///
/// Besides `Debug`, the type derives `Clone`, `PartialEq` and `Eq` so that
/// callers can copy, store and compare metadata entries.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AudioMetadata {
    /// Value of the entry's `Type` key, if it is a string.
    pub metadata_type: Option<String>,
    /// Value of `Data.Offset`; `0` when missing or not an unsigned integer.
    pub offset: u64,
    /// Value of `Data.Duration`; `0` when missing or not an unsigned integer.
    pub duration: u64,
    /// Value of `Data.text.Text`, if it is a string.
    pub text: Option<String>,
    /// Value of `Data.text.Length`; `0` when missing or not an unsigned integer.
    pub length: u64,
    /// Value of `Data.text.BoundaryType`, if it is a string.
    pub boundary_type: Option<String>,
}
92
93impl AudioMetadata {
94 fn from_str(text: &str) -> Result<Vec<Self>> {
95 let value: serde_json::Value = serde_json::from_str(text)?;
96 if let Some(items) = value["Metadata"].as_array() {
97 let mut audio_metadata = Vec::new();
98 for item in items {
99 let metadata_type = item["Type"].as_str().map(|x| x.to_owned());
100 let offset = item["Data"]["Offset"].as_u64().unwrap_or(0);
101 let duration = item["Data"]["Duration"].as_u64().unwrap_or(0);
102 let text = item["Data"]["text"]["Text"].as_str().map(|x| x.to_owned());
103 let length = item["Data"]["text"]["Length"].as_u64().unwrap_or(0);
104 let boundary_type = item["Data"]["text"]["BoundaryType"]
105 .as_str()
106 .map(|x| x.to_owned());
107 audio_metadata.push(AudioMetadata {
108 metadata_type,
109 offset,
110 duration,
111 text,
112 length,
113 boundary_type,
114 });
115 }
116 Ok(audio_metadata)
117 } else {
118 Err(Error::UnexpectedMessage(format!(
119 "unexpected json text: {}",
120 text
121 )))
122 }
123 }
124}
125
/// Payload extracted from a single WebSocket message by `process_message`.
enum ProcessedMessage {
    /// The raw binary frame together with `header_len + 2`, where `header_len`
    /// is the big-endian `u16` read from the frame's first two bytes —
    /// presumably the offset at which the audio payload starts.
    AudioBytes((tungstenite::Bytes, usize)),
    /// Metadata entries parsed from an `audio.metadata` text message.
    AudioMetadata(Vec<AudioMetadata>),
}
130
131fn process_message(
132 message: tungstenite::Message,
133 turn_start: &mut bool,
134 response: &mut bool,
135 turn_end: &mut bool,
136) -> Result<Option<ProcessedMessage>> {
137 match message {
138 tungstenite::Message::Text(text) => {
139 if text.contains("audio.metadata") {
140 if let Some(index) = text.find("\r\n\r\n") {
141 let metadata = AudioMetadata::from_str(&text[index + 4..])?;
142 Ok(Some(ProcessedMessage::AudioMetadata(metadata)))
143 } else {
144 Ok(None)
145 }
146 } else if text.contains("turn.start") {
147 *turn_start = true;
148 Ok(None)
149 } else if text.contains("response") {
150 *response = true;
151 Ok(None)
152 } else if text.contains("turn.end") {
153 *turn_end = true;
154 Ok(None)
155 } else {
156 Err(Error::UnexpectedMessage(format!(
157 "unexpected text message: {}",
158 text
159 )))
160 }
161 }
162 tungstenite::Message::Binary(bytes) => {
163 if *turn_start || *response {
164 let header_len = u16::from_be_bytes([bytes[0], bytes[1]]) as usize;
165 Ok(Some(ProcessedMessage::AudioBytes((bytes, header_len + 2))))
166 } else {
167 Ok(None)
168 }
169 }
170 tungstenite::Message::Close(_) => {
171 *turn_end = true;
172 Ok(None)
173 }
174 _ => Err(Error::UnexpectedMessage(format!(
175 "unexpected message: {}",
176 message
177 ))),
178 }
179}
180
181fn gen_sec_ms_gec() -> String {
185 let duration = std::time::SystemTime::now()
187 .duration_since(std::time::UNIX_EPOCH)
188 .unwrap()
189 + std::time::Duration::from_secs(11644473600);
190 let ticks = duration.as_nanos() / 100;
191 let ticks = ticks - ticks % 3_000_000_000;
192
193 let mut hasher = sha2::Sha256::new();
194 hasher.update(format!("{ticks}6A5AA1D4EAFF4E9FB37E23D68491D6F4"));
195 let hash_code = hasher.finalize();
196 let mut hex_str = String::new();
197 for &byte in hash_code.iter() {
198 hex_str.push_str(&format!("{:02X}", byte));
199 }
200 hex_str
201}
202
203fn build_websocket_request() -> Result<tungstenite::handshake::client::Request> {
204 use super::constants;
205 use tungstenite::client::IntoClientRequest;
206 use tungstenite::http::header;
207
208 let uuid = uuid::Uuid::new_v4().simple().to_string();
209 let sec_ms_gec = gen_sec_ms_gec();
210 let sec_ms_gec_version = "1-130.0.2849.68";
211 let mut request = format!(
212 "{}{}&Sec-MS-GEC={}&Sec-MS-GEC-Version={}",
213 constants::WSS_URL,
214 uuid,
215 sec_ms_gec,
216 sec_ms_gec_version
217 )
218 .into_client_request()?;
219 let headers = request.headers_mut();
220 headers.insert(
221 header::PRAGMA,
222 "no-cache"
223 .parse()
224 .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
225 );
226 headers.insert(
227 header::CACHE_CONTROL,
228 "no-cache"
229 .parse()
230 .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
231 );
232 headers.insert(
233 header::USER_AGENT,
234 constants::USER_AGENT
235 .parse()
236 .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
237 );
238 headers.insert(
239 header::ORIGIN,
240 constants::ORIGIN
241 .parse()
242 .map_err(|err| tungstenite::Error::from(http::Error::from(err)))?,
243 );
244 Ok(request)
245}
246
247fn build_config_message(config: &SpeechConfig) -> tungstenite::Message {
248 static SPEECH_CONFIG_HEAD: &str = r#"{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":""#;
249 static SPEECH_CONFIG_TAIL: &str = r#""}}}}"#;
250 let speech_config_message = format!(
251 "X-Timestamp:{}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{}{}{}",
252 chrono::Local::now().to_rfc2822(),
253 SPEECH_CONFIG_HEAD,
254 config.audio_format,
255 SPEECH_CONFIG_TAIL
256 );
257 tungstenite::Message::Text(speech_config_message.into())
258}
259
260fn build_ssml_message(text: &str, config: &SpeechConfig) -> tungstenite::Message {
261 let ssml = format!(
262 "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='{}'><prosody pitch='{:+}Hz' rate='{:+}%' volume='{:+}%'>{}</prosody></voice></speak>",
263 config.voice_name,
264 config.pitch,
265 config.rate,
266 config.volume,
267 text,
268 );
269 let ssml_message = format!(
270 "X-RequestId:{}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:{}\r\nPath:ssml\r\n\r\n{}",
271 uuid::Uuid::new_v4().simple(),
272 chrono::Local::now().to_rfc2822(),
273 ssml,
274 );
275 tungstenite::Message::Text(ssml_message.into())
276}
277
278type WebSocketStream<T> = tungstenite::WebSocket<tungstenite::stream::MaybeTlsStream<T>>;
279
280fn websocket_connect() -> Result<WebSocketStream<std::net::TcpStream>> {
281 let request = build_websocket_request()?;
282 let (websocket, _) = tungstenite::connect(request)?;
283 Ok(websocket)
284}
285
286fn websocket_connect_proxy(
287 proxy: http::Uri,
288 username: Option<&str>,
289 password: Option<&str>,
290) -> Result<WebSocketStream<ProxyStream>> {
291 use tungstenite::handshake::HandshakeError;
292
293 let request = build_websocket_request()?;
294 let stream: std::result::Result<ProxyStream, ProxyError> = match proxy.scheme_str() {
295 Some(scheme) => match scheme.to_lowercase().as_str() {
296 "socks4" | "socks4a" => {
297 socks4_proxy(request.uri().host().unwrap(), proxy, username).map_err(|e| e.into())
298 }
299 "socks5" | "socks5h" => {
300 socks5_proxy(request.uri().host().unwrap(), proxy, username, password)
301 .map_err(|e| e.into())
302 }
303 "http" | "https" => {
304 http_proxy(request.uri().host().unwrap(), proxy, username, password)
305 .map_err(|e| e.into())
306 }
307 _ => Err(ProxyError::NotSupportedScheme(proxy)),
308 },
309 None => http_proxy(request.uri().host().unwrap(), proxy, username, password)
310 .map_err(|e| e.into()),
311 };
312 let (websocket, _) = tungstenite::client_tls(request, stream?).map_err(|e| match e {
313 HandshakeError::Failure(e) => e,
314 HandshakeError::Interrupted(_) => panic!("Bug: blocking handshake not blocked"),
315 })?;
316 Ok(websocket)
317}
318
319type WebSocketStreamAsync<T> =
320 async_tungstenite::WebSocketStream<async_tungstenite::async_std::ClientStream<T>>;
321
322async fn websocket_connect_async() -> Result<WebSocketStreamAsync<async_std::net::TcpStream>> {
323 let request = build_websocket_request()?;
324 let (websocket, _) = async_tungstenite::async_std::connect_async(request).await?;
325 Ok(websocket)
326}
327
328async fn websocket_connect_proxy_async(
329 proxy: http::Uri,
330 username: Option<&str>,
331 password: Option<&str>,
332) -> Result<WebSocketStreamAsync<ProxyAsyncStream>> {
333 let request = build_websocket_request()?;
334 let stream: std::result::Result<ProxyAsyncStream, ProxyError> = match proxy.scheme_str() {
335 Some(scheme) => match scheme.to_lowercase().as_str() {
336 "socks4" | "socks4a" => {
337 socks4_proxy_async(request.uri().host().unwrap(), proxy, username)
338 .await
339 .map_err(|e| e.into())
340 }
341 "socks5" | "socks5h" => {
342 socks5_proxy_asnyc(request.uri().host().unwrap(), proxy, username, password)
343 .await
344 .map_err(|e| e.into())
345 }
346 "http" | "https" => {
347 http_proxy_async(request.uri().host().unwrap(), proxy, username, password)
348 .await
349 .map_err(|e| e.into())
350 }
351 _ => Err(ProxyError::NotSupportedScheme(proxy)),
352 },
353 None => http_proxy_async(request.uri().host().unwrap(), proxy, username, password)
354 .await
355 .map_err(|e| e.into()),
356 };
357 let (websocket, _) = async_tungstenite::async_std::client_async_tls(request, stream?).await?;
358 Ok(websocket)
359}