Skip to main content

edge_tts_rust/
protocol.rs

1use std::collections::BTreeMap;
2use std::time::{Duration, SystemTime, UNIX_EPOCH};
3
4use chrono::Utc;
5use rand::RngCore;
6use serde::Deserialize;
7use serde_json::json;
8use sha2::{Digest, Sha256};
9use uuid::Uuid;
10
11use crate::constants::{
12    MP3_BITRATE_BPS, OUTPUT_FORMAT, SEC_MS_GEC_VERSION, TICKS_PER_SECOND, TRUSTED_CLIENT_TOKEN,
13};
14use crate::error::{Error, Result};
15use crate::options::{SpeakOptions, normalize_voice};
16use crate::types::{Boundary, BoundaryEvent, SynthesisEvent};
17
/// Seconds between 1601-01-01 (the epoch Windows tick counts start from) and
/// the Unix epoch 1970-01-01: 134,774 days of 86,400 seconds each.
const WINDOWS_EPOCH_OFFSET_SECONDS: u64 = 134_774 * 86_400;
19
20pub fn generate_connection_id() -> String {
21    Uuid::new_v4().simple().to_string()
22}
23
24pub fn generate_muid() -> String {
25    let mut bytes = [0u8; 16];
26    rand::thread_rng().fill_bytes(&mut bytes);
27    bytes.iter().map(|byte| format!("{byte:02X}")).collect()
28}
29
30pub fn generate_sec_ms_gec(now: SystemTime) -> String {
31    let unix_seconds = now
32        .duration_since(UNIX_EPOCH)
33        .unwrap_or(Duration::ZERO)
34        .as_secs();
35    let rounded = (unix_seconds + WINDOWS_EPOCH_OFFSET_SECONDS) / 300 * 300;
36    let windows_ticks = rounded * 10_000_000;
37    let mut hasher = Sha256::new();
38    hasher.update(format!("{windows_ticks}{TRUSTED_CLIENT_TOKEN}").as_bytes());
39    format!("{:X}", hasher.finalize())
40}
41
42pub fn javascript_timestamp() -> String {
43    Utc::now()
44        .format("%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)")
45        .to_string()
46}
47
48pub fn speech_config_message(boundary: Boundary) -> String {
49    let (word, sentence) = boundary.metadata_flags();
50    let payload = json!({
51        "context": {
52            "synthesis": {
53                "audio": {
54                    "metadataoptions": {
55                        "sentenceBoundaryEnabled": sentence,
56                        "wordBoundaryEnabled": word
57                    },
58                    "outputFormat": OUTPUT_FORMAT
59                }
60            }
61        }
62    });
63    format!(
64        "X-Timestamp:{}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{}\r\n",
65        javascript_timestamp(),
66        payload
67    )
68}
69
70pub fn ssml_message(options: &SpeakOptions, chunk: &str) -> Result<String> {
71    let voice = normalize_voice(&options.voice)?;
72    Ok(format!(
73        "X-RequestId:{}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:{}Z\r\nPath:ssml\r\n\r\n{}",
74        generate_connection_id(),
75        javascript_timestamp(),
76        build_ssml(
77            &voice,
78            &options.rate,
79            &options.volume,
80            &options.pitch,
81            chunk
82        )
83    ))
84}
85
86pub fn build_ssml(voice: &str, rate: &str, volume: &str, pitch: &str, text: &str) -> String {
87    format!(
88        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='{voice}'><prosody pitch='{pitch}' rate='{rate}' volume='{volume}'>{}</prosody></voice></speak>",
89        escape_ssml_text(text)
90    )
91}
92
/// Makes `text` safe to embed in SSML.
///
/// Control characters U+0000–U+0008, U+000B–U+000C and U+000E–U+001F are
/// replaced by a space (tab, line feed and carriage return are kept), and the
/// five XML special characters `& < > " '` become their named entities.
pub fn escape_ssml_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\u{0}'..='\u{8}' | '\u{B}'..='\u{C}' | '\u{E}'..='\u{1F}' => escaped.push(' '),
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
109
110pub fn split_text(text: &str, max_bytes: usize) -> Result<Vec<String>> {
111    if max_bytes == 0 {
112        return Err(Error::InvalidChunkSize);
113    }
114
115    let escaped = escape_ssml_text(text);
116    let mut bytes = escaped.as_bytes();
117    let mut chunks = Vec::new();
118
119    while bytes.len() > max_bytes {
120        let mut split_at = bytes[..max_bytes]
121            .iter()
122            .rposition(|byte| *byte == b'\n' || *byte == b' ')
123            .unwrap_or(max_bytes);
124
125        while std::str::from_utf8(&bytes[..split_at]).is_err() && split_at > 0 {
126            split_at -= 1;
127        }
128
129        split_at = adjust_entity_boundary(bytes, split_at);
130        if split_at == 0 {
131            return Err(Error::InvalidSplitPoint);
132        }
133
134        let chunk = std::str::from_utf8(&bytes[..split_at])
135            .map_err(|_| Error::InvalidSplitPoint)?
136            .trim();
137        if !chunk.is_empty() {
138            chunks.push(chunk.to_owned());
139        }
140        bytes = &bytes[split_at..];
141    }
142
143    let tail = std::str::from_utf8(bytes)
144        .map_err(|_| Error::InvalidSplitPoint)?
145        .trim();
146    if !tail.is_empty() {
147        chunks.push(tail.to_owned());
148    }
149
150    Ok(chunks)
151}
152
/// Moves `split_at` left until it no longer falls inside an XML entity.
///
/// Looks at the last `&` before `split_at`: if a `;` follows it within the
/// prefix, the entity is complete and the index is returned. Otherwise the
/// index is pulled back to that `&` and the check repeats. With no `&` in the
/// prefix the index is returned unchanged.
fn adjust_entity_boundary(bytes: &[u8], mut split_at: usize) -> usize {
    loop {
        let prefix = &bytes[..split_at];
        let Some(amp_index) = prefix.iter().rposition(|&byte| byte == b'&') else {
            return split_at;
        };
        if prefix[amp_index..].contains(&b';') {
            return split_at;
        }
        // Unterminated entity: cut in front of its `&` and look again.
        split_at = amp_index;
    }
}
166
167pub fn parse_headers(
168    data: &[u8],
169    header_length: usize,
170) -> Result<(BTreeMap<String, String>, &[u8])> {
171    if header_length > data.len() {
172        return Err(Error::UnexpectedResponse(
173            "header length exceeds frame length",
174        ));
175    }
176
177    let header_bytes = &data[..header_length];
178    let payload = data
179        .get(header_length..)
180        .ok_or(Error::UnexpectedResponse("frame missing payload"))?;
181    let payload = payload
182        .strip_prefix(b"\r\n\r\n")
183        .or_else(|| payload.strip_prefix(b"\r\n"))
184        .unwrap_or(payload);
185    let header_str = std::str::from_utf8(header_bytes)
186        .map_err(|_| Error::UnexpectedResponse("headers are not valid utf-8"))?;
187
188    let mut headers = BTreeMap::new();
189    for line in header_str.split("\r\n").filter(|line| !line.is_empty()) {
190        let Some((key, value)) = line.split_once(':') else {
191            continue;
192        };
193        headers.insert(key.to_owned(), value.to_owned());
194    }
195    Ok((headers, payload))
196}
197
198pub fn parse_binary_headers(
199    data: &[u8],
200    header_length: usize,
201) -> Result<(BTreeMap<String, String>, &[u8])> {
202    let header_start = 2usize;
203    let header_end = header_start
204        .checked_add(header_length)
205        .ok_or(Error::UnexpectedResponse("binary header length overflow"))?;
206    if header_end > data.len() {
207        return Err(Error::UnexpectedResponse(
208            "binary header length exceeds frame length",
209        ));
210    }
211
212    let header_bytes = &data[header_start..header_end];
213    let payload = data
214        .get(header_end..)
215        .ok_or(Error::UnexpectedResponse("binary frame missing payload"))?;
216    let payload = payload.strip_prefix(b"\r\n").unwrap_or(payload);
217    let header_str = std::str::from_utf8(header_bytes)
218        .map_err(|_| Error::UnexpectedResponse("headers are not valid utf-8"))?;
219
220    let mut headers = BTreeMap::new();
221    for line in header_str.split("\r\n").filter(|line| !line.is_empty()) {
222        let Some((key, value)) = line.split_once(':') else {
223            continue;
224        };
225        headers.insert(key.to_owned(), value.to_owned());
226    }
227    Ok((headers, payload))
228}
229
230#[derive(Debug, Deserialize)]
231struct MetadataEnvelope {
232    #[serde(rename = "Metadata")]
233    metadata: Vec<MetadataItem>,
234}
235
236#[derive(Debug, Deserialize)]
237struct MetadataItem {
238    #[serde(rename = "Type")]
239    kind: String,
240    #[serde(rename = "Data")]
241    data: Option<MetadataData>,
242}
243
244#[derive(Debug, Deserialize)]
245struct MetadataData {
246    #[serde(rename = "Offset")]
247    offset: u64,
248    #[serde(rename = "Duration")]
249    duration: u64,
250    #[serde(rename = "text")]
251    text: MetadataText,
252}
253
254#[derive(Debug, Deserialize)]
255struct MetadataText {
256    #[serde(rename = "Text")]
257    text: String,
258}
259
260pub fn parse_metadata(payload: &[u8], offset_compensation: u64) -> Result<Vec<SynthesisEvent>> {
261    let envelope: MetadataEnvelope = serde_json::from_slice(payload)?;
262    let mut events = Vec::new();
263
264    for item in envelope.metadata {
265        match item.kind.as_str() {
266            "WordBoundary" | "SentenceBoundary" => {
267                let data = item
268                    .data
269                    .ok_or(Error::UnexpectedResponse("boundary metadata missing data"))?;
270                let kind = if item.kind == "WordBoundary" {
271                    Boundary::Word
272                } else {
273                    Boundary::Sentence
274                };
275                events.push(SynthesisEvent::Boundary(BoundaryEvent {
276                    kind,
277                    offset_ticks: data.offset + offset_compensation,
278                    duration_ticks: data.duration,
279                    text: unescape_xml(&data.text.text),
280                }));
281            }
282            "SessionEnd" => {}
283            other => return Err(Error::UnknownMetadata(other.to_owned())),
284        }
285    }
286
287    Ok(events)
288}
289
290pub fn offset_from_audio_bytes(bytes: usize) -> u64 {
291    (bytes as u64 * 8 * TICKS_PER_SECOND) / MP3_BITRATE_BPS
292}
293
/// Decodes the five named XML entities (`&lt;`, `&gt;`, `&quot;`, `&apos;`,
/// `&amp;`) in `text`. Each entity is decoded exactly once, so `&amp;lt;`
/// becomes `&lt;`; any other `&` is copied through unchanged.
fn unescape_xml(text: &str) -> String {
    const ENTITIES: [(&str, char); 5] = [
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&apos;", '\''),
        ("&amp;", '&'),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp_index) = rest.find('&') {
        decoded.push_str(&rest[..amp_index]);
        rest = &rest[amp_index..];
        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, replacement)) => {
                decoded.push(*replacement);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand as-is.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
301
302pub fn voice_headers() -> [(&'static str, String); 8] {
303    [
304        ("Authority", "speech.platform.bing.com".to_owned()),
305        (
306            "Sec-CH-UA",
307            format!(
308                "\" Not;A Brand\";v=\"99\", \"Microsoft Edge\";v=\"{0}\", \"Chromium\";v=\"{0}\"",
309                crate::constants::CHROMIUM_MAJOR_VERSION
310            ),
311        ),
312        ("Sec-CH-UA-Mobile", "?0".to_owned()),
313        ("Accept", "*/*".to_owned()),
314        ("Sec-Fetch-Site", "none".to_owned()),
315        ("Sec-Fetch-Mode", "cors".to_owned()),
316        ("Sec-Fetch-Dest", "empty".to_owned()),
317        ("User-Agent", crate::constants::user_agent()),
318    ]
319}
320
321pub fn websocket_headers(muid: &str) -> [(&'static str, String); 8] {
322    [
323        ("Pragma", "no-cache".to_owned()),
324        ("Cache-Control", "no-cache".to_owned()),
325        (
326            "Origin",
327            "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold".to_owned(),
328        ),
329        ("Sec-WebSocket-Version", "13".to_owned()),
330        ("User-Agent", crate::constants::user_agent()),
331        ("Accept-Encoding", "gzip, deflate, br, zstd".to_owned()),
332        ("Accept-Language", "en-US,en;q=0.9".to_owned()),
333        ("Cookie", format!("muid={muid};")),
334    ]
335}
336
/// Returns the `SEC_MS_GEC_VERSION` constant imported from `crate::constants`.
pub fn sec_ms_gec_version() -> &'static str {
    SEC_MS_GEC_VERSION
}