Skip to main content

voirs_cli/
ssml.rs

1//! SSML (Speech Synthesis Markup Language) support for VoiRS CLI.
2
3use crate::error::{CliError, Result};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashMap;
7
8// Static regex patterns compiled once at first use
9static RE_SPEAK: Lazy<Regex> =
10    Lazy::new(|| Regex::new(r"<speak[^>]*>.*</speak>").expect("Invalid SPEAK regex pattern"));
11static RE_VOICE: Lazy<Regex> =
12    Lazy::new(|| Regex::new(r"<voice[^>]*>.*</voice>").expect("Invalid VOICE regex pattern"));
13static RE_PROSODY: Lazy<Regex> =
14    Lazy::new(|| Regex::new(r"<prosody[^>]*>.*</prosody>").expect("Invalid PROSODY regex pattern"));
15static RE_BREAK: Lazy<Regex> =
16    Lazy::new(|| Regex::new(r"<break[^/>]*/>").expect("Invalid BREAK regex pattern"));
17static RE_EMPHASIS: Lazy<Regex> = Lazy::new(|| {
18    Regex::new(r"<emphasis[^>]*>.*</emphasis>").expect("Invalid EMPHASIS regex pattern")
19});
20static RE_SAY_AS: Lazy<Regex> =
21    Lazy::new(|| Regex::new(r"<say-as[^>]*>.*</say-as>").expect("Invalid SAY-AS regex pattern"));
22static RE_PHONEME: Lazy<Regex> =
23    Lazy::new(|| Regex::new(r"<phoneme[^>]*>.*</phoneme>").expect("Invalid PHONEME regex pattern"));
24static RE_SUB: Lazy<Regex> =
25    Lazy::new(|| Regex::new(r"<sub[^>]*>.*</sub>").expect("Invalid SUB regex pattern"));
26static RE_TAG: Lazy<Regex> =
27    Lazy::new(|| Regex::new(r"<(/?)(\w+)(?:[^>]*)>").expect("Invalid TAG regex pattern"));
28static RE_TAG_REMOVE: Lazy<Regex> =
29    Lazy::new(|| Regex::new(r"<[^>]*>").expect("Invalid TAG_REMOVE regex pattern"));
30static RE_WHITESPACE: Lazy<Regex> =
31    Lazy::new(|| Regex::new(r"\s+").expect("Invalid WHITESPACE regex pattern"));
32static RE_PROSODY_TAG: Lazy<Regex> =
33    Lazy::new(|| Regex::new(r#"<prosody\s+([^>]+)>"#).expect("Invalid PROSODY_TAG regex pattern"));
34static RE_RATE: Lazy<Regex> =
35    Lazy::new(|| Regex::new(r#"rate\s*=\s*["']([^"']+)["']"#).expect("Invalid RATE regex pattern"));
36static RE_PITCH: Lazy<Regex> = Lazy::new(|| {
37    Regex::new(r#"pitch\s*=\s*["']([^"']+)["']"#).expect("Invalid PITCH regex pattern")
38});
39static RE_VOLUME: Lazy<Regex> = Lazy::new(|| {
40    Regex::new(r#"volume\s*=\s*["']([^"']+)["']"#).expect("Invalid VOLUME regex pattern")
41});
42static RE_VOICE_NAME: Lazy<Regex> = Lazy::new(|| {
43    Regex::new(r#"<voice\s+name\s*=\s*["']([^"']+)["']"#).expect("Invalid VOICE_NAME regex pattern")
44});
45
/// SSML validation and processing utilities.
///
/// Create an instance with [`SsmlProcessor::new`] (or via `Default`), then use
/// it to detect, validate, strip or extract parameters from SSML markup.
pub struct SsmlProcessor {
    /// Regex patterns for SSML validation, keyed by tag name (`"speak"`,
    /// `"voice"`, `"prosody"`, ...). Each value points at one of this module's
    /// lazily compiled static regexes.
    // NOTE(review): no method visible in this file reads this map; the
    // validation code uses the static regexes directly.
    patterns: HashMap<String, &'static Lazy<Regex>>,
}
51
52impl Default for SsmlProcessor {
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl SsmlProcessor {
59    /// Create a new SSML processor
60    pub fn new() -> Self {
61        let mut patterns = HashMap::new();
62
63        // Reference static regex patterns
64        patterns.insert("speak".to_string(), &RE_SPEAK);
65        patterns.insert("voice".to_string(), &RE_VOICE);
66        patterns.insert("prosody".to_string(), &RE_PROSODY);
67        patterns.insert("break".to_string(), &RE_BREAK);
68        patterns.insert("emphasis".to_string(), &RE_EMPHASIS);
69        patterns.insert("say-as".to_string(), &RE_SAY_AS);
70        patterns.insert("phoneme".to_string(), &RE_PHONEME);
71        patterns.insert("sub".to_string(), &RE_SUB);
72
73        Self { patterns }
74    }
75
76    /// Check if text contains SSML markup
77    pub fn is_ssml(&self, text: &str) -> bool {
78        text.trim_start().starts_with('<') && text.contains("</")
79    }
80
81    /// Validate SSML markup
82    pub fn validate(&self, ssml: &str) -> Result<Vec<SsmlValidationIssue>> {
83        let mut issues = Vec::new();
84
85        // Basic structure validation
86        if !ssml.trim().starts_with("<speak") {
87            issues.push(SsmlValidationIssue {
88                issue_type: SsmlIssueType::Error,
89                message: "SSML must start with <speak> tag".to_string(),
90                line: 1,
91                column: 1,
92                suggestion: Some("Wrap your content in <speak>...</speak> tags".to_string()),
93            });
94        }
95
96        if !ssml.trim().ends_with("</speak>") {
97            let line_count = ssml.lines().count();
98            let last_line_len = ssml.lines().last().map(|l| l.len()).unwrap_or(0);
99
100            issues.push(SsmlValidationIssue {
101                issue_type: SsmlIssueType::Error,
102                message: "SSML must end with </speak> tag".to_string(),
103                line: line_count,
104                column: last_line_len,
105                suggestion: Some("Add closing </speak> tag".to_string()),
106            });
107        }
108
109        // Tag balance validation
110        issues.extend(self.validate_tag_balance(ssml)?);
111
112        // Attribute validation
113        issues.extend(self.validate_attributes(ssml)?);
114
115        Ok(issues)
116    }
117
118    /// Validate that opening and closing tags are balanced
119    fn validate_tag_balance(&self, ssml: &str) -> Result<Vec<SsmlValidationIssue>> {
120        let mut issues = Vec::new();
121        let mut tag_stack = Vec::new();
122
123        for (line_num, line) in ssml.lines().enumerate() {
124            for cap in RE_TAG.captures_iter(line) {
125                let is_closing = !cap[1].is_empty();
126                let tag_name = &cap[2];
127
128                // Skip self-closing tags
129                if line.contains(&format!("<{}", tag_name)) && line.contains("/>") {
130                    continue;
131                }
132
133                if is_closing {
134                    if let Some(last_tag) = tag_stack.pop() {
135                        if last_tag != tag_name {
136                            issues.push(SsmlValidationIssue {
137                                issue_type: SsmlIssueType::Error,
138                                message: format!(
139                                    "Mismatched closing tag: expected </{}>, found </{}>",
140                                    last_tag, tag_name
141                                ),
142                                line: line_num + 1,
143                                column: line.find(&cap[0]).unwrap_or(0) + 1,
144                                suggestion: Some(format!("Change to </{}>", last_tag)),
145                            });
146                        }
147                    } else {
148                        issues.push(SsmlValidationIssue {
149                            issue_type: SsmlIssueType::Error,
150                            message: format!("Unexpected closing tag: </{}>", tag_name),
151                            line: line_num + 1,
152                            column: line.find(&cap[0]).unwrap_or(0) + 1,
153                            suggestion: Some(
154                                "Remove this closing tag or add matching opening tag".to_string(),
155                            ),
156                        });
157                    }
158                } else {
159                    tag_stack.push(tag_name.to_string());
160                }
161            }
162        }
163
164        // Check for unclosed tags
165        let line_count = ssml.lines().count();
166        let last_line_len = ssml.lines().last().map(|l| l.len()).unwrap_or(0);
167
168        for unclosed_tag in tag_stack {
169            issues.push(SsmlValidationIssue {
170                issue_type: SsmlIssueType::Error,
171                message: format!("Unclosed tag: <{}>", unclosed_tag),
172                line: line_count,
173                column: last_line_len,
174                suggestion: Some(format!("Add closing tag: </{}>", unclosed_tag)),
175            });
176        }
177
178        Ok(issues)
179    }
180
181    /// Validate SSML attributes
182    fn validate_attributes(&self, ssml: &str) -> Result<Vec<SsmlValidationIssue>> {
183        let mut issues = Vec::new();
184
185        for (line_num, line) in ssml.lines().enumerate() {
186            if let Some(cap) = RE_PROSODY_TAG.captures(line) {
187                let attributes = &cap[1];
188
189                // Validate rate attribute
190                if let Some(rate_match) = RE_RATE.captures(attributes) {
191                    let rate_value = &rate_match[1];
192                    if !self.is_valid_prosody_rate(rate_value) {
193                        issues.push(SsmlValidationIssue {
194                            issue_type: SsmlIssueType::Warning,
195                            message: format!("Invalid prosody rate: '{rate_value}'"),
196                            line: line_num + 1,
197                            column: line.find(rate_value).unwrap_or(0) + 1,
198                            suggestion: Some("Use values like: x-slow, slow, medium, fast, x-fast, or percentage/Hz values".to_string()),
199                        });
200                    }
201                }
202
203                // Validate pitch attribute
204                if let Some(pitch_match) = RE_PITCH.captures(attributes) {
205                    let pitch_value = &pitch_match[1];
206                    if !self.is_valid_prosody_pitch(pitch_value) {
207                        issues.push(SsmlValidationIssue {
208                            issue_type: SsmlIssueType::Warning,
209                            message: format!("Invalid prosody pitch: '{pitch_value}'"),
210                            line: line_num + 1,
211                            column: line.find(pitch_value).unwrap_or(0) + 1,
212                            suggestion: Some("Use values like: x-low, low, medium, high, x-high, or Hz/semitone values".to_string()),
213                        });
214                    }
215                }
216
217                // Validate volume attribute
218                if let Some(volume_match) = RE_VOLUME.captures(attributes) {
219                    let volume_value = &volume_match[1];
220                    if !self.is_valid_prosody_volume(volume_value) {
221                        issues.push(SsmlValidationIssue {
222                            issue_type: SsmlIssueType::Warning,
223                            message: format!("Invalid prosody volume: '{volume_value}'"),
224                            line: line_num + 1,
225                            column: line.find(volume_value).unwrap_or(0) + 1,
226                            suggestion: Some("Use values like: silent, x-soft, soft, medium, loud, x-loud, or dB values".to_string()),
227                        });
228                    }
229                }
230            }
231        }
232
233        Ok(issues)
234    }
235
236    /// Check if prosody rate value is valid
237    fn is_valid_prosody_rate(&self, value: &str) -> bool {
238        matches!(value, "x-slow" | "slow" | "medium" | "fast" | "x-fast")
239            || value.ends_with('%')
240            || value.ends_with("Hz")
241            || value.parse::<f32>().is_ok()
242    }
243
244    /// Check if prosody pitch value is valid
245    fn is_valid_prosody_pitch(&self, value: &str) -> bool {
246        matches!(value, "x-low" | "low" | "medium" | "high" | "x-high")
247            || value.ends_with("Hz")
248            || value.ends_with("st")
249            || value.starts_with('+')
250            || value.starts_with('-')
251            || value.parse::<f32>().is_ok()
252    }
253
254    /// Check if prosody volume value is valid
255    fn is_valid_prosody_volume(&self, value: &str) -> bool {
256        matches!(
257            value,
258            "silent" | "x-soft" | "soft" | "medium" | "loud" | "x-loud"
259        ) || value.ends_with("dB")
260            || value.starts_with('+')
261            || value.starts_with('-')
262            || value.parse::<f32>().is_ok()
263    }
264
265    /// Convert SSML to plain text (remove markup)
266    pub fn to_plain_text(&self, ssml: &str) -> String {
267        // Remove SSML tags but keep their content
268        let text = RE_TAG_REMOVE.replace_all(ssml, "");
269
270        // Clean up extra whitespace
271        let text = RE_WHITESPACE.replace_all(&text, " ");
272
273        text.trim().to_string()
274    }
275
276    /// Extract synthesis parameters from SSML
277    pub fn extract_synthesis_params(&self, ssml: &str) -> SsmlSynthesisParams {
278        let mut params = SsmlSynthesisParams::default();
279
280        // Extract voice parameter
281        if let Some(voice_match) = RE_VOICE_NAME.captures(ssml) {
282            params.voice = Some(voice_match[1].to_string());
283        }
284
285        // Extract prosody parameters (use the first occurrence)
286        if let Some(prosody_match) = RE_PROSODY_TAG.captures(ssml) {
287            let attributes = &prosody_match[1];
288
289            if let Some(rate_match) = RE_RATE.captures(attributes) {
290                params.speaking_rate = self.parse_rate_value(&rate_match[1]);
291            }
292
293            if let Some(pitch_match) = RE_PITCH.captures(attributes) {
294                params.pitch_shift = self.parse_pitch_value(&pitch_match[1]);
295            }
296
297            if let Some(volume_match) = RE_VOLUME.captures(attributes) {
298                params.volume_gain = self.parse_volume_value(&volume_match[1]);
299            }
300        }
301
302        params
303    }
304
305    /// Parse rate value to numeric multiplier
306    fn parse_rate_value(&self, value: &str) -> Option<f32> {
307        match value {
308            "x-slow" => Some(0.5),
309            "slow" => Some(0.75),
310            "medium" => Some(1.0),
311            "fast" => Some(1.25),
312            "x-fast" => Some(1.5),
313            _ => {
314                if value.ends_with('%') {
315                    value
316                        .trim_end_matches('%')
317                        .parse::<f32>()
318                        .ok()
319                        .map(|v| v / 100.0)
320                } else {
321                    value.parse::<f32>().ok()
322                }
323            }
324        }
325    }
326
327    /// Parse pitch value to semitone shift
328    fn parse_pitch_value(&self, value: &str) -> Option<f32> {
329        match value {
330            "x-low" => Some(-6.0),
331            "low" => Some(-3.0),
332            "medium" => Some(0.0),
333            "high" => Some(3.0),
334            "x-high" => Some(6.0),
335            _ => {
336                if value.ends_with("st") {
337                    value.trim_end_matches("st").parse::<f32>().ok()
338                } else if value.ends_with("Hz") {
339                    // Convert Hz to approximate semitones (simplified)
340                    value.trim_end_matches("Hz").parse::<f32>().ok().map(|hz| {
341                        // Very rough conversion, would need proper pitch detection
342                        (hz - 200.0) / 20.0
343                    })
344                } else {
345                    value.parse::<f32>().ok()
346                }
347            }
348        }
349    }
350
351    /// Parse volume value to dB gain
352    fn parse_volume_value(&self, value: &str) -> Option<f32> {
353        match value {
354            "silent" => Some(-60.0),
355            "x-soft" => Some(-20.0),
356            "soft" => Some(-10.0),
357            "medium" => Some(0.0),
358            "loud" => Some(6.0),
359            "x-loud" => Some(12.0),
360            _ => {
361                if value.ends_with("dB") {
362                    value.trim_end_matches("dB").parse::<f32>().ok()
363                } else {
364                    value.parse::<f32>().ok()
365                }
366            }
367        }
368    }
369}
370
/// SSML validation issue
///
/// One finding produced by `SsmlProcessor::validate`, carrying its severity,
/// a human-readable message, a position and an optional fix hint.
#[derive(Debug, Clone)]
pub struct SsmlValidationIssue {
    /// Severity of the finding.
    pub issue_type: SsmlIssueType,
    /// Human-readable description of the problem.
    pub message: String,
    /// Line the issue refers to. Tag and attribute findings use 1-based line
    /// numbers; end-of-input findings use the number of lines in the input.
    pub line: usize,
    /// Column the issue refers to. Tag and attribute findings use a 1-based
    /// byte offset within the line; end-of-input findings use the byte length
    /// of the last line.
    pub column: usize,
    /// Optional hint describing how to fix the problem.
    pub suggestion: Option<String>,
}
380
/// Type of SSML validation issue
///
/// A plain, fieldless severity tag, so it is `Copy` and supports full
/// equality (`Eq`) in addition to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SsmlIssueType {
    /// A structural problem; `process_ssml` rejects input that has any of these.
    Error,
    /// A questionable value (used for rejected prosody attribute values) that
    /// does not make `process_ssml` fail.
    Warning,
    /// Informational finding.
    // NOTE(review): no code visible in this file produces `Info` issues.
    Info,
}
388
/// Synthesis parameters extracted from SSML
///
/// Every field is `None` when the corresponding markup is absent or its value
/// could not be parsed. The type is `Clone` and `PartialEq` so that extracted
/// parameters can be duplicated and compared as a whole.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct SsmlSynthesisParams {
    /// Voice name taken from the `name` attribute of a `<voice>` tag.
    pub voice: Option<String>,
    /// Speaking-rate multiplier (the `medium` keyword maps to `1.0`).
    pub speaking_rate: Option<f32>,
    /// Pitch shift in semitones (the `medium` keyword maps to `0.0`).
    pub pitch_shift: Option<f32>,
    /// Volume gain in dB (the `medium` keyword maps to `0.0`).
    pub volume_gain: Option<f32>,
}
397
/// Helpers for building SSML fragments and documents from plain text.
pub mod utils {
    use super::*;

    /// Ensure `text` is wrapped in a `<speak>` root element.
    ///
    /// Input that already begins with `<speak` (ignoring leading whitespace)
    /// is returned unchanged; anything else is placed between `<speak>` and
    /// `</speak>` verbatim.
    pub fn wrap_in_speak(text: &str) -> String {
        let already_wrapped = text.trim_start().starts_with("<speak");
        if already_wrapped {
            return text.to_owned();
        }
        format!("<speak>{}</speak>", text)
    }

    /// Build a `<speak>` document whose content is wrapped in a `<prosody>` tag.
    ///
    /// * `rate` is reduced to a keyword: `slow` below 1.0, `fast` above 1.0,
    ///   `medium` otherwise.
    /// * `pitch` is written as `<value>st`.
    /// * `volume` is written as `<value>dB`.
    ///
    /// When all three are `None` no `<prosody>` tag is emitted and the text is
    /// only wrapped in `<speak>`.
    pub fn with_prosody(
        text: &str,
        rate: Option<f32>,
        pitch: Option<f32>,
        volume: Option<f32>,
    ) -> String {
        let rate_attr = rate.map(|multiplier| {
            let keyword = if multiplier < 1.0 {
                "slow"
            } else if multiplier > 1.0 {
                "fast"
            } else {
                "medium"
            };
            format!("rate=\"{}\"", keyword)
        });
        let pitch_attr = pitch.map(|semitones| format!("pitch=\"{}st\"", semitones));
        let volume_attr = volume.map(|gain| format!("volume=\"{}dB\"", gain));

        // Keep only the attributes that were requested, in rate/pitch/volume order
        let attrs: Vec<String> = rate_attr
            .into_iter()
            .chain(pitch_attr)
            .chain(volume_attr)
            .collect();

        if attrs.is_empty() {
            return wrap_in_speak(text);
        }

        let inner = format!("<prosody {}>{}</prosody>", attrs.join(" "), text);
        wrap_in_speak(&inner)
    }

    /// Build a self-closing `<break>` tag with the given `time` attribute.
    pub fn add_break(time: &str) -> String {
        format!("<break time=\"{}\"/>", time)
    }

    /// Wrap `text` in an `<emphasis>` tag with the given `level` attribute.
    pub fn add_emphasis(text: &str, level: &str) -> String {
        format!("<emphasis level=\"{}\">{}</emphasis>", level, text)
    }
}
462
463/// Process SSML text and return processed text
464/// For now, this is a simple implementation that validates SSML and returns the text content
465pub fn process_ssml(text: &str) -> crate::error::Result<String> {
466    let processor = SsmlProcessor::new();
467
468    // Check if the text is SSML
469    if !processor.is_ssml(text) {
470        // If not SSML, wrap in speak tags
471        return Ok(utils::wrap_in_speak(text));
472    }
473
474    // Validate SSML
475    let issues = processor.validate(text)?;
476
477    // Check for errors
478    let errors: Vec<_> = issues
479        .iter()
480        .filter(|issue| matches!(issue.issue_type, SsmlIssueType::Error))
481        .collect();
482
483    if !errors.is_empty() {
484        let error_messages: Vec<String> = errors
485            .iter()
486            .map(|error| format!("Line {}: {}", error.line, error.message))
487            .collect();
488        return Err(crate::error::CliError::ValidationError(format!(
489            "SSML validation failed:\n{}",
490            error_messages.join("\n")
491        )));
492    }
493
494    // For now, just return the original text since we don't have full SSML processing
495    // In a real implementation, this would convert SSML to synthesis parameters
496    Ok(text.to_string())
497}
498
#[cfg(test)]
mod tests {
    use super::*;

    /// Markup detection needs a leading `<` and a closing-tag opener.
    #[test]
    fn test_is_ssml() {
        let checker = SsmlProcessor::new();

        let markup_samples = ["<speak>Hello</speak>", "  <voice>Text</voice>"];
        for sample in markup_samples.iter() {
            assert!(checker.is_ssml(sample));
        }

        let plain_samples = ["Plain text", "Text with <emphasis> but no closing"];
        for sample in plain_samples.iter() {
            assert!(!checker.is_ssml(sample));
        }
    }

    /// Stripping markup keeps the text between the tags.
    #[test]
    fn test_to_plain_text() {
        let input =
            "<speak><prosody rate=\"slow\">Hello <emphasis>world</emphasis></prosody></speak>";

        let stripped = SsmlProcessor::new().to_plain_text(input);

        assert_eq!(stripped, "Hello world");
    }

    /// Wrapping adds the root element once and never twice.
    #[test]
    fn test_wrap_in_speak() {
        let wrapped = utils::wrap_in_speak("Hello");
        assert_eq!(wrapped, "<speak>Hello</speak>");

        let untouched = utils::wrap_in_speak("<speak>Hello</speak>");
        assert_eq!(untouched, "<speak>Hello</speak>");
    }

    /// Voice name and prosody keywords are mapped to their numeric values.
    #[test]
    fn test_extract_synthesis_params() {
        let input = r#"<speak><voice name="female-voice"><prosody rate="fast" pitch="high" volume="loud">Hello</prosody></voice></speak>"#;

        let extracted = SsmlProcessor::new().extract_synthesis_params(input);

        assert_eq!(extracted.voice, Some("female-voice".to_string()));
        assert_eq!(extracted.speaking_rate, Some(1.25));
        assert_eq!(extracted.pitch_shift, Some(3.0));
        assert_eq!(extracted.volume_gain, Some(6.0));
    }
}