ass_core/analysis/events/
text_analysis.rs

1//! Text content analysis for ASS dialogue events
2//!
3//! Provides comprehensive analysis of dialogue text including override tag parsing,
4//! Unicode complexity detection, and character counting. Uses zero-copy design
5//! with lifetime-generic references to original text.
6//!
7//! # Features
8//!
9//! - Override tag extraction and complexity scoring
10//! - Plain text extraction (tags removed)
11//! - Unicode bidirectional text detection
12//! - Character and line counting
13//! - Zero-copy tag argument references
14//!
15//! # Performance
16//!
17//! - Target: <0.5ms per event text analysis
18//! - Memory: Minimal allocations via string slices
19//! - Unicode: Efficient detection without full normalization
20
21use crate::{
22    analysis::events::tags::{parse_override_block, DiagnosticKind, OverrideTag, TagDiagnostic},
23    utils::{errors::resource::check_depth_limit, CoreError},
24    Result,
25};
26
27#[cfg(feature = "plugins")]
28use crate::analysis::events::tags::parse_override_block_with_registry;
29
30#[cfg(feature = "plugins")]
31use crate::plugin::ExtensionRegistry;
32use alloc::{string::String, vec::Vec};
33
/// Analysis results for dialogue text content
///
/// Produced by [`TextAnalysis::analyze`]. The plain text is an owned `String`
/// assembled during analysis; the override tags and diagnostics are
/// parameterized by `'a`, the lifetime of the analyzed text.
#[derive(Debug, Clone)]
pub struct TextAnalysis<'a> {
    /// Plain text with override blocks removed and `\N`/`\n`/`\h` escapes resolved
    plain_text: String,
    /// Number of `char`s (Unicode scalar values) in `plain_text`
    char_count: usize,
    /// Line count of `plain_text` after processing linebreaks (never less than 1)
    line_count: usize,
    /// `plain_text` contains characters from right-to-left script ranges
    has_bidi_text: bool,
    /// `plain_text` contains code points above U+00FF or C0/C1 control characters
    has_complex_unicode: bool,
    /// Override tags collected from every `{...}` block, in scan order
    override_tags: Vec<OverrideTag<'a>>,
    /// Parse diagnostics collected during analysis (e.g. empty `{}` blocks)
    parse_diagnostics: Vec<TagDiagnostic<'a>>,
}
55
56impl<'a> TextAnalysis<'a> {
57    /// Analyze dialogue text content comprehensively
58    ///
59    /// Extracts plain text, parses override tags, and analyzes Unicode
60    /// complexity. Uses zero-copy references for tag arguments.
61    ///
62    /// # Arguments
63    ///
64    /// * `text` - Original dialogue text with potential override tags
65    ///
66    /// # Returns
67    ///
68    /// Complete text analysis results or parsing error.
69    ///
70    /// # Example
71    ///
72    /// ```rust
73    /// # use ass_core::analysis::events::text_analysis::TextAnalysis;
74    /// let text = "Hello {\\b1}world{\\b0}!";
75    /// let analysis = TextAnalysis::analyze(text)?;
76    /// assert_eq!(analysis.plain_text(), "Hello world!");
77    /// assert_eq!(analysis.override_tags().len(), 2);
78    /// # Ok::<(), Box<dyn std::error::Error>>(())
79    /// ```
80    ///
81    /// # Errors
82    ///
83    /// Returns an error if text parsing fails or contains invalid override tags.
84    pub fn analyze(text: &'a str) -> Result<Self> {
85        #[cfg(feature = "plugins")]
86        return Self::analyze_with_registry(text, None);
87        #[cfg(not(feature = "plugins"))]
88        return Self::analyze_impl(text);
89    }
90
    /// Analyze dialogue text content with extension registry support
    ///
    /// Same as [`analyze`](Self::analyze), except that override blocks are parsed
    /// through `parse_override_block_with_registry` when a registry is supplied.
    /// With `None` the standard `parse_override_block` is used.
    /// NOTE(review): how tags without a registered handler are treated is decided
    /// inside the registry-aware parser, which is not part of this file.
    ///
    /// # Arguments
    ///
    /// * `text` - Original dialogue text with potential override tags
    /// * `registry` - Optional registry for custom tag handlers
    ///
    /// # Returns
    ///
    /// Complete text analysis results or parsing error.
    ///
    /// # Errors
    ///
    /// Returns a parse error when braces are nested beyond the analyzer's
    /// maximum depth. Tag-level problems are recorded as diagnostics instead.
    #[cfg(feature = "plugins")]
    pub fn analyze_with_registry(
        text: &'a str,
        registry: Option<&ExtensionRegistry>,
    ) -> Result<Self> {
        Self::analyze_impl_with_registry(text, registry)
    }
115
    /// Internal implementation without plugins support
    ///
    /// Only compiled when the `plugins` feature is disabled; it forwards to the
    /// shared scanner, which takes no registry parameter in that configuration.
    #[cfg(not(feature = "plugins"))]
    fn analyze_impl(text: &'a str) -> Result<Self> {
        Self::analyze_impl_with_registry(text)
    }
121
122    /// Internal implementation of analysis with optional registry support
123    fn analyze_impl_with_registry(
124        text: &'a str,
125        #[cfg(feature = "plugins")] registry: Option<&ExtensionRegistry>,
126    ) -> Result<Self> {
127        const MAX_BRACE_DEPTH: usize = 100; // Prevent DoS with deeply nested braces
128
129        let mut override_tags = Vec::new();
130        let mut parse_diagnostics = Vec::new();
131
132        let mut plain_text = String::new();
133        let mut position = 0;
134        let mut drawing_mode = false;
135
136        let mut chars = text.chars();
137        while let Some(ch) = chars.next() {
138            if ch == '{' {
139                let mut brace_count = 1;
140                let tag_start = position + ch.len_utf8();
141
142                for inner_ch in chars.by_ref() {
143                    position += inner_ch.len_utf8();
144
145                    if inner_ch == '{' {
146                        brace_count += 1;
147                        // Check for excessive nesting depth to prevent DoS
148                        if check_depth_limit(brace_count, MAX_BRACE_DEPTH).is_err() {
149                            return Err(CoreError::parse("Maximum brace nesting depth exceeded"));
150                        }
151                    } else if inner_ch == '}' {
152                        brace_count -= 1;
153                        if brace_count == 0 {
154                            break;
155                        }
156                    }
157                }
158
159                if position > tag_start {
160                    let tag_content = &text[tag_start..position];
161
162                    #[cfg(feature = "plugins")]
163                    if let Some(registry) = registry {
164                        parse_override_block_with_registry(
165                            tag_content,
166                            tag_start,
167                            &mut override_tags,
168                            &mut parse_diagnostics,
169                            Some(registry),
170                        );
171                    } else {
172                        parse_override_block(
173                            tag_content,
174                            tag_start,
175                            &mut override_tags,
176                            &mut parse_diagnostics,
177                        );
178                    }
179
180                    #[cfg(not(feature = "plugins"))]
181                    parse_override_block(
182                        tag_content,
183                        tag_start,
184                        &mut override_tags,
185                        &mut parse_diagnostics,
186                    );
187
188                    // Check for drawing mode changes in this tag block
189                    drawing_mode = Self::update_drawing_mode(tag_content, drawing_mode);
190                } else {
191                    parse_diagnostics.push(TagDiagnostic {
192                        span: &text[tag_start..position.max(tag_start + 1)],
193                        offset: tag_start,
194                        kind: DiagnosticKind::EmptyOverride,
195                    });
196                }
197            } else if ch == '\\' {
198                if let Some(next_ch) = chars.next() {
199                    position += next_ch.len_utf8();
200                    match next_ch {
201                        'n' | 'N' => {
202                            if !drawing_mode {
203                                plain_text.push('\n');
204                            }
205                        }
206                        'h' => {
207                            if !drawing_mode {
208                                plain_text.push('\u{00A0}');
209                            }
210                        }
211                        _ => {
212                            if !drawing_mode {
213                                plain_text.push(ch);
214                                plain_text.push(next_ch);
215                            }
216                        }
217                    }
218                }
219            } else if !drawing_mode {
220                plain_text.push(ch);
221            }
222
223            position += ch.len_utf8();
224        }
225
226        let char_count = plain_text.chars().count();
227        let line_count = Self::count_lines(&plain_text);
228        let has_bidi_text = Self::detect_bidi_text(&plain_text);
229        let has_complex_unicode = Self::detect_complex_unicode(&plain_text);
230
231        Ok(Self {
232            plain_text,
233            char_count,
234            line_count,
235            has_bidi_text,
236            has_complex_unicode,
237            override_tags,
238            parse_diagnostics,
239        })
240    }
241
    /// Get plain text without override tags
    ///
    /// Escapes are already resolved (`\N`/`\n` as `'\n'`, `\h` as U+00A0) and
    /// text that appeared while drawing mode was active is not included.
    #[must_use]
    pub fn plain_text(&self) -> &str {
        &self.plain_text
    }
247
    /// Get Unicode character count
    ///
    /// Counts `char`s (Unicode scalar values) of the plain text, not bytes.
    #[must_use]
    pub const fn char_count(&self) -> usize {
        self.char_count
    }
253
    /// Get line count after processing linebreaks
    ///
    /// Computed on the plain text; empty text still counts as one line.
    #[must_use]
    pub const fn line_count(&self) -> usize {
        self.line_count
    }
259
    /// Check if text contains bidirectional content
    ///
    /// `true` when the plain text holds at least one character from a
    /// right-to-left script range (for example Hebrew or Arabic).
    #[must_use]
    pub const fn has_bidi_text(&self) -> bool {
        self.has_bidi_text
    }
265
    /// Check if text contains complex Unicode beyond basic Latin
    ///
    /// `true` when the plain text holds a code point above U+00FF or a C0/C1
    /// control character.
    #[must_use]
    pub const fn has_complex_unicode(&self) -> bool {
        self.has_complex_unicode
    }
271
    /// Get parsed override tags
    ///
    /// Tags appear in the order their override blocks were scanned.
    #[must_use]
    pub fn override_tags(&self) -> &[OverrideTag<'a>] {
        &self.override_tags
    }
277
    /// Get parse diagnostics collected during analysis
    ///
    /// Includes the `EmptyOverride` entries pushed for empty `{}` blocks as well
    /// as anything reported by the override-block parser.
    #[must_use]
    pub fn diagnostics(&self) -> &[TagDiagnostic<'a>] {
        &self.parse_diagnostics
    }
283
284    /// Update drawing mode state based on override tag content
285    fn update_drawing_mode(tag_content: &str, current_mode: bool) -> bool {
286        let mut pos = 0;
287        let chars: Vec<char> = tag_content.chars().collect();
288        let mut drawing_mode = current_mode;
289
290        while pos < chars.len() {
291            if chars[pos] == '\\' && pos + 1 < chars.len() && chars[pos + 1] == 'p' {
292                pos += 2;
293                let mut number_str = String::new();
294
295                while pos < chars.len() && (chars[pos].is_ascii_digit() || chars[pos] == '-') {
296                    number_str.push(chars[pos]);
297                    pos += 1;
298                }
299
300                if let Ok(p_value) = number_str.parse::<i32>() {
301                    drawing_mode = p_value > 0;
302                }
303            } else {
304                pos += 1;
305            }
306        }
307
308        drawing_mode
309    }
310
311    /// Count lines correctly, handling empty lines and trailing newlines
312    fn count_lines(text: &str) -> usize {
313        if text.is_empty() {
314            return 1;
315        }
316
317        // For ASS subtitles, count newlines and add 1, but handle special cases
318        let newline_count = text.chars().filter(|&ch| ch == '\n').count();
319
320        if newline_count == 0 {
321            // No newlines means 1 line
322            1
323        } else if text.trim_end_matches('\n').is_empty() {
324            // Text is only newlines - each newline creates a line boundary
325            newline_count + 1
326        } else {
327            // Text has content - trailing newlines don't create additional lines
328            // Use lines() count which handles this correctly
329            text.lines().count().max(1)
330        }
331    }
332
333    /// Detect bidirectional text (RTL scripts)
334    fn detect_bidi_text(text: &str) -> bool {
335        text.chars().any(|ch| matches!(ch as u32, 0x0590..=0x05FF | 0x0600..=0x06FF | 0x0750..=0x077F | 0x08A0..=0x08FF))
336    }
337
338    /// Detect complex Unicode beyond basic Latin
339    fn detect_complex_unicode(text: &str) -> bool {
340        text.chars().any(|ch| {
341            let code = ch as u32;
342            code > 0x00FF || matches!(code, 0x0000..=0x001F | 0x007F..=0x009F | 0x200C..=0x200D | 0x2060..=0x206F)
343        })
344    }
345}
346
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::string::ToString;

    /// Runs the analyzer on a fixture that is expected to parse.
    fn analyze(text: &str) -> TextAnalysis<'_> {
        TextAnalysis::analyze(text).unwrap()
    }

    #[test]
    fn text_analysis_simple_text() {
        let result = analyze("Hello world!");

        assert_eq!(result.plain_text(), "Hello world!");
        assert_eq!(result.char_count(), 12);
        assert_eq!(result.line_count(), 1);
        assert!(!result.has_bidi_text());
        assert!(!result.has_complex_unicode());
        assert!(result.override_tags().is_empty());
        assert!(result.diagnostics().is_empty());
    }

    #[test]
    fn text_analysis_with_override_tags() {
        let result = analyze("Hello {\\b1}bold{\\b0} world!");

        assert_eq!(result.plain_text(), "Hello bold world!");
        assert_eq!(result.char_count(), 17);
        assert_eq!(result.line_count(), 1);
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_nested_braces() {
        let result = analyze("Text {\\pos(100,{\\some}200)} more text");

        assert_eq!(result.plain_text(), "Text  more text");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_line_breaks() {
        let result = analyze("First line\\NSecond line\\nThird line");

        assert_eq!(result.plain_text(), "First line\nSecond line\nThird line");
        assert_eq!(result.line_count(), 3);
    }

    #[test]
    fn text_analysis_hard_spaces() {
        let result = analyze("Text\\hwith\\hhard\\hspaces");

        assert_eq!(
            result.plain_text(),
            "Text\u{00A0}with\u{00A0}hard\u{00A0}spaces"
        );
    }

    #[test]
    fn text_analysis_mixed_escapes() {
        let result = analyze("Line 1\\NLine 2\\hspace\\nLine 3");

        assert_eq!(result.plain_text(), "Line 1\nLine 2\u{00A0}space\nLine 3");
        assert_eq!(result.line_count(), 3);
    }

    #[test]
    fn text_analysis_bidi_text_arabic() {
        let result = analyze("Hello مرحبا world");

        assert!(result.has_bidi_text());
        assert!(result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_bidi_text_hebrew() {
        let result = analyze("Hello שלום world");

        assert!(result.has_bidi_text());
        assert!(result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_complex_unicode_emoji() {
        let result = analyze("Hello 🌍 world");

        assert!(!result.has_bidi_text());
        assert!(result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_complex_unicode_control_chars() {
        let result = analyze("Text\u{200C}with\u{200D}controls");

        assert!(result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_basic_latin_only() {
        let result = analyze("Basic ASCII text 123!@#");

        assert!(!result.has_bidi_text());
        assert!(!result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_extended_latin() {
        let result = analyze("Café naïve résumé");

        assert!(!result.has_bidi_text());
        // Accented Latin-1 letters stay below the "complex" threshold.
        assert!(!result.has_complex_unicode());
    }

    #[test]
    fn text_analysis_empty_override_blocks() {
        let result = analyze("Text {} more text");

        assert_eq!(result.plain_text(), "Text  more text");
        // An empty block must be reported through a diagnostic.
        assert!(!result.diagnostics().is_empty());
    }

    #[test]
    fn text_analysis_unmatched_braces() {
        // An unclosed block swallows the rest of the line without failing.
        let result = analyze("Text {\\b1 unmatched");

        assert_eq!(result.plain_text(), "Text ");
    }

    #[test]
    fn text_analysis_multiple_override_blocks() {
        let result = analyze("{\\b1}Bold{\\b0} and {\\i1}italic{\\i0} text");

        assert_eq!(result.plain_text(), "Bold and italic text");
        assert_eq!(result.override_tags().len(), 4);
    }

    #[test]
    fn text_analysis_complex_tags() {
        let result = analyze(
            "{\\move(0,0,100,100)}{\\t(0,1000,\\fscx120)}{\\fade(255,0,0,0,800,900,1000)}Animated text",
        );

        assert_eq!(result.plain_text(), "Animated text");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_drawing_commands() {
        let result = analyze("{\\p1}m 0 0 l 100 0 100 100 0 100{\\p0}Square");

        assert_eq!(result.plain_text(), "Square");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_color_tags() {
        let result = analyze("{\\c&H0000FF&}Red text{\\c} and {\\1c&H00FF00&}green text");

        assert_eq!(result.plain_text(), "Red text and green text");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_mixed_content() {
        let result = analyze("Start {\\b1}bold\\N{\\i1}italic{\\i0}{\\b0}\\hnormal end");

        assert_eq!(
            result.plain_text(),
            "Start bold\nitalic\u{00A0}normal end"
        );
        assert_eq!(result.line_count(), 2);
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_whitespace_only() {
        let result = analyze("   \t\n  ");

        assert_eq!(result.plain_text(), "   \t\n  ");
        assert_eq!(result.char_count(), 7);
        assert_eq!(result.line_count(), 2);
    }

    #[test]
    fn text_analysis_empty_text() {
        let result = analyze("");

        assert_eq!(result.plain_text(), "");
        assert_eq!(result.char_count(), 0);
        assert_eq!(result.line_count(), 1); // Minimum 1 line
        assert!(result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_only_override_tags() {
        let result = analyze("{\\b1}{\\i1}{\\u1}");

        assert_eq!(result.plain_text(), "");
        assert_eq!(result.char_count(), 0);
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_escape_sequences() {
        let result = analyze("Test`[Events]`backslash and \\{brace and \\}close");

        // Unknown escapes are kept as literal characters.
        assert_eq!(
            result.plain_text(),
            "Test`[Events]`backslash and \\{brace and \\}close"
        );
    }

    #[test]
    fn text_analysis_karaoke_tags() {
        let result = analyze("{\\k50}Ka{\\k30}ra{\\k70}o{\\k40}ke");

        assert_eq!(result.plain_text(), "Karaoke");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_position_and_rotation() {
        let result = analyze("{\\pos(320,240)}{\\frz45}Rotated positioned text");

        assert_eq!(result.plain_text(), "Rotated positioned text");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_very_long_text() {
        let long_text = "A".repeat(1000);
        let result = analyze(&long_text);

        assert_eq!(result.char_count(), 1000);
        assert_eq!(result.plain_text().len(), 1000);
    }

    #[test]
    fn text_analysis_line_count_edge_cases() {
        let cases = [
            // Text ending with newline
            ("Line 1\\nLine 2\\n", 2),
            // Multiple consecutive newlines
            ("Line 1\\n\\n\\nLine 2", 4),
            // Only newlines
            ("\\n\\N\\n", 4),
        ];

        for (input, expected_lines) in cases {
            assert_eq!(analyze(input).line_count(), expected_lines);
        }
    }

    #[test]
    fn text_analysis_excessive_brace_nesting() {
        // 110 nested levels are beyond the analyzer's depth limit.
        let mut text = "{".repeat(110);
        text.push_str("\\b1");
        text.push_str(&"}".repeat(110));

        let result = TextAnalysis::analyze(&text);
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Maximum brace nesting depth exceeded"));
    }

    #[test]
    fn text_analysis_drawing_mode_escape_sequences() {
        // Escapes inside a drawing section must not leak into the plain text;
        // normal processing resumes after the `\p0` block.
        let result = analyze("{\\p1}Line1\\nLine2\\hSpace\\NNewline{\\p0}Normal\\ntext");

        assert_eq!(result.plain_text(), "Normal\ntext");
        assert!(!result.override_tags().is_empty());
    }

    #[test]
    fn text_analysis_drawing_mode_p_value_parsing() {
        let cases = [
            ("{\\p0}Not drawing mode", "Not drawing mode"),
            // Any positive `\p` value enables drawing mode and hides the text.
            ("{\\p1}Drawing mode", ""),
            ("{\\p5}Also drawing mode", ""),
        ];

        for (input, expected_plain) in cases {
            assert_eq!(analyze(input).plain_text(), expected_plain);
        }
    }

    #[test]
    fn text_analysis_line_count_only_newlines() {
        // Exercises the "text is only newlines" branch of the line counter.
        let result = analyze("\n\n\n");
        assert_eq!(result.line_count(), 4); // 3 newlines = 4 lines
    }

    #[test]
    fn text_analysis_drawing_mode_mixed_escapes() {
        // All escape kinds inside drawing mode are dropped; the text after
        // `\p0` is processed normally.
        let result = analyze("{\\p1}Start\\nNew\\NLine\\hHard{\\p0}End\\nNormal");

        assert_eq!(result.plain_text(), "End\nNormal");
        assert!(!result.override_tags().is_empty());
    }
}