ass_core/utils/utf8/
mod.rs

1//! UTF-8 and text encoding utilities for ASS script processing
2//!
3//! Provides BOM handling, encoding detection, and UTF-8 validation utilities
4//! optimized for ASS subtitle script processing with zero-copy design.
5//!
6//! # Features
7//!
8//! - BOM detection and stripping for common encodings
9//! - UTF-8 validation with detailed error reporting
10//! - Encoding detection for legacy ASS files
11//! - `nostd` compatible implementation
12//! - Zero-copy operations where possible
13//!
14//! # Examples
15//!
16//! ```rust
17//! use ass_core::utils::utf8::{strip_bom, detect_encoding, validate_utf8};
18//!
19//! // Strip BOM if present
20//! let input = "\u{FEFF}[Script Info]\nTitle: Test";
21//! let (stripped, had_bom) = strip_bom(input);
22//! assert_eq!(stripped, "[Script Info]\nTitle: Test");
23//! assert!(had_bom);
24//!
25//! // Detect encoding
26//! let text = "[Script Info]\nTitle: Test";
27//! let encoding = detect_encoding(text.as_bytes());
28//! assert_eq!(encoding.encoding, "UTF-8");
29//! assert!(encoding.confidence > 0.8);
30//!
31//! // Validate UTF-8
32//! let valid_text = "Hello, 世界! 🎵";
33//! assert!(validate_utf8(valid_text.as_bytes()).is_ok());
34//! ```
35
36mod bom;
37mod encoding;
38mod normalization;
39mod validation;
40
41// Re-export all public types and functions for API compatibility
42pub use bom::{detect_bom, strip_bom, BomType};
43pub use encoding::{detect_encoding, is_likely_ass_content, EncodingInfo};
44pub use normalization::{
45    normalize_line_endings, normalize_whitespace, remove_control_chars, trim_lines,
46};
47pub use validation::{
48    count_replacement_chars, is_valid_ass_text, recover_utf8, truncate_at_char_boundary,
49    validate_utf8,
50};
51
#[cfg(test)]
mod tests {
    use super::*;

    /// A leading U+FEFF is stripped from text, and the byte sequence
    /// `EF BB BF` is reported as a UTF-8 BOM with a skip length of 3.
    #[test]
    fn integration_bom_detection() {
        let (body, bom_found) = strip_bom("\u{FEFF}Hello World");
        assert_eq!(body, "Hello World");
        assert!(bom_found);

        let raw = &[0xEF, 0xBB, 0xBF, b'H', b'i'];
        let (kind, bom_len) = detect_bom(raw).unwrap();
        assert_eq!(kind, BomType::Utf8);
        assert_eq!(bom_len, 3);
    }

    /// Plain ASS-looking input is classified as BOM-less UTF-8 and is
    /// recognised as ASS content, while ordinary prose is not.
    #[test]
    fn integration_encoding_detection() {
        let script = "[Script Info]\nTitle: Test Script";

        let info = detect_encoding(script.as_bytes());
        assert_eq!(info.encoding, "UTF-8");
        // ASS patterns in the input are expected to yield high confidence.
        assert!(info.confidence > 0.9);
        assert!(!info.has_bom);

        assert!(is_likely_ass_content(script));
        assert!(!is_likely_ass_content("Just regular text"));
    }

    /// Well-formed multi-byte text validates; a stray `0xFF` byte is recovered
    /// as exactly one replacement character.
    #[test]
    fn integration_validation_and_recovery() {
        assert!(validate_utf8("Hello, 世界! 🎵".as_bytes()).is_ok());

        let broken = &[b'H', b'i', 0xFF, b'!'];
        let (repaired, replaced) = recover_utf8(broken);
        assert_eq!(repaired, "Hi�!");
        assert_eq!(replaced, 1);

        assert_eq!(count_replacement_chars(&repaired), 1);
    }

    /// Each normalization helper is checked on its own small input.
    #[test]
    fn integration_normalization() {
        // Mixed CRLF / CR / LF all become LF.
        let unified = normalize_line_endings("Line 1\r\nLine 2\rLine 3\n");
        assert_eq!(unified, "Line 1\nLine 2\nLine 3\n");

        // Runs of spaces shrink to a single space.
        let collapsed = normalize_whitespace("Hello    World   Test", true);
        assert_eq!(collapsed, "Hello World Test");

        // NUL and 0x1F are dropped; the newline survives.
        let scrubbed = remove_control_chars("Hello\x00World\x1FTest\nValid");
        assert_eq!(scrubbed, "HelloWorldTest\nValid");
    }

    /// Printable text (including tab, newline and CJK) is accepted; text that
    /// contains NUL or 0x1F is rejected.
    #[test]
    fn integration_text_validation() {
        let accepted = ["Hello World", "Hello\tWorld\n", "Hello 世界"];
        for &text in &accepted {
            assert!(is_valid_ass_text(text));
        }

        let rejected = [
            "Hello\x00World", // Null character
            "Hello\x1FWorld", // Control character
        ];
        for &text in &rejected {
            assert!(!is_valid_ass_text(text));
        }
    }

    /// Truncation cases as `(input, limit, expected prefix, truncated?)`.
    #[test]
    fn integration_truncation() {
        let cases = [
            ("Hello World", 5, "Hello", true),
            // Limit 8 falls inside a multi-byte character, so the cut lands before it.
            ("Hello 世界", 8, "Hello ", true),
            // Input shorter than the limit comes back untouched.
            ("Hi", 10, "Hi", false),
        ];

        for &(text, limit, expected, expect_cut) in &cases {
            let (head, was_cut) = truncate_at_char_boundary(text, limit);
            assert_eq!(head, expected);
            assert_eq!(was_cut, expect_cut);
        }
    }

    /// Runs the helpers back to back on one messy script: BOM, CRLF line
    /// endings, an embedded NUL and a run of spaces.
    #[test]
    fn integration_full_workflow() {
        let raw = "\u{FEFF}[Script Info]\r\nTitle: Test\x00Script\r\n\r\n[Events]\nDialogue: Hello    World";

        // BOM first, then line endings, control characters, and whitespace.
        let (body, bom_found) = strip_bom(raw);
        assert!(bom_found);

        let unified = normalize_line_endings(body);
        let scrubbed = remove_control_chars(&unified);
        let result = normalize_whitespace(&scrubbed, true);

        let expected_fragments = [
            "[Script Info]",
            "Title: TestScript",     // Control char removed
            "Dialogue: Hello World", // Whitespace normalized
        ];
        for &fragment in &expected_fragments {
            assert!(result.contains(fragment));
        }
        // No carriage returns may remain.
        assert!(!result.contains('\r'));
    }
}