ass_core/utils/utf8/
validation.rs

1//! UTF-8 validation and recovery utilities for ASS subtitle processing
2//!
3//! Provides detailed UTF-8 validation with position-specific error reporting
4//! and recovery mechanisms for handling invalid UTF-8 sequences. Designed
5//! for robust processing of subtitle files with various encoding issues.
6//!
7//! # Examples
8//!
9//! ```rust
10//! use ass_core::utils::utf8::{validate_utf8, recover_utf8};
11//!
12//! let valid_text = "Hello, 世界! 🎵";
13//! assert!(validate_utf8(valid_text.as_bytes()).is_ok());
14//!
15//! let invalid_bytes = &[b'H', b'i', 0xFF, b'!'];
16//! let (recovered, replacements) = recover_utf8(invalid_bytes);
17//! assert_eq!(recovered, "Hi�!");
18//! assert_eq!(replacements, 1);
19//! ```
20
21use crate::utils::CoreError;
22use alloc::{
23    format,
24    string::{String, ToString},
25};
26use core::str;
27
28/// Validate UTF-8 with detailed error information
29///
30/// Provides more detailed error reporting than standard UTF-8 validation,
31/// including the position and nature of encoding errors. Essential for
32/// processing subtitle files with encoding issues.
33///
34/// # Arguments
35///
36/// * `bytes` - Byte sequence to validate
37///
38/// # Returns
39///
40/// `Ok(())` if valid UTF-8, detailed error with position if invalid
41///
42/// # Examples
43///
44/// ```rust
45/// # use ass_core::utils::utf8::validate_utf8;
46/// let valid_text = "Hello, 世界!";
47/// assert!(validate_utf8(valid_text.as_bytes()).is_ok());
48///
49/// let invalid_bytes = &[0xFF, 0xFE, 0x80];
50/// assert!(validate_utf8(invalid_bytes).is_err());
51/// ```
52///
53/// # Errors
54///
55/// Returns an error if the byte slice contains invalid UTF-8 sequences.
56pub fn validate_utf8(bytes: &[u8]) -> Result<(), CoreError> {
57    match str::from_utf8(bytes) {
58        Ok(_) => Ok(()),
59        Err(err) => {
60            let position = err.valid_up_to();
61            let message = err.error_len().map_or_else(
62                || format!("Incomplete UTF-8 sequence at position {position}"),
63                |len| format!("Invalid UTF-8 sequence of {len} bytes at position {position}"),
64            );
65
66            Err(CoreError::utf8_error(position, message))
67        }
68    }
69}
70
/// Attempt to recover from UTF-8 errors by replacing invalid sequences
///
/// Returns valid UTF-8 text in which every invalid or truncated byte
/// sequence has been replaced by the Unicode replacement character (�),
/// together with the number of replacements that were actually made.
///
/// Replacement characters that were already present in the input as valid
/// UTF-8 are copied through unchanged and are not counted, so the count
/// reflects decoding errors only.
///
/// # Arguments
///
/// * `bytes` - Byte sequence that may contain invalid UTF-8
///
/// # Returns
///
/// Tuple of (`recovered_text`, `replacement_count`)
///
/// # Examples
///
/// ```rust
/// # use ass_core::utils::utf8::recover_utf8;
/// let valid_text = "Hello, World!";
/// let (recovered, replacements) = recover_utf8(valid_text.as_bytes());
/// assert_eq!(recovered, "Hello, World!");
/// assert_eq!(replacements, 0);
///
/// let invalid_bytes = &[b'H', b'i', 0xFF, b'!'];
/// let (recovered, replacements) = recover_utf8(invalid_bytes);
/// assert_eq!(recovered, "Hi�!");
/// assert_eq!(replacements, 1);
///
/// // A replacement character that is already in the input is not counted.
/// let mixed = &[0xEF, 0xBF, 0xBD, 0xFF];
/// let (recovered, replacements) = recover_utf8(mixed);
/// assert_eq!(recovered, "��");
/// assert_eq!(replacements, 1);
/// ```
#[must_use]
pub fn recover_utf8(bytes: &[u8]) -> (String, usize) {
    // Fast path: already valid, nothing to replace.
    if let Ok(valid) = str::from_utf8(bytes) {
        return (valid.to_string(), 0);
    }

    // Count decoding errors directly instead of counting U+FFFD in the output:
    // the input may legitimately contain U+FFFD, and those are not replacements.
    // Each error reported here corresponds to one U+FFFD emitted by
    // `from_utf8_lossy` below.
    let mut replacements = 0;
    let mut rest = bytes;
    while let Err(err) = str::from_utf8(rest) {
        replacements += 1;
        match err.error_len() {
            // Skip the valid prefix plus the invalid bytes and keep scanning.
            Some(len) => rest = rest.get(err.valid_up_to() + len..).unwrap_or_default(),
            // Input ends mid-sequence: the whole tail becomes one replacement.
            None => break,
        }
    }

    (String::from_utf8_lossy(bytes).into_owned(), replacements)
}
110
/// Report whether every character of `text` is acceptable in ASS content
///
/// A character is accepted when it is one of:
///
/// * a printable ASCII character or a plain space,
/// * a tab, line feed or carriage return,
/// * any non-ASCII character that is not a control character.
///
/// All other characters (for example NUL, other ASCII control codes and
/// DEL) make the text invalid.
///
/// # Arguments
///
/// * `text` - Text content to validate
///
/// # Returns
///
/// `true` if all characters are accepted (an empty string is accepted),
/// `false` as soon as one character is not
#[must_use]
pub fn is_valid_ass_text(text: &str) -> bool {
    let is_allowed = |ch: char| match ch {
        // Whitespace that is explicitly permitted
        ' ' | '\t' | '\n' | '\r' => true,
        // Remaining ASCII must be visible
        _ if ch.is_ascii() => ch.is_ascii_graphic(),
        // Non-ASCII is fine unless it is a control character
        _ => !ch.is_control(),
    };

    text.chars().all(is_allowed)
}
135
/// Cut text down to a byte budget without splitting a character
///
/// If `text` already fits within `max_bytes` it is returned untouched.
/// Otherwise the longest prefix that is no longer than `max_bytes` and ends
/// on a UTF-8 character boundary is returned, so the result is always a
/// valid string slice of the input.
///
/// # Arguments
///
/// * `text` - Input text to truncate
/// * `max_bytes` - Maximum byte length
///
/// # Returns
///
/// Tuple of (`truncated_text`, `was_truncated`)
///
/// # Examples
///
/// ```rust
/// # use ass_core::utils::utf8::truncate_at_char_boundary;
/// let text = "Hello World";
/// let (truncated, was_truncated) = truncate_at_char_boundary(text, 5);
/// assert_eq!(truncated, "Hello");
/// assert!(was_truncated);
///
/// let text = "Hello 世界";
/// let (truncated, was_truncated) = truncate_at_char_boundary(text, 8);
/// assert_eq!(truncated, "Hello "); // Stops before the Unicode character
/// assert!(was_truncated);
/// ```
#[must_use]
pub fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= text.len() {
        return (text, false);
    }

    // Walk back from the limit to the nearest character boundary. Offset 0
    // is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&offset| text.is_char_boundary(offset))
        .unwrap_or(0);

    (&text[..cut], true)
}
178
/// Tally the Unicode replacement characters in a string
///
/// Each occurrence of U+FFFD (�) is counted. These characters usually
/// appear where earlier decoding had to substitute for bytes it could not
/// interpret, so the tally is a rough indicator of encoding damage.
///
/// # Arguments
///
/// * `text` - Text to analyze
///
/// # Returns
///
/// Number of replacement characters found
#[must_use]
pub fn count_replacement_chars(text: &str) -> usize {
    text.chars().filter(|&ch| ch == '\u{FFFD}').count()
}
196
#[cfg(test)]
mod tests {
    use super::*;

    /// Text mixing 1-, 3- and 4-byte sequences validates cleanly.
    #[test]
    fn validate_valid_utf8() {
        let outcome = validate_utf8("Hello, 世界! 🎵".as_bytes());
        assert!(outcome.is_ok());
    }

    /// Bytes that can never appear in UTF-8 are rejected.
    #[test]
    fn validate_invalid_utf8() {
        let garbage: &[u8] = &[0xFF, 0xFE, 0x80];
        assert!(validate_utf8(garbage).is_err());
    }

    /// A lead byte with no continuation byte is rejected.
    #[test]
    fn validate_incomplete_utf8() {
        let dangling_lead: &[u8] = &[0xC2];
        assert!(validate_utf8(dangling_lead).is_err());
    }

    /// Valid input passes through recovery untouched.
    #[test]
    fn recover_valid_utf8() {
        let (text, count) = recover_utf8(b"Hello, World!");
        assert_eq!(text, "Hello, World!");
        assert_eq!(count, 0);
    }

    /// A single bad byte becomes a single replacement character.
    #[test]
    fn recover_invalid_utf8() {
        let (text, count) = recover_utf8(b"Hi\xFF!");
        assert_eq!(text, "Hi�!");
        assert_eq!(count, 1);
    }

    /// Separate bad bytes are each replaced and counted.
    #[test]
    fn recover_multiple_invalid_sequences() {
        let (text, count) = recover_utf8(b"A\xFFB\xFEC");
        assert_eq!(text, "A�B�C");
        assert_eq!(count, 2);
    }

    /// Printable text and permitted whitespace pass; control codes do not.
    #[test]
    fn valid_ass_text() {
        for accepted in ["Hello World", "Hello\tWorld\n", "Hello 世界"] {
            assert!(is_valid_ass_text(accepted));
        }

        // NUL and another C0 control character
        for rejected in ["Hello\x00World", "Hello\x1FWorld"] {
            assert!(!is_valid_ass_text(rejected));
        }
    }

    /// ASCII text is cut exactly at the requested length.
    #[test]
    fn truncate_ascii() {
        let (head, cut) = truncate_at_char_boundary("Hello World", 5);
        assert_eq!(head, "Hello");
        assert!(cut);
    }

    /// A limit inside a multi-byte character backs up to before it.
    #[test]
    fn truncate_unicode() {
        let (head, cut) = truncate_at_char_boundary("Hello 世界", 8);
        assert_eq!(head, "Hello ");
        assert!(cut);
    }

    /// Text shorter than the limit is returned as-is.
    #[test]
    fn truncate_no_change() {
        let (head, cut) = truncate_at_char_boundary("Hello", 10);
        assert_eq!(head, "Hello");
        assert!(!cut);
    }

    /// A limit that lands exactly on a boundary keeps the whole character.
    #[test]
    fn truncate_at_unicode_boundary() {
        let (head, cut) = truncate_at_char_boundary("世界", 3);
        assert_eq!(head, "世");
        assert!(cut);
    }

    /// Replacement characters are counted wherever they occur.
    #[test]
    fn count_replacement_characters() {
        let cases = [
            ("Hello World", 0),
            ("Hello � World", 1),
            ("� Test � Again �", 3),
        ];

        for (sample, expected) in cases {
            assert_eq!(count_replacement_chars(sample), expected);
        }
    }
}