ass_core/utils/utf8/validation.rs
1//! UTF-8 validation and recovery utilities for ASS subtitle processing
2//!
3//! Provides detailed UTF-8 validation with position-specific error reporting
4//! and recovery mechanisms for handling invalid UTF-8 sequences. Designed
5//! for robust processing of subtitle files with various encoding issues.
6//!
7//! # Examples
8//!
9//! ```rust
10//! use ass_core::utils::utf8::{validate_utf8, recover_utf8};
11//!
12//! let valid_text = "Hello, 世界! 🎵";
13//! assert!(validate_utf8(valid_text.as_bytes()).is_ok());
14//!
15//! let invalid_bytes = &[b'H', b'i', 0xFF, b'!'];
16//! let (recovered, replacements) = recover_utf8(invalid_bytes);
17//! assert_eq!(recovered, "Hi�!");
18//! assert_eq!(replacements, 1);
19//! ```
20
21use crate::utils::CoreError;
22use alloc::{
23 format,
24 string::{String, ToString},
25};
26use core::str;
27
28/// Validate UTF-8 with detailed error information
29///
30/// Provides more detailed error reporting than standard UTF-8 validation,
31/// including the position and nature of encoding errors. Essential for
32/// processing subtitle files with encoding issues.
33///
34/// # Arguments
35///
36/// * `bytes` - Byte sequence to validate
37///
38/// # Returns
39///
40/// `Ok(())` if valid UTF-8, detailed error with position if invalid
41///
42/// # Examples
43///
44/// ```rust
45/// # use ass_core::utils::utf8::validate_utf8;
46/// let valid_text = "Hello, 世界!";
47/// assert!(validate_utf8(valid_text.as_bytes()).is_ok());
48///
49/// let invalid_bytes = &[0xFF, 0xFE, 0x80];
50/// assert!(validate_utf8(invalid_bytes).is_err());
51/// ```
52///
53/// # Errors
54///
55/// Returns an error if the byte slice contains invalid UTF-8 sequences.
56pub fn validate_utf8(bytes: &[u8]) -> Result<(), CoreError> {
57 match str::from_utf8(bytes) {
58 Ok(_) => Ok(()),
59 Err(err) => {
60 let position = err.valid_up_to();
61 let message = err.error_len().map_or_else(
62 || format!("Incomplete UTF-8 sequence at position {position}"),
63 |len| format!("Invalid UTF-8 sequence of {len} bytes at position {position}"),
64 );
65
66 Err(CoreError::utf8_error(position, message))
67 }
68 }
69}
70
/// Recover text from possibly invalid UTF-8 by substituting bad sequences
///
/// Every invalid or truncated byte sequence is replaced by a single Unicode
/// replacement character (�), producing the same text as
/// `String::from_utf8_lossy`. The returned count is the number of
/// substitutions actually performed: replacement characters that were
/// already validly encoded in the input are preserved in the output but are
/// not counted.
///
/// # Arguments
///
/// * `bytes` - Byte sequence that may contain invalid UTF-8
///
/// # Returns
///
/// Tuple of (`recovered_text`, `replacement_count`)
///
/// # Examples
///
/// ```rust
/// # use ass_core::utils::utf8::recover_utf8;
/// let valid_text = "Hello, World!";
/// let (recovered, replacements) = recover_utf8(valid_text.as_bytes());
/// assert_eq!(recovered, "Hello, World!");
/// assert_eq!(replacements, 0);
///
/// let invalid_bytes = &[b'H', b'i', 0xFF, b'!'];
/// let (recovered, replacements) = recover_utf8(invalid_bytes);
/// assert_eq!(recovered, "Hi�!");
/// assert_eq!(replacements, 1);
/// ```
#[must_use]
pub fn recover_utf8(bytes: &[u8]) -> (String, usize) {
    // Fast path: valid input is copied once and needs no replacements.
    let mut error = match str::from_utf8(bytes) {
        Ok(valid) => return (valid.to_string(), 0),
        Err(err) => err,
    };

    let mut recovered = String::with_capacity(bytes.len());
    let mut replacements = 0;
    let mut remaining = bytes;

    loop {
        let (valid, rest) = remaining.split_at(error.valid_up_to());

        // `valid` is the prefix that `from_utf8` accepted before reporting
        // `error`, so the lossy conversion borrows it unchanged.
        recovered.push_str(&String::from_utf8_lossy(valid));
        recovered.push('\u{FFFD}');
        replacements += 1;

        match error.error_len() {
            // Input ended in the middle of a character: the whole truncated
            // tail collapses into the single replacement pushed above.
            None => break,
            // Skip the invalid sequence and continue decoding after it.
            Some(invalid_len) => remaining = &rest[invalid_len..],
        }

        match str::from_utf8(remaining) {
            Ok(tail) => {
                recovered.push_str(tail);
                break;
            }
            Err(err) => error = err,
        }
    }

    (recovered, replacements)
}
110
/// Report whether every character of `text` is acceptable in ASS content
///
/// A character is accepted when it is one of:
///
/// * space, tab, line feed or carriage return,
/// * a printable (graphic) ASCII character,
/// * any non-ASCII character that is not a control character.
///
/// All other ASCII characters, including NUL and the remaining control
/// codes, cause the text to be rejected.
///
/// # Arguments
///
/// * `text` - Text content to validate
///
/// # Returns
///
/// `true` if all characters are valid for ASS content; an empty string is
/// trivially valid
#[must_use]
pub fn is_valid_ass_text(text: &str) -> bool {
    text.chars().all(|ch| match ch {
        // Whitespace that is explicitly allowed.
        ' ' | '\t' | '\n' | '\r' => true,
        // Remaining ASCII must be printable.
        _ if ch.is_ascii() => ch.is_ascii_graphic(),
        // Non-ASCII is allowed unless it is a control character.
        _ => !ch.is_control(),
    })
}
135
/// Shorten text to at most `max_bytes` bytes without splitting a character
///
/// When the text already fits it is returned unchanged. Otherwise the cut
/// point is moved backwards from `max_bytes` to the nearest UTF-8 character
/// boundary, so the result may be shorter than `max_bytes` (possibly empty)
/// but is always valid UTF-8.
///
/// # Arguments
///
/// * `text` - Input text to truncate
/// * `max_bytes` - Maximum byte length
///
/// # Returns
///
/// Tuple of (`truncated_text`, `was_truncated`)
///
/// # Examples
///
/// ```rust
/// # use ass_core::utils::utf8::truncate_at_char_boundary;
/// let text = "Hello World";
/// let (truncated, was_truncated) = truncate_at_char_boundary(text, 5);
/// assert_eq!(truncated, "Hello");
/// assert!(was_truncated);
///
/// let text = "Hello 世界";
/// let (truncated, was_truncated) = truncate_at_char_boundary(text, 8);
/// assert_eq!(truncated, "Hello "); // Stops before the Unicode character
/// assert!(was_truncated);
/// ```
#[must_use]
pub fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= text.len() {
        return (text, false);
    }

    // Search downwards from the limit; offset 0 is always a boundary, so the
    // fallback is never reached in practice.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    (&text[..cut], true)
}
178
/// Count occurrences of the Unicode replacement character in text
///
/// The replacement character (�, U+FFFD) usually marks a spot where bytes
/// could not be decoded, so a non-zero count hints at encoding problems or
/// corrupted data in the source.
///
/// # Arguments
///
/// * `text` - Text to analyze
///
/// # Returns
///
/// Number of replacement characters found
#[must_use]
pub fn count_replacement_chars(text: &str) -> usize {
    text.chars()
        .filter(|&ch| ch == char::REPLACEMENT_CHARACTER)
        .count()
}
196
#[cfg(test)]
mod tests {
    use super::*;

    // --- validate_utf8 ---

    /// Mixed ASCII, CJK and emoji text is accepted.
    #[test]
    fn validate_valid_utf8() {
        let bytes = "Hello, 世界! 🎵".as_bytes();
        assert!(validate_utf8(bytes).is_ok());
    }

    /// Bytes that can never appear in UTF-8 are rejected.
    #[test]
    fn validate_invalid_utf8() {
        let outcome = validate_utf8(&[0xFF, 0xFE, 0x80]);
        assert!(outcome.is_err());
    }

    /// A lead byte with no continuation byte is rejected.
    #[test]
    fn validate_incomplete_utf8() {
        let lone_lead_byte = [0xC2];
        assert!(validate_utf8(&lone_lead_byte).is_err());
    }

    // --- recover_utf8 ---

    /// Valid input comes back unchanged with a zero replacement count.
    #[test]
    fn recover_valid_utf8() {
        let (text, count) = recover_utf8("Hello, World!".as_bytes());
        assert_eq!(text, "Hello, World!");
        assert_eq!(count, 0);
    }

    /// A single bad byte becomes a single replacement character.
    #[test]
    fn recover_invalid_utf8() {
        let input = [b'H', b'i', 0xFF, b'!'];
        let (text, count) = recover_utf8(&input);
        assert_eq!(text, "Hi�!");
        assert_eq!(count, 1);
    }

    /// Separate bad bytes are each replaced and counted individually.
    #[test]
    fn recover_multiple_invalid_sequences() {
        let input = [b'A', 0xFF, b'B', 0xFE, b'C'];
        let (text, count) = recover_utf8(&input);
        assert_eq!(text, "A�B�C");
        assert_eq!(count, 2);
    }

    // --- is_valid_ass_text ---

    /// Printable text and common whitespace pass; control characters fail.
    #[test]
    fn valid_ass_text() {
        for accepted in ["Hello World", "Hello\tWorld\n", "Hello 世界"] {
            assert!(is_valid_ass_text(accepted));
        }
        // Null character and another control character.
        for rejected in ["Hello\x00World", "Hello\x1FWorld"] {
            assert!(!is_valid_ass_text(rejected));
        }
    }

    // --- truncate_at_char_boundary ---

    /// ASCII text is cut exactly at the requested length.
    #[test]
    fn truncate_ascii() {
        let (head, cut) = truncate_at_char_boundary("Hello World", 5);
        assert_eq!(head, "Hello");
        assert!(cut);
    }

    /// A limit inside a multi-byte character backs up to before it.
    #[test]
    fn truncate_unicode() {
        let (head, cut) = truncate_at_char_boundary("Hello 世界", 8);
        assert_eq!(head, "Hello ");
        assert!(cut);
    }

    /// Text shorter than the limit is returned untouched.
    #[test]
    fn truncate_no_change() {
        let (head, cut) = truncate_at_char_boundary("Hello", 10);
        assert_eq!(head, "Hello");
        assert!(!cut);
    }

    /// A limit that lands exactly on a character boundary keeps that character.
    #[test]
    fn truncate_at_unicode_boundary() {
        let (head, cut) = truncate_at_char_boundary("世界", 3);
        assert_eq!(head, "世");
        assert!(cut);
    }

    // --- count_replacement_chars ---

    /// Counts zero, one and several replacement characters.
    #[test]
    fn count_replacement_characters() {
        let cases = [
            ("Hello World", 0),
            ("Hello � World", 1),
            ("� Test � Again �", 3),
        ];
        for (text, expected) in cases {
            assert_eq!(count_replacement_chars(text), expected);
        }
    }
}