ass_core/utils/errors/
encoding.rs

1//! Text encoding error utilities for ASS-RS
2//!
3//! Provides specialized error creation and validation functions for text
4//! encoding issues including UTF-8 validation, encoding detection, and
5//! character conversion errors. Focuses on providing detailed context.
6
7use super::CoreError;
8use alloc::{format, string::String};
9use core::fmt;
10
/// Build a [`CoreError::Utf8Error`] from a byte position and a message.
///
/// This is a plain constructor: both arguments are stored in the variant
/// unchanged, and no validation or formatting is performed.
///
/// # Arguments
///
/// * `position` - Byte position where the error occurred
/// * `message` - Descriptive error message
///
/// # Returns
///
/// A `CoreError::Utf8Error` carrying `position` and `message`.
///
/// # Examples
///
/// ```rust
/// use ass_core::utils::errors::{utf8_error, CoreError};
///
/// let error = utf8_error(42, "Invalid UTF-8 sequence".to_string());
/// assert!(matches!(error, CoreError::Utf8Error { .. }));
/// ```
#[must_use]
pub const fn utf8_error(position: usize, message: String) -> CoreError {
    CoreError::Utf8Error { position, message }
}
33
34/// Create validation error for text content
35///
36/// Generates a `CoreError::Validation` for content that fails ASS-specific
37/// text validation rules (e.g., contains invalid control characters).
38///
39/// # Arguments
40///
41/// * `message` - Description of the validation failure
42pub fn validation_error<T: fmt::Display>(message: T) -> CoreError {
43    CoreError::Validation(format!("{message}"))
44}
45
46/// Validate UTF-8 with detailed error reporting
47///
48/// Provides more detailed error information than standard UTF-8 validation,
49/// including the exact position and nature of encoding errors.
50///
51/// # Arguments
52///
53/// * `bytes` - Byte sequence to validate
54///
55/// # Returns
56///
57/// `Ok(())` if valid UTF-8, detailed error if invalid
58///
59/// # Errors
60///
61/// Returns an error if the byte slice contains invalid UTF-8 sequences.
62pub fn validate_utf8_detailed(bytes: &[u8]) -> Result<(), CoreError> {
63    match core::str::from_utf8(bytes) {
64        Ok(_) => Ok(()),
65        Err(err) => {
66            let position = err.valid_up_to();
67            let message = err.error_len().map_or_else(
68                || format!("Incomplete UTF-8 sequence at position {position}"),
69                |len| format!("Invalid UTF-8 sequence of {len} bytes at position {position}"),
70            );
71
72            Err(utf8_error(position, message))
73        }
74    }
75}
76
77/// Validate text contains only valid ASS characters
78///
79/// Checks that text contains only characters appropriate for ASS subtitle
80/// content, rejecting problematic control characters and sequences.
81///
82/// # Arguments
83///
84/// * `text` - Text content to validate
85///
86/// # Returns
87///
88/// `Ok(())` if valid, validation error if invalid characters found
89///
90/// # Errors
91///
92/// Returns an error if the text contains invalid characters for ASS format.
93pub fn validate_ass_text_content(text: &str) -> Result<(), CoreError> {
94    for (pos, ch) in text.char_indices() {
95        if !is_valid_ass_char(ch) {
96            return Err(validation_error(format!(
97                "Invalid character '{}' (U+{:04X}) at position {}",
98                ch.escape_default().collect::<String>(),
99                ch as u32,
100                pos
101            )));
102        }
103    }
104    Ok(())
105}
106
/// Decide whether a single character may appear in ASS content
///
/// Accepted characters are:
/// * ASCII graphic characters (visible, non-space ASCII)
/// * space, tab, line feed and carriage return
/// * any non-ASCII character that is not a control character
///
/// Everything else (remaining ASCII control characters including DEL, and
/// non-ASCII control characters) is rejected.
fn is_valid_ass_char(ch: char) -> bool {
    if ch.is_ascii() {
        // Within ASCII only visible glyphs and the four permitted whitespace
        // characters pass; all other ASCII code points are control characters
        ch.is_ascii_graphic() || matches!(ch, ' ' | '\t' | '\n' | '\r')
    } else {
        // Outside ASCII everything is allowed except control characters
        !ch.is_control()
    }
}
123
124/// Validate BOM (Byte Order Mark) handling
125///
126/// Ensures that BOM is properly handled or warns if unexpected BOM found.
127/// ASS files should typically use UTF-8 without BOM for compatibility.
128///
129/// # Arguments
130///
131/// * `bytes` - Input bytes that may contain BOM
132///
133/// # Returns
134///
135/// `Ok(())` if BOM handling is appropriate, warning if issues found
136///
137/// # Errors
138///
139/// Returns an error if UTF-16 BOM is detected or other BOM issues are found
140pub fn validate_bom_handling(bytes: &[u8]) -> Result<(), CoreError> {
141    if bytes.len() >= 3 && bytes[0..3] == [0xEF, 0xBB, 0xBF] {
142        // UTF-8 BOM found - this is acceptable but not ideal
143        return Ok(());
144    }
145
146    if bytes.len() >= 2 && (bytes[0..2] == [0xFF, 0xFE] || bytes[0..2] == [0xFE, 0xFF]) {
147        return Err(validation_error(
148            "UTF-16 BOM detected - ASS files should be UTF-8",
149        ));
150    }
151
152    if bytes.len() >= 4
153        && (bytes[0..4] == [0xFF, 0xFE, 0x00, 0x00] || bytes[0..4] == [0x00, 0x00, 0xFE, 0xFF])
154    {
155        return Err(validation_error(
156            "UTF-32 BOM detected - ASS files should be UTF-8",
157        ));
158    }
159
160    // Check for partial BOM sequences that could indicate encoding issues
161    if bytes.len() >= 2 && bytes[0..2] == [0xEF, 0xBB] {
162        return Err(validation_error(
163            "Partial UTF-8 BOM detected - file may be corrupted or incorrectly encoded",
164        ));
165    }
166
167    if !bytes.is_empty() && bytes[0] == 0xEF && (bytes.len() == 1 || bytes[1] != 0xBB) {
168        return Err(validation_error(
169            "Suspicious byte sequence that could be partial BOM - check file encoding",
170        ));
171    }
172
173    Ok(())
174}
175
/// Unit tests for the encoding error constructors and validators
///
/// Covers error construction, UTF-8 validation (including the reported
/// byte position), ASS character filtering, and BOM detection for UTF-8,
/// UTF-16 and UTF-32 signatures as well as truncated UTF-8 BOMs.
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::string::ToString;

    #[test]
    fn utf8_error_creation() {
        let error = utf8_error(42, "test message".to_string());
        assert!(matches!(error, CoreError::Utf8Error { position: 42, .. }));
    }

    #[test]
    fn validation_error_creation() {
        let error = validation_error("invalid content");
        assert!(matches!(error, CoreError::Validation(_)));
    }

    #[test]
    fn validate_valid_utf8() {
        let text = "Hello, 世界! 🎵";
        assert!(validate_utf8_detailed(text.as_bytes()).is_ok());
    }

    #[test]
    fn validate_invalid_utf8() {
        let invalid_bytes = &[0xFF, 0xFE, 0x80];
        assert!(validate_utf8_detailed(invalid_bytes).is_err());
    }

    #[test]
    fn validate_invalid_utf8_reports_position() {
        // Two valid ASCII bytes followed by a byte that can never occur in UTF-8
        let result = validate_utf8_detailed(&[b'a', b'b', 0xFF]);
        assert!(matches!(
            result,
            Err(CoreError::Utf8Error { position: 2, .. })
        ));
    }

    #[test]
    fn validate_incomplete_utf8_reports_position() {
        // First two bytes of a three-byte sequence, cut off at end of input
        let result = validate_utf8_detailed(&[b'a', 0xE4, 0xB8]);
        assert!(matches!(
            result,
            Err(CoreError::Utf8Error { position: 1, .. })
        ));
    }

    #[test]
    fn validate_empty_inputs() {
        assert!(validate_utf8_detailed(&[]).is_ok());
        assert!(validate_ass_text_content("").is_ok());
        assert!(validate_bom_handling(&[]).is_ok());
    }

    #[test]
    fn validate_ass_text_valid() {
        assert!(validate_ass_text_content("Hello World").is_ok());
        assert!(validate_ass_text_content("Hello\tWorld\n").is_ok());
        assert!(validate_ass_text_content("Hello 世界").is_ok());
    }

    #[test]
    fn validate_ass_text_invalid() {
        assert!(validate_ass_text_content("Hello\x00World").is_err()); // Null character
        assert!(validate_ass_text_content("Hello\x1FWorld").is_err()); // Control character
    }

    #[test]
    fn valid_ass_char_check() {
        assert!(is_valid_ass_char('A'));
        assert!(is_valid_ass_char(' '));
        assert!(is_valid_ass_char('\n'));
        assert!(is_valid_ass_char('世'));
        assert!(!is_valid_ass_char('\x00'));
        assert!(!is_valid_ass_char('\x1F'));
    }

    #[test]
    fn valid_ass_char_rejects_del_and_c1_controls() {
        assert!(!is_valid_ass_char('\x7F')); // DEL
        assert!(!is_valid_ass_char('\u{0085}')); // C1 control (NEL)
    }

    #[test]
    fn bom_validation_utf8() {
        let utf8_bom = &[0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert!(validate_bom_handling(utf8_bom).is_ok());
    }

    #[test]
    fn bom_validation_utf16() {
        let utf16_bom = &[0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert!(validate_bom_handling(utf16_bom).is_err());
    }

    #[test]
    fn bom_validation_utf16_big_endian() {
        let utf16_be_bom = &[0xFE, 0xFF, 0x00, b'H', 0x00, b'i'];
        assert!(validate_bom_handling(utf16_be_bom).is_err());
    }

    #[test]
    fn bom_validation_utf32() {
        let utf32_le_bom = &[0xFF, 0xFE, 0x00, 0x00, b'H', 0x00, 0x00, 0x00];
        let utf32_be_bom = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, b'H'];
        assert!(validate_bom_handling(utf32_le_bom).is_err());
        assert!(validate_bom_handling(utf32_be_bom).is_err());
    }

    #[test]
    fn bom_validation_no_bom() {
        let no_bom = b"Hello World";
        assert!(validate_bom_handling(no_bom).is_ok());
    }

    #[test]
    fn bom_validation_partial_utf8() {
        let partial_bom = &[0xEF, 0xBB, b'H', b'i'];
        assert!(validate_bom_handling(partial_bom).is_err());
    }

    #[test]
    fn bom_validation_single_ef_byte() {
        let single_ef = &[0xEF, b'H', b'i'];
        assert!(validate_bom_handling(single_ef).is_err());
    }

    #[test]
    fn bom_validation_ef_only() {
        let ef_only = &[0xEF];
        assert!(validate_bom_handling(ef_only).is_err());
    }
}