ass_core/utils/utf8/
encoding.rs

//! Encoding detection utilities for ASS subtitle files
//!
//! Provides functionality for detecting text encodings, analyzing content
//! patterns, and validating encoding assumptions. Focuses on encodings
//! commonly used in ASS subtitle files with confidence scoring.
//!
//! # Examples
//!
//! ```rust
//! use ass_core::utils::utf8::{detect_encoding, EncodingInfo};
//!
//! let text = "[Script Info]\nTitle: Test";
//! let encoding = detect_encoding(text.as_bytes());
//! assert_eq!(encoding.encoding, "UTF-8");
//! assert!(encoding.confidence > 0.8);
//! ```
17
18use super::bom::{detect_bom, BomType};
19use alloc::{string::String, string::ToString};
20use core::str;
21
22/// Detected text encoding information with confidence scoring
23///
24/// Contains the results of encoding detection analysis including
25/// the detected encoding name, confidence level, and BOM information.
26#[derive(Debug, Clone, PartialEq)]
27pub struct EncodingInfo {
28    /// Detected encoding name (e.g., "UTF-8", "Windows-1252")
29    pub encoding: String,
30    /// Confidence level (0.0 to 1.0)
31    pub confidence: f32,
32    /// Whether a BOM was detected
33    pub has_bom: bool,
34    /// BOM type if detected
35    pub bom_type: Option<BomType>,
36    /// Whether the text appears to be valid in this encoding
37    pub is_valid: bool,
38}
39
40impl EncodingInfo {
41    /// Create new encoding info with basic parameters
42    ///
43    /// # Arguments
44    ///
45    /// * `encoding` - Name of the detected encoding
46    /// * `confidence` - Confidence level (0.0 to 1.0)
47    #[must_use]
48    pub const fn new(encoding: String, confidence: f32) -> Self {
49        Self {
50            encoding,
51            confidence,
52            has_bom: false,
53            bom_type: None,
54            is_valid: true,
55        }
56    }
57
58    /// Create encoding info with BOM information
59    ///
60    /// # Arguments
61    ///
62    /// * `encoding` - Name of the detected encoding
63    /// * `confidence` - Confidence level (0.0 to 1.0)
64    /// * `bom_type` - Type of BOM detected
65    #[must_use]
66    pub const fn with_bom(encoding: String, confidence: f32, bom_type: BomType) -> Self {
67        Self {
68            encoding,
69            confidence,
70            has_bom: true,
71            bom_type: Some(bom_type),
72            is_valid: true,
73        }
74    }
75}
76
77/// Detect text encoding with confidence scoring
78///
79/// Analyzes byte content to determine the most likely encoding.
80/// Focuses on encodings commonly used in ASS subtitle files and
81/// provides confidence scoring based on content analysis.
82///
83/// # Arguments
84///
85/// * `bytes` - Byte sequence to analyze
86///
87/// # Returns
88///
89/// `EncodingInfo` with detected encoding and confidence level
90///
91/// # Examples
92///
93/// ```rust
94/// # use ass_core::utils::utf8::detect_encoding;
95/// let text = "[Script Info]\nTitle: Test";
96/// let encoding = detect_encoding(text.as_bytes());
97/// assert_eq!(encoding.encoding, "UTF-8");
98/// assert!(encoding.confidence > 0.8);
99/// ```
100#[must_use]
101pub fn detect_encoding(bytes: &[u8]) -> EncodingInfo {
102    // Check for BOM first - gives us certainty about encoding
103    if let Some((bom_type, _)) = detect_bom(bytes) {
104        return EncodingInfo::with_bom(
105            bom_type.encoding_name().to_string(),
106            1.0, // BOM gives us certainty
107            bom_type,
108        );
109    }
110
111    // Try UTF-8 validation
112    str::from_utf8(bytes).map_or_else(
113        |_| detect_non_utf8_encoding(bytes),
114        |text| {
115            let confidence = if is_likely_ass_content(text) {
116                0.95 // High confidence for ASS-like content
117            } else {
118                0.8 // Still likely UTF-8 but less certain
119            };
120            EncodingInfo::new("UTF-8".to_string(), confidence)
121        },
122    )
123}
124
/// Report whether text carries markers typical of an ASS subtitle script
///
/// Performs a case-sensitive substring search for a fixed set of ASS
/// section headers and field names. A single match anywhere in the text
/// is enough; this is used to raise confidence in encoding detection.
///
/// # Arguments
///
/// * `text` - Text content to analyze
///
/// # Returns
///
/// `true` if at least one known ASS marker occurs in `text`
#[must_use]
pub fn is_likely_ass_content(text: &str) -> bool {
    /// Substrings whose presence marks the text as ASS-like
    const ASS_MARKERS: [&str; 10] = [
        // Section headers
        "[Script Info]",
        "[V4+ Styles]",
        "[Events]",
        "[Fonts]",
        "[Graphics]",
        // Common field names
        "Dialogue:",
        "Comment:",
        "ScriptType:",
        "PlayRes",
        "Style:",
    ];

    ASS_MARKERS.iter().any(|&marker| text.contains(marker))
}
162
163/// Attempt to detect non-UTF-8 encodings commonly used in older ASS files
164///
165/// Provides fallback detection for files that aren't valid UTF-8,
166/// focusing on legacy encodings commonly used in subtitle files.
167///
168/// # Arguments
169///
170/// * `bytes` - Byte sequence that failed UTF-8 validation
171///
172/// # Returns
173///
174/// `EncodingInfo` with best guess for the encoding
175fn detect_non_utf8_encoding(bytes: &[u8]) -> EncodingInfo {
176    let has_extended_ascii = bytes.iter().any(|&b| b >= 0x80);
177
178    if has_extended_ascii {
179        // Common legacy encoding for subtitle files
180        EncodingInfo::new("Windows-1252".to_string(), 0.6)
181    } else {
182        // Pure ASCII is safe to assume
183        EncodingInfo::new("ASCII".to_string(), 0.9)
184    }
185}
186
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::format;

    /// Compare two confidence values within `f32::EPSILON`
    fn approx_eq(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    /// Both constructors populate every field as documented
    #[test]
    fn encoding_info_creation() {
        let plain = EncodingInfo::new("UTF-8".to_string(), 0.95);
        assert_eq!(plain.encoding, "UTF-8");
        assert!(approx_eq(plain.confidence, 0.95));
        assert!(!plain.has_bom);
        assert!(plain.is_valid);

        let marked = EncodingInfo::with_bom("UTF-8".to_string(), 1.0, BomType::Utf8);
        assert!(approx_eq(marked.confidence, 1.0));
        assert!(marked.has_bom);
        assert_eq!(marked.bom_type, Some(BomType::Utf8));
    }

    /// Valid UTF-8 with ASS markers is reported as UTF-8 with high confidence
    #[test]
    fn detect_utf8_encoding() {
        let script = "[Script Info]\nTitle: Test Script";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.9); // ASS patterns raise the score
        assert!(!detected.has_bom);
    }

    /// A leading UTF-8 BOM yields full confidence and records the BOM type
    #[test]
    fn detect_encoding_with_bom() {
        let script = "\u{FEFF}[Script Info]";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(approx_eq(detected.confidence, 1.0));
        assert!(detected.has_bom);
        assert_eq!(detected.bom_type, Some(BomType::Utf8));
    }

    /// Invalid UTF-8 without a BOM falls back to Windows-1252
    #[test]
    fn detect_non_utf8_encoding() {
        let invalid_bytes: &[u8] = b"\x80\x81Hello"; // Invalid UTF-8, no BOM
        let detected = detect_encoding(invalid_bytes);
        assert_eq!(detected.encoding, "Windows-1252");
        assert!(detected.confidence < 1.0);
    }

    /// Pure ASCII is valid UTF-8 and is therefore reported as UTF-8
    #[test]
    fn detect_ascii_encoding() {
        let detected = detect_encoding(b"Hello World");
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.7);
    }

    /// Known ASS markers are recognised; ordinary prose is not
    #[test]
    fn is_likely_ass_content_detection() {
        let ass_like = [
            "[Script Info]\nTitle: Test",
            "[V4+ Styles]\nFormat: Name",
            "Dialogue: 0,0:00:00.00",
            "ScriptType: v4.00+",
        ];
        for sample in ass_like {
            assert!(is_likely_ass_content(sample));
        }

        let plain = ["This is just regular text", "No ASS patterns here"];
        for sample in plain {
            assert!(!is_likely_ass_content(sample));
        }
    }

    /// Derived `PartialEq` distinguishes values by encoding name
    #[test]
    fn encoding_info_equality() {
        let first = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let same = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let other = EncodingInfo::new("ASCII".to_string(), 0.95);

        assert_eq!(first, same);
        assert_ne!(first, other);
    }

    /// Derived `Debug` output names the type and shows the encoding
    #[test]
    fn encoding_info_debug() {
        let info = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let rendered = format!("{info:?}");
        for needle in ["EncodingInfo", "UTF-8"] {
            assert!(rendered.contains(needle));
        }
    }
}