ass_core/utils/utf8/encoding/detection.rs
1//! Encoding detection routines for ASS subtitle content.
2//!
3//! Analyzes raw byte content to determine the most likely text encoding,
4//! combining BOM detection, UTF-8 validation, and ASS-specific content
5//! heuristics with confidence scoring.
6
7use super::super::bom::detect_bom;
8use super::EncodingInfo;
9use alloc::string::ToString;
10use core::str;
11
12/// Detect text encoding with confidence scoring
13///
14/// Analyzes byte content to determine the most likely encoding.
15/// Focuses on encodings commonly used in ASS subtitle files and
16/// provides confidence scoring based on content analysis.
17///
18/// # Arguments
19///
20/// * `bytes` - Byte sequence to analyze
21///
22/// # Returns
23///
24/// `EncodingInfo` with detected encoding and confidence level
25///
26/// # Examples
27///
28/// ```rust
29/// # use ass_core::utils::utf8::detect_encoding;
30/// let text = "[Script Info]\nTitle: Test";
31/// let encoding = detect_encoding(text.as_bytes());
32/// assert_eq!(encoding.encoding, "UTF-8");
33/// assert!(encoding.confidence > 0.8);
34/// ```
35#[must_use]
36pub fn detect_encoding(bytes: &[u8]) -> EncodingInfo {
37 // Check for BOM first - gives us certainty about encoding
38 if let Some((bom_type, _)) = detect_bom(bytes) {
39 return EncodingInfo::with_bom(
40 bom_type.encoding_name().to_string(),
41 1.0, // BOM gives us certainty
42 bom_type,
43 );
44 }
45
46 // Try UTF-8 validation
47 str::from_utf8(bytes).map_or_else(
48 |_| detect_non_utf8_encoding(bytes),
49 |text| {
50 let confidence = if is_likely_ass_content(text) {
51 0.95 // High confidence for ASS-like content
52 } else {
53 0.8 // Still likely UTF-8 but less certain
54 };
55 EncodingInfo::new("UTF-8".to_string(), confidence)
56 },
57 )
58}
59
/// Report whether text carries markers typical of an ASS subtitle script
///
/// The check is a plain, case-sensitive substring search: the text is
/// considered ASS-like as soon as it contains any one of the known
/// section headers or field-name markers listed in the function body.
///
/// # Arguments
///
/// * `text` - Decoded text to scan
///
/// # Returns
///
/// `true` if at least one marker occurs anywhere in `text`, `false`
/// otherwise (including for empty input)
#[must_use]
pub fn is_likely_ass_content(text: &str) -> bool {
    // Section headers that open the major parts of a script.
    const SECTION_HEADERS: [&str; 5] = [
        "[Script Info]",
        "[V4+ Styles]",
        "[Events]",
        "[Fonts]",
        "[Graphics]",
    ];

    // Field-name markers found inside those sections.
    const FIELD_MARKERS: [&str; 5] = [
        "Dialogue:",
        "Comment:",
        "ScriptType:",
        "PlayRes",
        "Style:",
    ];

    SECTION_HEADERS
        .iter()
        .chain(FIELD_MARKERS.iter())
        .any(|&marker| text.contains(marker))
}
97
98/// Attempt to detect non-UTF-8 encodings commonly used in older ASS files
99///
100/// Provides fallback detection for files that aren't valid UTF-8,
101/// focusing on legacy encodings commonly used in subtitle files.
102///
103/// # Arguments
104///
105/// * `bytes` - Byte sequence that failed UTF-8 validation
106///
107/// # Returns
108///
109/// `EncodingInfo` with best guess for the encoding
110fn detect_non_utf8_encoding(bytes: &[u8]) -> EncodingInfo {
111 let has_extended_ascii = bytes.iter().any(|&b| b >= 0x80);
112
113 if has_extended_ascii {
114 // Common legacy encoding for subtitle files
115 EncodingInfo::new("Windows-1252".to_string(), 0.6)
116 } else {
117 // Pure ASCII is safe to assume
118 EncodingInfo::new("ASCII".to_string(), 0.9)
119 }
120}