ass_core/utils/utf8/
encoding.rs1use super::bom::{detect_bom, BomType};
19use alloc::{string::String, string::ToString};
20use core::str;
21
22#[derive(Debug, Clone, PartialEq)]
27pub struct EncodingInfo {
28 pub encoding: String,
30 pub confidence: f32,
32 pub has_bom: bool,
34 pub bom_type: Option<BomType>,
36 pub is_valid: bool,
38}
39
40impl EncodingInfo {
41 #[must_use]
48 pub const fn new(encoding: String, confidence: f32) -> Self {
49 Self {
50 encoding,
51 confidence,
52 has_bom: false,
53 bom_type: None,
54 is_valid: true,
55 }
56 }
57
58 #[must_use]
66 pub const fn with_bom(encoding: String, confidence: f32, bom_type: BomType) -> Self {
67 Self {
68 encoding,
69 confidence,
70 has_bom: true,
71 bom_type: Some(bom_type),
72 is_valid: true,
73 }
74 }
75}
76
77#[must_use]
101pub fn detect_encoding(bytes: &[u8]) -> EncodingInfo {
102 if let Some((bom_type, _)) = detect_bom(bytes) {
104 return EncodingInfo::with_bom(
105 bom_type.encoding_name().to_string(),
106 1.0, bom_type,
108 );
109 }
110
111 str::from_utf8(bytes).map_or_else(
113 |_| detect_non_utf8_encoding(bytes),
114 |text| {
115 let confidence = if is_likely_ass_content(text) {
116 0.95 } else {
118 0.8 };
120 EncodingInfo::new("UTF-8".to_string(), confidence)
121 },
122 )
123}
124
/// Returns `true` when `text` contains at least one marker typical of an ASS
/// subtitle script.
///
/// The markers are a handful of section headers (such as `[Script Info]`) and
/// line or field prefixes (such as `Dialogue:`). Matching is a plain,
/// case-sensitive substring search anywhere in `text`; empty input yields
/// `false`.
#[must_use]
pub fn is_likely_ass_content(text: &str) -> bool {
    // Section headers first, then line/field prefixes.
    const MARKERS: [&str; 10] = [
        "[Script Info]",
        "[V4+ Styles]",
        "[Events]",
        "[Fonts]",
        "[Graphics]",
        "Dialogue:",
        "Comment:",
        "ScriptType:",
        "PlayRes",
        "Style:",
    ];

    MARKERS.iter().any(|&marker| text.contains(marker))
}
162
163fn detect_non_utf8_encoding(bytes: &[u8]) -> EncodingInfo {
176 let has_extended_ascii = bytes.iter().any(|&b| b >= 0x80);
177
178 if has_extended_ascii {
179 EncodingInfo::new("Windows-1252".to_string(), 0.6)
181 } else {
182 EncodingInfo::new("ASCII".to_string(), 0.9)
184 }
185}
186
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::format;

    /// Tolerant float comparison, so the tests never use `==` on `f32`.
    fn close_to(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    /// Both constructors fill in the fields they are responsible for.
    #[test]
    fn encoding_info_creation() {
        let plain = EncodingInfo::new("UTF-8".to_string(), 0.95);
        assert_eq!(plain.encoding, "UTF-8");
        assert!(close_to(plain.confidence, 0.95));
        assert!(!plain.has_bom);
        assert!(plain.is_valid);

        let marked = EncodingInfo::with_bom("UTF-8".to_string(), 1.0, BomType::Utf8);
        assert!(close_to(marked.confidence, 1.0));
        assert!(marked.has_bom);
        assert_eq!(marked.bom_type, Some(BomType::Utf8));
    }

    /// Valid UTF-8 that looks like a script gets the high-confidence result.
    #[test]
    fn detect_utf8_encoding() {
        let script = "[Script Info]\nTitle: Test Script";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.9);
        assert!(!detected.has_bom);
    }

    /// A leading byte-order mark decides the outcome with full confidence.
    #[test]
    fn detect_encoding_with_bom() {
        let script = "\u{FEFF}[Script Info]";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(close_to(detected.confidence, 1.0));
        assert!(detected.has_bom);
        assert_eq!(detected.bom_type, Some(BomType::Utf8));
    }

    /// Bytes that are not valid UTF-8 fall back to the Windows-1252 guess.
    #[test]
    fn detect_non_utf8_encoding() {
        // 0x80 and 0x81 are continuation bytes with no lead byte.
        let raw: [u8; 7] = [0x80, 0x81, b'H', b'e', b'l', b'l', b'o'];
        let detected = detect_encoding(&raw);
        assert_eq!(detected.encoding, "Windows-1252");
        assert!(detected.confidence < 1.0);
    }

    /// Plain ASCII is valid UTF-8 and is reported as such.
    #[test]
    fn detect_ascii_encoding() {
        let detected = detect_encoding(b"Hello World");
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.7);
    }

    /// Marker search accepts script fragments and rejects ordinary prose.
    #[test]
    fn is_likely_ass_content_detection() {
        assert!(is_likely_ass_content("[Script Info]\nTitle: Test"));
        assert!(is_likely_ass_content("[V4+ Styles]\nFormat: Name"));
        assert!(is_likely_ass_content("Dialogue: 0,0:00:00.00"));
        assert!(is_likely_ass_content("ScriptType: v4.00+"));
        assert!(!is_likely_ass_content("This is just regular text"));
        assert!(!is_likely_ass_content("No ASS patterns here"));
    }

    /// `PartialEq` compares field by field.
    #[test]
    fn encoding_info_equality() {
        let first = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let same = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let other = EncodingInfo::new("ASCII".to_string(), 0.95);

        assert_eq!(first, same);
        assert_ne!(first, other);
    }

    /// The derived `Debug` output names the type and shows the encoding.
    #[test]
    fn encoding_info_debug() {
        let info = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let rendered = format!("{info:?}");
        assert!(rendered.contains("EncodingInfo"));
        assert!(rendered.contains("UTF-8"));
    }
}