use super::bom::{detect_bom, BomType};
use alloc::{string::String, string::ToString};
use core::str;
/// Result of an encoding-detection pass over a byte buffer.
///
/// Values are produced by `detect_encoding` or built directly through
/// `EncodingInfo::new` / `EncodingInfo::with_bom`.
#[derive(Debug, Clone, PartialEq)]
pub struct EncodingInfo {
/// Name of the detected encoding. This file produces `"UTF-8"`,
/// `"Windows-1252"`, `"ASCII"`, or whatever `BomType::encoding_name` returns.
pub encoding: String,
/// How certain the detection is. `detect_encoding` uses `1.0` for a BOM
/// match and lower values for heuristic guesses.
/// NOTE(review): presumably meant to stay within `0.0..=1.0`; nothing here
/// enforces that - confirm.
pub confidence: f32,
/// `true` when the value was built via `with_bom`, `false` via `new`.
pub has_bom: bool,
/// The byte-order mark that was found; `None` when built via `new`.
pub bom_type: Option<BomType>,
/// Both constructors in this file set this to `true`.
/// NOTE(review): the intended meaning (e.g. "bytes decode cleanly") is not
/// established by this file - confirm against callers.
pub is_valid: bool,
}
impl EncodingInfo {
    /// Builds a detection result for input that carries no byte-order mark.
    ///
    /// The returned value has `has_bom == false`, `bom_type == None` and
    /// `is_valid == true`; `encoding` and `confidence` are stored as given.
    #[must_use]
    pub const fn new(encoding: String, confidence: f32) -> Self {
        Self {
            is_valid: true,
            has_bom: false,
            bom_type: None,
            confidence,
            encoding,
        }
    }

    /// Builds a detection result for input that starts with a byte-order mark.
    ///
    /// The returned value has `has_bom == true`, `bom_type == Some(bom_type)`
    /// and `is_valid == true`; `encoding` and `confidence` are stored as given.
    #[must_use]
    pub const fn with_bom(encoding: String, confidence: f32, bom_type: BomType) -> Self {
        Self {
            is_valid: true,
            has_bom: true,
            bom_type: Some(bom_type),
            confidence,
            encoding,
        }
    }
}
/// Guesses the text encoding of `bytes`.
///
/// Checks are applied in this order:
///
/// 1. If `detect_bom` recognises a byte-order mark, the BOM's own encoding
///    name is reported with confidence `1.0`.
/// 2. If the input is valid UTF-8 it is reported as `"UTF-8"`, with
///    confidence `0.95` when `is_likely_ass_content` matches and `0.8`
///    otherwise.
/// 3. Anything else is handed to the non-UTF-8 fallback heuristic.
#[must_use]
pub fn detect_encoding(bytes: &[u8]) -> EncodingInfo {
    // A BOM is treated as authoritative and short-circuits every heuristic.
    if let Some((bom_type, _)) = detect_bom(bytes) {
        let name = bom_type.encoding_name().to_string();
        return EncodingInfo::with_bom(name, 1.0, bom_type);
    }

    match str::from_utf8(bytes) {
        Ok(text) => {
            // Recognisable ASS markers raise the confidence of the UTF-8 guess.
            let looks_like_ass = is_likely_ass_content(text);
            let confidence = if looks_like_ass { 0.95 } else { 0.8 };
            EncodingInfo::new("UTF-8".to_string(), confidence)
        }
        Err(_) => detect_non_utf8_encoding(bytes),
    }
}
/// Reports whether `text` contains any marker typical of an ASS script.
///
/// The check is a plain, case-sensitive substring search: it returns `true`
/// as soon as one section header or one line keyword from the tables below
/// occurs anywhere in `text`, and `false` when none does.
#[must_use]
pub fn is_likely_ass_content(text: &str) -> bool {
    /// Section headers that are searched for.
    const SECTION_HEADERS: [&str; 5] = [
        "[Script Info]",
        "[V4+ Styles]",
        "[Events]",
        "[Fonts]",
        "[Graphics]",
    ];
    /// Field and line keywords that are searched for.
    const LINE_KEYWORDS: [&str; 5] = [
        "Dialogue:",
        "Comment:",
        "ScriptType:",
        "PlayRes",
        "Style:",
    ];

    SECTION_HEADERS
        .iter()
        .chain(LINE_KEYWORDS.iter())
        .any(|&marker| text.contains(marker))
}
/// Fallback guess for input that is not handled as UTF-8.
///
/// Input consisting only of bytes below `0x80` (including empty input) is
/// reported as `"ASCII"` with confidence `0.9`; input containing any byte at
/// or above `0x80` is reported as `"Windows-1252"` with confidence `0.6`.
///
/// `detect_encoding` only calls this after UTF-8 validation has failed, and
/// ASCII-only input always validates, so from that call site only the
/// `"Windows-1252"` arm is taken.
fn detect_non_utf8_encoding(bytes: &[u8]) -> EncodingInfo {
    let (name, confidence) = if bytes.is_ascii() {
        ("ASCII", 0.9)
    } else {
        ("Windows-1252", 0.6)
    };
    EncodingInfo::new(name.to_string(), confidence)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::format;

    /// Returns `true` when two confidence values differ by less than `f32::EPSILON`.
    fn approx_eq(left: f32, right: f32) -> bool {
        (left - right).abs() < f32::EPSILON
    }

    /// Both constructors store their arguments and set the BOM fields accordingly.
    #[test]
    fn encoding_info_creation() {
        let plain = EncodingInfo::new("UTF-8".to_string(), 0.95);
        assert_eq!(plain.encoding, "UTF-8");
        assert!(approx_eq(plain.confidence, 0.95));
        assert!(!plain.has_bom);
        assert!(plain.is_valid);

        let marked = EncodingInfo::with_bom("UTF-8".to_string(), 1.0, BomType::Utf8);
        assert!(approx_eq(marked.confidence, 1.0));
        assert!(marked.has_bom);
        assert_eq!(marked.bom_type, Some(BomType::Utf8));
    }

    /// Valid UTF-8 with an ASS section header yields a high-confidence UTF-8 result.
    #[test]
    fn detect_utf8_encoding() {
        let script = "[Script Info]\nTitle: Test Script";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.9);
        assert!(!detected.has_bom);
    }

    /// A leading UTF-8 BOM is reported with full confidence.
    #[test]
    fn detect_encoding_with_bom() {
        let script = "\u{FEFF}[Script Info]";
        let detected = detect_encoding(script.as_bytes());
        assert_eq!(detected.encoding, "UTF-8");
        assert!(approx_eq(detected.confidence, 1.0));
        assert!(detected.has_bom);
        assert_eq!(detected.bom_type, Some(BomType::Utf8));
    }

    /// Bytes that are not valid UTF-8 fall back to the Windows-1252 guess.
    #[test]
    fn detect_non_utf8_encoding() {
        // 0x80 and 0x81 are continuation bytes with no lead byte: invalid UTF-8.
        let invalid_bytes = &[0x80, 0x81, b'H', b'e', b'l', b'l', b'o'];
        let detected = detect_encoding(invalid_bytes);
        assert_eq!(detected.encoding, "Windows-1252");
        assert!(detected.confidence < 1.0);
    }

    /// Pure ASCII is valid UTF-8, so it is reported as UTF-8.
    #[test]
    fn detect_ascii_encoding() {
        let ascii_bytes = b"Hello World";
        let detected = detect_encoding(ascii_bytes);
        assert_eq!(detected.encoding, "UTF-8");
        assert!(detected.confidence > 0.7);
    }

    /// Known markers are recognised; ordinary prose is not.
    #[test]
    fn is_likely_ass_content_detection() {
        let matching = [
            "[Script Info]\nTitle: Test",
            "[V4+ Styles]\nFormat: Name",
            "Dialogue: 0,0:00:00.00",
            "ScriptType: v4.00+",
        ];
        for sample in matching {
            assert!(is_likely_ass_content(sample));
        }

        let non_matching = ["This is just regular text", "No ASS patterns here"];
        for sample in non_matching {
            assert!(!is_likely_ass_content(sample));
        }
    }

    /// Equality compares field by field, so a different encoding name breaks it.
    #[test]
    fn encoding_info_equality() {
        let first = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let same = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let other = EncodingInfo::new("ASCII".to_string(), 0.95);
        assert_eq!(first, same);
        assert_ne!(first, other);
    }

    /// The derived `Debug` output names the type and includes the encoding.
    #[test]
    fn encoding_info_debug() {
        let info = EncodingInfo::new("UTF-8".to_string(), 0.95);
        let rendered = format!("{info:?}");
        assert!(rendered.contains("EncodingInfo"));
        assert!(rendered.contains("UTF-8"));
    }
}