use crate::format_detector::{FormatDetector, MediaFormat};
/// Text encodings that `ContentDetector::detect_encoding` can report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextEncoding {
    /// Displayed as `UTF-8`.
    Utf8,
    /// Displayed as `UTF-16LE`.
    Utf16Le,
    /// Displayed as `UTF-16BE`.
    Utf16Be,
    /// Displayed as `Latin-1`.
    Latin1,
    /// Displayed as `ASCII`.
    Ascii,
}

impl std::fmt::Display for TextEncoding {
    /// Writes the human-readable label of the encoding.
    ///
    /// Width, alignment and precision flags supplied by the caller
    /// (for example `{:<10}`) are honoured.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TextEncoding::Utf8 => "UTF-8",
            TextEncoding::Utf16Le => "UTF-16LE",
            TextEncoding::Utf16Be => "UTF-16BE",
            TextEncoding::Latin1 => "Latin-1",
            TextEncoding::Ascii => "ASCII",
        };
        // `pad` applies the formatter's width/fill/alignment; `write!` with a
        // literal would silently drop them.
        f.pad(label)
    }
}
/// Coarse content category reported by `ContentDetector::detect_media_type`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MediaType {
    /// The detected format is a video format.
    Video,
    /// The detected format is an audio format.
    Audio,
    /// The detected format is an image format.
    Image,
    /// The detected format is an archive or compression container.
    Archive,
    /// A text-based format, or unrecognised data that does not look binary.
    Text,
    /// A recognised format with no more specific category, or unrecognised
    /// data that looks binary.
    Binary,
    /// No classification was possible (reported for empty input).
    Unknown,
}

impl std::fmt::Display for MediaType {
    /// Writes the variant name as a human-readable label.
    ///
    /// Width, alignment and precision flags supplied by the caller
    /// (for example `{:<10}`) are honoured.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            MediaType::Video => "Video",
            MediaType::Audio => "Audio",
            MediaType::Image => "Image",
            MediaType::Archive => "Archive",
            MediaType::Text => "Text",
            MediaType::Binary => "Binary",
            MediaType::Unknown => "Unknown",
        };
        // `pad` applies the formatter's width/fill/alignment; `write!` with a
        // literal would silently drop them.
        f.pad(label)
    }
}
/// Maximum number of leading bytes that `ContentDetector::is_binary` inspects.
const BINARY_PROBE_BYTES: usize = 8192;
/// Share of NUL bytes in the probed window that must be strictly exceeded
/// for the data to be considered binary.
const NULL_BYTE_THRESHOLD: f64 = 0.01;
/// Share of non-printable control bytes in the probed window that must be
/// strictly exceeded for the data to be considered binary.
const NON_PRINTABLE_THRESHOLD: f64 = 0.30;
/// Stateless namespace for content-sniffing helpers: text-encoding detection,
/// a binary/text heuristic, and coarse media-type classification.
pub struct ContentDetector;
impl ContentDetector {
/// Guesses the text encoding of `data` from a leading byte-order mark or,
/// failing that, from the bytes themselves.
///
/// Rules are applied in order and the first match wins:
///
/// 1. Empty input is reported as [`TextEncoding::Ascii`].
/// 2. A UTF-8 BOM (`EF BB BF`) yields [`TextEncoding::Utf8`].
/// 3. A UTF-16 BOM yields [`TextEncoding::Utf16Le`] (`FF FE`) or
///    [`TextEncoding::Utf16Be`] (`FE FF`).
/// 4. Input made only of printable ASCII (`0x20..=0x7E`), tab, LF and CR
///    yields [`TextEncoding::Ascii`].
/// 5. Any other valid UTF-8 yields [`TextEncoding::Utf8`]; this includes
///    7-bit input that contains other control bytes such as NUL.
/// 6. Everything else yields [`TextEncoding::Latin1`].
#[must_use]
pub fn detect_encoding(data: &[u8]) -> TextEncoding {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
    const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];

    if data.is_empty() {
        return TextEncoding::Ascii;
    }

    // `starts_with` also covers inputs shorter than the BOM.
    if data.starts_with(UTF8_BOM) {
        return TextEncoding::Utf8;
    }
    if data.starts_with(UTF16_LE_BOM) {
        return TextEncoding::Utf16Le;
    }
    if data.starts_with(UTF16_BE_BOM) {
        return TextEncoding::Utf16Be;
    }

    let is_plain_text_byte = |b: u8| matches!(b, 0x20..=0x7E | 0x09 | 0x0A | 0x0D);
    if data.iter().copied().all(is_plain_text_byte) {
        return TextEncoding::Ascii;
    }

    if std::str::from_utf8(data).is_ok() {
        TextEncoding::Utf8
    } else {
        // Pure 7-bit input is always valid UTF-8, so a validation failure
        // implies at least one byte >= 0x80; no separate scan is needed.
        TextEncoding::Latin1
    }
}
/// Heuristically decides whether `data` looks like binary content.
///
/// Only the first `BINARY_PROBE_BYTES` bytes are examined. The data is
/// considered binary when the share of NUL bytes exceeds
/// `NULL_BYTE_THRESHOLD`, or when the share of control bytes
/// (`0x00..=0x07` and `0x0E..=0x1F`) exceeds `NON_PRINTABLE_THRESHOLD`.
/// Empty input is never binary.
#[must_use]
pub fn is_binary(data: &[u8]) -> bool {
    let window = &data[..data.len().min(BINARY_PROBE_BYTES)];
    if window.is_empty() {
        return false;
    }

    let nul_bytes = window.iter().filter(|&&byte| byte == 0x00).count();
    // Backspace, tab, LF, VT, FF and CR (0x08..=0x0D) are deliberately not counted.
    let control_bytes = window
        .iter()
        .filter(|&&byte| matches!(byte, 0x00..=0x07 | 0x0E..=0x1F))
        .count();

    let window_len = window.len() as f64;
    let share_exceeds = |count: usize, limit: f64| count as f64 / window_len > limit;

    share_exceeds(nul_bytes, NULL_BYTE_THRESHOLD)
        || share_exceeds(control_bytes, NON_PRINTABLE_THRESHOLD)
}
/// Classifies `data` into a coarse [`MediaType`].
///
/// Empty input yields [`MediaType::Unknown`]. Otherwise the format reported
/// by `FormatDetector::detect` is mapped through `media_format_to_type`.
#[must_use]
pub fn detect_media_type(data: &[u8]) -> MediaType {
    if data.is_empty() {
        MediaType::Unknown
    } else {
        let probe = FormatDetector::detect(data);
        Self::media_format_to_type(probe.format, data)
    }
}
/// Maps a detected `format` to its coarse [`MediaType`].
///
/// The video, audio and image predicates are checked first, in that order.
/// Archive/compression formats map to `Archive`, and subtitle and SVG formats
/// map to `Text`. For `MediaFormat::Unknown`, `data` is run through
/// `is_binary` to choose between `Binary` and `Text`. Every other format
/// is reported as `Binary`.
fn media_format_to_type(format: MediaFormat, data: &[u8]) -> MediaType {
    if format.is_video() {
        MediaType::Video
    } else if format.is_audio() {
        MediaType::Audio
    } else if format.is_image() {
        MediaType::Image
    } else {
        match format {
            MediaFormat::Zip
            | MediaFormat::Tar
            | MediaFormat::Gz
            | MediaFormat::Bz2
            | MediaFormat::Xz
            | MediaFormat::Zstd => MediaType::Archive,
            MediaFormat::Srt | MediaFormat::Vtt | MediaFormat::Ass | MediaFormat::Svg => {
                MediaType::Text
            }
            // No signature matched: fall back to the byte-level heuristic.
            MediaFormat::Unknown if Self::is_binary(data) => MediaType::Binary,
            MediaFormat::Unknown => MediaType::Text,
            _ => MediaType::Binary,
        }
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for `ContentDetector::detect_encoding`.
    fn encoding_of(bytes: &[u8]) -> TextEncoding {
        ContentDetector::detect_encoding(bytes)
    }

    /// Shorthand for `ContentDetector::detect_media_type`.
    fn media_type_of(bytes: &[u8]) -> MediaType {
        ContentDetector::detect_media_type(bytes)
    }

    // ---- detect_encoding ----

    #[test]
    fn test_encoding_utf8_bom() {
        assert_eq!(encoding_of(b"\xEF\xBB\xBFHello, world!"), TextEncoding::Utf8);
    }

    #[test]
    fn test_encoding_utf8_no_bom() {
        let text = "Hello, world! ✓ café";
        assert_eq!(encoding_of(text.as_bytes()), TextEncoding::Utf8);
    }

    #[test]
    fn test_encoding_utf16_le_bom() {
        // BOM followed by "He" in little-endian code units.
        assert_eq!(
            encoding_of(b"\xFF\xFE\x48\x00\x65\x00"),
            TextEncoding::Utf16Le
        );
    }

    #[test]
    fn test_encoding_utf16_be_bom() {
        // BOM followed by "He" in big-endian code units.
        assert_eq!(
            encoding_of(b"\xFE\xFF\x00\x48\x00\x65"),
            TextEncoding::Utf16Be
        );
    }

    #[test]
    fn test_encoding_ascii_printable() {
        assert_eq!(encoding_of(b"Hello World 123"), TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_ascii_with_crlf() {
        assert_eq!(encoding_of(b"line1\r\nline2\r\n"), TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_ascii_with_tab() {
        assert_eq!(encoding_of(b"col1\tcol2\tcol3"), TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_empty_returns_ascii() {
        assert_eq!(encoding_of(&[]), TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_latin1_extended_bytes() {
        // A lone 0xE9 is not valid UTF-8.
        assert_eq!(encoding_of(b"Caf\xe9 au lait"), TextEncoding::Latin1);
    }

    #[test]
    fn test_encoding_latin1_high_bytes() {
        assert_eq!(encoding_of(&[0x80u8, 0x9F, 0xA0, 0xFF]), TextEncoding::Latin1);
    }

    // ---- is_binary ----

    #[test]
    fn test_is_binary_empty() {
        let is_bin = ContentDetector::is_binary(&[]);
        assert!(!is_bin);
    }

    #[test]
    fn test_is_binary_plain_text() {
        let is_bin =
            ContentDetector::is_binary(b"This is plain ASCII text.\nNo binary bytes here.\n");
        assert!(!is_bin);
    }

    #[test]
    fn test_is_binary_null_bytes() {
        // Five NUL bytes in a 100-byte buffer.
        let mut data = vec![0x41u8; 100];
        for idx in [10, 20, 30, 40, 50] {
            data[idx] = 0x00;
        }
        assert!(ContentDetector::is_binary(&data));
    }

    #[test]
    fn test_is_binary_jpeg_magic() {
        let jpeg_header = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        assert!(ContentDetector::is_binary(&jpeg_header));
    }

    #[test]
    fn test_is_binary_utf8_text() {
        let sentence = "The quick brown fox jumps over the lazy dog. 1234567890!";
        assert!(!ContentDetector::is_binary(sentence.as_bytes()));
    }

    // ---- detect_media_type ----

    #[test]
    fn test_media_type_empty_returns_unknown() {
        assert_eq!(media_type_of(&[]), MediaType::Unknown);
    }

    #[test]
    fn test_media_type_jpeg_is_image() {
        let jpeg = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
        assert_eq!(media_type_of(&jpeg), MediaType::Image);
    }

    #[test]
    fn test_media_type_png_is_image() {
        let png = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        assert_eq!(media_type_of(&png), MediaType::Image);
    }

    #[test]
    fn test_media_type_flac_is_audio() {
        assert_eq!(media_type_of(b"fLaC\x00\x00\x00\x22"), MediaType::Audio);
    }

    #[test]
    fn test_media_type_wav_is_audio() {
        assert_eq!(media_type_of(b"RIFF\x00\x00\x00\x00WAVE"), MediaType::Audio);
    }

    #[test]
    fn test_media_type_zip_is_archive() {
        let zip = [0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
        assert_eq!(media_type_of(&zip), MediaType::Archive);
    }

    #[test]
    fn test_media_type_gz_is_archive() {
        let gz = [0x1F, 0x8B, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00];
        assert_eq!(media_type_of(&gz), MediaType::Archive);
    }

    #[test]
    fn test_media_type_unknown_binary_is_binary() {
        let blob = [
            0x00u8, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x00,
            0x00, 0x00,
        ];
        assert_eq!(media_type_of(&blob), MediaType::Binary);
    }

    #[test]
    fn test_media_type_unknown_text_is_text() {
        let mt = media_type_of(b"1\n00:00:01,000 --> 00:00:03,000\nHello world\n\n");
        assert!(
            matches!(mt, MediaType::Text | MediaType::Unknown),
            "expected Text or Unknown, got {mt:?}"
        );
    }

    // ---- Display ----

    #[test]
    fn test_media_type_display() {
        let expected = [
            (MediaType::Video, "Video"),
            (MediaType::Audio, "Audio"),
            (MediaType::Image, "Image"),
            (MediaType::Archive, "Archive"),
            (MediaType::Text, "Text"),
            (MediaType::Binary, "Binary"),
            (MediaType::Unknown, "Unknown"),
        ];
        for (variant, label) in expected {
            assert_eq!(variant.to_string(), label);
        }
    }

    #[test]
    fn test_text_encoding_display() {
        let expected = [
            (TextEncoding::Utf8, "UTF-8"),
            (TextEncoding::Utf16Le, "UTF-16LE"),
            (TextEncoding::Utf16Be, "UTF-16BE"),
            (TextEncoding::Latin1, "Latin-1"),
            (TextEncoding::Ascii, "ASCII"),
        ];
        for (variant, label) in expected {
            assert_eq!(variant.to_string(), label);
        }
    }
}