#![cfg_attr(not(feature = "std"), no_std)]
extern crate alloc;
use alloc::string::{String, ToString};
use alloc::vec;
use alloc::vec::Vec;
use core::fmt;
#[cfg(feature = "std")]
use std::sync::atomic::{AtomicU64, Ordering};
#[cfg(not(feature = "std"))]
use core::sync::atomic::{AtomicU64, Ordering};
/// Concrete content type detected from magic bytes, file extension,
/// or entropy analysis.
///
/// Variants are grouped by media family; `Empty`, `Random`
/// (high-entropy / encrypted-looking data), and `Unknown` are synthetic
/// results rather than real file formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(missing_docs)]
pub enum ContentType {
    // Images.
    Jpeg,
    Png,
    Gif,
    WebP,
    Bmp,
    Tiff,
    Ico,
    Avif,
    Heic,
    // Video containers.
    Mp4,
    Mkv,
    Avi,
    WebM,
    Mov,
    Flv,
    // Audio.
    Mp3,
    Flac,
    Wav,
    Ogg,
    Aac,
    M4a,
    // Archives / compressed streams.
    Zip,
    Gzip,
    Zstd,
    Xz,
    Bzip2,
    Lz4,
    SevenZip,
    Rar,
    Tar,
    // Executable formats.
    Elf,
    Pe,
    MachO,
    Wasm,
    // AI model formats.
    PyTorch,
    SafeTensors,
    Gguf,
    Ggml,
    Onnx,
    TensorFlowSaved,
    // Documents.
    Pdf,
    Docx,
    Xlsx,
    Pptx,
    Odt,
    // Structured data.
    Sqlite,
    Parquet,
    Arrow,
    Protobuf,
    // Text.
    PlainText,
    Json,
    Xml,
    Html,
    Css,
    JavaScript,
    SourceCode,
    // Synthetic results.
    Empty,
    Random,
    Unknown,
}
impl ContentType {
    /// Maps this concrete type to the broad [`ContentCategory`] that drives
    /// the compression decision in [`decide_compression`].
    pub fn category(&self) -> ContentCategory {
        match self {
            ContentType::Jpeg
            | ContentType::Png
            | ContentType::Gif
            | ContentType::WebP
            | ContentType::Avif
            | ContentType::Heic => ContentCategory::CompressedImage,
            ContentType::Bmp | ContentType::Tiff | ContentType::Ico => {
                ContentCategory::UncompressedImage
            }
            ContentType::Mp4
            | ContentType::Mkv
            | ContentType::Avi
            | ContentType::WebM
            | ContentType::Mov
            | ContentType::Flv => ContentCategory::CompressedVideo,
            ContentType::Mp3 | ContentType::Aac | ContentType::M4a | ContentType::Ogg => {
                ContentCategory::CompressedAudio
            }
            // FLAC is losslessly compressed, but is grouped with WAV here.
            ContentType::Wav | ContentType::Flac => ContentCategory::UncompressedAudio,
            ContentType::Zip
            | ContentType::Gzip
            | ContentType::Zstd
            | ContentType::Xz
            | ContentType::Bzip2
            | ContentType::Lz4
            | ContentType::SevenZip
            | ContentType::Rar => ContentCategory::Archive,
            // TAR is a plain container without compression of its own.
            ContentType::Tar => ContentCategory::UncompressedArchive,
            ContentType::Elf | ContentType::Pe | ContentType::MachO | ContentType::Wasm => {
                ContentCategory::Executable
            }
            ContentType::PyTorch
            | ContentType::SafeTensors
            | ContentType::Gguf
            | ContentType::Ggml
            | ContentType::Onnx
            | ContentType::TensorFlowSaved => ContentCategory::AiModel,
            ContentType::Pdf => ContentCategory::Document,
            // OOXML/ODF files are ZIP packages, i.e. already deflated.
            ContentType::Docx | ContentType::Xlsx | ContentType::Pptx | ContentType::Odt => {
                ContentCategory::CompressedDocument
            }
            ContentType::Sqlite
            | ContentType::Parquet
            | ContentType::Arrow
            | ContentType::Protobuf => ContentCategory::StructuredData,
            ContentType::PlainText
            | ContentType::Json
            | ContentType::Xml
            | ContentType::Html
            | ContentType::Css
            | ContentType::JavaScript
            | ContentType::SourceCode => ContentCategory::Text,
            ContentType::Empty => ContentCategory::Empty,
            ContentType::Random => ContentCategory::Random,
            ContentType::Unknown => ContentCategory::Unknown,
        }
    }

    /// Human-readable display name; also backs the `Display` impl.
    pub fn name(&self) -> &'static str {
        match self {
            ContentType::Jpeg => "JPEG Image",
            ContentType::Png => "PNG Image",
            ContentType::Gif => "GIF Image",
            ContentType::WebP => "WebP Image",
            ContentType::Bmp => "BMP Image",
            ContentType::Tiff => "TIFF Image",
            ContentType::Ico => "ICO Icon",
            ContentType::Avif => "AVIF Image",
            ContentType::Heic => "HEIC Image",
            ContentType::Mp4 => "MP4 Video",
            ContentType::Mkv => "MKV Video",
            ContentType::Avi => "AVI Video",
            ContentType::WebM => "WebM Video",
            ContentType::Mov => "MOV Video",
            ContentType::Flv => "FLV Video",
            ContentType::Mp3 => "MP3 Audio",
            ContentType::Flac => "FLAC Audio",
            ContentType::Wav => "WAV Audio",
            ContentType::Ogg => "OGG Audio",
            ContentType::Aac => "AAC Audio",
            ContentType::M4a => "M4A Audio",
            ContentType::Zip => "ZIP Archive",
            ContentType::Gzip => "GZIP Archive",
            ContentType::Zstd => "ZSTD Archive",
            ContentType::Xz => "XZ Archive",
            ContentType::Bzip2 => "BZIP2 Archive",
            ContentType::Lz4 => "LZ4 Archive",
            ContentType::SevenZip => "7-Zip Archive",
            ContentType::Rar => "RAR Archive",
            ContentType::Tar => "TAR Archive",
            ContentType::Elf => "ELF Executable",
            ContentType::Pe => "PE Executable",
            ContentType::MachO => "Mach-O Executable",
            ContentType::Wasm => "WebAssembly",
            ContentType::PyTorch => "PyTorch Model",
            ContentType::SafeTensors => "SafeTensors Model",
            ContentType::Gguf => "GGUF Model",
            ContentType::Ggml => "GGML Model",
            ContentType::Onnx => "ONNX Model",
            ContentType::TensorFlowSaved => "TensorFlow SavedModel",
            ContentType::Pdf => "PDF Document",
            ContentType::Docx => "Word Document",
            ContentType::Xlsx => "Excel Spreadsheet",
            ContentType::Pptx => "PowerPoint Presentation",
            ContentType::Odt => "OpenDocument Text",
            ContentType::Sqlite => "SQLite Database",
            ContentType::Parquet => "Parquet Data",
            ContentType::Arrow => "Arrow Data",
            ContentType::Protobuf => "Protocol Buffers",
            ContentType::PlainText => "Plain Text",
            ContentType::Json => "JSON",
            ContentType::Xml => "XML",
            ContentType::Html => "HTML",
            ContentType::Css => "CSS",
            ContentType::JavaScript => "JavaScript",
            ContentType::SourceCode => "Source Code",
            ContentType::Empty => "Empty",
            ContentType::Random => "Random/Encrypted",
            ContentType::Unknown => "Unknown",
        }
    }

    /// Typical lowercase file extensions (without the dot) for this type.
    /// Synthetic types (`Empty`, `Random`, `Unknown`) have none.
    pub fn extensions(&self) -> &'static [&'static str] {
        match self {
            ContentType::Jpeg => &["jpg", "jpeg", "jpe", "jfif"],
            ContentType::Png => &["png"],
            ContentType::Gif => &["gif"],
            ContentType::WebP => &["webp"],
            ContentType::Bmp => &["bmp", "dib"],
            ContentType::Tiff => &["tiff", "tif"],
            ContentType::Ico => &["ico"],
            ContentType::Avif => &["avif"],
            ContentType::Heic => &["heic", "heif"],
            ContentType::Mp4 => &["mp4", "m4v"],
            ContentType::Mkv => &["mkv"],
            ContentType::Avi => &["avi"],
            ContentType::WebM => &["webm"],
            ContentType::Mov => &["mov", "qt"],
            ContentType::Flv => &["flv"],
            ContentType::Mp3 => &["mp3"],
            ContentType::Flac => &["flac"],
            ContentType::Wav => &["wav", "wave"],
            ContentType::Ogg => &["ogg", "oga", "ogv"],
            ContentType::Aac => &["aac"],
            ContentType::M4a => &["m4a"],
            ContentType::Zip => &["zip"],
            ContentType::Gzip => &["gz", "gzip"],
            ContentType::Zstd => &["zst", "zstd"],
            ContentType::Xz => &["xz"],
            ContentType::Bzip2 => &["bz2", "bzip2"],
            ContentType::Lz4 => &["lz4"],
            ContentType::SevenZip => &["7z"],
            ContentType::Rar => &["rar"],
            ContentType::Tar => &["tar"],
            ContentType::Elf => &["elf", "so", "o"],
            ContentType::Pe => &["exe", "dll", "sys"],
            ContentType::MachO => &["dylib"],
            ContentType::Wasm => &["wasm"],
            ContentType::PyTorch => &["pt", "pth", "bin"],
            ContentType::SafeTensors => &["safetensors"],
            ContentType::Gguf => &["gguf"],
            ContentType::Ggml => &["ggml"],
            ContentType::Onnx => &["onnx"],
            ContentType::TensorFlowSaved => &["pb"],
            ContentType::Pdf => &["pdf"],
            ContentType::Docx => &["docx"],
            ContentType::Xlsx => &["xlsx"],
            ContentType::Pptx => &["pptx"],
            ContentType::Odt => &["odt"],
            ContentType::Sqlite => &["db", "sqlite", "sqlite3"],
            ContentType::Parquet => &["parquet"],
            ContentType::Arrow => &["arrow", "feather"],
            ContentType::Protobuf => &["proto"],
            ContentType::PlainText => &["txt", "text"],
            ContentType::Json => &["json"],
            ContentType::Xml => &["xml"],
            ContentType::Html => &["html", "htm"],
            ContentType::Css => &["css"],
            ContentType::JavaScript => &["js", "mjs", "cjs"],
            ContentType::SourceCode => {
                &["rs", "c", "cpp", "h", "py", "go", "java", "ts", "rb", "php"]
            }
            ContentType::Empty => &[],
            ContentType::Random => &[],
            ContentType::Unknown => &[],
        }
    }
}
impl fmt::Display for ContentType {
    /// Writes the human-readable [`ContentType::name`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.name())
    }
}
/// Broad category a [`ContentType`] maps to; this is what the compression
/// decision logic actually branches on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(missing_docs)]
pub enum ContentCategory {
    CompressedImage,
    UncompressedImage,
    CompressedVideo,
    CompressedAudio,
    UncompressedAudio,
    Archive,
    UncompressedArchive,
    Executable,
    AiModel,
    Document,
    CompressedDocument,
    StructuredData,
    Text,
    Empty,
    Random,
    Unknown,
}

impl ContentCategory {
    /// `true` for categories whose payload is already compressed (or looks
    /// incompressible), so recompressing it would be wasted work.
    pub fn is_precompressed(&self) -> bool {
        match self {
            ContentCategory::CompressedImage
            | ContentCategory::CompressedVideo
            | ContentCategory::CompressedAudio
            | ContentCategory::Archive
            | ContentCategory::CompressedDocument
            | ContentCategory::Random => true,
            _ => false,
        }
    }

    /// Heuristic compressed-size / original-size ratio expected for this
    /// category (lower means more compressible).
    pub fn expected_ratio(&self) -> f32 {
        match self {
            ContentCategory::Text => 0.15,
            ContentCategory::UncompressedImage => 0.30,
            ContentCategory::UncompressedAudio => 0.40,
            ContentCategory::Executable => 0.50,
            ContentCategory::StructuredData => 0.35,
            ContentCategory::Document => 0.60,
            ContentCategory::UncompressedArchive => 0.40,
            ContentCategory::AiModel => 0.06,
            ContentCategory::CompressedImage => 0.98,
            ContentCategory::CompressedVideo => 0.99,
            ContentCategory::CompressedAudio => 0.98,
            ContentCategory::Archive => 0.99,
            ContentCategory::CompressedDocument => 0.98,
            ContentCategory::Empty => 1.0,
            ContentCategory::Random => 1.0,
            ContentCategory::Unknown => 0.70,
        }
    }
}
/// A single magic-byte signature checked by [`detect_by_magic`].
struct MagicSignature {
    /// Byte pattern that must appear in the data.
    pattern: &'static [u8],
    /// Byte offset at which the pattern must start.
    offset: usize,
    /// Optional bit mask ANDed with both data and pattern before comparing.
    mask: Option<&'static [u8]>,
    /// Type reported when this signature matches.
    content_type: ContentType,
}
/// Magic-byte signature table, scanned in order by [`detect_by_magic`].
///
/// Order matters because the first matching entry wins: the full ZIP
/// local-file-header signatures come before the looser bare `PK` PyTorch
/// heuristic. All entries anchor at offset 0 except MP4's `ftyp` brand,
/// which sits at offset 4. No entry currently uses a mask.
static MAGIC_SIGNATURES: &[MagicSignature] = &[
    // Images.
    MagicSignature { pattern: &[0xFF, 0xD8, 0xFF], offset: 0, mask: None, content_type: ContentType::Jpeg },
    MagicSignature { pattern: &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], offset: 0, mask: None, content_type: ContentType::Png },
    MagicSignature { pattern: b"GIF87a", offset: 0, mask: None, content_type: ContentType::Gif },
    MagicSignature { pattern: b"GIF89a", offset: 0, mask: None, content_type: ContentType::Gif },
    // Bare RIFF defaults to WebP; detect_by_magic disambiguates full RIFF
    // headers (WEBP / "AVI " / WAVE) before this table is consulted.
    MagicSignature { pattern: b"RIFF", offset: 0, mask: None, content_type: ContentType::WebP },
    MagicSignature { pattern: &[0x42, 0x4D], offset: 0, mask: None, content_type: ContentType::Bmp },
    MagicSignature { pattern: &[0x49, 0x49, 0x2A, 0x00], offset: 0, mask: None, content_type: ContentType::Tiff },
    MagicSignature { pattern: &[0x4D, 0x4D, 0x00, 0x2A], offset: 0, mask: None, content_type: ContentType::Tiff },
    MagicSignature { pattern: &[0x00, 0x00, 0x01, 0x00], offset: 0, mask: None, content_type: ContentType::Ico },
    // Video.
    MagicSignature { pattern: b"ftyp", offset: 4, mask: None, content_type: ContentType::Mp4 },
    MagicSignature { pattern: &[0x1A, 0x45, 0xDF, 0xA3], offset: 0, mask: None, content_type: ContentType::Mkv },
    // Shadowed by the WebP RIFF entry above; kept for completeness.
    MagicSignature { pattern: b"RIFF", offset: 0, mask: None, content_type: ContentType::Avi },
    MagicSignature { pattern: b"FLV", offset: 0, mask: None, content_type: ContentType::Flv },
    // Audio: MPEG frame-sync variants plus the ID3 tag header.
    MagicSignature { pattern: &[0xFF, 0xFB], offset: 0, mask: None, content_type: ContentType::Mp3 },
    MagicSignature { pattern: &[0xFF, 0xFA], offset: 0, mask: None, content_type: ContentType::Mp3 },
    MagicSignature { pattern: &[0xFF, 0xF3], offset: 0, mask: None, content_type: ContentType::Mp3 },
    MagicSignature { pattern: &[0xFF, 0xF2], offset: 0, mask: None, content_type: ContentType::Mp3 },
    MagicSignature { pattern: b"ID3", offset: 0, mask: None, content_type: ContentType::Mp3 },
    MagicSignature { pattern: b"fLaC", offset: 0, mask: None, content_type: ContentType::Flac },
    MagicSignature { pattern: b"OggS", offset: 0, mask: None, content_type: ContentType::Ogg },
    // Archives / compressed streams.
    MagicSignature { pattern: &[0x50, 0x4B, 0x03, 0x04], offset: 0, mask: None, content_type: ContentType::Zip },
    MagicSignature { pattern: &[0x50, 0x4B, 0x05, 0x06], offset: 0, mask: None, content_type: ContentType::Zip },
    MagicSignature { pattern: &[0x50, 0x4B, 0x07, 0x08], offset: 0, mask: None, content_type: ContentType::Zip },
    MagicSignature { pattern: &[0x1F, 0x8B], offset: 0, mask: None, content_type: ContentType::Gzip },
    MagicSignature { pattern: &[0x28, 0xB5, 0x2F, 0xFD], offset: 0, mask: None, content_type: ContentType::Zstd },
    MagicSignature { pattern: &[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00], offset: 0, mask: None, content_type: ContentType::Xz },
    MagicSignature { pattern: &[0x42, 0x5A, 0x68], offset: 0, mask: None, content_type: ContentType::Bzip2 },
    MagicSignature { pattern: &[0x04, 0x22, 0x4D, 0x18], offset: 0, mask: None, content_type: ContentType::Lz4 },
    MagicSignature { pattern: &[0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C], offset: 0, mask: None, content_type: ContentType::SevenZip },
    MagicSignature { pattern: &[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07], offset: 0, mask: None, content_type: ContentType::Rar },
    // Executables.
    MagicSignature { pattern: &[0x7F, 0x45, 0x4C, 0x46], offset: 0, mask: None, content_type: ContentType::Elf },
    MagicSignature { pattern: &[0x4D, 0x5A], offset: 0, mask: None, content_type: ContentType::Pe },
    MagicSignature { pattern: &[0xFE, 0xED, 0xFA, 0xCE], offset: 0, mask: None, content_type: ContentType::MachO },
    MagicSignature { pattern: &[0xFE, 0xED, 0xFA, 0xCF], offset: 0, mask: None, content_type: ContentType::MachO },
    MagicSignature { pattern: &[0xCE, 0xFA, 0xED, 0xFE], offset: 0, mask: None, content_type: ContentType::MachO },
    MagicSignature { pattern: &[0xCF, 0xFA, 0xED, 0xFE], offset: 0, mask: None, content_type: ContentType::MachO },
    MagicSignature { pattern: &[0xCA, 0xFE, 0xBA, 0xBE], offset: 0, mask: None, content_type: ContentType::MachO },
    MagicSignature { pattern: &[0x00, 0x61, 0x73, 0x6D], offset: 0, mask: None, content_type: ContentType::Wasm },
    // AI model formats ("PK" catches zipped PyTorch checkpoints only after
    // the more specific ZIP signatures above have failed to match).
    MagicSignature { pattern: &[0x80, 0x02], offset: 0, mask: None, content_type: ContentType::PyTorch },
    MagicSignature { pattern: b"PK", offset: 0, mask: None, content_type: ContentType::PyTorch },
    MagicSignature { pattern: &[0x47, 0x47, 0x55, 0x46], offset: 0, mask: None, content_type: ContentType::Gguf },
    MagicSignature { pattern: &[0x67, 0x67, 0x6D, 0x6C], offset: 0, mask: None, content_type: ContentType::Ggml },
    MagicSignature { pattern: &[0x67, 0x67, 0x6D, 0x66], offset: 0, mask: None, content_type: ContentType::Ggml },
    MagicSignature { pattern: &[0x67, 0x67, 0x6A, 0x74], offset: 0, mask: None, content_type: ContentType::Ggml },
    // Documents and structured data.
    MagicSignature { pattern: &[0x25, 0x50, 0x44, 0x46], offset: 0, mask: None, content_type: ContentType::Pdf },
    MagicSignature { pattern: b"SQLite format 3", offset: 0, mask: None, content_type: ContentType::Sqlite },
    MagicSignature { pattern: b"PAR1", offset: 0, mask: None, content_type: ContentType::Parquet },
    MagicSignature { pattern: b"ARROW1", offset: 0, mask: None, content_type: ContentType::Arrow },
    // Text markers (UTF-8 BOM, XML declaration, HTML doctypes).
    MagicSignature { pattern: &[0xEF, 0xBB, 0xBF], offset: 0, mask: None, content_type: ContentType::PlainText },
    MagicSignature { pattern: b"<?xml", offset: 0, mask: None, content_type: ContentType::Xml },
    MagicSignature { pattern: b"<!DOCTYPE html", offset: 0, mask: None, content_type: ContentType::Html },
    MagicSignature { pattern: b"<!doctype html", offset: 0, mask: None, content_type: ContentType::Html },
    MagicSignature { pattern: b"<html", offset: 0, mask: None, content_type: ContentType::Html },
];
/// Detects a content type from magic bytes in `data`.
///
/// Returns `Some(ContentType::Empty)` for an empty buffer and `None` when
/// nothing matches. Two special cases run before the generic signature scan:
///
/// - SafeTensors: an 8-byte little-endian header length followed by a JSON
///   header (starting with `{`) that contains `__metada…` or `dtype"`.
/// - RIFF containers: the format tag at bytes 8..12 disambiguates
///   WebP / AVI / WAV, which all share the `RIFF` prefix.
///
/// ZIP matches are refined into PyTorch checkpoints or Office documents by
/// scanning for characteristic archive member names.
pub fn detect_by_magic(data: &[u8]) -> Option<ContentType> {
    if data.is_empty() {
        return Some(ContentType::Empty);
    }
    // SafeTensors: u64 LE header size, then a JSON object at byte 8.
    if data.len() >= 8 {
        let header_size = u64::from_le_bytes([
            data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
        ]);
        // Sanity-check the claimed header size before trusting the layout.
        if header_size > 0 && header_size < 100_000_000 && data.len() > 8 && data[8] == b'{' {
            if data.len() > 50 {
                let header_preview = &data[8..core::cmp::min(data.len(), 200)];
                if header_preview.windows(8).any(|w| w == b"__metada")
                    || header_preview.windows(6).any(|w| w == b"dtype\"")
                {
                    return Some(ContentType::SafeTensors);
                }
            }
        }
    }
    // RIFF containers share a 4-byte prefix; bytes 8..12 hold the real type.
    if data.len() >= 12 && &data[0..4] == b"RIFF" {
        let format = &data[8..12];
        if format == b"WEBP" {
            return Some(ContentType::WebP);
        } else if format == b"AVI " {
            return Some(ContentType::Avi);
        } else if format == b"WAVE" {
            return Some(ContentType::Wav);
        }
    }
    for sig in MAGIC_SIGNATURES {
        if data.len() >= sig.offset + sig.pattern.len() {
            let slice = &data[sig.offset..sig.offset + sig.pattern.len()];
            let matches = if let Some(mask) = sig.mask {
                // Masked compare: only the bits set in the mask must agree.
                slice
                    .iter()
                    .zip(sig.pattern.iter())
                    .zip(mask.iter())
                    .all(|((d, p), m)| (d & m) == (p & m))
            } else {
                slice == sig.pattern
            };
            if matches {
                match sig.content_type {
                    // A ZIP container may really be a PyTorch checkpoint or
                    // an Office document; inspect member names to refine.
                    ContentType::Zip => {
                        if is_pytorch_zip(data) {
                            return Some(ContentType::PyTorch);
                        }
                        if is_office_document(data) {
                            return detect_office_type(data);
                        }
                        return Some(ContentType::Zip);
                    }
                    _ => return Some(sig.content_type),
                }
            }
        }
    }
    None
}
/// Heuristic: does this ZIP archive look like a PyTorch checkpoint?
///
/// Searches the raw bytes for member names PyTorch's zip serializer writes
/// ("data.pkl", "model.safetensors") plus the loose "version" marker.
fn is_pytorch_zip(data: &[u8]) -> bool {
    const MARKERS: [&[u8]; 3] = [b"data.pkl", b"model.safetensors", b"version"];
    MARKERS
        .iter()
        .any(|marker| data.windows(marker.len()).any(|w| w == *marker))
}
/// Heuristic: OOXML/ODF packages always contain a "[Content_Types].xml"
/// member, so its presence marks the ZIP as an Office document.
fn is_office_document(data: &[u8]) -> bool {
    let marker: &[u8] = b"[Content_Types].xml";
    data.windows(marker.len()).any(|w| w == marker)
}
/// Classifies an Office ZIP package by its characteristic member-name
/// fragments, falling back to plain `Zip` when none are found.
///
/// Bug fix: the window length is now derived from each needle instead of
/// being hard-coded. The previous code used `windows(14)` against the
/// 13-byte `b"xl/workbook.x"` and `windows(17)` against the 16-byte
/// `b"ppt/presentation"`, so Xlsx and Pptx could never be detected
/// (slices of unequal length never compare equal).
fn detect_office_type(data: &[u8]) -> Option<ContentType> {
    // True when `needle` occurs anywhere in `haystack`.
    fn contains(haystack: &[u8], needle: &[u8]) -> bool {
        haystack.windows(needle.len()).any(|w| w == needle)
    }
    if contains(data, b"word/document.x") {
        Some(ContentType::Docx)
    } else if contains(data, b"xl/workbook.x") {
        Some(ContentType::Xlsx)
    } else if contains(data, b"ppt/presentation") {
        Some(ContentType::Pptx)
    } else {
        Some(ContentType::Zip)
    }
}
/// Maps a file extension (case-insensitive, with or without a leading dot)
/// to a content type.
///
/// NOTE(review): this table is deliberately wider than
/// `ContentType::extensions()` (e.g. "log", "yaml", "md" map to PlainText
/// here but are not listed there, and "bin" is absent here) — confirm the
/// asymmetry is intentional.
pub fn detect_by_extension(extension: &str) -> Option<ContentType> {
    let ext_lower = extension.to_lowercase();
    // Accept both "txt" and ".txt".
    let ext = ext_lower.trim_start_matches('.');
    match ext {
        "jpg" | "jpeg" | "jpe" | "jfif" => Some(ContentType::Jpeg),
        "png" => Some(ContentType::Png),
        "gif" => Some(ContentType::Gif),
        "webp" => Some(ContentType::WebP),
        "bmp" | "dib" => Some(ContentType::Bmp),
        "tiff" | "tif" => Some(ContentType::Tiff),
        "ico" => Some(ContentType::Ico),
        "avif" => Some(ContentType::Avif),
        "heic" | "heif" => Some(ContentType::Heic),
        "mp4" | "m4v" => Some(ContentType::Mp4),
        "mkv" => Some(ContentType::Mkv),
        "avi" => Some(ContentType::Avi),
        "webm" => Some(ContentType::WebM),
        "mov" | "qt" => Some(ContentType::Mov),
        "flv" => Some(ContentType::Flv),
        "mp3" => Some(ContentType::Mp3),
        "flac" => Some(ContentType::Flac),
        "wav" | "wave" => Some(ContentType::Wav),
        "ogg" | "oga" | "ogv" => Some(ContentType::Ogg),
        "aac" => Some(ContentType::Aac),
        "m4a" => Some(ContentType::M4a),
        "zip" => Some(ContentType::Zip),
        "gz" | "gzip" => Some(ContentType::Gzip),
        "zst" | "zstd" => Some(ContentType::Zstd),
        "xz" => Some(ContentType::Xz),
        "bz2" | "bzip2" => Some(ContentType::Bzip2),
        "lz4" => Some(ContentType::Lz4),
        "7z" => Some(ContentType::SevenZip),
        "rar" => Some(ContentType::Rar),
        "tar" => Some(ContentType::Tar),
        "elf" | "so" | "o" => Some(ContentType::Elf),
        "exe" | "dll" | "sys" => Some(ContentType::Pe),
        "dylib" => Some(ContentType::MachO),
        "wasm" => Some(ContentType::Wasm),
        "pt" | "pth" => Some(ContentType::PyTorch),
        "safetensors" => Some(ContentType::SafeTensors),
        "gguf" => Some(ContentType::Gguf),
        "ggml" => Some(ContentType::Ggml),
        "onnx" => Some(ContentType::Onnx),
        "pb" => Some(ContentType::TensorFlowSaved),
        "pdf" => Some(ContentType::Pdf),
        "docx" => Some(ContentType::Docx),
        "xlsx" => Some(ContentType::Xlsx),
        "pptx" => Some(ContentType::Pptx),
        "odt" => Some(ContentType::Odt),
        "db" | "sqlite" | "sqlite3" => Some(ContentType::Sqlite),
        "parquet" => Some(ContentType::Parquet),
        "arrow" | "feather" => Some(ContentType::Arrow),
        "proto" => Some(ContentType::Protobuf),
        "txt" | "text" | "log" => Some(ContentType::PlainText),
        "json" => Some(ContentType::Json),
        "xml" => Some(ContentType::Xml),
        "html" | "htm" => Some(ContentType::Html),
        "css" => Some(ContentType::Css),
        "js" | "mjs" | "cjs" => Some(ContentType::JavaScript),
        // Source-code extensions, grouped by language family.
        "ts" | "tsx" => Some(ContentType::SourceCode),
        "rs" | "c" | "cpp" | "cc" | "cxx" | "h" | "hpp" => Some(ContentType::SourceCode),
        "py" | "pyw" | "pyi" => Some(ContentType::SourceCode),
        "go" | "java" | "kt" | "scala" => Some(ContentType::SourceCode),
        "rb" | "php" | "pl" | "pm" => Some(ContentType::SourceCode),
        "sh" | "bash" | "zsh" | "fish" => Some(ContentType::SourceCode),
        // Config / markup formats treated as plain text.
        "yaml" | "yml" => Some(ContentType::PlainText),
        "toml" | "ini" | "cfg" | "conf" => Some(ContentType::PlainText),
        "md" | "markdown" | "rst" => Some(ContentType::PlainText),
        _ => None,
    }
}
/// Computes the Shannon entropy of `data` in bits per byte (0.0 ..= 8.0).
///
/// An empty slice yields 0.0. Uses a 256-bucket byte histogram and
/// `libm::log2`, which keeps the function usable without `std`.
pub fn calculate_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    let mut histogram = [0u64; 256];
    for &byte in data {
        histogram[byte as usize] += 1;
    }
    let total = data.len() as f64;
    histogram
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f64 / total;
            -p * libm::log2(p)
        })
        .sum()
}
/// Coarse entropy bucket (bits per byte) used to judge compressibility.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntropyClass {
    VeryLow,
    Low,
    Medium,
    High,
    VeryHigh,
}

impl EntropyClass {
    /// Buckets a Shannon entropy value into a class.
    /// Thresholds: < 2.0, < 4.0, < 6.0, < 7.5, otherwise `VeryHigh`.
    pub fn from_entropy(entropy: f64) -> Self {
        match entropy {
            e if e < 2.0 => EntropyClass::VeryLow,
            e if e < 4.0 => EntropyClass::Low,
            e if e < 6.0 => EntropyClass::Medium,
            e if e < 7.5 => EntropyClass::High,
            _ => EntropyClass::VeryHigh,
        }
    }

    /// Whether data in this class is typically worth compressing.
    pub fn is_compressible(&self) -> bool {
        !matches!(self, EntropyClass::High | EntropyClass::VeryHigh)
    }
}
/// Last-resort detection based on entropy plus simple text heuristics.
///
/// Very high entropy is reported as `Random`; non-text data as `Unknown`;
/// text is refined into JSON/XML/plain text at lower entropy levels.
pub fn detect_by_entropy(data: &[u8]) -> ContentType {
    if data.is_empty() {
        return ContentType::Empty;
    }
    let class = EntropyClass::from_entropy(calculate_entropy(data));
    if class == EntropyClass::VeryHigh {
        // Indistinguishable from encrypted or already-compressed data.
        return ContentType::Random;
    }
    if !is_likely_text(data) {
        return ContentType::Unknown;
    }
    if class == EntropyClass::High {
        // High-entropy text: report plain text without guessing a format.
        return ContentType::PlainText;
    }
    if is_likely_json(data) {
        ContentType::Json
    } else if is_likely_xml(data) {
        ContentType::Xml
    } else {
        ContentType::PlainText
    }
}
/// Heuristic text check: at least 85% of the first 4 KiB must be printable
/// ASCII or common whitespace (tab, LF, CR). Empty input is not text.
fn is_likely_text(data: &[u8]) -> bool {
    if data.is_empty() {
        return false;
    }
    let sample = &data[..data.len().min(4096)];
    let printable = sample
        .iter()
        .filter(|&&b| matches!(b, 0x20..=0x7E | 0x09 | 0x0A | 0x0D))
        .count();
    // Integer percentage, matching the original threshold semantics.
    printable * 100 / sample.len() >= 85
}
/// Does the buffer, after leading whitespace, start like a JSON document
/// (`{` or `[`)?
fn is_likely_json(data: &[u8]) -> bool {
    match skip_whitespace(data).first() {
        Some(&b'{') | Some(&b'[') => true,
        _ => false,
    }
}

/// Does the buffer, after leading whitespace, start like XML/HTML (`<`)?
fn is_likely_xml(data: &[u8]) -> bool {
    skip_whitespace(data).first() == Some(&b'<')
}

/// Returns `data` with leading ASCII whitespace (space, tab, LF, CR)
/// stripped; an all-whitespace buffer yields the empty slice.
fn skip_whitespace(data: &[u8]) -> &[u8] {
    data.iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .map_or(&data[..0], |start| &data[start..])
}
/// Outcome of content-type detection.
#[derive(Debug, Clone)]
pub struct DetectionResult {
    /// Detected type (possibly `Unknown`).
    pub content_type: ContentType,
    /// Strategy that produced the result.
    pub method: DetectionMethod,
    /// Confidence in the result, 0.0 ..= 1.0.
    pub confidence: f32,
    /// Shannon entropy in bits per byte, when it was computed.
    pub entropy: Option<f64>,
}

/// Strategy that produced a [`DetectionResult`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionMethod {
    /// Matched a magic-byte signature.
    MagicBytes,
    /// Mapped from the caller-supplied extension hint.
    Extension,
    /// Fell back to entropy/text heuristics.
    Entropy,
    /// Input was empty.
    Empty,
    /// No method produced an answer.
    Unknown,
}
pub fn detect_content_type(data: &[u8], extension_hint: Option<&str>) -> DetectionResult {
if data.is_empty() {
return DetectionResult {
content_type: ContentType::Empty,
method: DetectionMethod::Empty,
confidence: 1.0,
entropy: Some(0.0),
};
}
if let Some(content_type) = detect_by_magic(data) {
return DetectionResult {
content_type,
method: DetectionMethod::MagicBytes,
confidence: 0.95,
entropy: None,
};
}
if let Some(ext) = extension_hint {
if let Some(content_type) = detect_by_extension(ext) {
return DetectionResult {
content_type,
method: DetectionMethod::Extension,
confidence: 0.70,
entropy: None,
};
}
}
let entropy = calculate_entropy(data);
let content_type = detect_by_entropy(data);
DetectionResult {
content_type,
method: DetectionMethod::Entropy,
confidence: match content_type {
ContentType::Random => 0.80,
ContentType::PlainText => 0.60,
_ => 0.40,
},
entropy: Some(entropy),
}
}
/// Compression backend chosen for a payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionAlgorithm {
    None,
    Lz4,
    Zstd,
    ZstdHigh,
    QLoRA,
}

impl CompressionAlgorithm {
    /// Relative expected processing speed (higher is faster);
    /// `u32::MAX` marks pass-through, which does no work at all.
    pub fn expected_speed(&self) -> u32 {
        match self {
            Self::None => u32::MAX,
            Self::Lz4 => 500,
            Self::Zstd => 200,
            Self::ZstdHigh => 50,
            Self::QLoRA => 100,
        }
    }

    /// Short lowercase identifier suitable for logs and metrics.
    pub fn name(&self) -> &'static str {
        match self {
            Self::None => "none",
            Self::Lz4 => "lz4",
            Self::Zstd => "zstd",
            Self::ZstdHigh => "zstd-high",
            Self::QLoRA => "qlora",
        }
    }
}
/// The plan produced by [`decide_compression`].
#[derive(Debug, Clone)]
pub struct CompressionDecision {
    /// Algorithm to run.
    pub algorithm: CompressionAlgorithm,
    /// Predicted compressed/original size ratio for this content.
    pub expected_ratio: f32,
    /// Why this plan was chosen.
    pub reason: CompressionReason,
    /// Algorithm-specific level, when one applies.
    pub level: Option<u32>,
}

/// Rationale attached to a [`CompressionDecision`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionReason {
    /// Input was empty; nothing to do.
    Empty,
    /// Category is already compressed.
    AlreadyCompressed,
    /// Entropy analysis classified the data as random/encrypted.
    HighEntropy,
    /// AI model weights routed to the QLoRA path.
    AiModelWeights,
    /// Text content.
    TextData,
    /// Databases, Parquet, Arrow, Protobuf.
    StructuredData,
    /// ELF/PE/Mach-O/Wasm binaries.
    Executable,
    /// Everything else.
    Default,
}

/// Caller-requested speed/ratio trade-off.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CompressionProfile {
    /// Fastest option.
    Speed,
    /// Reasonable middle ground (the default).
    #[default]
    Balanced,
    /// Favor ratio over speed.
    Ratio,
    /// Maximum ratio, e.g. for cold storage.
    Storage,
}
/// Chooses an algorithm and level for detected content.
///
/// Decision order:
/// 1. `Empty` and `Random` content passes through untouched.
/// 2. Already-compressed categories pass through untouched.
/// 3. AI model weights take the QLoRA path regardless of `profile`.
/// 4. Everything else selects an algorithm/level from `profile`, tuned per
///    category (text, structured data, executables, uncompressed media,
///    or a generic default).
pub fn decide_compression(
    detection: &DetectionResult,
    profile: CompressionProfile,
) -> CompressionDecision {
    let category = detection.content_type.category();
    match detection.content_type {
        ContentType::Empty => {
            return CompressionDecision {
                algorithm: CompressionAlgorithm::None,
                expected_ratio: 1.0,
                reason: CompressionReason::Empty,
                level: None,
            };
        }
        ContentType::Random => {
            // Random/encrypted data will not shrink; skip the work entirely.
            return CompressionDecision {
                algorithm: CompressionAlgorithm::None,
                expected_ratio: 1.0,
                reason: CompressionReason::HighEntropy,
                level: None,
            };
        }
        _ => {}
    }
    if category.is_precompressed() {
        return CompressionDecision {
            algorithm: CompressionAlgorithm::None,
            expected_ratio: category.expected_ratio(),
            reason: CompressionReason::AlreadyCompressed,
            level: None,
        };
    }
    if matches!(category, ContentCategory::AiModel) {
        // Model weights always use the dedicated QLoRA path, independent of
        // the requested profile.
        return CompressionDecision {
            algorithm: CompressionAlgorithm::QLoRA,
            expected_ratio: 0.06,
            reason: CompressionReason::AiModelWeights,
            level: Some(4),
        };
    }
    match category {
        ContentCategory::Text => {
            let (algo, level) = match profile {
                CompressionProfile::Speed => (CompressionAlgorithm::Lz4, None),
                CompressionProfile::Balanced => (CompressionAlgorithm::Zstd, Some(3)),
                CompressionProfile::Ratio => (CompressionAlgorithm::Zstd, Some(9)),
                CompressionProfile::Storage => (CompressionAlgorithm::ZstdHigh, Some(19)),
            };
            CompressionDecision {
                algorithm: algo,
                expected_ratio: category.expected_ratio(),
                reason: CompressionReason::TextData,
                level,
            }
        }
        ContentCategory::StructuredData => {
            let (algo, level) = match profile {
                CompressionProfile::Speed => (CompressionAlgorithm::Lz4, None),
                CompressionProfile::Balanced => (CompressionAlgorithm::Zstd, Some(5)),
                CompressionProfile::Ratio | CompressionProfile::Storage => {
                    (CompressionAlgorithm::Zstd, Some(12))
                }
            };
            CompressionDecision {
                algorithm: algo,
                expected_ratio: category.expected_ratio(),
                reason: CompressionReason::StructuredData,
                level,
            }
        }
        ContentCategory::Executable => {
            let (algo, level) = match profile {
                CompressionProfile::Speed => (CompressionAlgorithm::Lz4, None),
                CompressionProfile::Balanced => (CompressionAlgorithm::Zstd, Some(3)),
                CompressionProfile::Ratio | CompressionProfile::Storage => {
                    (CompressionAlgorithm::Zstd, Some(9))
                }
            };
            CompressionDecision {
                algorithm: algo,
                expected_ratio: category.expected_ratio(),
                reason: CompressionReason::Executable,
                level,
            }
        }
        ContentCategory::UncompressedImage
        | ContentCategory::UncompressedAudio
        | ContentCategory::UncompressedArchive
        | ContentCategory::Document => {
            let (algo, level) = match profile {
                CompressionProfile::Speed => (CompressionAlgorithm::Lz4, None),
                CompressionProfile::Balanced => (CompressionAlgorithm::Zstd, Some(5)),
                CompressionProfile::Ratio | CompressionProfile::Storage => {
                    (CompressionAlgorithm::Zstd, Some(12))
                }
            };
            CompressionDecision {
                algorithm: algo,
                expected_ratio: category.expected_ratio(),
                reason: CompressionReason::Default,
                level,
            }
        }
        // Unknown (and any category not matched above): generic settings
        // with a conservative expected ratio.
        _ => {
            let (algo, level) = match profile {
                CompressionProfile::Speed => (CompressionAlgorithm::Lz4, None),
                CompressionProfile::Balanced => (CompressionAlgorithm::Zstd, Some(3)),
                CompressionProfile::Ratio => (CompressionAlgorithm::Zstd, Some(9)),
                CompressionProfile::Storage => (CompressionAlgorithm::ZstdHigh, Some(15)),
            };
            CompressionDecision {
                algorithm: algo,
                expected_ratio: 0.70,
                reason: CompressionReason::Default,
                level,
            }
        }
    }
}
/// Output of [`ContentAwareCompressor::compress`].
#[derive(Debug)]
pub struct CompressionResult {
    /// Compressed (or passed-through) bytes.
    pub data: Vec<u8>,
    /// Input length in bytes.
    pub original_size: usize,
    /// Output length in bytes.
    pub compressed_size: usize,
    /// Algorithm actually applied (`None` when passed through).
    pub algorithm: CompressionAlgorithm,
    /// Detected content type of the input.
    pub content_type: ContentType,
    /// `compressed_size / original_size` (1.0 for pass-through).
    pub ratio: f32,
}

/// Compressor that inspects content before choosing an algorithm.
pub struct ContentAwareCompressor {
    // Speed/ratio trade-off applied to every decision.
    profile: CompressionProfile,
    // Running counters; updated through &self (atomics + lock inside).
    stats: ContentAwareStats,
    // Payloads smaller than this are passed through untouched.
    min_compress_size: usize,
    // Expected ratios above this skip compression as not worthwhile.
    skip_threshold: f32,
}
impl ContentAwareCompressor {
    /// Creates a compressor with the `Balanced` profile, a 64-byte minimum
    /// payload size, and a 0.95 expected-ratio skip threshold.
    pub fn new() -> Self {
        Self {
            profile: CompressionProfile::Balanced,
            stats: ContentAwareStats::new(),
            min_compress_size: 64,
            skip_threshold: 0.95,
        }
    }

    /// Like [`ContentAwareCompressor::new`] but with an explicit profile.
    pub fn with_profile(profile: CompressionProfile) -> Self {
        Self {
            profile,
            stats: ContentAwareStats::new(),
            min_compress_size: 64,
            skip_threshold: 0.95,
        }
    }

    /// Sets the minimum payload size below which data is passed through.
    pub fn set_min_compress_size(&mut self, size: usize) {
        self.min_compress_size = size;
    }

    /// Sets the expected-ratio threshold above which compression is skipped.
    pub fn set_skip_threshold(&mut self, threshold: f32) {
        self.skip_threshold = threshold;
    }

    /// Detects the content type and returns the compression plan without
    /// touching the data or the statistics.
    pub fn analyze(&self, data: &[u8], extension_hint: Option<&str>) -> CompressionDecision {
        let detection = detect_content_type(data, extension_hint);
        decide_compression(&detection, self.profile)
    }

    /// Detects, decides, and compresses in one step, recording statistics.
    ///
    /// Pass-through cases (algorithm `None`, ratio 1.0):
    /// - payloads smaller than `min_compress_size` (not counted as skipped
    ///   in the stats — only analyzed);
    /// - content whose expected ratio exceeds `skip_threshold` (counted as
    ///   skipped).
    pub fn compress(&self, data: &[u8], extension_hint: Option<&str>) -> CompressionResult {
        let detection = detect_content_type(data, extension_hint);
        let decision = decide_compression(&detection, self.profile);
        self.stats.record_analysis(detection.content_type);
        if data.len() < self.min_compress_size {
            // Too small to be worth the per-payload overhead.
            return CompressionResult {
                data: data.to_vec(),
                original_size: data.len(),
                compressed_size: data.len(),
                algorithm: CompressionAlgorithm::None,
                content_type: detection.content_type,
                ratio: 1.0,
            };
        }
        if decision.expected_ratio > self.skip_threshold {
            // Predicted savings too small; record the skip and pass through.
            self.stats
                .record_skipped(detection.content_type, data.len());
            return CompressionResult {
                data: data.to_vec(),
                original_size: data.len(),
                compressed_size: data.len(),
                algorithm: CompressionAlgorithm::None,
                content_type: detection.content_type,
                ratio: 1.0,
            };
        }
        let compressed = compress_with_algorithm(data, decision.algorithm, decision.level);
        let compressed_size = compressed.len();
        let ratio = compressed_size as f32 / data.len() as f32;
        self.stats.record_compression(
            detection.content_type,
            data.len(),
            compressed_size,
            decision.algorithm,
        );
        CompressionResult {
            data: compressed,
            original_size: data.len(),
            compressed_size,
            algorithm: decision.algorithm,
            content_type: detection.content_type,
            ratio,
        }
    }

    /// Read-only access to the running statistics.
    pub fn stats(&self) -> &ContentAwareStats {
        &self.stats
    }

    /// Clears the accumulated statistics.
    pub fn reset_stats(&self) {
        self.stats.reset();
    }
}
impl Default for ContentAwareCompressor {
    /// Equivalent to [`ContentAwareCompressor::new`] (Balanced profile).
    fn default() -> Self {
        Self::new()
    }
}
/// Runs the chosen algorithm over `data`, falling back to a plain copy when
/// a backend returns `None`, and degrading zstd/QLoRA to LZ4 when the
/// required features are absent.
///
/// NOTE(review): `_level` is currently ignored — none of the backend calls
/// below forward it. Confirm whether level support is planned.
fn compress_with_algorithm(
    data: &[u8],
    algorithm: CompressionAlgorithm,
    _level: Option<u32>,
) -> Vec<u8> {
    use crate::compress::compress::{Compress, CompressionType};
    match algorithm {
        // Pass-through: the caller decided not to compress.
        CompressionAlgorithm::None => data.to_vec(),
        CompressionAlgorithm::Lz4 => {
            Compress::compress_with_type(data, CompressionType::Lz4)
                .unwrap_or_else(|| data.to_vec())
        }
        #[cfg(feature = "std")]
        CompressionAlgorithm::Zstd => Compress::compress_with_type(data, CompressionType::Zstd)
            .unwrap_or_else(|| data.to_vec()),
        #[cfg(feature = "std")]
        CompressionAlgorithm::ZstdHigh => {
            // NOTE(review): same CompressionType::Zstd as the Zstd arm — the
            // "high" setting is not forwarded to the backend.
            Compress::compress_with_type(data, CompressionType::Zstd)
                .unwrap_or_else(|| data.to_vec())
        }
        #[cfg(not(feature = "std"))]
        CompressionAlgorithm::Zstd | CompressionAlgorithm::ZstdHigh => {
            // Without std, zstd is unavailable; degrade to LZ4.
            Compress::compress_with_type(data, CompressionType::Lz4)
                .unwrap_or_else(|| data.to_vec())
        }
        CompressionAlgorithm::QLoRA => {
            #[cfg(feature = "lunaos")]
            {
                use crate::compress::qlora::compress_qlora;
                // QLoRA path; fall back to LZ4 if it declines the payload.
                match compress_qlora(data) {
                    Some((compressed, _metadata)) => compressed,
                    None => Compress::compress_with_type(data, CompressionType::Lz4)
                        .unwrap_or_else(|| data.to_vec()),
                }
            }
            #[cfg(not(feature = "lunaos"))]
            {
                // QLoRA backend not compiled in; degrade to LZ4.
                Compress::compress_with_type(data, CompressionType::Lz4)
                    .unwrap_or_else(|| data.to_vec())
            }
        }
    }
}
/// Running statistics for the content-aware compressor, updatable through
/// shared references (atomic counters plus a `spin::RwLock`-guarded
/// per-type map, so it works in both std and no_std builds).
pub struct ContentAwareStats {
    // Total input bytes seen (compressed and skipped payloads alike).
    total_bytes_in: AtomicU64,
    // Total output bytes (skipped payloads count at their full size).
    total_bytes_out: AtomicU64,
    // Number of payloads run through detection.
    files_analyzed: AtomicU64,
    // Number of payloads passed through via the skip-threshold path.
    files_skipped: AtomicU64,
    // Per-content-type breakdown.
    type_stats: spin::RwLock<TypeStatsMap>,
}
/// Per-content-type compression counters.
#[derive(Debug, Clone, Default)]
pub struct TypeStats {
    /// Payloads compressed for this type.
    pub count: u64,
    /// Input bytes compressed for this type.
    pub bytes_in: u64,
    /// Output bytes produced for this type.
    pub bytes_out: u64,
    /// Payloads skipped for this type.
    pub skipped: u64,
}

impl TypeStats {
    /// Compression ratio `bytes_out / bytes_in`; 1.0 when nothing was seen.
    pub fn ratio(&self) -> f32 {
        match self.bytes_in {
            0 => 1.0,
            total_in => self.bytes_out as f32 / total_in as f32,
        }
    }

    /// Bytes saved by compression; saturates at zero if output grew.
    pub fn space_saved(&self) -> u64 {
        self.bytes_in.saturating_sub(self.bytes_out)
    }
}
/// Tiny association list from [`ContentType`] to [`TypeStats`].
///
/// A linear-scan `Vec` is used rather than a hash map: the number of
/// distinct content types is small and this avoids needing `std`.
struct TypeStatsMap {
    stats: Vec<(ContentType, TypeStats)>,
}

impl TypeStatsMap {
    /// Creates an empty map.
    fn new() -> Self {
        Self { stats: Vec::new() }
    }

    /// Returns the entry for `content_type`, inserting defaults if absent.
    fn get_or_insert(&mut self, content_type: ContentType) -> &mut TypeStats {
        let idx = match self.stats.iter().position(|(t, _)| *t == content_type) {
            Some(found) => found,
            None => {
                self.stats.push((content_type, TypeStats::default()));
                self.stats.len() - 1
            }
        };
        &mut self.stats[idx].1
    }

    /// Looks up the stats for `content_type`, if any were recorded.
    fn get(&self, content_type: ContentType) -> Option<&TypeStats> {
        self.stats
            .iter()
            .find(|(t, _)| *t == content_type)
            .map(|(_, s)| s)
    }

    /// Iterates over all recorded `(type, stats)` pairs.
    fn iter(&self) -> impl Iterator<Item = &(ContentType, TypeStats)> {
        self.stats.iter()
    }

    /// Removes all entries.
    fn clear(&mut self) {
        self.stats.clear();
    }
}
impl ContentAwareStats {
pub fn new() -> Self {
Self {
total_bytes_in: AtomicU64::new(0),
total_bytes_out: AtomicU64::new(0),
files_analyzed: AtomicU64::new(0),
files_skipped: AtomicU64::new(0),
type_stats: spin::RwLock::new(TypeStatsMap::new()),
}
}
pub fn record_analysis(&self, _content_type: ContentType) {
self.files_analyzed.fetch_add(1, Ordering::Relaxed);
}
pub fn record_compression(
&self,
content_type: ContentType,
bytes_in: usize,
bytes_out: usize,
_algorithm: CompressionAlgorithm,
) {
self.total_bytes_in
.fetch_add(bytes_in as u64, Ordering::Relaxed);
self.total_bytes_out
.fetch_add(bytes_out as u64, Ordering::Relaxed);
let mut stats = self.type_stats.write();
let type_stat = stats.get_or_insert(content_type);
type_stat.count += 1;
type_stat.bytes_in += bytes_in as u64;
type_stat.bytes_out += bytes_out as u64;
}
pub fn record_skipped(&self, content_type: ContentType, bytes: usize) {
self.files_skipped.fetch_add(1, Ordering::Relaxed);
self.total_bytes_in
.fetch_add(bytes as u64, Ordering::Relaxed);
self.total_bytes_out
.fetch_add(bytes as u64, Ordering::Relaxed);
let mut stats = self.type_stats.write();
let type_stat = stats.get_or_insert(content_type);
type_stat.skipped += 1;
}
pub fn total_bytes_in(&self) -> u64 {
self.total_bytes_in.load(Ordering::Relaxed)
}
pub fn total_bytes_out(&self) -> u64 {
self.total_bytes_out.load(Ordering::Relaxed)
}
pub fn files_analyzed(&self) -> u64 {
self.files_analyzed.load(Ordering::Relaxed)
}
pub fn files_skipped(&self) -> u64 {
self.files_skipped.load(Ordering::Relaxed)
}
pub fn overall_ratio(&self) -> f32 {
let bytes_in = self.total_bytes_in.load(Ordering::Relaxed);
let bytes_out = self.total_bytes_out.load(Ordering::Relaxed);
if bytes_in == 0 {
1.0
} else {
bytes_out as f32 / bytes_in as f32
}
}
pub fn total_space_saved(&self) -> u64 {
let bytes_in = self.total_bytes_in.load(Ordering::Relaxed);
let bytes_out = self.total_bytes_out.load(Ordering::Relaxed);
bytes_in.saturating_sub(bytes_out)
}
pub fn get_type_stats(&self, content_type: ContentType) -> Option<TypeStats> {
self.type_stats.read().get(content_type).cloned()
}
pub fn all_type_stats(&self) -> Vec<(ContentType, TypeStats)> {
self.type_stats.read().iter().cloned().collect()
}
pub fn reset(&self) {
self.total_bytes_in.store(0, Ordering::Relaxed);
self.total_bytes_out.store(0, Ordering::Relaxed);
self.files_analyzed.store(0, Ordering::Relaxed);
self.files_skipped.store(0, Ordering::Relaxed);
self.type_stats.write().clear();
}
}
impl Default for ContentAwareStats {
fn default() -> Self {
Self::new()
}
}
/// Convenience wrapper: compress `data` with a default-configured
/// [`ContentAwareCompressor`], using `extension_hint` (e.g. `"txt"`) to aid
/// content-type detection when magic bytes are inconclusive.
pub fn compress_content_aware(data: &[u8], extension_hint: Option<&str>) -> CompressionResult {
    ContentAwareCompressor::new().compress(data, extension_hint)
}
/// Detect the content type of `data` and derive the corresponding
/// compression decision under the `Balanced` profile.
///
/// Returns both the detection result and the decision so callers can
/// inspect confidence alongside the chosen algorithm.
pub fn analyze_content(
    data: &[u8],
    extension_hint: Option<&str>,
) -> (DetectionResult, CompressionDecision) {
    let found = detect_content_type(data, extension_hint);
    let choice = decide_compression(&found, CompressionProfile::Balanced);
    (found, choice)
}
pub fn should_skip_compression(data: &[u8], extension_hint: Option<&str>) -> bool {
let detection = detect_content_type(data, extension_hint);
let decision = decide_compression(&detection, CompressionProfile::Balanced);
matches!(decision.algorithm, CompressionAlgorithm::None)
}
#[cfg(test)]
mod tests {
    //! Unit tests covering magic-byte detection, extension fallback,
    //! entropy heuristics, compression decisions/profiles, the compressor
    //! facade, statistics tracking, and type metadata accessors.
    use super::*;
    use alloc::vec;

    #[test]
    fn test_detect_empty() {
        // Zero-length input short-circuits to the dedicated Empty type/method.
        let result = detect_content_type(&[], None);
        assert_eq!(result.content_type, ContentType::Empty);
        assert_eq!(result.method, DetectionMethod::Empty);
    }

    #[test]
    fn test_detect_jpeg() {
        // JPEG SOI marker (FF D8 FF) followed by an APP0 segment start.
        let jpeg_header = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        let result = detect_content_type(&jpeg_header, None);
        assert_eq!(result.content_type, ContentType::Jpeg);
        assert_eq!(result.method, DetectionMethod::MagicBytes);
    }

    #[test]
    fn test_detect_png() {
        // The fixed 8-byte PNG file signature.
        let png_header = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        let result = detect_content_type(&png_header, None);
        assert_eq!(result.content_type, ContentType::Png);
        assert_eq!(result.method, DetectionMethod::MagicBytes);
    }

    #[test]
    fn test_detect_gif() {
        // Both the GIF87a and GIF89a signatures map to the same type.
        let gif87_header = b"GIF87atest";
        let result = detect_content_type(gif87_header, None);
        assert_eq!(result.content_type, ContentType::Gif);
        let gif89_header = b"GIF89atest";
        let result = detect_content_type(gif89_header, None);
        assert_eq!(result.content_type, ContentType::Gif);
    }

    #[test]
    fn test_detect_zip() {
        // "PK\x03\x04" local-file-header signature.
        let zip_header = vec![0x50, 0x4B, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00];
        let result = detect_content_type(&zip_header, None);
        assert_eq!(result.content_type, ContentType::Zip);
    }

    #[test]
    fn test_detect_gzip() {
        // 1F 8B gzip magic plus a method byte.
        let gzip_header = vec![0x1F, 0x8B, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00];
        let result = detect_content_type(&gzip_header, None);
        assert_eq!(result.content_type, ContentType::Gzip);
    }

    #[test]
    fn test_detect_zstd() {
        // Zstandard frame magic 28 B5 2F FD.
        let zstd_header = vec![0x28, 0xB5, 0x2F, 0xFD, 0x00, 0x00, 0x00, 0x00];
        let result = detect_content_type(&zstd_header, None);
        assert_eq!(result.content_type, ContentType::Zstd);
    }

    #[test]
    fn test_detect_elf() {
        // 0x7F "ELF" magic with trailing ident bytes.
        let elf_header = vec![0x7F, 0x45, 0x4C, 0x46, 0x02, 0x01, 0x01, 0x00];
        let result = detect_content_type(&elf_header, None);
        assert_eq!(result.content_type, ContentType::Elf);
    }

    #[test]
    fn test_detect_pe() {
        // "MZ" DOS header signature.
        let pe_header = vec![0x4D, 0x5A, 0x90, 0x00, 0x03, 0x00, 0x00, 0x00];
        let result = detect_content_type(&pe_header, None);
        assert_eq!(result.content_type, ContentType::Pe);
    }

    #[test]
    fn test_detect_pdf() {
        // "%PDF-" header prefix.
        let pdf_header = b"%PDF-1.4 test";
        let result = detect_content_type(pdf_header, None);
        assert_eq!(result.content_type, ContentType::Pdf);
    }

    #[test]
    fn test_detect_sqlite() {
        // NUL-terminated "SQLite format 3" header string.
        let sqlite_header = b"SQLite format 3\x00test";
        let result = detect_content_type(sqlite_header, None);
        assert_eq!(result.content_type, ContentType::Sqlite);
    }

    #[test]
    fn test_detect_gguf() {
        // "GGUF" magic bytes.
        let gguf_header = vec![0x47, 0x47, 0x55, 0x46, 0x03, 0x00, 0x00, 0x00];
        let result = detect_content_type(&gguf_header, None);
        assert_eq!(result.content_type, ContentType::Gguf);
    }

    #[test]
    fn test_detect_mp4() {
        // ISO BMFF layout: 4-byte box size then "ftyp" at offset 4.
        let mp4_header = vec![0x00, 0x00, 0x00, 0x20, 0x66, 0x74, 0x79, 0x70];
        let result = detect_content_type(&mp4_header, None);
        assert_eq!(result.content_type, ContentType::Mp4);
    }

    #[test]
    fn test_detect_webp() {
        // RIFF container whose form type at offset 8 is "WEBP".
        let mut webp_header = b"RIFF".to_vec();
        webp_header.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
        webp_header.extend_from_slice(b"WEBP");
        let result = detect_content_type(&webp_header, None);
        assert_eq!(result.content_type, ContentType::WebP);
    }

    #[test]
    fn test_detect_wav() {
        // RIFF container whose form type at offset 8 is "WAVE".
        let mut wav_header = b"RIFF".to_vec();
        wav_header.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
        wav_header.extend_from_slice(b"WAVE");
        let result = detect_content_type(&wav_header, None);
        assert_eq!(result.content_type, ContentType::Wav);
    }

    #[test]
    fn test_detect_by_extension() {
        // Per these assertions: matching works with or without a leading
        // dot, is case-insensitive, and unknown extensions yield None.
        assert_eq!(detect_by_extension("jpg"), Some(ContentType::Jpeg));
        assert_eq!(detect_by_extension(".png"), Some(ContentType::Png));
        assert_eq!(detect_by_extension("RS"), Some(ContentType::SourceCode));
        assert_eq!(
            detect_by_extension("safetensors"),
            Some(ContentType::SafeTensors)
        );
        assert_eq!(detect_by_extension("unknown_ext"), None);
    }

    #[test]
    fn test_extension_fallback() {
        // Bytes with no recognizable magic fall back to the extension hint.
        let unknown_data = vec![0x01, 0x02, 0x03, 0x04];
        let result = detect_content_type(&unknown_data, Some("rs"));
        assert_eq!(result.content_type, ContentType::SourceCode);
        assert_eq!(result.method, DetectionMethod::Extension);
    }

    #[test]
    fn test_entropy_calculation() {
        // Constant data: ~0 bits/byte regardless of the byte value.
        let zeros = vec![0u8; 1000];
        let entropy = calculate_entropy(&zeros);
        assert!(entropy < 0.01);
        let same = vec![42u8; 1000];
        let entropy = calculate_entropy(&same);
        assert!(entropy < 0.01);
        // Two equally likely symbols: ~1 bit/byte.
        let alternating: Vec<u8> = (0..1000).map(|i| (i % 2) as u8).collect();
        let entropy = calculate_entropy(&alternating);
        assert!(entropy > 0.9 && entropy < 1.1);
        // Uniform cycle over all 256 byte values: ~8 bits/byte.
        let varied: Vec<u8> = (0..256).cycle().take(1024).map(|x| x as u8).collect();
        let entropy = calculate_entropy(&varied);
        assert!(entropy > 7.9);
    }

    #[test]
    fn test_entropy_classification() {
        // Spot-check each band of the entropy classifier.
        assert_eq!(EntropyClass::from_entropy(0.5), EntropyClass::VeryLow);
        assert_eq!(EntropyClass::from_entropy(3.0), EntropyClass::Low);
        assert_eq!(EntropyClass::from_entropy(5.0), EntropyClass::Medium);
        assert_eq!(EntropyClass::from_entropy(7.0), EntropyClass::High);
        assert_eq!(EntropyClass::from_entropy(7.8), EntropyClass::VeryHigh);
    }

    #[test]
    fn test_detect_text_by_entropy() {
        let text = b"Hello, World! This is a test of the content detection system. It should detect this as plain text based on the character distribution.";
        let result = detect_content_type(text.as_slice(), None);
        // Any textual classification is acceptable for ASCII prose.
        assert!(matches!(
            result.content_type,
            ContentType::PlainText | ContentType::Json | ContentType::Xml
        ));
    }

    #[test]
    fn test_detect_json() {
        let json = br#"{"name": "test", "value": 42, "nested": {"key": "value"}}"#;
        let result = detect_content_type(json.as_slice(), None);
        assert_eq!(result.content_type, ContentType::Json);
    }

    #[test]
    fn test_detect_xml() {
        let xml = br#"<?xml version="1.0"?><root><item>test</item></root>"#;
        let result = detect_content_type(xml.as_slice(), None);
        assert_eq!(result.content_type, ContentType::Xml);
    }

    #[test]
    fn test_compression_decision_already_compressed() {
        // JPEG data is pre-compressed, so recompression is declined.
        let jpeg_header = vec![0xFF, 0xD8, 0xFF, 0xE0];
        let detection = detect_content_type(&jpeg_header, None);
        let decision = decide_compression(&detection, CompressionProfile::Balanced);
        assert_eq!(decision.algorithm, CompressionAlgorithm::None);
        assert_eq!(decision.reason, CompressionReason::AlreadyCompressed);
    }

    #[test]
    fn test_compression_decision_text() {
        // Text gets a general-purpose compressor (Zstd or Lz4).
        let text = b"This is some plain text that should compress well.";
        let detection = detect_content_type(text.as_slice(), Some("txt"));
        let decision = decide_compression(&detection, CompressionProfile::Balanced);
        assert!(matches!(
            decision.algorithm,
            CompressionAlgorithm::Zstd | CompressionAlgorithm::Lz4
        ));
        assert_eq!(decision.reason, CompressionReason::TextData);
    }

    #[test]
    fn test_compression_decision_ai_model() {
        // Detected model weights are routed to the QLoRA algorithm.
        let gguf_header = vec![0x47, 0x47, 0x55, 0x46, 0x03, 0x00, 0x00, 0x00];
        let detection = detect_content_type(&gguf_header, None);
        let decision = decide_compression(&detection, CompressionProfile::Balanced);
        assert_eq!(decision.algorithm, CompressionAlgorithm::QLoRA);
        assert_eq!(decision.reason, CompressionReason::AiModelWeights);
    }

    #[test]
    fn test_compression_profiles() {
        // Each profile maps text to its expected algorithm, and Storage
        // uses a higher level than Balanced.
        let text = b"Test data for compression profile testing.";
        let detection = detect_content_type(text.as_slice(), Some("txt"));
        let speed = decide_compression(&detection, CompressionProfile::Speed);
        let balanced = decide_compression(&detection, CompressionProfile::Balanced);
        let ratio = decide_compression(&detection, CompressionProfile::Ratio);
        let storage = decide_compression(&detection, CompressionProfile::Storage);
        assert_eq!(speed.algorithm, CompressionAlgorithm::Lz4);
        assert_eq!(balanced.algorithm, CompressionAlgorithm::Zstd);
        assert_eq!(ratio.algorithm, CompressionAlgorithm::Zstd);
        assert_eq!(storage.algorithm, CompressionAlgorithm::ZstdHigh);
        assert!(storage.level.unwrap_or(0) > balanced.level.unwrap_or(0));
    }

    #[test]
    fn test_content_aware_compressor() {
        let compressor = ContentAwareCompressor::with_profile(CompressionProfile::Balanced);
        let text = b"This is test data for the content-aware compressor. It should analyze and compress appropriately.";
        let result = compressor.compress(text.as_slice(), Some("txt"));
        assert_eq!(result.original_size, text.len());
        assert!(matches!(
            result.content_type,
            ContentType::PlainText | ContentType::Unknown
        ));
    }

    #[test]
    fn test_compressor_skip_small() {
        // Inputs under the minimum size are passed through uncompressed.
        let mut compressor = ContentAwareCompressor::new();
        compressor.set_min_compress_size(100);
        let small_data = b"tiny";
        let result = compressor.compress(small_data.as_slice(), None);
        assert_eq!(result.algorithm, CompressionAlgorithm::None);
        assert_eq!(result.ratio, 1.0);
    }

    #[test]
    fn test_statistics_tracking() {
        // Two compressions accumulate into both global and per-type totals.
        let stats = ContentAwareStats::new();
        stats.record_analysis(ContentType::PlainText);
        stats.record_compression(
            ContentType::PlainText,
            1000,
            500,
            CompressionAlgorithm::Zstd,
        );
        stats.record_compression(
            ContentType::PlainText,
            2000,
            800,
            CompressionAlgorithm::Zstd,
        );
        assert_eq!(stats.files_analyzed(), 1);
        assert_eq!(stats.total_bytes_in(), 3000);
        assert_eq!(stats.total_bytes_out(), 1300);
        assert_eq!(stats.total_space_saved(), 1700);
        let type_stats = stats.get_type_stats(ContentType::PlainText).unwrap();
        assert_eq!(type_stats.count, 2);
        assert_eq!(type_stats.bytes_in, 3000);
        assert_eq!(type_stats.bytes_out, 1300);
    }

    #[test]
    fn test_statistics_reset() {
        // reset() zeroes both byte totals and file counters.
        let stats = ContentAwareStats::new();
        stats.record_compression(ContentType::Json, 1000, 500, CompressionAlgorithm::Zstd);
        assert_eq!(stats.total_bytes_in(), 1000);
        stats.reset();
        assert_eq!(stats.total_bytes_in(), 0);
        assert_eq!(stats.files_analyzed(), 0);
    }

    #[test]
    fn test_content_type_category() {
        // One representative type per category.
        assert_eq!(
            ContentType::Jpeg.category(),
            ContentCategory::CompressedImage
        );
        assert_eq!(
            ContentType::Bmp.category(),
            ContentCategory::UncompressedImage
        );
        assert_eq!(
            ContentType::Mp4.category(),
            ContentCategory::CompressedVideo
        );
        assert_eq!(ContentType::Zip.category(), ContentCategory::Archive);
        assert_eq!(ContentType::Elf.category(), ContentCategory::Executable);
        assert_eq!(
            ContentType::SafeTensors.category(),
            ContentCategory::AiModel
        );
        assert_eq!(ContentType::PlainText.category(), ContentCategory::Text);
    }

    #[test]
    fn test_category_precompressed() {
        assert!(ContentCategory::CompressedImage.is_precompressed());
        assert!(ContentCategory::Archive.is_precompressed());
        assert!(!ContentCategory::Text.is_precompressed());
        assert!(!ContentCategory::Executable.is_precompressed());
    }

    #[test]
    fn test_convenience_functions() {
        // Exercises the three free-function wrappers end to end.
        let data = b"Test data for convenience function testing";
        let result = compress_content_aware(data.as_slice(), Some("txt"));
        assert!(result.original_size > 0);
        let (detection, decision) = analyze_content(data.as_slice(), Some("txt"));
        assert!(detection.confidence > 0.0);
        assert!(decision.expected_ratio < 1.0);
        // ZIP data is already compressed and should be skipped.
        let zip_header = vec![0x50, 0x4B, 0x03, 0x04];
        assert!(should_skip_compression(&zip_header, None));
    }

    #[test]
    fn test_content_type_name() {
        assert_eq!(ContentType::Jpeg.name(), "JPEG Image");
        assert_eq!(ContentType::SafeTensors.name(), "SafeTensors Model");
        assert_eq!(ContentType::Unknown.name(), "Unknown");
    }

    #[test]
    fn test_content_type_extensions() {
        assert!(ContentType::Jpeg.extensions().contains(&"jpg"));
        assert!(
            ContentType::SafeTensors
                .extensions()
                .contains(&"safetensors")
        );
        assert!(ContentType::Empty.extensions().is_empty());
    }

    #[test]
    fn test_compression_algorithm_properties() {
        assert_eq!(CompressionAlgorithm::None.name(), "none");
        assert_eq!(CompressionAlgorithm::Lz4.name(), "lz4");
        // Lz4 is expected to rank as faster than Zstd.
        assert!(
            CompressionAlgorithm::Lz4.expected_speed()
                > CompressionAlgorithm::Zstd.expected_speed()
        );
    }

    #[test]
    fn test_type_stats_ratio() {
        // Ratio defaults to 1.0 with no data, else bytes_out / bytes_in.
        let mut stats = TypeStats::default();
        assert_eq!(stats.ratio(), 1.0);
        stats.bytes_in = 1000;
        stats.bytes_out = 500;
        assert!((stats.ratio() - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_html_detection() {
        // Detected both with and without a DOCTYPE preamble.
        let html1 = b"<!DOCTYPE html><html><body>Test</body></html>";
        let result = detect_content_type(html1.as_slice(), None);
        assert_eq!(result.content_type, ContentType::Html);
        let html2 = b"<html><head></head><body>Test</body></html>";
        let result = detect_content_type(html2.as_slice(), None);
        assert_eq!(result.content_type, ContentType::Html);
    }

    #[test]
    fn test_flac_detection() {
        // "fLaC" stream marker.
        let flac_header = b"fLaC\x00\x00\x00\x22";
        let result = detect_content_type(flac_header, None);
        assert_eq!(result.content_type, ContentType::Flac);
    }

    #[test]
    fn test_ogg_detection() {
        // "OggS" page capture pattern.
        let ogg_header = b"OggS\x00\x02\x00\x00";
        let result = detect_content_type(ogg_header, None);
        assert_eq!(result.content_type, ContentType::Ogg);
    }

    #[test]
    fn test_wasm_detection() {
        // "\0asm" magic followed by the version field.
        let wasm_header = vec![0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00];
        let result = detect_content_type(&wasm_header, None);
        assert_eq!(result.content_type, ContentType::Wasm);
    }
}