use std::io::{Cursor, Read};
use zip::ZipArchive;
/// A byte signature ("magic number") associated with a MIME type.
///
/// A pattern matches when the bytes of the input starting at `offset` equal
/// `pattern`, optionally after both sides are AND-ed with a per-byte `mask`.
#[derive(Debug, Clone)]
pub struct MagicPattern {
    /// MIME type this signature stands for.
    pub mime_type: String,
    /// Position in the input at which `pattern` is compared.
    pub offset: usize,
    /// Bytes expected at `offset`.
    pub pattern: Vec<u8>,
    /// Optional bit mask applied to both input and pattern before comparing.
    pub mask: Option<Vec<u8>>,
}

impl MagicPattern {
    /// Creates an unmasked pattern: the input must contain `pattern` verbatim
    /// at `offset`.
    pub fn new(mime_type: &str, offset: usize, pattern: Vec<u8>) -> Self {
        Self {
            mime_type: mime_type.to_string(),
            offset,
            pattern,
            mask: None,
        }
    }

    /// Creates a masked pattern: only the bits set in `mask` take part in the
    /// comparison.
    ///
    /// `mask` is paired with `pattern` byte by byte. If it is shorter than
    /// `pattern`, the uncovered trailing bytes are compared exactly; mask
    /// bytes beyond the pattern length are ignored.
    pub fn with_mask(mime_type: &str, offset: usize, pattern: Vec<u8>, mask: Vec<u8>) -> Self {
        Self {
            mime_type: mime_type.to_string(),
            offset,
            pattern,
            mask: Some(mask),
        }
    }

    /// Returns `true` when `data` carries this signature at `self.offset`.
    ///
    /// Never panics: input that is too short, an offset whose end position
    /// does not fit in `usize`, and a mask shorter than the pattern are all
    /// handled without indexing out of bounds.
    pub fn matches(&self, data: &[u8]) -> bool {
        // `checked_add` guards against `offset + len` wrapping around, and
        // `get` returns `None` instead of panicking when `data` is too short.
        let window = match self
            .offset
            .checked_add(self.pattern.len())
            .and_then(|end| data.get(self.offset..end))
        {
            Some(window) => window,
            None => return false,
        };

        match &self.mask {
            None => window == self.pattern.as_slice(),
            Some(mask) => {
                // Pad a short mask with 0xFF so the remaining bytes must match exactly.
                let mask_bytes = mask.iter().copied().chain(std::iter::repeat(0xFF));
                window
                    .iter()
                    .zip(self.pattern.iter())
                    .zip(mask_bytes)
                    .all(|((&actual, &expected), bits)| (actual & bits) == (expected & bits))
            }
        }
    }
}
/// Identifies ZIP-packaged office documents (OpenDocument and Office Open XML).
///
/// `data` must start with the ZIP local-file-header signature `PK\x03\x04`
/// and be readable as a ZIP archive; otherwise `None` is returned.
///
/// Detection order:
/// 1. A `mimetype` entry whose trimmed content starts with
///    `application/vnd.oasis.opendocument.` is returned as-is.
/// 2. Otherwise `[Content_Types].xml` is inspected for the spreadsheet,
///    presentation or word-processing content type.
///
/// Returns `None` when neither entry yields a recognised type.
pub fn detect_openxml_type(data: &[u8]) -> Option<String> {
    if !data.starts_with(b"PK\x03\x04") {
        return None;
    }
    let mut archive = ZipArchive::new(Cursor::new(data)).ok()?;

    // OpenDocument packages carry their MIME type in a dedicated entry.
    if let Ok(mut mimetype_file) = archive.by_name("mimetype") {
        let mut mimetype = String::new();
        if mimetype_file.read_to_string(&mut mimetype).is_ok() {
            let mimetype = mimetype.trim();
            if mimetype.starts_with("application/vnd.oasis.opendocument.") {
                return Some(mimetype.to_string());
            }
        }
    }

    // Office Open XML packages declare their part types in [Content_Types].xml.
    if let Ok(mut content_types_file) = archive.by_name("[Content_Types].xml") {
        let mut content = String::new();
        if content_types_file.read_to_string(&mut content).is_ok() {
            return classify_ooxml_content_types(&content).map(String::from);
        }
    }
    None
}

/// Maps the text of `[Content_Types].xml` to an Office Open XML MIME type.
///
/// The main document part is declared with the document MIME type followed by
/// `.main+xml`, so that form is looked for first. A package that merely
/// *embeds* another document (for example a presentation containing a chart
/// workbook) lists the embedded file's plain MIME type as well; checking the
/// plain names first would report the embedded type instead of the package's
/// own. If no main-part declaration is found, the plain names are checked in
/// the order spreadsheet, presentation, word-processing.
fn classify_ooxml_content_types(content: &str) -> Option<&'static str> {
    const OOXML_TYPES: [&str; 3] = [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ];
    const MAIN_PART_SUFFIX: &str = ".main+xml";

    // True when some occurrence of `mime` is immediately followed by ".main+xml".
    let declares_main_part = |mime: &str| {
        content
            .match_indices(mime)
            .any(|(at, _)| content[at + mime.len()..].starts_with(MAIN_PART_SUFFIX))
    };

    OOXML_TYPES
        .iter()
        .copied()
        .find(|mime| declares_main_part(*mime))
        .or_else(|| {
            OOXML_TYPES
                .iter()
                .copied()
                .find(|mime| content.contains(*mime))
        })
}
pub fn detect_ole2_type(data: &[u8]) -> Option<(String, f32)> {
if data.len() < 512 || &data[0..8] != b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" {
return None;
}
if contains_stream_name(data, b"WordDocument") ||
contains_stream_name(data, b"Word") {
return Some(("application/msword".to_string(), 0.90));
}
if contains_stream_name(data, b"Workbook") ||
contains_stream_name(data, b"Book") {
return Some(("application/vnd.ms-excel".to_string(), 0.90));
}
if contains_stream_name(data, b"PowerPoint Document") ||
contains_stream_name(data, b"Current User") {
return Some(("application/vnd.ms-powerpoint".to_string(), 0.90));
}
None
}
/// Reports whether `stream_name` occurs anywhere in `data`, either as the raw
/// bytes given or in the two-bytes-per-character form produced by
/// `to_utf16le`.
///
/// The raw form is tried first; the widened form is only built when the raw
/// search finds nothing.
fn contains_stream_name(data: &[u8], stream_name: &[u8]) -> bool {
    search_bytes(data, stream_name) || search_bytes(data, &to_utf16le(stream_name))
}
/// Returns `true` when `pattern` appears as a contiguous run inside `data`.
///
/// An empty `pattern` never matches, and a `pattern` longer than `data`
/// cannot match.
fn search_bytes(data: &[u8], pattern: &[u8]) -> bool {
    // `windows(0)` would panic, so the empty pattern is rejected up front.
    // When `data` is shorter than `pattern`, `windows` yields nothing.
    !pattern.is_empty()
        && data
            .windows(pattern.len())
            .any(|candidate| candidate == pattern)
}
/// Widens each input byte to two bytes by appending a zero byte after it.
///
/// For ASCII input this yields the UTF-16LE encoding of the same text. The
/// output is always exactly twice as long as the input.
fn to_utf16le(ascii: &[u8]) -> Vec<u8> {
    let mut widened = Vec::with_capacity(ascii.len() * 2);
    widened.extend(ascii.iter().flat_map(|&low| [low, 0x00]));
    widened
}
pub fn get_magic_patterns() -> Vec<MagicPattern> {
vec![
MagicPattern::new("application/pdf", 0, b"%PDF-".to_vec()),
MagicPattern::new("application/vnd.openxmlformats-officedocument.wordprocessingml.document", 0,
vec![0x50, 0x4B, 0x03, 0x04]), MagicPattern::new("application/vnd.oasis.opendocument.text", 0,
vec![0x50, 0x4B, 0x03, 0x04]), MagicPattern::new("application/rtf", 0, b"{\\rtf".to_vec()),
MagicPattern::new("image/jpeg", 0, vec![0xFF, 0xD8, 0xFF]),
MagicPattern::new("image/png", 0, vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]),
MagicPattern::new("image/gif", 0, b"GIF87a".to_vec()),
MagicPattern::new("image/gif", 0, b"GIF89a".to_vec()),
MagicPattern::new("image/tiff", 0, vec![0x49, 0x49, 0x2A, 0x00]), MagicPattern::new("image/tiff", 0, vec![0x4D, 0x4D, 0x00, 0x2A]), MagicPattern::new("image/bmp", 0, b"BM".to_vec()),
MagicPattern::new("image/webp", 8, b"WEBP".to_vec()),
MagicPattern::new("image/x-icon", 0, vec![0x00, 0x00, 0x01, 0x00]),
MagicPattern::new("image/svg+xml", 0, b"<svg".to_vec()),
MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x03, 0x04]),
MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x05, 0x06]), MagicPattern::new("application/zip", 0, vec![0x50, 0x4B, 0x07, 0x08]), MagicPattern::new("application/x-tar", 257, b"ustar".to_vec()),
MagicPattern::new("application/gzip", 0, vec![0x1F, 0x8B]),
MagicPattern::new("application/x-bzip2", 0, b"BZh".to_vec()),
MagicPattern::new("application/x-7z-compressed", 0, vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C]),
MagicPattern::new("application/x-rar-compressed", 0, b"Rar!\x1A\x07".to_vec()),
MagicPattern::new("application/x-xz", 0, vec![0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00]),
MagicPattern::new("text/html", 0, b"<!DOCTYPE html".to_vec()),
MagicPattern::new("text/html", 0, b"<html".to_vec()),
MagicPattern::new("text/html", 0, b"<HTML".to_vec()),
MagicPattern::new("text/xml", 0, b"<?xml".to_vec()),
MagicPattern::new("application/json", 0, b"{".to_vec()),
MagicPattern::new("application/json", 0, b"[".to_vec()),
MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xFB]),
MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xF3]),
MagicPattern::new("audio/mpeg", 0, vec![0xFF, 0xF2]),
MagicPattern::new("audio/mpeg", 0, b"ID3".to_vec()),
MagicPattern::new("audio/wav", 0, b"RIFF".to_vec()),
MagicPattern::new("audio/x-flac", 0, b"fLaC".to_vec()),
MagicPattern::new("audio/ogg", 0, b"OggS".to_vec()),
MagicPattern::new("audio/x-m4a", 4, b"ftyp".to_vec()),
MagicPattern::new("video/mp4", 4, b"ftyp".to_vec()),
MagicPattern::new("video/x-msvideo", 0, b"RIFF".to_vec()),
MagicPattern::new("video/x-matroska", 0, vec![0x1A, 0x45, 0xDF, 0xA3]),
MagicPattern::new("video/webm", 0, vec![0x1A, 0x45, 0xDF, 0xA3]),
MagicPattern::new("video/quicktime", 4, b"moov".to_vec()),
MagicPattern::new("video/quicktime", 4, b"mdat".to_vec()),
MagicPattern::new("application/x-executable", 0, vec![0x7F, 0x45, 0x4C, 0x46]), MagicPattern::new("application/x-mach-binary", 0, vec![0xFE, 0xED, 0xFA, 0xCE]), MagicPattern::new("application/x-mach-binary", 0, vec![0xFE, 0xED, 0xFA, 0xCF]), MagicPattern::new("application/x-msdownload", 0, b"MZ".to_vec()),
MagicPattern::new("font/ttf", 0, vec![0x00, 0x01, 0x00, 0x00]),
MagicPattern::new("font/otf", 0, b"OTTO".to_vec()),
MagicPattern::new("font/woff", 0, b"wOFF".to_vec()),
MagicPattern::new("font/woff2", 0, b"wOF2".to_vec()),
MagicPattern::new("application/x-sqlite3", 0, b"SQLite format 3\0".to_vec()),
MagicPattern::new("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 0,
vec![0x50, 0x4B, 0x03, 0x04]), MagicPattern::new("application/vnd.openxmlformats-officedocument.presentationml.presentation", 0,
vec![0x50, 0x4B, 0x03, 0x04]), MagicPattern::new("application/vnd.oasis.opendocument.spreadsheet", 0,
vec![0x50, 0x4B, 0x03, 0x04]), MagicPattern::new("application/vnd.oasis.opendocument.presentation", 0,
vec![0x50, 0x4B, 0x03, 0x04]), ]
}
/// Unit tests for pattern matching, the OLE2 heuristics and the byte helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Signature that opens every OLE2 compound file.
    const OLE2_SIGNATURE: &[u8; 8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";

    /// Builds a 1024-byte buffer that starts with the OLE2 signature and
    /// carries `stream_name`, widened by `to_utf16le`, at offset 512.
    fn ole2_fixture(stream_name: &[u8]) -> Vec<u8> {
        let mut data = vec![0u8; 1024];
        data[0..8].copy_from_slice(OLE2_SIGNATURE);
        let encoded = to_utf16le(stream_name);
        data[512..512 + encoded.len()].copy_from_slice(&encoded);
        data
    }

    #[test]
    fn test_magic_pattern_matches() {
        let pattern = MagicPattern::new("application/pdf", 0, b"%PDF-".to_vec());
        assert!(pattern.matches(b"%PDF-1.4"));
        assert!(!pattern.matches(b"Not a PDF"));
    }

    #[test]
    fn test_magic_pattern_with_offset() {
        let pattern = MagicPattern::new("application/x-tar", 257, b"ustar".to_vec());
        let mut data = vec![0u8; 262];
        data[257..262].copy_from_slice(b"ustar");
        assert!(pattern.matches(&data));
    }

    #[test]
    fn test_magic_pattern_with_mask() {
        let pattern = MagicPattern::with_mask(
            "test/masked",
            0,
            vec![0xF0, 0x0F],
            vec![0xF0, 0x0F],
        );
        // Only the masked bits count: the low nibble of the first byte is ignored.
        assert!(pattern.matches(&[0xF5, 0x0F]));
        assert!(!pattern.matches(&[0x0F, 0xF0]));
    }

    #[test]
    fn test_get_magic_patterns_count() {
        let patterns = get_magic_patterns();
        assert!(patterns.len() >= 50, "Should have at least 50 MIME types");
    }

    #[test]
    fn test_detect_ole2_type_with_word_document() {
        let data = ole2_fixture(b"WordDocument");
        let result = detect_ole2_type(&data);
        assert!(result.is_some());
        let (mime_type, confidence) = result.unwrap();
        assert_eq!(mime_type, "application/msword");
        assert_eq!(confidence, 0.90);
    }

    #[test]
    fn test_detect_ole2_type_with_workbook() {
        let data = ole2_fixture(b"Workbook");
        let result = detect_ole2_type(&data);
        assert!(result.is_some());
        let (mime_type, confidence) = result.unwrap();
        assert_eq!(mime_type, "application/vnd.ms-excel");
        assert_eq!(confidence, 0.90);
    }

    #[test]
    fn test_detect_ole2_type_with_powerpoint() {
        let data = ole2_fixture(b"Current User");
        let result = detect_ole2_type(&data);
        assert!(result.is_some());
        let (mime_type, confidence) = result.unwrap();
        assert_eq!(mime_type, "application/vnd.ms-powerpoint");
        assert_eq!(confidence, 0.90);
    }

    #[test]
    fn test_detect_ole2_type_invalid_header() {
        // Long enough, but the signature bytes are all zero.
        let data = vec![0u8; 512];
        let result = detect_ole2_type(&data);
        assert!(result.is_none());
    }

    #[test]
    fn test_detect_ole2_type_too_short() {
        let data = vec![0xD0, 0xCF, 0x11, 0xE0];
        let result = detect_ole2_type(&data);
        assert!(result.is_none());
    }

    #[test]
    fn test_to_utf16le() {
        let ascii = b"Test";
        let utf16le = to_utf16le(ascii);
        assert_eq!(utf16le, vec![b'T', 0, b'e', 0, b's', 0, b't', 0]);
    }

    #[test]
    fn test_search_bytes() {
        let data = b"Hello World";
        assert!(search_bytes(data, b"World"));
        assert!(search_bytes(data, b"Hello"));
        assert!(!search_bytes(data, b"Goodbye"));
        assert!(!search_bytes(data, b""));
    }

    #[test]
    fn test_contains_stream_name() {
        let mut data = vec![0u8; 1024];
        let word_doc_utf16 = to_utf16le(b"WordDocument");
        data[100..100 + word_doc_utf16.len()].copy_from_slice(&word_doc_utf16);
        assert!(contains_stream_name(&data, b"WordDocument"));
        assert!(!contains_stream_name(&data, b"Workbook"));
    }
}