use std::io::Cursor;
use std::path::Path;
use crate::error::Result;
use crate::ftguess::types::*;
use crate::ole::clsid;
use crate::ole::container::OleFile;
use crate::ooxml::content_types::ContentTypes;
/// Namespace type for file-type detection.
///
/// The struct carries no data; detection is exposed through its associated
/// functions, which classify either a file on disk or an in-memory buffer.
pub struct FileTypeGuesser;
impl FileTypeGuesser {
/// Reads the whole file at `path` into memory and classifies its contents.
///
/// # Errors
///
/// Returns an error if the file cannot be read, or if `from_bytes` reports
/// one for the loaded buffer.
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<FileTypeResult> {
    let bytes = std::fs::read(path.as_ref())?;
    Self::from_bytes(bytes.as_slice())
}
/// Identifies the type of a file from its raw bytes.
///
/// Signatures are probed in a fixed order and the first match wins: RTF, OLE,
/// ZIP (followed by OOXML inspection), PDF, PNG, JPEG, GIF, PE, ELF and
/// OneNote. When none matches, the first 2000 bytes are scanned as UTF-8 text
/// for Flat OPC and Word 2003 XML markers. Empty or unrecognised input yields
/// `FileTypeResult::default()`.
///
/// # Errors
///
/// Propagates any error returned by the OLE or OOXML detectors.
pub fn from_bytes(data: &[u8]) -> Result<FileTypeResult> {
    // Number of leading bytes inspected by the text-based heuristics.
    const TEXT_SNIFF_LEN: usize = 2000;

    if data.is_empty() {
        return Ok(FileTypeResult::default());
    }
    if Self::is_rtf(data) {
        return Ok(Self::detect_rtf());
    }
    if OleFile::is_ole(data) {
        return Self::detect_ole(data);
    }
    if Self::is_zip(data) {
        return Self::detect_ooxml(data);
    }
    if Self::is_pdf(data) {
        return Ok(Self::detect_pdf());
    }
    if Self::is_png(data) {
        return Ok(Self::detect_png());
    }
    if Self::is_jpeg(data) {
        return Ok(Self::detect_jpeg());
    }
    if Self::is_gif(data) {
        return Ok(Self::detect_gif());
    }
    if Self::is_pe(data) {
        return Ok(Self::detect_pe());
    }
    if Self::is_elf(data) {
        return Ok(Self::detect_elf());
    }
    if Self::is_onenote(data) {
        return Ok(Self::detect_onenote());
    }

    let head = &data[..data.len().min(TEXT_SNIFF_LEN)];
    let text = match std::str::from_utf8(head) {
        Ok(text) => Some(text),
        // `error_len() == None` means the input ended in the middle of a
        // character. That happens whenever the fixed-size cut lands inside a
        // multi-byte sequence, so keep the valid prefix instead of giving up
        // on an otherwise well-formed document.
        Err(err) if err.error_len().is_none() => {
            std::str::from_utf8(&head[..err.valid_up_to()]).ok()
        }
        // Genuinely invalid UTF-8: not a text format handled here.
        Err(_) => None,
    };

    if let Some(text) = text {
        if text.contains("<?mso-application") || text.contains("pkg:package") {
            return Ok(Self::detect_flat_opc());
        }
        if text.contains("<?xml")
            && (text.contains("urn:schemas-microsoft-com:office:word")
                || text.contains("w:wordDocument"))
        {
            return Ok(Self::detect_word2003xml());
        }
    }
    Ok(FileTypeResult::default())
}
/// Returns `true` when `data` begins with the four bytes `{\rt`.
fn is_rtf(data: &[u8]) -> bool {
    data.starts_with(b"{\\rt")
}
/// Returns `true` when `data` begins with the bytes `PK\x03\x04`.
fn is_zip(data: &[u8]) -> bool {
    const SIGNATURE: [u8; 4] = [0x50, 0x4B, 0x03, 0x04];
    data.starts_with(&SIGNATURE)
}
/// Returns `true` when `data` begins with the five bytes `%PDF-`.
fn is_pdf(data: &[u8]) -> bool {
    data.starts_with(b"%PDF-")
}
/// Returns `true` when `data` begins with the eight bytes
/// `89 50 4E 47 0D 0A 1A 0A`.
fn is_png(data: &[u8]) -> bool {
    const SIGNATURE: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
    data.starts_with(&SIGNATURE)
}
/// Returns `true` when `data` begins with the three bytes `FF D8 FF`.
fn is_jpeg(data: &[u8]) -> bool {
    const SIGNATURE: [u8; 3] = [0xFF, 0xD8, 0xFF];
    data.starts_with(&SIGNATURE)
}
/// Returns `true` when `data` begins with the four bytes `GIF8`.
///
/// Only this prefix is compared; the bytes that follow are not inspected.
fn is_gif(data: &[u8]) -> bool {
    data.starts_with(b"GIF8")
}
/// Returns `true` when `data` begins with the two bytes `MZ` (`4D 5A`).
///
/// Nothing beyond these two bytes is validated.
fn is_pe(data: &[u8]) -> bool {
    data.starts_with(b"MZ")
}
/// Returns `true` when `data` begins with the four bytes `7F 45 4C 46`
/// (`\x7FELF`).
fn is_elf(data: &[u8]) -> bool {
    data.starts_with(b"\x7FELF")
}
/// Returns `true` when `data` begins with the 16-byte signature that this
/// module treats as the OneNote file header.
fn is_onenote(data: &[u8]) -> bool {
    const SIGNATURE: [u8; 16] = [
        0xE4, 0x52, 0x5C, 0x7B, 0x8C, 0xD8, 0xA7, 0x4D, //
        0xAE, 0xB1, 0x53, 0x78, 0xD0, 0x29, 0x96, 0xD3,
    ];
    data.starts_with(&SIGNATURE)
}
/// Builds the fixed result reported for RTF input: Word as the application,
/// the RTF container, and no VBA or XLM macro capability.
fn detect_rtf() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/rtf",
        file_type: FileType::Rtf,
        application: Application::MsWord,
        container: Container::Rtf,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Classifies an OLE compound file.
///
/// The root storage CLSID is matched against a table of known identifiers
/// first. When it is not in the table, the stream names are searched
/// (case-insensitively, by substring) for Word, then Excel, then PowerPoint
/// markers; if none is found the file is reported as `OleUnknown` and the
/// CLSID is logged at debug level.
///
/// `may_contain_vba` is set when some stream name contains `vba` and ends
/// with `/dir`.
///
/// # Errors
///
/// Returns an error if `OleFile::from_bytes` cannot parse `data`.
fn detect_ole(data: &[u8]) -> Result<FileTypeResult> {
    const MIME_WORD: &str = "application/msword";
    const MIME_EXCEL: &str = "application/vnd.ms-excel";
    const MIME_POWERPOINT: &str = "application/vnd.ms-powerpoint";

    let ole = OleFile::from_bytes(data)?;
    let root_clsid = ole.root_clsid();
    let stream_names = ole.list_streams();

    // Lower-case every stream name once; all heuristics below are substring
    // tests on the lower-cased form.
    let lowered: Vec<_> = stream_names
        .iter()
        .map(|name| name.to_lowercase())
        .collect();
    let mentions = |needle: &str| lowered.iter().any(|name| name.contains(needle));

    let may_contain_vba = lowered
        .iter()
        .any(|name| name.contains("vba") && name.ends_with("/dir"));

    let clsid_str = root_clsid.to_string().to_lowercase();
    let (file_type, application, may_contain_xlm, mime_type) = match clsid_str.as_str() {
        "00020906-0000-0000-c000-000000000046" => {
            (FileType::Word97, Application::MsWord, false, MIME_WORD)
        }
        "00020900-0000-0000-c000-000000000046" => {
            (FileType::Word6, Application::MsWord, false, MIME_WORD)
        }
        "00020810-0000-0000-c000-000000000046" | "00020820-0000-0000-c000-000000000046" => {
            (FileType::Excel97, Application::MsExcel, true, MIME_EXCEL)
        }
        "00020811-0000-0000-c000-000000000046" => {
            (FileType::Excel5, Application::MsExcel, true, MIME_EXCEL)
        }
        "64818d10-4f9b-11cf-86ea-00aa00b929e8" | "64818d11-4f9b-11cf-86ea-00aa00b929e8" => (
            FileType::Powerpoint97,
            Application::MsPowerpoint,
            false,
            MIME_POWERPOINT,
        ),
        "000c1084-0000-0000-c000-000000000046" | "000c1086-0000-0000-c000-000000000046" => {
            (FileType::Msi, Application::MsOffice, false, "application/x-msi")
        }
        "00021a20-0000-0000-c000-000000000046" => {
            (FileType::Visio, Application::MsVisio, false, "application/vnd.visio")
        }
        "73a4c9c1-d68d-11d0-98bf-00a0c90dc8d9" => {
            (FileType::Access, Application::MsAccess, false, "application/msaccess")
        }
        "0002123b-0000-0000-c000-000000000046" => (
            FileType::Publisher,
            Application::MsPublisher,
            false,
            "application/x-mspublisher",
        ),
        // Unlisted CLSID: fall back to stream-name heuristics. Word markers
        // take precedence over workbook markers, which take precedence over
        // PowerPoint markers.
        _ => {
            if mentions("worddocument") || mentions("1table") || mentions("0table") {
                (FileType::Word97, Application::MsWord, false, MIME_WORD)
            } else if mentions("workbook") || mentions("book") {
                (FileType::Excel97, Application::MsExcel, true, MIME_EXCEL)
            } else if mentions("powerpoint") {
                (
                    FileType::Powerpoint97,
                    Application::MsPowerpoint,
                    false,
                    MIME_POWERPOINT,
                )
            } else {
                let desc = clsid::lookup_clsid(&root_clsid);
                log::debug!("Unknown OLE CLSID: {} ({:?})", root_clsid, desc);
                (
                    FileType::OleUnknown,
                    Application::MsOffice,
                    false,
                    "application/x-ole-storage",
                )
            }
        }
    };

    Ok(FileTypeResult {
        file_type,
        container: Container::Ole,
        application,
        may_contain_vba,
        may_contain_xlm,
        mime_type,
    })
}
/// Classifies a ZIP archive, recognising Office Open XML packages.
///
/// An archive that cannot be opened, or that lacks `_rels/.rels`, is reported
/// as `ZipUnknown` in a `Zip` container. Otherwise the package type is derived
/// from `[Content_Types].xml`; if that part is missing, empty, unreadable,
/// larger than the size cap, or unparsable, the file type stays `ZipUnknown`
/// while the container is still reported as `OpenXml`.
///
/// `may_contain_vba` is set when an entry name ends with `vbaproject.bin`
/// (case-insensitive) or when the detected type is one of the macro-enabled
/// variants.
///
/// # Errors
///
/// Currently always returns `Ok`; the `Result` is kept so the signature
/// matches the other container detectors.
fn detect_ooxml(data: &[u8]) -> Result<FileTypeResult> {
    // Cap on the decompressed size of `[Content_Types].xml`. The archive is
    // untrusted input, so an unbounded read would let a tiny crafted entry
    // inflate into an arbitrarily large allocation.
    const MAX_CONTENT_TYPES_LEN: u64 = 16 * 1024 * 1024;

    let unknown_zip = || FileTypeResult {
        file_type: FileType::ZipUnknown,
        container: Container::Zip,
        ..Default::default()
    };

    let Ok(mut archive) = zip::ZipArchive::new(Cursor::new(data)) else {
        return Ok(unknown_zip());
    };
    if archive.by_name("_rels/.rels").is_err() {
        return Ok(unknown_zip());
    }

    let has_vba = (0..archive.len()).any(|i| {
        archive
            .by_index(i)
            .ok()
            .is_some_and(|e| e.name().to_lowercase().ends_with("vbaproject.bin"))
    });

    let content_types = archive
        .by_name("[Content_Types].xml")
        .ok()
        .and_then(|entry| {
            use std::io::Read;
            let mut ct_data = Vec::new();
            // Read one byte past the cap so an oversized part can be told
            // apart from one that is exactly at the limit. A failed read
            // discards the partial buffer instead of parsing truncated XML.
            entry
                .take(MAX_CONTENT_TYPES_LEN + 1)
                .read_to_end(&mut ct_data)
                .ok()?;
            let within_cap =
                u64::try_from(ct_data.len()).is_ok_and(|len| len <= MAX_CONTENT_TYPES_LEN);
            if ct_data.is_empty() || !within_cap {
                return None;
            }
            ContentTypes::parse(&ct_data).ok()
        });

    let (file_type, application, may_contain_xlm, mime_type) = match content_types {
        Some(ref ct) => Self::detect_from_content_types(ct),
        None => (FileType::ZipUnknown, Application::Unknown, false, "application/zip"),
    };

    // Macro-enabled package types can carry VBA even when no `vbaProject.bin`
    // entry was listed.
    let may_contain_vba = has_vba
        || matches!(
            file_type,
            FileType::Word2007Docm
                | FileType::Word2007Dotm
                | FileType::Excel2007Xlsm
                | FileType::Excel2007Xltm
                | FileType::Excel2007Xlam
                | FileType::Excel2007Xlsb
                | FileType::Powerpoint2007Pptm
                | FileType::Powerpoint2007Ppsm
                | FileType::Powerpoint2007Potm
                | FileType::Powerpoint2007Ppam
        );

    Ok(FileTypeResult {
        file_type,
        container: Container::OpenXml,
        application,
        may_contain_vba,
        may_contain_xlm,
        mime_type,
    })
}
/// Derives the package type from a parsed `[Content_Types].xml`.
///
/// The main content type reported by `ContentTypes::main_content_type` is
/// used when present. Otherwise the override entries are scanned for
/// content types in the `wordprocessingml`, `spreadsheetml` or
/// `presentationml` families, and the first one that maps to a known file
/// type is returned.
///
/// Returns the `ZipUnknown` tuple when nothing resolves.
fn detect_from_content_types(
    ct: &ContentTypes,
) -> (FileType, Application, bool, &'static str) {
    if let Some(main_ct) = ct.main_content_type() {
        return Self::content_type_to_file_type(main_ct);
    }
    for content_type in ct.overrides.values() {
        let in_office_family = content_type.contains("wordprocessingml")
            || content_type.contains("spreadsheetml")
            || content_type.contains("presentationml");
        if !in_office_family {
            continue;
        }
        // Auxiliary parts (styles, settings, ...) share the family name but do
        // not identify the document type; keep scanning instead of returning
        // the first unresolved hit.
        let detected = Self::content_type_to_file_type(content_type);
        if !matches!(detected.0, FileType::ZipUnknown) {
            return detected;
        }
    }
    (FileType::ZipUnknown, Application::Unknown, false, "application/zip")
}
fn content_type_to_file_type(ct: &str) -> (FileType, Application, bool, &'static str) {
if ct.contains("wordprocessingml.document.main") {
if ct.contains("macroEnabled") {
return (FileType::Word2007Docm, Application::MsWord, false,
"application/vnd.ms-word.document.macroEnabled.12");
}
return (FileType::Word2007Docx, Application::MsWord, false,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
}
if ct.contains("wordprocessingml.template") {
if ct.contains("macroEnabled") {
return (FileType::Word2007Dotm, Application::MsWord, false,
"application/vnd.ms-word.template.macroEnabled.12");
}
return (FileType::Word2007Dotx, Application::MsWord, false,
"application/vnd.openxmlformats-officedocument.wordprocessingml.template");
}
if ct.contains("spreadsheetml.sheet.main") {
if ct.contains("macroEnabled") {
return (FileType::Excel2007Xlsm, Application::MsExcel, true,
"application/vnd.ms-excel.sheet.macroEnabled.12");
}
return (FileType::Excel2007Xlsx, Application::MsExcel, true,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}
if ct.contains("spreadsheetml.template") {
if ct.contains("macroEnabled") {
return (FileType::Excel2007Xltm, Application::MsExcel, true,
"application/vnd.ms-excel.template.macroEnabled.12");
}
return (FileType::Excel2007Xltx, Application::MsExcel, true,
"application/vnd.openxmlformats-officedocument.spreadsheetml.template");
}
if ct.contains("spreadsheetml.addinMacroEnabled") {
return (FileType::Excel2007Xlam, Application::MsExcel, true,
"application/vnd.ms-excel.addinMacroEnabled.12");
}
if ct.contains("spreadsheetml.sheet.binary") {
return (FileType::Excel2007Xlsb, Application::MsExcel, true,
"application/vnd.ms-excel.sheet.binary.macroEnabled.12");
}
if ct.contains("presentationml.presentation.main") {
if ct.contains("macroEnabled") {
return (FileType::Powerpoint2007Pptm, Application::MsPowerpoint, false,
"application/vnd.ms-powerpoint.presentation.macroEnabled.12");
}
return (FileType::Powerpoint2007Pptx, Application::MsPowerpoint, false,
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
}
if ct.contains("presentationml.slideshow") {
if ct.contains("macroEnabled") {
return (FileType::Powerpoint2007Ppsm, Application::MsPowerpoint, false,
"application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
}
return (FileType::Powerpoint2007Ppsx, Application::MsPowerpoint, false,
"application/vnd.openxmlformats-officedocument.presentationml.slideshow");
}
if ct.contains("presentationml.template") {
if ct.contains("macroEnabled") {
return (FileType::Powerpoint2007Potm, Application::MsPowerpoint, false,
"application/vnd.ms-powerpoint.template.macroEnabled.12");
}
return (FileType::Powerpoint2007Potx, Application::MsPowerpoint, false,
"application/vnd.openxmlformats-officedocument.presentationml.template");
}
if ct.contains("presentationml.addinMacroEnabled") {
return (FileType::Powerpoint2007Ppam, Application::MsPowerpoint, false,
"application/vnd.ms-powerpoint.addinMacroEnabled.12");
}
if ct.contains("ms-word") {
if ct.contains("macroEnabled") {
return (FileType::Word2007Docm, Application::MsWord, false,
"application/vnd.ms-word.document.macroEnabled.12");
}
return (FileType::Word2007Docx, Application::MsWord, false,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
}
if ct.contains("ms-excel") {
if ct.contains("addinMacroEnabled") {
return (FileType::Excel2007Xlam, Application::MsExcel, true,
"application/vnd.ms-excel.addinMacroEnabled.12");
}
if ct.contains("binary") {
return (FileType::Excel2007Xlsb, Application::MsExcel, true,
"application/vnd.ms-excel.sheet.binary.macroEnabled.12");
}
if ct.contains("template") && ct.contains("macroEnabled") {
return (FileType::Excel2007Xltm, Application::MsExcel, true,
"application/vnd.ms-excel.template.macroEnabled.12");
}
if ct.contains("macroEnabled") {
return (FileType::Excel2007Xlsm, Application::MsExcel, true,
"application/vnd.ms-excel.sheet.macroEnabled.12");
}
return (FileType::Excel2007Xlsx, Application::MsExcel, true,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}
if ct.contains("ms-powerpoint") {
if ct.contains("macroEnabled") {
return (FileType::Powerpoint2007Pptm, Application::MsPowerpoint, false,
"application/vnd.ms-powerpoint.presentation.macroEnabled.12");
}
return (FileType::Powerpoint2007Pptx, Application::MsPowerpoint, false,
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
}
(FileType::ZipUnknown, Application::Unknown, false, "application/zip")
}
/// Builds the fixed result reported for PDF input: no known container or
/// application, and no VBA or XLM macro capability.
fn detect_pdf() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/pdf",
        file_type: FileType::Pdf,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for PNG input: no known container or
/// application, and no VBA or XLM macro capability.
fn detect_png() -> FileTypeResult {
    FileTypeResult {
        mime_type: "image/png",
        file_type: FileType::Png,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for JPEG input: no known container or
/// application, and no VBA or XLM macro capability.
fn detect_jpeg() -> FileTypeResult {
    FileTypeResult {
        mime_type: "image/jpeg",
        file_type: FileType::Jpeg,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for GIF input: no known container or
/// application, and no VBA or XLM macro capability.
fn detect_gif() -> FileTypeResult {
    FileTypeResult {
        mime_type: "image/gif",
        file_type: FileType::Gif,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for input starting with `MZ`: the
/// `ExePe` file type, no known container or application, and no VBA or XLM
/// macro capability.
fn detect_pe() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/x-dosexec",
        file_type: FileType::ExePe,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for ELF input: no known container or
/// application, and no VBA or XLM macro capability.
fn detect_elf() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/x-elf",
        file_type: FileType::ExeElf,
        application: Application::Unknown,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for OneNote input: OneNote as the
/// application, no known container, and no VBA or XLM macro capability.
fn detect_onenote() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/onenote",
        file_type: FileType::OneNote,
        application: Application::MsOneNote,
        container: Container::Unknown,
        may_contain_xlm: false,
        may_contain_vba: false,
    }
}
/// Builds the fixed result reported for Flat OPC XML input.
///
/// The result is attributed to generic Office and flagged as possibly
/// containing VBA; XLM is reported as not possible.
fn detect_flat_opc() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/xml",
        file_type: FileType::FlatOpc,
        application: Application::MsOffice,
        container: Container::FlatOpc,
        may_contain_xlm: false,
        may_contain_vba: true,
    }
}
/// Builds the fixed result reported for Word 2003 XML input.
///
/// The result is attributed to Word in an XML container and flagged as
/// possibly containing VBA; XLM is reported as not possible.
fn detect_word2003xml() -> FileTypeResult {
    FileTypeResult {
        mime_type: "application/xml",
        file_type: FileType::Word2003Xml,
        application: Application::MsWord,
        container: Container::Xml,
        may_contain_xlm: false,
        may_contain_vba: true,
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs detection on `bytes`, panicking if the guesser reports an error.
    fn guess(bytes: &[u8]) -> FileTypeResult {
        FileTypeGuesser::from_bytes(bytes).unwrap()
    }

    // --- Signature-based detection -------------------------------------

    #[test]
    fn test_detect_rtf() {
        let detected = guess(br"{\rtf1\ansi Hello}");
        assert_eq!(detected.file_type, FileType::Rtf);
        assert_eq!(detected.container, Container::Rtf);
        assert_eq!(detected.application, Application::MsWord);
        assert!(!detected.may_contain_vba);
    }

    #[test]
    fn test_detect_pdf() {
        let detected = guess(b"%PDF-1.7 test");
        assert_eq!(detected.file_type, FileType::Pdf);
    }

    #[test]
    fn test_detect_png() {
        // Full 8-byte PNG signature followed by one arbitrary byte.
        let png_bytes: [u8; 9] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00];
        let detected = guess(&png_bytes);
        assert_eq!(detected.file_type, FileType::Png);
    }

    #[test]
    fn test_detect_pe() {
        let mz_bytes: [u8; 4] = [0x4D, 0x5A, 0x90, 0x00];
        let detected = guess(&mz_bytes);
        assert_eq!(detected.file_type, FileType::ExePe);
    }

    // --- Inputs that match nothing -------------------------------------

    #[test]
    fn test_detect_unknown() {
        let junk: [u8; 4] = [0x00, 0x01, 0x02, 0x03];
        let detected = guess(&junk);
        assert_eq!(detected.file_type, FileType::Unknown);
    }

    #[test]
    fn test_detect_empty() {
        let detected = guess(&[]);
        assert_eq!(detected.file_type, FileType::Unknown);
    }

    // --- Content-type mapping ------------------------------------------

    #[test]
    fn test_content_type_word_docx() {
        let (file_type, application, may_contain_xlm, _mime) =
            FileTypeGuesser::content_type_to_file_type(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
            );
        assert_eq!(file_type, FileType::Word2007Docx);
        assert_eq!(application, Application::MsWord);
        assert!(!may_contain_xlm);
    }

    #[test]
    fn test_content_type_excel_xlsm() {
        let (file_type, application, _xlm, _mime) = FileTypeGuesser::content_type_to_file_type(
            "application/vnd.ms-excel.sheet.macroEnabled.12",
        );
        assert_eq!(file_type, FileType::Excel2007Xlsm);
        assert_eq!(application, Application::MsExcel);
    }
}