infer 0.12.0

Small crate to infer file type based on magic number signatures
Documentation
use core::convert::TryInto;

use super::compare_bytes;

#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Eq, PartialEq)]
enum DocType {
    DOC,
    DOCX,
    XLS,
    XLSX,
    PPT,
    PPTX,
    OOXML,
}

/// Returns whether a buffer is Microsoft Word Document (DOC) data.
pub fn is_doc(buf: &[u8]) -> bool {
    ole2(buf) == Some(DocType::DOC)
}

/// Returns whether a buffer is Microsoft Word Open XML Format Document (DOCX) data.
pub fn is_docx(buf: &[u8]) -> bool {
    msooxml(buf) == Some(DocType::DOCX)
}

/// Returns whether a buffer is Microsoft Excel 97-2003 Worksheet (XLS) data.
pub fn is_xls(buf: &[u8]) -> bool {
    ole2(buf) == Some(DocType::XLS)
}

/// Returns whether a buffer is Microsoft Excel Open XML Format Spreadsheet (XLSX) data.
pub fn is_xlsx(buf: &[u8]) -> bool {
    msooxml(buf) == Some(DocType::XLSX)
}

/// Returns whether a buffer is Microsoft PowerPoint 97-2003 Presentation (PPT) data.
pub fn is_ppt(buf: &[u8]) -> bool {
    ole2(buf) == Some(DocType::PPT)
}

/// Returns whether a buffer is Microsoft PowerPoint Open XML Presentation (PPTX) data.
pub fn is_pptx(buf: &[u8]) -> bool {
    msooxml(buf) == Some(DocType::PPTX)
}

fn msooxml(buf: &[u8]) -> Option<DocType> {
    let signature = [b'P', b'K', 0x03, 0x04];

    // start by checking for ZIP local file header signature
    if !compare_bytes(buf, &signature, 0) {
        return None;
    }

    let v = check_msooml(buf, 0x1E);
    if v.is_some() {
        return v;
    }

    if !compare_bytes(buf, b"[Content_Types].xml", 0x1E)
        && !compare_bytes(buf, b"_rels/.rels", 0x1E)
        && !compare_bytes(buf, b"docProps", 0x1E)
    {
        return None;
    }

    // skip to the second local file header
    // since some documents include a 520-byte extra field following the file
    // header, we need to scan for the next header
    let mut start_offset = match u32::from_le_bytes(buf[18..22].try_into().unwrap()).checked_add(49)
    {
        Some(int) => int as usize,
        None => return None,
    };

    let idx = search(buf, start_offset, 6000)?;

    // now skip to the *third* local file header; again, we need to scan due to a
    // 520-byte extra field following the file header
    start_offset += idx + 4 + 26;
    let idx = search(buf, start_offset, 6000)?;

    // and check the subdirectory name to determine which type of OOXML
    // file we have.  Correct the mimetype with the registered ones:
    // http://technet.microsoft.com/en-us/library/cc179224.aspx
    start_offset += idx + 4 + 26;
    check_msooml(buf, start_offset)?;

    // OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
    start_offset += 26;
    let idx = search(buf, start_offset, 6000);
    match idx {
        Some(idx) => start_offset += idx + 4 + 26,
        None => return Some(DocType::OOXML),
    };

    let typo = check_msooml(buf, start_offset);
    if typo.is_some() {
        return typo;
    }

    Some(DocType::OOXML)
}

#[cfg(feature = "std")]
fn ole2(buf: &[u8]) -> Option<DocType> {
    use std::io::Cursor;

    if !compare_bytes(buf, &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], 0) {
        return None;
    }
    if let Ok(file) = cfb::CompoundFile::open(Cursor::new(buf)) {
        return match file.root_entry().clsid().to_string().as_str() {
            "00020810-0000-0000-c000-000000000046" | "00020820-0000-0000-c000-000000000046" => {
                Some(DocType::XLS)
            }
            "00020906-0000-0000-c000-000000000046" => Some(DocType::DOC),
            "64818d10-4f9b-11cf-86ea-00aa00b929e8" => Some(DocType::PPT),
            _ => None,
        };
    }
    None
}

#[cfg(not(feature = "std"))]
fn ole2(buf: &[u8]) -> Option<DocType> {
    if !compare_bytes(buf, &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], 0) {
        return None;
    }
    Some(DocType::DOC)
}

fn check_msooml(buf: &[u8], offset: usize) -> Option<DocType> {
    if compare_bytes(buf, &[b'w', b'o', b'r', b'd', b'/'], offset) {
        Some(DocType::DOCX)
    } else if compare_bytes(buf, &[b'p', b'p', b't', b'/'], offset) {
        Some(DocType::PPTX)
    } else if compare_bytes(buf, &[b'x', b'l', b'/'], offset) {
        Some(DocType::XLSX)
    } else {
        None
    }
}

fn search(buf: &[u8], start: usize, range: usize) -> Option<usize> {
    let length = buf.len();
    let mut end = start + range;
    let signature: &[_] = &[b'P', b'K', 0x03, 0x04];

    if end > length {
        end = length;
    }

    if start >= end {
        return None;
    }

    buf[start..end]
        .windows(signature.len())
        .position(|window| window == signature)
}