// malwaredb-types 0.3.3
//
// Data types and parsers for MalwareDB.
// Documentation
// SPDX-License-Identifier: Apache-2.0

use chrono::{DateTime, Utc};
#[cfg(feature = "officexml")]
use std::fs::File;
#[cfg(feature = "officexml")]
use std::io::Cursor;
#[cfg(feature = "officexml")]
use std::path::Path;
#[cfg(feature = "officexml")]
use zip::ZipArchive;

/// MS Office `DOCFILE` parsing
///
/// Only compiled when the `office95` feature is enabled.
#[cfg_attr(docsrs, doc(cfg(feature = "office95")))]
#[cfg(feature = "office95")]
pub mod office95;

/// Portable Document Format file parsing
///
/// Only compiled when the `pdf` feature is enabled.
#[cfg_attr(docsrs, doc(cfg(feature = "pdf")))]
#[cfg(feature = "pdf")]
pub mod pdf;

/// Rich Text Format file parsing
///
/// Only compiled when the `rtf` feature is enabled.
#[cfg_attr(docsrs, doc(cfg(feature = "rtf")))]
#[cfg(feature = "rtf")]
pub mod rtf;

/// ZIP (PK) header, as some types really are just special Zip files
///
/// The two bytes are the ASCII characters `P` (`0x50`) and `K` (`0x4b`).
pub const PK_HEADER: [u8; 2] = [0x50u8, 0x4bu8];

// In the future, this logic will be extended to identify _which_ Office document is found.
/// Archive entry name looked for (as a substring of the entry name) when deciding
/// whether a Zip archive is an Office document.
#[cfg(feature = "officexml")]
const CONTENT_TYPE: &str = "[Content_Types].xml";
/// Entry-name prefix which marks an archive as possibly being a Word document.
#[cfg(feature = "officexml")]
const WORD_DIR: &str = "word";
/// Entry-name prefix which marks an archive as possibly being an Excel document.
#[cfg(feature = "officexml")]
const EXCEL_DIR: &str = "xl";
/// Entry-name prefix which marks an archive as possibly being a PowerPoint document.
#[cfg(feature = "officexml")]
const POWERPOINT_DIR: &str = "ppt";

/// Is the Zip file just an Office document?
///
/// Opens `fname` as a Zip archive and reports `true` when the archive has both an entry
/// whose name contains `[Content_Types].xml` and an entry whose name starts with one of
/// the Office directory names (`word`, `xl`, `ppt`). A Zip archive with a
/// password-protected entry is reported as `false`.
///
/// # Errors
///
/// Returns an error if the file can't be opened, isn't a valid Zip file, or has an entry
/// which can't be accessed for a reason other than password protection.
#[cfg_attr(docsrs, doc(cfg(feature = "officexml")))]
#[cfg(feature = "officexml")]
pub fn is_zip_file_doc(fname: impl AsRef<Path>) -> anyhow::Result<bool> {
    let file = File::open(fname)?;
    let mut archive = ZipArchive::new(file)?;
    archive_is_office_doc(&mut archive)
}

/// Is the Zip buffer just an Office document?
///
/// Reads `contents` as a Zip archive and applies the same check as [`is_zip_file_doc`],
/// including reporting `false` for an archive with a password-protected entry.
///
/// # Errors
///
/// Returns an error if the buffer isn't a valid Zip file, or has an entry which can't be
/// accessed for a reason other than password protection.
#[cfg_attr(docsrs, doc(cfg(feature = "officexml")))]
#[cfg(feature = "officexml")]
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(Cursor::new(contents))?;
    archive_is_office_doc(&mut archive)
}

/// Scans the entry names of an already-opened archive for the Office document markers.
///
/// Shared by [`is_zip_file_doc`] and [`is_zip_buffer_doc`] so both treat every archive,
/// including password-protected ones, the same way.
#[cfg(feature = "officexml")]
fn archive_is_office_doc<R>(archive: &mut ZipArchive<R>) -> anyhow::Result<bool>
where
    R: std::io::Read + std::io::Seek,
{
    let mut has_content_type = false;
    let mut has_office_doc_dir = false;

    for i in 0..archive.len() {
        let entry = match archive.by_index(i) {
            Ok(entry) => entry,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(e) => return Err(e.into()),
        };

        let name = entry.name();
        // NOTE(review): these are plain prefix matches, so a top-level entry such as
        // `wordlist.txt` also counts as an Office directory hit.
        if name.starts_with(WORD_DIR)
            || name.starts_with(EXCEL_DIR)
            || name.starts_with(POWERPOINT_DIR)
        {
            has_office_doc_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        // Stop as soon as both markers were seen; the remaining entries don't matter.
        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

/// Common functions for document file types
///
/// Every method borrows the document immutably and returns an owned value. The author,
/// title, and both timestamps are returned as an `Option`.
pub trait DocumentFile {
    /// Number of pages in the document
    fn pages(&self) -> u32;

    /// Author of the document, if available
    fn author(&self) -> Option<String>;

    /// Title of the document, if available
    fn title(&self) -> Option<String>;

    /// Whether the document has Javascript (PDF)
    fn has_javascript(&self) -> bool;

    /// Whether the document has a fillable form (PDF)
    fn has_form(&self) -> bool;

    /// Creation timestamp of the document in UTC, if found
    fn creation_time(&self) -> Option<DateTime<Utc>>;

    /// Modification timestamp of the document in UTC, if found
    fn modification_time(&self) -> Option<DateTime<Utc>>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Each case embeds a fixture file at compile time and states whether
    // `is_zip_buffer_doc` is expected to report it as an Office document.
    #[cfg(feature = "officexml")]
    #[rstest]
    #[case::xlsx(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
    #[case::xltx(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
    #[case::potx(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
    #[case::pptx(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
    #[case::docx(include_bytes!("../../testdata/office_zip/word.docx"), true)]
    #[case::dotx(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
    #[case::plain_zip(include_bytes!("../../testdata/zip/source.c.zip"), false)]
    #[test]
    fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
        // `unwrap` turns a Zip parsing error into a test failure.
        assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
    }
}