malwaredb-types 0.3.4

Data types and parsers for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::doc::DocumentFile;
use crate::utils::u64_from_offset;
use crate::{Ordering, SpecimenFile};

use std::fmt::{Display, Formatter};

use anyhow::{ensure, Context, Result};
use chrono::{DateTime, Utc};
use tracing::instrument;
use uuid::{uuid, Uuid};

/// Eight-byte signature found at the very start of a Docfile. `Office95::from` rejects any
/// input that does not begin with these bytes, and the same value is published through
/// `SpecimenFile::MAGIC`.
const DOCFILE_MAGIC: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

/// A class identifier (CLSID): the UUID which identifies the subtype of a `Docfile`.
///
/// The first three segments of the UUID could be stored in big or little endian, so both
/// byte orderings are kept and both have to be checked when matching.
/// <http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File>
///
/// Note that equality is implemented by hand: two values compare equal when *either*
/// representation matches, not only when both do.
#[derive(Clone, Debug, Eq)]
pub struct Clsid {
    /// Little Endian representation, most common
    pub le_uuid: Uuid,

    /// Big Endian representation, unlikely to be encountered
    pub be_uuid: Uuid,
}

impl PartialEq for Clsid {
    /// Two identifiers are considered equal as soon as one of their representations agrees:
    /// either both little-endian forms match or both big-endian forms match.
    fn eq(&self, other: &Self) -> bool {
        let same_le = self.le_uuid == other.le_uuid;
        let same_be = self.be_uuid == other.be_uuid;
        same_le || same_be
    }
}

impl Clsid {
    /// Microsoft Excel 5 through '95
    pub const EXCEL5: Self = Clsid {
        le_uuid: uuid!("10080200-0000-0000-c000-000000000046"),
        be_uuid: uuid!("00020810-0000-0000-c000-000000000046"),
    };

    /// Microsoft Excel '97
    pub const EXCEL97: Self = Clsid {
        le_uuid: uuid!("20080200-0000-0000-c000-000000000046"),
        be_uuid: uuid!("00020820-0000-0000-c000-000000000046"),
    };

    /// Microsoft Word 6 through '95
    pub const WORD6: Self = Clsid {
        le_uuid: uuid!("00090200-0000-0000-c000-000000000046"),
        be_uuid: uuid!("00020900-0000-0000-c000-000000000046"),
    };

    /// Microsoft Word document
    pub const DOC: Self = Clsid {
        le_uuid: uuid!("06090200-0000-0000-c000-000000000046"),
        be_uuid: uuid!("00020906-0000-0000-c000-000000000046"),
    };

    /// Microsoft Power Point 4
    pub const POWERPOINT4: Self = Clsid {
        le_uuid: uuid!("51480400-0000-0000-c000-000000000046"),
        be_uuid: uuid!("00044851-0000-0000-c000-000000000046"),
    };

    /// Microsoft Power Point '95
    pub const POWERPOINT95: Self = Clsid {
        le_uuid: uuid!("ea7bae70-fb3b-11cd-a903-00aa00510ea3"),
        be_uuid: uuid!("70ae7bea-3bfb-cd11-a903-00aa00510ea3"),
    };

    /// Microsoft Power Point '97 through 2003
    pub const PPT: Self = Clsid {
        le_uuid: uuid!("108d8164-9b4f-cf11-86ea-00aa00b929e8"),
        be_uuid: uuid!("64818d10-4f9b-11cf-86ea-00aa00b929e8"),
    };

    /// Microsoft Installer
    pub const MSI: Self = Clsid {
        le_uuid: uuid!("000c1084-0000-0000-c000-000000000046"),
        be_uuid: uuid!("84100c00-0000-0000-c000-000000000046"),
    };

    /// Microsoft Windows Update Patch
    pub const MSP: Self = Clsid {
        le_uuid: uuid!("000c1086-0000-0000-c000-000000000046"),
        be_uuid: uuid!("86100c00-0000-0000-c000-000000000046"),
    };

    /// Equality between a [Clsid] and byte array
    #[must_use]
    pub fn equal(&self, bytes: &[u8; 16]) -> bool {
        self.be_uuid.as_bytes() == bytes || self.le_uuid.as_bytes() == bytes
    }
}

/// Coarse file type derived from a Docfile's CLSID, of which only a subset is of interest.
///
/// This is how we can filter out container formats, like .msi (installer) files:
/// only the Excel, PowerPoint and Word variants are reported as documents by
/// `ClsidType::is_document`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClsidType {
    /// Microsoft Excel
    Excel,

    /// Microsoft Power Point
    PowerPoint,

    /// Microsoft Word
    Word,

    /// Microsoft Installer
    MSI,

    /// Microsoft Windows Patch
    MSP,

    /// Unknown or unsupported non-MS Office document type; carries the raw 16 CLSID bytes
    /// exactly as they were read, so they can still be displayed or inspected.
    Unknown([u8; 16]),
}

impl ClsidType {
    /// Clsid from a byte array
    #[instrument]
    pub fn from(bytes: &[u8; 16]) -> Self {
        if Clsid::EXCEL5.equal(bytes) || Clsid::EXCEL97.equal(bytes) {
            return Self::Excel;
        }

        if Clsid::WORD6.equal(bytes) || Clsid::DOC.equal(bytes) {
            return Self::Word;
        }

        if Clsid::PPT.equal(bytes)
            || Clsid::POWERPOINT4.equal(bytes)
            || Clsid::POWERPOINT95.equal(bytes)
        {
            return Self::PowerPoint;
        }

        if Clsid::MSI.equal(bytes) {
            return Self::MSI;
        }

        if Clsid::MSP.equal(bytes) {
            return Self::MSP;
        }

        Self::Unknown(*bytes)
    }
}

impl ClsidType {
    /// If the Clsid is a document type
    #[inline]
    #[must_use]
    pub fn is_document(&self) -> bool {
        matches!(
            self,
            ClsidType::Excel | ClsidType::PowerPoint | ClsidType::Word
        )
    }
}

impl Display for ClsidType {
    /// Writes a short human-readable name; unknown identifiers are shown as
    /// `Unknown/other` followed by the hex encoding of their raw bytes.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Excel => "Excel",
            Self::PowerPoint => "PowerPoint",
            Self::Word => "Word",
            Self::MSI => "Installer",
            Self::MSP => "Windows Patch",
            // The only variant with a payload needs real formatting, so it is written here.
            Self::Unknown(uuid) => return write!(f, "Unknown/other {}", hex::encode(uuid)),
        };
        f.write_str(label)
    }
}

/// A struct representing the older Microsoft Office format, Office95, aka Docfile.
///
/// This format is really a container format, and could be used to hold non-Office files,
/// such as installers (.msi files), Windows update files, and others. Here we are only concerned
/// with MS Office types.
///
/// The struct borrows the input buffer for the lifetime `'a`; it does not copy the file.
#[derive(Clone, Debug)]
pub struct Office95<'a> {
    /// Sub-type for the file
    pub clsid: ClsidType,

    /// Creation date of the document; `None` when the stored value is zero or could not be read
    pub creation_time: Option<DateTime<Utc>>,

    /// Date the document was last modified; `None` when the stored value is zero or could not
    /// be read
    pub modification_time: Option<DateTime<Utc>>,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}

impl<'a> Office95<'a> {
    /// Office95 `DOCFILE` type parsed from a sequence of bytes
    ///
    /// # Errors
    ///
    /// Returns an error if the document fails to parse as an Office95/Docfile, or if the CLSID isn't known.
    #[instrument(name = "Office95/Docfile parser", skip(contents))]
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        ensure!(contents.starts_with(&DOCFILE_MAGIC), "Not a DOCFILE");

        let offset: [u8; 4] = contents[48..52]
            .try_into()
            .context("Failed to get slice for Office95 offset")?;
        let offset_int = u32::from_le_bytes(offset);
        let offset_int = (512 * (1 + offset_int) + 80) as usize;
        let clsid: [u8; 16] = contents[offset_int..offset_int + 16]
            .try_into()
            .context("Failed to get slide for Office95 clsid")?;

        let creation_time = if let Some(creation_time) =
            u64_from_offset(contents, offset_int + 20, Ordering::LittleEndian)
        {
            if creation_time > 0 {
                // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
                // we're dealing with malware, so we have to expect funny business.
                // https://github.com/sorairolake/nt-time/issues/149
                Some(DateTime::<Utc>::from(nt_time::FileTime::new(creation_time)))
            } else {
                None
            }
        } else {
            None
        };

        let modification_time = if let Some(modification_time) =
            u64_from_offset(contents, offset_int + 28, Ordering::LittleEndian)
        {
            if modification_time > 0 {
                // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
                // we're dealing with malware, so we have to expect funny business.
                // https://github.com/sorairolake/nt-time/issues/149
                Some(DateTime::<Utc>::from(nt_time::FileTime::new(
                    modification_time,
                )))
            } else {
                None
            }
        } else {
            None
        };

        let clsid = ClsidType::from(&clsid);
        ensure!(
            clsid.is_document(),
            "Office95: CLSID `{clsid}` is not a known or supported document type"
        );

        Ok(Self {
            clsid,
            creation_time,
            modification_time,
            contents,
        })
    }
}

// TODO: Better Office95 parsing
// Until then, everything except the two timestamps is a fixed placeholder value rather than
// something read from the file.
impl DocumentFile for Office95<'_> {
    /// Placeholder: the page count is not parsed, so this always returns 0.
    fn pages(&self) -> u32 {
        0
    }

    /// Placeholder: the author is not parsed, so this always returns `None`.
    fn author(&self) -> Option<String> {
        None
    }

    /// Placeholder: the title is not parsed, so this always returns `None`.
    fn title(&self) -> Option<String> {
        None
    }

    /// Placeholder: always returns `false`; no detection is performed here.
    fn has_javascript(&self) -> bool {
        false
    }

    /// Placeholder: always returns `false`; no detection is performed here.
    fn has_form(&self) -> bool {
        false
    }

    /// Creation timestamp captured when the file was parsed, if any.
    fn creation_time(&self) -> Option<DateTime<Utc>> {
        self.creation_time
    }

    /// Modification timestamp captured when the file was parsed, if any.
    fn modification_time(&self) -> Option<DateTime<Utc>> {
        self.modification_time
    }
}

impl SpecimenFile for Office95<'_> {
    /// The only signature for this type: the eight-byte Docfile magic.
    const MAGIC: &'static [&'static [u8]] = &[&DOCFILE_MAGIC];

    /// Fixed type label, regardless of which Office application produced the file.
    fn type_name(&self) -> &'static str {
        "Office95"
    }
}

impl Display for Office95<'_> {
    /// Renders a one-line summary: the document type, then the creation and modification
    /// timestamps when they are present, and finally the size of the underlying buffer.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "Type: {}", self.clsid)?;

        // Timestamps are optional; each one is emitted only when it was recovered.
        if let Some(created) = &self.creation_time {
            write!(f, ", Created: {created}")?;
        }
        if let Some(modified) = &self.modification_time {
            write!(f, ", Modified: {modified}")?;
        }

        write!(f, ", Size: {}", self.contents.len())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Parses one sample file per Office application (embedded at compile time) and checks
    // that the detected CLSID type is the expected one. The `unwrap` also asserts that
    // parsing succeeds at all.
    #[rstest]
    #[case::word(include_bytes!("../../testdata/office95/word.doc"), ClsidType::Word)]
    #[case::excel(include_bytes!("../../testdata/office95/excel.xls"), ClsidType::Excel)]
    #[case::powerpoint(include_bytes!("../../testdata/office95/powerpoint.ppt"), ClsidType::PowerPoint)]
    fn doc(#[case] bytes: &[u8], #[case] expected_clsid: ClsidType) {
        let office = Office95::from(bytes).unwrap();
        println!("{office}");
        assert_eq!(office.clsid, expected_clsid);
    }
}