malwaredb-types 0.3.4

Data types and parsers for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::utils::find_subsequence;
use crate::SpecimenFile;

use std::fmt::{Display, Formatter};

use crate::doc::DocumentFile;
use anyhow::{bail, Result};
use chrono::{DateTime, Utc};
use tracing::instrument;

/// Leading bytes of every RTF document: `{\rt`
const RTF_MAGIC: [u8; 4] = *b"{\\rt";

// This is added by LibreOffice. If that's the only program which uses it, maybe we won't bother with it.
/// Start of the document-information group holding the title: `{\info{\title`
const TITLE_HEADER: [u8; 13] = *b"{\\info{\\title";

/// Control word `\ansi`
const ANSI_CHARSET: [u8; 5] = *b"\\ansi";
/// Control word `\mac`
const MAC_CHARSET: [u8; 4] = *b"\\mac";
/// Control word `\pc`
const PC_CHARSET: [u8; 3] = *b"\\pc";
/// Control word `\pca`
const PCA_CHARSET: [u8; 4] = *b"\\pca";

/// Character set declared in the header of an RTF document
///
/// Each variant corresponds to one character-set control word.
/// <https://docs.fileformat.com/word-processing/rtf/>
/// <https://www.biblioscape.com/rtf15_spec.htm>
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
    /// The normal character set (`\ansi`)
    Ansi,

    /// Pre-Mac OS X (macOS 10.x) ANSI (`\mac`)
    MacAnsi,

    /// The normal character set for MS-DOS (`\pc`)
    Pc,

    /// MS-DOS Multilingual (`\pca`)
    Pca,
}

/// A struct representing Rich Text Format (RTF) files
///
/// The struct borrows the bytes it was parsed from, so it cannot outlive that buffer.
#[derive(Clone, Debug)]
pub struct Rtf<'a> {
    /// The character set (encoding) used by the document, if a known
    /// character-set control word was found near the start of the file
    pub character_set: Option<CharacterSet>,

    /// Document title, if an `{\info{\title ...}` group was found
    pub title: Option<String>,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}

impl<'a> Rtf<'a> {
    /// RTF parsed from a byte sequence
    ///
    /// Only the first 40 bytes are inspected for the magic number and the
    /// character-set control word. The title is searched for only when the
    /// input is smaller than 100,000 bytes.
    ///
    /// # Errors
    ///
    /// Returns an error if `contents` does not start with the RTF magic bytes.
    #[instrument(name = "RTF parser", skip(contents))]
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        /// Number of leading bytes examined for the magic number and character set.
        const FRONT_LEN: usize = 40;
        /// Inputs of this size or larger are not searched for a title.
        const MAX_TITLE_SEARCH_LEN: usize = 100_000;

        let doc_front = &contents[..FRONT_LEN.min(contents.len())];

        if !doc_front.starts_with(&RTF_MAGIC) {
            bail!("not an RTF");
        }

        // `\pca` has to be tested before `\pc`, because `\pc` is a prefix of it.
        let character_set = if find_subsequence(doc_front, &ANSI_CHARSET).is_some() {
            Some(CharacterSet::Ansi)
        } else if find_subsequence(doc_front, &MAC_CHARSET).is_some() {
            Some(CharacterSet::MacAnsi)
        } else if find_subsequence(doc_front, &PCA_CHARSET).is_some() {
            Some(CharacterSet::Pca)
        } else if find_subsequence(doc_front, &PC_CHARSET).is_some() {
            Some(CharacterSet::Pc)
        } else {
            None
        };

        let title = if contents.len() < MAX_TITLE_SEARCH_LEN {
            Self::find_title(contents)
        } else {
            None // Too large to scan for a title
        };

        Ok(Self {
            character_set,
            title,
            contents,
        })
    }

    /// Extracts the text following the first `{\info{\title` header.
    ///
    /// The title ends at the first `}` or after 200 bytes, whichever comes
    /// first. Returns `None` when there is no header, when the file ends
    /// before the title does, when the title is shorter than two bytes, or
    /// when the bytes are not valid UTF-8. Truncated or otherwise malformed
    /// input never panics.
    fn find_title(contents: &[u8]) -> Option<String> {
        /// Upper bound on the number of title bytes examined.
        const MAX_TITLE_LEN: usize = 200;

        let header_index = find_subsequence(contents, &TITLE_HEADER)?;
        // Skip the header itself plus the single separator byte that follows it.
        let start_index = header_index + TITLE_HEADER.len() + 1;

        // Checked slicing: the header may be the very last thing in the file.
        let tail = contents.get(start_index..)?;
        let window = &tail[..MAX_TITLE_LEN.min(tail.len())];
        let title_len = window
            .iter()
            .position(|&byte| byte == b'}')
            .unwrap_or(window.len());

        // `title_len == tail.len()` means the file ended before the title was
        // terminated (or before the length limit was reached).
        if title_len < tail.len() && title_len > 1 {
            String::from_utf8(tail[..title_len].to_vec()).ok()
        } else {
            None // Failed to find the title
        }
    }
}

// TODO: Better RTF parsing
/// Document metadata for RTF files.
///
/// Only the title is currently extracted by the parser; every other accessor
/// returns a fixed placeholder value.
impl DocumentFile for Rtf<'_> {
    /// Page count is not parsed; always `0`.
    fn pages(&self) -> u32 {
        0
    }

    /// Author is not parsed; always `None`.
    fn author(&self) -> Option<String> {
        None
    }

    /// Returns the title found while parsing, if any.
    fn title(&self) -> Option<String> {
        self.title.clone()
    }

    fn has_javascript(&self) -> bool {
        false // RTFs don't contain embedded javascript
    }

    /// Form detection is not implemented; always `false`.
    fn has_form(&self) -> bool {
        false
    }

    /// Creation time is not parsed; always `None`.
    fn creation_time(&self) -> Option<DateTime<Utc>> {
        None
    }

    /// Modification time is not parsed; always `None`.
    fn modification_time(&self) -> Option<DateTime<Utc>> {
        None
    }
}

// Identification data for RTF specimens.
impl SpecimenFile for Rtf<'_> {
    // The single magic-number prefix recognized for RTF: `{\rt`.
    const MAGIC: &'static [&'static [u8]] = &[&RTF_MAGIC];

    // Fixed, short label for this file type.
    fn type_name(&self) -> &'static str {
        "RTF"
    }
}

/// One-line summary: the type label, then the title and character set when
/// they are known, then the size of the underlying buffer in bytes.
impl Display for Rtf<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("RTF")?;

        match self.title.as_deref() {
            Some(title) => write!(f, ", Title: \"{title}\"")?,
            None => {}
        }

        match self.character_set {
            Some(charset) => write!(f, ", Character Set: {charset:?}")?,
            None => {}
        }

        let size = self.contents.len();
        write!(f, ", Bytes: {size}")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The sample document parses and its title is extracted.
    #[test]
    fn rtf() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/hello.rtf");

        // `expect` surfaces the parser's error message if parsing fails.
        let rtf = Rtf::from(BYTES).expect("hello.rtf should parse as RTF");
        println!("RTF: {rtf}");
        assert_eq!(rtf.title.as_deref(), Some("RTF Title"));
    }

    /// The empty sample document parses but yields no title.
    #[test]
    fn empty() {
        const BYTES: &[u8] = include_bytes!("../../testdata/rtf/empty.rtf");

        let rtf = Rtf::from(BYTES).expect("empty.rtf should parse as RTF");
        assert!(rtf.title.is_none());
    }
}