malwaredb-types 0.3.3

Data types and parsers for MalwareDB.
Documentation
// SPDX-License-Identifier: Apache-2.0

use crate::doc::DocumentFile;
use crate::SpecimenFile;

use std::fmt::{Display, Formatter};

use anyhow::Result;
use chrono::{DateTime, Utc};
use pdf::file::FileOptions;
use tracing::instrument;

/// Leading bytes that identify a PDF file; exposed through the `SpecimenFile::MAGIC`
/// associated constant of [`PDF`].
const MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46]; // %PDF

/// A struct representing Portable Document Format (PDF) files
///
/// Holds the metadata extracted by [`PDF::from`] together with a borrow of the bytes it
/// was parsed from, so a value cannot outlive that buffer.
#[derive(Clone, Debug)]
pub struct PDF<'a> {
    /// Version of the PDF spec as read from the file header, `None` if it could not be
    /// parsed as a number
    pub version: Option<f32>,

    /// Number of pages in the document, as reported by the parser
    pub pages: u32,

    /// Document title from the info dictionary, if available
    pub title: Option<String>,

    /// Document author from the info dictionary, if available
    pub author: Option<String>,

    /// Creation date from the info dictionary converted to UTC, if available and valid
    pub creation_date: Option<DateTime<Utc>>,

    /// Contains Javascript: the document's name dictionary has a Javascript entry
    pub has_javascript: bool,

    /// Has a Form: the document catalog has a forms entry
    pub has_form: bool,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}

impl<'a> PDF<'a> {
    /// PDF parsed from a sequence of bytes
    ///
    /// # Errors
    ///
    /// Returns an error if parsing fails
    #[instrument(name = "PDF parser", skip(contents))]
    pub fn from(contents: &'a [u8]) -> Result<Self> {
        let file = FileOptions::cached().load(contents)?;
        let pages = file.num_pages();

        let mut title = None;
        let mut author = None;
        let mut creation_date = None;

        if let Some(info) = &file.trailer.info_dict {
            title = info
                .title
                .as_ref()
                .and_then(|p| Option::from(p.to_string_lossy()));
            author = info
                .author
                .as_ref()
                .and_then(|p| Option::from(p.to_string_lossy()));
            creation_date = info.creation_date.as_ref().and_then(|p| {
                let date_string = format!(
                    "{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
                    p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
                );

                if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
                    let date_obj: DateTime<Utc> = DateTime::from(timestamp);
                    Some(date_obj)
                } else {
                    None
                }
            });
        }

        let has_form = file.trailer.root.forms.is_some();

        let has_javascript = match &file.trailer.root.names {
            Some(x) => x.javascript.is_some(),
            None => false,
        };

        let version = {
            if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
                ver.parse::<f32>().ok()
            } else {
                None
            }
        };

        Ok(Self {
            version,
            pages,
            title,
            author,
            creation_date,
            has_javascript,
            has_form,
            contents,
        })
    }
}

/// Exposes the parsed PDF metadata through the crate's `DocumentFile` trait.
impl DocumentFile for PDF<'_> {
    /// Number of pages recorded at parse time.
    fn pages(&self) -> u32 {
        self.pages
    }

    /// Owned copy of the author, if one was found.
    fn author(&self) -> Option<String> {
        self.author.clone()
    }

    /// Owned copy of the title, if one was found.
    fn title(&self) -> Option<String> {
        self.title.clone()
    }

    /// Whether a Javascript entry was found at parse time.
    fn has_javascript(&self) -> bool {
        self.has_javascript
    }

    /// Whether a forms entry was found at parse time.
    fn has_form(&self) -> bool {
        self.has_form
    }

    /// Creation date in UTC, if one was found and could be interpreted.
    fn creation_time(&self) -> Option<DateTime<Utc>> {
        self.creation_date
    }

    /// Always `None`: this type does not record a modification date.
    fn modification_time(&self) -> Option<DateTime<Utc>> {
        None
    }
}

/// Identifies PDF specimens by their leading magic bytes and a fixed type name.
impl SpecimenFile for PDF<'_> {
    // `MAGIC` on the right-hand side is the module-level constant (`%PDF`), not
    // `Self::MAGIC`; there is a single accepted signature.
    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];

    /// Short name for this file type.
    fn type_name(&self) -> &'static str {
        "PDF"
    }
}

impl Display for PDF<'_> {
    /// Writes a one-line summary: the literal `PDF`, each optional attribute that is set,
    /// the form/Javascript flags when true, then the page count and byte length.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("PDF")?;

        if let Some(ver) = self.version {
            write!(f, " version {ver:.1}")?;
        }
        if let Some(name) = &self.title {
            write!(f, " \"{name}\"")?;
        }
        if let Some(writer) = &self.author {
            write!(f, " by {writer}")?;
        }
        if let Some(created) = &self.creation_date {
            write!(f, " created {created}")?;
        }

        // Flags are only mentioned when set, form first.
        for (enabled, label) in [
            (self.has_form, " has form"),
            (self.has_javascript, " has Javascript"),
        ] {
            if enabled {
                f.write_str(label)?;
            }
        }

        let page_count = self.pages;
        let byte_len = self.contents.len();
        write!(f, " {page_count} pages, {byte_len} bytes")
    }
}

// Parser tests driven by sample documents embedded at compile time.
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Each case supplies a fixture plus the expected form / Javascript flags; the
    // remaining expectations (one page, version 1.6, title, creation date) are shared
    // by all fixtures.
    #[rstest]
    #[case::plain_pdf(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case::pdf_archival(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case::pdf_form_js(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        // Printed so the `Display` output can be inspected in the test log.
        eprintln!("PDF: {pdf}");
        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title, Some("MalwareDB Test Document".into()));
        assert_eq!(pdf.has_form, has_form);
        assert_eq!(pdf.has_javascript, has_js);

        // Only the calendar date (in UTC) is compared, not the time of day.
        let date = pdf.creation_date.unwrap().date_naive();
        assert_eq!("2023-05-26", date.to_string());
    }
}