malwaredb_types/doc/
pdf.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::SpecimenFile;
4
5use std::fmt::{Display, Formatter};
6
7use crate::doc::DocumentFile;
8use anyhow::Result;
9use chrono::{DateTime, Utc};
10use pdf::file::FileOptions;
11use tracing::instrument;
12
/// Leading bytes of a PDF file: the ASCII marker `%PDF`.
const MAGIC: [u8; 4] = *b"%PDF";
14
/// A struct representing Portable Document Format (PDF) files
///
/// Holds the metadata collected by [`PDF::from`] together with a borrow of the
/// bytes it was parsed from.
#[derive(Clone, Debug)]
pub struct PDF<'a> {
    /// Version of the PDF spec, read from the start of the file; `None` if it
    /// could not be parsed as a number
    pub version: Option<f32>,

    /// Number of pages in the document
    pub pages: u32,

    /// Document title, if available
    pub title: Option<String>,

    /// Document author, if available
    pub author: Option<String>,

    /// Creation date converted to UTC, if available
    pub creation_date: Option<DateTime<Utc>>,

    /// Contains Javascript (the document root's name dictionary has a
    /// `javascript` entry)
    pub has_javascript: bool,

    /// Has a Form (the document root has a `forms` entry)
    pub has_form: bool,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}
42
43impl<'a> PDF<'a> {
44    /// PDF parsed from a sequence of bytes
45    #[instrument(name = "PDF parser", skip(contents))]
46    pub fn from(contents: &'a [u8]) -> Result<Self> {
47        let file = FileOptions::cached().load(contents)?;
48        let pages = file.num_pages();
49
50        let mut title = None;
51        let mut author = None;
52        let mut creation_date = None;
53
54        if let Some(info) = &file.trailer.info_dict {
55            title = info
56                .title
57                .as_ref()
58                .and_then(|p| Option::from(p.to_string_lossy()));
59            author = info
60                .author
61                .as_ref()
62                .and_then(|p| Option::from(p.to_string_lossy()));
63            creation_date = info.creation_date.as_ref().and_then(|p| {
64                let date_string = format!(
65                    "{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
66                    p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
67                );
68
69                if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
70                    let date_obj: DateTime<Utc> = DateTime::from(timestamp);
71                    Some(date_obj)
72                } else {
73                    None
74                }
75            });
76        }
77
78        let has_form = file.trailer.root.forms.is_some();
79
80        let has_javascript = match &file.trailer.root.names {
81            Some(x) => x.javascript.is_some(),
82            None => false,
83        };
84
85        let version = {
86            if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
87                ver.parse::<f32>().ok()
88            } else {
89                None
90            }
91        };
92
93        Ok(Self {
94            version,
95            pages,
96            title,
97            author,
98            creation_date,
99            has_javascript,
100            has_form,
101            contents,
102        })
103    }
104}
105
106impl DocumentFile for PDF<'_> {
107    fn pages(&self) -> u32 {
108        self.pages
109    }
110
111    fn author(&self) -> Option<String> {
112        self.author.clone()
113    }
114
115    fn title(&self) -> Option<String> {
116        self.title.clone()
117    }
118
119    fn has_javascript(&self) -> bool {
120        self.has_javascript
121    }
122
123    fn has_form(&self) -> bool {
124        self.has_form
125    }
126
127    fn creation_time(&self) -> Option<DateTime<Utc>> {
128        self.creation_date
129    }
130
131    fn modification_time(&self) -> Option<DateTime<Utc>> {
132        None
133    }
134}
135
136impl SpecimenFile for PDF<'_> {
137    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];
138
139    fn type_name(&self) -> &'static str {
140        "PDF"
141    }
142}
143
144impl Display for PDF<'_> {
145    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
146        write!(f, "PDF")?;
147        if let Some(version) = self.version {
148            write!(f, " version {version:.1}")?;
149        }
150        if let Some(title) = &self.title {
151            write!(f, " \"{title}\"")?;
152        }
153        if let Some(author) = &self.author {
154            write!(f, " by {author}")?;
155        }
156        if let Some(date) = &self.creation_date {
157            write!(f, " created {date}")?;
158        }
159        if self.has_form {
160            write!(f, " has form")?;
161        }
162        if self.has_javascript {
163            write!(f, " has Javascript")?;
164        }
165        write!(f, " {} pages, {} bytes", self.pages, self.contents.len())
166    }
167}
168
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Parses each bundled sample document and checks the extracted metadata;
    /// the samples differ only in whether they carry a form and Javascript.
    #[rstest]
    #[case::plain_pdf(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case::pdf_archival(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case::pdf_form_js(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        eprintln!("PDF: {pdf}");

        // Metadata shared by every sample.
        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title.as_deref(), Some("MalwareDB Test Document"));

        // Per-sample expectations.
        assert_eq!(pdf.has_form, has_form);
        assert_eq!(pdf.has_javascript, has_js);

        // Only the calendar day is compared, not the time of day.
        let created_on = pdf.creation_date.unwrap().date_naive();
        assert_eq!(created_on.to_string(), "2023-05-26");
    }
}