malwaredb_types/doc/
pdf.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::doc::DocumentFile;
4use crate::SpecimenFile;
5
6use std::fmt::{Display, Formatter};
7
8use anyhow::Result;
9use chrono::{DateTime, Utc};
10use pdf::file::FileOptions;
11use tracing::instrument;
12
/// Leading bytes of a PDF header: the ASCII text `%PDF`.
const MAGIC: [u8; 4] = *b"%PDF";
14
/// A struct representing Portable Document Format (PDF) files
///
/// Holds metadata extracted while parsing, plus a borrow of the raw bytes
/// the document was parsed from.
#[derive(Clone, Debug)]
pub struct PDF<'a> {
    /// Version of the PDF spec, if it could be parsed from the header bytes
    pub version: Option<f32>,

    /// Number of pages in the document
    pub pages: u32,

    /// Document title, if available
    pub title: Option<String>,

    /// Document author, if available
    pub author: Option<String>,

    /// Creation date converted to UTC, if available and parseable
    pub creation_date: Option<DateTime<Utc>>,

    /// Whether the document's name dictionary has a Javascript entry
    pub has_javascript: bool,

    /// Whether the document catalog has a form entry
    pub has_form: bool,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}
42
43impl<'a> PDF<'a> {
44    /// PDF parsed from a sequence of bytes
45    ///
46    /// # Errors
47    ///
48    /// Returns an error if parsing fails
49    #[instrument(name = "PDF parser", skip(contents))]
50    pub fn from(contents: &'a [u8]) -> Result<Self> {
51        let file = FileOptions::cached().load(contents)?;
52        let pages = file.num_pages();
53
54        let mut title = None;
55        let mut author = None;
56        let mut creation_date = None;
57
58        if let Some(info) = &file.trailer.info_dict {
59            title = info
60                .title
61                .as_ref()
62                .and_then(|p| Option::from(p.to_string_lossy()));
63            author = info
64                .author
65                .as_ref()
66                .and_then(|p| Option::from(p.to_string_lossy()));
67            creation_date = info.creation_date.as_ref().and_then(|p| {
68                let date_string = format!(
69                    "{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
70                    p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
71                );
72
73                if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
74                    let date_obj: DateTime<Utc> = DateTime::from(timestamp);
75                    Some(date_obj)
76                } else {
77                    None
78                }
79            });
80        }
81
82        let has_form = file.trailer.root.forms.is_some();
83
84        let has_javascript = match &file.trailer.root.names {
85            Some(x) => x.javascript.is_some(),
86            None => false,
87        };
88
89        let version = {
90            if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
91                ver.parse::<f32>().ok()
92            } else {
93                None
94            }
95        };
96
97        Ok(Self {
98            version,
99            pages,
100            title,
101            author,
102            creation_date,
103            has_javascript,
104            has_form,
105            contents,
106        })
107    }
108}
109
110impl DocumentFile for PDF<'_> {
111    fn pages(&self) -> u32 {
112        self.pages
113    }
114
115    fn author(&self) -> Option<String> {
116        self.author.clone()
117    }
118
119    fn title(&self) -> Option<String> {
120        self.title.clone()
121    }
122
123    fn has_javascript(&self) -> bool {
124        self.has_javascript
125    }
126
127    fn has_form(&self) -> bool {
128        self.has_form
129    }
130
131    fn creation_time(&self) -> Option<DateTime<Utc>> {
132        self.creation_date
133    }
134
135    fn modification_time(&self) -> Option<DateTime<Utc>> {
136        None
137    }
138}
139
impl SpecimenFile for PDF<'_> {
    /// Magic byte sequences for this type: a single entry, the module-level
    /// `MAGIC` array (`%PDF`).
    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];

    /// Name of this file type, always `"PDF"`.
    fn type_name(&self) -> &'static str {
        "PDF"
    }
}
147
148impl Display for PDF<'_> {
149    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
150        write!(f, "PDF")?;
151        if let Some(version) = self.version {
152            write!(f, " version {version:.1}")?;
153        }
154        if let Some(title) = &self.title {
155            write!(f, " \"{title}\"")?;
156        }
157        if let Some(author) = &self.author {
158            write!(f, " by {author}")?;
159        }
160        if let Some(date) = &self.creation_date {
161            write!(f, " created {date}")?;
162        }
163        if self.has_form {
164            write!(f, " has form")?;
165        }
166        if self.has_javascript {
167            write!(f, " has Javascript")?;
168        }
169        write!(f, " {} pages, {} bytes", self.pages, self.contents.len())
170    }
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Each case supplies a sample PDF plus the form/Javascript flags expected
    // for it; the remaining assertions are the same for every sample.
    #[rstest]
    #[case::plain_pdf(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case::pdf_archival(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case::pdf_form_js(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        eprintln!("PDF: {pdf}");

        // Metadata expected from every sample.
        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title.as_deref(), Some("MalwareDB Test Document"));

        // Per-case flags.
        assert_eq!(pdf.has_form, has_form);
        assert_eq!(pdf.has_javascript, has_js);

        // Only the calendar date of the creation timestamp is checked.
        let created_on = pdf.creation_date.unwrap().date_naive().to_string();
        assert_eq!(created_on, "2023-05-26");
    }
}