malwaredb_types/doc/
pdf.rs1use crate::doc::DocumentFile;
4use crate::SpecimenFile;
5
6use std::fmt::{Display, Formatter};
7
8use anyhow::Result;
9use chrono::{DateTime, Utc};
10use pdf::file::FileOptions;
11use tracing::instrument;
12
13const MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46]; #[derive(Clone, Debug)]
17pub struct PDF<'a> {
18 pub version: Option<f32>,
20
21 pub pages: u32,
23
24 pub title: Option<String>,
26
27 pub author: Option<String>,
29
30 pub creation_date: Option<DateTime<Utc>>,
32
33 pub has_javascript: bool,
35
36 pub has_form: bool,
38
39 pub contents: &'a [u8],
41}
42
43impl<'a> PDF<'a> {
44 #[instrument(name = "PDF parser", skip(contents))]
50 pub fn from(contents: &'a [u8]) -> Result<Self> {
51 let file = FileOptions::cached().load(contents)?;
52 let pages = file.num_pages();
53
54 let mut title = None;
55 let mut author = None;
56 let mut creation_date = None;
57
58 if let Some(info) = &file.trailer.info_dict {
59 title = info
60 .title
61 .as_ref()
62 .and_then(|p| Option::from(p.to_string_lossy()));
63 author = info
64 .author
65 .as_ref()
66 .and_then(|p| Option::from(p.to_string_lossy()));
67 creation_date = info.creation_date.as_ref().and_then(|p| {
68 let date_string = format!(
69 "{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
70 p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
71 );
72
73 if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
74 let date_obj: DateTime<Utc> = DateTime::from(timestamp);
75 Some(date_obj)
76 } else {
77 None
78 }
79 });
80 }
81
82 let has_form = file.trailer.root.forms.is_some();
83
84 let has_javascript = match &file.trailer.root.names {
85 Some(x) => x.javascript.is_some(),
86 None => false,
87 };
88
89 let version = {
90 if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
91 ver.parse::<f32>().ok()
92 } else {
93 None
94 }
95 };
96
97 Ok(Self {
98 version,
99 pages,
100 title,
101 author,
102 creation_date,
103 has_javascript,
104 has_form,
105 contents,
106 })
107 }
108}
109
110impl DocumentFile for PDF<'_> {
111 fn pages(&self) -> u32 {
112 self.pages
113 }
114
115 fn author(&self) -> Option<String> {
116 self.author.clone()
117 }
118
119 fn title(&self) -> Option<String> {
120 self.title.clone()
121 }
122
123 fn has_javascript(&self) -> bool {
124 self.has_javascript
125 }
126
127 fn has_form(&self) -> bool {
128 self.has_form
129 }
130
131 fn creation_time(&self) -> Option<DateTime<Utc>> {
132 self.creation_date
133 }
134
135 fn modification_time(&self) -> Option<DateTime<Utc>> {
136 None
137 }
138}
139
140impl SpecimenFile for PDF<'_> {
141 const MAGIC: &'static [&'static [u8]] = &[&MAGIC];
142
143 fn type_name(&self) -> &'static str {
144 "PDF"
145 }
146}
147
148impl Display for PDF<'_> {
149 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
150 write!(f, "PDF")?;
151 if let Some(version) = self.version {
152 write!(f, " version {version:.1}")?;
153 }
154 if let Some(title) = &self.title {
155 write!(f, " \"{title}\"")?;
156 }
157 if let Some(author) = &self.author {
158 write!(f, " by {author}")?;
159 }
160 if let Some(date) = &self.creation_date {
161 write!(f, " created {date}")?;
162 }
163 if self.has_form {
164 write!(f, " has form")?;
165 }
166 if self.has_javascript {
167 write!(f, " has Javascript")?;
168 }
169 write!(f, " {} pages, {} bytes", self.pages, self.contents.len())
170 }
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Parses each fixture and checks the extracted metadata. The cases differ
    /// only in whether a form and JavaScript are expected.
    #[rstest]
    #[case::plain_pdf(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case::pdf_archival(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case::pdf_form_js(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        eprintln!("PDF: {pdf}");

        // Expectations shared by every fixture.
        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title.as_deref(), Some("MalwareDB Test Document"));

        // Expectations that vary per case.
        assert_eq!(pdf.has_form, has_form);
        assert_eq!(pdf.has_javascript, has_js);

        // Only the calendar date is compared, not the time of day.
        let created_on = pdf.creation_date.unwrap().date_naive().to_string();
        assert_eq!("2023-05-26", created_on);
    }
}