// malwaredb_types/doc/pdf.rs
use crate::SpecimenFile;
4
5use std::fmt::{Display, Formatter};
6
7use crate::doc::DocumentFile;
8use anyhow::Result;
9use chrono::{DateTime, Utc};
10use pdf::file::FileOptions;
11use tracing::instrument;
12
13const MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46]; #[derive(Clone, Debug)]
17pub struct PDF<'a> {
18 pub version: Option<f32>,
20
21 pub pages: u32,
23
24 pub title: Option<String>,
26
27 pub author: Option<String>,
29
30 pub creation_date: Option<DateTime<Utc>>,
32
33 pub has_javascript: bool,
35
36 pub has_form: bool,
38
39 pub contents: &'a [u8],
41}
42
43impl<'a> PDF<'a> {
44 #[instrument(name = "PDF parser", skip(contents))]
46 pub fn from(contents: &'a [u8]) -> Result<Self> {
47 let file = FileOptions::cached().load(contents)?;
48 let pages = file.num_pages();
49
50 let mut title = None;
51 let mut author = None;
52 let mut creation_date = None;
53
54 if let Some(info) = &file.trailer.info_dict {
55 title = info
56 .title
57 .as_ref()
58 .and_then(|p| Option::from(p.to_string_lossy()));
59 author = info
60 .author
61 .as_ref()
62 .and_then(|p| Option::from(p.to_string_lossy()));
63 creation_date = info.creation_date.as_ref().and_then(|p| {
64 let date_string = format!(
65 "{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
66 p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
67 );
68
69 if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
70 let date_obj: DateTime<Utc> = DateTime::from(timestamp);
71 Some(date_obj)
72 } else {
73 None
74 }
75 });
76 }
77
78 let has_form = file.trailer.root.forms.is_some();
79
80 let has_javascript = match &file.trailer.root.names {
81 Some(x) => x.javascript.is_some(),
82 None => false,
83 };
84
85 let version = {
86 if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
87 ver.parse::<f32>().ok()
88 } else {
89 None
90 }
91 };
92
93 Ok(Self {
94 version,
95 pages,
96 title,
97 author,
98 creation_date,
99 has_javascript,
100 has_form,
101 contents,
102 })
103 }
104}
105
106impl DocumentFile for PDF<'_> {
107 fn pages(&self) -> u32 {
108 self.pages
109 }
110
111 fn author(&self) -> Option<String> {
112 self.author.clone()
113 }
114
115 fn title(&self) -> Option<String> {
116 self.title.clone()
117 }
118
119 fn has_javascript(&self) -> bool {
120 self.has_javascript
121 }
122
123 fn has_form(&self) -> bool {
124 self.has_form
125 }
126
127 fn creation_time(&self) -> Option<DateTime<Utc>> {
128 self.creation_date
129 }
130
131 fn modification_time(&self) -> Option<DateTime<Utc>> {
132 None
133 }
134}
135
136impl SpecimenFile for PDF<'_> {
137 const MAGIC: &'static [&'static [u8]] = &[&MAGIC];
138
139 fn type_name(&self) -> &'static str {
140 "PDF"
141 }
142}
143
144impl Display for PDF<'_> {
145 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
146 write!(f, "PDF")?;
147 if let Some(version) = self.version {
148 write!(f, " version {version:.1}")?;
149 }
150 if let Some(title) = &self.title {
151 write!(f, " \"{title}\"")?;
152 }
153 if let Some(author) = &self.author {
154 write!(f, " by {author}")?;
155 }
156 if let Some(date) = &self.creation_date {
157 write!(f, " created {date}")?;
158 }
159 if self.has_form {
160 write!(f, " has form")?;
161 }
162 if self.has_javascript {
163 write!(f, " has Javascript")?;
164 }
165 write!(f, " {} pages, {} bytes", self.pages, self.contents.len())
166 }
167}
168
169#[cfg(test)]
170mod tests {
171 use super::*;
172 use rstest::rstest;
173
174 #[rstest]
175 #[case::plain_pdf(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
176 #[case::pdf_archival(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
177 #[case::pdf_form_js(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
178 #[test]
179 fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
180 let pdf = PDF::from(contents).unwrap();
181 eprintln!("PDF: {pdf}");
182 assert_eq!(pdf.pages, 1);
183 assert_eq!(pdf.version, Some(1.6));
184 assert_eq!(pdf.title, Some("MalwareDB Test Document".into()));
185 assert_eq!(pdf.has_form, has_form);
186 assert_eq!(pdf.has_javascript, has_js);
187
188 let date = pdf.creation_date.unwrap().date_naive();
189 assert_eq!("2023-05-26", date.to_string());
190 }
191}