use crate::SpecimenFile;
use std::fmt::{Display, Formatter};
use crate::doc::DocumentFile;
use anyhow::Result;
use chrono::{DateTime, Utc};
use pdf::file::FileOptions;
/// The ASCII bytes `%PDF`; exposed through the `SpecimenFile::MAGIC`
/// associated constant of [`PDF`].
const MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46];

/// Metadata extracted from a PDF document, together with a borrow of the raw
/// bytes it was parsed from.
#[derive(Clone, Debug)]
pub struct PDF<'a> {
    /// Version number parsed as a float from bytes 5..8 of `contents`;
    /// `None` when those bytes are not a valid number.
    pub version: Option<f32>,
    /// Page count reported by the PDF parser.
    pub pages: u32,
    /// Title from the trailer's information dictionary, lossily decoded.
    pub title: Option<String>,
    /// Author from the trailer's information dictionary, lossily decoded.
    pub author: Option<String>,
    /// Creation date from the trailer's information dictionary, converted to UTC.
    pub creation_date: Option<DateTime<Utc>>,
    /// `true` when the document root's names entry contains a `javascript` entry.
    pub has_javascript: bool,
    /// `true` when the document root has a `forms` entry.
    pub has_form: bool,
    /// The raw bytes this value was built from.
    pub contents: &'a [u8],
}
impl<'a> PDF<'a> {
pub fn from(contents: &'a [u8]) -> Result<Self> {
let file = FileOptions::cached().load(contents)?;
let pages = file.num_pages();
let mut title = None;
let mut author = None;
let mut creation_date = None;
if let Some(info) = &file.trailer.info_dict {
title = info
.title
.as_ref()
.and_then(|p| Option::from(p.to_string_lossy()));
author = info
.author
.as_ref()
.and_then(|p| Option::from(p.to_string_lossy()));
creation_date = info.creation_date.as_ref().and_then(|p| {
let date_string = format!(
"{}{:02}{:02}{:02}{:02}{:02}-{:02}{:02}",
p.year, p.month, p.day, p.hour, p.minute, p.second, p.tz_hour, p.tz_minute
);
if let Ok(timestamp) = DateTime::parse_from_str(&date_string, "%Y%m%d%H%M%S%z") {
let date_obj: DateTime<Utc> = DateTime::from(timestamp);
Some(date_obj)
} else {
None
}
});
}
let has_form = file.trailer.root.forms.is_some();
let has_javascript = match &file.trailer.root.names {
Some(x) => x.javascript.is_some(),
None => false,
};
let version = {
if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
ver.parse::<f32>().ok()
} else {
None
}
};
Ok(Self {
version,
contents,
pages,
title,
author,
has_form,
has_javascript,
creation_date,
})
}
}
/// Exposes the stored metadata through the [`DocumentFile`] accessors.
impl DocumentFile for PDF<'_> {
    /// Returns the stored page count.
    fn pages(&self) -> u32 {
        self.pages
    }

    /// Returns an owned copy of the stored author, if any.
    fn author(&self) -> Option<String> {
        self.author.as_deref().map(String::from)
    }

    /// Returns an owned copy of the stored title, if any.
    fn title(&self) -> Option<String> {
        self.title.as_deref().map(String::from)
    }

    /// Returns the stored JavaScript flag.
    fn has_javascript(&self) -> bool {
        self.has_javascript
    }

    /// Returns the stored form flag.
    fn has_form(&self) -> bool {
        self.has_form
    }
}
/// Identifies this type to the [`SpecimenFile`] machinery.
impl SpecimenFile for PDF<'_> {
    /// A single-entry list holding the `%PDF` byte sequence.
    // The bare `MAGIC` below is the module-level constant, not `Self::MAGIC`.
    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];

    /// Returns the fixed label `"PDF"`.
    fn type_name(&self) -> &'static str {
        "PDF"
    }
}
/// One-line, human-readable summary of the document.
impl Display for PDF<'_> {
    /// Writes `PDF`, then each optional field that is present (version,
    /// title, author, creation date), then the form/JavaScript flags that are
    /// set, and finally the page and byte counts.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("PDF")?;

        if let Some(version) = self.version {
            write!(f, " version {version:.1}")?;
        }
        if let Some(title) = self.title.as_deref() {
            write!(f, " \"{title}\"")?;
        }
        if let Some(author) = self.author.as_deref() {
            write!(f, " by {author}")?;
        }
        if let Some(date) = self.creation_date.as_ref() {
            write!(f, " created {date}")?;
        }

        // Flags are emitted in a fixed order: form first, then JavaScript.
        let flags = [
            (self.has_form, " has form"),
            (self.has_javascript, " has Javascript"),
        ];
        for (is_set, label) in flags {
            if is_set {
                f.write_str(label)?;
            }
        }

        let byte_count = self.contents.len();
        write!(f, " {} pages, {} bytes", self.pages, byte_count)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Parses each bundled sample and checks the extracted metadata; the two
    // booleans per case are the expected form and JavaScript flags.
    #[rstest]
    #[case(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        eprintln!("PDF: {pdf}");

        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title.as_deref(), Some("MalwareDB Test Document"));
        assert_eq!(pdf.has_form, has_form);
        assert_eq!(pdf.has_javascript, has_js);

        // Only the calendar day (in UTC) is compared, not the time of day.
        let created = pdf.creation_date.unwrap();
        assert_eq!("2023-05-26", created.date_naive().to_string());
    }
}