use crate::TypeMagic;
use std::fmt::{Display, Formatter};
use anyhow::Result;
use chrono::{DateTime, Utc};
use pdf::file::FileOptions;
/// Leading bytes used to recognise a PDF file: the ASCII characters `%PDF`.
const MAGIC: [u8; 4] = [0x25, 0x50, 0x44, 0x46]; #[derive(Clone, Debug)]
/// Metadata extracted from a PDF document, plus a borrow of the bytes it was parsed from.
pub struct PDF<'a> {
/// Version number parsed from the file header bytes, or `None` if they are not a number.
pub version: Option<f32>,
/// Number of pages reported by the parsed document.
pub pages: u32,
/// `Title` entry of the Info dictionary, if present and convertible to a string.
pub title: Option<String>,
/// `Author` entry of the Info dictionary, if present and convertible to a string.
pub author: Option<String>,
/// `CreationDate` entry of the Info dictionary converted to UTC, if present and parseable.
pub creation_date: Option<DateTime<Utc>>,
/// Whether the document's name dictionary has a JavaScript entry.
pub has_javascript: bool,
/// Whether the document catalog has a forms entry.
pub has_form: bool,
/// The raw bytes this structure was parsed from.
pub contents: &'a [u8],
}
impl<'a> PDF<'a> {
pub fn from(contents: &'a [u8]) -> Result<Self> {
let file = FileOptions::cached().load(contents)?;
let pages = file.num_pages();
let mut title = None;
let mut author = None;
let mut creation_date = None;
if let Some(info) = &file.trailer.info_dict {
title = info.get("Title").and_then(|p| p.to_string_lossy().ok());
author = info.get("Author").and_then(|p| p.to_string_lossy().ok());
let date = info
.get("CreationDate")
.and_then(|p| p.to_string_lossy().ok());
if let Some(real_date) = date.as_ref() {
if real_date.len() == 23 {
let date_time = &real_date[2..15].to_string();
let timezone = &real_date[16..];
let timezone = format!("{}{}", &timezone[0..3], &timezone[4..6]);
let new_date_string = format!("{date_time} {timezone}");
if let Ok(date_obj) =
DateTime::parse_from_str(&new_date_string, "%Y%m%d%H%M%S%z")
{
let date_obj: DateTime<Utc> = DateTime::from(date_obj);
creation_date = Some(date_obj);
}
}
}
}
let has_form = file.trailer.root.forms.is_some();
let has_javascript = match &file.trailer.root.names {
Some(x) => x.javascript.is_some(),
None => false,
};
let version = {
if let Ok(ver) = std::str::from_utf8(&contents[5..8]) {
ver.parse::<f32>().ok()
} else {
None
}
};
Ok(Self {
version,
contents,
pages,
title,
author,
has_form,
has_javascript,
creation_date,
})
}
}
/// Registers the PDF magic bytes as the only signature for this file type.
impl TypeMagic for PDF<'_> {
    const MAGIC: &'static [&'static [u8]] = &[&MAGIC];
}
/// One-line, human-readable summary of the document.
///
/// Optional parts (version, title, author, creation date, form/JavaScript markers) are
/// emitted only when present; the page count and byte size always close the line.
impl Display for PDF<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("PDF")?;

        if let Some(version) = self.version {
            write!(f, " version {version:.1}")?;
        }
        if let Some(title) = self.title.as_deref() {
            write!(f, " \"{title}\"")?;
        }
        if let Some(author) = self.author.as_deref() {
            write!(f, " by {author}")?;
        }
        if let Some(date) = self.creation_date.as_ref() {
            write!(f, " created {date}")?;
        }

        // Boolean markers, in the order they appear in the output.
        let markers = [
            (self.has_form, " has form"),
            (self.has_javascript, " has Javascript"),
        ];
        for (present, label) in markers {
            if present {
                f.write_str(label)?;
            }
        }

        let size = self.contents.len();
        write!(f, " {} pages, {} bytes", self.pages, size)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Parses each sample document and checks the extracted metadata; the cases differ
    /// only in whether a form and JavaScript are expected.
    #[rstest]
    #[case(include_bytes!("../../testdata/pdf/test.pdf"), false, false)]
    #[case(include_bytes!("../../testdata/pdf/test_archival.pdf"), false, false)]
    #[case(include_bytes!("../../testdata/pdf/test_form_barcode.pdf"), true, true)]
    #[test]
    fn plain_pdf(#[case] contents: &[u8], #[case] has_form: bool, #[case] has_js: bool) {
        let pdf = PDF::from(contents).unwrap();
        eprintln!("PDF: {pdf}");

        assert_eq!(pdf.pages, 1);
        assert_eq!(pdf.version, Some(1.6));
        assert_eq!(pdf.title.as_deref(), Some("MalwareDB Test Document"));
        assert_eq!((pdf.has_form, pdf.has_javascript), (has_form, has_js));

        let created_on = pdf.creation_date.unwrap().date_naive().to_string();
        assert_eq!("2023-05-26", created_on);
    }
}