use chrono::{DateTime, Utc};
#[cfg(feature = "officexml")]
use std::fs::File;
#[cfg(feature = "officexml")]
use std::io::Cursor;
#[cfg(feature = "officexml")]
use std::path::Path;
#[cfg(feature = "officexml")]
use zip::ZipArchive;
#[cfg(feature = "office95")]
pub mod office95;
#[cfg(feature = "pdf")]
pub mod pdf;
#[cfg(feature = "rtf")]
pub mod rtf;
/// The two bytes `0x50 0x4B` (ASCII `PK`), i.e. the leading bytes of a ZIP signature.
///
/// Not referenced elsewhere in this file; exported for callers that want to
/// pre-screen data before attempting to open it as an archive.
pub const PK_HEADER: [u8; 2] = [0x50u8, 0x4bu8];
/// Entry name that `is_zip_file_doc` / `is_zip_buffer_doc` look for (substring
/// match) when deciding whether an archive is an Office document.
#[cfg(feature = "officexml")]
const CONTENT_TYPE: &str = "[Content_Types].xml";
/// Entry-name prefix treated as evidence of a Word document.
#[cfg(feature = "officexml")]
const WORD_DIR: &str = "word";
/// Entry-name prefix treated as evidence of an Excel document.
#[cfg(feature = "officexml")]
const EXCEL_DIR: &str = "xl";
/// Entry-name prefix treated as evidence of a PowerPoint document.
#[cfg(feature = "officexml")]
const POWERPOINT_DIR: &str = "ppt";
/// Reports whether the ZIP archive stored at `fname` looks like an Office XML document.
///
/// Entries are scanned in index order. The archive is accepted as soon as both of
/// the following have been observed:
///
/// * an entry whose name starts with `word`, `xl` or `ppt`, and
/// * an entry (not matching one of those prefixes) whose name contains
///   `[Content_Types].xml`.
///
/// Returns `Ok(false)` when the scan finishes without seeing both, or when an
/// entry reports that a password is required to decrypt it.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, if it cannot be parsed as a ZIP
/// archive, or if an entry cannot be accessed for any reason other than needing a
/// password.
#[cfg(feature = "officexml")]
pub fn is_zip_file_doc(fname: &Path) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(File::open(fname)?)?;
    let (mut saw_content_types, mut saw_office_dir) = (false, false);

    for index in 0..archive.len() {
        let entry = match archive.by_index(index) {
            Ok(entry) => entry,
            // Encrypted archives are reported as "not a document" rather than as a failure.
            Err(zip::result::ZipError::UnsupportedArchive(reason))
                if reason == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(other) => return Err(other.into()),
        };

        let name = entry.name();
        let in_office_dir = [WORD_DIR, EXCEL_DIR, POWERPOINT_DIR]
            .iter()
            .any(|prefix| name.starts_with(*prefix));

        // The prefix test takes precedence: a name matching it is never
        // additionally counted as the content-types entry.
        if in_office_dir {
            saw_office_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            saw_content_types = true;
        }

        if saw_content_types && saw_office_dir {
            return Ok(true);
        }
    }

    Ok(false)
}
/// Reports whether the in-memory ZIP archive `contents` looks like an Office XML document.
///
/// Entries are scanned in index order. The archive is accepted as soon as both of
/// the following have been observed:
///
/// * an entry whose name starts with `word`, `xl` or `ppt`, and
/// * an entry (not matching one of those prefixes) whose name contains
///   `[Content_Types].xml`.
///
/// Returns `Ok(false)` when the scan finishes without seeing both, or when an
/// entry reports that a password is required to decrypt it (the same policy as
/// `is_zip_file_doc`).
///
/// # Errors
///
/// Returns an error if `contents` cannot be parsed as a ZIP archive, or if an
/// entry cannot be accessed for any reason other than needing a password. This
/// function does not panic on unreadable entries.
#[cfg(feature = "officexml")]
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(Cursor::new(contents))?;
    let mut has_content_type = false;
    let mut has_office_doc_dir = false;

    for i in 0..archive.len() {
        // The buffer is untrusted input, so a bad entry must surface as a
        // `Result` instead of a panic. Password-protected archives are simply
        // "not a document"; anything else is propagated to the caller.
        let file = match archive.by_index(i) {
            Ok(f) => f,
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(e) => return Err(e.into()),
        };

        let name = file.name();
        if name.starts_with(WORD_DIR)
            || name.starts_with(EXCEL_DIR)
            || name.starts_with(POWERPOINT_DIR)
        {
            has_office_doc_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        // Stop early once both markers have been found.
        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}
/// Read-only metadata accessors shared by document types.
///
/// NOTE(review): no implementors are visible in this file; presumably the
/// feature-gated `office95`, `pdf` and `rtf` submodules provide them — confirm.
/// The per-method notes below describe the signatures; exact semantics are
/// defined by each implementor.
pub trait DocumentFile {
/// Page count for the document, as determined by the implementor.
fn pages(&self) -> u32;
/// Author of the document, or `None` (presumably when the document does not
/// record one — confirm against implementors).
fn author(&self) -> Option<String>;
/// Title of the document, or `None` (presumably when the document does not
/// record one — confirm against implementors).
fn title(&self) -> Option<String>;
/// Whether the implementor considers the document to contain JavaScript.
fn has_javascript(&self) -> bool;
/// Whether the implementor considers the document to contain a form.
fn has_form(&self) -> bool;
/// Creation timestamp in UTC, or `None` (presumably when unavailable — confirm).
fn creation_time(&self) -> Option<DateTime<Utc>>;
/// Last-modification timestamp in UTC, or `None` (presumably when unavailable — confirm).
fn modification_time(&self) -> Option<DateTime<Utc>>;
}
// Tests for the ZIP-based Office document detection helpers.
#[cfg(test)]
mod tests {
use super::*;
use rstest::rstest;
// Parameterised check of `is_zip_buffer_doc`: each case embeds a fixture file at
// compile time and states whether it should be classified as an Office document.
// The six Office fixtures (spreadsheet, presentation and word-processing files,
// plus their template variants) must be accepted; a plain ZIP must be rejected.
#[cfg(feature = "officexml")]
#[rstest]
#[case::xlsx(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
#[case::xltx(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
#[case::potx(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
#[case::pptx(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
#[case::docx(include_bytes!("../../testdata/office_zip/word.docx"), true)]
#[case::dotx(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
#[case::plain_zip(include_bytes!("../../testdata/zip/source.c.zip"), false)]
#[test]
fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
// Every fixture is expected to parse as a ZIP archive, so the `unwrap` doubles
// as an assertion that no error is returned.
assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
}
}