//! Helpers for recognizing document file types and a common trait for document metadata.
use chrono::{DateTime, Utc};
#[cfg(feature = "officexml")]
use std::fs::File;
#[cfg(feature = "officexml")]
use std::io::Cursor;
#[cfg(feature = "officexml")]
use std::path::Path;
#[cfg(feature = "officexml")]
use zip::ZipArchive;

/// MS Office "DOCFILE" parsing; only compiled when the `office95` feature is enabled
#[cfg(feature = "office95")]
pub mod office95;

/// Portable Document Format file parsing; only compiled when the `pdf` feature is enabled
#[cfg(feature = "pdf")]
pub mod pdf;

/// Rich Text Format file parsing; only compiled when the `rtf` feature is enabled
#[cfg(feature = "rtf")]
pub mod rtf;

/// The two leading bytes (`PK`) of a ZIP header; exposed because some document
/// types are really just specialised Zip files.
pub const PK_HEADER: [u8; 2] = *b"PK";

// Archive entry names used to decide whether a Zip archive is an Office document.
// In the future, this logic will be extended to identify _which_ Office document is found.
#[cfg(feature = "officexml")]
/// Entry name looked for (as a substring of each archive member's name)
const CONTENT_TYPE: &str = "[Content_Types].xml";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked for Word documents (plain prefix match, no path separator)
const WORD_DIR: &str = "word";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked for Excel documents (plain prefix match, no path separator)
const EXCEL_DIR: &str = "xl";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked for PowerPoint documents (plain prefix match, no path separator)
const POWERPOINT_DIR: &str = "ppt";

#[cfg(feature = "officexml")]
/// Checks whether the Zip file at `fname` is really an Office document.
///
/// The archive's entries are visited in index order. The answer is `Ok(true)` as soon as
/// both of these have been seen:
/// * an entry whose name starts with one of the Office directory prefixes, and
/// * a different entry whose name contains the content-types file name.
///
/// `Ok(false)` is returned when the entries run out first, or when an entry turns out
/// to be password protected.
///
/// # Errors
///
/// Fails if the file cannot be opened, cannot be read as a Zip archive, or an entry
/// cannot be accessed for any reason other than password protection.
pub fn is_zip_file_doc(fname: &Path) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(File::open(fname)?)?;

    let mut saw_content_types = false;
    let mut saw_office_dir = false;
    for idx in 0..archive.len() {
        let entry = match archive.by_index(idx) {
            Ok(entry) => entry,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(reason))
                if reason == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(other) => return Err(other.into()),
        };

        let name = entry.name();
        let in_office_dir = [WORD_DIR, EXCEL_DIR, POWERPOINT_DIR]
            .iter()
            .any(|&dir| name.starts_with(dir));

        // An entry counts towards exactly one of the two flags; the directory
        // prefix takes precedence over the content-types match.
        if in_office_dir {
            saw_office_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            saw_content_types = true;
        }

        if saw_office_dir && saw_content_types {
            return Ok(true);
        }
    }

    Ok(false)
}

#[cfg(feature = "officexml")]
/// Is the Zip buffer just an Office document?
///
/// `contents` is the complete Zip archive held in memory. Entries are visited in index
/// order, and the function returns `Ok(true)` once it has seen both an entry whose name
/// starts with one of the Office directory prefixes and another entry whose name contains
/// the content-types file name. Otherwise it returns `Ok(false)`.
///
/// A password-protected entry yields `Ok(false)`, on the assumption that no Office
/// document lives in a password-protected Zip.
///
/// # Errors
///
/// Fails if `contents` cannot be read as a Zip archive, or if an entry cannot be
/// accessed for any reason other than password protection. Unreadable entries are
/// reported as errors rather than causing a panic.
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(Cursor::new(contents))?;

    let mut has_content_type = false;
    let mut has_office_doc_dir = false;
    for i in 0..archive.len() {
        let file = match archive.by_index(i) {
            Ok(f) => f,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            // The buffer is caller-supplied data: report a bad entry, never panic on it
            Err(e) => return Err(e.into()),
        };

        // NOTE: these are plain prefix matches, so any entry whose name merely begins
        // with one of the prefixes is counted as being inside an Office directory.
        let name = file.name();
        if name.starts_with(WORD_DIR)
            || name.starts_with(EXCEL_DIR)
            || name.starts_with(POWERPOINT_DIR)
        {
            has_office_doc_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

/// Common functions for document file types
///
/// Describes the metadata a parsed document can report. Implementors are defined
/// outside this block — presumably the feature-gated format modules; confirm there.
pub trait DocumentFile {
    /// Number of pages in the document
    fn pages(&self) -> u32;

    /// Author of the document, or `None` when no author is available
    fn author(&self) -> Option<String>;

    /// Title of the document, or `None` when no title is available
    fn title(&self) -> Option<String>;

    /// Whether the document has Javascript (relevant to PDF)
    // NOTE(review): other formats presumably always return `false` — confirm in the implementors
    fn has_javascript(&self) -> bool;

    /// Whether the document has a fillable form (relevant to PDF)
    // NOTE(review): other formats presumably always return `false` — confirm in the implementors
    fn has_form(&self) -> bool;

    /// Creation timestamp of the document in UTC, if found
    fn creation_time(&self) -> Option<DateTime<Utc>>;

    /// Modification timestamp of the document in UTC, if found
    fn modification_time(&self) -> Option<DateTime<Utc>>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Table-driven check of `is_zip_buffer_doc`: each case embeds a sample file at
    // compile time and states whether it should be recognized as an Office document.
    // The Office samples (documents and templates) expect `true`; a plain Zip holding
    // a source file expects `false`.
    #[cfg(feature = "officexml")]
    #[rstest]
    #[case::xlsx(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
    #[case::xltx(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
    #[case::potx(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
    #[case::pptx(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
    #[case::docx(include_bytes!("../../testdata/office_zip/word.docx"), true)]
    #[case::dotx(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
    #[case::plain_zip(include_bytes!("../../testdata/zip/source.c.zip"), false)]
    #[test]
    fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
        // Every sample must parse as a Zip archive, so an `Err` fails the test here
        assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
    }
}