1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
use chrono::{DateTime, Utc};
#[cfg(feature = "officexml")]
use std::fs::File;
#[cfg(feature = "officexml")]
use std::io::Cursor;
#[cfg(feature = "officexml")]
use std::path::Path;
#[cfg(feature = "officexml")]
use zip::ZipArchive;

// Format-specific submodules. Each one is compiled only when the Cargo feature
// named in its `cfg` attribute is enabled.
#[cfg(feature = "office95")]
pub mod office95;
#[cfg(feature = "pdf")]
pub mod pdf;

#[cfg(feature = "rtf")]
pub mod rtf;

/// The two ASCII bytes `PK` (`0x50`, `0x4b`) that begin a Zip file signature.
pub const PK_HEADER: [u8; 2] = [0x50u8, 0x4bu8];

// In the future, this logic will be extended to identify _which_ Office document is found.
#[cfg(feature = "officexml")]
/// Entry name searched for (as a substring) when deciding whether a Zip is an Office document.
const CONTENT_TYPE: &str = "[Content_Types].xml";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked by the Office document detection (Word).
const WORD_DIR: &str = "word";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked by the Office document detection (Excel).
const EXCEL_DIR: &str = "xl";
#[cfg(feature = "officexml")]
/// Entry-name prefix checked by the Office document detection (PowerPoint).
const POWERPOINT_DIR: &str = "ppt";

#[cfg(feature = "officexml")]
/// Checks whether the Zip archive at `fname` looks like an Office document.
///
/// Entries are scanned in index order. The archive counts as a document as soon as
/// both of the following have been seen:
/// * an entry whose name starts with `word`, `xl` or `ppt`, and
/// * a different entry whose name contains `[Content_Types].xml`.
///
/// # Errors
/// Returns an error if the file cannot be opened, cannot be read as a Zip archive,
/// or an entry cannot be accessed for any reason other than requiring a password.
/// A password-protected entry makes the function return `Ok(false)`.
pub fn is_zip_file_doc(fname: &Path) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(File::open(fname)?)?;

    let mut saw_content_types = false;
    let mut saw_office_dir = false;
    for index in 0..archive.len() {
        let entry = match archive.by_index(index) {
            Ok(entry) => entry,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(reason))
                if reason == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(other) => return Err(other.into()),
        };

        let name = entry.name();
        let in_office_dir = [WORD_DIR, EXCEL_DIR, POWERPOINT_DIR]
            .iter()
            .any(|dir| name.starts_with(*dir));

        // An entry is classified as at most one of the two markers; the directory
        // prefix takes precedence over the content-types name.
        if in_office_dir {
            saw_office_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            saw_content_types = true;
        }

        // Stop scanning as soon as both markers have been found.
        if saw_content_types && saw_office_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

#[cfg(feature = "officexml")]
/// Is the Zip buffer just an Office document?
///
/// `contents` is the complete Zip archive held in memory. Entries are scanned in
/// index order. The buffer counts as a document as soon as both an entry whose name
/// starts with `word`, `xl` or `ppt` and a different entry whose name contains
/// `[Content_Types].xml` have been seen.
///
/// # Errors
/// Returns an error if `contents` cannot be read as a Zip archive, or if an entry
/// cannot be accessed for any reason other than requiring a password. A
/// password-protected entry yields `Ok(false)`, matching [`is_zip_file_doc`].
/// Entry failures are never turned into a panic.
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut buffer = Cursor::new(contents);
    let mut archive = ZipArchive::new(&mut buffer)?;

    let mut has_content_type = false;
    let mut has_office_doc_dir = false;
    for i in 0..archive.len() {
        // Untrusted input: an unreadable entry must surface as a result, not a panic.
        let file = match archive.by_index(i) {
            Ok(f) => f,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(e) => return Err(e.into()),
        };

        if file.name().starts_with(WORD_DIR)
            || file.name().starts_with(EXCEL_DIR)
            || file.name().starts_with(POWERPOINT_DIR)
        {
            has_office_doc_dir = true;
        } else if file.name().contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        // Stop scanning as soon as both markers have been found.
        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

/// Metadata accessors shared by the supported document formats.
///
/// NOTE(review): no implementors are visible in this part of the file; the
/// per-method descriptions below follow the signatures and should be confirmed
/// against the implementations.
pub trait DocumentFile {
    /// Number of pages in the document.
    fn pages(&self) -> u32;

    /// Author recorded for the document, if any.
    fn author(&self) -> Option<String>;

    /// Title recorded for the document, if any.
    fn title(&self) -> Option<String>;

    /// Whether the document has JavaScript.
    fn has_javascript(&self) -> bool;

    /// Whether the document has a form.
    fn has_form(&self) -> bool;

    /// Creation timestamp in UTC, if available.
    fn creation_time(&self) -> Option<DateTime<Utc>>;

    /// Last-modification timestamp in UTC, if available.
    fn modification_time(&self) -> Option<DateTime<Utc>>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Table-driven check of `is_zip_buffer_doc`: each case embeds a fixture file at
    // compile time and pairs it with the expected detection result. The Office
    // fixtures are expected to be detected; the plain source-code Zip is not.
    #[cfg(feature = "officexml")]
    #[rstest]
    #[case(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
    #[case(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
    #[case(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
    #[case(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
    #[case(include_bytes!("../../testdata/office_zip/word.docx"), true)]
    #[case(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
    #[case(include_bytes!("../../testdata/zip/source.c.zip"), false)]
    #[test]
    fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
        // Every fixture must parse as a Zip, so an `Err` here is a test failure.
        assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
    }
}