malwaredb_types/doc/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use chrono::{DateTime, Utc};
4#[cfg(feature = "officexml")]
5use std::fs::File;
6#[cfg(feature = "officexml")]
7use std::io::Cursor;
8#[cfg(feature = "officexml")]
9use std::path::Path;
10#[cfg(feature = "officexml")]
11use zip::ZipArchive;
12
13/// MS Office `DOCFILE` parsing
14#[cfg(feature = "office95")]
15pub mod office95;
16
17/// Portable Document Format file parsing
18#[cfg(feature = "pdf")]
19pub mod pdf;
20
21/// Rich Text Format file parsing
22#[cfg(feature = "rtf")]
23pub mod rtf;
24
/// Leading `PK` magic bytes of a Zip archive, exposed because some document types are really
/// just specialised Zip files
pub const PK_HEADER: [u8; 2] = *b"PK";

// Entry names looked for inside a Zip archive when deciding whether it is an Office document.
// In the future, this logic will be extended to identify _which_ Office document is found.

/// Entry name whose presence is required for an archive to count as an Office document
#[cfg(feature = "officexml")]
const CONTENT_TYPE: &str = "[Content_Types].xml";

/// Entry name prefix associated with Word documents
#[cfg(feature = "officexml")]
const WORD_DIR: &str = "word";

/// Entry name prefix associated with Excel documents
#[cfg(feature = "officexml")]
const EXCEL_DIR: &str = "xl";

/// Entry name prefix associated with PowerPoint documents
#[cfg(feature = "officexml")]
const POWERPOINT_DIR: &str = "ppt";
37
/// Does this Zip entry name live inside one of the Office document directories?
///
/// The directory name must be followed by a path separator. A bare prefix test would also
/// accept unrelated entries such as `wordlist.txt` or `xlsx/notes.txt`.
#[cfg(feature = "officexml")]
fn is_office_dir_entry(name: &str) -> bool {
    [WORD_DIR, EXCEL_DIR, POWERPOINT_DIR].iter().any(|dir| {
        name.strip_prefix(*dir)
            .is_some_and(|rest| rest.starts_with(['/', '\\']))
    })
}

/// Scan the entries of an already opened Zip archive for the Office document markers.
///
/// Returns `Ok(true)` as soon as both the content types entry and an entry inside an Office
/// document directory have been seen, and `Ok(false)` if the archive ends first or an entry
/// turns out to be password-protected.
///
/// Shared by [`is_zip_file_doc`] and [`is_zip_buffer_doc`] so both treat archives identically.
#[cfg(feature = "officexml")]
fn archive_is_office_doc<R>(archive: &mut ZipArchive<R>) -> anyhow::Result<bool>
where
    R: std::io::Read + std::io::Seek,
{
    let mut has_content_type = false;
    let mut has_office_doc_dir = false;

    for index in 0..archive.len() {
        let entry = match archive.by_index(index) {
            Ok(entry) => entry,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(e) => return Err(e.into()),
        };

        let name = entry.name();
        if is_office_dir_entry(name) {
            has_office_doc_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        // Both markers found: no need to look at the remaining entries.
        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

/// Is the Zip file just an Office document?
///
/// A password-protected archive is reported as `Ok(false)` rather than as an error.
///
/// # Errors
///
/// Returns an error if the file can't be opened or isn't a valid Zip file
#[cfg(feature = "officexml")]
pub fn is_zip_file_doc(fname: impl AsRef<Path>) -> anyhow::Result<bool> {
    // Buffer the file so the archive parser's small reads don't each hit the OS.
    let reader = std::io::BufReader::new(File::open(fname)?);
    let mut archive = ZipArchive::new(reader)?;
    archive_is_office_doc(&mut archive)
}

/// Is the Zip buffer just an Office document?
///
/// A password-protected archive is reported as `Ok(false)` rather than as an error, matching
/// [`is_zip_file_doc`].
///
/// # Errors
///
/// Returns an error if the buffer isn't a valid Zip file
#[cfg(feature = "officexml")]
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(Cursor::new(contents))?;
    archive_is_office_doc(&mut archive)
}
112
113/// Common functions for document file types
114pub trait DocumentFile {
115    /// Number of pages
116    fn pages(&self) -> u32;
117
118    /// Author of the document
119    fn author(&self) -> Option<String>;
120
121    /// Title of the document
122    fn title(&self) -> Option<String>;
123
124    /// If the document has Javascript (PDF)
125    fn has_javascript(&self) -> bool;
126
127    /// If the document has a fillable form (PDF)
128    fn has_form(&self) -> bool;
129
130    /// Creation timestamp of the document, if found
131    fn creation_time(&self) -> Option<DateTime<Utc>>;
132
133    /// Modification timestamp of the document, if found
134    fn modification_time(&self) -> Option<DateTime<Utc>>;
135}
136
#[cfg(test)]
mod tests {
    // The only test in this module needs the `officexml` feature, so the imports are gated the
    // same way; otherwise they are reported as unused when the feature is disabled.
    #[cfg(feature = "officexml")]
    use super::*;
    #[cfg(feature = "officexml")]
    use rstest::rstest;

    /// Office documents must be recognised as such, while an ordinary Zip archive must not.
    #[cfg(feature = "officexml")]
    #[rstest]
    #[case::xlsx(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
    #[case::xltx(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
    #[case::potx(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
    #[case::pptx(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
    #[case::docx(include_bytes!("../../testdata/office_zip/word.docx"), true)]
    #[case::dotx(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
    #[case::plain_zip(include_bytes!("../../testdata/zip/source.c.zip"), false)]
    #[test]
    fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
        assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
    }
}