malwaredb_types/doc/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use chrono::{DateTime, Utc};
4#[cfg(feature = "officexml")]
5use std::fs::File;
6#[cfg(feature = "officexml")]
7use std::io::Cursor;
8#[cfg(feature = "officexml")]
9use std::path::Path;
10#[cfg(feature = "officexml")]
11use zip::ZipArchive;
12
13/// MS Office `DOCFILE` parsing
14#[cfg_attr(docsrs, doc(cfg(feature = "office95")))]
15#[cfg(feature = "office95")]
16pub mod office95;
17
18/// Portable Document Format file parsing
19#[cfg_attr(docsrs, doc(cfg(feature = "pdf")))]
20#[cfg(feature = "pdf")]
21pub mod pdf;
22
23/// Rich Text Format file parsing
24#[cfg_attr(docsrs, doc(cfg(feature = "rtf")))]
25#[cfg(feature = "rtf")]
26pub mod rtf;
27
/// Leading two bytes (`PK`) of a ZIP local file header; several document formats are
/// really Zip archives and therefore start with these bytes
pub const PK_HEADER: [u8; 2] = *b"PK";
30
// Entry names used to recognise an Office document stored as a Zip archive.
// In the future, this logic will be extended to identify _which_ Office document is found.

// Name of the content-types entry searched for among the archive's entry names.
#[cfg(feature = "officexml")]
const CONTENT_TYPE: &str = "[Content_Types].xml";
// Entry-name prefix checked for Word content.
#[cfg(feature = "officexml")]
const WORD_DIR: &str = "word";
// Entry-name prefix checked for Excel content.
#[cfg(feature = "officexml")]
const EXCEL_DIR: &str = "xl";
// Entry-name prefix checked for PowerPoint content.
#[cfg(feature = "officexml")]
const POWERPOINT_DIR: &str = "ppt";
40
/// Is the Zip file just an Office document?
///
/// The archive counts as an Office document when it holds both a `[Content_Types].xml`
/// entry and at least one entry inside a `word`, `xl` or `ppt` directory.
/// A password-protected archive is reported as `Ok(false)`.
///
/// # Errors
///
/// Returns an error if the file can't be opened or isn't a valid Zip file
#[cfg_attr(docsrs, doc(cfg(feature = "officexml")))]
#[cfg(feature = "officexml")]
pub fn is_zip_file_doc(fname: impl AsRef<Path>) -> anyhow::Result<bool> {
    let file = File::open(fname)?;
    let mut archive = ZipArchive::new(file)?;
    archive_is_office_doc(&mut archive)
}

/// Is the Zip buffer just an Office document?
///
/// Applies the same checks as [`is_zip_file_doc`] to an in-memory archive, including
/// reporting a password-protected archive as `Ok(false)`.
///
/// # Errors
///
/// Returns an error if the buffer isn't a valid Zip file
#[cfg_attr(docsrs, doc(cfg(feature = "officexml")))]
#[cfg(feature = "officexml")]
pub fn is_zip_buffer_doc(contents: &[u8]) -> anyhow::Result<bool> {
    let mut archive = ZipArchive::new(Cursor::new(contents))?;
    archive_is_office_doc(&mut archive)
}

/// Scans the archive's entries and reports whether it looks like an Office document.
///
/// Shared by the file and buffer entry points so both apply identical rules. Stops at the
/// first point where both markers have been seen.
#[cfg(feature = "officexml")]
fn archive_is_office_doc<R>(archive: &mut ZipArchive<R>) -> anyhow::Result<bool>
where
    R: std::io::Read + std::io::Seek,
{
    let mut has_content_type = false;
    let mut has_office_doc_dir = false;

    for index in 0..archive.len() {
        let entry = match archive.by_index(index) {
            Ok(entry) => entry,
            // Assumption is that no Office document is in a password-protected Zip
            Err(zip::result::ZipError::UnsupportedArchive(msg))
                if msg == "Password required to decrypt file" =>
            {
                return Ok(false);
            }
            Err(e) => return Err(e.into()),
        };

        let name = entry.name();
        if is_office_dir_entry(name) {
            has_office_doc_dir = true;
        } else if name.contains(CONTENT_TYPE) {
            has_content_type = true;
        }

        if has_content_type && has_office_doc_dir {
            return Ok(true);
        }
    }

    Ok(false)
}

/// Does the entry name sit inside one of the Office content directories?
///
/// The directory name must be followed by a path separator, so unrelated entries that
/// merely share the leading letters (e.g. `wordlist.txt`) are not counted.
#[cfg(feature = "officexml")]
fn is_office_dir_entry(name: &str) -> bool {
    [WORD_DIR, EXCEL_DIR, POWERPOINT_DIR].iter().any(|dir| {
        matches!(
            name.strip_prefix(*dir).and_then(|rest| rest.chars().next()),
            // Backslash is accepted too, for archives written with Windows-style separators
            Some('/' | '\\')
        )
    })
}
117
/// Common functions for document file types
///
/// Gives callers one interface for querying document metadata, whichever concrete
/// document type implements it.
pub trait DocumentFile {
    /// Number of pages
    fn pages(&self) -> u32;

    /// Author of the document, or `None` if the implementation has none to report
    fn author(&self) -> Option<String>;

    /// Title of the document, or `None` if the implementation has none to report
    fn title(&self) -> Option<String>;

    /// If the document has Javascript (PDF)
    fn has_javascript(&self) -> bool;

    /// If the document has a fillable form (PDF)
    fn has_form(&self) -> bool;

    /// Creation timestamp of the document as a UTC date-time, if found
    fn creation_time(&self) -> Option<DateTime<Utc>>;

    /// Modification timestamp of the document as a UTC date-time, if found
    fn modification_time(&self) -> Option<DateTime<Utc>>;
}
141
// Unit tests for the Zip-based Office document detection.
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // One generated test per `#[case]`: the fixture bytes are embedded at compile time
    // (paths are relative to this source file) and paired with the expected result of
    // `is_zip_buffer_doc`. Only built when the `officexml` feature is enabled.
    #[cfg(feature = "officexml")]
    #[rstest]
    #[case::xlsx(include_bytes!("../../testdata/office_zip/excel.xlsx"), true)]
    #[case::xltx(include_bytes!("../../testdata/office_zip/excel.xltx"), true)]
    #[case::potx(include_bytes!("../../testdata/office_zip/powerpoint.potx"), true)]
    #[case::pptx(include_bytes!("../../testdata/office_zip/powerpoint.pptx"), true)]
    #[case::docx(include_bytes!("../../testdata/office_zip/word.docx"), true)]
    #[case::dotx(include_bytes!("../../testdata/office_zip/word.dotx"), true)]
    // An ordinary Zip archive must not be reported as an Office document.
    #[case::plain_zip(include_bytes!("../../testdata/zip/source.c.zip"), false)]
    #[test]
    fn zip(#[case] contents: &[u8], #[case] is_doc: bool) {
        assert_eq!(is_zip_buffer_doc(contents).unwrap(), is_doc);
    }
}