malwaredb_types/doc/
office95.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::doc::DocumentFile;
4use crate::utils::u64_from_offset;
5use crate::{Ordering, SpecimenFile};
6
7use std::fmt::{Display, Formatter};
8
9use anyhow::{ensure, Context, Result};
10use chrono::{DateTime, Utc};
11use tracing::instrument;
12use uuid::{uuid, Uuid};
13
/// Eight-byte signature expected at the very start of a `Docfile`. `Office95::from` rejects any
/// input which does not begin with these bytes, and the same value is exposed through
/// `SpecimenFile::MAGIC`.
const DOCFILE_MAGIC: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
15
/// Clsid is the UUID which matches a `Docfile` subtype, and the first three segments
/// could be in big or little endian, so we have to check both
/// <http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File>
///
/// Both byte orders are stored side by side so a lookup can test the raw 16 bytes against
/// either one. `PartialEq` is implemented by hand rather than derived: two values compare equal
/// as soon as one of the two representations matches.
#[derive(Clone, Debug, Eq)]
pub struct Clsid {
    /// Little Endian representation, most common
    pub le_uuid: Uuid,

    /// Big Endian representation, unlikely to be encountered
    pub be_uuid: Uuid,
}
27
28impl PartialEq for Clsid {
29    fn eq(&self, other: &Self) -> bool {
30        self.be_uuid == other.be_uuid || self.le_uuid == other.le_uuid
31    }
32}
33
34impl Clsid {
35    /// Microsoft Excel 5 through '95
36    pub const EXCEL5: Self = Clsid {
37        le_uuid: uuid!("10080200-0000-0000-c000-000000000046"),
38        be_uuid: uuid!("00020810-0000-0000-c000-000000000046"),
39    };
40
41    /// Microsoft Excel '97
42    pub const EXCEL97: Self = Clsid {
43        le_uuid: uuid!("20080200-0000-0000-c000-000000000046"),
44        be_uuid: uuid!("00020820-0000-0000-c000-000000000046"),
45    };
46
47    /// Microsoft Word 6 through '95
48    pub const WORD6: Self = Clsid {
49        le_uuid: uuid!("00090200-0000-0000-c000-000000000046"),
50        be_uuid: uuid!("00020900-0000-0000-c000-000000000046"),
51    };
52
53    /// Microsoft Word document
54    pub const DOC: Self = Clsid {
55        le_uuid: uuid!("06090200-0000-0000-c000-000000000046"),
56        be_uuid: uuid!("00020906-0000-0000-c000-000000000046"),
57    };
58
59    /// Microsoft Power Point 4
60    pub const POWERPOINT4: Self = Clsid {
61        le_uuid: uuid!("51480400-0000-0000-c000-000000000046"),
62        be_uuid: uuid!("00044851-0000-0000-c000-000000000046"),
63    };
64
65    /// Microsoft Power Point '95
66    pub const POWERPOINT95: Self = Clsid {
67        le_uuid: uuid!("ea7bae70-fb3b-11cd-a903-00aa00510ea3"),
68        be_uuid: uuid!("70ae7bea-3bfb-cd11-a903-00aa00510ea3"),
69    };
70
71    /// Microsoft Power Point '97 through 2003
72    pub const PPT: Self = Clsid {
73        le_uuid: uuid!("108d8164-9b4f-cf11-86ea-00aa00b929e8"),
74        be_uuid: uuid!("64818d10-4f9b-11cf-86ea-00aa00b929e8"),
75    };
76
77    /// Microsoft Installer
78    pub const MSI: Self = Clsid {
79        le_uuid: uuid!("000c1084-0000-0000-c000-000000000046"),
80        be_uuid: uuid!("84100c00-0000-0000-c000-000000000046"),
81    };
82
83    /// Microsoft Windows Update Patch
84    pub const MSP: Self = Clsid {
85        le_uuid: uuid!("000c1086-0000-0000-c000-000000000046"),
86        be_uuid: uuid!("86100c00-0000-0000-c000-000000000046"),
87    };
88
89    /// Equality between a [Clsid] and byte array
90    #[must_use]
91    pub fn equal(&self, bytes: &[u8; 16]) -> bool {
92        self.be_uuid.as_bytes() == bytes || self.le_uuid.as_bytes() == bytes
93    }
94}
95
/// UUID file type, of which only a subset is of interest
/// This is how we can filter out container formats, like .msi (installer) files.
///
/// Produced by `ClsidType::from`, which maps the raw 16 CLSID bytes onto one of these variants.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClsidType {
    /// Microsoft Excel
    Excel,

    /// Microsoft Power Point
    PowerPoint,

    /// Microsoft Word
    Word,

    /// Microsoft Installer
    MSI,

    /// Microsoft Windows Patch
    MSP,

    /// Unknown or unsupported non-MS Office document type; carries the raw 16 CLSID bytes
    /// which did not match any known identifier
    Unknown([u8; 16]),
}
118
119impl ClsidType {
120    /// Clsid from a byte array
121    #[instrument]
122    pub fn from(bytes: &[u8; 16]) -> Self {
123        if Clsid::EXCEL5.equal(bytes) || Clsid::EXCEL97.equal(bytes) {
124            return Self::Excel;
125        }
126
127        if Clsid::WORD6.equal(bytes) || Clsid::DOC.equal(bytes) {
128            return Self::Word;
129        }
130
131        if Clsid::PPT.equal(bytes)
132            || Clsid::POWERPOINT4.equal(bytes)
133            || Clsid::POWERPOINT95.equal(bytes)
134        {
135            return Self::PowerPoint;
136        }
137
138        if Clsid::MSI.equal(bytes) {
139            return Self::MSI;
140        }
141
142        if Clsid::MSP.equal(bytes) {
143            return Self::MSP;
144        }
145
146        Self::Unknown(*bytes)
147    }
148}
149
150impl ClsidType {
151    /// If the Clsid is a document type
152    #[inline]
153    #[must_use]
154    pub fn is_document(&self) -> bool {
155        matches!(
156            self,
157            ClsidType::Excel | ClsidType::PowerPoint | ClsidType::Word
158        )
159    }
160}
161
162impl Display for ClsidType {
163    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
164        match self {
165            ClsidType::Excel => write!(f, "Excel"),
166            ClsidType::PowerPoint => write!(f, "PowerPoint"),
167            ClsidType::Word => write!(f, "Word"),
168            ClsidType::MSI => write!(f, "Installer"),
169            ClsidType::MSP => write!(f, "Windows Patch"),
170            ClsidType::Unknown(uuid) => write!(f, "Unknown/other {}", hex::encode(uuid)),
171        }
172    }
173}
174
/// A struct representing the older Microsoft Office format, Office95, aka Docfile.
///
/// This format is really a container format, and could be used to hold non-Office files,
/// such as installers (.msi files), Windows update files, and others. Here we are only concerned
/// with MS Office types.
///
/// The struct borrows the buffer it was parsed from, hence the `'a` lifetime.
#[derive(Clone, Debug)]
pub struct Office95<'a> {
    /// Sub-type for the file
    pub clsid: ClsidType,

    /// Creation date of the document; `None` when the stored value is zero or could not be read
    pub creation_time: Option<DateTime<Utc>>,

    /// Date the document was last modified; `None` when the stored value is zero or could not
    /// be read
    pub modification_time: Option<DateTime<Utc>>,

    /// The array containing the raw bytes used to parse this document
    pub contents: &'a [u8],
}
194
195impl<'a> Office95<'a> {
196    /// Office95 `DOCFILE` type parsed from a sequence of bytes
197    ///
198    /// # Errors
199    ///
200    /// Returns an error if the document fails to parse as an Office95/Docfile, or if the CLSID isn't known.
201    #[instrument(name = "Office95/Docfile parser", skip(contents))]
202    pub fn from(contents: &'a [u8]) -> Result<Self> {
203        ensure!(contents.starts_with(&DOCFILE_MAGIC), "Not a DOCFILE");
204
205        let offset: [u8; 4] = contents[48..52]
206            .try_into()
207            .context("Failed to get slice for Office95 offset")?;
208        let offset_int = u32::from_le_bytes(offset);
209        let offset_int = (512 * (1 + offset_int) + 80) as usize;
210        let clsid: [u8; 16] = contents[offset_int..offset_int + 16]
211            .try_into()
212            .context("Failed to get slide for Office95 clsid")?;
213
214        let creation_time = if let Some(creation_time) =
215            u64_from_offset(contents, offset_int + 20, Ordering::LittleEndian)
216        {
217            if creation_time > 0 {
218                // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
219                // we're dealing with malware, so we have to expect funny business.
220                // https://github.com/sorairolake/nt-time/issues/149
221                Some(DateTime::<Utc>::from(nt_time::FileTime::new(creation_time)))
222            } else {
223                None
224            }
225        } else {
226            None
227        };
228
229        let modification_time = if let Some(modification_time) =
230            u64_from_offset(contents, offset_int + 28, Ordering::LittleEndian)
231        {
232            if modification_time > 0 {
233                // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
234                // we're dealing with malware, so we have to expect funny business.
235                // https://github.com/sorairolake/nt-time/issues/149
236                Some(DateTime::<Utc>::from(nt_time::FileTime::new(
237                    modification_time,
238                )))
239            } else {
240                None
241            }
242        } else {
243            None
244        };
245
246        let clsid = ClsidType::from(&clsid);
247        ensure!(
248            clsid.is_document(),
249            "Office95: CLSID `{clsid}` is not a known or supported document type"
250        );
251
252        Ok(Self {
253            clsid,
254            creation_time,
255            modification_time,
256            contents,
257        })
258    }
259}
260
// TODO: Better Office95 parsing
// Only the two timestamps are backed by parsed data; every other accessor below returns a fixed
// placeholder value until more of the format is parsed.
impl DocumentFile for Office95<'_> {
    /// Page count is not extracted; always `0`.
    fn pages(&self) -> u32 {
        0
    }

    /// Author is not extracted; always `None`.
    fn author(&self) -> Option<String> {
        None
    }

    /// Title is not extracted; always `None`.
    fn title(&self) -> Option<String> {
        None
    }

    /// Always `false`; nothing in this parser looks for JavaScript.
    fn has_javascript(&self) -> bool {
        false
    }

    /// Always `false`; nothing in this parser looks for forms.
    fn has_form(&self) -> bool {
        false
    }

    /// Creation time captured when the file was parsed, if any.
    fn creation_time(&self) -> Option<DateTime<Utc>> {
        self.creation_time
    }

    /// Modification time captured when the file was parsed, if any.
    fn modification_time(&self) -> Option<DateTime<Utc>> {
        self.modification_time
    }
}
291
impl SpecimenFile for Office95<'_> {
    /// The single signature recognised for this type: the `Docfile` magic bytes.
    const MAGIC: &'static [&'static [u8]] = &[&DOCFILE_MAGIC];

    /// Fixed name for this file type, regardless of which Office application produced it.
    fn type_name(&self) -> &'static str {
        "Office95"
    }
}
299
300impl Display for Office95<'_> {
301    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
302        write!(f, "Type: {}", self.clsid)?;
303        if let Some(created) = self.creation_time {
304            write!(f, ", Created: {created}")?;
305        }
306        if let Some(modified) = self.modification_time {
307            write!(f, ", Modified: {modified}")?;
308        }
309        write!(f, ", Size: {}", self.contents.len())
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Every bundled sample has to parse and be classified as the expected application.
    #[rstest]
    #[case::word(include_bytes!("../../testdata/office95/word.doc"), ClsidType::Word)]
    #[case::excel(include_bytes!("../../testdata/office95/excel.xls"), ClsidType::Excel)]
    #[case::powerpoint(include_bytes!("../../testdata/office95/powerpoint.ppt"), ClsidType::PowerPoint)]
    fn doc(#[case] sample: &[u8], #[case] want: ClsidType) {
        let parsed = Office95::from(sample).unwrap();
        // Also exercises the `Display` implementation.
        println!("{parsed}");
        assert_eq!(parsed.clsid, want);
    }
}