malwaredb_types/doc/
office95.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use crate::doc::DocumentFile;
4use crate::utils::u64_from_offset;
5use crate::{Ordering, SpecimenFile};
6
7use std::fmt::{Display, Formatter};
8
9use anyhow::{ensure, Context, Result};
10use chrono::{DateTime, Utc};
11use tracing::instrument;
12use uuid::{uuid, Uuid};
13
/// Signature bytes that every `Docfile` container must start with.
const DOCFILE_MAGIC: [u8; 8] = *b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";
15
16/// Clsid is the UUID which matches a `Docfile` subtype, and the first three segments
17/// could be in big or little endian, so we have to check both
18/// <http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File>
19#[derive(Clone, Debug, Eq)]
20pub struct Clsid {
21    /// Little Endian representation, most common
22    pub le_uuid: Uuid,
23
24    /// Big Endian representation, unlikely to be encountered
25    pub be_uuid: Uuid,
26}
27
28impl PartialEq for Clsid {
29    fn eq(&self, other: &Self) -> bool {
30        self.be_uuid == other.be_uuid || self.le_uuid == other.le_uuid
31    }
32}
33
34impl Clsid {
35    /// Microsoft Excel 5 through '95
36    pub const EXCEL5: Self = Clsid {
37        le_uuid: uuid!("10080200-0000-0000-c000-000000000046"),
38        be_uuid: uuid!("00020810-0000-0000-c000-000000000046"),
39    };
40
41    /// Microsoft Excel '97
42    pub const EXCEL97: Self = Clsid {
43        le_uuid: uuid!("20080200-0000-0000-c000-000000000046"),
44        be_uuid: uuid!("00020820-0000-0000-c000-000000000046"),
45    };
46
47    /// Microsoft Word 6 through '95
48    pub const WORD6: Self = Clsid {
49        le_uuid: uuid!("00090200-0000-0000-c000-000000000046"),
50        be_uuid: uuid!("00020900-0000-0000-c000-000000000046"),
51    };
52
53    /// Microsoft Word document
54    pub const DOC: Self = Clsid {
55        le_uuid: uuid!("06090200-0000-0000-c000-000000000046"),
56        be_uuid: uuid!("00020906-0000-0000-c000-000000000046"),
57    };
58
59    /// Microsoft Power Point 4
60    pub const POWERPOINT4: Self = Clsid {
61        le_uuid: uuid!("51480400-0000-0000-c000-000000000046"),
62        be_uuid: uuid!("00044851-0000-0000-c000-000000000046"),
63    };
64
65    /// Microsoft Power Point '95
66    pub const POWERPOINT95: Self = Clsid {
67        le_uuid: uuid!("ea7bae70-fb3b-11cd-a903-00aa00510ea3"),
68        be_uuid: uuid!("70ae7bea-3bfb-cd11-a903-00aa00510ea3"),
69    };
70
71    /// Microsoft Power Point '97 through 2003
72    pub const PPT: Self = Clsid {
73        le_uuid: uuid!("108d8164-9b4f-cf11-86ea-00aa00b929e8"),
74        be_uuid: uuid!("64818d10-4f9b-11cf-86ea-00aa00b929e8"),
75    };
76
77    /// Microsoft Installer
78    pub const MSI: Self = Clsid {
79        le_uuid: uuid!("000c1084-0000-0000-c000-000000000046"),
80        be_uuid: uuid!("84100c00-0000-0000-c000-000000000046"),
81    };
82
83    /// Microsoft Windows Update Patch
84    pub const MSP: Self = Clsid {
85        le_uuid: uuid!("000c1086-0000-0000-c000-000000000046"),
86        be_uuid: uuid!("86100c00-0000-0000-c000-000000000046"),
87    };
88
89    /// Equality between a [Clsid] and byte array
90    #[must_use]
91    pub fn equal(&self, bytes: &[u8; 16]) -> bool {
92        self.be_uuid.as_bytes() == bytes || self.le_uuid.as_bytes() == bytes
93    }
94}
95
/// File type derived from the container's UUID; only a subset is of interest.
///
/// Classifying the UUID is what lets container formats such as `.msi` (installer) files
/// be filtered out.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ClsidType {
    /// Microsoft Excel
    Excel,

    /// Microsoft Power Point
    PowerPoint,

    /// Microsoft Word
    Word,

    /// Microsoft Installer
    MSI,

    /// Microsoft Windows Patch
    MSP,

    /// Unknown or unsupported non-MS Office document type; carries the raw 16 bytes
    Unknown([u8; 16]),
}
118
119impl ClsidType {
120    /// Clsid from a byte array
121    #[instrument]
122    pub fn from(bytes: &[u8; 16]) -> Self {
123        if Clsid::EXCEL5.equal(bytes) || Clsid::EXCEL97.equal(bytes) {
124            return Self::Excel;
125        }
126
127        if Clsid::WORD6.equal(bytes) || Clsid::DOC.equal(bytes) {
128            return Self::Word;
129        }
130
131        if Clsid::PPT.equal(bytes)
132            || Clsid::POWERPOINT4.equal(bytes)
133            || Clsid::POWERPOINT95.equal(bytes)
134        {
135            return Self::PowerPoint;
136        }
137
138        if Clsid::MSI.equal(bytes) {
139            return Self::MSI;
140        }
141
142        if Clsid::MSP.equal(bytes) {
143            return Self::MSP;
144        }
145
146        Self::Unknown(*bytes)
147    }
148}
149
150impl ClsidType {
151    /// If the Clsid is a document type
152    #[inline]
153    #[must_use]
154    pub fn is_document(&self) -> bool {
155        matches!(
156            self,
157            ClsidType::Excel | ClsidType::PowerPoint | ClsidType::Word
158        )
159    }
160}
161
162impl Display for ClsidType {
163    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
164        match self {
165            ClsidType::Excel => write!(f, "Excel"),
166            ClsidType::PowerPoint => write!(f, "PowerPoint"),
167            ClsidType::Word => write!(f, "Word"),
168            ClsidType::MSI => write!(f, "Installer"),
169            ClsidType::MSP => write!(f, "Windows Patch"),
170            ClsidType::Unknown(uuid) => write!(f, "Unknown/other {}", hex::encode(uuid)),
171        }
172    }
173}
174
175/// A struct representing the older Microsoft Office format, Office95, aka Docfile.
176///
177/// This format is really a container format, and could be used to hold a non-Office files,
178/// such as installers (.msi files), Windows update files, and others. Here we are only concerned
179/// with MS Office types.
180#[derive(Clone, Debug)]
181pub struct Office95<'a> {
182    /// Sub-type for the file
183    pub clsid: ClsidType,
184
185    /// Creation date of the document
186    pub creation_time: Option<DateTime<Utc>>,
187
188    /// Date the document was last modified
189    pub modification_time: Option<DateTime<Utc>>,
190
191    /// The array containing the raw bytes used to parse this document
192    pub contents: &'a [u8],
193}
194
195impl<'a> Office95<'a> {
196    /// Office95 `DOCFILE` type parsed from a sequence of bytes
197    #[instrument(name = "Office95/Docfile parser", skip(contents))]
198    pub fn from(contents: &'a [u8]) -> Result<Self> {
199        ensure!(contents.starts_with(&DOCFILE_MAGIC), "Not a DOCFILE");
200
201        let offset: [u8; 4] = contents[48..52]
202            .try_into()
203            .context("Failed to get slice for Office95 offset")?;
204        let offset_int = u32::from_le_bytes(offset);
205        let offset_int = (512 * (1 + offset_int) + 80) as usize;
206        let clsid: [u8; 16] = contents[offset_int..offset_int + 16]
207            .try_into()
208            .context("Failed to get slide for Office95 clsid")?;
209
210        let creation_time = u64_from_offset(contents, offset_int + 20, Ordering::LittleEndian);
211        let creation_time = if creation_time > 0 {
212            // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
213            // we're dealing with malware, so we have to expect funny business.
214            // https://github.com/sorairolake/nt-time/issues/149
215            Some(DateTime::<Utc>::from(nt_time::FileTime::new(creation_time)))
216        } else {
217            None
218        };
219
220        let modification_time = u64_from_offset(contents, offset_int + 28, Ordering::LittleEndian);
221        let modification_time = if modification_time > 0 {
222            // The `nt_time` use of the From trait has `.expect()` which may be a problem, since
223            // we're dealing with malware, so we have to expect funny business.
224            // https://github.com/sorairolake/nt-time/issues/149
225            Some(DateTime::<Utc>::from(nt_time::FileTime::new(
226                modification_time,
227            )))
228        } else {
229            None
230        };
231
232        let clsid = ClsidType::from(&clsid);
233        ensure!(clsid.is_document(), "{clsid} is not a document type");
234
235        Ok(Self {
236            clsid,
237            creation_time,
238            modification_time,
239            contents,
240        })
241    }
242}
243
244// TODO: Better Office95 parsing
245impl DocumentFile for Office95<'_> {
246    fn pages(&self) -> u32 {
247        0
248    }
249
250    fn author(&self) -> Option<String> {
251        None
252    }
253
254    fn title(&self) -> Option<String> {
255        None
256    }
257
258    fn has_javascript(&self) -> bool {
259        false
260    }
261
262    fn has_form(&self) -> bool {
263        false
264    }
265
266    fn creation_time(&self) -> Option<DateTime<Utc>> {
267        self.creation_time
268    }
269
270    fn modification_time(&self) -> Option<DateTime<Utc>> {
271        self.modification_time
272    }
273}
274
275impl SpecimenFile for Office95<'_> {
276    const MAGIC: &'static [&'static [u8]] = &[&DOCFILE_MAGIC];
277
278    fn type_name(&self) -> &'static str {
279        "Office95"
280    }
281}
282
283impl Display for Office95<'_> {
284    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
285        write!(f, "Type: {}", self.clsid)?;
286        if let Some(created) = self.creation_time {
287            write!(f, ", Created: {created}")?;
288        }
289        if let Some(modified) = self.modification_time {
290            write!(f, ", Modified: {modified}")?;
291        }
292        write!(f, ", Size: {}", self.contents.len())
293    }
294}
295
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Each sample file must parse and be classified as the expected sub-type.
    #[rstest]
    #[case::word(include_bytes!("../../testdata/office95/word.doc"), ClsidType::Word)]
    #[case::excel(include_bytes!("../../testdata/office95/excel.xls"), ClsidType::Excel)]
    #[case::powerpoint(include_bytes!("../../testdata/office95/powerpoint.ppt"), ClsidType::PowerPoint)]
    fn doc(#[case] specimen: &[u8], #[case] expected: ClsidType) {
        let office = Office95::from(specimen).unwrap();
        println!("{office}");
        assert_eq!(office.clsid, expected);
    }
}