Skip to main content

cfb_forensic/
raw.rs

1//! Thin, panic-free raw decode of the parts of an OLE Compound File the `cfb`
2//! crate hides: the header, the FAT and mini-FAT sector chains, and the full
3//! 128-byte directory-entry array (including entries the live red-black tree no
4//! longer reaches). Offsets and sentinels come from
5//! [`forensicnomicon::olecf`] — never hardcoded here.
6//!
7//! This is **not** a second CFB reader: live navigation, clean metadata, and
8//! stream extraction stay with `cfb`. This module exists only so the analyzer
9//! can see deleted/orphaned residue that a spec-faithful reader skips.
10
11use forensicnomicon::olecf as k;
12
13/// A decoded 128-byte directory entry. Every field is read with bounds checks;
14/// a truncated entry yields zeroed / empty fields rather than a panic.
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct DirEntry {
17    /// Index of this entry in the directory array (its stream id / SID).
18    pub sid: u32,
19    /// UTF-16LE name, lossily decoded and trimmed at the declared name length.
20    pub name: String,
21    /// Raw object-type byte (`0x00`/`0x01`/`0x02`/`0x05`).
22    pub object_type: u8,
23    /// Raw colour byte (`0x00` red / `0x01` black).
24    pub color: u8,
25    /// Left-sibling SID, or [`forensicnomicon::olecf::NOSTREAM`].
26    pub left: u32,
27    /// Right-sibling SID, or `NOSTREAM`.
28    pub right: u32,
29    /// Child SID (storages only), or `NOSTREAM`.
30    pub child: u32,
31    /// 16-byte class id, verbatim.
32    pub clsid: [u8; 16],
33    /// User-defined state bits.
34    pub state_bits: u32,
35    /// Creation FILETIME (raw `u64`).
36    pub create_time: u64,
37    /// Modification FILETIME (raw `u64`).
38    pub modify_time: u64,
39    /// Starting sector id of the entry's stream (or mini-stream for the root).
40    pub start_sector: u32,
41    /// Declared stream size in bytes.
42    pub stream_size: u64,
43}
44
45impl DirEntry {
46    /// True if this entry's object type marks it as a live stream or storage
47    /// (`0x01`/`0x02`/`0x05`) rather than an unallocated `0x00` slot.
48    #[must_use]
49    pub fn is_allocated(&self) -> bool {
50        matches!(self.object_type, 0x01 | 0x02 | 0x05)
51    }
52
53    /// True for a stream object (`0x02`).
54    #[must_use]
55    pub fn is_stream(&self) -> bool {
56        self.object_type == 0x02
57    }
58}
59
60/// The decoded shape of a compound file, carrying everything the analyzer needs
61/// that `cfb` does not surface.
62#[derive(Debug, Clone)]
63pub struct RawCfb {
64    /// Major version (3 or 4) as read from the header.
65    pub major_version: u16,
66    /// `log2(sector size)` from the header.
67    pub sector_shift: u16,
68    /// `log2(mini-sector size)` from the header (normally 6).
69    pub mini_sector_shift: u16,
70    /// Mini-stream size cutoff (normally 4096).
71    pub mini_stream_cutoff: u32,
72    /// Byte-order mark as read (normally `0xFFFE`).
73    pub byte_order: u16,
74    /// Resolved sector size in bytes (`1 << sector_shift`, clamped to a sane range).
75    pub sector_size: usize,
76    /// First DIFAT sector id from the header.
77    pub first_difat_sector: u32,
78    /// Declared DIFAT sector count from the header.
79    pub num_difat_sectors: u32,
80    /// The full FAT: one next-SID slot per sector in the file.
81    pub fat: Vec<u32>,
82    /// The mini-FAT: one next-mini-SID slot per mini-sector.
83    pub mini_fat: Vec<u32>,
84    /// Every directory entry, in array order (live and orphaned alike).
85    pub dir_entries: Vec<DirEntry>,
86    /// Total file length in bytes.
87    pub file_len: u64,
88}
89
90/// Hard caps so a hostile length/count field cannot drive an allocation bomb or
91/// an unbounded walk. A real CFB never approaches these.
92const MAX_SECTORS: usize = 16 * 1024 * 1024;
93const MAX_DIR_ENTRIES: usize = 4 * 1024 * 1024;
94const MAX_CHAIN_STEPS: usize = 32 * 1024 * 1024;
95
96#[inline]
97fn le_u16(data: &[u8], off: usize) -> u16 {
98    let mut b = [0u8; 2];
99    if let Some(s) = data.get(off..off + 2) {
100        b.copy_from_slice(s);
101    }
102    u16::from_le_bytes(b)
103}
104
105#[inline]
106fn le_u32(data: &[u8], off: usize) -> u32 {
107    let mut b = [0u8; 4];
108    if let Some(s) = data.get(off..off + 4) {
109        b.copy_from_slice(s);
110    }
111    u32::from_le_bytes(b)
112}
113
114#[inline]
115fn le_u64(data: &[u8], off: usize) -> u64 {
116    let mut b = [0u8; 8];
117    if let Some(s) = data.get(off..off + 8) {
118        b.copy_from_slice(s);
119    }
120    u64::from_le_bytes(b)
121}
122
123/// Byte offset of regular sector `sid`: `(sid + 1) << sector_shift`
124/// ([`forensicnomicon::olecf`] formula). Returns `None` on overflow.
125#[must_use]
126pub fn sector_offset(sid: u32, sector_shift: u16) -> Option<u64> {
127    (u64::from(sid))
128        .checked_add(1)?
129        .checked_shl(u32::from(sector_shift))
130}
131
132/// Decode a compound file's header, FAT, mini-FAT, and directory array.
133///
134/// Returns `None` only when the buffer is too small to hold a header or lacks
135/// the OLECF signature; every field beyond that degrades to a safe default
136/// rather than failing, so partially-corrupt images still yield residue.
137#[must_use]
138pub fn decode(data: &[u8]) -> Option<RawCfb> {
139    if data.len() < k::HEADER_SIZE {
140        return None;
141    }
142    if data.get(0..8) != Some(&k::OLECF_SIGNATURE) {
143        return None;
144    }
145
146    let major_version = le_u16(data, k::MAJOR_VERSION);
147    let sector_shift = le_u16(data, k::SECTOR_SHIFT);
148    let mini_sector_shift = le_u16(data, k::MINI_SECTOR_SHIFT);
149    let mini_stream_cutoff = le_u32(data, k::MINI_STREAM_CUTOFF);
150    let byte_order = le_u16(data, k::BYTE_ORDER);
151
152    // Clamp the sector shift to the spec range (9..=12 covers v3/v4); an absurd
153    // shift would otherwise produce a meaningless sector size. Default to v3.
154    let effective_shift = if (9..=20).contains(&sector_shift) {
155        sector_shift
156    } else {
157        k::SECTOR_SHIFT_V3
158    };
159    let sector_size = 1usize << effective_shift;
160
161    let first_dir_sector = le_u32(data, k::FIRST_DIR_SECTOR);
162    let first_minifat_sector = le_u32(data, k::FIRST_MINIFAT_SECTOR);
163    let first_difat_sector = le_u32(data, k::FIRST_DIFAT_SECTOR);
164    let num_difat_sectors = le_u32(data, k::NUM_DIFAT_SECTORS);
165
166    let file_len = data.len() as u64;
167
168    let fat = read_fat(data, sector_size, first_difat_sector, num_difat_sectors);
169    let mini_fat = read_chain_table(data, sector_size, &fat, first_minifat_sector);
170    let dir_entries = read_directory(data, sector_size, &fat, first_dir_sector);
171
172    Some(RawCfb {
173        major_version,
174        sector_shift,
175        mini_sector_shift,
176        mini_stream_cutoff,
177        byte_order,
178        sector_size,
179        first_difat_sector,
180        num_difat_sectors,
181        fat,
182        mini_fat,
183        dir_entries,
184        file_len,
185    })
186}
187
188/// Read the slice of bytes for regular sector `sid`, or `None` if out of range.
189fn sector_slice(data: &[u8], sector_size: usize, sid: u32) -> Option<&[u8]> {
190    let start = (u64::from(sid) + 1).checked_mul(sector_size as u64)?;
191    let start = usize::try_from(start).ok()?;
192    let end = start.checked_add(sector_size)?;
193    data.get(start..end)
194}
195
196/// Assemble the FAT by following the DIFAT: the 109 in-header slots, then any
197/// DIFAT-sector chain. Each referenced FAT sector contributes `sector_size/4`
198/// next-pointers. Bounded by [`MAX_SECTORS`] and [`MAX_CHAIN_STEPS`].
199fn read_fat(
200    data: &[u8],
201    sector_size: usize,
202    first_difat_sector: u32,
203    num_difat_sectors: u32,
204) -> Vec<u32> {
205    let entries_per_sector = sector_size / 4;
206    let mut fat_sector_ids: Vec<u32> = Vec::new();
207
208    // 109 in-header DIFAT entries.
209    for i in 0..k::DIFAT_HEADER_COUNT {
210        let sid = le_u32(data, k::DIFAT_HEADER_OFFSET + i * 4);
211        if sid <= k::MAXREGSECT {
212            fat_sector_ids.push(sid);
213        }
214    }
215
216    // DIFAT-sector chain: each DIFAT sector holds (entries_per_sector - 1) FAT
217    // sector ids plus a trailing next-DIFAT-sector pointer.
218    let mut difat_sid = first_difat_sector;
219    let mut steps = 0usize;
220    let difat_cap = (num_difat_sectors as usize)
221        .saturating_add(MAX_SECTORS)
222        .min(MAX_SECTORS);
223    while difat_sid <= k::MAXREGSECT && steps < difat_cap && steps < MAX_CHAIN_STEPS {
224        let Some(sector) = sector_slice(data, sector_size, difat_sid) else {
225            break;
226        };
227        if entries_per_sector == 0 {
228            break; // cov:unreachable: sector_size>=512 so entries_per_sector>=128
229        }
230        for i in 0..(entries_per_sector - 1) {
231            let sid = le_u32(sector, i * 4);
232            if sid <= k::MAXREGSECT {
233                fat_sector_ids.push(sid);
234            }
235        }
236        difat_sid = le_u32(sector, (entries_per_sector - 1) * 4);
237        steps += 1;
238        if fat_sector_ids.len() > MAX_SECTORS {
239            break;
240        }
241    }
242
243    // Concatenate the FAT sectors into the full next-pointer table.
244    let mut fat: Vec<u32> = Vec::new();
245    for sid in fat_sector_ids {
246        let Some(sector) = sector_slice(data, sector_size, sid) else {
247            continue;
248        };
249        for i in 0..entries_per_sector {
250            fat.push(le_u32(sector, i * 4));
251            if fat.len() >= MAX_SECTORS {
252                return fat;
253            }
254        }
255    }
256    fat
257}
258
259/// Follow a FAT sector chain from `first_sid`, concatenating each sector's
260/// `u32` slots into a flat table (used for the mini-FAT). Loop-guarded.
261fn read_chain_table(data: &[u8], sector_size: usize, fat: &[u32], first_sid: u32) -> Vec<u32> {
262    let entries_per_sector = sector_size / 4;
263    let mut out: Vec<u32> = Vec::new();
264    let mut sid = first_sid;
265    let mut seen = 0usize;
266    let mut visited = vec![false; fat.len()];
267    while sid <= k::MAXREGSECT && seen < MAX_CHAIN_STEPS {
268        if let Some(slot) = visited.get_mut(sid as usize) {
269            if *slot {
270                break; // chain loop
271            }
272            *slot = true;
273        }
274        let Some(sector) = sector_slice(data, sector_size, sid) else {
275            break;
276        };
277        for i in 0..entries_per_sector {
278            out.push(le_u32(sector, i * 4));
279            if out.len() >= MAX_SECTORS {
280                return out;
281            }
282        }
283        sid = next_in_fat(fat, sid);
284        seen += 1;
285    }
286    out
287}
288
289/// Read every directory entry by following the directory-sector chain from
290/// `first_dir_sector` and slicing each sector into 128-byte entries. Bounded by
291/// [`MAX_DIR_ENTRIES`]; loop-guarded against a cyclic directory chain.
292fn read_directory(
293    data: &[u8],
294    sector_size: usize,
295    fat: &[u32],
296    first_dir_sector: u32,
297) -> Vec<DirEntry> {
298    let entries_per_sector = sector_size / k::DIR_ENTRY_SIZE;
299    let mut entries: Vec<DirEntry> = Vec::new();
300    let mut sid = first_dir_sector;
301    let mut seen = 0usize;
302    let mut visited = vec![false; fat.len()];
303    while sid <= k::MAXREGSECT && seen < MAX_CHAIN_STEPS {
304        if let Some(slot) = visited.get_mut(sid as usize) {
305            if *slot {
306                break; // directory-chain loop
307            }
308            *slot = true;
309        }
310        let Some(sector) = sector_slice(data, sector_size, sid) else {
311            break;
312        };
313        for i in 0..entries_per_sector {
314            let base = i * k::DIR_ENTRY_SIZE;
315            let Some(raw) = sector.get(base..base + k::DIR_ENTRY_SIZE) else {
316                break; // cov:unreachable: entries_per_sector bounds i within the sector
317            };
318            let sid_index = entries.len() as u32;
319            entries.push(parse_dir_entry(raw, sid_index));
320            if entries.len() >= MAX_DIR_ENTRIES {
321                return entries;
322            }
323        }
324        sid = next_in_fat(fat, sid);
325        seen += 1;
326    }
327    entries
328}
329
330/// FAT next-pointer for `sid`, or [`forensicnomicon::olecf::ENDOFCHAIN`] when
331/// the sid is past the table (treat an out-of-range pointer as chain end).
332fn next_in_fat(fat: &[u32], sid: u32) -> u32 {
333    fat.get(sid as usize).copied().unwrap_or(k::ENDOFCHAIN)
334}
335
336/// Decode one 128-byte directory entry. `raw` is guaranteed 128 bytes by the
337/// caller; every field read still goes through a bounds-checked helper.
338fn parse_dir_entry(raw: &[u8], sid: u32) -> DirEntry {
339    let name_len = le_u16(raw, k::NAME_LEN) as usize;
340    let name = decode_entry_name(raw, name_len);
341
342    let mut clsid = [0u8; 16];
343    if let Some(s) = raw.get(k::CLSID..k::CLSID + 16) {
344        clsid.copy_from_slice(s);
345    }
346
347    DirEntry {
348        sid,
349        name,
350        object_type: raw.get(k::OBJECT_TYPE).copied().unwrap_or(0),
351        color: raw.get(k::COLOR).copied().unwrap_or(0),
352        left: le_u32(raw, k::LEFT_SIBLING),
353        right: le_u32(raw, k::RIGHT_SIBLING),
354        child: le_u32(raw, k::CHILD),
355        clsid,
356        state_bits: le_u32(raw, k::STATE_BITS),
357        create_time: le_u64(raw, k::CREATE_TIME),
358        modify_time: le_u64(raw, k::MODIFY_TIME),
359        start_sector: le_u32(raw, k::START_SECTOR),
360        stream_size: le_u64(raw, k::STREAM_SIZE),
361    }
362}
363
364/// Lossily decode an entry's UTF-16LE name, honoring the declared byte length
365/// (which includes the terminating NUL) but never trusting it past 64 bytes.
366fn decode_entry_name(raw: &[u8], declared_len: usize) -> String {
367    // `declared_len` counts bytes including the trailing UTF-16 NUL; clamp to the
368    // 64-byte field and drop the terminator before decoding.
369    let byte_len = declared_len.min(k::NAME_LEN);
370    // The declared length includes the terminating UTF-16 NUL; drop it (a no-op
371    // when the name is empty).
372    let chars = (byte_len / 2).saturating_sub(1);
373    let mut units: Vec<u16> = Vec::with_capacity(chars);
374    for i in 0..chars {
375        units.push(le_u16(raw, k::NAME + i * 2));
376    }
377    String::from_utf16_lossy(&units)
378}
379
380/// Walk the live red-black directory tree from the root entry, returning the set
381/// of SIDs reachable through child/left/right pointers. Mirrors what the `cfb`
382/// crate exposes; everything allocated-but-unreached is an orphan.
383///
384/// Loop-guarded by a visited set, so a corrupt sibling cycle terminates.
385#[must_use]
386pub fn reachable_sids(entries: &[DirEntry]) -> Vec<bool> {
387    let mut reachable = vec![false; entries.len()];
388    if entries.is_empty() {
389        return reachable;
390    }
391    // The root storage is always entry 0 (`[MS-CFB]` §2.6.2).
392    let mut stack: Vec<u32> = vec![0];
393    let mut steps = 0usize;
394    while let Some(sid) = stack.pop() {
395        steps += 1;
396        if steps > MAX_DIR_ENTRIES {
397            break; // cov:unreachable: each SID is marked once, so steps <= entries.len()
398        }
399        let idx = sid as usize;
400        let Some(slot) = reachable.get_mut(idx) else {
401            continue;
402        };
403        if *slot {
404            continue;
405        }
406        *slot = true;
407        let Some(entry) = entries.get(idx) else {
408            continue; // cov:unreachable: reachable and entries share a length
409        };
410        for next in [entry.child, entry.left, entry.right] {
411            if next <= k::MAXREGSECT && (next as usize) < entries.len() {
412                stack.push(next);
413            }
414        }
415    }
416    reachable
417}