Skip to main content

mbr_forensic/
carve.rs

1//! File-signature carving and string extraction over raw byte regions.
2//!
3//! Unpartitioned gaps and slack retain remnants of deleted files — leftover
4//! data with direct forensic implications. [`carve`] recovers file *headers* by
5//! magic bytes; [`extract_ascii_strings`] surfaces embedded paths, URLs, and
6//! notes. Both are pure functions over a caller-supplied slice, so a caller can
7//! apply them to any region (a gap window, EBR slack, the whole disk).
8//!
9//! Carving reports header *locations*, not full file boundaries — it identifies
10//! that a recoverable artifact begins at an offset, which is the forensic signal
11//! for unallocated space. Magics are kept ≥ 3 bytes to bound false positives.
12
/// One signature hit: the kind of file whose header was recognised and where
/// that header starts.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct CarvedFile {
    /// Label of the matched signature, such as `"PNG"`, `"ZIP"`, or `"PDF"`.
    pub kind: &'static str,
    /// Absolute byte offset of the header's first byte: the caller-supplied
    /// `base_offset` plus the match position inside the scanned slice.
    pub offset: u64,
}
22
/// A file-type magic signature: a `kind` label and the leading `magic` bytes.
///
/// Two signatures compare equal when both the label and the byte pattern are
/// equal, so entries can be checked with `==` / `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct FileMagic {
    /// Short type label reported for a match, e.g. `"PNG"`, `"ZIP"`, `"PDF"`.
    pub kind: &'static str,
    /// Exact byte sequence expected at the very start of a file of this type.
    pub magic: &'static [u8],
}
30
31/// Curated table of well-known file-header magics (all ≥ 3 bytes).
32pub const FILE_MAGICS: &[FileMagic] = &[
33    FileMagic {
34        kind: "ZIP",
35        magic: b"PK\x03\x04",
36    },
37    FileMagic {
38        kind: "PDF",
39        magic: b"%PDF-",
40    },
41    FileMagic {
42        kind: "PNG",
43        magic: b"\x89PNG\r\n\x1a\n",
44    },
45    FileMagic {
46        kind: "JPEG",
47        magic: b"\xFF\xD8\xFF",
48    },
49    FileMagic {
50        kind: "GIF",
51        magic: b"GIF87a",
52    },
53    FileMagic {
54        kind: "GIF",
55        magic: b"GIF89a",
56    },
57    FileMagic {
58        kind: "BZIP2",
59        magic: b"BZh",
60    },
61    FileMagic {
62        kind: "7Z",
63        magic: b"7z\xBC\xAF\x27\x1C",
64    },
65    FileMagic {
66        kind: "RAR",
67        magic: b"Rar!\x1A\x07",
68    },
69    FileMagic {
70        kind: "XZ",
71        magic: b"\xFD7zXZ\x00",
72    },
73    FileMagic {
74        kind: "ELF",
75        magic: b"\x7FELF",
76    },
77    FileMagic {
78        kind: "RIFF",
79        magic: b"RIFF",
80    },
81    FileMagic {
82        kind: "SQLite",
83        magic: b"SQLite format 3\x00",
84    },
85    FileMagic {
86        kind: "OLE",
87        magic: b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",
88    },
89    FileMagic {
90        kind: "CAB",
91        magic: b"MSCF",
92    },
93];
94
95/// Carve `data` for every known file-header magic, returning each match's type
96/// and **absolute** offset (`base_offset` + position within `data`).
97///
98/// `base_offset` is the absolute disk byte offset that `data[0]` came from, so
99/// the returned offsets are directly usable as disk locations.
100#[must_use]
101pub fn carve(data: &[u8], base_offset: u64) -> Vec<CarvedFile> {
102    let mut out = Vec::new();
103    for sig in FILE_MAGICS {
104        let m = sig.magic;
105        if m.is_empty() || m.len() > data.len() {
106            continue;
107        }
108        for (i, window) in data.windows(m.len()).enumerate() {
109            if window == m {
110                out.push(CarvedFile {
111                    kind: sig.kind,
112                    offset: base_offset + i as u64,
113                });
114            }
115        }
116    }
117    out.sort_by_key(|c| c.offset);
118    out
119}
120
/// First byte of the printable ASCII range: the space character (`0x20`).
const ASCII_MIN: u8 = b' ';
/// Last byte of the printable ASCII range: the tilde character (`0x7E`).
const ASCII_MAX: u8 = b'~';
125
126/// Extract runs of printable ASCII (`0x20`–`0x7E`) at least `min_len` bytes long.
127///
128/// The classic `strings(1)` behaviour: surfaces paths, URLs, banners, and notes
129/// left in unallocated space. Non-printable bytes terminate a run.
130#[must_use]
131pub fn extract_ascii_strings(data: &[u8], min_len: usize) -> Vec<String> {
132    let mut out = Vec::new();
133    let mut run: Vec<u8> = Vec::new();
134    for &b in data {
135        if (ASCII_MIN..=ASCII_MAX).contains(&b) {
136            run.push(b);
137        } else {
138            flush(&mut run, min_len, &mut out);
139        }
140    }
141    flush(&mut run, min_len, &mut out);
142    out
143}
144
/// Emit the accumulated run as a string if it is non-empty and meets `min_len`,
/// then clear it.
///
/// `run` is always left empty so the caller can reuse the buffer for the next
/// run. An empty run is never emitted, even when `min_len` is 0; otherwise
/// every run boundary would contribute an empty string to `out`.
fn flush(run: &mut Vec<u8>, min_len: usize, out: &mut Vec<String>) {
    if !run.is_empty() && run.len() >= min_len {
        // The caller only accumulates printable ASCII, which is valid UTF-8,
        // so the lossy conversion never has to substitute anything.
        out.push(String::from_utf8_lossy(run).into_owned());
    }
    run.clear();
}