mbr_forensic/carve.rs
1//! File-signature carving and string extraction over raw byte regions.
2//!
3//! Unpartitioned gaps and slack retain remnants of deleted files — leftover
4//! data with direct forensic implications. [`carve`] recovers file *headers* by
5//! magic bytes; [`extract_ascii_strings`] surfaces embedded paths, URLs, and
6//! notes. Both are pure functions over a caller-supplied slice, so a caller can
7//! apply them to any region (a gap window, EBR slack, the whole disk).
8//!
9//! Carving reports header *locations*, not full file boundaries — it identifies
10//! that a recoverable artifact begins at an offset, which is the forensic signal
11//! for unallocated space. Magics are kept ≥ 3 bytes to bound false positives.
12
/// One file-header hit produced by signature carving.
///
/// Only the start of the artifact is recorded; no attempt is made to determine
/// where the file ends.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct CarvedFile {
    /// Short label naming the detected file type, e.g. `"PNG"`, `"ZIP"`, `"PDF"`.
    pub kind: &'static str,
    /// Absolute byte offset at which the header starts (the caller's
    /// `base_offset` plus the position inside the scanned slice).
    pub offset: u64,
}
22
/// A file-type magic signature: a `kind` label and the leading `magic` bytes.
///
/// Values are plain `'static` data, so they are `Copy` and can be compared
/// with `==` (useful when asserting on or de-duplicating signature tables).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct FileMagic {
    /// Short type label attached to every match of this signature,
    /// e.g. `"PNG"`, `"ZIP"`, `"PDF"`.
    pub kind: &'static str,
    /// Exact byte sequence that a file of this type begins with.
    pub magic: &'static [u8],
}
30
31/// Curated table of well-known file-header magics (all ≥ 3 bytes).
32pub const FILE_MAGICS: &[FileMagic] = &[
33 FileMagic {
34 kind: "ZIP",
35 magic: b"PK\x03\x04",
36 },
37 FileMagic {
38 kind: "PDF",
39 magic: b"%PDF-",
40 },
41 FileMagic {
42 kind: "PNG",
43 magic: b"\x89PNG\r\n\x1a\n",
44 },
45 FileMagic {
46 kind: "JPEG",
47 magic: b"\xFF\xD8\xFF",
48 },
49 FileMagic {
50 kind: "GIF",
51 magic: b"GIF87a",
52 },
53 FileMagic {
54 kind: "GIF",
55 magic: b"GIF89a",
56 },
57 FileMagic {
58 kind: "BZIP2",
59 magic: b"BZh",
60 },
61 FileMagic {
62 kind: "7Z",
63 magic: b"7z\xBC\xAF\x27\x1C",
64 },
65 FileMagic {
66 kind: "RAR",
67 magic: b"Rar!\x1A\x07",
68 },
69 FileMagic {
70 kind: "XZ",
71 magic: b"\xFD7zXZ\x00",
72 },
73 FileMagic {
74 kind: "ELF",
75 magic: b"\x7FELF",
76 },
77 FileMagic {
78 kind: "RIFF",
79 magic: b"RIFF",
80 },
81 FileMagic {
82 kind: "SQLite",
83 magic: b"SQLite format 3\x00",
84 },
85 FileMagic {
86 kind: "OLE",
87 magic: b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",
88 },
89 FileMagic {
90 kind: "CAB",
91 magic: b"MSCF",
92 },
93];
94
95/// Carve `data` for every known file-header magic, returning each match's type
96/// and **absolute** offset (`base_offset` + position within `data`).
97///
98/// `base_offset` is the absolute disk byte offset that `data[0]` came from, so
99/// the returned offsets are directly usable as disk locations.
100#[must_use]
101pub fn carve(data: &[u8], base_offset: u64) -> Vec<CarvedFile> {
102 let mut out = Vec::new();
103 for sig in FILE_MAGICS {
104 let m = sig.magic;
105 if m.is_empty() || m.len() > data.len() {
106 continue;
107 }
108 for (i, window) in data.windows(m.len()).enumerate() {
109 if window == m {
110 out.push(CarvedFile {
111 kind: sig.kind,
112 offset: base_offset + i as u64,
113 });
114 }
115 }
116 }
117 out.sort_by_key(|c| c.offset);
118 out
119}
120
/// Smallest byte counted as printable ASCII: the space character (`0x20`).
const ASCII_MIN: u8 = b' ';
/// Largest byte counted as printable ASCII: the tilde, `~` (`0x7E`).
const ASCII_MAX: u8 = b'~';
125
126/// Extract runs of printable ASCII (`0x20`–`0x7E`) at least `min_len` bytes long.
127///
128/// The classic `strings(1)` behaviour: surfaces paths, URLs, banners, and notes
129/// left in unallocated space. Non-printable bytes terminate a run.
130#[must_use]
131pub fn extract_ascii_strings(data: &[u8], min_len: usize) -> Vec<String> {
132 let mut out = Vec::new();
133 let mut run: Vec<u8> = Vec::new();
134 for &b in data {
135 if (ASCII_MIN..=ASCII_MAX).contains(&b) {
136 run.push(b);
137 } else {
138 flush(&mut run, min_len, &mut out);
139 }
140 }
141 flush(&mut run, min_len, &mut out);
142 out
143}
144
/// Emit the accumulated run as a string if it meets `min_len`, then clear it.
///
/// An empty run is never emitted, even when `min_len` is `0`: otherwise every
/// non-printable byte in the input would contribute an empty string.
///
/// `run` is always left empty on return so the buffer can be reused for the
/// next run.
fn flush(run: &mut Vec<u8>, min_len: usize, out: &mut Vec<String>) {
    if !run.is_empty() && run.len() >= min_len {
        // The caller only accumulates printable ASCII, so the lossy conversion
        // never has to substitute anything; it is used because it cannot fail.
        out.push(String::from_utf8_lossy(run).into_owned());
    }
    run.clear();
}