Skip to main content

chat4n6_sqlite_forensics/
unalloc.rs

1use crate::record::{decode_serial_type, RecoveredRecord, SqlValue};
2use crate::varint::read_varint;
3use chat4n6_plugin_api::{EvidenceSource, UnallocatedRegion};
4use std::collections::HashMap;
5
6// ── Signature database ────────────────────────────────────────────────────────
7
/// A learned column pattern from live records.
///
/// One pattern describes a single column of a single table: the serial type
/// that was seen most often for that column while learning.
#[derive(Debug, Clone)]
pub struct ColumnPattern {
    /// Name of the table the column belongs to.
    pub table: String,
    /// Zero-based position of the column within a record.
    pub col_index: usize,
    /// Most common serial type seen for this column in live records.
    pub serial_type_hint: u64,
}

/// Database of column patterns learned from live records.
///
/// `Default` yields an empty database (no patterns for any table).
#[derive(Debug, Clone, Default)]
pub struct SignatureDb {
    patterns: Vec<ColumnPattern>,
}

impl SignatureDb {
    /// Return patterns for a given table, sorted by col_index.
    ///
    /// Table names are compared exactly; a table without any learned pattern
    /// yields an empty vector.
    pub fn patterns_for(&self, table: &str) -> Vec<&ColumnPattern> {
        let mut matching: Vec<&ColumnPattern> = self
            .patterns
            .iter()
            .filter(|pattern| pattern.table == table)
            .collect();
        // Stable sort: patterns sharing a col_index keep their stored order.
        matching.sort_by_key(|pattern| pattern.col_index);
        matching
    }
}
31
32// ── learn_signatures ─────────────────────────────────────────────────────────
33
34/// Analyse live records to learn the most common serial type per (table, col_index).
35pub fn learn_signatures(records: &[RecoveredRecord]) -> SignatureDb {
36    // (table, col_index) -> {serial_type -> count}
37    let mut counts: HashMap<(String, usize), HashMap<u64, usize>> = HashMap::new();
38
39    for rec in records {
40        for (col_index, value) in rec.values.iter().enumerate() {
41            let st = value_to_serial_type(value);
42            *counts
43                .entry((rec.table.clone(), col_index))
44                .or_default()
45                .entry(st)
46                .or_insert(0) += 1;
47        }
48    }
49
50    let mut patterns: Vec<ColumnPattern> = counts
51        .into_iter()
52        .map(|((table, col_index), type_counts)| {
53            let serial_type_hint = type_counts
54                .into_iter()
55                .max_by_key(|&(_, c)| c)
56                .map(|(st, _)| st)
57                .unwrap_or(0);
58            ColumnPattern {
59                table,
60                col_index,
61                serial_type_hint,
62            }
63        })
64        .collect();
65
66    // Deterministic order for tests
67    patterns.sort_by(|a, b| a.table.cmp(&b.table).then(a.col_index.cmp(&b.col_index)));
68
69    SignatureDb { patterns }
70}
71
72/// Map a `SqlValue` to an approximate SQLite serial type for signature matching.
73fn value_to_serial_type(value: &SqlValue) -> u64 {
74    match value {
75        SqlValue::Null => 0,
76        SqlValue::Int(v) => {
77            // Pick the smallest serial type that fits
78            if *v == 0 || *v == 1 {
79                8 // special single-byte literal 0/1 — approximate with 1-byte int
80            } else if (-128..=127).contains(v) {
81                1
82            } else if (-32768..=32767).contains(v) {
83                2
84            } else if (-8388608..=8388607).contains(v) {
85                3
86            } else if (-2147483648..=2147483647).contains(v) {
87                4
88            } else {
89                6
90            }
91        }
92        SqlValue::Real(_) => 7,
93        SqlValue::Text(s) => {
94            let len = s.len() as u64;
95            13 + len * 2
96        }
97        SqlValue::Blob(b) => {
98            let len = b.len() as u64;
99            12 + len * 2
100        }
101    }
102}
103
104// ── carve_unallocated ─────────────────────────────────────────────────────────
105
106/// Scan an unallocated region byte-by-byte looking for SQLite record patterns that
107/// match the learned signatures for `table_hint`.
108pub fn carve_unallocated(
109    region: &UnallocatedRegion,
110    sig_db: &SignatureDb,
111    table_hint: &str,
112) -> Vec<RecoveredRecord> {
113    let data = &region.data;
114    let abs_base = region.offset;
115    let patterns = sig_db.patterns_for(table_hint);
116
117    let mut results = Vec::new();
118
119    if data.is_empty() || patterns.is_empty() {
120        // No signatures to match against — still try a plain structural scan
121        // when there are no learned patterns; skip the region in that case.
122        return results;
123    }
124
125    let col_count = patterns.iter().map(|p| p.col_index + 1).max().unwrap_or(0);
126
127    if col_count == 0 {
128        return results;
129    }
130
131    // Minimum bytes: 1 (header_len) + col_count (serial types, 1 byte each minimum) + 0 data
132    let min_len = 1 + col_count;
133
134    let mut pos = 0;
135    while pos + min_len <= data.len() {
136        if let Some((record, consumed)) = try_parse_record(
137            &data[pos..],
138            abs_base + pos as u64,
139            table_hint,
140            col_count,
141            &patterns,
142        ) {
143            results.push(record);
144            pos += consumed;
145        } else {
146            pos += 1;
147        }
148    }
149
150    results
151}
152
153/// Try to parse a single SQLite record at the start of `data`.
154/// Returns `(RecoveredRecord, bytes_consumed)` on success.
155fn try_parse_record(
156    data: &[u8],
157    abs_offset: u64,
158    table: &str,
159    col_count: usize,
160    patterns: &[&ColumnPattern],
161) -> Option<(RecoveredRecord, usize)> {
162    // Read header_len varint
163    let (header_len, hl_consumed) = read_varint(data, 0)?;
164    let header_end = header_len as usize;
165
166    // Sanity checks: header must be plausible
167    // header_end includes the header_len varint itself
168    if header_end < hl_consumed || header_end > data.len() || header_end > 512
169    // guard against huge garbage headers
170    {
171        return None;
172    }
173
174    // Read serial types from header
175    let mut pos = hl_consumed;
176    let mut serial_types: Vec<u64> = Vec::with_capacity(col_count);
177    while pos < header_end {
178        let (st, consumed) = read_varint(data, pos)?;
179        serial_types.push(st);
180        pos += consumed;
181        if serial_types.len() >= col_count {
182            break;
183        }
184    }
185
186    if serial_types.is_empty() {
187        return None;
188    }
189
190    // Validate that at least 1 column serial type matches the signature
191    let matched = serial_types
192        .iter()
193        .enumerate()
194        .filter(|&(i, &st)| {
195            patterns
196                .iter()
197                .any(|p| p.col_index == i && serial_types_compatible(st, p.serial_type_hint))
198        })
199        .count();
200
201    if matched == 0 {
202        return None;
203    }
204
205    // Decode values — return None on any decode failure to avoid corrupting
206    // the consumed-byte count (which would mis-advance the scanner).
207    let mut data_pos = header_end;
208    let mut values: Vec<SqlValue> = Vec::with_capacity(serial_types.len());
209    for &st in &serial_types {
210        match decode_serial_type(st, data, data_pos) {
211            Some((val, consumed)) => {
212                data_pos += consumed;
213                values.push(val);
214            }
215            None => return None,
216        }
217    }
218
219    let total_cols = serial_types.len().max(col_count);
220    let confidence_pct = ((matched * 100) / total_cols.max(1)).min(100) as u8;
221    let confidence = confidence_pct as f32 / 100.0;
222
223    Some((
224        RecoveredRecord {
225            table: table.to_string(),
226            row_id: None,
227            values,
228            source: EvidenceSource::CarvedUnalloc { confidence_pct },
229            offset: abs_offset,
230            confidence,
231        },
232        data_pos,
233    ))
234}
235
/// Decide whether a serial type found in carved data is "compatible" with the
/// learned hint.
///
/// Identical serial types are always compatible. Otherwise both must fall
/// into the same broad class: integer (1-6 fixed-size, 8-9 literal 0/1),
/// real (7), text (odd, >= 13) or blob (even, >= 12). Serial types outside
/// those classes (0, 10, 11) are only compatible with themselves.
fn serial_types_compatible(found: u64, hint: u64) -> bool {
    #[derive(PartialEq)]
    enum Class {
        Integer,
        Real,
        Text,
        Blob,
    }

    // Broad class of a serial type, or `None` when it has no class.
    fn classify(serial_type: u64) -> Option<Class> {
        match serial_type {
            1..=6 | 8 | 9 => Some(Class::Integer),
            7 => Some(Class::Real),
            n if n >= 12 => Some(if n % 2 == 0 { Class::Blob } else { Class::Text }),
            _ => None,
        }
    }

    if found == hint {
        return true;
    }

    match (classify(found), classify(hint)) {
        (Some(found_class), Some(hint_class)) => found_class == hint_class,
        _ => false,
    }
}
262
263// ── recover_layer6 ────────────────────────────────────────────────────────────
264
265/// Layer 6: carve records from all unallocated regions.
266pub fn recover_layer6(
267    regions: &[UnallocatedRegion],
268    sig_db: &SignatureDb,
269    table_hint: &str,
270) -> Vec<RecoveredRecord> {
271    regions
272        .iter()
273        .flat_map(|r| carve_unallocated(r, sig_db, table_hint))
274        .collect()
275}
276
277// ── Tests ─────────────────────────────────────────────────────────────────────
278
#[cfg(test)]
mod tests {
    use super::*;
    use chat4n6_plugin_api::EvidenceSource;

    /// Build a live record for `table` holding `values`, with no row id,
    /// offset 0 and full confidence.
    fn make_record(table: &str, values: Vec<SqlValue>) -> RecoveredRecord {
        let table = String::from(table);
        RecoveredRecord {
            table,
            row_id: None,
            values,
            source: EvidenceSource::Live,
            offset: 0,
            confidence: 1.0,
        }
    }

    /// Wrap raw bytes in a region that starts at `offset`.
    fn region_at(offset: u64, data: Vec<u8>) -> UnallocatedRegion {
        UnallocatedRegion { offset, data }
    }

    // ── 1. test_learn_signatures_empty ────────────────────────────────────────

    #[test]
    fn test_learn_signatures_empty() {
        let SignatureDb { patterns } = learn_signatures(&[]);
        assert!(
            patterns.is_empty(),
            "Empty records should yield empty signature db"
        );
    }

    // ── 2. test_learn_signatures_basic ────────────────────────────────────────

    #[test]
    fn test_learn_signatures_basic() {
        // Three live rows: a small integer id followed by a short text body.
        let rows = [(1, "hello"), (2, "world"), (3, "foo")];
        let records: Vec<RecoveredRecord> = rows
            .iter()
            .map(|&(id, body)| {
                make_record(
                    "messages",
                    vec![SqlValue::Int(id), SqlValue::Text(body.into())],
                )
            })
            .collect();

        let db = learn_signatures(&records);
        let pats = db.patterns_for("messages");
        assert_eq!(
            pats.len(),
            2,
            "Should have 2 column patterns for 'messages'"
        );

        // Look up the pattern learned for one column.
        let pattern_at = |index: usize| pats.iter().find(|p| p.col_index == index);

        // col 0 holds integers, so the hint must be one of the integer serial types
        let col0 = pattern_at(0).expect("col 0 pattern");
        assert!(
            matches!(col0.serial_type_hint, 1..=6 | 8 | 9),
            "col 0 hint should be an int type, got {}",
            col0.serial_type_hint
        );

        // col 1 holds text, so the hint must be an odd serial type >= 13
        let col1 = pattern_at(1).expect("col 1 pattern");
        assert!(
            col1.serial_type_hint >= 13 && col1.serial_type_hint % 2 == 1,
            "col 1 hint should be text serial type, got {}",
            col1.serial_type_hint
        );
    }

    // ── 3. test_carve_unallocated_finds_record ────────────────────────────────

    #[test]
    fn test_carve_unallocated_finds_record() {
        // Teach the signature db that col 0 of "test_table" is an integer type.
        let sig_db = learn_signatures(&[make_record("test_table", vec![SqlValue::Int(99)])]);

        // Minimal SQLite record:
        //   0x02  header_len varint = 2 (covers itself + 1 serial type byte)
        //   0x01  serial type 1 = 1-byte int
        //   0x2a  value = 42
        let region = region_at(1000, vec![0x02, 0x01, 0x2a]);

        let found = carve_unallocated(&region, &sig_db, "test_table");
        assert!(!found.is_empty(), "Should find the embedded record");

        let first = &found[0];
        assert!(first.values.contains(&SqlValue::Int(42)));
        assert_eq!(first.offset, 1000);
        // Confidence should be > 0
        assert!(first.confidence > 0.0);
        // Source should be CarvedUnalloc
        assert!(matches!(
            first.source,
            EvidenceSource::CarvedUnalloc { confidence_pct } if confidence_pct > 0
        ));
    }

    // ── 4. test_carve_unallocated_empty_region ────────────────────────────────

    #[test]
    fn test_carve_unallocated_empty_region() {
        let sig_db = learn_signatures(&[make_record("test_table", vec![SqlValue::Int(1)])]);
        let region = region_at(0, Vec::new());

        let found = carve_unallocated(&region, &sig_db, "test_table");
        assert!(found.is_empty(), "Empty region should yield no results");
    }

    // ── 5. test_recover_layer6_combines_regions ───────────────────────────────

    #[test]
    fn test_recover_layer6_combines_regions() {
        let sig_db = learn_signatures(&[make_record("tbl", vec![SqlValue::Int(0)])]);

        // Two regions each containing one valid record
        let record_bytes: Vec<u8> = vec![0x02, 0x01, 0x07];
        let regions: Vec<UnallocatedRegion> = [0u64, 5000]
            .iter()
            .map(|&offset| region_at(offset, record_bytes.clone()))
            .collect();

        let results = recover_layer6(&regions, &sig_db, "tbl");
        assert_eq!(results.len(), 2, "Should combine records from both regions");
        // Offsets should differ
        assert_ne!(results[0].offset, results[1].offset);
    }
}