// shadowforge_lib/adapters/corpus.rs

//! Adapter implementing the [`CorpusIndex`] port for zero-modification
//! steganographic cover selection.
3
4use std::cell::RefCell;
5use std::collections::HashMap;
6use std::path::Path;
7
8use sha2::{Digest, Sha256};
9
10use crate::domain::corpus;
11use crate::domain::errors::CorpusError;
12use crate::domain::ports::CorpusIndex;
13use crate::domain::types::{CorpusEntry, CoverMediaKind, Payload, SpectralKey, StegoTechnique};
14
/// In-memory corpus index backed by a `HashMap<file_hash, CorpusEntry>`.
///
/// Search uses a linear scan with Hamming distance — sufficient for corpora
/// up to ~100 K images. Interior mutability via [`RefCell`] keeps the port
/// trait's `&self` receiver while allowing mutation during `add_to_index`
/// and `build_index`.
///
/// Because both maps sit behind a [`RefCell`], the index is not `Sync`, and
/// borrow rules are enforced at run time rather than at compile time.
pub struct CorpusIndexImpl {
    /// Indexed entries keyed by their `file_hash` (a SHA-256 digest of the
    /// file bytes for entries created through `add_to_index`).
    entries: RefCell<HashMap<[u8; 32], CorpusEntry>>,
    /// Spectral key → file hashes for model-aware search.
    spectral_index: RefCell<HashMap<SpectralKey, Vec<[u8; 32]>>>,
}
26
27impl CorpusIndexImpl {
28    /// Create an empty corpus index.
29    #[must_use]
30    pub fn new() -> Self {
31        Self {
32            entries: RefCell::new(HashMap::new()),
33            spectral_index: RefCell::new(HashMap::new()),
34        }
35    }
36
37    /// Return the number of entries currently in the index.
38    #[must_use]
39    pub fn len(&self) -> usize {
40        self.entries.borrow().len()
41    }
42
43    /// Return `true` if the index contains no entries.
44    #[must_use]
45    pub fn is_empty(&self) -> bool {
46        self.entries.borrow().is_empty()
47    }
48}
49
/// The default index is the empty index produced by [`CorpusIndexImpl::new`].
impl Default for CorpusIndexImpl {
    /// Delegate to [`CorpusIndexImpl::new`].
    fn default() -> Self {
        Self::new()
    }
}
55
56/// Detect the cover media kind from a file extension.
57fn kind_from_extension(path: &Path) -> Option<CoverMediaKind> {
58    let ext = path.extension()?.to_str()?.to_lowercase();
59    match ext.as_str() {
60        "png" => Some(CoverMediaKind::PngImage),
61        "bmp" => Some(CoverMediaKind::BmpImage),
62        "jpg" | "jpeg" => Some(CoverMediaKind::JpegImage),
63        "gif" => Some(CoverMediaKind::GifImage),
64        "wav" => Some(CoverMediaKind::WavAudio),
65        _ => None,
66    }
67}
68
69impl CorpusIndex for CorpusIndexImpl {
70    fn search(
71        &self,
72        payload: &Payload,
73        _technique: StegoTechnique,
74        max_results: usize,
75    ) -> Result<Vec<CorpusEntry>, CorpusError> {
76        let entries = self.entries.borrow();
77        if entries.is_empty() {
78            return Err(CorpusError::NoSuitableCover {
79                payload_bytes: payload.len() as u64,
80            });
81        }
82
83        let payload_pattern = corpus::payload_to_bit_pattern(payload.as_bytes(), None);
84
85        // Score each entry by Hamming distance and collect results
86        let mut scored: Vec<(u64, CorpusEntry)> = entries
87            .values()
88            .map(|entry| {
89                let dist = corpus::score_match(&entry.precomputed_bit_pattern, &payload_pattern);
90                (dist, entry.clone())
91            })
92            .collect();
93
94        scored.sort_by_key(|(dist, _)| *dist);
95        scored.truncate(max_results);
96
97        if scored.is_empty() {
98            return Err(CorpusError::NoSuitableCover {
99                payload_bytes: payload.len() as u64,
100            });
101        }
102
103        Ok(scored.into_iter().map(|(_, entry)| entry).collect())
104    }
105
106    fn add_to_index(&self, path: &Path) -> Result<CorpusEntry, CorpusError> {
107        let cover_kind = kind_from_extension(path).ok_or_else(|| CorpusError::AddFailed {
108            path: path.display().to_string(),
109            reason: "unsupported file extension".into(),
110        })?;
111
112        let data = std::fs::read(path).map_err(|e| CorpusError::AddFailed {
113            path: path.display().to_string(),
114            reason: e.to_string(),
115        })?;
116
117        let file_hash: [u8; 32] = Sha256::digest(&data).into();
118        let bit_pattern = corpus::extract_lsb_pattern(&data);
119
120        let entry = CorpusEntry {
121            file_hash,
122            path: path.display().to_string(),
123            cover_kind,
124            precomputed_bit_pattern: bit_pattern,
125            spectral_key: None,
126        };
127
128        self.entries.borrow_mut().insert(file_hash, entry.clone());
129        // Spectral key not populated during base indexing (no image decode here).
130        // Callers that have spectral info may insert entries via `add_entry_with_key`.
131        Ok(entry)
132    }
133
134    fn build_index(&self, corpus_dir: &Path) -> Result<usize, CorpusError> {
135        if !corpus_dir.is_dir() {
136            return Err(CorpusError::IndexError {
137                reason: format!("{} is not a directory", corpus_dir.display()),
138            });
139        }
140
141        let mut count = 0usize;
142        let entries = std::fs::read_dir(corpus_dir).map_err(|e| CorpusError::IndexError {
143            reason: e.to_string(),
144        })?;
145
146        for entry in entries {
147            let entry = entry.map_err(|e| CorpusError::IndexError {
148                reason: e.to_string(),
149            })?;
150            let path = entry.path();
151            if path.is_file()
152                && kind_from_extension(&path).is_some()
153                && self.add_to_index(&path).is_ok()
154            {
155                count = count.strict_add(1);
156            }
157        }
158
159        Ok(count)
160    }
161
162    fn search_for_model(
163        &self,
164        payload: &Payload,
165        model_id: &str,
166        resolution: (u32, u32),
167        max_results: usize,
168    ) -> Result<Vec<CorpusEntry>, CorpusError> {
169        let key = SpectralKey {
170            model_id: model_id.to_string(),
171            resolution,
172        };
173        let spectral_index = self.spectral_index.borrow();
174        let hashes = spectral_index.get(&key).map_or(&[][..], Vec::as_slice);
175        if hashes.is_empty() {
176            return Err(CorpusError::NoSuitableCover {
177                payload_bytes: payload.len() as u64,
178            });
179        }
180
181        let entries = self.entries.borrow();
182        let payload_pattern = corpus::payload_to_bit_pattern(payload.as_bytes(), None);
183        let mut scored: Vec<(u64, CorpusEntry)> = hashes
184            .iter()
185            .filter_map(|h| entries.get(h))
186            .map(|entry| {
187                let dist = corpus::score_match(&entry.precomputed_bit_pattern, &payload_pattern);
188                (dist, entry.clone())
189            })
190            .collect();
191
192        scored.sort_by_key(|(dist, _)| *dist);
193        scored.truncate(max_results);
194
195        if scored.is_empty() {
196            return Err(CorpusError::NoSuitableCover {
197                payload_bytes: payload.len() as u64,
198            });
199        }
200
201        Ok(scored.into_iter().map(|(_, e)| e).collect())
202    }
203
204    fn model_stats(&self) -> Vec<(SpectralKey, usize)> {
205        let spectral_index = self.spectral_index.borrow();
206        let mut stats: Vec<(SpectralKey, usize)> = spectral_index
207            .iter()
208            .map(|(k, v)| (k.clone(), v.len()))
209            .collect();
210        stats.sort_by(|a, b| a.0.model_id.cmp(&b.0.model_id));
211        stats
212    }
213}
214
215impl CorpusIndexImpl {
216    /// Insert a pre-built [`CorpusEntry`] that already carries a
217    /// [`SpectralKey`].  Used by higher-level pipelines that have already
218    /// decoded the image and run spectral analysis.
219    pub fn add_entry_with_key(&self, entry: CorpusEntry) {
220        if let Some(ref key) = entry.spectral_key {
221            self.spectral_index
222                .borrow_mut()
223                .entry(key.clone())
224                .or_default()
225                .push(entry.file_hash);
226        }
227        self.entries.borrow_mut().insert(entry.file_hash, entry);
228    }
229}
230
#[cfg(test)]
mod tests {
    use std::io::Write;

    use super::*;

    /// Result alias that lets test bodies propagate errors with `?`.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Assemble the bytes of a minimal 1×1, 24-bit uncompressed BMP whose
    /// only pixel carries the given RGB colour.
    fn make_test_bmp(pixel_rgb: [u8; 3]) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
        let [red, green, blue] = pixel_rgb;
        let mut image = Vec::new();

        // File header, 14 bytes.
        let total_len: u32 = 14 + 40 + 4; // file header + DIB header + one padded pixel row
        image.write_all(b"BM")?;
        image.write_all(&total_len.to_le_bytes())?;
        image.write_all(&0u32.to_le_bytes())?; // reserved
        image.write_all(&54u32.to_le_bytes())?; // offset of the pixel data

        // BITMAPINFOHEADER, 40 bytes.
        image.write_all(&40u32.to_le_bytes())?; // size of this header
        image.write_all(&1i32.to_le_bytes())?; // width in pixels
        image.write_all(&1i32.to_le_bytes())?; // height in pixels
        image.write_all(&1u16.to_le_bytes())?; // colour planes
        image.write_all(&24u16.to_le_bytes())?; // bits per pixel
        image.write_all(&0u32.to_le_bytes())?; // compression: none
        image.write_all(&4u32.to_le_bytes())?; // pixel data size (one padded row)
        image.write_all(&2835i32.to_le_bytes())?; // horizontal resolution
        image.write_all(&2835i32.to_le_bytes())?; // vertical resolution
        image.write_all(&0u32.to_le_bytes())?; // palette colours
        image.write_all(&0u32.to_le_bytes())?; // important colours

        // The single pixel in BGR order, plus one byte padding the row to 4.
        image.write_all(&[blue, green, red, 0])?;

        Ok(image)
    }

    #[test]
    fn build_index_counts_files() -> TestResult {
        let dir = tempfile::tempdir()?;
        for idx in 0u8..5 {
            let file = dir.path().join(format!("img_{idx}.bmp"));
            let bytes = make_test_bmp([idx * 50, 0, 0])?;
            std::fs::write(&file, bytes)?;
        }

        let index = CorpusIndexImpl::new();
        let indexed = index.build_index(dir.path())?;

        assert_eq!(indexed, 5);
        assert_eq!(index.len(), 5);
        Ok(())
    }

    #[test]
    fn build_index_skips_non_image_files() -> TestResult {
        let dir = tempfile::tempdir()?;
        let root = dir.path();
        std::fs::write(root.join("readme.txt"), b"hello")?;
        std::fs::write(root.join("img.bmp"), make_test_bmp([0, 0, 0])?)?;

        let index = CorpusIndexImpl::new();
        assert_eq!(index.build_index(root)?, 1);
        Ok(())
    }

    #[test]
    fn search_returns_exact_match_first() -> TestResult {
        let dir = tempfile::tempdir()?;

        // The cover we expect to win, plus a second, different image.
        let target_data = make_test_bmp([0xFF, 0xFF, 0xFF])?;
        std::fs::write(dir.path().join("target.bmp"), &target_data)?;
        std::fs::write(dir.path().join("other.bmp"), make_test_bmp([0, 0, 0])?)?;

        let index = CorpusIndexImpl::new();
        index.build_index(dir.path())?;

        // Use the target's own stored bit pattern as the payload bytes.
        let target_hash: [u8; 32] = Sha256::digest(&target_data).into();
        let indexed = index.entries.borrow();
        let pattern_bytes = indexed
            .get(&target_hash)
            .ok_or("target hash not found in index")?
            .precomputed_bit_pattern
            .to_vec();
        drop(indexed);
        let payload = Payload::from_bytes(pattern_bytes);

        let results = index.search(&payload, StegoTechnique::LsbImage, 5)?;
        assert!(!results.is_empty());

        // The best-ranked result must be the target itself.
        let best = results.first().ok_or("no search results")?;
        assert_eq!(best.file_hash, target_hash);
        Ok(())
    }

    #[test]
    fn search_empty_index_returns_error() {
        let index = CorpusIndexImpl::new();
        let payload = Payload::from_bytes(vec![0x42]);

        let outcome = index.search(&payload, StegoTechnique::LsbImage, 5);
        assert!(outcome.is_err());
    }

    #[test]
    fn add_to_index_rejects_unsupported_extension() -> TestResult {
        let dir = tempfile::tempdir()?;
        let text_file = dir.path().join("readme.txt");
        std::fs::write(&text_file, b"not an image")?;

        let index = CorpusIndexImpl::new();
        let outcome = index.add_to_index(&text_file);
        assert!(outcome.is_err());
        Ok(())
    }

    #[test]
    fn build_index_rejects_non_directory() -> TestResult {
        let plain_file = tempfile::NamedTempFile::new()?;
        let index = CorpusIndexImpl::new();

        assert!(index.build_index(plain_file.path()).is_err());
        Ok(())
    }

    #[test]
    fn default_impl() {
        assert!(CorpusIndexImpl::default().is_empty());
    }
}
365}