Skip to main content

microscope_memory/
fingerprint.rs

1//! Structural fingerprinting for Microscope Memory.
2//!
3//! Each block gets a structural fingerprint at index time:
4//! Shannon entropy + byte distribution hash. Blocks with similar
5//! fingerprints are linked in `links.bin` — creating "wormholes"
6//! between structurally similar but spatially distant blocks.
7//!
8//! Binary format: fingerprints.idx (FGP1), links.bin (LNK1)
9
10use std::collections::HashMap;
11use std::fs;
12use std::path::Path;
13
14// ─── Constants ──────────────────────────────────────
15
16/// Similarity threshold for creating a structural link (0..1).
17const LINK_THRESHOLD: f32 = 0.85;
18/// Maximum links per block.
19const MAX_LINKS_PER_BLOCK: usize = 8;
20/// Number of byte histogram buckets (compressed from 256).
21const HIST_BUCKETS: usize = 16;
22/// Fingerprint binary size: 4 (entropy f32) + 16 (histogram) + 8 (hash u64) = 28 bytes
23const FINGERPRINT_BYTES: usize = 28;
24
25// ─── Types ──────────────────────────────────────────
26
27/// Structural fingerprint for a single block.
28#[derive(Clone, Debug)]
29pub struct BlockFingerprint {
30    /// Shannon entropy of the block content (0.0 = uniform, ~8.0 = max random).
31    pub entropy: f32,
32    /// Compressed byte distribution (16 buckets, normalized to 0..255).
33    pub histogram: [u8; HIST_BUCKETS],
34    /// FNV-1a hash of the histogram for fast comparison.
35    pub hash: u64,
36}
37
38/// A structural link between two blocks.
39#[derive(Clone, Debug)]
40pub struct StructuralLink {
41    pub block_a: u32,
42    pub block_b: u32,
43    pub similarity: f32,
44}
45
46/// Link table — mmap-friendly storage of structural links.
47pub struct LinkTable {
48    pub fingerprints: Vec<BlockFingerprint>,
49    pub links: Vec<StructuralLink>,
50}
51
52impl LinkTable {
53    /// Build fingerprints and links from block data.
54    pub fn build(block_texts: &[&str]) -> Self {
55        let fingerprints: Vec<BlockFingerprint> = block_texts
56            .iter()
57            .map(|text| compute_fingerprint(text.as_bytes()))
58            .collect();
59
60        let links = find_links(&fingerprints);
61
62        LinkTable {
63            fingerprints,
64            links,
65        }
66    }
67
68    /// Get structural links for a specific block.
69    pub fn links_for(&self, block_idx: u32) -> Vec<&StructuralLink> {
70        self.links
71            .iter()
72            .filter(|l| l.block_a == block_idx || l.block_b == block_idx)
73            .collect()
74    }
75
76    /// Get the other end of a link from a given block.
77    pub fn linked_blocks(&self, block_idx: u32) -> Vec<(u32, f32)> {
78        self.links
79            .iter()
80            .filter_map(|l| {
81                if l.block_a == block_idx {
82                    Some((l.block_b, l.similarity))
83                } else if l.block_b == block_idx {
84                    Some((l.block_a, l.similarity))
85                } else {
86                    None
87                }
88            })
89            .collect()
90    }
91
92    /// Find the most structurally similar blocks to a query text.
93    pub fn find_similar(&self, text: &str, k: usize) -> Vec<(u32, f32)> {
94        let query_fp = compute_fingerprint(text.as_bytes());
95        let mut results: Vec<(u32, f32)> = self
96            .fingerprints
97            .iter()
98            .enumerate()
99            .map(|(i, fp)| (i as u32, fingerprint_similarity(&query_fp, fp)))
100            .filter(|(_, sim)| *sim > 0.5)
101            .collect();
102
103        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
104        results.truncate(k);
105        results
106    }
107
108    /// Statistics.
109    pub fn stats(&self) -> FingerprintStats {
110        let avg_entropy = if self.fingerprints.is_empty() {
111            0.0
112        } else {
113            self.fingerprints.iter().map(|f| f.entropy).sum::<f32>()
114                / self.fingerprints.len() as f32
115        };
116
117        // Count unique hash buckets
118        let mut hash_counts: HashMap<u64, usize> = HashMap::new();
119        for fp in &self.fingerprints {
120            *hash_counts.entry(fp.hash).or_insert(0) += 1;
121        }
122        let unique_hashes = hash_counts.len();
123        let largest_cluster = hash_counts.values().max().copied().unwrap_or(0);
124
125        FingerprintStats {
126            block_count: self.fingerprints.len(),
127            link_count: self.links.len(),
128            avg_entropy,
129            unique_hashes,
130            largest_cluster,
131        }
132    }
133
134    /// Save to disk.
135    pub fn save(&self, output_dir: &Path) -> Result<(), String> {
136        save_fingerprints(output_dir, &self.fingerprints)?;
137        save_links(output_dir, &self.links)?;
138        Ok(())
139    }
140
141    /// Load from disk.
142    pub fn load(output_dir: &Path) -> Option<Self> {
143        let fingerprints = load_fingerprints(output_dir)?;
144        let links = load_links(output_dir).unwrap_or_default();
145        Some(LinkTable {
146            fingerprints,
147            links,
148        })
149    }
150}
151
152pub struct FingerprintStats {
153    pub block_count: usize,
154    pub link_count: usize,
155    pub avg_entropy: f32,
156    pub unique_hashes: usize,
157    pub largest_cluster: usize,
158}
159
160// ─── Core algorithms ────────────────────────────────
161
162/// Compute the structural fingerprint of a block's content.
163pub fn compute_fingerprint(data: &[u8]) -> BlockFingerprint {
164    // Shannon entropy
165    let mut byte_counts = [0u32; 256];
166    for &b in data {
167        byte_counts[b as usize] += 1;
168    }
169    let len = data.len().max(1) as f32;
170
171    let entropy: f32 = byte_counts
172        .iter()
173        .filter(|&&c| c > 0)
174        .map(|&c| {
175            let p = c as f32 / len;
176            -p * p.log2()
177        })
178        .sum();
179
180    // Compressed histogram: 256 → 16 buckets
181    let mut histogram = [0u8; HIST_BUCKETS];
182    for (i, bucket) in histogram.iter_mut().enumerate() {
183        let start = i * 16;
184        let end = start + 16;
185        let sum: u32 = byte_counts[start..end].iter().sum();
186        // Normalize to 0..255
187        *bucket = ((sum as f32 / len) * 255.0).min(255.0) as u8;
188    }
189
190    // FNV-1a hash of histogram
191    let hash = fnv1a_hash(&histogram);
192
193    BlockFingerprint {
194        entropy,
195        histogram,
196        hash,
197    }
198}
199
200/// Compute similarity between two fingerprints (0.0..1.0).
201pub fn fingerprint_similarity(a: &BlockFingerprint, b: &BlockFingerprint) -> f32 {
202    // Fast path: if hashes differ wildly, skip detailed comparison
203    if a.hash != b.hash {
204        // Histogram cosine similarity
205        let mut dot = 0u32;
206        let mut norm_a = 0u32;
207        let mut norm_b = 0u32;
208        for i in 0..HIST_BUCKETS {
209            let va = a.histogram[i] as u32;
210            let vb = b.histogram[i] as u32;
211            dot += va * vb;
212            norm_a += va * va;
213            norm_b += vb * vb;
214        }
215
216        let denom = ((norm_a as f64).sqrt() * (norm_b as f64).sqrt()) as f32;
217        if denom < 1.0 {
218            return 0.0;
219        }
220        let cosine = dot as f32 / denom;
221
222        // Entropy similarity (penalize large entropy differences)
223        let entropy_diff = (a.entropy - b.entropy).abs();
224        let entropy_sim = 1.0 - (entropy_diff / 8.0).min(1.0);
225
226        cosine * 0.7 + entropy_sim * 0.3
227    } else {
228        // Same hash → very similar histograms, refine with entropy
229        let entropy_diff = (a.entropy - b.entropy).abs();
230        let entropy_sim = 1.0 - (entropy_diff / 8.0).min(1.0);
231        0.85 + entropy_sim * 0.15
232    }
233}
234
235/// Find structural links between fingerprints above threshold.
236fn find_links(fingerprints: &[BlockFingerprint]) -> Vec<StructuralLink> {
237    let mut links = Vec::new();
238    let n = fingerprints.len();
239
240    // Group by hash for O(n*k) instead of O(n²)
241    let mut hash_groups: HashMap<u64, Vec<usize>> = HashMap::new();
242    for (i, fp) in fingerprints.iter().enumerate() {
243        hash_groups.entry(fp.hash).or_default().push(i);
244    }
245
246    // Within each hash group, all pairs are candidates
247    for group in hash_groups.values() {
248        if group.len() < 2 {
249            continue;
250        }
251        for i in 0..group.len() {
252            let mut block_links = 0;
253            for j in (i + 1)..group.len() {
254                if block_links >= MAX_LINKS_PER_BLOCK {
255                    break;
256                }
257                let a = group[i];
258                let b = group[j];
259                let sim = fingerprint_similarity(&fingerprints[a], &fingerprints[b]);
260                if sim >= LINK_THRESHOLD {
261                    links.push(StructuralLink {
262                        block_a: a as u32,
263                        block_b: b as u32,
264                        similarity: sim,
265                    });
266                    block_links += 1;
267                }
268            }
269        }
270    }
271
272    // Also check near-miss hashes (different hash but potentially similar)
273    // Only for small block counts to avoid O(n²)
274    if n < 5000 {
275        for i in 0..n {
276            let mut block_links = links
277                .iter()
278                .filter(|l| l.block_a == i as u32 || l.block_b == i as u32)
279                .count();
280            if block_links >= MAX_LINKS_PER_BLOCK {
281                continue;
282            }
283
284            for j in (i + 1)..n {
285                if fingerprints[i].hash == fingerprints[j].hash {
286                    continue; // already checked
287                }
288                let sim = fingerprint_similarity(&fingerprints[i], &fingerprints[j]);
289                if sim >= LINK_THRESHOLD {
290                    links.push(StructuralLink {
291                        block_a: i as u32,
292                        block_b: j as u32,
293                        similarity: sim,
294                    });
295                    block_links += 1;
296                    if block_links >= MAX_LINKS_PER_BLOCK {
297                        break;
298                    }
299                }
300            }
301        }
302    }
303
304    links
305}
306
307fn fnv1a_hash(data: &[u8]) -> u64 {
308    let mut h: u64 = 0xcbf29ce484222325;
309    for &b in data {
310        h ^= b as u64;
311        h = h.wrapping_mul(0x100000001b3);
312    }
313    h
314}
315
316// ─── Binary I/O ─────────────────────────────────────
317
318fn save_fingerprints(output_dir: &Path, fps: &[BlockFingerprint]) -> Result<(), String> {
319    let path = output_dir.join("fingerprints.idx");
320    let mut buf = Vec::with_capacity(8 + fps.len() * FINGERPRINT_BYTES);
321    buf.extend_from_slice(b"FGP1");
322    buf.extend_from_slice(&(fps.len() as u32).to_le_bytes());
323    for fp in fps {
324        buf.extend_from_slice(&fp.entropy.to_le_bytes());
325        buf.extend_from_slice(&fp.histogram);
326        buf.extend_from_slice(&fp.hash.to_le_bytes());
327    }
328    fs::write(&path, &buf).map_err(|e| format!("write fingerprints.idx: {}", e))
329}
330
331fn load_fingerprints(output_dir: &Path) -> Option<Vec<BlockFingerprint>> {
332    let path = output_dir.join("fingerprints.idx");
333    let data = fs::read(&path).ok()?;
334    if data.len() < 8 || &data[0..4] != b"FGP1" {
335        return None;
336    }
337    let count = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
338    let mut fps = Vec::with_capacity(count);
339    let mut pos = 8;
340    for _ in 0..count {
341        if pos + FINGERPRINT_BYTES > data.len() {
342            break;
343        }
344        let entropy = f32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
345        let mut histogram = [0u8; HIST_BUCKETS];
346        histogram.copy_from_slice(&data[pos + 4..pos + 4 + HIST_BUCKETS]);
347        let hash = u64::from_le_bytes(data[pos + 20..pos + 28].try_into().unwrap());
348        fps.push(BlockFingerprint {
349            entropy,
350            histogram,
351            hash,
352        });
353        pos += FINGERPRINT_BYTES;
354    }
355    Some(fps)
356}
357
358fn save_links(output_dir: &Path, links: &[StructuralLink]) -> Result<(), String> {
359    let path = output_dir.join("links.bin");
360    let mut buf = Vec::with_capacity(8 + links.len() * 12);
361    buf.extend_from_slice(b"LNK1");
362    buf.extend_from_slice(&(links.len() as u32).to_le_bytes());
363    for link in links {
364        buf.extend_from_slice(&link.block_a.to_le_bytes());
365        buf.extend_from_slice(&link.block_b.to_le_bytes());
366        buf.extend_from_slice(&link.similarity.to_le_bytes());
367    }
368    fs::write(&path, &buf).map_err(|e| format!("write links.bin: {}", e))
369}
370
371fn load_links(output_dir: &Path) -> Option<Vec<StructuralLink>> {
372    let path = output_dir.join("links.bin");
373    let data = fs::read(&path).ok()?;
374    if data.len() < 8 || &data[0..4] != b"LNK1" {
375        return None;
376    }
377    let count = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
378    let mut links = Vec::with_capacity(count);
379    let mut pos = 8;
380    for _ in 0..count {
381        if pos + 12 > data.len() {
382            break;
383        }
384        let block_a = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
385        let block_b = u32::from_le_bytes(data[pos + 4..pos + 8].try_into().unwrap());
386        let similarity = f32::from_le_bytes(data[pos + 8..pos + 12].try_into().unwrap());
387        links.push(StructuralLink {
388            block_a,
389            block_b,
390            similarity,
391        });
392        pos += 12;
393    }
394    Some(links)
395}
396
397#[cfg(test)]
398mod tests {
399    use super::*;
400
401    #[test]
402    fn test_entropy_uniform() {
403        let fp = compute_fingerprint(b"aaaaaaaaaa");
404        assert!(fp.entropy < 0.01); // single byte = 0 entropy
405    }
406
407    #[test]
408    fn test_entropy_varied() {
409        let data: Vec<u8> = (0..=255).collect();
410        let fp = compute_fingerprint(&data);
411        assert!(fp.entropy > 7.0); // max entropy ≈ 8.0
412    }
413
414    #[test]
415    fn test_fingerprint_deterministic() {
416        let a = compute_fingerprint(b"hello world, this is a test of the fingerprint system");
417        let b = compute_fingerprint(b"hello world, this is a test of the fingerprint system");
418        assert_eq!(a.hash, b.hash);
419        assert!((a.entropy - b.entropy).abs() < 0.001);
420        assert_eq!(a.histogram, b.histogram);
421    }
422
423    #[test]
424    fn test_similarity_identical() {
425        let fp = compute_fingerprint(b"hello world test");
426        let sim = fingerprint_similarity(&fp, &fp);
427        assert!(sim > 0.99);
428    }
429
430    #[test]
431    fn test_similarity_different() {
432        let a = compute_fingerprint(b"aaaaaaaaaaaaaaaa");
433        let data: Vec<u8> = (0..=255).collect();
434        let b = compute_fingerprint(&data);
435        let sim = fingerprint_similarity(&a, &b);
436        assert!(sim < 0.5); // very different
437    }
438
439    #[test]
440    fn test_similarity_similar_text() {
441        let a = compute_fingerprint(b"the quick brown fox jumps over the lazy dog");
442        let b = compute_fingerprint(b"the fast brown fox leaps over the tired dog");
443        let sim = fingerprint_similarity(&a, &b);
444        assert!(sim > 0.7); // similar letter distributions
445    }
446
447    #[test]
448    fn test_find_links() {
449        let texts = [
450            "hello world this is a test",
451            "hello world this is a test", // identical
452            "completely different binary data 12345!@#$%",
453            "hello world this is another test", // similar
454        ];
455        let fps: Vec<BlockFingerprint> = texts
456            .iter()
457            .map(|t| compute_fingerprint(t.as_bytes()))
458            .collect();
459        let links = find_links(&fps);
460
461        // At minimum, blocks 0 and 1 should be linked (identical)
462        assert!(links
463            .iter()
464            .any(|l| { (l.block_a == 0 && l.block_b == 1) || (l.block_a == 1 && l.block_b == 0) }));
465    }
466
467    #[test]
468    fn test_link_table_build() {
469        let texts = vec!["hello world", "hello world", "binary data xyz"];
470        let table = LinkTable::build(&texts);
471        assert_eq!(table.fingerprints.len(), 3);
472        // At least one link between the two identical texts
473        assert!(!table.links.is_empty());
474    }
475
476    #[test]
477    fn test_find_similar() {
478        let texts = vec![
479            "hello world testing one two three",
480            "hello world testing four five six",
481            "completely random binary noise !!!",
482        ];
483        let table = LinkTable::build(&texts);
484        let results = table.find_similar("hello world testing", 5);
485        assert!(!results.is_empty());
486    }
487
488    #[test]
489    fn test_save_load_roundtrip() {
490        let tmp = tempfile::tempdir().expect("create temp dir");
491        let dir = tmp.path();
492
493        let texts = vec!["hello world", "test data", "more content here"];
494        let table = LinkTable::build(&texts);
495        table.save(dir).expect("save");
496
497        let loaded = LinkTable::load(dir).expect("load");
498        assert_eq!(loaded.fingerprints.len(), 3);
499        assert!((loaded.fingerprints[0].entropy - table.fingerprints[0].entropy).abs() < 0.001);
500        assert_eq!(loaded.fingerprints[0].hash, table.fingerprints[0].hash);
501        assert_eq!(loaded.links.len(), table.links.len());
502    }
503
504    #[test]
505    fn test_linked_blocks() {
506        let texts = vec!["abc abc abc", "abc abc abc", "xyz xyz xyz"];
507        let table = LinkTable::build(&texts);
508        let linked = table.linked_blocks(0);
509        // Block 0 and 1 are identical, should be linked
510        assert!(linked.iter().any(|(idx, _)| *idx == 1));
511    }
512
513    #[test]
514    fn test_stats() {
515        let texts = vec!["hello", "world", "hello"];
516        let table = LinkTable::build(&texts);
517        let stats = table.stats();
518        assert_eq!(stats.block_count, 3);
519    }
520
521    #[test]
522    fn test_fnv1a_deterministic() {
523        assert_eq!(fnv1a_hash(b"hello"), fnv1a_hash(b"hello"));
524        assert_ne!(fnv1a_hash(b"hello"), fnv1a_hash(b"world"));
525    }
526}