1use std::collections::HashMap;
11use std::fs;
12use std::path::Path;
13
14const LINK_THRESHOLD: f32 = 0.85;
18const MAX_LINKS_PER_BLOCK: usize = 8;
20const HIST_BUCKETS: usize = 16;
22const FINGERPRINT_BYTES: usize = 28;
24
25#[derive(Clone, Debug)]
29pub struct BlockFingerprint {
30 pub entropy: f32,
32 pub histogram: [u8; HIST_BUCKETS],
34 pub hash: u64,
36}
37
38#[derive(Clone, Debug)]
40pub struct StructuralLink {
41 pub block_a: u32,
42 pub block_b: u32,
43 pub similarity: f32,
44}
45
46pub struct LinkTable {
48 pub fingerprints: Vec<BlockFingerprint>,
49 pub links: Vec<StructuralLink>,
50}
51
52impl LinkTable {
53 pub fn build(block_texts: &[&str]) -> Self {
55 let fingerprints: Vec<BlockFingerprint> = block_texts
56 .iter()
57 .map(|text| compute_fingerprint(text.as_bytes()))
58 .collect();
59
60 let links = find_links(&fingerprints);
61
62 LinkTable {
63 fingerprints,
64 links,
65 }
66 }
67
68 pub fn links_for(&self, block_idx: u32) -> Vec<&StructuralLink> {
70 self.links
71 .iter()
72 .filter(|l| l.block_a == block_idx || l.block_b == block_idx)
73 .collect()
74 }
75
76 pub fn linked_blocks(&self, block_idx: u32) -> Vec<(u32, f32)> {
78 self.links
79 .iter()
80 .filter_map(|l| {
81 if l.block_a == block_idx {
82 Some((l.block_b, l.similarity))
83 } else if l.block_b == block_idx {
84 Some((l.block_a, l.similarity))
85 } else {
86 None
87 }
88 })
89 .collect()
90 }
91
92 pub fn find_similar(&self, text: &str, k: usize) -> Vec<(u32, f32)> {
94 let query_fp = compute_fingerprint(text.as_bytes());
95 let mut results: Vec<(u32, f32)> = self
96 .fingerprints
97 .iter()
98 .enumerate()
99 .map(|(i, fp)| (i as u32, fingerprint_similarity(&query_fp, fp)))
100 .filter(|(_, sim)| *sim > 0.5)
101 .collect();
102
103 results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
104 results.truncate(k);
105 results
106 }
107
108 pub fn stats(&self) -> FingerprintStats {
110 let avg_entropy = if self.fingerprints.is_empty() {
111 0.0
112 } else {
113 self.fingerprints.iter().map(|f| f.entropy).sum::<f32>()
114 / self.fingerprints.len() as f32
115 };
116
117 let mut hash_counts: HashMap<u64, usize> = HashMap::new();
119 for fp in &self.fingerprints {
120 *hash_counts.entry(fp.hash).or_insert(0) += 1;
121 }
122 let unique_hashes = hash_counts.len();
123 let largest_cluster = hash_counts.values().max().copied().unwrap_or(0);
124
125 FingerprintStats {
126 block_count: self.fingerprints.len(),
127 link_count: self.links.len(),
128 avg_entropy,
129 unique_hashes,
130 largest_cluster,
131 }
132 }
133
134 pub fn save(&self, output_dir: &Path) -> Result<(), String> {
136 save_fingerprints(output_dir, &self.fingerprints)?;
137 save_links(output_dir, &self.links)?;
138 Ok(())
139 }
140
141 pub fn load(output_dir: &Path) -> Option<Self> {
143 let fingerprints = load_fingerprints(output_dir)?;
144 let links = load_links(output_dir).unwrap_or_default();
145 Some(LinkTable {
146 fingerprints,
147 links,
148 })
149 }
150}
151
152pub struct FingerprintStats {
153 pub block_count: usize,
154 pub link_count: usize,
155 pub avg_entropy: f32,
156 pub unique_hashes: usize,
157 pub largest_cluster: usize,
158}
159
160pub fn compute_fingerprint(data: &[u8]) -> BlockFingerprint {
164 let mut byte_counts = [0u32; 256];
166 for &b in data {
167 byte_counts[b as usize] += 1;
168 }
169 let len = data.len().max(1) as f32;
170
171 let entropy: f32 = byte_counts
172 .iter()
173 .filter(|&&c| c > 0)
174 .map(|&c| {
175 let p = c as f32 / len;
176 -p * p.log2()
177 })
178 .sum();
179
180 let mut histogram = [0u8; HIST_BUCKETS];
182 for (i, bucket) in histogram.iter_mut().enumerate() {
183 let start = i * 16;
184 let end = start + 16;
185 let sum: u32 = byte_counts[start..end].iter().sum();
186 *bucket = ((sum as f32 / len) * 255.0).min(255.0) as u8;
188 }
189
190 let hash = fnv1a_hash(&histogram);
192
193 BlockFingerprint {
194 entropy,
195 histogram,
196 hash,
197 }
198}
199
200pub fn fingerprint_similarity(a: &BlockFingerprint, b: &BlockFingerprint) -> f32 {
202 if a.hash != b.hash {
204 let mut dot = 0u32;
206 let mut norm_a = 0u32;
207 let mut norm_b = 0u32;
208 for i in 0..HIST_BUCKETS {
209 let va = a.histogram[i] as u32;
210 let vb = b.histogram[i] as u32;
211 dot += va * vb;
212 norm_a += va * va;
213 norm_b += vb * vb;
214 }
215
216 let denom = ((norm_a as f64).sqrt() * (norm_b as f64).sqrt()) as f32;
217 if denom < 1.0 {
218 return 0.0;
219 }
220 let cosine = dot as f32 / denom;
221
222 let entropy_diff = (a.entropy - b.entropy).abs();
224 let entropy_sim = 1.0 - (entropy_diff / 8.0).min(1.0);
225
226 cosine * 0.7 + entropy_sim * 0.3
227 } else {
228 let entropy_diff = (a.entropy - b.entropy).abs();
230 let entropy_sim = 1.0 - (entropy_diff / 8.0).min(1.0);
231 0.85 + entropy_sim * 0.15
232 }
233}
234
235fn find_links(fingerprints: &[BlockFingerprint]) -> Vec<StructuralLink> {
237 let mut links = Vec::new();
238 let n = fingerprints.len();
239
240 let mut hash_groups: HashMap<u64, Vec<usize>> = HashMap::new();
242 for (i, fp) in fingerprints.iter().enumerate() {
243 hash_groups.entry(fp.hash).or_default().push(i);
244 }
245
246 for group in hash_groups.values() {
248 if group.len() < 2 {
249 continue;
250 }
251 for i in 0..group.len() {
252 let mut block_links = 0;
253 for j in (i + 1)..group.len() {
254 if block_links >= MAX_LINKS_PER_BLOCK {
255 break;
256 }
257 let a = group[i];
258 let b = group[j];
259 let sim = fingerprint_similarity(&fingerprints[a], &fingerprints[b]);
260 if sim >= LINK_THRESHOLD {
261 links.push(StructuralLink {
262 block_a: a as u32,
263 block_b: b as u32,
264 similarity: sim,
265 });
266 block_links += 1;
267 }
268 }
269 }
270 }
271
272 if n < 5000 {
275 for i in 0..n {
276 let mut block_links = links
277 .iter()
278 .filter(|l| l.block_a == i as u32 || l.block_b == i as u32)
279 .count();
280 if block_links >= MAX_LINKS_PER_BLOCK {
281 continue;
282 }
283
284 for j in (i + 1)..n {
285 if fingerprints[i].hash == fingerprints[j].hash {
286 continue; }
288 let sim = fingerprint_similarity(&fingerprints[i], &fingerprints[j]);
289 if sim >= LINK_THRESHOLD {
290 links.push(StructuralLink {
291 block_a: i as u32,
292 block_b: j as u32,
293 similarity: sim,
294 });
295 block_links += 1;
296 if block_links >= MAX_LINKS_PER_BLOCK {
297 break;
298 }
299 }
300 }
301 }
302 }
303
304 links
305}
306
307fn fnv1a_hash(data: &[u8]) -> u64 {
308 let mut h: u64 = 0xcbf29ce484222325;
309 for &b in data {
310 h ^= b as u64;
311 h = h.wrapping_mul(0x100000001b3);
312 }
313 h
314}
315
316fn save_fingerprints(output_dir: &Path, fps: &[BlockFingerprint]) -> Result<(), String> {
319 let path = output_dir.join("fingerprints.idx");
320 let mut buf = Vec::with_capacity(8 + fps.len() * FINGERPRINT_BYTES);
321 buf.extend_from_slice(b"FGP1");
322 buf.extend_from_slice(&(fps.len() as u32).to_le_bytes());
323 for fp in fps {
324 buf.extend_from_slice(&fp.entropy.to_le_bytes());
325 buf.extend_from_slice(&fp.histogram);
326 buf.extend_from_slice(&fp.hash.to_le_bytes());
327 }
328 fs::write(&path, &buf).map_err(|e| format!("write fingerprints.idx: {}", e))
329}
330
331fn load_fingerprints(output_dir: &Path) -> Option<Vec<BlockFingerprint>> {
332 let path = output_dir.join("fingerprints.idx");
333 let data = fs::read(&path).ok()?;
334 if data.len() < 8 || &data[0..4] != b"FGP1" {
335 return None;
336 }
337 let count = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
338 let mut fps = Vec::with_capacity(count);
339 let mut pos = 8;
340 for _ in 0..count {
341 if pos + FINGERPRINT_BYTES > data.len() {
342 break;
343 }
344 let entropy = f32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
345 let mut histogram = [0u8; HIST_BUCKETS];
346 histogram.copy_from_slice(&data[pos + 4..pos + 4 + HIST_BUCKETS]);
347 let hash = u64::from_le_bytes(data[pos + 20..pos + 28].try_into().unwrap());
348 fps.push(BlockFingerprint {
349 entropy,
350 histogram,
351 hash,
352 });
353 pos += FINGERPRINT_BYTES;
354 }
355 Some(fps)
356}
357
358fn save_links(output_dir: &Path, links: &[StructuralLink]) -> Result<(), String> {
359 let path = output_dir.join("links.bin");
360 let mut buf = Vec::with_capacity(8 + links.len() * 12);
361 buf.extend_from_slice(b"LNK1");
362 buf.extend_from_slice(&(links.len() as u32).to_le_bytes());
363 for link in links {
364 buf.extend_from_slice(&link.block_a.to_le_bytes());
365 buf.extend_from_slice(&link.block_b.to_le_bytes());
366 buf.extend_from_slice(&link.similarity.to_le_bytes());
367 }
368 fs::write(&path, &buf).map_err(|e| format!("write links.bin: {}", e))
369}
370
371fn load_links(output_dir: &Path) -> Option<Vec<StructuralLink>> {
372 let path = output_dir.join("links.bin");
373 let data = fs::read(&path).ok()?;
374 if data.len() < 8 || &data[0..4] != b"LNK1" {
375 return None;
376 }
377 let count = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
378 let mut links = Vec::with_capacity(count);
379 let mut pos = 8;
380 for _ in 0..count {
381 if pos + 12 > data.len() {
382 break;
383 }
384 let block_a = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
385 let block_b = u32::from_le_bytes(data[pos + 4..pos + 8].try_into().unwrap());
386 let similarity = f32::from_le_bytes(data[pos + 8..pos + 12].try_into().unwrap());
387 links.push(StructuralLink {
388 block_a,
389 block_b,
390 similarity,
391 });
392 pos += 12;
393 }
394 Some(links)
395}
396
397#[cfg(test)]
398mod tests {
399 use super::*;
400
401 #[test]
402 fn test_entropy_uniform() {
403 let fp = compute_fingerprint(b"aaaaaaaaaa");
404 assert!(fp.entropy < 0.01); }
406
407 #[test]
408 fn test_entropy_varied() {
409 let data: Vec<u8> = (0..=255).collect();
410 let fp = compute_fingerprint(&data);
411 assert!(fp.entropy > 7.0); }
413
414 #[test]
415 fn test_fingerprint_deterministic() {
416 let a = compute_fingerprint(b"hello world, this is a test of the fingerprint system");
417 let b = compute_fingerprint(b"hello world, this is a test of the fingerprint system");
418 assert_eq!(a.hash, b.hash);
419 assert!((a.entropy - b.entropy).abs() < 0.001);
420 assert_eq!(a.histogram, b.histogram);
421 }
422
423 #[test]
424 fn test_similarity_identical() {
425 let fp = compute_fingerprint(b"hello world test");
426 let sim = fingerprint_similarity(&fp, &fp);
427 assert!(sim > 0.99);
428 }
429
430 #[test]
431 fn test_similarity_different() {
432 let a = compute_fingerprint(b"aaaaaaaaaaaaaaaa");
433 let data: Vec<u8> = (0..=255).collect();
434 let b = compute_fingerprint(&data);
435 let sim = fingerprint_similarity(&a, &b);
436 assert!(sim < 0.5); }
438
439 #[test]
440 fn test_similarity_similar_text() {
441 let a = compute_fingerprint(b"the quick brown fox jumps over the lazy dog");
442 let b = compute_fingerprint(b"the fast brown fox leaps over the tired dog");
443 let sim = fingerprint_similarity(&a, &b);
444 assert!(sim > 0.7); }
446
447 #[test]
448 fn test_find_links() {
449 let texts = [
450 "hello world this is a test",
451 "hello world this is a test", "completely different binary data 12345!@#$%",
453 "hello world this is another test", ];
455 let fps: Vec<BlockFingerprint> = texts
456 .iter()
457 .map(|t| compute_fingerprint(t.as_bytes()))
458 .collect();
459 let links = find_links(&fps);
460
461 assert!(links
463 .iter()
464 .any(|l| { (l.block_a == 0 && l.block_b == 1) || (l.block_a == 1 && l.block_b == 0) }));
465 }
466
467 #[test]
468 fn test_link_table_build() {
469 let texts = vec!["hello world", "hello world", "binary data xyz"];
470 let table = LinkTable::build(&texts);
471 assert_eq!(table.fingerprints.len(), 3);
472 assert!(!table.links.is_empty());
474 }
475
476 #[test]
477 fn test_find_similar() {
478 let texts = vec![
479 "hello world testing one two three",
480 "hello world testing four five six",
481 "completely random binary noise !!!",
482 ];
483 let table = LinkTable::build(&texts);
484 let results = table.find_similar("hello world testing", 5);
485 assert!(!results.is_empty());
486 }
487
488 #[test]
489 fn test_save_load_roundtrip() {
490 let tmp = tempfile::tempdir().expect("create temp dir");
491 let dir = tmp.path();
492
493 let texts = vec!["hello world", "test data", "more content here"];
494 let table = LinkTable::build(&texts);
495 table.save(dir).expect("save");
496
497 let loaded = LinkTable::load(dir).expect("load");
498 assert_eq!(loaded.fingerprints.len(), 3);
499 assert!((loaded.fingerprints[0].entropy - table.fingerprints[0].entropy).abs() < 0.001);
500 assert_eq!(loaded.fingerprints[0].hash, table.fingerprints[0].hash);
501 assert_eq!(loaded.links.len(), table.links.len());
502 }
503
504 #[test]
505 fn test_linked_blocks() {
506 let texts = vec!["abc abc abc", "abc abc abc", "xyz xyz xyz"];
507 let table = LinkTable::build(&texts);
508 let linked = table.linked_blocks(0);
509 assert!(linked.iter().any(|(idx, _)| *idx == 1));
511 }
512
513 #[test]
514 fn test_stats() {
515 let texts = vec!["hello", "world", "hello"];
516 let table = LinkTable::build(&texts);
517 let stats = table.stats();
518 assert_eq!(stats.block_count, 3);
519 }
520
521 #[test]
522 fn test_fnv1a_deterministic() {
523 assert_eq!(fnv1a_hash(b"hello"), fnv1a_hash(b"hello"));
524 assert_ne!(fnv1a_hash(b"hello"), fnv1a_hash(b"world"));
525 }
526}