embeddenator_fs/fs/embrfs.rs
1//! EmbrFS - Holographic Filesystem Implementation
2//!
3//! Provides engram-based storage for entire filesystem trees with:
4//! - Chunked encoding for efficient storage
5//! - Manifest for file metadata
6//! - **Guaranteed 100% bit-perfect reconstruction** via CorrectionStore
7//!
8//! # Reconstruction Guarantee
9//!
10//! The fundamental challenge with VSA encoding is that approximate operations
11//! may introduce errors during superposition. This module solves that through
12//! a multi-layer approach:
13//!
14//! 1. **Primary Encoding**: SparseVec encoding attempts bit-perfect storage
15//! 2. **Correction Layer**: CorrectionStore captures any encoding errors
16//! 3. **Reconstruction**: Decode + apply corrections = exact original
17//!
18//! The invariant: `original = decode(encode(original)) + correction`
19//!
20//! If encoding was perfect, correction is empty. If not, correction exactly
21//! compensates. Either way, reconstruction is guaranteed bit-perfect.
22
23use crate::correction::{CorrectionStats, CorrectionStore};
24use embeddenator_retrieval::resonator::Resonator;
25use embeddenator_retrieval::{RerankedResult, TernaryInvertedIndex};
26use embeddenator_vsa::{ReversibleVSAConfig, SparseVec, DIM};
27use serde::{Deserialize, Serialize};
28use std::collections::BTreeMap;
29use std::collections::{HashMap, HashSet};
30use std::fs::{self, File};
31use std::io::{self, Read};
32use std::path::{Path, PathBuf};
33use walkdir::WalkDir;
34
/// Default chunk size for file encoding (4KB).
///
/// Files are split into chunks of this many bytes before VSA encoding;
/// the final chunk of a file may be shorter.
pub const DEFAULT_CHUNK_SIZE: usize = 4096;
37
/// File entry in the manifest
#[derive(Serialize, Deserialize, Debug)]
pub struct FileEntry {
    /// Logical path inside the engram (forward-slash separated).
    pub path: String,
    /// Heuristic text/binary classification recorded at ingest time.
    pub is_text: bool,
    /// Original file size in bytes.
    pub size: usize,
    /// Global chunk IDs (codebook keys) holding this file's data, in order.
    pub chunks: Vec<usize>,
    /// Mark files as deleted without rebuilding root (for incremental updates)
    #[serde(default)]
    pub deleted: bool,
}
49
/// Manifest describing filesystem structure
#[derive(Serialize, Deserialize, Debug)]
pub struct Manifest {
    /// All ingested files, including entries marked `deleted`.
    pub files: Vec<FileEntry>,
    /// Total chunks allocated so far; also serves as the next chunk ID.
    pub total_chunks: usize,
}
56
/// Hierarchical manifest for multi-level engrams
#[derive(Serialize, Deserialize, Debug)]
pub struct HierarchicalManifest {
    /// Manifest format version.
    pub version: u32,
    /// Levels of the hierarchy; queries seed their traversal from the first.
    pub levels: Vec<ManifestLevel>,
    /// Sub-engrams keyed by ID. May be empty when sub-engrams are loaded
    /// on demand through a `SubEngramStore` instead of held in memory.
    #[serde(default)]
    pub sub_engrams: HashMap<String, SubEngram>,
}
65
/// Level in hierarchical manifest
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestLevel {
    /// Numeric depth of this level (used for sorting on save).
    pub level: u32,
    /// Entries mapping logical paths to sub-engram IDs at this level.
    pub items: Vec<ManifestItem>,
}
72
/// Item in manifest level
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestItem {
    /// Logical path this item represents.
    pub path: String,
    /// ID of the sub-engram holding this item's data.
    pub sub_engram_id: String,
}
79
/// Sub-engram in hierarchical structure
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct SubEngram {
    /// Unique identifier; also used (escaped) as the on-disk filename stem.
    pub id: String,
    /// Bundled root vector summarizing this sub-engram's contents; queries
    /// score nodes by cosine similarity against it.
    pub root: SparseVec,
    /// Chunk IDs that belong to this sub-engram.
    ///
    /// This enables selective retrieval without indexing the entire global codebook.
    #[serde(default)]
    pub chunk_ids: Vec<usize>,
    // NOTE(review): presumably `chunk_ids.len()` — confirm whether descendant
    // chunks are counted as well.
    pub chunk_count: usize,
    /// IDs of child sub-engrams one level deeper in the hierarchy.
    pub children: Vec<String>,
}
93
/// Bounds and tuning parameters for hierarchical selective retrieval.
///
/// Consumed by `query_hierarchical_codebook_with_store`, which applies these
/// limits during its beam-limited traversal.
#[derive(Clone, Debug)]
pub struct HierarchicalQueryBounds {
    /// Global top-k results to return.
    pub k: usize,
    /// Candidate count per expanded node before reranking.
    pub candidate_k: usize,
    /// Maximum number of frontier nodes retained (beam width).
    pub beam_width: usize,
    /// Maximum depth to descend (0 means only level-0 nodes).
    pub max_depth: usize,
    /// Maximum number of expanded nodes.
    pub max_expansions: usize,
    /// Maximum number of cached inverted indices.
    pub max_open_indices: usize,
    /// Maximum number of cached sub-engrams.
    pub max_open_engrams: usize,
}
112
impl Default for HierarchicalQueryBounds {
    /// Conservative defaults suitable as a starting point; tune per workload.
    fn default() -> Self {
        Self {
            k: 10,
            candidate_k: 100,
            beam_width: 32,
            max_depth: 4,
            max_expansions: 128,
            max_open_indices: 16,
            max_open_engrams: 16,
        }
    }
}
126
/// A single chunk match returned by hierarchical queries.
#[derive(Clone, Debug, PartialEq)]
pub struct HierarchicalChunkHit {
    /// ID of the sub-engram whose local index produced this hit.
    pub sub_engram_id: String,
    /// Global chunk ID (codebook key).
    pub chunk_id: usize,
    /// Approximate score from the inverted-index candidate stage.
    pub approx_score: i32,
    /// Exact cosine similarity used for final ranking.
    pub cosine: f64,
}
134
/// Beam-search frontier entry: a sub-engram pending expansion.
#[derive(Clone, Debug)]
struct FrontierItem {
    /// Cosine similarity between the query and this node's root vector.
    score: f64,
    /// ID of the sub-engram to expand.
    sub_engram_id: String,
    /// Traversal depth (0 = level-0 node).
    depth: usize,
}
141
/// Inverted index over a node-local subset of chunks, with a remap table
/// from dense local IDs back to global codebook chunk IDs.
#[derive(Clone, Debug)]
struct RemappedInvertedIndex {
    /// Index keyed by dense local IDs.
    index: TernaryInvertedIndex,
    /// `local_to_global[local_id]` is the matching global chunk ID.
    local_to_global: Vec<usize>,
}
147
148impl RemappedInvertedIndex {
149 fn build(chunk_ids: &[usize], vectors: &HashMap<usize, SparseVec>) -> Self {
150 let mut index = TernaryInvertedIndex::new();
151 let mut local_to_global = Vec::with_capacity(chunk_ids.len());
152
153 for (local_id, &global_id) in chunk_ids.iter().enumerate() {
154 let Some(vec) = vectors.get(&global_id) else {
155 continue;
156 };
157 local_to_global.push(global_id);
158 index.add(local_id, vec);
159 }
160
161 index.finalize();
162 Self {
163 index,
164 local_to_global,
165 }
166 }
167
168 fn query_top_k_reranked(
169 &self,
170 query: &SparseVec,
171 vectors: &HashMap<usize, SparseVec>,
172 candidate_k: usize,
173 k: usize,
174 ) -> Vec<HierarchicalChunkHit> {
175 if k == 0 {
176 return Vec::new();
177 }
178
179 let candidates = self.index.query_top_k(query, candidate_k);
180 let mut out = Vec::with_capacity(candidates.len().min(k));
181 for cand in candidates {
182 let Some(&global_id) = self.local_to_global.get(cand.id) else {
183 continue;
184 };
185 let Some(vec) = vectors.get(&global_id) else {
186 continue;
187 };
188 out.push((global_id, cand.score, query.cosine(vec)));
189 }
190
191 out.sort_by(|a, b| {
192 b.2.total_cmp(&a.2)
193 .then_with(|| b.1.cmp(&a.1))
194 .then_with(|| a.0.cmp(&b.0))
195 });
196 out.truncate(k);
197
198 out.into_iter()
199 .map(|(chunk_id, approx_score, cosine)| HierarchicalChunkHit {
200 sub_engram_id: String::new(),
201 chunk_id,
202 approx_score,
203 cosine,
204 })
205 .collect()
206 }
207}
208
/// Tiny insertion-order-tracked LRU keyed by `String`.
///
/// Bookkeeping is linear-scan; intended for the small capacities used by the
/// hierarchical query caches. A capacity of zero disables caching entirely.
#[derive(Clone, Debug)]
struct LruCache<V> {
    cap: usize,
    map: HashMap<String, V>,
    order: Vec<String>,
}

impl<V> LruCache<V> {
    /// Create a cache holding at most `cap` entries.
    fn new(cap: usize) -> Self {
        Self {
            cap,
            map: HashMap::new(),
            order: Vec::new(),
        }
    }

    /// Look up `key`, marking it most-recently-used on a hit.
    fn get(&mut self, key: &str) -> Option<&V> {
        if !self.map.contains_key(key) {
            return None;
        }
        self.touch(key);
        self.map.get(key)
    }

    /// Insert or replace `key`, evicting least-recently-used entries if full.
    fn insert(&mut self, key: String, value: V) {
        if self.cap == 0 {
            return;
        }

        let replaced = self.map.insert(key.clone(), value).is_some();
        if replaced {
            self.touch(&key);
            return;
        }

        self.order.push(key);
        while self.map.len() > self.cap {
            match self.order.first().cloned() {
                Some(oldest) => {
                    self.order.remove(0);
                    self.map.remove(&oldest);
                }
                None => break,
            }
        }
    }

    /// Move `key` to the most-recently-used position, if present.
    fn touch(&mut self, key: &str) {
        let Some(pos) = self.order.iter().position(|entry| entry == key) else {
            return;
        };
        let entry = self.order.remove(pos);
        self.order.push(entry);
    }
}
264
/// Storage/loader seam for hierarchical sub-engrams.
///
/// This enables on-demand loading (e.g., from disk) rather than requiring that
/// every sub-engram is materialized in memory.
pub trait SubEngramStore {
    /// Return the sub-engram with `id`, or `None` if it cannot be provided.
    fn load(&self, id: &str) -> Option<SubEngram>;
}
272
/// Escape a sub-engram ID for safe use as a filename component.
///
/// Minimal reversible escaping: `%` becomes `%25` and `/` becomes `%2F`,
/// so escaped output never collides with a literal ID.
/// Note: not intended for untrusted input; IDs are internal.
fn escape_sub_engram_id(id: &str) -> String {
    let mut escaped = String::with_capacity(id.len());
    for ch in id.chars() {
        match ch {
            '%' => escaped.push_str("%25"),
            '/' => escaped.push_str("%2F"),
            other => escaped.push(other),
        }
    }
    escaped
}
278
/// Directory-backed store for sub-engrams.
///
/// Files are stored as bincode blobs under `${dir}/{escaped_id}.subengram`.
pub struct DirectorySubEngramStore {
    /// Root directory containing the `.subengram` files.
    dir: PathBuf,
}
285
286impl DirectorySubEngramStore {
287 pub fn new<P: AsRef<Path>>(dir: P) -> Self {
288 Self {
289 dir: dir.as_ref().to_path_buf(),
290 }
291 }
292
293 fn path_for_id(&self, id: &str) -> PathBuf {
294 self.dir
295 .join(format!("{}.subengram", escape_sub_engram_id(id)))
296 }
297}
298
299impl SubEngramStore for DirectorySubEngramStore {
300 fn load(&self, id: &str) -> Option<SubEngram> {
301 let path = self.path_for_id(id);
302 let data = fs::read(path).ok()?;
303 bincode::deserialize(&data).ok()
304 }
305}
306
307/// Save a hierarchical manifest as JSON.
308pub fn save_hierarchical_manifest<P: AsRef<Path>>(
309 hierarchical: &HierarchicalManifest,
310 path: P,
311) -> io::Result<()> {
312 let file = File::create(path)?;
313
314 // Serialize deterministically: HashMap iteration order is not stable.
315 #[derive(Serialize)]
316 struct StableHierarchicalManifest {
317 version: u32,
318 levels: Vec<ManifestLevel>,
319 sub_engrams: BTreeMap<String, SubEngram>,
320 }
321
322 let mut levels = hierarchical.levels.clone();
323 levels.sort_by(|a, b| a.level.cmp(&b.level));
324 for level in &mut levels {
325 level.items.sort_by(|a, b| {
326 a.path
327 .cmp(&b.path)
328 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
329 });
330 }
331
332 let mut sub_engrams: BTreeMap<String, SubEngram> = BTreeMap::new();
333 for (id, sub) in &hierarchical.sub_engrams {
334 sub_engrams.insert(id.clone(), sub.clone());
335 }
336
337 let stable = StableHierarchicalManifest {
338 version: hierarchical.version,
339 levels,
340 sub_engrams,
341 };
342
343 serde_json::to_writer_pretty(file, &stable)?;
344 Ok(())
345}
346
347/// Load a hierarchical manifest from JSON.
348pub fn load_hierarchical_manifest<P: AsRef<Path>>(path: P) -> io::Result<HierarchicalManifest> {
349 let file = File::open(path)?;
350 let manifest = serde_json::from_reader(file)?;
351 Ok(manifest)
352}
353
354/// Save a set of sub-engrams to a directory (bincode per sub-engram).
355pub fn save_sub_engrams_dir<P: AsRef<Path>>(
356 sub_engrams: &HashMap<String, SubEngram>,
357 dir: P,
358) -> io::Result<()> {
359 let dir = dir.as_ref();
360 fs::create_dir_all(dir)?;
361
362 let mut ids: Vec<&String> = sub_engrams.keys().collect();
363 ids.sort();
364
365 for id in ids {
366 // SAFETY: id comes from keys(), so get() must succeed
367 let sub = sub_engrams
368 .get(id)
369 .expect("sub_engram id from keys() must exist in HashMap");
370 let encoded = bincode::serialize(sub).map_err(io::Error::other)?;
371 let path = dir.join(format!("{}.subengram", escape_sub_engram_id(id)));
372 fs::write(path, encoded)?;
373 }
374 Ok(())
375}
376
/// Adapter exposing an in-memory sub-engram map through the
/// `SubEngramStore` trait; used by `query_hierarchical_codebook`.
struct InMemorySubEngramStore<'a> {
    map: &'a HashMap<String, SubEngram>,
}

impl<'a> InMemorySubEngramStore<'a> {
    /// Wrap a borrowed map of sub-engrams.
    fn new(map: &'a HashMap<String, SubEngram>) -> Self {
        Self { map }
    }
}

impl SubEngramStore for InMemorySubEngramStore<'_> {
    fn load(&self, id: &str) -> Option<SubEngram> {
        // Clone to satisfy the trait's owned return type.
        self.map.get(id).cloned()
    }
}
392
393fn get_cached_sub_engram(
394 cache: &mut LruCache<SubEngram>,
395 store: &impl SubEngramStore,
396 id: &str,
397) -> Option<SubEngram> {
398 if let Some(v) = cache.get(id) {
399 return Some(v.clone());
400 }
401 let loaded = store.load(id)?;
402 cache.insert(id.to_string(), loaded.clone());
403 Some(loaded)
404}
405
406/// Query a hierarchical manifest by selectively unfolding only promising sub-engrams.
407///
408/// This performs a beam-limited traversal over `hierarchical.sub_engrams`.
409/// At each expanded node, it builds (and LRU-caches) an inverted index over the
410/// node-local `chunk_ids` subset of `codebook`, then reranks by exact cosine.
411pub fn query_hierarchical_codebook(
412 hierarchical: &HierarchicalManifest,
413 codebook: &HashMap<usize, SparseVec>,
414 query: &SparseVec,
415 bounds: &HierarchicalQueryBounds,
416) -> Vec<HierarchicalChunkHit> {
417 let store = InMemorySubEngramStore::new(&hierarchical.sub_engrams);
418 query_hierarchical_codebook_with_store(hierarchical, &store, codebook, query, bounds)
419}
420
421/// Store-backed variant of `query_hierarchical_codebook` that supports on-demand sub-engram loading.
422pub fn query_hierarchical_codebook_with_store(
423 hierarchical: &HierarchicalManifest,
424 store: &impl SubEngramStore,
425 codebook: &HashMap<usize, SparseVec>,
426 query: &SparseVec,
427 bounds: &HierarchicalQueryBounds,
428) -> Vec<HierarchicalChunkHit> {
429 if bounds.k == 0 || hierarchical.levels.is_empty() {
430 return Vec::new();
431 }
432
433 let mut sub_cache: LruCache<SubEngram> = LruCache::new(bounds.max_open_engrams);
434 let mut index_cache: LruCache<RemappedInvertedIndex> = LruCache::new(bounds.max_open_indices);
435
436 let mut frontier: Vec<FrontierItem> = Vec::new();
437 if let Some(level0) = hierarchical.levels.first() {
438 for item in &level0.items {
439 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &item.sub_engram_id)
440 else {
441 continue;
442 };
443 frontier.push(FrontierItem {
444 score: query.cosine(&sub.root),
445 sub_engram_id: item.sub_engram_id.clone(),
446 depth: 0,
447 });
448 }
449 }
450
451 frontier.sort_by(|a, b| {
452 b.score
453 .total_cmp(&a.score)
454 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
455 });
456 if frontier.len() > bounds.beam_width {
457 frontier.truncate(bounds.beam_width);
458 }
459
460 let mut expansions = 0usize;
461
462 // Keep only the best hit per chunk for determinism.
463 let mut best_by_chunk: HashMap<usize, HierarchicalChunkHit> = HashMap::new();
464
465 while !frontier.is_empty() && expansions < bounds.max_expansions {
466 let node = frontier.remove(0);
467
468 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &node.sub_engram_id) else {
469 continue;
470 };
471
472 expansions += 1;
473
474 let idx = if let Some(existing) = index_cache.get(&node.sub_engram_id) {
475 existing
476 } else {
477 let built = RemappedInvertedIndex::build(&sub.chunk_ids, codebook);
478 index_cache.insert(node.sub_engram_id.clone(), built);
479 // SAFETY: we just inserted the key, so get() must succeed immediately after
480 index_cache
481 .get(&node.sub_engram_id)
482 .expect("index_cache.get() must succeed immediately after insert()")
483 };
484
485 let mut local_hits =
486 idx.query_top_k_reranked(query, codebook, bounds.candidate_k, bounds.k);
487 for hit in &mut local_hits {
488 hit.sub_engram_id = node.sub_engram_id.clone();
489 }
490
491 for hit in local_hits {
492 match best_by_chunk.get(&hit.chunk_id) {
493 None => {
494 best_by_chunk.insert(hit.chunk_id, hit);
495 }
496 Some(existing) => {
497 let better = hit
498 .cosine
499 .total_cmp(&existing.cosine)
500 .then_with(|| hit.approx_score.cmp(&existing.approx_score))
501 .is_gt();
502 if better {
503 best_by_chunk.insert(hit.chunk_id, hit);
504 }
505 }
506 }
507 }
508
509 if node.depth >= bounds.max_depth {
510 continue;
511 }
512
513 let children = sub.children.clone();
514 for child_id in &children {
515 let Some(child) = get_cached_sub_engram(&mut sub_cache, store, child_id) else {
516 continue;
517 };
518 frontier.push(FrontierItem {
519 score: query.cosine(&child.root),
520 sub_engram_id: child_id.clone(),
521 depth: node.depth + 1,
522 });
523 }
524
525 frontier.sort_by(|a, b| {
526 b.score
527 .total_cmp(&a.score)
528 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
529 });
530 if frontier.len() > bounds.beam_width {
531 frontier.truncate(bounds.beam_width);
532 }
533 }
534
535 let mut out: Vec<HierarchicalChunkHit> = best_by_chunk.into_values().collect();
536 out.sort_by(|a, b| {
537 b.cosine
538 .total_cmp(&a.cosine)
539 .then_with(|| b.approx_score.cmp(&a.approx_score))
540 .then_with(|| a.chunk_id.cmp(&b.chunk_id))
541 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
542 });
543 out.truncate(bounds.k);
544 out
545}
546
/// Unified manifest enum for backward compatibility
#[derive(Serialize, Deserialize, Debug)]
pub enum UnifiedManifest {
    /// Single-level manifest (original flat format).
    Flat(Manifest),
    /// Multi-level manifest with sub-engrams.
    Hierarchical(HierarchicalManifest),
}
553
impl From<Manifest> for UnifiedManifest {
    /// Wrap a flat manifest in the unified enum.
    fn from(manifest: Manifest) -> Self {
        UnifiedManifest::Flat(manifest)
    }
}
559
/// Engram: holographic encoding of a filesystem with correction guarantee
#[derive(Serialize, Deserialize)]
pub struct Engram {
    /// Superposition (bundle) of every ingested chunk vector.
    pub root: SparseVec,
    /// Per-chunk sparse vectors, keyed by global chunk ID.
    pub codebook: HashMap<usize, SparseVec>,
    /// Correction store for 100% reconstruction guarantee
    #[serde(default)]
    pub corrections: CorrectionStore,
}
569
570impl Engram {
571 /// Build a reusable inverted index over the codebook.
572 ///
573 /// This is useful when issuing multiple queries (e.g., shift-sweeps) and you
574 /// want to avoid rebuilding the index each time.
575 pub fn build_codebook_index(&self) -> TernaryInvertedIndex {
576 TernaryInvertedIndex::build_from_map(&self.codebook)
577 }
578
579 /// Query the codebook using a pre-built inverted index.
580 pub fn query_codebook_with_index(
581 &self,
582 index: &TernaryInvertedIndex,
583 query: &SparseVec,
584 candidate_k: usize,
585 k: usize,
586 ) -> Vec<RerankedResult> {
587 if k == 0 || self.codebook.is_empty() {
588 return Vec::new();
589 }
590 index.query_top_k_reranked(query, &self.codebook, candidate_k, k)
591 }
592
593 /// Query the engram's codebook for chunks most similar to `query`.
594 ///
595 /// This builds an inverted index over the codebook for sub-linear candidate
596 /// generation, then reranks those candidates using exact cosine similarity.
597 pub fn query_codebook(&self, query: &SparseVec, k: usize) -> Vec<RerankedResult> {
598 if k == 0 || self.codebook.is_empty() {
599 return Vec::new();
600 }
601
602 // Simple heuristic: rerank a moderately-sized candidate set.
603 let candidate_k = (k.saturating_mul(10)).max(50);
604 let index = self.build_codebook_index();
605 self.query_codebook_with_index(&index, query, candidate_k, k)
606 }
607}
608
609/// EmbrFS - Holographic Filesystem with Guaranteed Reconstruction
610///
611/// # 100% Reconstruction Guarantee
612///
613/// EmbrFS guarantees bit-perfect file reconstruction through a layered approach:
614///
615/// 1. **Encode**: Data chunks → SparseVec via reversible encoding
616/// 2. **Verify**: Immediately decode and compare to original
617/// 3. **Correct**: Store minimal correction if any difference exists
618/// 4. **Extract**: Decode + apply correction = exact original bytes
619///
620/// This guarantee holds regardless of:
621/// - Data content (binary, text, compressed, encrypted)
622/// - File size (single byte to gigabytes)
623/// - Number of files in the engram
624/// - Superposition crosstalk in bundles
625///
626/// # Examples
627///
628/// ```
629/// use embeddenator_fs::EmbrFS;
630/// use std::path::Path;
631///
632/// let mut fs = EmbrFS::new();
633/// // Ingest and extract would require actual files, so we just test creation
634/// assert_eq!(fs.manifest.total_chunks, 0);
635/// assert_eq!(fs.manifest.files.len(), 0);
636/// ```
pub struct EmbrFS {
    /// File metadata and chunk layout for everything ingested.
    pub manifest: Manifest,
    /// Holographic storage: root bundle, chunk codebook, and corrections.
    pub engram: Engram,
    /// Optional resonator for pattern recovery during extraction.
    pub resonator: Option<Resonator>,
}
642
impl Default for EmbrFS {
    /// Delegates to [`EmbrFS::new`].
    fn default() -> Self {
        Self::new()
    }
}
648
649impl EmbrFS {
650 /// Create a new empty EmbrFS instance
651 ///
652 /// # Examples
653 ///
654 /// ```
655 /// use embeddenator_fs::EmbrFS;
656 ///
657 /// let fs = EmbrFS::new();
658 /// assert_eq!(fs.manifest.files.len(), 0);
659 /// assert_eq!(fs.manifest.total_chunks, 0);
660 /// // Correction store starts empty
661 /// let stats = fs.engram.corrections.stats();
662 /// assert_eq!(stats.total_chunks, 0);
663 /// ```
664 pub fn new() -> Self {
665 EmbrFS {
666 manifest: Manifest {
667 files: Vec::new(),
668 total_chunks: 0,
669 },
670 engram: Engram {
671 root: SparseVec::new(),
672 codebook: HashMap::new(),
673 corrections: CorrectionStore::new(),
674 },
675 resonator: None,
676 }
677 }
678
679 fn path_to_forward_slash_string(path: &Path) -> String {
680 path.components()
681 .filter_map(|c| match c {
682 std::path::Component::Normal(s) => s.to_str().map(|v| v.to_string()),
683 _ => None,
684 })
685 .collect::<Vec<String>>()
686 .join("/")
687 }
688
689 /// Set the resonator for enhanced pattern recovery during extraction
690 ///
691 /// Configures a resonator network that can perform pattern completion to recover
692 /// missing or corrupted data chunks during filesystem extraction. The resonator
693 /// acts as a content-addressable memory that can reconstruct lost information
694 /// by finding the best matching patterns in its trained codebook.
695 ///
696 /// # How it works
697 /// - The resonator maintains a codebook of known vector patterns
698 /// - During extraction, missing chunks are projected onto the closest known pattern
699 /// - This enables robust recovery from partial data loss or corruption
700 ///
701 /// # Why this matters
702 /// - Provides fault tolerance for holographic storage systems
703 /// - Enables reconstruction even when some chunks are unavailable
704 /// - Supports graceful degradation rather than complete failure
705 ///
706 /// # Arguments
707 /// * `resonator` - A trained resonator network for pattern completion
708 ///
709 /// # Examples
710 /// ```
711 /// use embeddenator_fs::{EmbrFS, Resonator};
712 ///
713 /// let mut fs = EmbrFS::new();
714 /// let resonator = Resonator::new();
715 /// fs.set_resonator(resonator);
716 /// // Now extraction will use resonator-enhanced recovery
717 /// ```
    pub fn set_resonator(&mut self, resonator: Resonator) {
        // Replace any previously configured resonator.
        self.resonator = Some(resonator);
    }
721
722 /// Get correction statistics for this engram
723 ///
724 /// Returns statistics about how many chunks needed correction and the
725 /// overhead incurred by storing corrections.
726 ///
727 /// # Examples
728 /// ```
729 /// use embeddenator_fs::EmbrFS;
730 ///
731 /// let fs = EmbrFS::new();
732 /// let stats = fs.correction_stats();
733 /// assert_eq!(stats.total_chunks, 0);
734 /// ```
    pub fn correction_stats(&self) -> CorrectionStats {
        // Delegate to the correction store's own aggregation.
        self.engram.corrections.stats()
    }
738
    /// Ingest an entire directory into engram format
    ///
    /// Equivalent to `ingest_directory_with_prefix` with no logical prefix:
    /// files are stored under their paths relative to `dir`.
    ///
    /// # Arguments
    /// * `dir` - Directory to walk recursively
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    pub fn ingest_directory<P: AsRef<Path>>(
        &mut self,
        dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        self.ingest_directory_with_prefix(dir, None, verbose, config)
    }
748
749 /// Ingest a directory into the engram, optionally prefixing all logical paths.
750 ///
751 /// When `logical_prefix` is provided, all ingested file paths become:
752 /// `{logical_prefix}/{relative_path_from_dir}`.
753 pub fn ingest_directory_with_prefix<P: AsRef<Path>>(
754 &mut self,
755 dir: P,
756 logical_prefix: Option<&str>,
757 verbose: bool,
758 config: &ReversibleVSAConfig,
759 ) -> io::Result<()> {
760 let dir = dir.as_ref();
761 if verbose {
762 println!("Ingesting directory: {}", dir.display());
763 }
764
765 let mut files_to_process = Vec::new();
766 for entry in WalkDir::new(dir).follow_links(false) {
767 let entry = entry?;
768 if entry.file_type().is_file() {
769 files_to_process.push(entry.path().to_path_buf());
770 }
771 }
772 files_to_process.sort();
773
774 for file_path in files_to_process {
775 let relative = file_path.strip_prefix(dir).unwrap_or(file_path.as_path());
776 let rel = Self::path_to_forward_slash_string(relative);
777 let logical_path = if let Some(prefix) = logical_prefix {
778 if prefix.is_empty() {
779 rel
780 } else if rel.is_empty() {
781 prefix.to_string()
782 } else {
783 format!("{}/{}", prefix, rel)
784 }
785 } else {
786 rel
787 };
788
789 self.ingest_file(&file_path, logical_path, verbose, config)?;
790 }
791
792 Ok(())
793 }
794
795 /// Ingest a single file into the engram with guaranteed reconstruction
796 ///
797 /// This method encodes file data into sparse vectors and stores any
798 /// necessary corrections to guarantee 100% bit-perfect reconstruction.
799 ///
800 /// # Correction Process
801 ///
802 /// For each chunk:
803 /// 1. Encode: `chunk_data → SparseVec`
804 /// 2. Decode: `SparseVec → decoded_data`
805 /// 3. Compare: `chunk_data == decoded_data?`
806 /// 4. If different: store correction in `CorrectionStore`
807 ///
808 /// # Arguments
809 /// * `file_path` - Path to the file on disk
810 /// * `logical_path` - Path to use in the engram manifest
811 /// * `verbose` - Print progress information
812 /// * `config` - VSA encoding configuration
813 ///
814 /// # Returns
815 /// `io::Result<()>` indicating success or failure
    pub fn ingest_file<P: AsRef<Path>>(
        &mut self,
        file_path: P,
        logical_path: String,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let file_path = file_path.as_ref();
        let mut file = File::open(file_path)?;
        let mut data = Vec::new();
        file.read_to_end(&mut data)?;

        // Heuristic text/binary classification, recorded in the manifest.
        let is_text = is_text_file(&data);

        if verbose {
            println!(
                "Ingesting {}: {} bytes ({})",
                logical_path,
                data.len(),
                if is_text { "text" } else { "binary" }
            );
        }

        let chunk_size = DEFAULT_CHUNK_SIZE;
        let mut chunks = Vec::new();
        let mut corrections_needed = 0usize;

        for (i, chunk) in data.chunks(chunk_size).enumerate() {
            // Chunk IDs are globally sequential across all ingested files.
            let chunk_id = self.manifest.total_chunks + i;

            // Encode chunk to sparse vector
            let chunk_vec = SparseVec::encode_data(chunk, config, Some(&logical_path));

            // Immediately verify: decode and compare
            let decoded = chunk_vec.decode_data(config, Some(&logical_path), chunk.len());

            // Store correction if needed (guarantees reconstruction)
            self.engram
                .corrections
                .add(chunk_id as u64, chunk, &decoded);

            if chunk != decoded.as_slice() {
                corrections_needed += 1;
            }

            // Superpose the chunk into the root and register it in the codebook.
            self.engram.root = self.engram.root.bundle(&chunk_vec);
            self.engram.codebook.insert(chunk_id, chunk_vec);
            chunks.push(chunk_id);
        }

        if verbose && corrections_needed > 0 {
            println!(
                " → {} of {} chunks needed correction",
                corrections_needed,
                chunks.len()
            );
        }

        self.manifest.files.push(FileEntry {
            path: logical_path,
            is_text,
            size: data.len(),
            chunks: chunks.clone(),
            deleted: false,
        });

        // Advance the global chunk counter so the next file continues the sequence.
        self.manifest.total_chunks += chunks.len();

        Ok(())
    }
886
887 /// Add a new file to an existing engram (incremental update)
888 ///
889 /// This method enables efficient incremental updates by adding a single file
890 /// to an existing engram without requiring full re-ingestion. The new file's
891 /// chunks are bundled with the existing root vector using VSA's associative
892 /// bundle operation.
893 ///
894 /// # Algorithm
895 /// 1. Encode new file into chunks (same as ingest_file)
896 /// 2. Bundle each chunk with existing root: `root_new = root_old ⊕ chunk`
897 /// 3. Add chunks to codebook with new chunk IDs
898 /// 4. Update manifest with new file entry
899 ///
900 /// # Performance
901 /// - Time complexity: O(n) where n = number of chunks in new file
902 /// - Does not require reading or re-encoding existing files
903 /// - Suitable for production workflows with frequent additions
904 ///
905 /// # Arguments
906 /// * `file_path` - Path to the file on disk
907 /// * `logical_path` - Path to use in the engram manifest
908 /// * `verbose` - Print progress information
909 /// * `config` - VSA encoding configuration
910 ///
911 /// # Returns
912 /// `io::Result<()>` indicating success or failure
913 ///
914 /// # Examples
915 /// ```no_run
916 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
917 /// use std::path::Path;
918 ///
919 /// let mut fs = EmbrFS::new();
920 /// let config = ReversibleVSAConfig::default();
921 ///
922 /// // Ingest initial dataset
923 /// fs.ingest_directory("./data", false, &config).unwrap();
924 ///
925 /// // Later, add a new file without full re-ingestion
926 /// fs.add_file("./new_file.txt", "new_file.txt".to_string(), true, &config).unwrap();
927 /// ```
928 pub fn add_file<P: AsRef<Path>>(
929 &mut self,
930 file_path: P,
931 logical_path: String,
932 verbose: bool,
933 config: &ReversibleVSAConfig,
934 ) -> io::Result<()> {
935 let file_path = file_path.as_ref();
936
937 // Check if file already exists (not deleted)
938 if self
939 .manifest
940 .files
941 .iter()
942 .any(|f| f.path == logical_path && !f.deleted)
943 {
944 return Err(io::Error::new(
945 io::ErrorKind::AlreadyExists,
946 format!("File '{}' already exists in engram", logical_path),
947 ));
948 }
949
950 // Use existing ingest_file logic (already handles bundling with root)
951 self.ingest_file(file_path, logical_path, verbose, config)
952 }
953
954 /// Remove a file from the engram (mark as deleted for incremental update)
955 ///
956 /// This method marks a file as deleted in the manifest without modifying the
957 /// root vector. This is because VSA bundling is a lossy operation and there's
958 /// no clean inverse. The chunks remain in the codebook but won't be extracted.
959 ///
960 /// # Algorithm
961 /// 1. Find file in manifest by logical path
962 /// 2. Mark file entry as deleted
963 /// 3. Chunks remain in codebook (for potential recovery or compaction)
964 /// 4. File won't appear in future extractions
965 ///
966 /// # Note on VSA Limitations
967 /// Bundle operation is associative but not invertible:
968 /// - `(A ⊕ B) ⊕ C = A ⊕ (B ⊕ C)` ✓ (can add)
969 /// - `(A ⊕ B) ⊖ B ≠ A` ✗ (can't cleanly remove)
970 ///
971 /// To truly remove chunks from the root, use `compact()` which rebuilds
972 /// the engram without deleted files.
973 ///
974 /// # Arguments
975 /// * `logical_path` - Path of the file to remove
976 /// * `verbose` - Print progress information
977 ///
978 /// # Returns
979 /// `io::Result<()>` indicating success or failure
980 ///
981 /// # Examples
982 /// ```no_run
983 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
984 ///
985 /// let mut fs = EmbrFS::new();
986 /// let config = ReversibleVSAConfig::default();
987 ///
988 /// fs.ingest_directory("./data", false, &config).unwrap();
989 /// fs.remove_file("old_file.txt", true).unwrap();
990 /// // File marked as deleted, won't be extracted
991 /// ```
992 pub fn remove_file(&mut self, logical_path: &str, verbose: bool) -> io::Result<()> {
993 // Find file in manifest
994 let file_entry = self
995 .manifest
996 .files
997 .iter_mut()
998 .find(|f| f.path == logical_path && !f.deleted)
999 .ok_or_else(|| {
1000 io::Error::new(
1001 io::ErrorKind::NotFound,
1002 format!("File '{}' not found in engram", logical_path),
1003 )
1004 })?;
1005
1006 if verbose {
1007 println!(
1008 "Marking file as deleted: {} ({} chunks)",
1009 logical_path,
1010 file_entry.chunks.len()
1011 );
1012 }
1013
1014 // Mark as deleted (don't remove from manifest to preserve chunk IDs)
1015 file_entry.deleted = true;
1016
1017 if verbose {
1018 println!(" Note: Use 'compact' to rebuild engram and reclaim space");
1019 }
1020
1021 Ok(())
1022 }
1023
1024 /// Modify an existing file in the engram (incremental update)
1025 ///
1026 /// This method updates a file's content by removing the old version and
1027 /// adding the new version. It's equivalent to `remove_file` + `add_file`.
1028 ///
1029 /// # Algorithm
1030 /// 1. Mark old file as deleted
1031 /// 2. Re-encode new file content
1032 /// 3. Bundle new chunks with root
1033 /// 4. Add new file entry to manifest
1034 ///
1035 /// # Trade-offs
1036 /// - Old chunks remain in codebook (use `compact()` to clean up)
1037 /// - Root contains both old and new chunk contributions (slight noise)
1038 /// - Fast operation, doesn't require rebuilding entire engram
1039 ///
1040 /// # Arguments
1041 /// * `file_path` - Path to the file on disk (new content)
1042 /// * `logical_path` - Path of the file in the engram
1043 /// * `verbose` - Print progress information
1044 /// * `config` - VSA encoding configuration
1045 ///
1046 /// # Returns
1047 /// `io::Result<()>` indicating success or failure
1048 ///
1049 /// # Examples
1050 /// ```no_run
1051 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1052 /// use std::path::Path;
1053 ///
1054 /// let mut fs = EmbrFS::new();
1055 /// let config = ReversibleVSAConfig::default();
1056 ///
1057 /// fs.ingest_directory("./data", false, &config).unwrap();
1058 ///
1059 /// // Later, modify a file
1060 /// fs.modify_file("./data/updated.txt", "data/updated.txt".to_string(), true, &config).unwrap();
1061 /// ```
1062 pub fn modify_file<P: AsRef<Path>>(
1063 &mut self,
1064 file_path: P,
1065 logical_path: String,
1066 verbose: bool,
1067 config: &ReversibleVSAConfig,
1068 ) -> io::Result<()> {
1069 // First, mark old file as deleted
1070 self.remove_file(&logical_path, false)?;
1071
1072 if verbose {
1073 println!("Modifying file: {}", logical_path);
1074 }
1075
1076 // Then add the new version
1077 self.ingest_file(file_path, logical_path, verbose, config)?;
1078
1079 Ok(())
1080 }
1081
1082 /// Compact the engram by rebuilding without deleted files
1083 ///
1084 /// This operation rebuilds the engram from scratch, excluding all files
1085 /// marked as deleted. It's the only way to truly remove old chunks from
1086 /// the root vector and codebook.
1087 ///
1088 /// # Algorithm
1089 /// 1. Create new empty engram
1090 /// 2. Re-bundle all non-deleted files
1091 /// 3. Reassign chunk IDs sequentially
1092 /// 4. Replace old engram with compacted version
1093 ///
1094 /// # Performance
1095 /// - Time complexity: O(N) where N = total bytes of non-deleted files
1096 /// - Expensive operation, run periodically (not after every deletion)
1097 /// - Recommended: compact when deleted files exceed 20-30% of total
1098 ///
1099 /// # Benefits
1100 /// - Reclaims space from deleted chunks
1101 /// - Reduces root vector noise from obsolete data
1102 /// - Resets chunk IDs to sequential order
1103 /// - Maintains bit-perfect reconstruction of kept files
1104 ///
1105 /// # Arguments
1106 /// * `verbose` - Print progress information
1107 /// * `config` - VSA encoding configuration
1108 ///
1109 /// # Returns
1110 /// `io::Result<()>` indicating success or failure
1111 ///
1112 /// # Examples
1113 /// ```no_run
1114 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1115 ///
1116 /// let mut fs = EmbrFS::new();
1117 /// let config = ReversibleVSAConfig::default();
1118 ///
1119 /// fs.ingest_directory("./data", false, &config).unwrap();
1120 /// fs.remove_file("old1.txt", false).unwrap();
1121 /// fs.remove_file("old2.txt", false).unwrap();
1122 ///
1123 /// // After many deletions, compact to reclaim space
1124 /// fs.compact(true, &config).unwrap();
1125 /// ```
1126 pub fn compact(&mut self, verbose: bool, config: &ReversibleVSAConfig) -> io::Result<()> {
1127 if verbose {
1128 let deleted_count = self.manifest.files.iter().filter(|f| f.deleted).count();
1129 let total_count = self.manifest.files.len();
1130 println!(
1131 "Compacting engram: removing {} deleted files ({} remaining)",
1132 deleted_count,
1133 total_count - deleted_count
1134 );
1135 }
1136
1137 // Create new engram with fresh root and codebook
1138 let mut new_engram = Engram {
1139 root: SparseVec::new(),
1140 codebook: HashMap::new(),
1141 corrections: CorrectionStore::new(),
1142 };
1143
1144 // Rebuild manifest with only non-deleted files
1145 let mut new_manifest = Manifest {
1146 files: Vec::new(),
1147 total_chunks: 0,
1148 };
1149
1150 // Process each non-deleted file
1151 for old_file in &self.manifest.files {
1152 if old_file.deleted {
1153 continue;
1154 }
1155
1156 // Reconstruct file data from old engram
1157 let mut file_data = Vec::new();
1158 let num_chunks = old_file.chunks.len();
1159 for (chunk_idx, &chunk_id) in old_file.chunks.iter().enumerate() {
1160 if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
1161 let chunk_size = if chunk_idx == num_chunks - 1 {
1162 let remaining = old_file.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1163 remaining.min(DEFAULT_CHUNK_SIZE)
1164 } else {
1165 DEFAULT_CHUNK_SIZE
1166 };
1167
1168 let decoded = chunk_vec.decode_data(config, Some(&old_file.path), chunk_size);
1169 let chunk_data = if let Some(corrected) =
1170 self.engram.corrections.apply(chunk_id as u64, &decoded)
1171 {
1172 corrected
1173 } else {
1174 decoded
1175 };
1176
1177 file_data.extend_from_slice(&chunk_data);
1178 }
1179 }
1180 file_data.truncate(old_file.size);
1181
1182 // Re-encode with new chunk IDs
1183 let mut new_chunks = Vec::new();
1184
1185 for (i, chunk) in file_data.chunks(DEFAULT_CHUNK_SIZE).enumerate() {
1186 let new_chunk_id = new_manifest.total_chunks + i;
1187
1188 let chunk_vec = SparseVec::encode_data(chunk, config, Some(&old_file.path));
1189 let decoded = chunk_vec.decode_data(config, Some(&old_file.path), chunk.len());
1190
1191 new_engram
1192 .corrections
1193 .add(new_chunk_id as u64, chunk, &decoded);
1194
1195 new_engram.root = new_engram.root.bundle(&chunk_vec);
1196 new_engram.codebook.insert(new_chunk_id, chunk_vec);
1197 new_chunks.push(new_chunk_id);
1198 }
1199
1200 if verbose {
1201 println!(
1202 " Recompacted: {} ({} chunks)",
1203 old_file.path,
1204 new_chunks.len()
1205 );
1206 }
1207
1208 new_manifest.files.push(FileEntry {
1209 path: old_file.path.clone(),
1210 is_text: old_file.is_text,
1211 size: old_file.size,
1212 chunks: new_chunks.clone(),
1213 deleted: false,
1214 });
1215
1216 new_manifest.total_chunks += new_chunks.len();
1217 }
1218
1219 // Replace old engram and manifest with compacted versions
1220 self.engram = new_engram;
1221 self.manifest = new_manifest;
1222
1223 if verbose {
1224 println!(
1225 "Compaction complete: {} files, {} chunks",
1226 self.manifest.files.len(),
1227 self.manifest.total_chunks
1228 );
1229 }
1230
1231 Ok(())
1232 }
1233
1234 /// Save engram to file
1235 pub fn save_engram<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1236 let encoded = bincode::serialize(&self.engram).map_err(io::Error::other)?;
1237 fs::write(path, encoded)?;
1238 Ok(())
1239 }
1240
1241 /// Load engram from file
1242 pub fn load_engram<P: AsRef<Path>>(path: P) -> io::Result<Engram> {
1243 let data = fs::read(path)?;
1244 bincode::deserialize(&data).map_err(io::Error::other)
1245 }
1246
1247 /// Save manifest to JSON file
1248 pub fn save_manifest<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1249 let file = File::create(path)?;
1250 serde_json::to_writer_pretty(file, &self.manifest)?;
1251 Ok(())
1252 }
1253
1254 /// Load manifest from JSON file
1255 pub fn load_manifest<P: AsRef<Path>>(path: P) -> io::Result<Manifest> {
1256 let file = File::open(path)?;
1257 let manifest = serde_json::from_reader(file)?;
1258 Ok(manifest)
1259 }
1260
1261 /// Extract files from engram to directory with guaranteed reconstruction
1262 ///
1263 /// This method guarantees 100% bit-perfect reconstruction by applying
1264 /// stored corrections after decoding each chunk.
1265 ///
1266 /// # Reconstruction Process
1267 ///
1268 /// For each chunk:
1269 /// 1. Decode: `SparseVec → decoded_data`
1270 /// 2. Apply correction: `decoded_data + correction → original_data`
1271 /// 3. Verify: Hash matches stored hash (guaranteed by construction)
1272 ///
1273 /// # Arguments
1274 /// * `engram` - The engram containing encoded data and corrections
1275 /// * `manifest` - File metadata and chunk mappings
1276 /// * `output_dir` - Directory to write extracted files
1277 /// * `verbose` - Print progress information
1278 /// * `config` - VSA decoding configuration
1279 ///
1280 /// # Returns
1281 /// `io::Result<()>` indicating success or failure
1282 pub fn extract<P: AsRef<Path>>(
1283 engram: &Engram,
1284 manifest: &Manifest,
1285 output_dir: P,
1286 verbose: bool,
1287 config: &ReversibleVSAConfig,
1288 ) -> io::Result<()> {
1289 let output_dir = output_dir.as_ref();
1290
1291 if verbose {
1292 println!(
1293 "Extracting {} files to {}",
1294 manifest.files.iter().filter(|f| !f.deleted).count(),
1295 output_dir.display()
1296 );
1297 let stats = engram.corrections.stats();
1298 println!(
1299 " Correction stats: {:.1}% perfect, {:.2}% overhead",
1300 stats.perfect_ratio * 100.0,
1301 stats.correction_ratio * 100.0
1302 );
1303 }
1304
1305 for file_entry in &manifest.files {
1306 // Skip deleted files
1307 if file_entry.deleted {
1308 continue;
1309 }
1310
1311 let file_path = output_dir.join(&file_entry.path);
1312
1313 if let Some(parent) = file_path.parent() {
1314 fs::create_dir_all(parent)?;
1315 }
1316
1317 let mut reconstructed = Vec::new();
1318 let num_chunks = file_entry.chunks.len();
1319 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1320 if let Some(chunk_vec) = engram.codebook.get(&chunk_id) {
1321 // Calculate the actual chunk size
1322 // Last chunk may be smaller than DEFAULT_CHUNK_SIZE
1323 let chunk_size = if chunk_idx == num_chunks - 1 {
1324 // Last chunk: remaining bytes
1325 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1326 remaining.min(DEFAULT_CHUNK_SIZE)
1327 } else {
1328 DEFAULT_CHUNK_SIZE
1329 };
1330
1331 // Decode the sparse vector to bytes
1332 // IMPORTANT: Use the same path as during encoding for correct shift calculation
1333 // Also use the same chunk_size as during ingest for correct correction matching
1334 let decoded = chunk_vec.decode_data(config, Some(&file_entry.path), chunk_size);
1335
1336 // Apply correction to guarantee bit-perfect reconstruction
1337 let chunk_data = if let Some(corrected) =
1338 engram.corrections.apply(chunk_id as u64, &decoded)
1339 {
1340 corrected
1341 } else {
1342 // No correction found - use decoded directly
1343 // This can happen with legacy engrams or if correction store is empty
1344 decoded
1345 };
1346
1347 reconstructed.extend_from_slice(&chunk_data);
1348 }
1349 }
1350
1351 reconstructed.truncate(file_entry.size);
1352
1353 fs::write(&file_path, reconstructed)?;
1354
1355 if verbose {
1356 println!("Extracted: {}", file_entry.path);
1357 }
1358 }
1359
1360 Ok(())
1361 }
1362
1363 /// Extract files using resonator-enhanced pattern completion with guaranteed reconstruction
1364 ///
1365 /// Performs filesystem extraction with intelligent recovery capabilities powered by
1366 /// resonator networks. When chunks are missing from the codebook, the resonator
1367 /// attempts pattern completion to reconstruct the lost data, enabling extraction
1368 /// even from partially corrupted or incomplete engrams.
1369 ///
1370 /// # Reconstruction Guarantee
1371 ///
1372 /// Even with resonator-assisted recovery, corrections are applied to guarantee
1373 /// bit-perfect reconstruction. The process is:
1374 ///
1375 /// 1. Try to get chunk from codebook
1376 /// 2. If missing, use resonator to recover approximate chunk
1377 /// 3. Apply correction from CorrectionStore
1378 /// 4. Result is guaranteed bit-perfect (if correction exists)
1379 ///
1380 /// # How it works
1381 /// 1. For each file chunk, check if it exists in the engram codebook
1382 /// 2. If missing, use the resonator to project a query vector onto known patterns
1383 /// 3. Apply stored corrections for guaranteed accuracy
1384 /// 4. Reconstruct the file from available and recovered chunks
1385 /// 5. If no resonator is configured, falls back to standard extraction
1386 ///
1387 /// # Why this matters
1388 /// - Enables 100% reconstruction even with missing chunks
1389 /// - Provides fault tolerance for distributed storage scenarios
1390 /// - Supports hierarchical recovery at multiple levels of the storage stack
1391 /// - Maintains data integrity through pattern-based completion
1392 ///
1393 /// # Arguments
1394 /// * `output_dir` - Directory path where extracted files will be written
1395 /// * `verbose` - Whether to print progress information during extraction
1396 /// * `config` - VSA configuration for encoding/decoding
1397 ///
1398 /// # Returns
1399 /// `io::Result<()>` indicating success or failure of the extraction operation
1400 ///
1401 /// # Examples
1402 /// ```
1403 /// use embeddenator_fs::{EmbrFS, Resonator, ReversibleVSAConfig};
1404 /// use std::path::Path;
1405 ///
1406 /// let mut fs = EmbrFS::new();
1407 /// let resonator = Resonator::new();
1408 /// let config = ReversibleVSAConfig::default();
1409 /// fs.set_resonator(resonator);
1410 ///
1411 /// // Assuming fs has been populated with data...
1412 /// let result = fs.extract_with_resonator("/tmp/output", true, &config);
1413 /// assert!(result.is_ok());
1414 /// ```
1415 pub fn extract_with_resonator<P: AsRef<Path>>(
1416 &self,
1417 output_dir: P,
1418 verbose: bool,
1419 config: &ReversibleVSAConfig,
1420 ) -> io::Result<()> {
1421 if self.resonator.is_none() {
1422 return Self::extract(&self.engram, &self.manifest, output_dir, verbose, config);
1423 }
1424
1425 // SAFETY: we just checked is_none() above and returned early
1426 let _resonator = self
1427 .resonator
1428 .as_ref()
1429 .expect("resonator is Some after is_none() check");
1430 let output_dir = output_dir.as_ref();
1431
1432 if verbose {
1433 println!(
1434 "Extracting {} files with resonator enhancement to {}",
1435 self.manifest.files.iter().filter(|f| !f.deleted).count(),
1436 output_dir.display()
1437 );
1438 let stats = self.engram.corrections.stats();
1439 println!(
1440 " Correction stats: {:.1}% perfect, {:.2}% overhead",
1441 stats.perfect_ratio * 100.0,
1442 stats.correction_ratio * 100.0
1443 );
1444 }
1445
1446 for file_entry in &self.manifest.files {
1447 // Skip deleted files
1448 if file_entry.deleted {
1449 continue;
1450 }
1451
1452 let file_path = output_dir.join(&file_entry.path);
1453
1454 if let Some(parent) = file_path.parent() {
1455 fs::create_dir_all(parent)?;
1456 }
1457
1458 let mut reconstructed = Vec::new();
1459 let num_chunks = file_entry.chunks.len();
1460 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1461 // Calculate the actual chunk size
1462 let chunk_size = if chunk_idx == num_chunks - 1 {
1463 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1464 remaining.min(DEFAULT_CHUNK_SIZE)
1465 } else {
1466 DEFAULT_CHUNK_SIZE
1467 };
1468
1469 let chunk_data = if let Some(vector) = self.engram.codebook.get(&chunk_id) {
1470 // Decode the SparseVec back to bytes using reversible encoding
1471 // IMPORTANT: Use the same path as during encoding for correct shift calculation
1472 let decoded = vector.decode_data(config, Some(&file_entry.path), chunk_size);
1473
1474 // Apply correction to guarantee bit-perfect reconstruction
1475 if let Some(corrected) =
1476 self.engram.corrections.apply(chunk_id as u64, &decoded)
1477 {
1478 corrected
1479 } else {
1480 decoded
1481 }
1482 } else if let Some(resonator) = &self.resonator {
1483 // Use resonator to recover missing chunk
1484 // Create a query vector from the chunk_id using reversible encoding
1485 let query_vec = SparseVec::encode_data(&chunk_id.to_le_bytes(), config, None);
1486 let recovered_vec = resonator.project(&query_vec);
1487
1488 // Decode the recovered vector back to bytes
1489 // For resonator recovery, try with path first, fall back to no path
1490 let decoded =
1491 recovered_vec.decode_data(config, Some(&file_entry.path), chunk_size);
1492
1493 // Apply correction if available (may not be if chunk was lost)
1494 if let Some(corrected) =
1495 self.engram.corrections.apply(chunk_id as u64, &decoded)
1496 {
1497 corrected
1498 } else {
1499 // No correction available - best effort recovery
1500 decoded
1501 }
1502 } else {
1503 return Err(io::Error::new(
1504 io::ErrorKind::NotFound,
1505 format!("Missing chunk {} and no resonator available", chunk_id),
1506 ));
1507 };
1508 reconstructed.extend_from_slice(&chunk_data);
1509 }
1510
1511 reconstructed.truncate(file_entry.size);
1512
1513 fs::write(&file_path, reconstructed)?;
1514
1515 if verbose {
1516 println!("Extracted with resonator: {}", file_entry.path);
1517 }
1518 }
1519
1520 Ok(())
1521 }
1522
1523 /// Perform hierarchical bundling with path role binding and permutation tagging
1524 ///
1525 /// Creates multi-level engram structures where path components are encoded using
1526 /// permutation operations to create distinct representations at each level. This
1527 /// enables efficient hierarchical retrieval and reconstruction.
1528 ///
1529 /// # How it works
1530 /// 1. Split file paths into components (e.g., "a/b/c.txt" → ["a", "b", "c.txt"])
1531 /// 2. For each level, apply permutation based on path component hash
1532 /// 3. Bundle representations level-by-level with sparsity control
1533 /// 4. Create sub-engrams for intermediate nodes
1534 ///
1535 /// # Why this matters
1536 /// - Enables scalable hierarchical storage beyond flat bundling limits
1537 /// - Path-based retrieval without full engram traversal
1538 /// - Maintains semantic relationships through permutation encoding
1539 /// - Supports efficient partial reconstruction
1540 ///
1541 /// # Arguments
1542 /// * `max_level_sparsity` - Maximum non-zero elements per level bundle
1543 /// * `verbose` - Whether to print progress information
1544 ///
1545 /// # Returns
1546 /// HierarchicalManifest describing the multi-level structure
1547 ///
1548 /// # Examples
1549 /// ```
1550 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1551 ///
1552 /// let fs = EmbrFS::new();
1553 /// let config = ReversibleVSAConfig::default();
1554 /// // Assuming files have been ingested...
1555 ///
1556 /// let hierarchical = fs.bundle_hierarchically(500, false, &config);
1557 /// assert!(hierarchical.is_ok());
1558 /// ```
1559 pub fn bundle_hierarchically(
1560 &self,
1561 max_level_sparsity: usize,
1562 verbose: bool,
1563 _config: &ReversibleVSAConfig,
1564 ) -> io::Result<HierarchicalManifest> {
1565 self.bundle_hierarchically_with_options(max_level_sparsity, None, verbose, _config)
1566 }
1567
1568 /// Like `bundle_hierarchically`, but supports an optional deterministic cap on `chunk_ids` per node.
1569 ///
1570 /// If `max_chunks_per_node` is set and a node would exceed that many `chunk_ids`, the node becomes
1571 /// a router with empty `chunk_ids`, and deterministic shard children are created each containing a
1572 /// bounded subset of `chunk_ids`.
1573 pub fn bundle_hierarchically_with_options(
1574 &self,
1575 max_level_sparsity: usize,
1576 max_chunks_per_node: Option<usize>,
1577 verbose: bool,
1578 _config: &ReversibleVSAConfig,
1579 ) -> io::Result<HierarchicalManifest> {
1580 let mut levels = Vec::new();
1581 let mut sub_engrams = HashMap::new();
1582
1583 // Group files by *path prefixes* at each level.
1584 // Level 0: "a"; Level 1: "a/b"; etc.
1585 let mut level_prefixes: HashMap<usize, HashMap<String, Vec<&FileEntry>>> = HashMap::new();
1586 for file_entry in &self.manifest.files {
1587 let comps: Vec<&str> = file_entry.path.split('/').collect();
1588 let mut prefix = String::new();
1589 for (level, &comp) in comps.iter().enumerate() {
1590 if level == 0 {
1591 prefix.push_str(comp);
1592 } else {
1593 prefix.push('/');
1594 prefix.push_str(comp);
1595 }
1596 level_prefixes
1597 .entry(level)
1598 .or_default()
1599 .entry(prefix.clone())
1600 .or_default()
1601 .push(file_entry);
1602 }
1603 }
1604
1605 // Process each level
1606 let max_level = level_prefixes.keys().max().unwrap_or(&0);
1607
1608 for level in 0..=*max_level {
1609 if verbose {
1610 let item_count = level_prefixes
1611 .get(&level)
1612 .map(|comps| comps.values().map(|files| files.len()).sum::<usize>())
1613 .unwrap_or(0);
1614 println!("Processing level {} with {} items", level, item_count);
1615 }
1616
1617 let mut level_bundle = SparseVec::new();
1618 let mut manifest_items = Vec::new();
1619
1620 if let Some(prefixes) = level_prefixes.get(&level) {
1621 let mut prefix_keys: Vec<&String> = prefixes.keys().collect();
1622 prefix_keys.sort();
1623
1624 for prefix in prefix_keys {
1625 let mut files: Vec<&FileEntry> = prefixes
1626 .get(prefix)
1627 // SAFETY: prefix comes from keys(), so get() must succeed
1628 .expect("prefix key from keys() must exist in HashMap")
1629 .to_vec();
1630 files.sort_by(|a, b| a.path.cmp(&b.path));
1631
1632 // Create permutation shift based on prefix hash
1633 let shift = {
1634 use std::collections::hash_map::DefaultHasher;
1635 use std::hash::{Hash, Hasher};
1636 let mut hasher = DefaultHasher::new();
1637 prefix.hash(&mut hasher);
1638 (hasher.finish() % (DIM as u64)) as usize
1639 };
1640
1641 // Bundle all files under this component with permutation
1642 let mut component_bundle = SparseVec::new();
1643 let mut chunk_ids_set: HashSet<usize> = HashSet::new();
1644 for file_entry in &files {
1645 // Find chunks for this file and bundle them
1646 let mut file_bundle = SparseVec::new();
1647 for &chunk_id in &file_entry.chunks {
1648 if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
1649 file_bundle = file_bundle.bundle(chunk_vec);
1650 chunk_ids_set.insert(chunk_id);
1651 }
1652 }
1653
1654 // Apply level-based permutation
1655 let permuted_file = file_bundle.permute(shift * (level + 1));
1656 component_bundle = component_bundle.bundle(&permuted_file);
1657 }
1658
1659 // Apply sparsity control
1660 if component_bundle.pos.len() + component_bundle.neg.len() > max_level_sparsity
1661 {
1662 component_bundle = component_bundle.thin(max_level_sparsity);
1663 }
1664
1665 level_bundle = level_bundle.bundle(&component_bundle);
1666
1667 // Create sub-engram for this prefix.
1668 // Children are the immediate next-level prefixes underneath this prefix.
1669 let sub_id = format!("level_{}_prefix_{}", level, prefix);
1670
1671 let mut children_set: HashSet<String> = HashSet::new();
1672 if level < *max_level {
1673 for file_entry in &files {
1674 let comps: Vec<&str> = file_entry.path.split('/').collect();
1675 if comps.len() <= level + 1 {
1676 continue;
1677 }
1678 let child_prefix = comps[..=level + 1].join("/");
1679 let child_id = format!("level_{}_prefix_{}", level + 1, child_prefix);
1680 children_set.insert(child_id);
1681 }
1682 }
1683 let mut children: Vec<String> = children_set.into_iter().collect();
1684 children.sort();
1685
1686 let mut chunk_ids: Vec<usize> = chunk_ids_set.into_iter().collect();
1687 chunk_ids.sort_unstable();
1688
1689 let chunk_count: usize = files.iter().map(|f| f.chunks.len()).sum();
1690
1691 if let Some(max_chunks) = max_chunks_per_node.filter(|v| *v > 0) {
1692 if chunk_ids.len() > max_chunks {
1693 let mut shard_ids: Vec<String> = Vec::new();
1694 for (shard_idx, chunk_slice) in chunk_ids.chunks(max_chunks).enumerate()
1695 {
1696 let shard_id = format!("{}__shard_{:04}", sub_id, shard_idx);
1697 shard_ids.push(shard_id.clone());
1698 sub_engrams.insert(
1699 shard_id.clone(),
1700 SubEngram {
1701 id: shard_id,
1702 root: component_bundle.clone(),
1703 chunk_ids: chunk_slice.to_vec(),
1704 chunk_count: chunk_slice.len(),
1705 children: Vec::new(),
1706 },
1707 );
1708 }
1709
1710 let mut router_children = shard_ids;
1711 router_children.extend(children.clone());
1712 router_children.sort();
1713 router_children.dedup();
1714
1715 sub_engrams.insert(
1716 sub_id.clone(),
1717 SubEngram {
1718 id: sub_id.clone(),
1719 root: component_bundle,
1720 chunk_ids: Vec::new(),
1721 chunk_count,
1722 children: router_children,
1723 },
1724 );
1725 } else {
1726 sub_engrams.insert(
1727 sub_id.clone(),
1728 SubEngram {
1729 id: sub_id.clone(),
1730 root: component_bundle,
1731 chunk_ids,
1732 chunk_count,
1733 children,
1734 },
1735 );
1736 }
1737 } else {
1738 sub_engrams.insert(
1739 sub_id.clone(),
1740 SubEngram {
1741 id: sub_id.clone(),
1742 root: component_bundle,
1743 chunk_ids,
1744 chunk_count,
1745 children,
1746 },
1747 );
1748 }
1749
1750 manifest_items.push(ManifestItem {
1751 path: prefix.clone(),
1752 sub_engram_id: sub_id,
1753 });
1754 }
1755 }
1756
1757 manifest_items.sort_by(|a, b| {
1758 a.path
1759 .cmp(&b.path)
1760 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
1761 });
1762
1763 // Apply final sparsity control to level bundle
1764 if level_bundle.pos.len() + level_bundle.neg.len() > max_level_sparsity {
1765 level_bundle = level_bundle.thin(max_level_sparsity);
1766 }
1767
1768 levels.push(ManifestLevel {
1769 level: level as u32,
1770 items: manifest_items,
1771 });
1772 }
1773
1774 Ok(HierarchicalManifest {
1775 version: 1,
1776 levels,
1777 sub_engrams,
1778 })
1779 }
1780
    /// Extract files from hierarchical manifest with manifest-guided traversal
    ///
    /// NOTE(review): despite the title above, reconstruction below reads every
    /// chunk directly from the flat `self.engram.codebook`; the `hierarchical`
    /// argument is only consulted for the verbose level-count message. True
    /// manifest-guided traversal of sub-engrams is not performed here —
    /// confirm intent before relying on partial / sub-engram extraction.
    ///
    /// # Arguments
    /// * `hierarchical` - Hierarchical manifest (currently used for reporting only; see note)
    /// * `output_dir` - Directory path where extracted files will be written
    /// * `verbose` - Whether to print progress information during extraction
    /// * `config` - VSA configuration used to decode each chunk
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure of the hierarchical extraction
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    /// // Assuming hierarchical manifest was created...
    /// // let hierarchical = fs.bundle_hierarchically(500, true).unwrap();
    ///
    /// // fs.extract_hierarchically(&hierarchical, "/tmp/output", true, &config)?;
    /// ```
    pub fn extract_hierarchically<P: AsRef<Path>>(
        &self,
        hierarchical: &HierarchicalManifest,
        output_dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let output_dir = output_dir.as_ref();

        if verbose {
            println!(
                "Extracting hierarchical manifest with {} levels to {}",
                hierarchical.levels.len(),
                output_dir.display()
            );
        }

        // Iterate the flat manifest; the hierarchical structure is not
        // traversed for reconstruction (see NOTE above).
        for file_entry in &self.manifest.files {
            // Skip deleted files
            if file_entry.deleted {
                continue;
            }

            let file_path = output_dir.join(&file_entry.path);

            // Ensure the parent directory exists before writing.
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent)?;
            }

            let mut reconstructed = Vec::new();

            // Reconstruct each chunk from the flat codebook.
            // NOTE(review): a chunk id absent from the codebook is silently
            // skipped here, yielding a short output file with no error.
            let num_chunks = file_entry.chunks.len();
            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
                if let Some(chunk_vector) = self.engram.codebook.get(&chunk_id) {
                    // Calculate the actual chunk size; the final chunk may be
                    // shorter than DEFAULT_CHUNK_SIZE.
                    let chunk_size = if chunk_idx == num_chunks - 1 {
                        let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
                        remaining.min(DEFAULT_CHUNK_SIZE)
                    } else {
                        DEFAULT_CHUNK_SIZE
                    };

                    // Decode with the file's path, which drives the shift
                    // calculation used at encode time.
                    let decoded =
                        chunk_vector.decode_data(config, Some(&file_entry.path), chunk_size);

                    // Apply correction if available (bit-perfect when present)
                    let chunk_data = if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    };

                    reconstructed.extend_from_slice(&chunk_data);
                }
            }

            // Truncate to actual file size
            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted hierarchical: {}", file_entry.path);
            }
        }

        Ok(())
    }
1891}
/// Heuristically classify `data` as text (`true`) or binary (`false`).
///
/// Inspects at most the first 8 KiB. The data is considered text when the
/// sampled bytes contain no NUL bytes and fewer than 10% of them are control
/// characters other than `\n`, `\r`, and `\t`. Empty input counts as text.
pub fn is_text_file(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    // Only examine a bounded prefix so large files stay cheap to classify.
    let sample = &data[..data.len().min(8192)];

    let mut nulls = 0usize;
    let mut controls = 0usize;
    for &b in sample {
        match b {
            0 => nulls += 1,
            b'\n' | b'\r' | b'\t' => {}
            _ if b < 32 => controls += 1,
            _ => {}
        }
    }

    nulls == 0 && controls < sample.len() / 10
}