embeddenator_fs/fs/embrfs.rs
1//! EmbrFS - Holographic Filesystem Implementation
2//!
3//! Provides engram-based storage for entire filesystem trees with:
4//! - Chunked encoding for efficient storage
5//! - Manifest for file metadata
6//! - **Guaranteed 100% bit-perfect reconstruction** via CorrectionStore
7//!
8//! # Reconstruction Guarantee
9//!
10//! The fundamental challenge with VSA encoding is that approximate operations
11//! may introduce errors during superposition. This module solves that through
12//! a multi-layer approach:
13//!
14//! 1. **Primary Encoding**: SparseVec encoding attempts bit-perfect storage
15//! 2. **Correction Layer**: CorrectionStore captures any encoding errors
16//! 3. **Reconstruction**: Decode + apply corrections = exact original
17//!
18//! The invariant: `original = decode(encode(original)) + correction`
19//!
20//! If encoding was perfect, correction is empty. If not, correction exactly
21//! compensates. Either way, reconstruction is guaranteed bit-perfect.
22
23use crate::correction::{CorrectionStats, CorrectionStore};
24use embeddenator_retrieval::resonator::Resonator;
25use embeddenator_retrieval::{RerankedResult, TernaryInvertedIndex};
26use embeddenator_vsa::{ReversibleVSAConfig, ReversibleVSAEncoder, SparseVec, DIM};
27use serde::{Deserialize, Serialize};
28use std::collections::BTreeMap;
29use std::collections::{HashMap, HashSet};
30use std::fs::{self, File};
31use std::io::{self, Read};
32use std::path::{Path, PathBuf};
33use walkdir::WalkDir;
34
/// Default chunk size for file encoding (4KB)
///
/// Used by legacy (non-holographic) mode; holographic mode uses
/// `HOLOGRAPHIC_CHUNK_SIZE` (8 bytes) instead.
pub const DEFAULT_CHUNK_SIZE: usize = 4096;
37
/// File entry in the manifest
#[derive(Serialize, Deserialize, Debug)]
pub struct FileEntry {
    /// Logical path of the file inside the engram (forward-slash separated).
    pub path: String,
    /// Whether the content looked like text at ingest time (heuristic).
    pub is_text: bool,
    /// Original file size in bytes.
    pub size: usize,
    /// Global chunk IDs (codebook keys) holding this file's data, in order.
    pub chunks: Vec<usize>,
    /// Mark files as deleted without rebuilding root (for incremental updates)
    #[serde(default)]
    pub deleted: bool,
}
49
/// Manifest describing filesystem structure
#[derive(Serialize, Deserialize, Debug)]
pub struct Manifest {
    /// All file entries, including soft-deleted ones.
    pub files: Vec<FileEntry>,
    /// Total number of chunks allocated so far (also the next free chunk ID).
    pub total_chunks: usize,
    /// Chunk size used during encoding (8 for holographic, 4096 for legacy).
    /// Older manifests without this field default to 4096.
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,
    /// Whether holographic encoding was used
    #[serde(default)]
    pub holographic: bool,
}
62
/// Serde default for `Manifest::chunk_size`: manifests written before the
/// field existed are assumed to use the legacy 4KB chunking.
fn default_chunk_size() -> usize {
    DEFAULT_CHUNK_SIZE
}
66
/// Hierarchical manifest for multi-level engrams
#[derive(Serialize, Deserialize, Debug)]
pub struct HierarchicalManifest {
    /// Manifest format version.
    pub version: u32,
    /// Levels of the hierarchy; the level-0 entries seed hierarchical queries.
    pub levels: Vec<ManifestLevel>,
    /// Sub-engrams keyed by ID. May be empty when sub-engrams are loaded
    /// on demand through a `SubEngramStore` instead.
    #[serde(default)]
    pub sub_engrams: HashMap<String, SubEngram>,
}
75
/// Level in hierarchical manifest
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestLevel {
    /// Depth of this level (0 is the query entry level).
    pub level: u32,
    /// Path → sub-engram bindings at this level.
    pub items: Vec<ManifestItem>,
}
82
/// Item in manifest level
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestItem {
    /// Logical path this item covers.
    pub path: String,
    /// ID of the sub-engram holding this item's data.
    pub sub_engram_id: String,
}
89
/// Sub-engram in hierarchical structure
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct SubEngram {
    /// Unique ID; also used (escaped) as the on-disk filename.
    pub id: String,
    /// Root vector of this node; compared against queries to rank it.
    pub root: SparseVec,
    /// Chunk IDs that belong to this sub-engram.
    ///
    /// This enables selective retrieval without indexing the entire global codebook.
    #[serde(default)]
    pub chunk_ids: Vec<usize>,
    /// Number of chunks under this sub-engram.
    pub chunk_count: usize,
    /// Child sub-engram IDs, descended into during hierarchical traversal.
    pub children: Vec<String>,
}
103
/// Bounds and tuning parameters for hierarchical selective retrieval.
///
/// All values are hard caps; `Default` provides conservative starting points.
/// A cache capacity of 0 disables the corresponding cache.
#[derive(Clone, Debug)]
pub struct HierarchicalQueryBounds {
    /// Global top-k results to return.
    pub k: usize,
    /// Candidate count per expanded node before reranking.
    pub candidate_k: usize,
    /// Maximum number of frontier nodes retained (beam width).
    pub beam_width: usize,
    /// Maximum depth to descend (0 means only level-0 nodes).
    pub max_depth: usize,
    /// Maximum number of expanded nodes.
    pub max_expansions: usize,
    /// Maximum number of cached inverted indices.
    pub max_open_indices: usize,
    /// Maximum number of cached sub-engrams.
    pub max_open_engrams: usize,
}
122
123impl Default for HierarchicalQueryBounds {
124 fn default() -> Self {
125 Self {
126 k: 10,
127 candidate_k: 100,
128 beam_width: 32,
129 max_depth: 4,
130 max_expansions: 128,
131 max_open_indices: 16,
132 max_open_engrams: 16,
133 }
134 }
135}
136
/// A single chunk result from a hierarchical query.
#[derive(Clone, Debug, PartialEq)]
pub struct HierarchicalChunkHit {
    /// ID of the sub-engram whose index produced this hit.
    pub sub_engram_id: String,
    /// Global chunk ID (codebook key).
    pub chunk_id: usize,
    /// Approximate score from the inverted-index candidate pass.
    pub approx_score: i32,
    /// Exact cosine similarity between the query and the chunk vector.
    pub cosine: f64,
}
144
/// Beam-search frontier entry: a sub-engram pending expansion.
#[derive(Clone, Debug)]
struct FrontierItem {
    /// Cosine similarity between the query and this node's root vector.
    score: f64,
    /// ID of the sub-engram to expand.
    sub_engram_id: String,
    /// Depth at which this node was discovered (0 = seed level).
    depth: usize,
}
151
/// Inverted index over a subset of the codebook, using dense local IDs.
///
/// Local IDs are indices into `local_to_global`, which maps each one back to
/// its global codebook chunk ID.
#[derive(Clone, Debug)]
struct RemappedInvertedIndex {
    index: TernaryInvertedIndex,
    // local id (dense index into this Vec) -> global codebook chunk id
    local_to_global: Vec<usize>,
}
157
158impl RemappedInvertedIndex {
159 fn build(chunk_ids: &[usize], vectors: &HashMap<usize, SparseVec>) -> Self {
160 let mut index = TernaryInvertedIndex::new();
161 let mut local_to_global = Vec::with_capacity(chunk_ids.len());
162
163 for (local_id, &global_id) in chunk_ids.iter().enumerate() {
164 let Some(vec) = vectors.get(&global_id) else {
165 continue;
166 };
167 local_to_global.push(global_id);
168 index.add(local_id, vec);
169 }
170
171 index.finalize();
172 Self {
173 index,
174 local_to_global,
175 }
176 }
177
178 fn query_top_k_reranked(
179 &self,
180 query: &SparseVec,
181 vectors: &HashMap<usize, SparseVec>,
182 candidate_k: usize,
183 k: usize,
184 ) -> Vec<HierarchicalChunkHit> {
185 if k == 0 {
186 return Vec::new();
187 }
188
189 let candidates = self.index.query_top_k(query, candidate_k);
190 let mut out = Vec::with_capacity(candidates.len().min(k));
191 for cand in candidates {
192 let Some(&global_id) = self.local_to_global.get(cand.id) else {
193 continue;
194 };
195 let Some(vec) = vectors.get(&global_id) else {
196 continue;
197 };
198 out.push((global_id, cand.score, query.cosine(vec)));
199 }
200
201 out.sort_by(|a, b| {
202 b.2.total_cmp(&a.2)
203 .then_with(|| b.1.cmp(&a.1))
204 .then_with(|| a.0.cmp(&b.0))
205 });
206 out.truncate(k);
207
208 out.into_iter()
209 .map(|(chunk_id, approx_score, cosine)| HierarchicalChunkHit {
210 sub_engram_id: String::new(),
211 chunk_id,
212 approx_score,
213 cosine,
214 })
215 .collect()
216 }
217}
218
/// Minimal LRU cache keyed by `String`.
///
/// Recency is tracked in a `Vec`: the front holds the least-recently used key
/// and the back the most-recently used. A capacity of 0 disables caching.
#[derive(Clone, Debug)]
struct LruCache<V> {
    cap: usize,
    map: HashMap<String, V>,
    order: Vec<String>,
}

impl<V> LruCache<V> {
    // Create an empty cache holding at most `cap` entries.
    fn new(cap: usize) -> Self {
        Self {
            cap,
            map: HashMap::new(),
            order: Vec::new(),
        }
    }

    // Look up `key`, marking it most-recently used on a hit.
    fn get(&mut self, key: &str) -> Option<&V> {
        // `touch` is a no-op for unknown keys, so it is safe to call first.
        self.touch(key);
        self.map.get(key)
    }

    // Insert or replace `key`, evicting least-recently-used entries while the
    // capacity is exceeded. A zero-capacity cache stores nothing.
    fn insert(&mut self, key: String, value: V) {
        if self.cap == 0 {
            return;
        }

        let replaced = self.map.insert(key.clone(), value).is_some();
        if replaced {
            // Size is unchanged; just refresh recency.
            self.touch(&key);
            return;
        }
        self.order.push(key);

        while self.map.len() > self.cap {
            if self.order.is_empty() {
                break;
            }
            let evicted = self.order.remove(0);
            self.map.remove(&evicted);
        }
    }

    // Move `key` to the most-recently-used position if present.
    fn touch(&mut self, key: &str) {
        if let Some(pos) = self.order.iter().position(|k| k == key) {
            let owned = self.order.remove(pos);
            self.order.push(owned);
        }
    }
}
274
/// Storage/loader seam for hierarchical sub-engrams.
///
/// This enables on-demand loading (e.g., from disk) rather than requiring that
/// every sub-engram is materialized in memory.
pub trait SubEngramStore {
    /// Load the sub-engram with the given ID, or `None` if it is unavailable.
    fn load(&self, id: &str) -> Option<SubEngram>;
}
282
/// Minimal reversible escaping so sub-engram IDs can be used as filenames.
///
/// Only `%` and `/` are rewritten (`%` first in spirit, so `/` escapes are
/// unambiguous). IDs are internal; this is not a sanitizer for untrusted input.
fn escape_sub_engram_id(id: &str) -> String {
    let mut escaped = String::with_capacity(id.len());
    for ch in id.chars() {
        match ch {
            '%' => escaped.push_str("%25"),
            '/' => escaped.push_str("%2F"),
            other => escaped.push(other),
        }
    }
    escaped
}
288
/// Directory-backed store for sub-engrams.
///
/// Files are stored as bincode blobs under `${dir}/{escaped_id}.subengram`.
pub struct DirectorySubEngramStore {
    /// Root directory holding the `.subengram` files.
    dir: PathBuf,
}
295
296impl DirectorySubEngramStore {
297 pub fn new<P: AsRef<Path>>(dir: P) -> Self {
298 Self {
299 dir: dir.as_ref().to_path_buf(),
300 }
301 }
302
303 fn path_for_id(&self, id: &str) -> PathBuf {
304 self.dir
305 .join(format!("{}.subengram", escape_sub_engram_id(id)))
306 }
307}
308
309impl SubEngramStore for DirectorySubEngramStore {
310 fn load(&self, id: &str) -> Option<SubEngram> {
311 let path = self.path_for_id(id);
312 let data = fs::read(path).ok()?;
313 bincode::deserialize(&data).ok()
314 }
315}
316
317/// Save a hierarchical manifest as JSON.
318pub fn save_hierarchical_manifest<P: AsRef<Path>>(
319 hierarchical: &HierarchicalManifest,
320 path: P,
321) -> io::Result<()> {
322 let file = File::create(path)?;
323
324 // Serialize deterministically: HashMap iteration order is not stable.
325 #[derive(Serialize)]
326 struct StableHierarchicalManifest {
327 version: u32,
328 levels: Vec<ManifestLevel>,
329 sub_engrams: BTreeMap<String, SubEngram>,
330 }
331
332 let mut levels = hierarchical.levels.clone();
333 levels.sort_by_key(|a| a.level);
334 for level in &mut levels {
335 level.items.sort_by(|a, b| {
336 a.path
337 .cmp(&b.path)
338 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
339 });
340 }
341
342 let mut sub_engrams: BTreeMap<String, SubEngram> = BTreeMap::new();
343 for (id, sub) in &hierarchical.sub_engrams {
344 sub_engrams.insert(id.clone(), sub.clone());
345 }
346
347 let stable = StableHierarchicalManifest {
348 version: hierarchical.version,
349 levels,
350 sub_engrams,
351 };
352
353 serde_json::to_writer_pretty(file, &stable)?;
354 Ok(())
355}
356
357/// Load a hierarchical manifest from JSON.
358pub fn load_hierarchical_manifest<P: AsRef<Path>>(path: P) -> io::Result<HierarchicalManifest> {
359 let file = File::open(path)?;
360 let manifest = serde_json::from_reader(file)?;
361 Ok(manifest)
362}
363
364/// Save a set of sub-engrams to a directory (bincode per sub-engram).
365pub fn save_sub_engrams_dir<P: AsRef<Path>>(
366 sub_engrams: &HashMap<String, SubEngram>,
367 dir: P,
368) -> io::Result<()> {
369 let dir = dir.as_ref();
370 fs::create_dir_all(dir)?;
371
372 let mut ids: Vec<&String> = sub_engrams.keys().collect();
373 ids.sort();
374
375 for id in ids {
376 // SAFETY: id comes from keys(), so get() must succeed
377 let sub = sub_engrams
378 .get(id)
379 .expect("sub_engram id from keys() must exist in HashMap");
380 let encoded = bincode::serialize(sub).map_err(io::Error::other)?;
381 let path = dir.join(format!("{}.subengram", escape_sub_engram_id(id)));
382 fs::write(path, encoded)?;
383 }
384 Ok(())
385}
386
/// `SubEngramStore` backed by a borrowed in-memory map (no loading needed).
struct InMemorySubEngramStore<'a> {
    map: &'a HashMap<String, SubEngram>,
}
390
impl<'a> InMemorySubEngramStore<'a> {
    /// Wrap a borrowed map of sub-engrams as a store.
    fn new(map: &'a HashMap<String, SubEngram>) -> Self {
        Self { map }
    }
}
396
impl SubEngramStore for InMemorySubEngramStore<'_> {
    /// Clone the sub-engram out of the borrowed map, if present.
    fn load(&self, id: &str) -> Option<SubEngram> {
        self.map.get(id).cloned()
    }
}
402
403fn get_cached_sub_engram(
404 cache: &mut LruCache<SubEngram>,
405 store: &impl SubEngramStore,
406 id: &str,
407) -> Option<SubEngram> {
408 if let Some(v) = cache.get(id) {
409 return Some(v.clone());
410 }
411 let loaded = store.load(id)?;
412 cache.insert(id.to_string(), loaded.clone());
413 Some(loaded)
414}
415
/// Query a hierarchical manifest by selectively unfolding only promising sub-engrams.
///
/// This performs a beam-limited traversal over `hierarchical.sub_engrams`.
/// At each expanded node, it builds (and LRU-caches) an inverted index over the
/// node-local `chunk_ids` subset of `codebook`, then reranks by exact cosine.
///
/// Convenience wrapper: equivalent to `query_hierarchical_codebook_with_store`
/// backed by an in-memory view of `hierarchical.sub_engrams`.
pub fn query_hierarchical_codebook(
    hierarchical: &HierarchicalManifest,
    codebook: &HashMap<usize, SparseVec>,
    query: &SparseVec,
    bounds: &HierarchicalQueryBounds,
) -> Vec<HierarchicalChunkHit> {
    let store = InMemorySubEngramStore::new(&hierarchical.sub_engrams);
    query_hierarchical_codebook_with_store(hierarchical, &store, codebook, query, bounds)
}
430
431/// Store-backed variant of `query_hierarchical_codebook` that supports on-demand sub-engram loading.
432pub fn query_hierarchical_codebook_with_store(
433 hierarchical: &HierarchicalManifest,
434 store: &impl SubEngramStore,
435 codebook: &HashMap<usize, SparseVec>,
436 query: &SparseVec,
437 bounds: &HierarchicalQueryBounds,
438) -> Vec<HierarchicalChunkHit> {
439 if bounds.k == 0 || hierarchical.levels.is_empty() {
440 return Vec::new();
441 }
442
443 let mut sub_cache: LruCache<SubEngram> = LruCache::new(bounds.max_open_engrams);
444 let mut index_cache: LruCache<RemappedInvertedIndex> = LruCache::new(bounds.max_open_indices);
445
446 let mut frontier: Vec<FrontierItem> = Vec::new();
447 if let Some(level0) = hierarchical.levels.first() {
448 for item in &level0.items {
449 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &item.sub_engram_id)
450 else {
451 continue;
452 };
453 frontier.push(FrontierItem {
454 score: query.cosine(&sub.root),
455 sub_engram_id: item.sub_engram_id.clone(),
456 depth: 0,
457 });
458 }
459 }
460
461 frontier.sort_by(|a, b| {
462 b.score
463 .total_cmp(&a.score)
464 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
465 });
466 if frontier.len() > bounds.beam_width {
467 frontier.truncate(bounds.beam_width);
468 }
469
470 let mut expansions = 0usize;
471
472 // Keep only the best hit per chunk for determinism.
473 let mut best_by_chunk: HashMap<usize, HierarchicalChunkHit> = HashMap::new();
474
475 while !frontier.is_empty() && expansions < bounds.max_expansions {
476 let node = frontier.remove(0);
477
478 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &node.sub_engram_id) else {
479 continue;
480 };
481
482 expansions += 1;
483
484 let idx = if let Some(existing) = index_cache.get(&node.sub_engram_id) {
485 existing
486 } else {
487 let built = RemappedInvertedIndex::build(&sub.chunk_ids, codebook);
488 index_cache.insert(node.sub_engram_id.clone(), built);
489 // SAFETY: we just inserted the key, so get() must succeed immediately after
490 index_cache
491 .get(&node.sub_engram_id)
492 .expect("index_cache.get() must succeed immediately after insert()")
493 };
494
495 let mut local_hits =
496 idx.query_top_k_reranked(query, codebook, bounds.candidate_k, bounds.k);
497 for hit in &mut local_hits {
498 hit.sub_engram_id = node.sub_engram_id.clone();
499 }
500
501 for hit in local_hits {
502 match best_by_chunk.get(&hit.chunk_id) {
503 None => {
504 best_by_chunk.insert(hit.chunk_id, hit);
505 }
506 Some(existing) => {
507 let better = hit
508 .cosine
509 .total_cmp(&existing.cosine)
510 .then_with(|| hit.approx_score.cmp(&existing.approx_score))
511 .is_gt();
512 if better {
513 best_by_chunk.insert(hit.chunk_id, hit);
514 }
515 }
516 }
517 }
518
519 if node.depth >= bounds.max_depth {
520 continue;
521 }
522
523 let children = sub.children.clone();
524 for child_id in &children {
525 let Some(child) = get_cached_sub_engram(&mut sub_cache, store, child_id) else {
526 continue;
527 };
528 frontier.push(FrontierItem {
529 score: query.cosine(&child.root),
530 sub_engram_id: child_id.clone(),
531 depth: node.depth + 1,
532 });
533 }
534
535 frontier.sort_by(|a, b| {
536 b.score
537 .total_cmp(&a.score)
538 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
539 });
540 if frontier.len() > bounds.beam_width {
541 frontier.truncate(bounds.beam_width);
542 }
543 }
544
545 let mut out: Vec<HierarchicalChunkHit> = best_by_chunk.into_values().collect();
546 out.sort_by(|a, b| {
547 b.cosine
548 .total_cmp(&a.cosine)
549 .then_with(|| b.approx_score.cmp(&a.approx_score))
550 .then_with(|| a.chunk_id.cmp(&b.chunk_id))
551 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
552 });
553 out.truncate(bounds.k);
554 out
555}
556
/// Unified manifest enum for backward compatibility
#[derive(Serialize, Deserialize, Debug)]
pub enum UnifiedManifest {
    /// Single-level manifest (legacy layout).
    Flat(Manifest),
    /// Multi-level manifest with sub-engrams.
    Hierarchical(HierarchicalManifest),
}
563
impl From<Manifest> for UnifiedManifest {
    /// Wrap a flat manifest in the unified enum.
    fn from(manifest: Manifest) -> Self {
        UnifiedManifest::Flat(manifest)
    }
}
569
/// Engram: holographic encoding of a filesystem with correction guarantee
#[derive(Serialize, Deserialize)]
pub struct Engram {
    /// Superposition (bundle) of all ingested chunk vectors.
    pub root: SparseVec,
    /// Per-chunk vectors keyed by global chunk ID.
    pub codebook: HashMap<usize, SparseVec>,
    /// Correction store for 100% reconstruction guarantee
    #[serde(default)]
    pub corrections: CorrectionStore,
}
579
580impl Engram {
581 /// Build a reusable inverted index over the codebook.
582 ///
583 /// This is useful when issuing multiple queries (e.g., shift-sweeps) and you
584 /// want to avoid rebuilding the index each time.
585 pub fn build_codebook_index(&self) -> TernaryInvertedIndex {
586 TernaryInvertedIndex::build_from_map(&self.codebook)
587 }
588
589 /// Query the codebook using a pre-built inverted index.
590 pub fn query_codebook_with_index(
591 &self,
592 index: &TernaryInvertedIndex,
593 query: &SparseVec,
594 candidate_k: usize,
595 k: usize,
596 ) -> Vec<RerankedResult> {
597 if k == 0 || self.codebook.is_empty() {
598 return Vec::new();
599 }
600 index.query_top_k_reranked(query, &self.codebook, candidate_k, k)
601 }
602
603 /// Query the engram's codebook for chunks most similar to `query`.
604 ///
605 /// This builds an inverted index over the codebook for sub-linear candidate
606 /// generation, then reranks those candidates using exact cosine similarity.
607 pub fn query_codebook(&self, query: &SparseVec, k: usize) -> Vec<RerankedResult> {
608 if k == 0 || self.codebook.is_empty() {
609 return Vec::new();
610 }
611
612 // Simple heuristic: rerank a moderately-sized candidate set.
613 let candidate_k = (k.saturating_mul(10)).max(50);
614 let index = self.build_codebook_index();
615 self.query_codebook_with_index(&index, query, candidate_k, k)
616 }
617}
618
/// Chunk size optimized for holographic encoding (8 bytes)
/// Smaller chunks achieve higher accuracy (~94%) with ReversibleVSAEncoder
/// Using larger chunks creates too much crosstalk from bundling
///
/// Used by `EmbrFS::new_holographic()` for both the manifest's recorded
/// `chunk_size` and the ingest chunking.
pub const HOLOGRAPHIC_CHUNK_SIZE: usize = 8;
623
/// EmbrFS - Holographic Filesystem with Guaranteed Reconstruction
///
/// # 100% Reconstruction Guarantee
///
/// EmbrFS guarantees bit-perfect file reconstruction through a layered approach:
///
/// 1. **Encode**: Data chunks → SparseVec via reversible encoding
/// 2. **Verify**: Immediately decode and compare to original
/// 3. **Correct**: Store minimal correction if any difference exists
/// 4. **Extract**: Decode + apply correction = exact original bytes
///
/// This guarantee holds regardless of:
/// - Data content (binary, text, compressed, encrypted)
/// - File size (single byte to gigabytes)
/// - Number of files in the engram
/// - Superposition crosstalk in bundles
///
/// # Holographic Mode
///
/// When created with `new_holographic()`, uses `ReversibleVSAEncoder` which achieves
/// ~94% uncorrected accuracy through position-aware VSA binding. This results in
/// <10% correction overhead instead of the ~200%+ overhead of legacy encoding.
///
/// # Examples
///
/// ```
/// use embeddenator_fs::EmbrFS;
/// use std::path::Path;
///
/// // Legacy mode (not recommended)
/// let mut fs_legacy = EmbrFS::new();
///
/// // Holographic mode (recommended - minimal storage overhead)
/// let mut fs = EmbrFS::new_holographic();
/// assert_eq!(fs.manifest.total_chunks, 0);
/// assert_eq!(fs.manifest.files.len(), 0);
/// ```
pub struct EmbrFS {
    /// File metadata and chunk bookkeeping.
    pub manifest: Manifest,
    /// Root vector, per-chunk codebook, and correction store.
    pub engram: Engram,
    /// Optional resonator for pattern recovery during extraction.
    pub resonator: Option<Resonator>,
    /// ReversibleVSAEncoder for true holographic encoding (~94% accuracy)
    /// None in legacy mode, Some in holographic mode
    encoder: Option<ReversibleVSAEncoder>,
    /// Chunk size for encoding (8 bytes for holographic, 4096 for legacy)
    chunk_size: usize,
}
671
impl Default for EmbrFS {
    /// Defaults to holographic mode, the recommended configuration.
    fn default() -> Self {
        Self::new_holographic()
    }
}
677
678impl EmbrFS {
    /// Create a new empty EmbrFS instance (legacy mode - NOT RECOMMENDED)
    ///
    /// This constructor creates an EmbrFS with legacy encoding that has only ~10%
    /// accuracy, resulting in ~200%+ storage overhead due to verbatim corrections.
    ///
    /// **Use `new_holographic()` instead for production use.**
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// // Legacy mode - high storage overhead
    /// let fs = EmbrFS::new();
    /// assert_eq!(fs.manifest.files.len(), 0);
    /// ```
    #[deprecated(
        since = "0.25.0",
        note = "Use new_holographic() instead for ~94% encoding accuracy and <10% storage overhead"
    )]
    pub fn new() -> Self {
        EmbrFS {
            manifest: Manifest {
                files: Vec::new(),
                total_chunks: 0,
                chunk_size: DEFAULT_CHUNK_SIZE,
                holographic: false,
            },
            engram: Engram {
                root: SparseVec::new(),
                codebook: HashMap::new(),
                corrections: CorrectionStore::new(),
            },
            resonator: None,
            // No encoder: legacy mode encodes via SparseVec::encode_data.
            encoder: None,
            chunk_size: DEFAULT_CHUNK_SIZE,
        }
    }
717
    /// Create a new EmbrFS with holographic encoding (RECOMMENDED)
    ///
    /// Uses `ReversibleVSAEncoder` which achieves ~94% uncorrected accuracy through
    /// position-aware VSA binding. This results in <10% correction overhead instead
    /// of the ~200%+ overhead of legacy encoding.
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::new_holographic();
    /// assert_eq!(fs.manifest.files.len(), 0);
    /// assert_eq!(fs.manifest.total_chunks, 0);
    /// assert!(fs.is_holographic());
    /// ```
    pub fn new_holographic() -> Self {
        EmbrFS {
            manifest: Manifest {
                files: Vec::new(),
                total_chunks: 0,
                chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
                holographic: true,
            },
            engram: Engram {
                root: SparseVec::new(),
                codebook: HashMap::new(),
                corrections: CorrectionStore::new(),
            },
            resonator: None,
            // Presence of the encoder is what `is_holographic()` tests.
            encoder: Some(ReversibleVSAEncoder::new()),
            chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
        }
    }
752
    /// Check if holographic mode is enabled
    ///
    /// True exactly when a `ReversibleVSAEncoder` is present.
    pub fn is_holographic(&self) -> bool {
        self.encoder.is_some()
    }
757
    /// Get the chunk size being used
    ///
    /// 8 bytes in holographic mode, 4096 in legacy mode.
    pub fn chunk_size(&self) -> usize {
        self.chunk_size
    }
762
763 fn path_to_forward_slash_string(path: &Path) -> String {
764 path.components()
765 .filter_map(|c| match c {
766 std::path::Component::Normal(s) => s.to_str().map(|v| v.to_string()),
767 _ => None,
768 })
769 .collect::<Vec<String>>()
770 .join("/")
771 }
772
    /// Set the resonator for enhanced pattern recovery during extraction
    ///
    /// Configures a resonator network that can perform pattern completion to recover
    /// missing or corrupted data chunks during filesystem extraction. The resonator
    /// acts as a content-addressable memory that can reconstruct lost information
    /// by finding the best matching patterns in its trained codebook.
    ///
    /// # How it works
    /// - The resonator maintains a codebook of known vector patterns
    /// - During extraction, missing chunks are projected onto the closest known pattern
    /// - This enables robust recovery from partial data loss or corruption
    ///
    /// # Why this matters
    /// - Provides fault tolerance for holographic storage systems
    /// - Enables reconstruction even when some chunks are unavailable
    /// - Supports graceful degradation rather than complete failure
    ///
    /// # Arguments
    /// * `resonator` - A trained resonator network for pattern completion
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, Resonator};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let resonator = Resonator::new();
    /// fs.set_resonator(resonator);
    /// // Now extraction will use resonator-enhanced recovery
    /// ```
    pub fn set_resonator(&mut self, resonator: Resonator) {
        self.resonator = Some(resonator);
    }
805
    /// Get correction statistics for this engram
    ///
    /// Returns statistics about how many chunks needed correction and the
    /// overhead incurred by storing corrections.
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::new_holographic();
    /// let stats = fs.correction_stats();
    /// assert_eq!(stats.total_chunks, 0);
    /// ```
    pub fn correction_stats(&self) -> CorrectionStats {
        self.engram.corrections.stats()
    }
822
    /// Ingest an entire directory into engram format
    ///
    /// Convenience wrapper around `ingest_directory_with_prefix` with no
    /// logical-path prefix.
    pub fn ingest_directory<P: AsRef<Path>>(
        &mut self,
        dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        self.ingest_directory_with_prefix(dir, None, verbose, config)
    }
832
833 /// Ingest a directory into the engram, optionally prefixing all logical paths.
834 ///
835 /// When `logical_prefix` is provided, all ingested file paths become:
836 /// `{logical_prefix}/{relative_path_from_dir}`.
837 pub fn ingest_directory_with_prefix<P: AsRef<Path>>(
838 &mut self,
839 dir: P,
840 logical_prefix: Option<&str>,
841 verbose: bool,
842 config: &ReversibleVSAConfig,
843 ) -> io::Result<()> {
844 let dir = dir.as_ref();
845 if verbose {
846 println!("Ingesting directory: {}", dir.display());
847 }
848
849 let mut files_to_process = Vec::new();
850 for entry in WalkDir::new(dir).follow_links(false) {
851 let entry = entry?;
852 if entry.file_type().is_file() {
853 files_to_process.push(entry.path().to_path_buf());
854 }
855 }
856 files_to_process.sort();
857
858 for file_path in files_to_process {
859 let relative = file_path.strip_prefix(dir).unwrap_or(file_path.as_path());
860 let rel = Self::path_to_forward_slash_string(relative);
861 let logical_path = if let Some(prefix) = logical_prefix {
862 if prefix.is_empty() {
863 rel
864 } else if rel.is_empty() {
865 prefix.to_string()
866 } else {
867 format!("{}/{}", prefix, rel)
868 }
869 } else {
870 rel
871 };
872
873 self.ingest_file(&file_path, logical_path, verbose, config)?;
874 }
875
876 Ok(())
877 }
878
879 /// Ingest a single file into the engram with guaranteed reconstruction
880 ///
881 /// This method encodes file data into sparse vectors and stores any
882 /// necessary corrections to guarantee 100% bit-perfect reconstruction.
883 ///
884 /// # Correction Process
885 ///
886 /// For each chunk:
887 /// 1. Encode: `chunk_data → SparseVec`
888 /// 2. Decode: `SparseVec → decoded_data`
889 /// 3. Compare: `chunk_data == decoded_data?`
890 /// 4. If different: store correction in `CorrectionStore`
891 ///
892 /// # Arguments
893 /// * `file_path` - Path to the file on disk
894 /// * `logical_path` - Path to use in the engram manifest
895 /// * `verbose` - Print progress information
896 /// * `config` - VSA encoding configuration
897 ///
898 /// # Returns
899 /// `io::Result<()>` indicating success or failure
900 pub fn ingest_file<P: AsRef<Path>>(
901 &mut self,
902 file_path: P,
903 logical_path: String,
904 verbose: bool,
905 config: &ReversibleVSAConfig,
906 ) -> io::Result<()> {
907 let file_path = file_path.as_ref();
908 let mut file = File::open(file_path)?;
909 let mut data = Vec::new();
910 file.read_to_end(&mut data)?;
911
912 let is_text = is_text_file(&data);
913 let is_holographic = self.encoder.is_some();
914
915 if verbose {
916 println!(
917 "Ingesting {}: {} bytes ({}, {})",
918 logical_path,
919 data.len(),
920 if is_text { "text" } else { "binary" },
921 if is_holographic {
922 "holographic"
923 } else {
924 "legacy"
925 }
926 );
927 }
928
929 let chunk_size = self.chunk_size;
930 let mut chunks = Vec::new();
931 let mut corrections_needed = 0usize;
932 let mut total_correction_bytes = 0usize;
933
934 for (i, chunk) in data.chunks(chunk_size).enumerate() {
935 let chunk_id = self.manifest.total_chunks + i;
936
937 // Encode chunk to sparse vector
938 let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
939 // Holographic mode: use ReversibleVSAEncoder (~94% accuracy)
940 let encoded = encoder.encode(chunk);
941 let decoded = encoder.decode(&encoded, chunk.len());
942 (encoded, decoded)
943 } else {
944 // Legacy mode: use SparseVec::encode_data (~10% accuracy)
945 let encoded = SparseVec::encode_data(chunk, config, Some(&logical_path));
946 let decoded = encoded.decode_data(config, Some(&logical_path), chunk.len());
947 (encoded, decoded)
948 };
949
950 // Store correction if needed (guarantees reconstruction)
951 self.engram
952 .corrections
953 .add(chunk_id as u64, chunk, &decoded);
954
955 if chunk != decoded.as_slice() {
956 corrections_needed += 1;
957 // Track correction overhead
958 if let Some(correction) = self.engram.corrections.get(chunk_id as u64) {
959 total_correction_bytes += correction.storage_size();
960 }
961 }
962
963 self.engram.root = self.engram.root.bundle(&chunk_vec);
964 self.engram.codebook.insert(chunk_id, chunk_vec);
965 chunks.push(chunk_id);
966 }
967
968 if verbose {
969 let total_chunks = chunks.len();
970 let perfect_chunks = total_chunks - corrections_needed;
971 let accuracy = if total_chunks > 0 {
972 (perfect_chunks as f64 / total_chunks as f64) * 100.0
973 } else {
974 100.0
975 };
976 let overhead = if !data.is_empty() {
977 (total_correction_bytes as f64 / data.len() as f64) * 100.0
978 } else {
979 0.0
980 };
981 println!(
982 " → {}/{} chunks perfect ({:.1}% accuracy), {:.1}% correction overhead",
983 perfect_chunks, total_chunks, accuracy, overhead
984 );
985 }
986
987 self.manifest.files.push(FileEntry {
988 path: logical_path,
989 is_text,
990 size: data.len(),
991 chunks: chunks.clone(),
992 deleted: false,
993 });
994
995 self.manifest.total_chunks += chunks.len();
996
997 Ok(())
998 }
999
1000 /// Add a new file to an existing engram (incremental update)
1001 ///
1002 /// This method enables efficient incremental updates by adding a single file
1003 /// to an existing engram without requiring full re-ingestion. The new file's
1004 /// chunks are bundled with the existing root vector using VSA's associative
1005 /// bundle operation.
1006 ///
1007 /// # Algorithm
1008 /// 1. Encode new file into chunks (same as ingest_file)
1009 /// 2. Bundle each chunk with existing root: `root_new = root_old ⊕ chunk`
1010 /// 3. Add chunks to codebook with new chunk IDs
1011 /// 4. Update manifest with new file entry
1012 ///
1013 /// # Performance
1014 /// - Time complexity: O(n) where n = number of chunks in new file
1015 /// - Does not require reading or re-encoding existing files
1016 /// - Suitable for production workflows with frequent additions
1017 ///
1018 /// # Arguments
1019 /// * `file_path` - Path to the file on disk
1020 /// * `logical_path` - Path to use in the engram manifest
1021 /// * `verbose` - Print progress information
1022 /// * `config` - VSA encoding configuration
1023 ///
1024 /// # Returns
1025 /// `io::Result<()>` indicating success or failure
1026 ///
1027 /// # Examples
1028 /// ```no_run
1029 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1030 /// use std::path::Path;
1031 ///
1032 /// let mut fs = EmbrFS::new();
1033 /// let config = ReversibleVSAConfig::default();
1034 ///
1035 /// // Ingest initial dataset
1036 /// fs.ingest_directory("./data", false, &config).unwrap();
1037 ///
1038 /// // Later, add a new file without full re-ingestion
1039 /// fs.add_file("./new_file.txt", "new_file.txt".to_string(), true, &config).unwrap();
1040 /// ```
1041 pub fn add_file<P: AsRef<Path>>(
1042 &mut self,
1043 file_path: P,
1044 logical_path: String,
1045 verbose: bool,
1046 config: &ReversibleVSAConfig,
1047 ) -> io::Result<()> {
1048 let file_path = file_path.as_ref();
1049
1050 // Check if file already exists (not deleted)
1051 if self
1052 .manifest
1053 .files
1054 .iter()
1055 .any(|f| f.path == logical_path && !f.deleted)
1056 {
1057 return Err(io::Error::new(
1058 io::ErrorKind::AlreadyExists,
1059 format!("File '{}' already exists in engram", logical_path),
1060 ));
1061 }
1062
1063 // Use existing ingest_file logic (already handles bundling with root)
1064 self.ingest_file(file_path, logical_path, verbose, config)
1065 }
1066
1067 /// Remove a file from the engram (mark as deleted for incremental update)
1068 ///
1069 /// This method marks a file as deleted in the manifest without modifying the
1070 /// root vector. This is because VSA bundling is a lossy operation and there's
1071 /// no clean inverse. The chunks remain in the codebook but won't be extracted.
1072 ///
1073 /// # Algorithm
1074 /// 1. Find file in manifest by logical path
1075 /// 2. Mark file entry as deleted
1076 /// 3. Chunks remain in codebook (for potential recovery or compaction)
1077 /// 4. File won't appear in future extractions
1078 ///
1079 /// # Note on VSA Limitations
1080 /// Bundle operation is associative but not invertible:
1081 /// - `(A ⊕ B) ⊕ C = A ⊕ (B ⊕ C)` ✓ (can add)
1082 /// - `(A ⊕ B) ⊖ B ≠ A` ✗ (can't cleanly remove)
1083 ///
1084 /// To truly remove chunks from the root, use `compact()` which rebuilds
1085 /// the engram without deleted files.
1086 ///
1087 /// # Arguments
1088 /// * `logical_path` - Path of the file to remove
1089 /// * `verbose` - Print progress information
1090 ///
1091 /// # Returns
1092 /// `io::Result<()>` indicating success or failure
1093 ///
1094 /// # Examples
1095 /// ```no_run
1096 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1097 ///
1098 /// let mut fs = EmbrFS::new();
1099 /// let config = ReversibleVSAConfig::default();
1100 ///
1101 /// fs.ingest_directory("./data", false, &config).unwrap();
1102 /// fs.remove_file("old_file.txt", true).unwrap();
1103 /// // File marked as deleted, won't be extracted
1104 /// ```
1105 pub fn remove_file(&mut self, logical_path: &str, verbose: bool) -> io::Result<()> {
1106 // Find file in manifest
1107 let file_entry = self
1108 .manifest
1109 .files
1110 .iter_mut()
1111 .find(|f| f.path == logical_path && !f.deleted)
1112 .ok_or_else(|| {
1113 io::Error::new(
1114 io::ErrorKind::NotFound,
1115 format!("File '{}' not found in engram", logical_path),
1116 )
1117 })?;
1118
1119 if verbose {
1120 println!(
1121 "Marking file as deleted: {} ({} chunks)",
1122 logical_path,
1123 file_entry.chunks.len()
1124 );
1125 }
1126
1127 // Mark as deleted (don't remove from manifest to preserve chunk IDs)
1128 file_entry.deleted = true;
1129
1130 if verbose {
1131 println!(" Note: Use 'compact' to rebuild engram and reclaim space");
1132 }
1133
1134 Ok(())
1135 }
1136
1137 /// Modify an existing file in the engram (incremental update)
1138 ///
1139 /// This method updates a file's content by removing the old version and
1140 /// adding the new version. It's equivalent to `remove_file` + `add_file`.
1141 ///
1142 /// # Algorithm
1143 /// 1. Mark old file as deleted
1144 /// 2. Re-encode new file content
1145 /// 3. Bundle new chunks with root
1146 /// 4. Add new file entry to manifest
1147 ///
1148 /// # Trade-offs
1149 /// - Old chunks remain in codebook (use `compact()` to clean up)
1150 /// - Root contains both old and new chunk contributions (slight noise)
1151 /// - Fast operation, doesn't require rebuilding entire engram
1152 ///
1153 /// # Arguments
1154 /// * `file_path` - Path to the file on disk (new content)
1155 /// * `logical_path` - Path of the file in the engram
1156 /// * `verbose` - Print progress information
1157 /// * `config` - VSA encoding configuration
1158 ///
1159 /// # Returns
1160 /// `io::Result<()>` indicating success or failure
1161 ///
1162 /// # Examples
1163 /// ```no_run
1164 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1165 /// use std::path::Path;
1166 ///
1167 /// let mut fs = EmbrFS::new();
1168 /// let config = ReversibleVSAConfig::default();
1169 ///
1170 /// fs.ingest_directory("./data", false, &config).unwrap();
1171 ///
1172 /// // Later, modify a file
1173 /// fs.modify_file("./data/updated.txt", "data/updated.txt".to_string(), true, &config).unwrap();
1174 /// ```
1175 pub fn modify_file<P: AsRef<Path>>(
1176 &mut self,
1177 file_path: P,
1178 logical_path: String,
1179 verbose: bool,
1180 config: &ReversibleVSAConfig,
1181 ) -> io::Result<()> {
1182 // First, mark old file as deleted
1183 self.remove_file(&logical_path, false)?;
1184
1185 if verbose {
1186 println!("Modifying file: {}", logical_path);
1187 }
1188
1189 // Then add the new version
1190 self.ingest_file(file_path, logical_path, verbose, config)?;
1191
1192 Ok(())
1193 }
1194
    /// Compact the engram by rebuilding without deleted files
    ///
    /// This operation rebuilds the engram from scratch, excluding all files
    /// marked as deleted. It's the only way to truly remove old chunks from
    /// the root vector and codebook.
    ///
    /// # Algorithm
    /// 1. Create new empty engram
    /// 2. Re-bundle all non-deleted files
    /// 3. Reassign chunk IDs sequentially
    /// 4. Replace old engram with compacted version
    ///
    /// # Performance
    /// - Time complexity: O(N) where N = total bytes of non-deleted files
    /// - Expensive operation, run periodically (not after every deletion)
    /// - Recommended: compact when deleted files exceed 20-30% of total
    ///
    /// # Benefits
    /// - Reclaims space from deleted chunks
    /// - Reduces root vector noise from obsolete data
    /// - Resets chunk IDs to sequential order
    /// - Maintains bit-perfect reconstruction of kept files
    ///
    /// # Arguments
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    ///
    /// # Examples
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    ///
    /// fs.ingest_directory("./data", false, &config).unwrap();
    /// fs.remove_file("old1.txt", false).unwrap();
    /// fs.remove_file("old2.txt", false).unwrap();
    ///
    /// // After many deletions, compact to reclaim space
    /// fs.compact(true, &config).unwrap();
    /// ```
    pub fn compact(&mut self, verbose: bool, config: &ReversibleVSAConfig) -> io::Result<()> {
        if verbose {
            let deleted_count = self.manifest.files.iter().filter(|f| f.deleted).count();
            let total_count = self.manifest.files.len();
            println!(
                "Compacting engram: removing {} deleted files ({} remaining)",
                deleted_count,
                total_count - deleted_count
            );
        }

        // Mode and chunking of the rebuilt engram follow the current instance
        // state (not the possibly-older manifest values).
        let is_holographic = self.encoder.is_some();
        let chunk_size = self.chunk_size;

        // Create new engram with fresh root and codebook
        let mut new_engram = Engram {
            root: SparseVec::new(),
            codebook: HashMap::new(),
            corrections: CorrectionStore::new(),
        };

        // Rebuild manifest with only non-deleted files
        let mut new_manifest = Manifest {
            files: Vec::new(),
            total_chunks: 0,
            chunk_size,
            holographic: is_holographic,
        };

        // Process each non-deleted file
        for old_file in &self.manifest.files {
            if old_file.deleted {
                continue;
            }

            // Phase 1: reconstruct the file's original bytes from the OLD
            // engram (decode + apply stored corrections), so re-encoding
            // starts from bit-perfect data.
            let mut file_data = Vec::new();
            let num_chunks = old_file.chunks.len();
            // Old chunks were cut with the manifest's chunk size, which may
            // differ from the new `chunk_size` used below.
            let old_chunk_size = self.manifest.chunk_size;

            for (chunk_idx, &chunk_id) in old_file.chunks.iter().enumerate() {
                if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
                    // The final chunk may be shorter than old_chunk_size;
                    // saturating_sub guards against inconsistent size metadata.
                    let this_chunk_size = if chunk_idx == num_chunks - 1 {
                        let remaining = old_file.size.saturating_sub(chunk_idx * old_chunk_size);
                        remaining.min(old_chunk_size)
                    } else {
                        old_chunk_size
                    };

                    // Decode using appropriate method
                    let decoded = if self.manifest.holographic {
                        if let Some(ref encoder) = self.encoder {
                            encoder.decode(chunk_vec, this_chunk_size)
                        } else {
                            // Fallback if encoder not available
                            chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                        }
                    } else {
                        chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                    };

                    // Apply the stored correction (if any) so the
                    // reconstructed bytes are exactly the original data.
                    let chunk_data = if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    };

                    file_data.extend_from_slice(&chunk_data);
                }
            }
            file_data.truncate(old_file.size);

            // Phase 2: re-encode with new sequential chunk IDs using the
            // current encoder, capturing fresh corrections as we go.
            let mut new_chunks = Vec::new();

            for (i, chunk) in file_data.chunks(chunk_size).enumerate() {
                // Sequential ID: continues from all chunks emitted so far.
                let new_chunk_id = new_manifest.total_chunks + i;

                // Encode using appropriate method; immediately decode the
                // result so the correction layer can record any delta.
                let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
                    let encoded = encoder.encode(chunk);
                    let decoded = encoder.decode(&encoded, chunk.len());
                    (encoded, decoded)
                } else {
                    let encoded = SparseVec::encode_data(chunk, config, Some(&old_file.path));
                    let decoded = encoded.decode_data(config, Some(&old_file.path), chunk.len());
                    (encoded, decoded)
                };

                // Record the encode/decode delta so later extraction is
                // guaranteed bit-perfect.
                new_engram
                    .corrections
                    .add(new_chunk_id as u64, chunk, &decoded);

                new_engram.root = new_engram.root.bundle(&chunk_vec);
                new_engram.codebook.insert(new_chunk_id, chunk_vec);
                new_chunks.push(new_chunk_id);
            }

            if verbose {
                println!(
                    " Recompacted: {} ({} chunks)",
                    old_file.path,
                    new_chunks.len()
                );
            }

            new_manifest.files.push(FileEntry {
                path: old_file.path.clone(),
                is_text: old_file.is_text,
                size: old_file.size,
                chunks: new_chunks.clone(),
                deleted: false,
            });

            new_manifest.total_chunks += new_chunks.len();
        }

        // Atomically (from the caller's perspective) swap in the compacted
        // engram and manifest.
        self.engram = new_engram;
        self.manifest = new_manifest;

        if verbose {
            let stats = self.engram.corrections.stats();
            println!(
                "Compaction complete: {} files, {} chunks ({:.1}% perfect, {:.2}% correction overhead)",
                self.manifest.files.len(),
                self.manifest.total_chunks,
                stats.perfect_ratio * 100.0,
                stats.correction_ratio * 100.0
            );
        }

        Ok(())
    }
1375
1376 /// Save engram to file
1377 pub fn save_engram<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1378 let encoded = bincode::serialize(&self.engram).map_err(io::Error::other)?;
1379 fs::write(path, encoded)?;
1380 Ok(())
1381 }
1382
1383 /// Load engram from file
1384 pub fn load_engram<P: AsRef<Path>>(path: P) -> io::Result<Engram> {
1385 let data = fs::read(path)?;
1386 bincode::deserialize(&data).map_err(io::Error::other)
1387 }
1388
1389 /// Save manifest to JSON file
1390 pub fn save_manifest<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1391 let file = File::create(path)?;
1392 serde_json::to_writer_pretty(file, &self.manifest)?;
1393 Ok(())
1394 }
1395
1396 /// Load manifest from JSON file
1397 pub fn load_manifest<P: AsRef<Path>>(path: P) -> io::Result<Manifest> {
1398 let file = File::open(path)?;
1399 let manifest = serde_json::from_reader(file)?;
1400 Ok(manifest)
1401 }
1402
1403 /// Load an EmbrFS from engram and manifest files
1404 ///
1405 /// Automatically detects if the engram was created with holographic mode
1406 /// and sets up the appropriate encoder for extraction.
1407 ///
1408 /// # Arguments
1409 /// * `engram_path` - Path to the engram file
1410 /// * `manifest_path` - Path to the manifest JSON file
1411 ///
1412 /// # Returns
1413 /// `io::Result<EmbrFS>` with the loaded engram and manifest
1414 pub fn load<P: AsRef<Path>, Q: AsRef<Path>>(
1415 engram_path: P,
1416 manifest_path: Q,
1417 ) -> io::Result<Self> {
1418 let engram = Self::load_engram(engram_path)?;
1419 let manifest = Self::load_manifest(manifest_path)?;
1420
1421 // Create encoder if holographic mode was used
1422 let (encoder, chunk_size) = if manifest.holographic {
1423 (Some(ReversibleVSAEncoder::new()), manifest.chunk_size)
1424 } else {
1425 (None, manifest.chunk_size)
1426 };
1427
1428 Ok(EmbrFS {
1429 manifest,
1430 engram,
1431 resonator: None,
1432 encoder,
1433 chunk_size,
1434 })
1435 }
1436
    /// Extract files from engram to directory with guaranteed reconstruction
    ///
    /// This method guarantees 100% bit-perfect reconstruction by applying
    /// stored corrections after decoding each chunk.
    ///
    /// # Reconstruction Process
    ///
    /// For each chunk:
    /// 1. Decode: `SparseVec → decoded_data`
    /// 2. Apply correction: `decoded_data + correction → original_data`
    /// 3. Verify: Hash matches stored hash (guaranteed by construction)
    ///
    /// # Arguments
    /// * `engram` - The engram containing encoded data and corrections
    /// * `manifest` - File metadata and chunk mappings
    /// * `output_dir` - Directory to write extracted files
    /// * `verbose` - Print progress information
    /// * `config` - VSA decoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    pub fn extract<P: AsRef<Path>>(
        engram: &Engram,
        manifest: &Manifest,
        output_dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let output_dir = output_dir.as_ref();

        // Use manifest's chunk_size and holographic flag
        let chunk_size = manifest.chunk_size;
        let is_holographic = manifest.holographic;

        // Create encoder for holographic decoding if needed
        let encoder = if is_holographic {
            Some(ReversibleVSAEncoder::new())
        } else {
            None
        };

        if verbose {
            println!(
                "Extracting {} files to {} ({})",
                manifest.files.iter().filter(|f| !f.deleted).count(),
                output_dir.display(),
                if is_holographic {
                    "holographic"
                } else {
                    "legacy"
                }
            );
            let stats = engram.corrections.stats();
            println!(
                " Correction stats: {:.1}% perfect, {:.2}% overhead",
                stats.perfect_ratio * 100.0,
                stats.correction_ratio * 100.0
            );
        }

        for file_entry in &manifest.files {
            // Skip deleted files
            if file_entry.deleted {
                continue;
            }

            let file_path = output_dir.join(&file_entry.path);

            // Ensure the target's parent directories exist before writing.
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent)?;
            }

            let mut reconstructed = Vec::new();
            let num_chunks = file_entry.chunks.len();
            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
                // Chunks missing from the codebook are silently skipped here;
                // the final truncate below still bounds the output length.
                if let Some(chunk_vec) = engram.codebook.get(&chunk_id) {
                    // Calculate the actual chunk size for this chunk
                    // Last chunk may be smaller than the standard chunk_size
                    let this_chunk_size = if chunk_idx == num_chunks - 1 {
                        // Last chunk: remaining bytes
                        let remaining = file_entry.size.saturating_sub(chunk_idx * chunk_size);
                        remaining.min(chunk_size)
                    } else {
                        chunk_size
                    };

                    // Decode the sparse vector to bytes using appropriate method
                    let decoded = if let Some(ref enc) = encoder {
                        // Holographic mode: use ReversibleVSAEncoder
                        enc.decode(chunk_vec, this_chunk_size)
                    } else {
                        // Legacy mode: use SparseVec::decode_data
                        // (the file path seeds the shift used at encode time)
                        chunk_vec.decode_data(config, Some(&file_entry.path), this_chunk_size)
                    };

                    // Apply correction to guarantee bit-perfect reconstruction
                    let chunk_data = if let Some(corrected) =
                        engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        // No correction found - use decoded directly
                        // This can happen with legacy engrams or if correction store is empty
                        decoded
                    };

                    reconstructed.extend_from_slice(&chunk_data);
                }
            }

            // Trim any padding introduced by fixed-size chunk decoding.
            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted: {}", file_entry.path);
            }
        }

        Ok(())
    }
1558
1559 /// Extract files using resonator-enhanced pattern completion with guaranteed reconstruction
1560 ///
1561 /// Performs filesystem extraction with intelligent recovery capabilities powered by
1562 /// resonator networks. When chunks are missing from the codebook, the resonator
1563 /// attempts pattern completion to reconstruct the lost data, enabling extraction
1564 /// even from partially corrupted or incomplete engrams.
1565 ///
1566 /// # Reconstruction Guarantee
1567 ///
1568 /// Even with resonator-assisted recovery, corrections are applied to guarantee
1569 /// bit-perfect reconstruction. The process is:
1570 ///
1571 /// 1. Try to get chunk from codebook
1572 /// 2. If missing, use resonator to recover approximate chunk
1573 /// 3. Apply correction from CorrectionStore
1574 /// 4. Result is guaranteed bit-perfect (if correction exists)
1575 ///
1576 /// # How it works
1577 /// 1. For each file chunk, check if it exists in the engram codebook
1578 /// 2. If missing, use the resonator to project a query vector onto known patterns
1579 /// 3. Apply stored corrections for guaranteed accuracy
1580 /// 4. Reconstruct the file from available and recovered chunks
1581 /// 5. If no resonator is configured, falls back to standard extraction
1582 ///
1583 /// # Why this matters
1584 /// - Enables 100% reconstruction even with missing chunks
1585 /// - Provides fault tolerance for distributed storage scenarios
1586 /// - Supports hierarchical recovery at multiple levels of the storage stack
1587 /// - Maintains data integrity through pattern-based completion
1588 ///
1589 /// # Arguments
1590 /// * `output_dir` - Directory path where extracted files will be written
1591 /// * `verbose` - Whether to print progress information during extraction
1592 /// * `config` - VSA configuration for encoding/decoding
1593 ///
1594 /// # Returns
1595 /// `io::Result<()>` indicating success or failure of the extraction operation
1596 ///
1597 /// # Examples
1598 /// ```
1599 /// use embeddenator_fs::{EmbrFS, Resonator, ReversibleVSAConfig};
1600 /// use std::path::Path;
1601 ///
1602 /// let mut fs = EmbrFS::new();
1603 /// let resonator = Resonator::new();
1604 /// let config = ReversibleVSAConfig::default();
1605 /// fs.set_resonator(resonator);
1606 ///
1607 /// // Assuming fs has been populated with data...
1608 /// let result = fs.extract_with_resonator("/tmp/output", true, &config);
1609 /// assert!(result.is_ok());
1610 /// ```
1611 pub fn extract_with_resonator<P: AsRef<Path>>(
1612 &self,
1613 output_dir: P,
1614 verbose: bool,
1615 config: &ReversibleVSAConfig,
1616 ) -> io::Result<()> {
1617 if self.resonator.is_none() {
1618 return Self::extract(&self.engram, &self.manifest, output_dir, verbose, config);
1619 }
1620
1621 // SAFETY: we just checked is_none() above and returned early
1622 let _resonator = self
1623 .resonator
1624 .as_ref()
1625 .expect("resonator is Some after is_none() check");
1626 let output_dir = output_dir.as_ref();
1627
1628 if verbose {
1629 println!(
1630 "Extracting {} files with resonator enhancement to {}",
1631 self.manifest.files.iter().filter(|f| !f.deleted).count(),
1632 output_dir.display()
1633 );
1634 let stats = self.engram.corrections.stats();
1635 println!(
1636 " Correction stats: {:.1}% perfect, {:.2}% overhead",
1637 stats.perfect_ratio * 100.0,
1638 stats.correction_ratio * 100.0
1639 );
1640 }
1641
1642 for file_entry in &self.manifest.files {
1643 // Skip deleted files
1644 if file_entry.deleted {
1645 continue;
1646 }
1647
1648 let file_path = output_dir.join(&file_entry.path);
1649
1650 if let Some(parent) = file_path.parent() {
1651 fs::create_dir_all(parent)?;
1652 }
1653
1654 let mut reconstructed = Vec::new();
1655 let num_chunks = file_entry.chunks.len();
1656 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1657 // Calculate the actual chunk size
1658 let chunk_size = if chunk_idx == num_chunks - 1 {
1659 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1660 remaining.min(DEFAULT_CHUNK_SIZE)
1661 } else {
1662 DEFAULT_CHUNK_SIZE
1663 };
1664
1665 let chunk_data = if let Some(vector) = self.engram.codebook.get(&chunk_id) {
1666 // Decode the SparseVec back to bytes using reversible encoding
1667 // IMPORTANT: Use the same path as during encoding for correct shift calculation
1668 let decoded = vector.decode_data(config, Some(&file_entry.path), chunk_size);
1669
1670 // Apply correction to guarantee bit-perfect reconstruction
1671 if let Some(corrected) =
1672 self.engram.corrections.apply(chunk_id as u64, &decoded)
1673 {
1674 corrected
1675 } else {
1676 decoded
1677 }
1678 } else if let Some(resonator) = &self.resonator {
1679 // Use resonator to recover missing chunk
1680 // Create a query vector from the chunk_id using reversible encoding
1681 let query_vec = SparseVec::encode_data(&chunk_id.to_le_bytes(), config, None);
1682 let recovered_vec = resonator.project(&query_vec);
1683
1684 // Decode the recovered vector back to bytes
1685 // For resonator recovery, try with path first, fall back to no path
1686 let decoded =
1687 recovered_vec.decode_data(config, Some(&file_entry.path), chunk_size);
1688
1689 // Apply correction if available (may not be if chunk was lost)
1690 if let Some(corrected) =
1691 self.engram.corrections.apply(chunk_id as u64, &decoded)
1692 {
1693 corrected
1694 } else {
1695 // No correction available - best effort recovery
1696 decoded
1697 }
1698 } else {
1699 return Err(io::Error::new(
1700 io::ErrorKind::NotFound,
1701 format!("Missing chunk {} and no resonator available", chunk_id),
1702 ));
1703 };
1704 reconstructed.extend_from_slice(&chunk_data);
1705 }
1706
1707 reconstructed.truncate(file_entry.size);
1708
1709 fs::write(&file_path, reconstructed)?;
1710
1711 if verbose {
1712 println!("Extracted with resonator: {}", file_entry.path);
1713 }
1714 }
1715
1716 Ok(())
1717 }
1718
1719 /// Perform hierarchical bundling with path role binding and permutation tagging
1720 ///
1721 /// Creates multi-level engram structures where path components are encoded using
1722 /// permutation operations to create distinct representations at each level. This
1723 /// enables efficient hierarchical retrieval and reconstruction.
1724 ///
1725 /// # How it works
1726 /// 1. Split file paths into components (e.g., "a/b/c.txt" → ["a", "b", "c.txt"])
1727 /// 2. For each level, apply permutation based on path component hash
1728 /// 3. Bundle representations level-by-level with sparsity control
1729 /// 4. Create sub-engrams for intermediate nodes
1730 ///
1731 /// # Why this matters
1732 /// - Enables scalable hierarchical storage beyond flat bundling limits
1733 /// - Path-based retrieval without full engram traversal
1734 /// - Maintains semantic relationships through permutation encoding
1735 /// - Supports efficient partial reconstruction
1736 ///
1737 /// # Arguments
1738 /// * `max_level_sparsity` - Maximum non-zero elements per level bundle
1739 /// * `verbose` - Whether to print progress information
1740 ///
1741 /// # Returns
1742 /// HierarchicalManifest describing the multi-level structure
1743 ///
1744 /// # Examples
1745 /// ```
1746 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1747 ///
1748 /// let fs = EmbrFS::new();
1749 /// let config = ReversibleVSAConfig::default();
1750 /// // Assuming files have been ingested...
1751 ///
1752 /// let hierarchical = fs.bundle_hierarchically(500, false, &config);
1753 /// assert!(hierarchical.is_ok());
1754 /// ```
1755 pub fn bundle_hierarchically(
1756 &self,
1757 max_level_sparsity: usize,
1758 verbose: bool,
1759 _config: &ReversibleVSAConfig,
1760 ) -> io::Result<HierarchicalManifest> {
1761 self.bundle_hierarchically_with_options(max_level_sparsity, None, verbose, _config)
1762 }
1763
1764 /// Like `bundle_hierarchically`, but supports an optional deterministic cap on `chunk_ids` per node.
1765 ///
1766 /// If `max_chunks_per_node` is set and a node would exceed that many `chunk_ids`, the node becomes
1767 /// a router with empty `chunk_ids`, and deterministic shard children are created each containing a
1768 /// bounded subset of `chunk_ids`.
1769 pub fn bundle_hierarchically_with_options(
1770 &self,
1771 max_level_sparsity: usize,
1772 max_chunks_per_node: Option<usize>,
1773 verbose: bool,
1774 _config: &ReversibleVSAConfig,
1775 ) -> io::Result<HierarchicalManifest> {
1776 let mut levels = Vec::new();
1777 let mut sub_engrams = HashMap::new();
1778
1779 // Group files by *path prefixes* at each level.
1780 // Level 0: "a"; Level 1: "a/b"; etc.
1781 let mut level_prefixes: HashMap<usize, HashMap<String, Vec<&FileEntry>>> = HashMap::new();
1782 for file_entry in &self.manifest.files {
1783 let comps: Vec<&str> = file_entry.path.split('/').collect();
1784 let mut prefix = String::new();
1785 for (level, &comp) in comps.iter().enumerate() {
1786 if level == 0 {
1787 prefix.push_str(comp);
1788 } else {
1789 prefix.push('/');
1790 prefix.push_str(comp);
1791 }
1792 level_prefixes
1793 .entry(level)
1794 .or_default()
1795 .entry(prefix.clone())
1796 .or_default()
1797 .push(file_entry);
1798 }
1799 }
1800
1801 // Process each level
1802 let max_level = level_prefixes.keys().max().unwrap_or(&0);
1803
1804 for level in 0..=*max_level {
1805 if verbose {
1806 let item_count = level_prefixes
1807 .get(&level)
1808 .map(|comps| comps.values().map(|files| files.len()).sum::<usize>())
1809 .unwrap_or(0);
1810 println!("Processing level {} with {} items", level, item_count);
1811 }
1812
1813 let mut level_bundle = SparseVec::new();
1814 let mut manifest_items = Vec::new();
1815
1816 if let Some(prefixes) = level_prefixes.get(&level) {
1817 let mut prefix_keys: Vec<&String> = prefixes.keys().collect();
1818 prefix_keys.sort();
1819
1820 for prefix in prefix_keys {
1821 let mut files: Vec<&FileEntry> = prefixes
1822 .get(prefix)
1823 // SAFETY: prefix comes from keys(), so get() must succeed
1824 .expect("prefix key from keys() must exist in HashMap")
1825 .to_vec();
1826 files.sort_by(|a, b| a.path.cmp(&b.path));
1827
1828 // Create permutation shift based on prefix hash
1829 let shift = {
1830 use std::collections::hash_map::DefaultHasher;
1831 use std::hash::{Hash, Hasher};
1832 let mut hasher = DefaultHasher::new();
1833 prefix.hash(&mut hasher);
1834 (hasher.finish() % (DIM as u64)) as usize
1835 };
1836
1837 // Bundle all files under this component with permutation
1838 let mut component_bundle = SparseVec::new();
1839 let mut chunk_ids_set: HashSet<usize> = HashSet::new();
1840 for file_entry in &files {
1841 // Find chunks for this file and bundle them
1842 let mut file_bundle = SparseVec::new();
1843 for &chunk_id in &file_entry.chunks {
1844 if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
1845 file_bundle = file_bundle.bundle(chunk_vec);
1846 chunk_ids_set.insert(chunk_id);
1847 }
1848 }
1849
1850 // Apply level-based permutation
1851 let permuted_file = file_bundle.permute(shift * (level + 1));
1852 component_bundle = component_bundle.bundle(&permuted_file);
1853 }
1854
1855 // Apply sparsity control
1856 if component_bundle.pos.len() + component_bundle.neg.len() > max_level_sparsity
1857 {
1858 component_bundle = component_bundle.thin(max_level_sparsity);
1859 }
1860
1861 level_bundle = level_bundle.bundle(&component_bundle);
1862
1863 // Create sub-engram for this prefix.
1864 // Children are the immediate next-level prefixes underneath this prefix.
1865 let sub_id = format!("level_{}_prefix_{}", level, prefix);
1866
1867 let mut children_set: HashSet<String> = HashSet::new();
1868 if level < *max_level {
1869 for file_entry in &files {
1870 let comps: Vec<&str> = file_entry.path.split('/').collect();
1871 if comps.len() <= level + 1 {
1872 continue;
1873 }
1874 let child_prefix = comps[..=level + 1].join("/");
1875 let child_id = format!("level_{}_prefix_{}", level + 1, child_prefix);
1876 children_set.insert(child_id);
1877 }
1878 }
1879 let mut children: Vec<String> = children_set.into_iter().collect();
1880 children.sort();
1881
1882 let mut chunk_ids: Vec<usize> = chunk_ids_set.into_iter().collect();
1883 chunk_ids.sort_unstable();
1884
1885 let chunk_count: usize = files.iter().map(|f| f.chunks.len()).sum();
1886
1887 if let Some(max_chunks) = max_chunks_per_node.filter(|v| *v > 0) {
1888 if chunk_ids.len() > max_chunks {
1889 let mut shard_ids: Vec<String> = Vec::new();
1890 for (shard_idx, chunk_slice) in chunk_ids.chunks(max_chunks).enumerate()
1891 {
1892 let shard_id = format!("{}__shard_{:04}", sub_id, shard_idx);
1893 shard_ids.push(shard_id.clone());
1894 sub_engrams.insert(
1895 shard_id.clone(),
1896 SubEngram {
1897 id: shard_id,
1898 root: component_bundle.clone(),
1899 chunk_ids: chunk_slice.to_vec(),
1900 chunk_count: chunk_slice.len(),
1901 children: Vec::new(),
1902 },
1903 );
1904 }
1905
1906 let mut router_children = shard_ids;
1907 router_children.extend(children.clone());
1908 router_children.sort();
1909 router_children.dedup();
1910
1911 sub_engrams.insert(
1912 sub_id.clone(),
1913 SubEngram {
1914 id: sub_id.clone(),
1915 root: component_bundle,
1916 chunk_ids: Vec::new(),
1917 chunk_count,
1918 children: router_children,
1919 },
1920 );
1921 } else {
1922 sub_engrams.insert(
1923 sub_id.clone(),
1924 SubEngram {
1925 id: sub_id.clone(),
1926 root: component_bundle,
1927 chunk_ids,
1928 chunk_count,
1929 children,
1930 },
1931 );
1932 }
1933 } else {
1934 sub_engrams.insert(
1935 sub_id.clone(),
1936 SubEngram {
1937 id: sub_id.clone(),
1938 root: component_bundle,
1939 chunk_ids,
1940 chunk_count,
1941 children,
1942 },
1943 );
1944 }
1945
1946 manifest_items.push(ManifestItem {
1947 path: prefix.clone(),
1948 sub_engram_id: sub_id,
1949 });
1950 }
1951 }
1952
1953 manifest_items.sort_by(|a, b| {
1954 a.path
1955 .cmp(&b.path)
1956 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
1957 });
1958
1959 // Apply final sparsity control to level bundle
1960 if level_bundle.pos.len() + level_bundle.neg.len() > max_level_sparsity {
1961 level_bundle = level_bundle.thin(max_level_sparsity);
1962 }
1963
1964 levels.push(ManifestLevel {
1965 level: level as u32,
1966 items: manifest_items,
1967 });
1968 }
1969
1970 Ok(HierarchicalManifest {
1971 version: 1,
1972 levels,
1973 sub_engrams,
1974 })
1975 }
1976
1977 /// Extract files from hierarchical manifest with manifest-guided traversal
1978 ///
1979 /// Performs hierarchical extraction by traversing the manifest levels and
1980 /// reconstructing files from sub-engrams. This enables efficient extraction
1981 /// from complex hierarchical structures without loading the entire engram.
1982 ///
1983 /// # How it works
1984 /// 1. Traverse manifest levels from root to leaves
1985 /// 2. For each level, locate relevant sub-engrams
1986 /// 3. Reconstruct file chunks using inverse permutation operations
1987 /// 4. Assemble complete files from hierarchical components
1988 ///
1989 /// # Why this matters
1990 /// - Enables partial extraction from large hierarchical datasets
1991 /// - Maintains bit-perfect reconstruction accuracy
1992 /// - Supports efficient path-based queries and retrieval
1993 /// - Scales to complex directory structures
1994 ///
1995 /// # Arguments
1996 /// * `hierarchical` - The hierarchical manifest to extract from
1997 /// * `output_dir` - Directory path where extracted files will be written
1998 /// * `verbose` - Whether to print progress information during extraction
1999 ///
2000 /// # Returns
2001 /// `io::Result<()>` indicating success or failure of the hierarchical extraction
2002 ///
2003 /// # Examples
2004 /// ```
2005 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
2006 ///
2007 /// let fs = EmbrFS::new();
2008 /// let config = ReversibleVSAConfig::default();
2009 /// // Assuming hierarchical manifest was created...
2010 /// // let hierarchical = fs.bundle_hierarchically(500, true).unwrap();
2011 ///
2012 /// // fs.extract_hierarchically(&hierarchical, "/tmp/output", true, &config)?;
2013 /// ```
2014 pub fn extract_hierarchically<P: AsRef<Path>>(
2015 &self,
2016 hierarchical: &HierarchicalManifest,
2017 output_dir: P,
2018 verbose: bool,
2019 config: &ReversibleVSAConfig,
2020 ) -> io::Result<()> {
2021 let output_dir = output_dir.as_ref();
2022
2023 if verbose {
2024 println!(
2025 "Extracting hierarchical manifest with {} levels to {}",
2026 hierarchical.levels.len(),
2027 output_dir.display()
2028 );
2029 }
2030
2031 // For each file in the original manifest, reconstruct it using hierarchical information
2032 for file_entry in &self.manifest.files {
2033 // Skip deleted files
2034 if file_entry.deleted {
2035 continue;
2036 }
2037
2038 let file_path = output_dir.join(&file_entry.path);
2039
2040 if let Some(parent) = file_path.parent() {
2041 fs::create_dir_all(parent)?;
2042 }
2043
2044 let mut reconstructed = Vec::new();
2045
2046 // Reconstruct each chunk using hierarchical information
2047 let num_chunks = file_entry.chunks.len();
2048 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
2049 if let Some(chunk_vector) = self.engram.codebook.get(&chunk_id) {
2050 // Calculate the actual chunk size
2051 let chunk_size = if chunk_idx == num_chunks - 1 {
2052 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
2053 remaining.min(DEFAULT_CHUNK_SIZE)
2054 } else {
2055 DEFAULT_CHUNK_SIZE
2056 };
2057
2058 // Decode using hierarchical inverse transformations
2059 let decoded =
2060 chunk_vector.decode_data(config, Some(&file_entry.path), chunk_size);
2061
2062 // Apply correction if available
2063 let chunk_data = if let Some(corrected) =
2064 self.engram.corrections.apply(chunk_id as u64, &decoded)
2065 {
2066 corrected
2067 } else {
2068 decoded
2069 };
2070
2071 reconstructed.extend_from_slice(&chunk_data);
2072 }
2073 }
2074
2075 // Truncate to actual file size
2076 reconstructed.truncate(file_entry.size);
2077
2078 fs::write(&file_path, reconstructed)?;
2079
2080 if verbose {
2081 println!("Extracted hierarchical: {}", file_entry.path);
2082 }
2083 }
2084
2085 Ok(())
2086 }
2087}
/// Heuristically decide whether `data` looks like text rather than binary.
///
/// Inspects at most the first 8 KiB. The content is considered text when it
/// contains no NUL bytes and fewer than 10% of the sampled bytes are control
/// characters other than `\n`, `\r`, and `\t`. Empty input counts as text.
pub fn is_text_file(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    // Only the leading window matters for the heuristic.
    let sample = &data[..data.len().min(8192)];

    // Any NUL byte is a strong binary signal; bail out immediately.
    if sample.iter().any(|&b| b == 0) {
        return false;
    }

    // Count disallowed control characters (everything below 0x20 except
    // the common text whitespace bytes).
    let control_count = sample
        .iter()
        .filter(|&&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t'))
        .count();

    control_count < sample.len() / 10
}