//! EmbrFS - Holographic Filesystem Implementation
//!
//! Provides engram-based storage for entire filesystem trees with:
//! - Chunked encoding for efficient storage
//! - Manifest for file metadata
//! - **Guaranteed 100% bit-perfect reconstruction** via CorrectionStore
//!
//! # Reconstruction Guarantee
//!
//! The fundamental challenge with VSA encoding is that approximate operations
//! may introduce errors during superposition. This module solves that through
//! a multi-layer approach:
//!
//! 1. **Primary Encoding**: SparseVec encoding attempts bit-perfect storage
//! 2. **Correction Layer**: CorrectionStore captures any encoding errors
//! 3. **Reconstruction**: Decode + apply corrections = exact original
//!
//! The invariant: `original = decode(encode(original)) + correction`
//!
//! If encoding was perfect, correction is empty. If not, correction exactly
//! compensates. Either way, reconstruction is guaranteed bit-perfect.
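//!
//! A minimal sketch of the guarantee at the API level (file names here are
//! illustrative; see `EmbrFS::ingest_file` and `EmbrFS::extract` below):
//!
//! ```no_run
//! use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
//!
//! let mut fs = EmbrFS::new_holographic();
//! let config = ReversibleVSAConfig::default();
//! fs.ingest_file("data.bin", "data.bin".to_string(), false, &config).unwrap();
//! // Chunks that decoded imperfectly are exactly compensated by corrections,
//! // so extraction is bit-perfect either way.
//! EmbrFS::extract(&fs.engram, &fs.manifest, "./restored", false, &config).unwrap();
//! ```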

use crate::correction::{CorrectionStats, CorrectionStore};
use embeddenator_retrieval::resonator::Resonator;
use embeddenator_retrieval::{RerankedResult, TernaryInvertedIndex};
use embeddenator_vsa::{ReversibleVSAConfig, ReversibleVSAEncoder, SparseVec, DIM};
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::collections::{HashMap, HashSet};
use std::fs::{self, File};
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use walkdir::WalkDir;

/// Default chunk size for file encoding (4KB)
pub const DEFAULT_CHUNK_SIZE: usize = 4096;

/// File entry in the manifest
#[derive(Serialize, Deserialize, Debug)]
pub struct FileEntry {
    pub path: String,
    pub is_text: bool,
    pub size: usize,
    pub chunks: Vec<usize>,
    /// Mark files as deleted without rebuilding root (for incremental updates)
    #[serde(default)]
    pub deleted: bool,
}

/// Manifest describing filesystem structure
#[derive(Serialize, Deserialize, Debug)]
pub struct Manifest {
    pub files: Vec<FileEntry>,
    pub total_chunks: usize,
    /// Chunk size used during encoding (8 for holographic, 4096 for legacy)
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,
    /// Whether holographic encoding was used
    #[serde(default)]
    pub holographic: bool,
}

fn default_chunk_size() -> usize {
    DEFAULT_CHUNK_SIZE
}

/// Hierarchical manifest for multi-level engrams
#[derive(Serialize, Deserialize, Debug)]
pub struct HierarchicalManifest {
    pub version: u32,
    pub levels: Vec<ManifestLevel>,
    #[serde(default)]
    pub sub_engrams: HashMap<String, SubEngram>,
}

/// Level in hierarchical manifest
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestLevel {
    pub level: u32,
    pub items: Vec<ManifestItem>,
}

/// Item in manifest level
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestItem {
    pub path: String,
    pub sub_engram_id: String,
}

/// Sub-engram in hierarchical structure
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct SubEngram {
    pub id: String,
    pub root: SparseVec,
    /// Chunk IDs that belong to this sub-engram.
    ///
    /// This enables selective retrieval without indexing the entire global codebook.
    #[serde(default)]
    pub chunk_ids: Vec<usize>,
    pub chunk_count: usize,
    pub children: Vec<String>,
}

/// Bounds and tuning parameters for hierarchical selective retrieval.
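///
/// # Examples
///
/// A minimal sketch, assuming this type is re-exported at the crate root like
/// `EmbrFS`:
///
/// ```
/// use embeddenator_fs::HierarchicalQueryBounds;
///
/// // Override only the beam width; keep the remaining defaults.
/// let bounds = HierarchicalQueryBounds { beam_width: 8, ..Default::default() };
/// assert_eq!(bounds.k, 10);
/// ```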
#[derive(Clone, Debug)]
pub struct HierarchicalQueryBounds {
    /// Global top-k results to return.
    pub k: usize,
    /// Candidate count per expanded node before reranking.
    pub candidate_k: usize,
    /// Maximum number of frontier nodes retained (beam width).
    pub beam_width: usize,
    /// Maximum depth to descend (0 means only level-0 nodes).
    pub max_depth: usize,
    /// Maximum number of expanded nodes.
    pub max_expansions: usize,
    /// Maximum number of cached inverted indices.
    pub max_open_indices: usize,
    /// Maximum number of cached sub-engrams.
    pub max_open_engrams: usize,
}

impl Default for HierarchicalQueryBounds {
    fn default() -> Self {
        Self {
            k: 10,
            candidate_k: 100,
            beam_width: 32,
            max_depth: 4,
            max_expansions: 128,
            max_open_indices: 16,
            max_open_engrams: 16,
        }
    }
}

#[derive(Clone, Debug, PartialEq)]
pub struct HierarchicalChunkHit {
    pub sub_engram_id: String,
    pub chunk_id: usize,
    pub approx_score: i32,
    pub cosine: f64,
}

#[derive(Clone, Debug)]
struct FrontierItem {
    score: f64,
    sub_engram_id: String,
    depth: usize,
}

#[derive(Clone, Debug)]
struct RemappedInvertedIndex {
    index: TernaryInvertedIndex,
    local_to_global: Vec<usize>,
}

impl RemappedInvertedIndex {
    fn build(chunk_ids: &[usize], vectors: &HashMap<usize, SparseVec>) -> Self {
        let mut index = TernaryInvertedIndex::new();
        let mut local_to_global = Vec::with_capacity(chunk_ids.len());

        for &global_id in chunk_ids {
            let Some(vec) = vectors.get(&global_id) else {
                continue;
            };
            // Derive the local id from the mapping length so ids stay aligned
            // with `local_to_global` even when a vector is missing from `vectors`.
            let local_id = local_to_global.len();
            local_to_global.push(global_id);
            index.add(local_id, vec);
        }

        index.finalize();
        Self {
            index,
            local_to_global,
        }
    }

    fn query_top_k_reranked(
        &self,
        query: &SparseVec,
        vectors: &HashMap<usize, SparseVec>,
        candidate_k: usize,
        k: usize,
    ) -> Vec<HierarchicalChunkHit> {
        if k == 0 {
            return Vec::new();
        }

        let candidates = self.index.query_top_k(query, candidate_k);
        let mut out = Vec::with_capacity(candidates.len().min(k));
        for cand in candidates {
            let Some(&global_id) = self.local_to_global.get(cand.id) else {
                continue;
            };
            let Some(vec) = vectors.get(&global_id) else {
                continue;
            };
            out.push((global_id, cand.score, query.cosine(vec)));
        }

        out.sort_by(|a, b| {
            b.2.total_cmp(&a.2)
                .then_with(|| b.1.cmp(&a.1))
                .then_with(|| a.0.cmp(&b.0))
        });
        out.truncate(k);

        out.into_iter()
            .map(|(chunk_id, approx_score, cosine)| HierarchicalChunkHit {
                sub_engram_id: String::new(),
                chunk_id,
                approx_score,
                cosine,
            })
            .collect()
    }
}

#[derive(Clone, Debug)]
struct LruCache<V> {
    cap: usize,
    map: HashMap<String, V>,
    order: Vec<String>,
}

impl<V> LruCache<V> {
    fn new(cap: usize) -> Self {
        Self {
            cap,
            map: HashMap::new(),
            order: Vec::new(),
        }
    }

    fn get(&mut self, key: &str) -> Option<&V> {
        if self.map.contains_key(key) {
            self.touch(key);
            return self.map.get(key);
        }
        None
    }

    fn insert(&mut self, key: String, value: V) {
        if self.cap == 0 {
            return;
        }

        if self.map.contains_key(&key) {
            self.map.insert(key.clone(), value);
            self.touch(&key);
            return;
        }

        self.map.insert(key.clone(), value);
        self.order.push(key);

        while self.map.len() > self.cap {
            if let Some(evict) = self.order.first().cloned() {
                self.order.remove(0);
                self.map.remove(&evict);
            } else {
                break;
            }
        }
    }

    fn touch(&mut self, key: &str) {
        if let Some(pos) = self.order.iter().position(|k| k == key) {
            let k = self.order.remove(pos);
            self.order.push(k);
        }
    }
}

/// Storage/loader seam for hierarchical sub-engrams.
///
/// This enables on-demand loading (e.g., from disk) rather than requiring that
/// every sub-engram is materialized in memory.
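///
/// # Examples
///
/// A minimal in-memory implementation (a sketch; assumes this trait and
/// `SubEngram` are re-exported at the crate root like `EmbrFS`):
///
/// ```
/// use std::collections::HashMap;
/// use embeddenator_fs::{SubEngram, SubEngramStore};
///
/// struct MapStore(HashMap<String, SubEngram>);
///
/// impl SubEngramStore for MapStore {
///     fn load(&self, id: &str) -> Option<SubEngram> {
///         self.0.get(id).cloned()
///     }
/// }
/// ```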
pub trait SubEngramStore {
    fn load(&self, id: &str) -> Option<SubEngram>;
}

fn escape_sub_engram_id(id: &str) -> String {
    // Minimal reversible escaping for filenames.
    // Note: not intended for untrusted input; IDs are internal.
    id.replace('%', "%25").replace('/', "%2F")
}

/// Directory-backed store for sub-engrams.
///
/// Files are stored as bincode blobs under `${dir}/{escaped_id}.subengram`.
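///
/// # Examples
///
/// A sketch of loading a sub-engram written earlier by `save_sub_engrams_dir`
/// (directory and id are illustrative):
///
/// ```no_run
/// use embeddenator_fs::{DirectorySubEngramStore, SubEngramStore};
///
/// let store = DirectorySubEngramStore::new("./engrams");
/// if let Some(sub) = store.load("level0/node-0") {
///     println!("loaded sub-engram with {} chunks", sub.chunk_count);
/// }
/// ```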
pub struct DirectorySubEngramStore {
    dir: PathBuf,
}

impl DirectorySubEngramStore {
    pub fn new<P: AsRef<Path>>(dir: P) -> Self {
        Self {
            dir: dir.as_ref().to_path_buf(),
        }
    }

    fn path_for_id(&self, id: &str) -> PathBuf {
        self.dir
            .join(format!("{}.subengram", escape_sub_engram_id(id)))
    }
}

impl SubEngramStore for DirectorySubEngramStore {
    fn load(&self, id: &str) -> Option<SubEngram> {
        let path = self.path_for_id(id);
        let data = fs::read(path).ok()?;
        bincode::deserialize(&data).ok()
    }
}

/// Save a hierarchical manifest as JSON.
pub fn save_hierarchical_manifest<P: AsRef<Path>>(
    hierarchical: &HierarchicalManifest,
    path: P,
) -> io::Result<()> {
    let file = File::create(path)?;

    // Serialize deterministically: HashMap iteration order is not stable.
    #[derive(Serialize)]
    struct StableHierarchicalManifest {
        version: u32,
        levels: Vec<ManifestLevel>,
        sub_engrams: BTreeMap<String, SubEngram>,
    }

    let mut levels = hierarchical.levels.clone();
    levels.sort_by_key(|a| a.level);
    for level in &mut levels {
        level.items.sort_by(|a, b| {
            a.path
                .cmp(&b.path)
                .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
        });
    }

    let mut sub_engrams: BTreeMap<String, SubEngram> = BTreeMap::new();
    for (id, sub) in &hierarchical.sub_engrams {
        sub_engrams.insert(id.clone(), sub.clone());
    }

    let stable = StableHierarchicalManifest {
        version: hierarchical.version,
        levels,
        sub_engrams,
    };

    serde_json::to_writer_pretty(file, &stable)?;
    Ok(())
}

/// Load a hierarchical manifest from JSON.
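///
/// # Examples
///
/// A sketch of a load/save round trip, assuming these free functions are
/// re-exported at the crate root (file names illustrative):
///
/// ```no_run
/// use embeddenator_fs::{load_hierarchical_manifest, save_hierarchical_manifest};
///
/// let manifest = load_hierarchical_manifest("tree.hier.json").unwrap();
/// save_hierarchical_manifest(&manifest, "tree.hier.copy.json").unwrap();
/// ```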
pub fn load_hierarchical_manifest<P: AsRef<Path>>(path: P) -> io::Result<HierarchicalManifest> {
    let file = File::open(path)?;
    let manifest = serde_json::from_reader(file)?;
    Ok(manifest)
}

/// Save a set of sub-engrams to a directory (bincode per sub-engram).
pub fn save_sub_engrams_dir<P: AsRef<Path>>(
    sub_engrams: &HashMap<String, SubEngram>,
    dir: P,
) -> io::Result<()> {
    let dir = dir.as_ref();
    fs::create_dir_all(dir)?;

    let mut ids: Vec<&String> = sub_engrams.keys().collect();
    ids.sort();

    for id in ids {
        // SAFETY: id comes from keys(), so get() must succeed
        let sub = sub_engrams
            .get(id)
            .expect("sub_engram id from keys() must exist in HashMap");
        let encoded = bincode::serialize(sub).map_err(io::Error::other)?;
        let path = dir.join(format!("{}.subengram", escape_sub_engram_id(id)));
        fs::write(path, encoded)?;
    }
    Ok(())
}

struct InMemorySubEngramStore<'a> {
    map: &'a HashMap<String, SubEngram>,
}

impl<'a> InMemorySubEngramStore<'a> {
    fn new(map: &'a HashMap<String, SubEngram>) -> Self {
        Self { map }
    }
}

impl SubEngramStore for InMemorySubEngramStore<'_> {
    fn load(&self, id: &str) -> Option<SubEngram> {
        self.map.get(id).cloned()
    }
}

fn get_cached_sub_engram(
    cache: &mut LruCache<SubEngram>,
    store: &impl SubEngramStore,
    id: &str,
) -> Option<SubEngram> {
    if let Some(v) = cache.get(id) {
        return Some(v.clone());
    }
    let loaded = store.load(id)?;
    cache.insert(id.to_string(), loaded.clone());
    Some(loaded)
}

/// Query a hierarchical manifest by selectively unfolding only promising sub-engrams.
///
/// This performs a beam-limited traversal over `hierarchical.sub_engrams`.
/// At each expanded node, it builds (and LRU-caches) an inverted index over the
/// node-local `chunk_ids` subset of `codebook`, then reranks by exact cosine.
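///
/// # Examples
///
/// A sketch, assuming the hierarchical manifest and engram were saved earlier
/// and that these items are re-exported at the crate root (file names
/// illustrative):
///
/// ```no_run
/// use embeddenator_fs::{
///     load_hierarchical_manifest, query_hierarchical_codebook, EmbrFS,
///     HierarchicalQueryBounds,
/// };
///
/// let hierarchical = load_hierarchical_manifest("tree.hier.json").unwrap();
/// let engram = EmbrFS::load_engram("tree.engram").unwrap();
/// // Use an existing chunk vector as the query.
/// let query = engram.codebook[&0].clone();
/// let bounds = HierarchicalQueryBounds::default();
/// let hits = query_hierarchical_codebook(&hierarchical, &engram.codebook, &query, &bounds);
/// assert!(hits.len() <= bounds.k);
/// ```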
pub fn query_hierarchical_codebook(
    hierarchical: &HierarchicalManifest,
    codebook: &HashMap<usize, SparseVec>,
    query: &SparseVec,
    bounds: &HierarchicalQueryBounds,
) -> Vec<HierarchicalChunkHit> {
    let store = InMemorySubEngramStore::new(&hierarchical.sub_engrams);
    query_hierarchical_codebook_with_store(hierarchical, &store, codebook, query, bounds)
}

/// Store-backed variant of `query_hierarchical_codebook` that supports on-demand sub-engram loading.
pub fn query_hierarchical_codebook_with_store(
    hierarchical: &HierarchicalManifest,
    store: &impl SubEngramStore,
    codebook: &HashMap<usize, SparseVec>,
    query: &SparseVec,
    bounds: &HierarchicalQueryBounds,
) -> Vec<HierarchicalChunkHit> {
    if bounds.k == 0 || hierarchical.levels.is_empty() {
        return Vec::new();
    }

    let mut sub_cache: LruCache<SubEngram> = LruCache::new(bounds.max_open_engrams);
    let mut index_cache: LruCache<RemappedInvertedIndex> = LruCache::new(bounds.max_open_indices);

    let mut frontier: Vec<FrontierItem> = Vec::new();
    if let Some(level0) = hierarchical.levels.first() {
        for item in &level0.items {
            let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &item.sub_engram_id)
            else {
                continue;
            };
            frontier.push(FrontierItem {
                score: query.cosine(&sub.root),
                sub_engram_id: item.sub_engram_id.clone(),
                depth: 0,
            });
        }
    }

    frontier.sort_by(|a, b| {
        b.score
            .total_cmp(&a.score)
            .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
    });
    if frontier.len() > bounds.beam_width {
        frontier.truncate(bounds.beam_width);
    }

    let mut expansions = 0usize;

    // Keep only the best hit per chunk for determinism.
    let mut best_by_chunk: HashMap<usize, HierarchicalChunkHit> = HashMap::new();

    while !frontier.is_empty() && expansions < bounds.max_expansions {
        let node = frontier.remove(0);

        let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &node.sub_engram_id) else {
            continue;
        };

        expansions += 1;

        let idx = if let Some(existing) = index_cache.get(&node.sub_engram_id) {
            existing
        } else {
            let built = RemappedInvertedIndex::build(&sub.chunk_ids, codebook);
            index_cache.insert(node.sub_engram_id.clone(), built);
            // SAFETY: we just inserted the key, so get() must succeed immediately after
            index_cache
                .get(&node.sub_engram_id)
                .expect("index_cache.get() must succeed immediately after insert()")
        };

        let mut local_hits =
            idx.query_top_k_reranked(query, codebook, bounds.candidate_k, bounds.k);
        for hit in &mut local_hits {
            hit.sub_engram_id = node.sub_engram_id.clone();
        }

        for hit in local_hits {
            match best_by_chunk.get(&hit.chunk_id) {
                None => {
                    best_by_chunk.insert(hit.chunk_id, hit);
                }
                Some(existing) => {
                    let better = hit
                        .cosine
                        .total_cmp(&existing.cosine)
                        .then_with(|| hit.approx_score.cmp(&existing.approx_score))
                        .is_gt();
                    if better {
                        best_by_chunk.insert(hit.chunk_id, hit);
                    }
                }
            }
        }

        if node.depth >= bounds.max_depth {
            continue;
        }

        let children = sub.children.clone();
        for child_id in &children {
            let Some(child) = get_cached_sub_engram(&mut sub_cache, store, child_id) else {
                continue;
            };
            frontier.push(FrontierItem {
                score: query.cosine(&child.root),
                sub_engram_id: child_id.clone(),
                depth: node.depth + 1,
            });
        }

        frontier.sort_by(|a, b| {
            b.score
                .total_cmp(&a.score)
                .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
        });
        if frontier.len() > bounds.beam_width {
            frontier.truncate(bounds.beam_width);
        }
    }

    let mut out: Vec<HierarchicalChunkHit> = best_by_chunk.into_values().collect();
    out.sort_by(|a, b| {
        b.cosine
            .total_cmp(&a.cosine)
            .then_with(|| b.approx_score.cmp(&a.approx_score))
            .then_with(|| a.chunk_id.cmp(&b.chunk_id))
            .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
    });
    out.truncate(bounds.k);
    out
}

/// Unified manifest enum for backward compatibility
#[derive(Serialize, Deserialize, Debug)]
pub enum UnifiedManifest {
    Flat(Manifest),
    Hierarchical(HierarchicalManifest),
}

impl From<Manifest> for UnifiedManifest {
    fn from(manifest: Manifest) -> Self {
        UnifiedManifest::Flat(manifest)
    }
}

/// Engram: holographic encoding of a filesystem with correction guarantee
#[derive(Serialize, Deserialize)]
pub struct Engram {
    pub root: SparseVec,
    pub codebook: HashMap<usize, SparseVec>,
    /// Correction store for 100% reconstruction guarantee
    #[serde(default)]
    pub corrections: CorrectionStore,
}

impl Engram {
    /// Build a reusable inverted index over the codebook.
    ///
    /// This is useful when issuing multiple queries (e.g., shift-sweeps) and you
    /// want to avoid rebuilding the index each time.
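    ///
    /// # Examples
    ///
    /// A sketch of reusing one index across several queries (engram file name
    /// illustrative):
    ///
    /// ```no_run
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let engram = EmbrFS::load_engram("tree.engram").unwrap();
    /// let index = engram.build_codebook_index();
    /// for query in engram.codebook.values().take(3) {
    ///     let hits = engram.query_codebook_with_index(&index, query, 100, 10);
    ///     assert!(hits.len() <= 10);
    /// }
    /// ```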
    pub fn build_codebook_index(&self) -> TernaryInvertedIndex {
        TernaryInvertedIndex::build_from_map(&self.codebook)
    }

    /// Query the codebook using a pre-built inverted index.
    pub fn query_codebook_with_index(
        &self,
        index: &TernaryInvertedIndex,
        query: &SparseVec,
        candidate_k: usize,
        k: usize,
    ) -> Vec<RerankedResult> {
        if k == 0 || self.codebook.is_empty() {
            return Vec::new();
        }
        index.query_top_k_reranked(query, &self.codebook, candidate_k, k)
    }

    /// Query the engram's codebook for chunks most similar to `query`.
    ///
    /// This builds an inverted index over the codebook for sub-linear candidate
    /// generation, then reranks those candidates using exact cosine similarity.
    pub fn query_codebook(&self, query: &SparseVec, k: usize) -> Vec<RerankedResult> {
        if k == 0 || self.codebook.is_empty() {
            return Vec::new();
        }

        // Simple heuristic: rerank a moderately-sized candidate set.
        let candidate_k = (k.saturating_mul(10)).max(50);
        let index = self.build_codebook_index();
        self.query_codebook_with_index(&index, query, candidate_k, k)
    }
}

/// Chunk size optimized for holographic encoding (8 bytes)
/// Smaller chunks achieve higher accuracy (~94%) with ReversibleVSAEncoder
/// Using larger chunks creates too much crosstalk from bundling
pub const HOLOGRAPHIC_CHUNK_SIZE: usize = 8;

/// EmbrFS - Holographic Filesystem with Guaranteed Reconstruction
///
/// # 100% Reconstruction Guarantee
///
/// EmbrFS guarantees bit-perfect file reconstruction through a layered approach:
///
/// 1. **Encode**: Data chunks → SparseVec via reversible encoding
/// 2. **Verify**: Immediately decode and compare to original
/// 3. **Correct**: Store minimal correction if any difference exists
/// 4. **Extract**: Decode + apply correction = exact original bytes
///
/// This guarantee holds regardless of:
/// - Data content (binary, text, compressed, encrypted)
/// - File size (single byte to gigabytes)
/// - Number of files in the engram
/// - Superposition crosstalk in bundles
///
/// # Holographic Mode
///
/// When created with `new_holographic()`, uses `ReversibleVSAEncoder` which achieves
/// ~94% uncorrected accuracy through position-aware VSA binding. This results in
/// <10% correction overhead instead of the ~200%+ overhead of legacy encoding.
///
/// # Examples
///
/// ```
/// use embeddenator_fs::EmbrFS;
///
/// // Legacy mode (not recommended)
/// #[allow(deprecated)]
/// let _fs_legacy = EmbrFS::new();
///
/// // Holographic mode (recommended - minimal storage overhead)
/// let fs = EmbrFS::new_holographic();
/// assert_eq!(fs.manifest.total_chunks, 0);
/// assert_eq!(fs.manifest.files.len(), 0);
/// ```
pub struct EmbrFS {
    pub manifest: Manifest,
    pub engram: Engram,
    pub resonator: Option<Resonator>,
    /// ReversibleVSAEncoder for true holographic encoding (~94% accuracy)
    /// None in legacy mode, Some in holographic mode
    encoder: Option<ReversibleVSAEncoder>,
    /// Chunk size for encoding (8 bytes for holographic, 4096 for legacy)
    chunk_size: usize,
}

impl Default for EmbrFS {
    fn default() -> Self {
        Self::new_holographic()
    }
}

impl EmbrFS {
    /// Create a new empty EmbrFS instance (legacy mode - NOT RECOMMENDED)
    ///
    /// This constructor creates an EmbrFS with legacy encoding that has only ~10%
    /// accuracy, resulting in ~200%+ storage overhead due to verbatim corrections.
    ///
    /// **Use `new_holographic()` instead for production use.**
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// // Legacy mode - high storage overhead
    /// let fs = EmbrFS::new();
    /// assert_eq!(fs.manifest.files.len(), 0);
    /// ```
    #[deprecated(
        since = "0.25.0",
        note = "Use new_holographic() instead for ~94% encoding accuracy and <10% storage overhead"
    )]
    pub fn new() -> Self {
        EmbrFS {
            manifest: Manifest {
                files: Vec::new(),
                total_chunks: 0,
                chunk_size: DEFAULT_CHUNK_SIZE,
                holographic: false,
            },
            engram: Engram {
                root: SparseVec::new(),
                codebook: HashMap::new(),
                corrections: CorrectionStore::new(),
            },
            resonator: None,
            encoder: None,
            chunk_size: DEFAULT_CHUNK_SIZE,
        }
    }

    /// Create a new EmbrFS with holographic encoding (RECOMMENDED)
    ///
    /// Uses `ReversibleVSAEncoder` which achieves ~94% uncorrected accuracy through
    /// position-aware VSA binding. This results in <10% correction overhead instead
    /// of the ~200%+ overhead of legacy encoding.
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::new_holographic();
    /// assert_eq!(fs.manifest.files.len(), 0);
    /// assert_eq!(fs.manifest.total_chunks, 0);
    /// assert!(fs.is_holographic());
    /// ```
    pub fn new_holographic() -> Self {
        EmbrFS {
            manifest: Manifest {
                files: Vec::new(),
                total_chunks: 0,
                chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
                holographic: true,
            },
            engram: Engram {
                root: SparseVec::new(),
                codebook: HashMap::new(),
                corrections: CorrectionStore::new(),
            },
            resonator: None,
            encoder: Some(ReversibleVSAEncoder::new()),
            chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
        }
    }

    /// Check if holographic mode is enabled
    pub fn is_holographic(&self) -> bool {
        self.encoder.is_some()
    }

    /// Get the chunk size being used
    pub fn chunk_size(&self) -> usize {
        self.chunk_size
    }

    fn path_to_forward_slash_string(path: &Path) -> String {
        path.components()
            .filter_map(|c| match c {
                std::path::Component::Normal(s) => s.to_str().map(|v| v.to_string()),
                _ => None,
            })
            .collect::<Vec<String>>()
            .join("/")
    }

    /// Set the resonator for enhanced pattern recovery during extraction
    ///
    /// Configures a resonator network that can perform pattern completion to recover
    /// missing or corrupted data chunks during filesystem extraction. The resonator
    /// acts as a content-addressable memory that can reconstruct lost information
    /// by finding the best matching patterns in its trained codebook.
    ///
    /// # How it works
    /// - The resonator maintains a codebook of known vector patterns
    /// - During extraction, missing chunks are projected onto the closest known pattern
    /// - This enables robust recovery from partial data loss or corruption
    ///
    /// # Why this matters
    /// - Provides fault tolerance for holographic storage systems
    /// - Enables reconstruction even when some chunks are unavailable
    /// - Supports graceful degradation rather than complete failure
    ///
    /// # Arguments
    /// * `resonator` - A trained resonator network for pattern completion
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, Resonator};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let resonator = Resonator::new();
    /// fs.set_resonator(resonator);
    /// // Now extraction will use resonator-enhanced recovery
    /// ```
    pub fn set_resonator(&mut self, resonator: Resonator) {
        self.resonator = Some(resonator);
    }

    /// Get correction statistics for this engram
    ///
    /// Returns statistics about how many chunks needed correction and the
    /// overhead incurred by storing corrections.
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::new_holographic();
    /// let stats = fs.correction_stats();
    /// assert_eq!(stats.total_chunks, 0);
    /// ```
    pub fn correction_stats(&self) -> CorrectionStats {
        self.engram.corrections.stats()
    }

    /// Ingest an entire directory into engram format
    pub fn ingest_directory<P: AsRef<Path>>(
        &mut self,
        dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        self.ingest_directory_with_prefix(dir, None, verbose, config)
    }

    /// Ingest a directory into the engram, optionally prefixing all logical paths.
    ///
    /// When `logical_prefix` is provided, all ingested file paths become:
    /// `{logical_prefix}/{relative_path_from_dir}`.
    pub fn ingest_directory_with_prefix<P: AsRef<Path>>(
        &mut self,
        dir: P,
        logical_prefix: Option<&str>,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let dir = dir.as_ref();
        if verbose {
            println!("Ingesting directory: {}", dir.display());
        }

        let mut files_to_process = Vec::new();
        for entry in WalkDir::new(dir).follow_links(false) {
            let entry = entry?;
            if entry.file_type().is_file() {
                files_to_process.push(entry.path().to_path_buf());
            }
        }
        files_to_process.sort();

        for file_path in files_to_process {
            let relative = file_path.strip_prefix(dir).unwrap_or(file_path.as_path());
            let rel = Self::path_to_forward_slash_string(relative);
            let logical_path = if let Some(prefix) = logical_prefix {
                if prefix.is_empty() {
                    rel
                } else if rel.is_empty() {
                    prefix.to_string()
                } else {
                    format!("{}/{}", prefix, rel)
                }
            } else {
                rel
            };

            self.ingest_file(&file_path, logical_path, verbose, config)?;
        }

        Ok(())
    }

    /// Ingest a single file into the engram with guaranteed reconstruction
    ///
    /// This method encodes file data into sparse vectors and stores any
    /// necessary corrections to guarantee 100% bit-perfect reconstruction.
    ///
    /// # Correction Process
    ///
    /// For each chunk:
    /// 1. Encode: `chunk_data → SparseVec`
    /// 2. Decode: `SparseVec → decoded_data`
    /// 3. Compare: `chunk_data == decoded_data?`
    /// 4. If different: store correction in `CorrectionStore`
    ///
    /// # Arguments
    /// * `file_path` - Path to the file on disk
    /// * `logical_path` - Path to use in the engram manifest
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
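    ///
    /// # Examples
    ///
    /// A sketch (file names illustrative):
    ///
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let config = ReversibleVSAConfig::default();
    /// fs.ingest_file("./notes.txt", "notes.txt".to_string(), true, &config).unwrap();
    /// ```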
    pub fn ingest_file<P: AsRef<Path>>(
        &mut self,
        file_path: P,
        logical_path: String,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let file_path = file_path.as_ref();
        let mut file = File::open(file_path)?;
        let mut data = Vec::new();
        file.read_to_end(&mut data)?;

        let is_text = is_text_file(&data);
        let is_holographic = self.encoder.is_some();

        if verbose {
            println!(
                "Ingesting {}: {} bytes ({}, {})",
                logical_path,
                data.len(),
                if is_text { "text" } else { "binary" },
                if is_holographic {
                    "holographic"
                } else {
                    "legacy"
                }
            );
        }

        let chunk_size = self.chunk_size;
        let mut chunks = Vec::new();
        let mut corrections_needed = 0usize;
        let mut total_correction_bytes = 0usize;

        for (i, chunk) in data.chunks(chunk_size).enumerate() {
            let chunk_id = self.manifest.total_chunks + i;

            // Encode chunk to sparse vector
            let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
                // Holographic mode: use ReversibleVSAEncoder (~94% accuracy)
                let encoded = encoder.encode(chunk);
                let decoded = encoder.decode(&encoded, chunk.len());
                (encoded, decoded)
            } else {
                // Legacy mode: use SparseVec::encode_data (~10% accuracy)
                let encoded = SparseVec::encode_data(chunk, config, Some(&logical_path));
                let decoded = encoded.decode_data(config, Some(&logical_path), chunk.len());
                (encoded, decoded)
            };

            // Store correction if needed (guarantees reconstruction)
            self.engram
                .corrections
                .add(chunk_id as u64, chunk, &decoded);

            if chunk != decoded.as_slice() {
                corrections_needed += 1;
                // Track correction overhead
                if let Some(correction) = self.engram.corrections.get(chunk_id as u64) {
                    total_correction_bytes += correction.storage_size();
                }
            }

            self.engram.root = self.engram.root.bundle(&chunk_vec);
            self.engram.codebook.insert(chunk_id, chunk_vec);
            chunks.push(chunk_id);
        }

        if verbose {
            let total_chunks = chunks.len();
            let perfect_chunks = total_chunks - corrections_needed;
            let accuracy = if total_chunks > 0 {
                (perfect_chunks as f64 / total_chunks as f64) * 100.0
            } else {
                100.0
            };
            let overhead = if !data.is_empty() {
                (total_correction_bytes as f64 / data.len() as f64) * 100.0
            } else {
                0.0
            };
            println!(
                "  → {}/{} chunks perfect ({:.1}% accuracy), {:.1}% correction overhead",
                perfect_chunks, total_chunks, accuracy, overhead
            );
        }

        self.manifest.files.push(FileEntry {
            path: logical_path,
            is_text,
            size: data.len(),
            chunks: chunks.clone(),
            deleted: false,
        });

        self.manifest.total_chunks += chunks.len();

        Ok(())
    }

    /// Add a new file to an existing engram (incremental update)
    ///
    /// This method enables efficient incremental updates by adding a single file
    /// to an existing engram without requiring full re-ingestion. The new file's
    /// chunks are bundled with the existing root vector using VSA's associative
    /// bundle operation.
    ///
    /// # Algorithm
    /// 1. Encode new file into chunks (same as ingest_file)
    /// 2. Bundle each chunk with existing root: `root_new = root_old ⊕ chunk`
    /// 3. Add chunks to codebook with new chunk IDs
    /// 4. Update manifest with new file entry
    ///
    /// # Performance
    /// - Time complexity: O(n) where n = number of chunks in new file
    /// - Does not require reading or re-encoding existing files
    /// - Suitable for production workflows with frequent additions
    ///
    /// # Arguments
    /// * `file_path` - Path to the file on disk
    /// * `logical_path` - Path to use in the engram manifest
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    ///
    /// # Examples
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let config = ReversibleVSAConfig::default();
    ///
    /// // Ingest initial dataset
    /// fs.ingest_directory("./data", false, &config).unwrap();
    ///
    /// // Later, add a new file without full re-ingestion
    /// fs.add_file("./new_file.txt", "new_file.txt".to_string(), true, &config).unwrap();
    /// ```
    pub fn add_file<P: AsRef<Path>>(
        &mut self,
        file_path: P,
        logical_path: String,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let file_path = file_path.as_ref();

        // Check if file already exists (not deleted)
        if self
            .manifest
            .files
            .iter()
            .any(|f| f.path == logical_path && !f.deleted)
        {
            return Err(io::Error::new(
                io::ErrorKind::AlreadyExists,
                format!("File '{}' already exists in engram", logical_path),
            ));
        }

        // Use existing ingest_file logic (already handles bundling with root)
        self.ingest_file(file_path, logical_path, verbose, config)
    }

    /// Remove a file from the engram (mark as deleted for incremental update)
    ///
    /// This method marks a file as deleted in the manifest without modifying the
    /// root vector. This is because VSA bundling is a lossy operation and there's
    /// no clean inverse. The chunks remain in the codebook but won't be extracted.
    ///
    /// # Algorithm
    /// 1. Find file in manifest by logical path
    /// 2. Mark file entry as deleted
    /// 3. Chunks remain in codebook (for potential recovery or compaction)
    /// 4. File won't appear in future extractions
    ///
    /// # Note on VSA Limitations
    /// Bundle operation is associative but not invertible:
    /// - `(A ⊕ B) ⊕ C = A ⊕ (B ⊕ C)` ✓ (can add)
    /// - `(A ⊕ B) ⊖ B ≠ A` ✗ (can't cleanly remove)
    ///
    /// To truly remove chunks from the root, use `compact()` which rebuilds
    /// the engram without deleted files.
    ///
    /// # Arguments
    /// * `logical_path` - Path of the file to remove
    /// * `verbose` - Print progress information
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    ///
    /// # Examples
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let config = ReversibleVSAConfig::default();
    ///
    /// fs.ingest_directory("./data", false, &config).unwrap();
    /// fs.remove_file("old_file.txt", true).unwrap();
    /// // File marked as deleted, won't be extracted
    /// ```
    pub fn remove_file(&mut self, logical_path: &str, verbose: bool) -> io::Result<()> {
        // Find file in manifest
        let file_entry = self
            .manifest
            .files
            .iter_mut()
            .find(|f| f.path == logical_path && !f.deleted)
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::NotFound,
                    format!("File '{}' not found in engram", logical_path),
                )
            })?;

        if verbose {
            println!(
                "Marking file as deleted: {} ({} chunks)",
                logical_path,
                file_entry.chunks.len()
            );
        }

        // Mark as deleted (don't remove from manifest to preserve chunk IDs)
        file_entry.deleted = true;

        if verbose {
            println!("  Note: Use 'compact' to rebuild engram and reclaim space");
        }

        Ok(())
    }

    /// Modify an existing file in the engram (incremental update)
    ///
    /// This method updates a file's content by removing the old version and
    /// adding the new version. It's equivalent to `remove_file` + `add_file`.
    ///
    /// # Algorithm
    /// 1. Mark old file as deleted
    /// 2. Re-encode new file content
    /// 3. Bundle new chunks with root
    /// 4. Add new file entry to manifest
    ///
    /// # Trade-offs
    /// - Old chunks remain in codebook (use `compact()` to clean up)
    /// - Root contains both old and new chunk contributions (slight noise)
    /// - Fast operation, doesn't require rebuilding entire engram
    ///
    /// # Arguments
    /// * `file_path` - Path to the file on disk (new content)
    /// * `logical_path` - Path of the file in the engram
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    ///
    /// # Examples
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let config = ReversibleVSAConfig::default();
    ///
    /// fs.ingest_directory("./data", false, &config).unwrap();
    ///
    /// // Later, modify a file
    /// fs.modify_file("./data/updated.txt", "data/updated.txt".to_string(), true, &config).unwrap();
    /// ```
    pub fn modify_file<P: AsRef<Path>>(
        &mut self,
        file_path: P,
        logical_path: String,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        // First, mark old file as deleted
        self.remove_file(&logical_path, false)?;

        if verbose {
            println!("Modifying file: {}", logical_path);
        }

        // Then add the new version
        self.ingest_file(file_path, logical_path, verbose, config)?;

        Ok(())
    }

    /// Compact the engram by rebuilding without deleted files
    ///
    /// This operation rebuilds the engram from scratch, excluding all files
    /// marked as deleted. It's the only way to truly remove old chunks from
    /// the root vector and codebook.
    ///
    /// # Algorithm
    /// 1. Create new empty engram
    /// 2. Re-bundle all non-deleted files
    /// 3. Reassign chunk IDs sequentially
    /// 4. Replace old engram with compacted version
    ///
    /// # Performance
    /// - Time complexity: O(N) where N = total bytes of non-deleted files
    /// - Expensive operation, run periodically (not after every deletion)
    /// - Recommended: compact when deleted files exceed 20-30% of total
    ///
    /// # Benefits
    /// - Reclaims space from deleted chunks
    /// - Reduces root vector noise from obsolete data
    /// - Resets chunk IDs to sequential order
    /// - Maintains bit-perfect reconstruction of kept files
    ///
    /// # Arguments
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
    ///
    /// # Examples
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let mut fs = EmbrFS::new_holographic();
    /// let config = ReversibleVSAConfig::default();
    ///
    /// fs.ingest_directory("./data", false, &config).unwrap();
    /// fs.remove_file("old1.txt", false).unwrap();
    /// fs.remove_file("old2.txt", false).unwrap();
    ///
    /// // After many deletions, compact to reclaim space
    /// fs.compact(true, &config).unwrap();
    /// ```
    pub fn compact(&mut self, verbose: bool, config: &ReversibleVSAConfig) -> io::Result<()> {
        if verbose {
            let deleted_count = self.manifest.files.iter().filter(|f| f.deleted).count();
            let total_count = self.manifest.files.len();
            println!(
                "Compacting engram: removing {} deleted files ({} remaining)",
                deleted_count,
                total_count - deleted_count
            );
        }

        let is_holographic = self.encoder.is_some();
        let chunk_size = self.chunk_size;

        // Create new engram with fresh root and codebook
        let mut new_engram = Engram {
            root: SparseVec::new(),
            codebook: HashMap::new(),
            corrections: CorrectionStore::new(),
        };

        // Rebuild manifest with only non-deleted files
        let mut new_manifest = Manifest {
            files: Vec::new(),
            total_chunks: 0,
            chunk_size,
            holographic: is_holographic,
        };

        // Process each non-deleted file
        for old_file in &self.manifest.files {
            if old_file.deleted {
                continue;
            }

            // Reconstruct file data from old engram using current decoder
            let mut file_data = Vec::new();
            let num_chunks = old_file.chunks.len();
            let old_chunk_size = self.manifest.chunk_size;

            for (chunk_idx, &chunk_id) in old_file.chunks.iter().enumerate() {
                if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
                    let this_chunk_size = if chunk_idx == num_chunks - 1 {
                        let remaining = old_file.size.saturating_sub(chunk_idx * old_chunk_size);
                        remaining.min(old_chunk_size)
                    } else {
                        old_chunk_size
                    };

                    // Decode using appropriate method
                    let decoded = if self.manifest.holographic {
                        if let Some(ref encoder) = self.encoder {
                            encoder.decode(chunk_vec, this_chunk_size)
                        } else {
                            // Fallback if encoder not available
                            chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                        }
                    } else {
                        chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                    };

                    let chunk_data = if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    };

                    file_data.extend_from_slice(&chunk_data);
                }
            }
            file_data.truncate(old_file.size);

            // Re-encode with new chunk IDs using current encoder
            let mut new_chunks = Vec::new();

            for (i, chunk) in file_data.chunks(chunk_size).enumerate() {
                let new_chunk_id = new_manifest.total_chunks + i;

                // Encode using appropriate method
                let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
                    let encoded = encoder.encode(chunk);
                    let decoded = encoder.decode(&encoded, chunk.len());
                    (encoded, decoded)
                } else {
                    let encoded = SparseVec::encode_data(chunk, config, Some(&old_file.path));
                    let decoded = encoded.decode_data(config, Some(&old_file.path), chunk.len());
                    (encoded, decoded)
                };

                new_engram
                    .corrections
                    .add(new_chunk_id as u64, chunk, &decoded);

                new_engram.root = new_engram.root.bundle(&chunk_vec);
                new_engram.codebook.insert(new_chunk_id, chunk_vec);
                new_chunks.push(new_chunk_id);
            }

            if verbose {
                println!(
                    "  Recompacted: {} ({} chunks)",
                    old_file.path,
                    new_chunks.len()
                );
            }

            new_manifest.files.push(FileEntry {
                path: old_file.path.clone(),
                is_text: old_file.is_text,
                size: old_file.size,
                chunks: new_chunks.clone(),
                deleted: false,
            });

            new_manifest.total_chunks += new_chunks.len();
        }

        // Replace old engram and manifest with compacted versions
        self.engram = new_engram;
        self.manifest = new_manifest;

        if verbose {
            let stats = self.engram.corrections.stats();
            println!(
                "Compaction complete: {} files, {} chunks ({:.1}% perfect, {:.2}% correction overhead)",
                self.manifest.files.len(),
                self.manifest.total_chunks,
                stats.perfect_ratio * 100.0,
                stats.correction_ratio * 100.0
            );
        }

        Ok(())
    }

    /// Save engram to file
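    ///
    /// # Examples
    ///
    /// A sketch of persisting both artifacts (file names illustrative):
    ///
    /// ```no_run
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::new_holographic();
    /// fs.save_engram("tree.engram").unwrap();
    /// fs.save_manifest("manifest.json").unwrap();
    /// ```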
    pub fn save_engram<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
        let encoded = bincode::serialize(&self.engram).map_err(io::Error::other)?;
        fs::write(path, encoded)?;
        Ok(())
    }

    /// Load engram from file
    pub fn load_engram<P: AsRef<Path>>(path: P) -> io::Result<Engram> {
        let data = fs::read(path)?;
        bincode::deserialize(&data).map_err(io::Error::other)
    }

    /// Save manifest to JSON file
    pub fn save_manifest<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
        let file = File::create(path)?;
        serde_json::to_writer_pretty(file, &self.manifest)?;
        Ok(())
    }

    /// Load manifest from JSON file
    pub fn load_manifest<P: AsRef<Path>>(path: P) -> io::Result<Manifest> {
        let file = File::open(path)?;
        let manifest = serde_json::from_reader(file)?;
        Ok(manifest)
    }

    /// Load an EmbrFS from engram and manifest files
    ///
    /// Automatically detects if the engram was created with holographic mode
    /// and sets up the appropriate encoder for extraction.
    ///
    /// # Arguments
    /// * `engram_path` - Path to the engram file
    /// * `manifest_path` - Path to the manifest JSON file
    ///
    /// # Returns
    /// `io::Result<EmbrFS>` with the loaded engram and manifest
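    ///
    /// # Examples
    ///
    /// A sketch (file names illustrative):
    ///
    /// ```no_run
    /// use embeddenator_fs::EmbrFS;
    ///
    /// let fs = EmbrFS::load("tree.engram", "manifest.json").unwrap();
    /// assert_eq!(fs.chunk_size(), fs.manifest.chunk_size);
    /// ```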
    pub fn load<P: AsRef<Path>, Q: AsRef<Path>>(
        engram_path: P,
        manifest_path: Q,
    ) -> io::Result<Self> {
        let engram = Self::load_engram(engram_path)?;
        let manifest = Self::load_manifest(manifest_path)?;

        // Create encoder if holographic mode was used
        let (encoder, chunk_size) = if manifest.holographic {
            (Some(ReversibleVSAEncoder::new()), manifest.chunk_size)
        } else {
            (None, manifest.chunk_size)
        };

        Ok(EmbrFS {
            manifest,
            engram,
            resonator: None,
            encoder,
            chunk_size,
        })
    }

    /// Extract files from engram to directory with guaranteed reconstruction
    ///
    /// This method guarantees 100% bit-perfect reconstruction by applying
    /// stored corrections after decoding each chunk.
    ///
    /// # Reconstruction Process
    ///
    /// For each chunk:
    /// 1. Decode: `SparseVec → decoded_data`
    /// 2. Apply correction: `decoded_data + correction → original_data`
    /// 3. Verify: Hash matches stored hash (guaranteed by construction)
    ///
    /// # Arguments
    /// * `engram` - The engram containing encoded data and corrections
    /// * `manifest` - File metadata and chunk mappings
    /// * `output_dir` - Directory to write extracted files
    /// * `verbose` - Print progress information
    /// * `config` - VSA decoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure
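    ///
    /// # Examples
    ///
    /// A sketch of a load-then-extract flow (paths illustrative):
    ///
    /// ```no_run
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::load("tree.engram", "manifest.json").unwrap();
    /// let config = ReversibleVSAConfig::default();
    /// EmbrFS::extract(&fs.engram, &fs.manifest, "./restored", false, &config).unwrap();
    /// ```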
1458    pub fn extract<P: AsRef<Path>>(
1459        engram: &Engram,
1460        manifest: &Manifest,
1461        output_dir: P,
1462        verbose: bool,
1463        config: &ReversibleVSAConfig,
1464    ) -> io::Result<()> {
1465        let output_dir = output_dir.as_ref();
1466
1467        // Use manifest's chunk_size and holographic flag
1468        let chunk_size = manifest.chunk_size;
1469        let is_holographic = manifest.holographic;
1470
1471        // Create encoder for holographic decoding if needed
1472        let encoder = if is_holographic {
1473            Some(ReversibleVSAEncoder::new())
1474        } else {
1475            None
1476        };
1477
1478        if verbose {
1479            println!(
1480                "Extracting {} files to {} ({})",
1481                manifest.files.iter().filter(|f| !f.deleted).count(),
1482                output_dir.display(),
1483                if is_holographic {
1484                    "holographic"
1485                } else {
1486                    "legacy"
1487                }
1488            );
1489            let stats = engram.corrections.stats();
1490            println!(
1491                "  Correction stats: {:.1}% perfect, {:.2}% overhead",
1492                stats.perfect_ratio * 100.0,
1493                stats.correction_ratio * 100.0
1494            );
1495        }
1496
1497        for file_entry in &manifest.files {
1498            // Skip deleted files
1499            if file_entry.deleted {
1500                continue;
1501            }
1502
1503            let file_path = output_dir.join(&file_entry.path);
1504
1505            if let Some(parent) = file_path.parent() {
1506                fs::create_dir_all(parent)?;
1507            }
1508
1509            let mut reconstructed = Vec::new();
1510            let num_chunks = file_entry.chunks.len();
1511            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1512                if let Some(chunk_vec) = engram.codebook.get(&chunk_id) {
1513                    // Calculate the actual chunk size for this chunk
1514                    // Last chunk may be smaller than the standard chunk_size
                    let this_chunk_size = if chunk_idx == num_chunks - 1 {
                        // Last chunk: remaining bytes
                        let remaining = file_entry.size.saturating_sub(chunk_idx * chunk_size);
                        remaining.min(chunk_size)
                    } else {
                        chunk_size
                    };

                    // Decode the sparse vector to bytes using appropriate method
                    let decoded = if let Some(ref enc) = encoder {
                        // Holographic mode: use ReversibleVSAEncoder
                        enc.decode(chunk_vec, this_chunk_size)
                    } else {
                        // Legacy mode: use SparseVec::decode_data
                        chunk_vec.decode_data(config, Some(&file_entry.path), this_chunk_size)
                    };

                    // Apply correction to guarantee bit-perfect reconstruction
                    let chunk_data = if let Some(corrected) =
                        engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        // No correction found - use decoded directly
                        // This can happen with legacy engrams or if correction store is empty
                        decoded
                    };

                    reconstructed.extend_from_slice(&chunk_data);
                }
            }

            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted: {}", file_entry.path);
            }
        }

        Ok(())
    }

    /// Extract files using resonator-enhanced pattern completion with guaranteed reconstruction
    ///
    /// Performs filesystem extraction with intelligent recovery capabilities powered by
    /// resonator networks. When chunks are missing from the codebook, the resonator
    /// attempts pattern completion to reconstruct the lost data, enabling extraction
    /// even from partially corrupted or incomplete engrams.
    ///
    /// # Reconstruction Guarantee
    ///
    /// Even with resonator-assisted recovery, corrections are applied to guarantee
    /// bit-perfect reconstruction. The process is:
    ///
    /// 1. Try to get chunk from codebook
    /// 2. If missing, use resonator to recover approximate chunk
    /// 3. Apply correction from CorrectionStore
    /// 4. Result is guaranteed bit-perfect (if correction exists)
    ///
    /// # How it works
    /// 1. For each file chunk, check if it exists in the engram codebook
    /// 2. If missing, use the resonator to project a query vector onto known patterns
    /// 3. Apply stored corrections for guaranteed accuracy
    /// 4. Reconstruct the file from available and recovered chunks
    /// 5. If no resonator is configured, falls back to standard extraction
    ///
    /// # Why this matters
    /// - Enables bit-perfect reconstruction of missing chunks when corrections exist
    /// - Provides fault tolerance for distributed storage scenarios
    /// - Supports hierarchical recovery at multiple levels of the storage stack
    /// - Maintains data integrity through pattern-based completion
    ///
    /// # Arguments
    /// * `output_dir` - Directory path where extracted files will be written
    /// * `verbose` - Whether to print progress information during extraction
    /// * `config` - VSA configuration for encoding/decoding
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure of the extraction operation
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, Resonator, ReversibleVSAConfig};
    /// use std::path::Path;
    ///
    /// let mut fs = EmbrFS::new();
    /// let resonator = Resonator::new();
    /// let config = ReversibleVSAConfig::default();
    /// fs.set_resonator(resonator);
    ///
    /// // Assuming fs has been populated with data...
    /// let result = fs.extract_with_resonator("/tmp/output", true, &config);
    /// assert!(result.is_ok());
    /// ```
    pub fn extract_with_resonator<P: AsRef<Path>>(
        &self,
        output_dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        if self.resonator.is_none() {
            return Self::extract(&self.engram, &self.manifest, output_dir, verbose, config);
        }

        let output_dir = output_dir.as_ref();

        if verbose {
            println!(
                "Extracting {} files with resonator enhancement to {}",
                self.manifest.files.iter().filter(|f| !f.deleted).count(),
                output_dir.display()
            );
            let stats = self.engram.corrections.stats();
            println!(
                "  Correction stats: {:.1}% perfect, {:.2}% overhead",
                stats.perfect_ratio * 100.0,
                stats.correction_ratio * 100.0
            );
        }

        for file_entry in &self.manifest.files {
            // Skip deleted files
            if file_entry.deleted {
                continue;
            }

            let file_path = output_dir.join(&file_entry.path);

            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent)?;
            }

            let mut reconstructed = Vec::new();
            let num_chunks = file_entry.chunks.len();
            // Honor the manifest's chunk size (as `extract` does) so engrams
            // encoded with a non-default chunk size decode correctly.
            let manifest_chunk_size = self.manifest.chunk_size;
            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
                // Calculate the actual chunk size; the last chunk may be short
                let chunk_size = if chunk_idx == num_chunks - 1 {
                    let remaining = file_entry
                        .size
                        .saturating_sub(chunk_idx * manifest_chunk_size);
                    remaining.min(manifest_chunk_size)
                } else {
                    manifest_chunk_size
                };

                let chunk_data = if let Some(vector) = self.engram.codebook.get(&chunk_id) {
                    // Decode the SparseVec back to bytes using reversible encoding
                    // IMPORTANT: Use the same path as during encoding for correct shift calculation
                    let decoded = vector.decode_data(config, Some(&file_entry.path), chunk_size);

                    // Apply correction to guarantee bit-perfect reconstruction
                    if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    }
                } else if let Some(resonator) = &self.resonator {
                    // Use resonator to recover missing chunk
                    // Create a query vector from the chunk_id using reversible encoding
                    let query_vec = SparseVec::encode_data(&chunk_id.to_le_bytes(), config, None);
                    let recovered_vec = resonator.project(&query_vec);

                    // Decode the recovered vector back to bytes, using the same
                    // path-derived shift that was used during encoding
                    let decoded =
                        recovered_vec.decode_data(config, Some(&file_entry.path), chunk_size);

                    // Apply correction if available (may not be if chunk was lost)
                    if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        // No correction available - best effort recovery
                        decoded
                    }
                } else {
                    return Err(io::Error::new(
                        io::ErrorKind::NotFound,
                        format!("Missing chunk {} and no resonator available", chunk_id),
                    ));
                };
                reconstructed.extend_from_slice(&chunk_data);
            }

            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted with resonator: {}", file_entry.path);
            }
        }

        Ok(())
    }

    /// Perform hierarchical bundling with path role binding and permutation tagging
    ///
    /// Creates multi-level engram structures where path components are encoded using
    /// permutation operations to create distinct representations at each level. This
    /// enables efficient hierarchical retrieval and reconstruction.
    ///
    /// # How it works
    /// 1. Split file paths into components (e.g., "a/b/c.txt" → ["a", "b", "c.txt"])
    /// 2. For each level, apply permutation based on path component hash
    /// 3. Bundle representations level-by-level with sparsity control
    /// 4. Create sub-engrams for intermediate nodes
    ///
    /// # Why this matters
    /// - Enables scalable hierarchical storage beyond flat bundling limits
    /// - Path-based retrieval without full engram traversal
    /// - Maintains semantic relationships through permutation encoding
    /// - Supports efficient partial reconstruction
    ///
    /// # Arguments
    /// * `max_level_sparsity` - Maximum non-zero elements per level bundle
    /// * `verbose` - Whether to print progress information
    ///
    /// # Returns
    /// HierarchicalManifest describing the multi-level structure
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    /// // Assuming files have been ingested...
    ///
    /// let hierarchical = fs.bundle_hierarchically(500, false, &config);
    /// assert!(hierarchical.is_ok());
    /// ```
    pub fn bundle_hierarchically(
        &self,
        max_level_sparsity: usize,
        verbose: bool,
        _config: &ReversibleVSAConfig,
    ) -> io::Result<HierarchicalManifest> {
        self.bundle_hierarchically_with_options(max_level_sparsity, None, verbose, _config)
    }

    /// Like `bundle_hierarchically`, but supports an optional deterministic cap on `chunk_ids` per node.
    ///
    /// If `max_chunks_per_node` is set and a node would exceed that many `chunk_ids`, the node becomes
    /// a router with empty `chunk_ids`, and deterministic shard children are created each containing a
    /// bounded subset of `chunk_ids`.
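    ///
    /// # Examples
    /// A minimal sketch, mirroring the `bundle_hierarchically` example above
    /// (the cap of 1024 is illustrative, not a recommended default):
    /// ```
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    /// // Cap each node at 1024 chunk IDs; larger nodes become routers over shards.
    /// let hierarchical = fs.bundle_hierarchically_with_options(500, Some(1024), false, &config);
    /// assert!(hierarchical.is_ok());
    /// ```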
    pub fn bundle_hierarchically_with_options(
        &self,
        max_level_sparsity: usize,
        max_chunks_per_node: Option<usize>,
        verbose: bool,
        _config: &ReversibleVSAConfig,
    ) -> io::Result<HierarchicalManifest> {
        let mut levels = Vec::new();
        let mut sub_engrams = HashMap::new();

        // Group files by *path prefixes* at each level.
        // Level 0: "a"; Level 1: "a/b"; etc.
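        // e.g., "a/b/c.txt" contributes to prefixes "a" (level 0),
        // "a/b" (level 1), and "a/b/c.txt" (level 2).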
        let mut level_prefixes: HashMap<usize, HashMap<String, Vec<&FileEntry>>> = HashMap::new();
        for file_entry in &self.manifest.files {
            let comps: Vec<&str> = file_entry.path.split('/').collect();
            let mut prefix = String::new();
            for (level, &comp) in comps.iter().enumerate() {
                if level == 0 {
                    prefix.push_str(comp);
                } else {
                    prefix.push('/');
                    prefix.push_str(comp);
                }
                level_prefixes
                    .entry(level)
                    .or_default()
                    .entry(prefix.clone())
                    .or_default()
                    .push(file_entry);
            }
        }

        // Process each level
        let max_level = level_prefixes.keys().max().unwrap_or(&0);

        for level in 0..=*max_level {
            if verbose {
                let item_count = level_prefixes
                    .get(&level)
                    .map(|comps| comps.values().map(|files| files.len()).sum::<usize>())
                    .unwrap_or(0);
                println!("Processing level {} with {} items", level, item_count);
            }

            let mut level_bundle = SparseVec::new();
            let mut manifest_items = Vec::new();

            if let Some(prefixes) = level_prefixes.get(&level) {
                let mut prefix_keys: Vec<&String> = prefixes.keys().collect();
                prefix_keys.sort();

                for prefix in prefix_keys {
                    let mut files: Vec<&FileEntry> = prefixes
                        .get(prefix)
                        // SAFETY: prefix comes from keys(), so get() must succeed
                        .expect("prefix key from keys() must exist in HashMap")
                        .to_vec();
                    files.sort_by(|a, b| a.path.cmp(&b.path));

                    // Create permutation shift based on prefix hash
                    let shift = {
                        use std::collections::hash_map::DefaultHasher;
                        use std::hash::{Hash, Hasher};
                        let mut hasher = DefaultHasher::new();
                        prefix.hash(&mut hasher);
                        (hasher.finish() % (DIM as u64)) as usize
                    };
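                    // NOTE: DefaultHasher's algorithm is unspecified and may change
                    // between Rust releases, so shifts are deterministic within a
                    // build but not guaranteed stable across toolchain upgrades.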

                    // Bundle all files under this component with permutation
                    let mut component_bundle = SparseVec::new();
                    let mut chunk_ids_set: HashSet<usize> = HashSet::new();
                    for file_entry in &files {
                        // Find chunks for this file and bundle them
                        let mut file_bundle = SparseVec::new();
                        for &chunk_id in &file_entry.chunks {
                            if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
                                file_bundle = file_bundle.bundle(chunk_vec);
                                chunk_ids_set.insert(chunk_id);
                            }
                        }

                        // Apply level-based permutation
                        let permuted_file = file_bundle.permute(shift * (level + 1));
                        component_bundle = component_bundle.bundle(&permuted_file);
                    }

                    // Apply sparsity control
                    if component_bundle.pos.len() + component_bundle.neg.len() > max_level_sparsity
                    {
                        component_bundle = component_bundle.thin(max_level_sparsity);
                    }

                    level_bundle = level_bundle.bundle(&component_bundle);

                    // Create sub-engram for this prefix.
                    // Children are the immediate next-level prefixes underneath this prefix.
                    let sub_id = format!("level_{}_prefix_{}", level, prefix);

                    let mut children_set: HashSet<String> = HashSet::new();
                    if level < *max_level {
                        for file_entry in &files {
                            let comps: Vec<&str> = file_entry.path.split('/').collect();
                            if comps.len() <= level + 1 {
                                continue;
                            }
                            let child_prefix = comps[..=level + 1].join("/");
                            let child_id = format!("level_{}_prefix_{}", level + 1, child_prefix);
                            children_set.insert(child_id);
                        }
                    }
                    let mut children: Vec<String> = children_set.into_iter().collect();
                    children.sort();

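                    // Sorted chunk IDs keep shard assignment deterministic across runs.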
                    let mut chunk_ids: Vec<usize> = chunk_ids_set.into_iter().collect();
                    chunk_ids.sort_unstable();

                    let chunk_count: usize = files.iter().map(|f| f.chunks.len()).sum();

                    if let Some(max_chunks) = max_chunks_per_node.filter(|v| *v > 0) {
                        if chunk_ids.len() > max_chunks {
                            let mut shard_ids: Vec<String> = Vec::new();
                            for (shard_idx, chunk_slice) in chunk_ids.chunks(max_chunks).enumerate()
                            {
                                let shard_id = format!("{}__shard_{:04}", sub_id, shard_idx);
                                shard_ids.push(shard_id.clone());
                                sub_engrams.insert(
                                    shard_id.clone(),
                                    SubEngram {
                                        id: shard_id,
                                        root: component_bundle.clone(),
                                        chunk_ids: chunk_slice.to_vec(),
                                        chunk_count: chunk_slice.len(),
                                        children: Vec::new(),
                                    },
                                );
                            }

                            let mut router_children = shard_ids;
                            router_children.extend(children.clone());
                            router_children.sort();
                            router_children.dedup();

                            sub_engrams.insert(
                                sub_id.clone(),
                                SubEngram {
                                    id: sub_id.clone(),
                                    root: component_bundle,
                                    chunk_ids: Vec::new(),
                                    chunk_count,
                                    children: router_children,
                                },
                            );
                        } else {
                            sub_engrams.insert(
                                sub_id.clone(),
                                SubEngram {
                                    id: sub_id.clone(),
                                    root: component_bundle,
                                    chunk_ids,
                                    chunk_count,
                                    children,
                                },
                            );
                        }
                    } else {
                        sub_engrams.insert(
                            sub_id.clone(),
                            SubEngram {
                                id: sub_id.clone(),
                                root: component_bundle,
                                chunk_ids,
                                chunk_count,
                                children,
                            },
                        );
                    }

                    manifest_items.push(ManifestItem {
                        path: prefix.clone(),
                        sub_engram_id: sub_id,
                    });
                }
            }

            manifest_items.sort_by(|a, b| {
                a.path
                    .cmp(&b.path)
                    .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
            });

            // Apply final sparsity control to level bundle
            if level_bundle.pos.len() + level_bundle.neg.len() > max_level_sparsity {
                level_bundle = level_bundle.thin(max_level_sparsity);
            }

            levels.push(ManifestLevel {
                level: level as u32,
                items: manifest_items,
            });
        }

        Ok(HierarchicalManifest {
            version: 1,
            levels,
            sub_engrams,
        })
    }

    /// Extract files from hierarchical manifest with manifest-guided traversal
    ///
    /// Performs hierarchical extraction by traversing the manifest levels and
    /// reconstructing files from sub-engrams. This enables efficient extraction
    /// from complex hierarchical structures without loading the entire engram.
    ///
    /// # How it works
    /// 1. Traverse manifest levels from root to leaves
    /// 2. For each level, locate relevant sub-engrams
    /// 3. Decode each file's chunks and apply stored corrections
    /// 4. Assemble complete files from hierarchical components
    ///
    /// # Why this matters
    /// - Enables partial extraction from large hierarchical datasets
    /// - Maintains bit-perfect reconstruction accuracy
    /// - Supports efficient path-based queries and retrieval
    /// - Scales to complex directory structures
    ///
    /// # Arguments
    /// * `hierarchical` - The hierarchical manifest to extract from
    /// * `output_dir` - Directory path where extracted files will be written
    /// * `verbose` - Whether to print progress information during extraction
    /// * `config` - VSA decoding configuration
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure of the hierarchical extraction
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    /// // Assuming hierarchical manifest was created...
    /// // let hierarchical = fs.bundle_hierarchically(500, true, &config).unwrap();
    ///
    /// // fs.extract_hierarchically(&hierarchical, "/tmp/output", true, &config)?;
    /// ```
    pub fn extract_hierarchically<P: AsRef<Path>>(
        &self,
        hierarchical: &HierarchicalManifest,
        output_dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let output_dir = output_dir.as_ref();

        if verbose {
            println!(
                "Extracting hierarchical manifest with {} levels to {}",
                hierarchical.levels.len(),
                output_dir.display()
            );
        }

        // For each file in the original manifest, reconstruct it using hierarchical information
        for file_entry in &self.manifest.files {
            // Skip deleted files
            if file_entry.deleted {
                continue;
            }

            let file_path = output_dir.join(&file_entry.path);

            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent)?;
            }

            let mut reconstructed = Vec::new();

            // Reconstruct each chunk using hierarchical information
            let num_chunks = file_entry.chunks.len();
            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
                if let Some(chunk_vector) = self.engram.codebook.get(&chunk_id) {
                    // Calculate the actual chunk size, honoring the manifest's
                    // chunk size (as `extract` does); the last chunk may be short
                    let manifest_chunk_size = self.manifest.chunk_size;
                    let chunk_size = if chunk_idx == num_chunks - 1 {
                        let remaining = file_entry
                            .size
                            .saturating_sub(chunk_idx * manifest_chunk_size);
                        remaining.min(manifest_chunk_size)
                    } else {
                        manifest_chunk_size
                    };

                    // Decode the chunk vector back to bytes
                    let decoded =
                        chunk_vector.decode_data(config, Some(&file_entry.path), chunk_size);

                    // Apply correction if available
                    let chunk_data = if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    };

                    reconstructed.extend_from_slice(&chunk_data);
                }
            }

            // Truncate to actual file size
            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted hierarchical: {}", file_entry.path);
            }
        }

        Ok(())
    }
}

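/// Heuristic check for whether `data` looks like text: samples up to the first
/// 8 KiB and treats the data as binary if it contains any NUL byte, or if
/// control characters other than `\n`, `\r`, and `\t` make up 10% or more of
/// the sample. Empty input is considered text.
///
/// # Examples
/// The doctest assumes this function is re-exported at the crate root, like
/// the other items used in the examples above:
/// ```
/// use embeddenator_fs::is_text_file;
///
/// assert!(is_text_file(b"hello world\n"));
/// assert!(!is_text_file(&[0u8, 159, 146, 150]));
/// ```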
pub fn is_text_file(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    let sample_size = data.len().min(8192);
    let sample = &data[..sample_size];

    let mut null_count = 0;
    let mut control_count = 0;

    for &byte in sample {
        if byte == 0 {
            null_count += 1;
        } else if byte < 32 && byte != b'\n' && byte != b'\r' && byte != b'\t' {
            control_count += 1;
        }
    }

    null_count == 0 && control_count < sample_size / 10
}