embeddenator_fs/fs/embrfs.rs
1//! EmbrFS - Holographic Filesystem Implementation
2//!
3//! Provides engram-based storage for entire filesystem trees with:
4//! - Chunked encoding for efficient storage
5//! - Manifest for file metadata
6//! - **Guaranteed 100% bit-perfect reconstruction** via CorrectionStore
7//!
8//! # Reconstruction Guarantee
9//!
10//! The fundamental challenge with VSA encoding is that approximate operations
11//! may introduce errors during superposition. This module solves that through
12//! a multi-layer approach:
13//!
14//! 1. **Primary Encoding**: SparseVec encoding attempts bit-perfect storage
15//! 2. **Correction Layer**: CorrectionStore captures any encoding errors
16//! 3. **Reconstruction**: Decode + apply corrections = exact original
17//!
18//! The invariant: `original = decode(encode(original)) + correction`
19//!
20//! If encoding was perfect, correction is empty. If not, correction exactly
21//! compensates. Either way, reconstruction is guaranteed bit-perfect.
22
23use crate::correction::{CorrectionStats, CorrectionStore};
24use embeddenator_retrieval::resonator::Resonator;
25use embeddenator_retrieval::{RerankedResult, TernaryInvertedIndex};
26use embeddenator_vsa::{ReversibleVSAConfig, SparseVec, DIM};
27use serde::{Deserialize, Serialize};
28use std::collections::BTreeMap;
29use std::collections::{HashMap, HashSet};
30use std::fs::{self, File};
31use std::io::{self, Read};
32use std::path::{Path, PathBuf};
33use walkdir::WalkDir;
34
/// Default chunk size for file encoding (4KB).
///
/// Files are split into chunks of this many bytes before VSA encoding;
/// the final chunk of a file may be shorter.
pub const DEFAULT_CHUNK_SIZE: usize = 4096;
37
/// File entry in the manifest
#[derive(Serialize, Deserialize, Debug)]
pub struct FileEntry {
    /// Logical path inside the engram (forward-slash separated).
    pub path: String,
    /// Heuristic text/binary classification recorded at ingest time.
    pub is_text: bool,
    /// Original file size in bytes.
    pub size: usize,
    /// Global chunk IDs (codebook keys) holding this file's data, in order.
    pub chunks: Vec<usize>,
    /// Mark files as deleted without rebuilding root (for incremental updates)
    #[serde(default)]
    pub deleted: bool,
}
49
/// Manifest describing filesystem structure
#[derive(Serialize, Deserialize, Debug)]
pub struct Manifest {
    /// All ingested files, including entries marked `deleted`.
    pub files: Vec<FileEntry>,
    /// Total chunks allocated so far; also serves as the next chunk ID.
    pub total_chunks: usize,
}
56
/// Hierarchical manifest for multi-level engrams
#[derive(Serialize, Deserialize, Debug)]
pub struct HierarchicalManifest {
    /// Manifest format version.
    pub version: u32,
    /// Levels of the hierarchy; queries seed their traversal from the first.
    pub levels: Vec<ManifestLevel>,
    /// Sub-engrams keyed by ID. May be empty when sub-engrams are loaded
    /// on demand through a `SubEngramStore` instead of held in memory.
    #[serde(default)]
    pub sub_engrams: HashMap<String, SubEngram>,
}
65
/// Level in hierarchical manifest
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestLevel {
    /// Numeric depth of this level (used for sorting on save).
    pub level: u32,
    /// Entries mapping logical paths to sub-engram IDs at this level.
    pub items: Vec<ManifestItem>,
}
72
/// Item in manifest level
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestItem {
    /// Logical path this item represents.
    pub path: String,
    /// ID of the sub-engram holding this item's data.
    pub sub_engram_id: String,
}
79
/// Sub-engram in hierarchical structure
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct SubEngram {
    /// Unique identifier; also used (escaped) as the on-disk filename stem.
    pub id: String,
    /// Bundled root vector summarizing this sub-engram's contents; queries
    /// score nodes by cosine similarity against it.
    pub root: SparseVec,
    /// Chunk IDs that belong to this sub-engram.
    ///
    /// This enables selective retrieval without indexing the entire global codebook.
    #[serde(default)]
    pub chunk_ids: Vec<usize>,
    // NOTE(review): presumably `chunk_ids.len()` — confirm whether descendant
    // chunks are counted as well.
    pub chunk_count: usize,
    /// IDs of child sub-engrams one level deeper in the hierarchy.
    pub children: Vec<String>,
}
93
/// Bounds and tuning parameters for hierarchical selective retrieval.
///
/// Consumed by `query_hierarchical_codebook_with_store`, which applies these
/// limits during its beam-limited traversal.
#[derive(Clone, Debug)]
pub struct HierarchicalQueryBounds {
    /// Global top-k results to return.
    pub k: usize,
    /// Candidate count per expanded node before reranking.
    pub candidate_k: usize,
    /// Maximum number of frontier nodes retained (beam width).
    pub beam_width: usize,
    /// Maximum depth to descend (0 means only level-0 nodes).
    pub max_depth: usize,
    /// Maximum number of expanded nodes.
    pub max_expansions: usize,
    /// Maximum number of cached inverted indices.
    pub max_open_indices: usize,
    /// Maximum number of cached sub-engrams.
    pub max_open_engrams: usize,
}
112
impl Default for HierarchicalQueryBounds {
    /// Conservative defaults suitable as a starting point; tune per workload.
    fn default() -> Self {
        Self {
            k: 10,
            candidate_k: 100,
            beam_width: 32,
            max_depth: 4,
            max_expansions: 128,
            max_open_indices: 16,
            max_open_engrams: 16,
        }
    }
}
126
/// A single chunk match returned by hierarchical queries.
#[derive(Clone, Debug, PartialEq)]
pub struct HierarchicalChunkHit {
    /// ID of the sub-engram whose local index produced this hit.
    pub sub_engram_id: String,
    /// Global chunk ID (codebook key).
    pub chunk_id: usize,
    /// Approximate score from the inverted-index candidate stage.
    pub approx_score: i32,
    /// Exact cosine similarity used for final ranking.
    pub cosine: f64,
}
134
/// Beam-search frontier entry: a sub-engram pending expansion.
#[derive(Clone, Debug)]
struct FrontierItem {
    /// Cosine similarity between the query and this node's root vector.
    score: f64,
    /// ID of the sub-engram to expand.
    sub_engram_id: String,
    /// Traversal depth (0 = level-0 node).
    depth: usize,
}
141
/// Inverted index over a node-local subset of chunks, with a remap table
/// from dense local IDs back to global codebook chunk IDs.
#[derive(Clone, Debug)]
struct RemappedInvertedIndex {
    /// Index keyed by dense local IDs.
    index: TernaryInvertedIndex,
    /// `local_to_global[local_id]` is the matching global chunk ID.
    local_to_global: Vec<usize>,
}
147
148impl RemappedInvertedIndex {
149 fn build(chunk_ids: &[usize], vectors: &HashMap<usize, SparseVec>) -> Self {
150 let mut index = TernaryInvertedIndex::new();
151 let mut local_to_global = Vec::with_capacity(chunk_ids.len());
152
153 for (local_id, &global_id) in chunk_ids.iter().enumerate() {
154 let Some(vec) = vectors.get(&global_id) else {
155 continue;
156 };
157 local_to_global.push(global_id);
158 index.add(local_id, vec);
159 }
160
161 index.finalize();
162 Self {
163 index,
164 local_to_global,
165 }
166 }
167
168 fn query_top_k_reranked(
169 &self,
170 query: &SparseVec,
171 vectors: &HashMap<usize, SparseVec>,
172 candidate_k: usize,
173 k: usize,
174 ) -> Vec<HierarchicalChunkHit> {
175 if k == 0 {
176 return Vec::new();
177 }
178
179 let candidates = self.index.query_top_k(query, candidate_k);
180 let mut out = Vec::with_capacity(candidates.len().min(k));
181 for cand in candidates {
182 let Some(&global_id) = self.local_to_global.get(cand.id) else {
183 continue;
184 };
185 let Some(vec) = vectors.get(&global_id) else {
186 continue;
187 };
188 out.push((global_id, cand.score, query.cosine(vec)));
189 }
190
191 out.sort_by(|a, b| {
192 b.2.total_cmp(&a.2)
193 .then_with(|| b.1.cmp(&a.1))
194 .then_with(|| a.0.cmp(&b.0))
195 });
196 out.truncate(k);
197
198 out.into_iter()
199 .map(|(chunk_id, approx_score, cosine)| HierarchicalChunkHit {
200 sub_engram_id: String::new(),
201 chunk_id,
202 approx_score,
203 cosine,
204 })
205 .collect()
206 }
207}
208
/// Tiny insertion-order-tracked LRU keyed by `String`.
///
/// Bookkeeping is linear-scan; intended for the small capacities used by the
/// hierarchical query caches. A capacity of zero disables caching entirely.
#[derive(Clone, Debug)]
struct LruCache<V> {
    cap: usize,
    map: HashMap<String, V>,
    order: Vec<String>,
}

impl<V> LruCache<V> {
    /// Create a cache holding at most `cap` entries.
    fn new(cap: usize) -> Self {
        Self {
            cap,
            map: HashMap::new(),
            order: Vec::new(),
        }
    }

    /// Look up `key`, marking it most-recently-used on a hit.
    fn get(&mut self, key: &str) -> Option<&V> {
        if !self.map.contains_key(key) {
            return None;
        }
        self.touch(key);
        self.map.get(key)
    }

    /// Insert or replace `key`, evicting least-recently-used entries if full.
    fn insert(&mut self, key: String, value: V) {
        if self.cap == 0 {
            return;
        }

        let replaced = self.map.insert(key.clone(), value).is_some();
        if replaced {
            self.touch(&key);
            return;
        }

        self.order.push(key);
        while self.map.len() > self.cap {
            match self.order.first().cloned() {
                Some(oldest) => {
                    self.order.remove(0);
                    self.map.remove(&oldest);
                }
                None => break,
            }
        }
    }

    /// Move `key` to the most-recently-used position, if present.
    fn touch(&mut self, key: &str) {
        let Some(pos) = self.order.iter().position(|entry| entry == key) else {
            return;
        };
        let entry = self.order.remove(pos);
        self.order.push(entry);
    }
}
264
/// Storage/loader seam for hierarchical sub-engrams.
///
/// This enables on-demand loading (e.g., from disk) rather than requiring that
/// every sub-engram is materialized in memory.
pub trait SubEngramStore {
    /// Return the sub-engram with `id`, or `None` if it cannot be provided.
    fn load(&self, id: &str) -> Option<SubEngram>;
}
272
/// Escape a sub-engram ID for safe use as a filename component.
///
/// Minimal reversible escaping: `%` becomes `%25` and `/` becomes `%2F`,
/// so escaped output never collides with a literal ID.
/// Note: not intended for untrusted input; IDs are internal.
fn escape_sub_engram_id(id: &str) -> String {
    let mut escaped = String::with_capacity(id.len());
    for ch in id.chars() {
        match ch {
            '%' => escaped.push_str("%25"),
            '/' => escaped.push_str("%2F"),
            other => escaped.push(other),
        }
    }
    escaped
}
278
/// Directory-backed store for sub-engrams.
///
/// Files are stored as bincode blobs under `${dir}/{escaped_id}.subengram`.
pub struct DirectorySubEngramStore {
    /// Root directory containing the `.subengram` files.
    dir: PathBuf,
}
285
286impl DirectorySubEngramStore {
287 pub fn new<P: AsRef<Path>>(dir: P) -> Self {
288 Self {
289 dir: dir.as_ref().to_path_buf(),
290 }
291 }
292
293 fn path_for_id(&self, id: &str) -> PathBuf {
294 self.dir
295 .join(format!("{}.subengram", escape_sub_engram_id(id)))
296 }
297}
298
299impl SubEngramStore for DirectorySubEngramStore {
300 fn load(&self, id: &str) -> Option<SubEngram> {
301 let path = self.path_for_id(id);
302 let data = fs::read(path).ok()?;
303 bincode::deserialize(&data).ok()
304 }
305}
306
307/// Save a hierarchical manifest as JSON.
308pub fn save_hierarchical_manifest<P: AsRef<Path>>(
309 hierarchical: &HierarchicalManifest,
310 path: P,
311) -> io::Result<()> {
312 let file = File::create(path)?;
313
314 // Serialize deterministically: HashMap iteration order is not stable.
315 #[derive(Serialize)]
316 struct StableHierarchicalManifest {
317 version: u32,
318 levels: Vec<ManifestLevel>,
319 sub_engrams: BTreeMap<String, SubEngram>,
320 }
321
322 let mut levels = hierarchical.levels.clone();
323 levels.sort_by(|a, b| a.level.cmp(&b.level));
324 for level in &mut levels {
325 level.items.sort_by(|a, b| {
326 a.path
327 .cmp(&b.path)
328 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
329 });
330 }
331
332 let mut sub_engrams: BTreeMap<String, SubEngram> = BTreeMap::new();
333 for (id, sub) in &hierarchical.sub_engrams {
334 sub_engrams.insert(id.clone(), sub.clone());
335 }
336
337 let stable = StableHierarchicalManifest {
338 version: hierarchical.version,
339 levels,
340 sub_engrams,
341 };
342
343 serde_json::to_writer_pretty(file, &stable)?;
344 Ok(())
345}
346
347/// Load a hierarchical manifest from JSON.
348pub fn load_hierarchical_manifest<P: AsRef<Path>>(path: P) -> io::Result<HierarchicalManifest> {
349 let file = File::open(path)?;
350 let manifest = serde_json::from_reader(file)?;
351 Ok(manifest)
352}
353
354/// Save a set of sub-engrams to a directory (bincode per sub-engram).
355pub fn save_sub_engrams_dir<P: AsRef<Path>>(
356 sub_engrams: &HashMap<String, SubEngram>,
357 dir: P,
358) -> io::Result<()> {
359 let dir = dir.as_ref();
360 fs::create_dir_all(dir)?;
361
362 let mut ids: Vec<&String> = sub_engrams.keys().collect();
363 ids.sort();
364
365 for id in ids {
366 // SAFETY: id comes from keys(), so get() must succeed
367 let sub = sub_engrams
368 .get(id)
369 .expect("sub_engram id from keys() must exist in HashMap");
370 let encoded = bincode::serialize(sub).map_err(io::Error::other)?;
371 let path = dir.join(format!("{}.subengram", escape_sub_engram_id(id)));
372 fs::write(path, encoded)?;
373 }
374 Ok(())
375}
376
/// Adapter exposing an in-memory sub-engram map through the
/// `SubEngramStore` trait; used by `query_hierarchical_codebook`.
struct InMemorySubEngramStore<'a> {
    map: &'a HashMap<String, SubEngram>,
}

impl<'a> InMemorySubEngramStore<'a> {
    /// Wrap a borrowed map of sub-engrams.
    fn new(map: &'a HashMap<String, SubEngram>) -> Self {
        Self { map }
    }
}

impl SubEngramStore for InMemorySubEngramStore<'_> {
    fn load(&self, id: &str) -> Option<SubEngram> {
        // Clone to satisfy the trait's owned return type.
        self.map.get(id).cloned()
    }
}
392
393fn get_cached_sub_engram(
394 cache: &mut LruCache<SubEngram>,
395 store: &impl SubEngramStore,
396 id: &str,
397) -> Option<SubEngram> {
398 if let Some(v) = cache.get(id) {
399 return Some(v.clone());
400 }
401 let loaded = store.load(id)?;
402 cache.insert(id.to_string(), loaded.clone());
403 Some(loaded)
404}
405
406/// Query a hierarchical manifest by selectively unfolding only promising sub-engrams.
407///
408/// This performs a beam-limited traversal over `hierarchical.sub_engrams`.
409/// At each expanded node, it builds (and LRU-caches) an inverted index over the
410/// node-local `chunk_ids` subset of `codebook`, then reranks by exact cosine.
411pub fn query_hierarchical_codebook(
412 hierarchical: &HierarchicalManifest,
413 codebook: &HashMap<usize, SparseVec>,
414 query: &SparseVec,
415 bounds: &HierarchicalQueryBounds,
416) -> Vec<HierarchicalChunkHit> {
417 let store = InMemorySubEngramStore::new(&hierarchical.sub_engrams);
418 query_hierarchical_codebook_with_store(hierarchical, &store, codebook, query, bounds)
419}
420
421/// Store-backed variant of `query_hierarchical_codebook` that supports on-demand sub-engram loading.
422pub fn query_hierarchical_codebook_with_store(
423 hierarchical: &HierarchicalManifest,
424 store: &impl SubEngramStore,
425 codebook: &HashMap<usize, SparseVec>,
426 query: &SparseVec,
427 bounds: &HierarchicalQueryBounds,
428) -> Vec<HierarchicalChunkHit> {
429 if bounds.k == 0 || hierarchical.levels.is_empty() {
430 return Vec::new();
431 }
432
433 let mut sub_cache: LruCache<SubEngram> = LruCache::new(bounds.max_open_engrams);
434 let mut index_cache: LruCache<RemappedInvertedIndex> = LruCache::new(bounds.max_open_indices);
435
436 let mut frontier: Vec<FrontierItem> = Vec::new();
437 if let Some(level0) = hierarchical.levels.first() {
438 for item in &level0.items {
439 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &item.sub_engram_id)
440 else {
441 continue;
442 };
443 frontier.push(FrontierItem {
444 score: query.cosine(&sub.root),
445 sub_engram_id: item.sub_engram_id.clone(),
446 depth: 0,
447 });
448 }
449 }
450
451 frontier.sort_by(|a, b| {
452 b.score
453 .total_cmp(&a.score)
454 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
455 });
456 if frontier.len() > bounds.beam_width {
457 frontier.truncate(bounds.beam_width);
458 }
459
460 let mut expansions = 0usize;
461
462 // Keep only the best hit per chunk for determinism.
463 let mut best_by_chunk: HashMap<usize, HierarchicalChunkHit> = HashMap::new();
464
465 while !frontier.is_empty() && expansions < bounds.max_expansions {
466 let node = frontier.remove(0);
467
468 let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &node.sub_engram_id) else {
469 continue;
470 };
471
472 expansions += 1;
473
474 let idx = if let Some(existing) = index_cache.get(&node.sub_engram_id) {
475 existing
476 } else {
477 let built = RemappedInvertedIndex::build(&sub.chunk_ids, codebook);
478 index_cache.insert(node.sub_engram_id.clone(), built);
479 // SAFETY: we just inserted the key, so get() must succeed immediately after
480 index_cache
481 .get(&node.sub_engram_id)
482 .expect("index_cache.get() must succeed immediately after insert()")
483 };
484
485 let mut local_hits =
486 idx.query_top_k_reranked(query, codebook, bounds.candidate_k, bounds.k);
487 for hit in &mut local_hits {
488 hit.sub_engram_id = node.sub_engram_id.clone();
489 }
490
491 for hit in local_hits {
492 match best_by_chunk.get(&hit.chunk_id) {
493 None => {
494 best_by_chunk.insert(hit.chunk_id, hit);
495 }
496 Some(existing) => {
497 let better = hit
498 .cosine
499 .total_cmp(&existing.cosine)
500 .then_with(|| hit.approx_score.cmp(&existing.approx_score))
501 .is_gt();
502 if better {
503 best_by_chunk.insert(hit.chunk_id, hit);
504 }
505 }
506 }
507 }
508
509 if node.depth >= bounds.max_depth {
510 continue;
511 }
512
513 let children = sub.children.clone();
514 for child_id in &children {
515 let Some(child) = get_cached_sub_engram(&mut sub_cache, store, child_id) else {
516 continue;
517 };
518 frontier.push(FrontierItem {
519 score: query.cosine(&child.root),
520 sub_engram_id: child_id.clone(),
521 depth: node.depth + 1,
522 });
523 }
524
525 frontier.sort_by(|a, b| {
526 b.score
527 .total_cmp(&a.score)
528 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
529 });
530 if frontier.len() > bounds.beam_width {
531 frontier.truncate(bounds.beam_width);
532 }
533 }
534
535 let mut out: Vec<HierarchicalChunkHit> = best_by_chunk.into_values().collect();
536 out.sort_by(|a, b| {
537 b.cosine
538 .total_cmp(&a.cosine)
539 .then_with(|| b.approx_score.cmp(&a.approx_score))
540 .then_with(|| a.chunk_id.cmp(&b.chunk_id))
541 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
542 });
543 out.truncate(bounds.k);
544 out
545}
546
/// Unified manifest enum for backward compatibility
#[derive(Serialize, Deserialize, Debug)]
pub enum UnifiedManifest {
    /// Single-level manifest (original flat format).
    Flat(Manifest),
    /// Multi-level manifest with sub-engrams.
    Hierarchical(HierarchicalManifest),
}
553
impl From<Manifest> for UnifiedManifest {
    /// Wrap a flat manifest in the unified enum.
    fn from(manifest: Manifest) -> Self {
        UnifiedManifest::Flat(manifest)
    }
}
559
/// Engram: holographic encoding of a filesystem with correction guarantee
#[derive(Serialize, Deserialize)]
pub struct Engram {
    /// Superposition (bundle) of every ingested chunk vector.
    pub root: SparseVec,
    /// Per-chunk sparse vectors, keyed by global chunk ID.
    pub codebook: HashMap<usize, SparseVec>,
    /// Correction store for 100% reconstruction guarantee
    #[serde(default)]
    pub corrections: CorrectionStore,
}
569
570impl Engram {
571 /// Build a reusable inverted index over the codebook.
572 ///
573 /// This is useful when issuing multiple queries (e.g., shift-sweeps) and you
574 /// want to avoid rebuilding the index each time.
575 pub fn build_codebook_index(&self) -> TernaryInvertedIndex {
576 TernaryInvertedIndex::build_from_map(&self.codebook)
577 }
578
579 /// Query the codebook using a pre-built inverted index.
580 pub fn query_codebook_with_index(
581 &self,
582 index: &TernaryInvertedIndex,
583 query: &SparseVec,
584 candidate_k: usize,
585 k: usize,
586 ) -> Vec<RerankedResult> {
587 if k == 0 || self.codebook.is_empty() {
588 return Vec::new();
589 }
590 index.query_top_k_reranked(query, &self.codebook, candidate_k, k)
591 }
592
593 /// Query the engram's codebook for chunks most similar to `query`.
594 ///
595 /// This builds an inverted index over the codebook for sub-linear candidate
596 /// generation, then reranks those candidates using exact cosine similarity.
597 pub fn query_codebook(&self, query: &SparseVec, k: usize) -> Vec<RerankedResult> {
598 if k == 0 || self.codebook.is_empty() {
599 return Vec::new();
600 }
601
602 // Simple heuristic: rerank a moderately-sized candidate set.
603 let candidate_k = (k.saturating_mul(10)).max(50);
604 let index = self.build_codebook_index();
605 self.query_codebook_with_index(&index, query, candidate_k, k)
606 }
607}
608
609/// EmbrFS - Holographic Filesystem with Guaranteed Reconstruction
610///
611/// # 100% Reconstruction Guarantee
612///
613/// EmbrFS guarantees bit-perfect file reconstruction through a layered approach:
614///
615/// 1. **Encode**: Data chunks → SparseVec via reversible encoding
616/// 2. **Verify**: Immediately decode and compare to original
617/// 3. **Correct**: Store minimal correction if any difference exists
618/// 4. **Extract**: Decode + apply correction = exact original bytes
619///
620/// This guarantee holds regardless of:
621/// - Data content (binary, text, compressed, encrypted)
622/// - File size (single byte to gigabytes)
623/// - Number of files in the engram
624/// - Superposition crosstalk in bundles
625///
626/// # Examples
627///
628/// ```
629/// use embeddenator_fs::EmbrFS;
630/// use std::path::Path;
631///
632/// let mut fs = EmbrFS::new();
633/// // Ingest and extract would require actual files, so we just test creation
634/// assert_eq!(fs.manifest.total_chunks, 0);
635/// assert_eq!(fs.manifest.files.len(), 0);
636/// ```
pub struct EmbrFS {
    /// File metadata and chunk layout for everything ingested.
    pub manifest: Manifest,
    /// Holographic storage: root bundle, chunk codebook, and corrections.
    pub engram: Engram,
    /// Optional resonator for pattern recovery during extraction.
    pub resonator: Option<Resonator>,
}
642
impl Default for EmbrFS {
    /// Delegates to [`EmbrFS::new`].
    fn default() -> Self {
        Self::new()
    }
}
648
649impl EmbrFS {
650 /// Create a new empty EmbrFS instance
651 ///
652 /// # Examples
653 ///
654 /// ```
655 /// use embeddenator_fs::EmbrFS;
656 ///
657 /// let fs = EmbrFS::new();
658 /// assert_eq!(fs.manifest.files.len(), 0);
659 /// assert_eq!(fs.manifest.total_chunks, 0);
660 /// // Correction store starts empty
661 /// let stats = fs.engram.corrections.stats();
662 /// assert_eq!(stats.total_chunks, 0);
663 /// ```
664 pub fn new() -> Self {
665 EmbrFS {
666 manifest: Manifest {
667 files: Vec::new(),
668 total_chunks: 0,
669 },
670 engram: Engram {
671 root: SparseVec::new(),
672 codebook: HashMap::new(),
673 corrections: CorrectionStore::new(),
674 },
675 resonator: None,
676 }
677 }
678
679 fn path_to_forward_slash_string(path: &Path) -> String {
680 path.components()
681 .filter_map(|c| match c {
682 std::path::Component::Normal(s) => s.to_str().map(|v| v.to_string()),
683 _ => None,
684 })
685 .collect::<Vec<String>>()
686 .join("/")
687 }
688
689 /// Set the resonator for enhanced pattern recovery during extraction
690 ///
691 /// Configures a resonator network that can perform pattern completion to recover
692 /// missing or corrupted data chunks during filesystem extraction. The resonator
693 /// acts as a content-addressable memory that can reconstruct lost information
694 /// by finding the best matching patterns in its trained codebook.
695 ///
696 /// # How it works
697 /// - The resonator maintains a codebook of known vector patterns
698 /// - During extraction, missing chunks are projected onto the closest known pattern
699 /// - This enables robust recovery from partial data loss or corruption
700 ///
701 /// # Why this matters
702 /// - Provides fault tolerance for holographic storage systems
703 /// - Enables reconstruction even when some chunks are unavailable
704 /// - Supports graceful degradation rather than complete failure
705 ///
706 /// # Arguments
707 /// * `resonator` - A trained resonator network for pattern completion
708 ///
709 /// # Examples
710 /// ```
711 /// use embeddenator_fs::{EmbrFS, Resonator};
712 ///
713 /// let mut fs = EmbrFS::new();
714 /// let resonator = Resonator::new();
715 /// fs.set_resonator(resonator);
716 /// // Now extraction will use resonator-enhanced recovery
717 /// ```
    pub fn set_resonator(&mut self, resonator: Resonator) {
        // Replace any previously configured resonator.
        self.resonator = Some(resonator);
    }
721
722 /// Get correction statistics for this engram
723 ///
724 /// Returns statistics about how many chunks needed correction and the
725 /// overhead incurred by storing corrections.
726 ///
727 /// # Examples
728 /// ```
729 /// use embeddenator_fs::EmbrFS;
730 ///
731 /// let fs = EmbrFS::new();
732 /// let stats = fs.correction_stats();
733 /// assert_eq!(stats.total_chunks, 0);
734 /// ```
    pub fn correction_stats(&self) -> CorrectionStats {
        // Delegate to the correction store's own aggregation.
        self.engram.corrections.stats()
    }
738
    /// Ingest an entire directory into engram format
    ///
    /// Equivalent to `ingest_directory_with_prefix` with no logical prefix:
    /// files are stored under their paths relative to `dir`.
    ///
    /// # Arguments
    /// * `dir` - Directory to walk recursively
    /// * `verbose` - Print progress information
    /// * `config` - VSA encoding configuration
    pub fn ingest_directory<P: AsRef<Path>>(
        &mut self,
        dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        self.ingest_directory_with_prefix(dir, None, verbose, config)
    }
748
749 /// Ingest a directory into the engram, optionally prefixing all logical paths.
750 ///
751 /// When `logical_prefix` is provided, all ingested file paths become:
752 /// `{logical_prefix}/{relative_path_from_dir}`.
753 pub fn ingest_directory_with_prefix<P: AsRef<Path>>(
754 &mut self,
755 dir: P,
756 logical_prefix: Option<&str>,
757 verbose: bool,
758 config: &ReversibleVSAConfig,
759 ) -> io::Result<()> {
760 let dir = dir.as_ref();
761 if verbose {
762 println!("Ingesting directory: {}", dir.display());
763 }
764
765 let mut files_to_process = Vec::new();
766 for entry in WalkDir::new(dir).follow_links(false) {
767 let entry = entry?;
768 if entry.file_type().is_file() {
769 files_to_process.push(entry.path().to_path_buf());
770 }
771 }
772 files_to_process.sort();
773
774 for file_path in files_to_process {
775 let relative = file_path.strip_prefix(dir).unwrap_or(file_path.as_path());
776 let rel = Self::path_to_forward_slash_string(relative);
777 let logical_path = if let Some(prefix) = logical_prefix {
778 if prefix.is_empty() {
779 rel
780 } else if rel.is_empty() {
781 prefix.to_string()
782 } else {
783 format!("{}/{}", prefix, rel)
784 }
785 } else {
786 rel
787 };
788
789 self.ingest_file(&file_path, logical_path, verbose, config)?;
790 }
791
792 Ok(())
793 }
794
795 /// Ingest a single file into the engram with guaranteed reconstruction
796 ///
797 /// This method encodes file data into sparse vectors and stores any
798 /// necessary corrections to guarantee 100% bit-perfect reconstruction.
799 ///
800 /// # Correction Process
801 ///
802 /// For each chunk:
803 /// 1. Encode: `chunk_data → SparseVec`
804 /// 2. Decode: `SparseVec → decoded_data`
805 /// 3. Compare: `chunk_data == decoded_data?`
806 /// 4. If different: store correction in `CorrectionStore`
807 ///
808 /// # Arguments
809 /// * `file_path` - Path to the file on disk
810 /// * `logical_path` - Path to use in the engram manifest
811 /// * `verbose` - Print progress information
812 /// * `config` - VSA encoding configuration
813 ///
814 /// # Returns
815 /// `io::Result<()>` indicating success or failure
    pub fn ingest_file<P: AsRef<Path>>(
        &mut self,
        file_path: P,
        logical_path: String,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let file_path = file_path.as_ref();
        let mut file = File::open(file_path)?;
        let mut data = Vec::new();
        file.read_to_end(&mut data)?;

        // Heuristic text/binary classification, recorded in the manifest.
        let is_text = is_text_file(&data);

        if verbose {
            println!(
                "Ingesting {}: {} bytes ({})",
                logical_path,
                data.len(),
                if is_text { "text" } else { "binary" }
            );
        }

        let chunk_size = DEFAULT_CHUNK_SIZE;
        let mut chunks = Vec::new();
        let mut corrections_needed = 0usize;

        for (i, chunk) in data.chunks(chunk_size).enumerate() {
            // Chunk IDs are globally sequential across all ingested files.
            let chunk_id = self.manifest.total_chunks + i;

            // Encode chunk to sparse vector
            let chunk_vec = SparseVec::encode_data(chunk, config, Some(&logical_path));

            // Immediately verify: decode and compare
            let decoded = chunk_vec.decode_data(config, Some(&logical_path), chunk.len());

            // Store correction if needed (guarantees reconstruction)
            self.engram
                .corrections
                .add(chunk_id as u64, chunk, &decoded);

            if chunk != decoded.as_slice() {
                corrections_needed += 1;
            }

            // Superpose the chunk into the root and register it in the codebook.
            self.engram.root = self.engram.root.bundle(&chunk_vec);
            self.engram.codebook.insert(chunk_id, chunk_vec);
            chunks.push(chunk_id);
        }

        if verbose && corrections_needed > 0 {
            println!(
                " → {} of {} chunks needed correction",
                corrections_needed,
                chunks.len()
            );
        }

        self.manifest.files.push(FileEntry {
            path: logical_path,
            is_text,
            size: data.len(),
            chunks: chunks.clone(),
            deleted: false,
        });

        // Advance the global chunk counter so the next file continues the sequence.
        self.manifest.total_chunks += chunks.len();

        Ok(())
    }
886
887 /// Add a new file to an existing engram (incremental update)
888 ///
889 /// This method enables efficient incremental updates by adding a single file
890 /// to an existing engram without requiring full re-ingestion. The new file's
891 /// chunks are bundled with the existing root vector using VSA's associative
892 /// bundle operation.
893 ///
894 /// # Algorithm
895 /// 1. Encode new file into chunks (same as ingest_file)
896 /// 2. Bundle each chunk with existing root: `root_new = root_old ⊕ chunk`
897 /// 3. Add chunks to codebook with new chunk IDs
898 /// 4. Update manifest with new file entry
899 ///
900 /// # Performance
901 /// - Time complexity: O(n) where n = number of chunks in new file
902 /// - Does not require reading or re-encoding existing files
903 /// - Suitable for production workflows with frequent additions
904 ///
905 /// # Arguments
906 /// * `file_path` - Path to the file on disk
907 /// * `logical_path` - Path to use in the engram manifest
908 /// * `verbose` - Print progress information
909 /// * `config` - VSA encoding configuration
910 ///
911 /// # Returns
912 /// `io::Result<()>` indicating success or failure
913 ///
914 /// # Examples
915 /// ```no_run
916 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
917 /// use std::path::Path;
918 ///
919 /// let mut fs = EmbrFS::new();
920 /// let config = ReversibleVSAConfig::default();
921 ///
922 /// // Ingest initial dataset
923 /// fs.ingest_directory("./data", false, &config).unwrap();
924 ///
925 /// // Later, add a new file without full re-ingestion
926 /// fs.add_file("./new_file.txt", "new_file.txt".to_string(), true, &config).unwrap();
927 /// ```
928 pub fn add_file<P: AsRef<Path>>(
929 &mut self,
930 file_path: P,
931 logical_path: String,
932 verbose: bool,
933 config: &ReversibleVSAConfig,
934 ) -> io::Result<()> {
935 let file_path = file_path.as_ref();
936
937 // Check if file already exists (not deleted)
938 if self
939 .manifest
940 .files
941 .iter()
942 .any(|f| f.path == logical_path && !f.deleted)
943 {
944 return Err(io::Error::new(
945 io::ErrorKind::AlreadyExists,
946 format!("File '{}' already exists in engram", logical_path),
947 ));
948 }
949
950 // Use existing ingest_file logic (already handles bundling with root)
951 self.ingest_file(file_path, logical_path, verbose, config)
952 }
953
954 /// Remove a file from the engram (mark as deleted for incremental update)
955 ///
956 /// This method marks a file as deleted in the manifest without modifying the
957 /// root vector. This is because VSA bundling is a lossy operation and there's
958 /// no clean inverse. The chunks remain in the codebook but won't be extracted.
959 ///
960 /// # Algorithm
961 /// 1. Find file in manifest by logical path
962 /// 2. Mark file entry as deleted
963 /// 3. Chunks remain in codebook (for potential recovery or compaction)
964 /// 4. File won't appear in future extractions
965 ///
966 /// # Note on VSA Limitations
967 /// Bundle operation is associative but not invertible:
968 /// - `(A ⊕ B) ⊕ C = A ⊕ (B ⊕ C)` ✓ (can add)
969 /// - `(A ⊕ B) ⊖ B ≠ A` ✗ (can't cleanly remove)
970 ///
971 /// To truly remove chunks from the root, use `compact()` which rebuilds
972 /// the engram without deleted files.
973 ///
974 /// # Arguments
975 /// * `logical_path` - Path of the file to remove
976 /// * `verbose` - Print progress information
977 ///
978 /// # Returns
979 /// `io::Result<()>` indicating success or failure
980 ///
981 /// # Examples
982 /// ```no_run
983 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
984 ///
985 /// let mut fs = EmbrFS::new();
986 /// let config = ReversibleVSAConfig::default();
987 ///
988 /// fs.ingest_directory("./data", false, &config).unwrap();
989 /// fs.remove_file("old_file.txt", true).unwrap();
990 /// // File marked as deleted, won't be extracted
991 /// ```
992 pub fn remove_file(&mut self, logical_path: &str, verbose: bool) -> io::Result<()> {
993 // Find file in manifest
994 let file_entry = self
995 .manifest
996 .files
997 .iter_mut()
998 .find(|f| f.path == logical_path && !f.deleted)
999 .ok_or_else(|| {
1000 io::Error::new(
1001 io::ErrorKind::NotFound,
1002 format!("File '{}' not found in engram", logical_path),
1003 )
1004 })?;
1005
1006 if verbose {
1007 println!(
1008 "Marking file as deleted: {} ({} chunks)",
1009 logical_path,
1010 file_entry.chunks.len()
1011 );
1012 }
1013
1014 // Mark as deleted (don't remove from manifest to preserve chunk IDs)
1015 file_entry.deleted = true;
1016
1017 if verbose {
1018 println!(" Note: Use 'compact' to rebuild engram and reclaim space");
1019 }
1020
1021 Ok(())
1022 }
1023
1024 /// Modify an existing file in the engram (incremental update)
1025 ///
1026 /// This method updates a file's content by removing the old version and
1027 /// adding the new version. It's equivalent to `remove_file` + `add_file`.
1028 ///
1029 /// # Algorithm
1030 /// 1. Mark old file as deleted
1031 /// 2. Re-encode new file content
1032 /// 3. Bundle new chunks with root
1033 /// 4. Add new file entry to manifest
1034 ///
1035 /// # Trade-offs
1036 /// - Old chunks remain in codebook (use `compact()` to clean up)
1037 /// - Root contains both old and new chunk contributions (slight noise)
1038 /// - Fast operation, doesn't require rebuilding entire engram
1039 ///
1040 /// # Arguments
1041 /// * `file_path` - Path to the file on disk (new content)
1042 /// * `logical_path` - Path of the file in the engram
1043 /// * `verbose` - Print progress information
1044 /// * `config` - VSA encoding configuration
1045 ///
1046 /// # Returns
1047 /// `io::Result<()>` indicating success or failure
1048 ///
1049 /// # Examples
1050 /// ```no_run
1051 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1052 /// use std::path::Path;
1053 ///
1054 /// let mut fs = EmbrFS::new();
1055 /// let config = ReversibleVSAConfig::default();
1056 ///
1057 /// fs.ingest_directory("./data", false, &config).unwrap();
1058 ///
1059 /// // Later, modify a file
1060 /// fs.modify_file("./data/updated.txt", "data/updated.txt".to_string(), true, &config).unwrap();
1061 /// ```
1062 pub fn modify_file<P: AsRef<Path>>(
1063 &mut self,
1064 file_path: P,
1065 logical_path: String,
1066 verbose: bool,
1067 config: &ReversibleVSAConfig,
1068 ) -> io::Result<()> {
1069 // First, mark old file as deleted
1070 self.remove_file(&logical_path, false)?;
1071
1072 if verbose {
1073 println!("Modifying file: {}", logical_path);
1074 }
1075
1076 // Then add the new version
1077 self.ingest_file(file_path, logical_path, verbose, config)?;
1078
1079 Ok(())
1080 }
1081
1082 /// Compact the engram by rebuilding without deleted files
1083 ///
1084 /// This operation rebuilds the engram from scratch, excluding all files
1085 /// marked as deleted. It's the only way to truly remove old chunks from
1086 /// the root vector and codebook.
1087 ///
1088 /// # Algorithm
1089 /// 1. Create new empty engram
1090 /// 2. Re-bundle all non-deleted files
1091 /// 3. Reassign chunk IDs sequentially
1092 /// 4. Replace old engram with compacted version
1093 ///
1094 /// # Performance
1095 /// - Time complexity: O(N) where N = total bytes of non-deleted files
1096 /// - Expensive operation, run periodically (not after every deletion)
1097 /// - Recommended: compact when deleted files exceed 20-30% of total
1098 ///
1099 /// # Benefits
1100 /// - Reclaims space from deleted chunks
1101 /// - Reduces root vector noise from obsolete data
1102 /// - Resets chunk IDs to sequential order
1103 /// - Maintains bit-perfect reconstruction of kept files
1104 ///
1105 /// # Arguments
1106 /// * `verbose` - Print progress information
1107 /// * `config` - VSA encoding configuration
1108 ///
1109 /// # Returns
1110 /// `io::Result<()>` indicating success or failure
1111 ///
1112 /// # Examples
1113 /// ```no_run
1114 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1115 ///
1116 /// let mut fs = EmbrFS::new();
1117 /// let config = ReversibleVSAConfig::default();
1118 ///
1119 /// fs.ingest_directory("./data", false, &config).unwrap();
1120 /// fs.remove_file("old1.txt", false).unwrap();
1121 /// fs.remove_file("old2.txt", false).unwrap();
1122 ///
1123 /// // After many deletions, compact to reclaim space
1124 /// fs.compact(true, &config).unwrap();
1125 /// ```
1126 pub fn compact(&mut self, verbose: bool, config: &ReversibleVSAConfig) -> io::Result<()> {
1127 if verbose {
1128 let deleted_count = self.manifest.files.iter().filter(|f| f.deleted).count();
1129 let total_count = self.manifest.files.len();
1130 println!(
1131 "Compacting engram: removing {} deleted files ({} remaining)",
1132 deleted_count,
1133 total_count - deleted_count
1134 );
1135 }
1136
1137 // Create new engram with fresh root and codebook
1138 let mut new_engram = Engram {
1139 root: SparseVec::new(),
1140 codebook: HashMap::new(),
1141 corrections: CorrectionStore::new(),
1142 };
1143
1144 // Rebuild manifest with only non-deleted files
1145 let mut new_manifest = Manifest {
1146 files: Vec::new(),
1147 total_chunks: 0,
1148 };
1149
1150 // Process each non-deleted file
1151 for old_file in &self.manifest.files {
1152 if old_file.deleted {
1153 continue;
1154 }
1155
1156 // Reconstruct file data from old engram
1157 let mut file_data = Vec::new();
1158 let num_chunks = old_file.chunks.len();
1159 for (chunk_idx, &chunk_id) in old_file.chunks.iter().enumerate() {
1160 if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
1161 let chunk_size = if chunk_idx == num_chunks - 1 {
1162 let remaining = old_file.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1163 remaining.min(DEFAULT_CHUNK_SIZE)
1164 } else {
1165 DEFAULT_CHUNK_SIZE
1166 };
1167
1168 let decoded = chunk_vec.decode_data(config, Some(&old_file.path), chunk_size);
1169 let chunk_data = if let Some(corrected) =
1170 self.engram.corrections.apply(chunk_id as u64, &decoded)
1171 {
1172 corrected
1173 } else {
1174 decoded
1175 };
1176
1177 file_data.extend_from_slice(&chunk_data);
1178 }
1179 }
1180 file_data.truncate(old_file.size);
1181
1182 // Re-encode with new chunk IDs
1183 let mut new_chunks = Vec::new();
1184
1185 for (i, chunk) in file_data.chunks(DEFAULT_CHUNK_SIZE).enumerate() {
1186 let new_chunk_id = new_manifest.total_chunks + i;
1187
1188 let chunk_vec = SparseVec::encode_data(chunk, config, Some(&old_file.path));
1189 let decoded = chunk_vec.decode_data(config, Some(&old_file.path), chunk.len());
1190
1191 new_engram
1192 .corrections
1193 .add(new_chunk_id as u64, chunk, &decoded);
1194
1195 new_engram.root = new_engram.root.bundle(&chunk_vec);
1196 new_engram.codebook.insert(new_chunk_id, chunk_vec);
1197 new_chunks.push(new_chunk_id);
1198 }
1199
1200 if verbose {
1201 println!(
1202 " Recompacted: {} ({} chunks)",
1203 old_file.path,
1204 new_chunks.len()
1205 );
1206 }
1207
1208 new_manifest.files.push(FileEntry {
1209 path: old_file.path.clone(),
1210 is_text: old_file.is_text,
1211 size: old_file.size,
1212 chunks: new_chunks.clone(),
1213 deleted: false,
1214 });
1215
1216 new_manifest.total_chunks += new_chunks.len();
1217 }
1218
1219 // Replace old engram and manifest with compacted versions
1220 self.engram = new_engram;
1221 self.manifest = new_manifest;
1222
1223 if verbose {
1224 println!(
1225 "Compaction complete: {} files, {} chunks",
1226 self.manifest.files.len(),
1227 self.manifest.total_chunks
1228 );
1229 }
1230
1231 Ok(())
1232 }
1233
1234 /// Save engram to file
1235 pub fn save_engram<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1236 let encoded = bincode::serialize(&self.engram).map_err(io::Error::other)?;
1237 fs::write(path, encoded)?;
1238 Ok(())
1239 }
1240
1241 /// Load engram from file
1242 pub fn load_engram<P: AsRef<Path>>(path: P) -> io::Result<Engram> {
1243 let data = fs::read(path)?;
1244 bincode::deserialize(&data).map_err(io::Error::other)
1245 }
1246
1247 /// Save manifest to JSON file
1248 pub fn save_manifest<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
1249 let file = File::create(path)?;
1250 serde_json::to_writer_pretty(file, &self.manifest)?;
1251 Ok(())
1252 }
1253
1254 /// Load manifest from JSON file
1255 pub fn load_manifest<P: AsRef<Path>>(path: P) -> io::Result<Manifest> {
1256 let file = File::open(path)?;
1257 let manifest = serde_json::from_reader(file)?;
1258 Ok(manifest)
1259 }
1260
1261 /// Extract files from engram to directory with guaranteed reconstruction
1262 ///
1263 /// This method guarantees 100% bit-perfect reconstruction by applying
1264 /// stored corrections after decoding each chunk.
1265 ///
1266 /// # Reconstruction Process
1267 ///
1268 /// For each chunk:
1269 /// 1. Decode: `SparseVec → decoded_data`
1270 /// 2. Apply correction: `decoded_data + correction → original_data`
1271 /// 3. Verify: Hash matches stored hash (guaranteed by construction)
1272 ///
1273 /// # Arguments
1274 /// * `engram` - The engram containing encoded data and corrections
1275 /// * `manifest` - File metadata and chunk mappings
1276 /// * `output_dir` - Directory to write extracted files
1277 /// * `verbose` - Print progress information
1278 /// * `config` - VSA decoding configuration
1279 ///
1280 /// # Returns
1281 /// `io::Result<()>` indicating success or failure
1282 pub fn extract<P: AsRef<Path>>(
1283 engram: &Engram,
1284 manifest: &Manifest,
1285 output_dir: P,
1286 verbose: bool,
1287 config: &ReversibleVSAConfig,
1288 ) -> io::Result<()> {
1289 let output_dir = output_dir.as_ref();
1290
1291 if verbose {
1292 println!(
1293 "Extracting {} files to {}",
1294 manifest.files.iter().filter(|f| !f.deleted).count(),
1295 output_dir.display()
1296 );
1297 let stats = engram.corrections.stats();
1298 println!(
1299 " Correction stats: {:.1}% perfect, {:.2}% overhead",
1300 stats.perfect_ratio * 100.0,
1301 stats.correction_ratio * 100.0
1302 );
1303 }
1304
1305 for file_entry in &manifest.files {
1306 // Skip deleted files
1307 if file_entry.deleted {
1308 continue;
1309 }
1310
1311 let file_path = output_dir.join(&file_entry.path);
1312
1313 if let Some(parent) = file_path.parent() {
1314 fs::create_dir_all(parent)?;
1315 }
1316
1317 let mut reconstructed = Vec::new();
1318 let num_chunks = file_entry.chunks.len();
1319 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1320 if let Some(chunk_vec) = engram.codebook.get(&chunk_id) {
1321 // Calculate the actual chunk size
1322 // Last chunk may be smaller than DEFAULT_CHUNK_SIZE
1323 let chunk_size = if chunk_idx == num_chunks - 1 {
1324 // Last chunk: remaining bytes
1325 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1326 remaining.min(DEFAULT_CHUNK_SIZE)
1327 } else {
1328 DEFAULT_CHUNK_SIZE
1329 };
1330
1331 // Decode the sparse vector to bytes
1332 // IMPORTANT: Use the same path as during encoding for correct shift calculation
1333 // Also use the same chunk_size as during ingest for correct correction matching
1334 let decoded = chunk_vec.decode_data(config, Some(&file_entry.path), chunk_size);
1335
1336 // Apply correction to guarantee bit-perfect reconstruction
1337 let chunk_data = if let Some(corrected) =
1338 engram.corrections.apply(chunk_id as u64, &decoded)
1339 {
1340 corrected
1341 } else {
1342 // No correction found - use decoded directly
1343 // This can happen with legacy engrams or if correction store is empty
1344 decoded
1345 };
1346
1347 reconstructed.extend_from_slice(&chunk_data);
1348 }
1349 }
1350
1351 reconstructed.truncate(file_entry.size);
1352
1353 fs::write(&file_path, reconstructed)?;
1354
1355 if verbose {
1356 println!("Extracted: {}", file_entry.path);
1357 }
1358 }
1359
1360 Ok(())
1361 }
1362
1363 /// Extract files using resonator-enhanced pattern completion with guaranteed reconstruction
1364 ///
1365 /// Performs filesystem extraction with intelligent recovery capabilities powered by
1366 /// resonator networks. When chunks are missing from the codebook, the resonator
1367 /// attempts pattern completion to reconstruct the lost data, enabling extraction
1368 /// even from partially corrupted or incomplete engrams.
1369 ///
1370 /// # Reconstruction Guarantee
1371 ///
1372 /// Even with resonator-assisted recovery, corrections are applied to guarantee
1373 /// bit-perfect reconstruction. The process is:
1374 ///
1375 /// 1. Try to get chunk from codebook
1376 /// 2. If missing, use resonator to recover approximate chunk
1377 /// 3. Apply correction from CorrectionStore
1378 /// 4. Result is guaranteed bit-perfect (if correction exists)
1379 ///
1380 /// # How it works
1381 /// 1. For each file chunk, check if it exists in the engram codebook
1382 /// 2. If missing, use the resonator to project a query vector onto known patterns
1383 /// 3. Apply stored corrections for guaranteed accuracy
1384 /// 4. Reconstruct the file from available and recovered chunks
1385 /// 5. If no resonator is configured, falls back to standard extraction
1386 ///
1387 /// # Why this matters
1388 /// - Enables 100% reconstruction even with missing chunks
1389 /// - Provides fault tolerance for distributed storage scenarios
1390 /// - Supports hierarchical recovery at multiple levels of the storage stack
1391 /// - Maintains data integrity through pattern-based completion
1392 ///
1393 /// # Arguments
1394 /// * `output_dir` - Directory path where extracted files will be written
1395 /// * `verbose` - Whether to print progress information during extraction
1396 /// * `config` - VSA configuration for encoding/decoding
1397 ///
1398 /// # Returns
1399 /// `io::Result<()>` indicating success or failure of the extraction operation
1400 ///
1401 /// # Examples
1402 /// ```
1403 /// use embeddenator_fs::{EmbrFS, Resonator, ReversibleVSAConfig};
1404 /// use std::path::Path;
1405 ///
1406 /// let mut fs = EmbrFS::new();
1407 /// let resonator = Resonator::new();
1408 /// let config = ReversibleVSAConfig::default();
1409 /// fs.set_resonator(resonator);
1410 ///
1411 /// // Assuming fs has been populated with data...
1412 /// let result = fs.extract_with_resonator("/tmp/output", true, &config);
1413 /// assert!(result.is_ok());
1414 /// ```
1415 pub fn extract_with_resonator<P: AsRef<Path>>(
1416 &self,
1417 output_dir: P,
1418 verbose: bool,
1419 config: &ReversibleVSAConfig,
1420 ) -> io::Result<()> {
1421 if self.resonator.is_none() {
1422 return Self::extract(&self.engram, &self.manifest, output_dir, verbose, config);
1423 }
1424
1425 // SAFETY: we just checked is_none() above and returned early
1426 let _resonator = self
1427 .resonator
1428 .as_ref()
1429 .expect("resonator is Some after is_none() check");
1430 let output_dir = output_dir.as_ref();
1431
1432 if verbose {
1433 println!(
1434 "Extracting {} files with resonator enhancement to {}",
1435 self.manifest.files.iter().filter(|f| !f.deleted).count(),
1436 output_dir.display()
1437 );
1438 let stats = self.engram.corrections.stats();
1439 println!(
1440 " Correction stats: {:.1}% perfect, {:.2}% overhead",
1441 stats.perfect_ratio * 100.0,
1442 stats.correction_ratio * 100.0
1443 );
1444 }
1445
1446 for file_entry in &self.manifest.files {
1447 // Skip deleted files
1448 if file_entry.deleted {
1449 continue;
1450 }
1451
1452 let file_path = output_dir.join(&file_entry.path);
1453
1454 if let Some(parent) = file_path.parent() {
1455 fs::create_dir_all(parent)?;
1456 }
1457
1458 let mut reconstructed = Vec::new();
1459 let num_chunks = file_entry.chunks.len();
1460 for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
1461 // Calculate the actual chunk size
1462 let chunk_size = if chunk_idx == num_chunks - 1 {
1463 let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
1464 remaining.min(DEFAULT_CHUNK_SIZE)
1465 } else {
1466 DEFAULT_CHUNK_SIZE
1467 };
1468
1469 let chunk_data = if let Some(vector) = self.engram.codebook.get(&chunk_id) {
1470 // Decode the SparseVec back to bytes using reversible encoding
1471 // IMPORTANT: Use the same path as during encoding for correct shift calculation
1472 let decoded = vector.decode_data(config, Some(&file_entry.path), chunk_size);
1473
1474 // Apply correction to guarantee bit-perfect reconstruction
1475 if let Some(corrected) =
1476 self.engram.corrections.apply(chunk_id as u64, &decoded)
1477 {
1478 corrected
1479 } else {
1480 decoded
1481 }
1482 } else if let Some(resonator) = &self.resonator {
1483 // Use resonator to recover missing chunk
1484 // Create a query vector from the chunk_id using reversible encoding
1485 let query_vec = SparseVec::encode_data(&chunk_id.to_le_bytes(), config, None);
1486 let recovered_vec = resonator.project(&query_vec);
1487
1488 // Decode the recovered vector back to bytes
1489 // For resonator recovery, try with path first, fall back to no path
1490 let decoded =
1491 recovered_vec.decode_data(config, Some(&file_entry.path), chunk_size);
1492
1493 // Apply correction if available (may not be if chunk was lost)
1494 if let Some(corrected) =
1495 self.engram.corrections.apply(chunk_id as u64, &decoded)
1496 {
1497 corrected
1498 } else {
1499 // No correction available - best effort recovery
1500 decoded
1501 }
1502 } else {
1503 return Err(io::Error::new(
1504 io::ErrorKind::NotFound,
1505 format!("Missing chunk {} and no resonator available", chunk_id),
1506 ));
1507 };
1508 reconstructed.extend_from_slice(&chunk_data);
1509 }
1510
1511 reconstructed.truncate(file_entry.size);
1512
1513 fs::write(&file_path, reconstructed)?;
1514
1515 if verbose {
1516 println!("Extracted with resonator: {}", file_entry.path);
1517 }
1518 }
1519
1520 Ok(())
1521 }
1522
1523 /// Perform hierarchical bundling with path role binding and permutation tagging
1524 ///
1525 /// Creates multi-level engram structures where path components are encoded using
1526 /// permutation operations to create distinct representations at each level. This
1527 /// enables efficient hierarchical retrieval and reconstruction.
1528 ///
1529 /// # How it works
1530 /// 1. Split file paths into components (e.g., "a/b/c.txt" → ["a", "b", "c.txt"])
1531 /// 2. For each level, apply permutation based on path component hash
1532 /// 3. Bundle representations level-by-level with sparsity control
1533 /// 4. Create sub-engrams for intermediate nodes
1534 ///
1535 /// # Why this matters
1536 /// - Enables scalable hierarchical storage beyond flat bundling limits
1537 /// - Path-based retrieval without full engram traversal
1538 /// - Maintains semantic relationships through permutation encoding
1539 /// - Supports efficient partial reconstruction
1540 ///
1541 /// # Arguments
1542 /// * `max_level_sparsity` - Maximum non-zero elements per level bundle
1543 /// * `verbose` - Whether to print progress information
1544 ///
1545 /// # Returns
1546 /// HierarchicalManifest describing the multi-level structure
1547 ///
1548 /// # Examples
1549 /// ```
1550 /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
1551 ///
1552 /// let fs = EmbrFS::new();
1553 /// let config = ReversibleVSAConfig::default();
1554 /// // Assuming files have been ingested...
1555 ///
1556 /// let hierarchical = fs.bundle_hierarchically(500, false, &config);
1557 /// assert!(hierarchical.is_ok());
1558 /// ```
1559 pub fn bundle_hierarchically(
1560 &self,
1561 max_level_sparsity: usize,
1562 verbose: bool,
1563 _config: &ReversibleVSAConfig,
1564 ) -> io::Result<HierarchicalManifest> {
1565 self.bundle_hierarchically_with_options(max_level_sparsity, None, verbose, _config)
1566 }
1567
1568 /// Like `bundle_hierarchically`, but supports an optional deterministic cap on `chunk_ids` per node.
1569 ///
1570 /// If `max_chunks_per_node` is set and a node would exceed that many `chunk_ids`, the node becomes
1571 /// a router with empty `chunk_ids`, and deterministic shard children are created each containing a
1572 /// bounded subset of `chunk_ids`.
1573 pub fn bundle_hierarchically_with_options(
1574 &self,
1575 max_level_sparsity: usize,
1576 max_chunks_per_node: Option<usize>,
1577 verbose: bool,
1578 _config: &ReversibleVSAConfig,
1579 ) -> io::Result<HierarchicalManifest> {
1580 let mut levels = Vec::new();
1581 let mut sub_engrams = HashMap::new();
1582
1583 // Group files by *path prefixes* at each level.
1584 // Level 0: "a"; Level 1: "a/b"; etc.
1585 let mut level_prefixes: HashMap<usize, HashMap<String, Vec<&FileEntry>>> = HashMap::new();
1586 for file_entry in &self.manifest.files {
1587 let comps: Vec<&str> = file_entry.path.split('/').collect();
1588 let mut prefix = String::new();
1589 for (level, &comp) in comps.iter().enumerate() {
1590 if level == 0 {
1591 prefix.push_str(comp);
1592 } else {
1593 prefix.push('/');
1594 prefix.push_str(comp);
1595 }
1596 level_prefixes
1597 .entry(level)
1598 .or_default()
1599 .entry(prefix.clone())
1600 .or_default()
1601 .push(file_entry);
1602 }
1603 }
1604
1605 // Process each level
1606 let max_level = level_prefixes.keys().max().unwrap_or(&0);
1607
1608 for level in 0..=*max_level {
1609 if verbose {
1610 let item_count = level_prefixes
1611 .get(&level)
1612 .map(|comps| comps.values().map(|files| files.len()).sum::<usize>())
1613 .unwrap_or(0);
1614 println!("Processing level {} with {} items", level, item_count);
1615 }
1616
1617 let mut level_bundle = SparseVec::new();
1618 let mut manifest_items = Vec::new();
1619
1620 if let Some(prefixes) = level_prefixes.get(&level) {
1621 let mut prefix_keys: Vec<&String> = prefixes.keys().collect();
1622 prefix_keys.sort();
1623
1624 for prefix in prefix_keys {
1625 let mut files: Vec<&FileEntry> = prefixes
1626 .get(prefix)
1627 // SAFETY: prefix comes from keys(), so get() must succeed
1628 .expect("prefix key from keys() must exist in HashMap")
1629 .to_vec();
1630 files.sort_by(|a, b| a.path.cmp(&b.path));
1631
1632 // Create permutation shift based on prefix hash
1633 let shift = {
1634 use std::collections::hash_map::DefaultHasher;
1635 use std::hash::{Hash, Hasher};
1636 let mut hasher = DefaultHasher::new();
1637 prefix.hash(&mut hasher);
1638 (hasher.finish() % (DIM as u64)) as usize
1639 };
1640
1641 // Bundle all files under this component with permutation
1642 let mut component_bundle = SparseVec::new();
1643 let mut chunk_ids_set: HashSet<usize> = HashSet::new();
1644 for file_entry in &files {
1645 // Find chunks for this file and bundle them
1646 let mut file_bundle = SparseVec::new();
1647 for &chunk_id in &file_entry.chunks {
1648 if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
1649 file_bundle = file_bundle.bundle(chunk_vec);
1650 chunk_ids_set.insert(chunk_id);
1651 }
1652 }
1653
1654 // Apply level-based permutation
1655 let permuted_file = file_bundle.permute(shift * (level + 1));
1656 component_bundle = component_bundle.bundle(&permuted_file);
1657 }
1658
1659 // Apply sparsity control
1660 if component_bundle.pos.len() + component_bundle.neg.len() > max_level_sparsity
1661 {
1662 component_bundle = component_bundle.thin(max_level_sparsity);
1663 }
1664
1665 level_bundle = level_bundle.bundle(&component_bundle);
1666
1667 // Create sub-engram for this prefix.
1668 // Children are the immediate next-level prefixes underneath this prefix.
1669 let sub_id = format!("level_{}_prefix_{}", level, prefix);
1670
1671 let mut children_set: HashSet<String> = HashSet::new();
1672 if level < *max_level {
1673 for file_entry in &files {
1674 let comps: Vec<&str> = file_entry.path.split('/').collect();
1675 if comps.len() <= level + 1 {
1676 continue;
1677 }
1678 let child_prefix = comps[..=level + 1].join("/");
1679 let child_id = format!("level_{}_prefix_{}", level + 1, child_prefix);
1680 children_set.insert(child_id);
1681 }
1682 }
1683 let mut children: Vec<String> = children_set.into_iter().collect();
1684 children.sort();
1685
1686 let mut chunk_ids: Vec<usize> = chunk_ids_set.into_iter().collect();
1687 chunk_ids.sort_unstable();
1688
1689 let chunk_count: usize = files.iter().map(|f| f.chunks.len()).sum();
1690
1691 if let Some(max_chunks) = max_chunks_per_node.filter(|v| *v > 0) {
1692 if chunk_ids.len() > max_chunks {
1693 let mut shard_ids: Vec<String> = Vec::new();
1694 for (shard_idx, chunk_slice) in chunk_ids.chunks(max_chunks).enumerate()
1695 {
1696 let shard_id = format!("{}__shard_{:04}", sub_id, shard_idx);
1697 shard_ids.push(shard_id.clone());
1698 sub_engrams.insert(
1699 shard_id.clone(),
1700 SubEngram {
1701 id: shard_id,
1702 root: component_bundle.clone(),
1703 chunk_ids: chunk_slice.to_vec(),
1704 chunk_count: chunk_slice.len(),
1705 children: Vec::new(),
1706 },
1707 );
1708 }
1709
1710 let mut router_children = shard_ids;
1711 router_children.extend(children.clone());
1712 router_children.sort();
1713 router_children.dedup();
1714
1715 sub_engrams.insert(
1716 sub_id.clone(),
1717 SubEngram {
1718 id: sub_id.clone(),
1719 root: component_bundle,
1720 chunk_ids: Vec::new(),
1721 chunk_count,
1722 children: router_children,
1723 },
1724 );
1725 } else {
1726 sub_engrams.insert(
1727 sub_id.clone(),
1728 SubEngram {
1729 id: sub_id.clone(),
1730 root: component_bundle,
1731 chunk_ids,
1732 chunk_count,
1733 children,
1734 },
1735 );
1736 }
1737 } else {
1738 sub_engrams.insert(
1739 sub_id.clone(),
1740 SubEngram {
1741 id: sub_id.clone(),
1742 root: component_bundle,
1743 chunk_ids,
1744 chunk_count,
1745 children,
1746 },
1747 );
1748 }
1749
1750 manifest_items.push(ManifestItem {
1751 path: prefix.clone(),
1752 sub_engram_id: sub_id,
1753 });
1754 }
1755 }
1756
1757 manifest_items.sort_by(|a, b| {
1758 a.path
1759 .cmp(&b.path)
1760 .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
1761 });
1762
1763 // Apply final sparsity control to level bundle
1764 if level_bundle.pos.len() + level_bundle.neg.len() > max_level_sparsity {
1765 level_bundle = level_bundle.thin(max_level_sparsity);
1766 }
1767
1768 levels.push(ManifestLevel {
1769 level: level as u32,
1770 items: manifest_items,
1771 });
1772 }
1773
1774 Ok(HierarchicalManifest {
1775 version: 1,
1776 levels,
1777 sub_engrams,
1778 })
1779 }
1780
    /// Extract files from hierarchical manifest with manifest-guided traversal
    ///
    /// NOTE(review): despite the title above, reconstruction below reads every
    /// chunk directly from the flat `self.engram.codebook`; the `hierarchical`
    /// argument is only consulted for the verbose level-count message. True
    /// manifest-guided traversal of sub-engrams is not performed here —
    /// confirm intent before relying on partial / sub-engram extraction.
    ///
    /// # Arguments
    /// * `hierarchical` - Hierarchical manifest (currently used for reporting only; see note)
    /// * `output_dir` - Directory path where extracted files will be written
    /// * `verbose` - Whether to print progress information during extraction
    /// * `config` - VSA configuration used to decode each chunk
    ///
    /// # Returns
    /// `io::Result<()>` indicating success or failure of the hierarchical extraction
    ///
    /// # Examples
    /// ```
    /// use embeddenator_fs::{EmbrFS, ReversibleVSAConfig};
    ///
    /// let fs = EmbrFS::new();
    /// let config = ReversibleVSAConfig::default();
    /// // Assuming hierarchical manifest was created...
    /// // let hierarchical = fs.bundle_hierarchically(500, true).unwrap();
    ///
    /// // fs.extract_hierarchically(&hierarchical, "/tmp/output", true, &config)?;
    /// ```
    pub fn extract_hierarchically<P: AsRef<Path>>(
        &self,
        hierarchical: &HierarchicalManifest,
        output_dir: P,
        verbose: bool,
        config: &ReversibleVSAConfig,
    ) -> io::Result<()> {
        let output_dir = output_dir.as_ref();

        if verbose {
            println!(
                "Extracting hierarchical manifest with {} levels to {}",
                hierarchical.levels.len(),
                output_dir.display()
            );
        }

        // Iterate the flat manifest; the hierarchical structure is not
        // traversed for reconstruction (see NOTE above).
        for file_entry in &self.manifest.files {
            // Skip deleted files
            if file_entry.deleted {
                continue;
            }

            let file_path = output_dir.join(&file_entry.path);

            // Ensure the parent directory exists before writing.
            if let Some(parent) = file_path.parent() {
                fs::create_dir_all(parent)?;
            }

            let mut reconstructed = Vec::new();

            // Reconstruct each chunk from the flat codebook.
            // NOTE(review): a chunk id absent from the codebook is silently
            // skipped here, yielding a short output file with no error.
            let num_chunks = file_entry.chunks.len();
            for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
                if let Some(chunk_vector) = self.engram.codebook.get(&chunk_id) {
                    // Calculate the actual chunk size; the final chunk may be
                    // shorter than DEFAULT_CHUNK_SIZE.
                    let chunk_size = if chunk_idx == num_chunks - 1 {
                        let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
                        remaining.min(DEFAULT_CHUNK_SIZE)
                    } else {
                        DEFAULT_CHUNK_SIZE
                    };

                    // Decode with the file's path, which drives the shift
                    // calculation used at encode time.
                    let decoded =
                        chunk_vector.decode_data(config, Some(&file_entry.path), chunk_size);

                    // Apply correction if available (bit-perfect when present)
                    let chunk_data = if let Some(corrected) =
                        self.engram.corrections.apply(chunk_id as u64, &decoded)
                    {
                        corrected
                    } else {
                        decoded
                    };

                    reconstructed.extend_from_slice(&chunk_data);
                }
            }

            // Truncate to actual file size
            reconstructed.truncate(file_entry.size);

            fs::write(&file_path, reconstructed)?;

            if verbose {
                println!("Extracted hierarchical: {}", file_entry.path);
            }
        }

        Ok(())
    }
1891}
/// Heuristically classify `data` as text (`true`) or binary (`false`).
///
/// Inspects at most the first 8 KiB. The data is considered text when the
/// sampled bytes contain no NUL bytes and fewer than 10% of them are control
/// characters other than `\n`, `\r`, and `\t`. Empty input counts as text.
pub fn is_text_file(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    // Only examine a bounded prefix so large files stay cheap to classify.
    let sample = &data[..data.len().min(8192)];

    let mut nulls = 0usize;
    let mut controls = 0usize;
    for &b in sample {
        match b {
            0 => nulls += 1,
            b'\n' | b'\r' | b'\t' => {}
            _ if b < 32 => controls += 1,
            _ => {}
        }
    }

    nulls == 0 && controls < sample.len() / 10
}