use crate::correction::{CorrectionStats, CorrectionStore};
use embeddenator_retrieval::resonator::Resonator;
use embeddenator_retrieval::{RerankedResult, TernaryInvertedIndex};
use embeddenator_vsa::{ReversibleVSAConfig, ReversibleVSAEncoder, SparseVec, DIM};
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::collections::{HashMap, HashSet};
use std::fs::{self, File};
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
/// Default payload size, in bytes, for legacy (non-holographic) chunking.
pub const DEFAULT_CHUNK_SIZE: usize = 4096;

/// Manifest record for a single ingested file.
#[derive(Serialize, Deserialize, Debug)]
pub struct FileEntry {
    // Logical, forward-slash-separated path of the file inside the engram.
    pub path: String,
    // Result of the `is_text_file` heuristic at ingest time.
    pub is_text: bool,
    // Original file size in bytes (used to trim the final chunk on extract).
    pub size: usize,
    // Global chunk ids, in file order, that reconstruct this file.
    pub chunks: Vec<usize>,
    // Soft-delete flag; space is only reclaimed by `compact()`.
    #[serde(default)]
    pub deleted: bool,
}

/// Flat manifest describing every file and chunk in an engram.
#[derive(Serialize, Deserialize, Debug)]
pub struct Manifest {
    pub files: Vec<FileEntry>,
    // Running count of allocated chunk ids; the next id starts here.
    pub total_chunks: usize,
    // Chunk size used at ingest; defaults via `default_chunk_size` for
    // manifests serialized before this field existed.
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,
    // True when chunks were encoded with the reversible holographic encoder.
    #[serde(default)]
    pub holographic: bool,
}

/// Serde default for `Manifest::chunk_size` (pre-existing manifests).
fn default_chunk_size() -> usize {
    DEFAULT_CHUNK_SIZE
}
/// Multi-level manifest produced by `bundle_hierarchically*`: one level per
/// path depth, with each level's items pointing into `sub_engrams` by id.
#[derive(Serialize, Deserialize, Debug)]
pub struct HierarchicalManifest {
    pub version: u32,
    pub levels: Vec<ManifestLevel>,
    // Sub-engrams keyed by id; may be empty when they are stored externally
    // (see `DirectorySubEngramStore` / `save_sub_engrams_dir`).
    #[serde(default)]
    pub sub_engrams: HashMap<String, SubEngram>,
}

/// All manifest items at one path depth.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestLevel {
    // Zero-based path depth this level covers.
    pub level: u32,
    pub items: Vec<ManifestItem>,
}

/// One path prefix at a level, mapped to its sub-engram.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ManifestItem {
    pub path: String,
    pub sub_engram_id: String,
}

/// A bundled vector for a path prefix plus the chunks and children it routes to.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct SubEngram {
    pub id: String,
    // Bundled (superposed) vector for everything under this node.
    pub root: SparseVec,
    // Chunk ids directly indexed by this node; empty for router nodes whose
    // chunks live in shard children.
    #[serde(default)]
    pub chunk_ids: Vec<usize>,
    // Number of chunks logically covered by this node.
    pub chunk_count: usize,
    // Ids of child sub-engrams (next-level prefixes and/or shards).
    pub children: Vec<String>,
}
/// Resource limits for a hierarchical beam-search query.
#[derive(Clone, Debug)]
pub struct HierarchicalQueryBounds {
    // Number of final hits to return.
    pub k: usize,
    // Candidate pool size fetched from each node's index before reranking.
    pub candidate_k: usize,
    // Maximum frontier size kept between expansions.
    pub beam_width: usize,
    // Maximum tree depth to descend.
    pub max_depth: usize,
    // Maximum total number of sub-engram nodes expanded.
    pub max_expansions: usize,
    // LRU capacity for per-node inverted indices.
    pub max_open_indices: usize,
    // LRU capacity for loaded sub-engrams.
    pub max_open_engrams: usize,
}

impl Default for HierarchicalQueryBounds {
    /// Conservative defaults sized for interactive queries.
    fn default() -> Self {
        Self {
            k: 10,
            candidate_k: 100,
            beam_width: 32,
            max_depth: 4,
            max_expansions: 128,
            max_open_indices: 16,
            max_open_engrams: 16,
        }
    }
}
/// A chunk hit returned by a hierarchical query.
#[derive(Clone, Debug, PartialEq)]
pub struct HierarchicalChunkHit {
    // Id of the sub-engram whose index produced the hit.
    pub sub_engram_id: String,
    // Global chunk id in the engram codebook.
    pub chunk_id: usize,
    // Raw inverted-index score (pre-rerank).
    pub approx_score: i32,
    // Exact cosine similarity used for final ranking.
    pub cosine: f64,
}

/// Beam-search frontier entry: a sub-engram scored against the query.
#[derive(Clone, Debug)]
struct FrontierItem {
    score: f64,
    sub_engram_id: String,
    depth: usize,
}

/// Inverted index over a node's chunk subset, with a table that maps the
/// index's dense local ids back to global chunk ids.
#[derive(Clone, Debug)]
struct RemappedInvertedIndex {
    index: TernaryInvertedIndex,
    local_to_global: Vec<usize>,
}
impl RemappedInvertedIndex {
    /// Builds an index over the subset of `vectors` selected by `chunk_ids`.
    ///
    /// Chunks with no vector in `vectors` are skipped. Each indexed chunk's
    /// local id is its position in `local_to_global`, so a lookup through
    /// `local_to_global` always recovers the correct global chunk id.
    fn build(chunk_ids: &[usize], vectors: &HashMap<usize, SparseVec>) -> Self {
        let mut index = TernaryInvertedIndex::new();
        let mut local_to_global = Vec::with_capacity(chunk_ids.len());
        for &global_id in chunk_ids {
            let Some(vec) = vectors.get(&global_id) else {
                continue;
            };
            // Fix: the local id must equal this chunk's position in
            // `local_to_global`. The previous code passed the enumerate()
            // index over `chunk_ids`, which drifts ahead of the table as
            // soon as one vector is missing, mapping query candidates to
            // the wrong (or no) global chunk id.
            let local_id = local_to_global.len();
            local_to_global.push(global_id);
            index.add(local_id, vec);
        }
        index.finalize();
        Self {
            index,
            local_to_global,
        }
    }

    /// Fetches `candidate_k` approximate candidates, reranks them by exact
    /// cosine against `vectors`, and returns the top `k` as hits with
    /// `sub_engram_id` left empty for the caller to fill in.
    fn query_top_k_reranked(
        &self,
        query: &SparseVec,
        vectors: &HashMap<usize, SparseVec>,
        candidate_k: usize,
        k: usize,
    ) -> Vec<HierarchicalChunkHit> {
        if k == 0 {
            return Vec::new();
        }
        let candidates = self.index.query_top_k(query, candidate_k);
        let mut out = Vec::with_capacity(candidates.len().min(k));
        for cand in candidates {
            // Translate the index's local id back to the global chunk id.
            let Some(&global_id) = self.local_to_global.get(cand.id) else {
                continue;
            };
            let Some(vec) = vectors.get(&global_id) else {
                continue;
            };
            out.push((global_id, cand.score, query.cosine(vec)));
        }
        // Rank by exact cosine desc, then approx score desc, then id for
        // deterministic ordering.
        out.sort_by(|a, b| {
            b.2.total_cmp(&a.2)
                .then_with(|| b.1.cmp(&a.1))
                .then_with(|| a.0.cmp(&b.0))
        });
        out.truncate(k);
        out.into_iter()
            .map(|(chunk_id, approx_score, cosine)| HierarchicalChunkHit {
                sub_engram_id: String::new(),
                chunk_id,
                approx_score,
                cosine,
            })
            .collect()
    }
}
/// Minimal string-keyed LRU cache. Recency is tracked in a `Vec`, so `get`
/// and `insert` are O(len) — fine for the small capacities used here.
#[derive(Clone, Debug)]
struct LruCache<V> {
    // Maximum number of entries; 0 disables caching entirely.
    cap: usize,
    map: HashMap<String, V>,
    // Keys ordered least- to most-recently used (front = next eviction).
    order: Vec<String>,
}
impl<V> LruCache<V> {
    /// Creates an empty cache holding at most `cap` entries.
    fn new(cap: usize) -> Self {
        Self {
            cap,
            map: HashMap::new(),
            order: Vec::new(),
        }
    }

    /// Looks up `key`, marking it most recently used on a hit.
    fn get(&mut self, key: &str) -> Option<&V> {
        if !self.map.contains_key(key) {
            return None;
        }
        self.touch(key);
        self.map.get(key)
    }

    /// Inserts `key` → `value`, evicting least-recently-used entries once
    /// capacity is exceeded. A zero capacity makes this a no-op.
    fn insert(&mut self, key: String, value: V) {
        if self.cap == 0 {
            return;
        }
        let replaced = self.map.insert(key.clone(), value).is_some();
        if replaced {
            // Existing key: value overwritten above, just refresh recency.
            self.touch(&key);
            return;
        }
        self.order.push(key);
        while self.map.len() > self.cap {
            let Some(oldest) = self.order.first().cloned() else {
                break;
            };
            self.order.remove(0);
            self.map.remove(&oldest);
        }
    }

    /// Moves `key` to the most-recently-used end of the order list.
    fn touch(&mut self, key: &str) {
        if let Some(pos) = self.order.iter().position(|entry| entry == key) {
            let entry = self.order.remove(pos);
            self.order.push(entry);
        }
    }
}
/// Source of sub-engrams addressed by id (in memory, on disk, …).
pub trait SubEngramStore {
    /// Returns the sub-engram with `id`, or `None` if it is unavailable.
    fn load(&self, id: &str) -> Option<SubEngram>;
}
/// Makes a sub-engram id safe for use as a file name by percent-encoding
/// the two problematic characters: `%` (so the escape character itself
/// round-trips) and the path separator `/`.
fn escape_sub_engram_id(id: &str) -> String {
    let mut out = String::with_capacity(id.len());
    for ch in id.chars() {
        match ch {
            '%' => out.push_str("%25"),
            '/' => out.push_str("%2F"),
            other => out.push(other),
        }
    }
    out
}
/// `SubEngramStore` backed by a directory of `*.subengram` bincode files,
/// as written by `save_sub_engrams_dir`.
pub struct DirectorySubEngramStore {
    dir: PathBuf,
}
impl DirectorySubEngramStore {
    /// Creates a store rooted at `dir`.
    pub fn new<P: AsRef<Path>>(dir: P) -> Self {
        let dir = dir.as_ref().to_path_buf();
        Self { dir }
    }

    /// Returns the on-disk path for the sub-engram `id`.
    fn path_for_id(&self, id: &str) -> PathBuf {
        let file_name = format!("{}.subengram", escape_sub_engram_id(id));
        self.dir.join(file_name)
    }
}
impl SubEngramStore for DirectorySubEngramStore {
    /// Reads and decodes `<dir>/<escaped id>.subengram`; any I/O or decode
    /// failure is reported as `None`.
    fn load(&self, id: &str) -> Option<SubEngram> {
        let bytes = fs::read(self.path_for_id(id)).ok()?;
        bincode::deserialize(&bytes).ok()
    }
}
/// Serializes `hierarchical` as pretty JSON with deterministic ordering:
/// levels sorted by level number, items sorted by (path, sub_engram_id),
/// and sub-engrams routed through a `BTreeMap` so keys come out sorted.
pub fn save_hierarchical_manifest<P: AsRef<Path>>(
    hierarchical: &HierarchicalManifest,
    path: P,
) -> io::Result<()> {
    // Shadow type that swaps the HashMap for a BTreeMap so serde emits
    // sub-engrams in sorted key order.
    #[derive(Serialize)]
    struct StableHierarchicalManifest {
        version: u32,
        levels: Vec<ManifestLevel>,
        sub_engrams: BTreeMap<String, SubEngram>,
    }
    let mut levels = hierarchical.levels.clone();
    levels.sort_by_key(|level| level.level);
    for level in levels.iter_mut() {
        level.items.sort_by(|lhs, rhs| {
            lhs.path
                .cmp(&rhs.path)
                .then_with(|| lhs.sub_engram_id.cmp(&rhs.sub_engram_id))
        });
    }
    let sub_engrams: BTreeMap<String, SubEngram> = hierarchical
        .sub_engrams
        .iter()
        .map(|(id, sub)| (id.clone(), sub.clone()))
        .collect();
    let stable = StableHierarchicalManifest {
        version: hierarchical.version,
        levels,
        sub_engrams,
    };
    let file = File::create(path)?;
    serde_json::to_writer_pretty(file, &stable)?;
    Ok(())
}
/// Loads a [`HierarchicalManifest`] from the JSON file at `path`.
pub fn load_hierarchical_manifest<P: AsRef<Path>>(path: P) -> io::Result<HierarchicalManifest> {
    let reader = File::open(path)?;
    Ok(serde_json::from_reader(reader)?)
}
/// Writes every sub-engram to `<dir>/<escaped id>.subengram` with bincode,
/// visiting ids in sorted order so the directory contents are deterministic.
pub fn save_sub_engrams_dir<P: AsRef<Path>>(
    sub_engrams: &HashMap<String, SubEngram>,
    dir: P,
) -> io::Result<()> {
    let dir = dir.as_ref();
    fs::create_dir_all(dir)?;
    // Sort (id, sub) pairs up front instead of sorting keys and re-looking up.
    let mut entries: Vec<(&String, &SubEngram)> = sub_engrams.iter().collect();
    entries.sort_by(|a, b| a.0.cmp(b.0));
    for (id, sub) in entries {
        let encoded = bincode::serialize(sub).map_err(io::Error::other)?;
        let path = dir.join(format!("{}.subengram", escape_sub_engram_id(id)));
        fs::write(path, encoded)?;
    }
    Ok(())
}
/// `SubEngramStore` view over a borrowed in-memory map (clones on load).
struct InMemorySubEngramStore<'a> {
    map: &'a HashMap<String, SubEngram>,
}

impl<'a> InMemorySubEngramStore<'a> {
    /// Wraps a borrowed sub-engram map.
    fn new(map: &'a HashMap<String, SubEngram>) -> Self {
        Self { map }
    }
}

impl SubEngramStore for InMemorySubEngramStore<'_> {
    /// Returns a clone of the mapped sub-engram, if present.
    fn load(&self, id: &str) -> Option<SubEngram> {
        self.map.get(id).cloned()
    }
}
/// Fetches a sub-engram by id, consulting `cache` before the backing `store`
/// and caching any freshly loaded value.
fn get_cached_sub_engram(
    cache: &mut LruCache<SubEngram>,
    store: &impl SubEngramStore,
    id: &str,
) -> Option<SubEngram> {
    match cache.get(id) {
        Some(hit) => Some(hit.clone()),
        None => {
            let fresh = store.load(id)?;
            cache.insert(id.to_string(), fresh.clone());
            Some(fresh)
        }
    }
}
/// Convenience wrapper around [`query_hierarchical_codebook_with_store`]
/// that uses the manifest's own in-memory `sub_engrams` map as the store.
pub fn query_hierarchical_codebook(
    hierarchical: &HierarchicalManifest,
    codebook: &HashMap<usize, SparseVec>,
    query: &SparseVec,
    bounds: &HierarchicalQueryBounds,
) -> Vec<HierarchicalChunkHit> {
    let store = InMemorySubEngramStore::new(&hierarchical.sub_engrams);
    query_hierarchical_codebook_with_store(hierarchical, &store, codebook, query, bounds)
}
/// Bounded best-first (beam) search over a hierarchical manifest, loading
/// sub-engrams on demand from `store`.
///
/// Level-0 items seed the frontier, scored by cosine between `query` and
/// each node's bundled root vector. The best node is expanded in turn: an
/// inverted index over its `chunk_ids` is built (and LRU-cached), queried
/// for `bounds.candidate_k` candidates, reranked by exact cosine, and the
/// node's children are pushed back onto the frontier. The frontier is
/// re-sorted and trimmed to `bounds.beam_width` after every expansion;
/// descent stops at `bounds.max_depth` and the search stops after
/// `bounds.max_expansions` expansions. The best hit per chunk id is kept,
/// and the final result is the global top `bounds.k` by cosine.
pub fn query_hierarchical_codebook_with_store(
    hierarchical: &HierarchicalManifest,
    store: &impl SubEngramStore,
    codebook: &HashMap<usize, SparseVec>,
    query: &SparseVec,
    bounds: &HierarchicalQueryBounds,
) -> Vec<HierarchicalChunkHit> {
    if bounds.k == 0 || hierarchical.levels.is_empty() {
        return Vec::new();
    }
    // LRU caches bound memory held by loaded sub-engrams and per-node indices.
    let mut sub_cache: LruCache<SubEngram> = LruCache::new(bounds.max_open_engrams);
    let mut index_cache: LruCache<RemappedInvertedIndex> = LruCache::new(bounds.max_open_indices);
    let mut frontier: Vec<FrontierItem> = Vec::new();
    // Seed the frontier with every level-0 item that can be loaded.
    if let Some(level0) = hierarchical.levels.first() {
        for item in &level0.items {
            let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &item.sub_engram_id)
            else {
                continue;
            };
            frontier.push(FrontierItem {
                score: query.cosine(&sub.root),
                sub_engram_id: item.sub_engram_id.clone(),
                depth: 0,
            });
        }
    }
    // Best score first; ties broken by id for determinism.
    frontier.sort_by(|a, b| {
        b.score
            .total_cmp(&a.score)
            .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
    });
    if frontier.len() > bounds.beam_width {
        frontier.truncate(bounds.beam_width);
    }
    let mut expansions = 0usize;
    // Deduplicate hits per chunk id, keeping the best-scoring occurrence.
    let mut best_by_chunk: HashMap<usize, HierarchicalChunkHit> = HashMap::new();
    while !frontier.is_empty() && expansions < bounds.max_expansions {
        // Frontier stays sorted, so the front element is the current best.
        let node = frontier.remove(0);
        let Some(sub) = get_cached_sub_engram(&mut sub_cache, store, &node.sub_engram_id) else {
            continue;
        };
        expansions += 1;
        // Build (or reuse) this node's chunk-level inverted index.
        let idx = if let Some(existing) = index_cache.get(&node.sub_engram_id) {
            existing
        } else {
            let built = RemappedInvertedIndex::build(&sub.chunk_ids, codebook);
            index_cache.insert(node.sub_engram_id.clone(), built);
            index_cache
                .get(&node.sub_engram_id)
                .expect("index_cache.get() must succeed immediately after insert()")
        };
        let mut local_hits =
            idx.query_top_k_reranked(query, codebook, bounds.candidate_k, bounds.k);
        for hit in &mut local_hits {
            hit.sub_engram_id = node.sub_engram_id.clone();
        }
        // Merge: keep the best hit (cosine, then approx score) per chunk id.
        for hit in local_hits {
            match best_by_chunk.get(&hit.chunk_id) {
                None => {
                    best_by_chunk.insert(hit.chunk_id, hit);
                }
                Some(existing) => {
                    let better = hit
                        .cosine
                        .total_cmp(&existing.cosine)
                        .then_with(|| hit.approx_score.cmp(&existing.approx_score))
                        .is_gt();
                    if better {
                        best_by_chunk.insert(hit.chunk_id, hit);
                    }
                }
            }
        }
        // Depth cap: do not enqueue children below max_depth.
        if node.depth >= bounds.max_depth {
            continue;
        }
        let children = sub.children.clone();
        for child_id in &children {
            let Some(child) = get_cached_sub_engram(&mut sub_cache, store, child_id) else {
                continue;
            };
            frontier.push(FrontierItem {
                score: query.cosine(&child.root),
                sub_engram_id: child_id.clone(),
                depth: node.depth + 1,
            });
        }
        // Re-sort and trim back to the beam width after adding children.
        frontier.sort_by(|a, b| {
            b.score
                .total_cmp(&a.score)
                .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
        });
        if frontier.len() > bounds.beam_width {
            frontier.truncate(bounds.beam_width);
        }
    }
    // Global ranking: cosine desc, approx score desc, then ids for determinism.
    let mut out: Vec<HierarchicalChunkHit> = best_by_chunk.into_values().collect();
    out.sort_by(|a, b| {
        b.cosine
            .total_cmp(&a.cosine)
            .then_with(|| b.approx_score.cmp(&a.approx_score))
            .then_with(|| a.chunk_id.cmp(&b.chunk_id))
            .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
    });
    out.truncate(bounds.k);
    out
}
/// Either manifest flavor, for callers that must handle both formats.
#[derive(Serialize, Deserialize, Debug)]
pub enum UnifiedManifest {
    Flat(Manifest),
    Hierarchical(HierarchicalManifest),
}

impl From<Manifest> for UnifiedManifest {
    fn from(manifest: Manifest) -> Self {
        UnifiedManifest::Flat(manifest)
    }
}

/// The encoded archive: a bundled root vector, the per-chunk codebook, and
/// byte-level corrections that make decoding lossless.
#[derive(Serialize, Deserialize)]
pub struct Engram {
    // Superposition (bundle) of every chunk vector.
    pub root: SparseVec,
    // Chunk id → encoded chunk vector.
    pub codebook: HashMap<usize, SparseVec>,
    // Byte diffs between each chunk and its round-trip decode.
    #[serde(default)]
    pub corrections: CorrectionStore,
}
impl Engram {
    /// Builds a ternary inverted index over the whole codebook.
    pub fn build_codebook_index(&self) -> TernaryInvertedIndex {
        TernaryInvertedIndex::build_from_map(&self.codebook)
    }

    /// Queries a prebuilt index: gathers `candidate_k` approximate matches,
    /// reranks them against the codebook, and returns the top `k`.
    pub fn query_codebook_with_index(
        &self,
        index: &TernaryInvertedIndex,
        query: &SparseVec,
        candidate_k: usize,
        k: usize,
    ) -> Vec<RerankedResult> {
        if self.codebook.is_empty() || k == 0 {
            return Vec::new();
        }
        index.query_top_k_reranked(query, &self.codebook, candidate_k, k)
    }

    /// One-shot query that builds a fresh index; the candidate pool is
    /// 10×k but never fewer than 50.
    pub fn query_codebook(&self, query: &SparseVec, k: usize) -> Vec<RerankedResult> {
        if self.codebook.is_empty() || k == 0 {
            return Vec::new();
        }
        let index = self.build_codebook_index();
        let candidate_k = k.saturating_mul(10).max(50);
        self.query_codebook_with_index(&index, query, candidate_k, k)
    }
}
/// Chunk size, in bytes, used by the holographic encoder.
pub const HOLOGRAPHIC_CHUNK_SIZE: usize = 8;

/// Virtual filesystem over an engram: a manifest of files plus the encoded
/// chunk store, with optional resonator-based recovery of missing chunks.
pub struct EmbrFS {
    pub manifest: Manifest,
    pub engram: Engram,
    // Optional associative-memory fallback used during extraction.
    pub resonator: Option<Resonator>,
    // Present only in holographic mode; drives chunk encode/decode.
    encoder: Option<ReversibleVSAEncoder>,
    // Cached copy of `manifest.chunk_size`.
    chunk_size: usize,
}

impl Default for EmbrFS {
    /// Defaults to the holographic encoding mode.
    fn default() -> Self {
        Self::new_holographic()
    }
}
impl EmbrFS {
/// Creates an empty legacy (non-holographic) filesystem that encodes
/// chunks via `SparseVec::encode_data` with 4 KiB chunks.
#[deprecated(
    since = "0.25.0",
    note = "Use new_holographic() instead for ~94% encoding accuracy and <10% storage overhead"
)]
pub fn new() -> Self {
    EmbrFS {
        manifest: Manifest {
            files: Vec::new(),
            total_chunks: 0,
            chunk_size: DEFAULT_CHUNK_SIZE,
            holographic: false,
        },
        engram: Engram {
            root: SparseVec::new(),
            codebook: HashMap::new(),
            corrections: CorrectionStore::new(),
        },
        resonator: None,
        encoder: None,
        chunk_size: DEFAULT_CHUNK_SIZE,
    }
}

/// Creates an empty holographic filesystem using the reversible VSA
/// encoder with small (8-byte) chunks.
pub fn new_holographic() -> Self {
    EmbrFS {
        manifest: Manifest {
            files: Vec::new(),
            total_chunks: 0,
            chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
            holographic: true,
        },
        engram: Engram {
            root: SparseVec::new(),
            codebook: HashMap::new(),
            corrections: CorrectionStore::new(),
        },
        resonator: None,
        encoder: Some(ReversibleVSAEncoder::new()),
        chunk_size: HOLOGRAPHIC_CHUNK_SIZE,
    }
}
/// True when this instance encodes/decodes with the holographic encoder.
pub fn is_holographic(&self) -> bool {
    self.encoder.is_some()
}

/// Chunk size (bytes) used when ingesting files.
pub fn chunk_size(&self) -> usize {
    self.chunk_size
}
/// Normalizes `path` to a forward-slash string of its `Normal` components.
/// Root/current/parent-dir components and non-UTF-8 segments are dropped.
fn path_to_forward_slash_string(path: &Path) -> String {
    let mut parts: Vec<&str> = Vec::new();
    for comp in path.components() {
        if let std::path::Component::Normal(os) = comp {
            if let Some(s) = os.to_str() {
                parts.push(s);
            }
        }
    }
    parts.join("/")
}
/// Installs a resonator used as a fallback during extraction.
pub fn set_resonator(&mut self, resonator: Resonator) {
    self.resonator = Some(resonator);
}

/// Aggregate statistics from the correction store.
pub fn correction_stats(&self) -> CorrectionStats {
    self.engram.corrections.stats()
}
/// Recursively ingests every file under `dir` with no logical prefix.
pub fn ingest_directory<P: AsRef<Path>>(
    &mut self,
    dir: P,
    verbose: bool,
    config: &ReversibleVSAConfig,
) -> io::Result<()> {
    self.ingest_directory_with_prefix(dir, None, verbose, config)
}
pub fn ingest_directory_with_prefix<P: AsRef<Path>>(
&mut self,
dir: P,
logical_prefix: Option<&str>,
verbose: bool,
config: &ReversibleVSAConfig,
) -> io::Result<()> {
let dir = dir.as_ref();
if verbose {
println!("Ingesting directory: {}", dir.display());
}
let mut files_to_process = Vec::new();
for entry in WalkDir::new(dir).follow_links(false) {
let entry = entry?;
if entry.file_type().is_file() {
files_to_process.push(entry.path().to_path_buf());
}
}
files_to_process.sort();
for file_path in files_to_process {
let relative = file_path.strip_prefix(dir).unwrap_or(file_path.as_path());
let rel = Self::path_to_forward_slash_string(relative);
let logical_path = if let Some(prefix) = logical_prefix {
if prefix.is_empty() {
rel
} else if rel.is_empty() {
prefix.to_string()
} else {
format!("{}/{}", prefix, rel)
}
} else {
rel
};
self.ingest_file(&file_path, logical_path, verbose, config)?;
}
Ok(())
}
/// Reads `file_path`, chunks it, encodes every chunk into the engram, and
/// records a `FileEntry` under `logical_path`.
///
/// Each chunk is encoded either through the holographic encoder (instances
/// built via `new_holographic`) or via `SparseVec::encode_data` keyed by
/// the logical path. The chunk is immediately round-trip decoded and the
/// byte diff is stored in the correction store so later extraction is
/// lossless. Chunk vectors are bundled into the engram root and inserted
/// into the codebook under globally increasing ids.
///
/// # Errors
/// Returns any I/O error from opening or reading the file.
pub fn ingest_file<P: AsRef<Path>>(
    &mut self,
    file_path: P,
    logical_path: String,
    verbose: bool,
    config: &ReversibleVSAConfig,
) -> io::Result<()> {
    let file_path = file_path.as_ref();
    let mut file = File::open(file_path)?;
    let mut data = Vec::new();
    file.read_to_end(&mut data)?;
    let is_text = is_text_file(&data);
    let is_holographic = self.encoder.is_some();
    if verbose {
        println!(
            "Ingesting {}: {} bytes ({}, {})",
            logical_path,
            data.len(),
            if is_text { "text" } else { "binary" },
            if is_holographic {
                "holographic"
            } else {
                "legacy"
            }
        );
    }
    let chunk_size = self.chunk_size;
    let mut chunks = Vec::new();
    // Accuracy/overhead counters for the verbose summary below.
    let mut corrections_needed = 0usize;
    let mut total_correction_bytes = 0usize;
    for (i, chunk) in data.chunks(chunk_size).enumerate() {
        // Global chunk ids continue from the manifest's running total.
        let chunk_id = self.manifest.total_chunks + i;
        // Encode, then round-trip decode so we know exactly which bytes the
        // lossy encoding got wrong.
        let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
            let encoded = encoder.encode(chunk);
            let decoded = encoder.decode(&encoded, chunk.len());
            (encoded, decoded)
        } else {
            let encoded = SparseVec::encode_data(chunk, config, Some(&logical_path));
            let decoded = encoded.decode_data(config, Some(&logical_path), chunk.len());
            (encoded, decoded)
        };
        // Persist the byte diff (if any) so extraction can be made exact.
        self.engram
            .corrections
            .add(chunk_id as u64, chunk, &decoded);
        if chunk != decoded.as_slice() {
            corrections_needed += 1;
            if let Some(correction) = self.engram.corrections.get(chunk_id as u64) {
                total_correction_bytes += correction.storage_size();
            }
        }
        // Superpose the chunk vector into the root and index it.
        self.engram.root = self.engram.root.bundle(&chunk_vec);
        self.engram.codebook.insert(chunk_id, chunk_vec);
        chunks.push(chunk_id);
    }
    if verbose {
        let total_chunks = chunks.len();
        let perfect_chunks = total_chunks - corrections_needed;
        let accuracy = if total_chunks > 0 {
            (perfect_chunks as f64 / total_chunks as f64) * 100.0
        } else {
            100.0
        };
        let overhead = if !data.is_empty() {
            (total_correction_bytes as f64 / data.len() as f64) * 100.0
        } else {
            0.0
        };
        println!(
            " → {}/{} chunks perfect ({:.1}% accuracy), {:.1}% correction overhead",
            perfect_chunks, total_chunks, accuracy, overhead
        );
    }
    self.manifest.files.push(FileEntry {
        path: logical_path,
        is_text,
        size: data.len(),
        chunks: chunks.clone(),
        deleted: false,
    });
    self.manifest.total_chunks += chunks.len();
    Ok(())
}
/// Ingests a new file under `logical_path`, refusing to overwrite an
/// existing live (non-deleted) entry with the same path.
///
/// # Errors
/// `AlreadyExists` when a live entry with the path exists; otherwise any
/// error from `ingest_file`.
pub fn add_file<P: AsRef<Path>>(
    &mut self,
    file_path: P,
    logical_path: String,
    verbose: bool,
    config: &ReversibleVSAConfig,
) -> io::Result<()> {
    let file_path = file_path.as_ref();
    let already_present = self
        .manifest
        .files
        .iter()
        .any(|entry| !entry.deleted && entry.path == logical_path);
    if already_present {
        return Err(io::Error::new(
            io::ErrorKind::AlreadyExists,
            format!("File '{}' already exists in engram", logical_path),
        ));
    }
    self.ingest_file(file_path, logical_path, verbose, config)
}
/// Soft-deletes the live entry at `logical_path`; chunk data stays in the
/// engram until `compact()` runs.
///
/// # Errors
/// `NotFound` when no live entry with the path exists.
pub fn remove_file(&mut self, logical_path: &str, verbose: bool) -> io::Result<()> {
    let entry = self
        .manifest
        .files
        .iter_mut()
        .find(|f| !f.deleted && f.path == logical_path)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::NotFound,
                format!("File '{}' not found in engram", logical_path),
            )
        })?;
    if verbose {
        println!(
            "Marking file as deleted: {} ({} chunks)",
            logical_path,
            entry.chunks.len()
        );
    }
    entry.deleted = true;
    if verbose {
        println!(" Note: Use 'compact' to rebuild engram and reclaim space");
    }
    Ok(())
}
/// Replaces the live file at `logical_path` with the contents of
/// `file_path` (soft-delete followed by re-ingest).
///
/// # Errors
/// `NotFound` when the path is not present; otherwise any ingest error.
pub fn modify_file<P: AsRef<Path>>(
    &mut self,
    file_path: P,
    logical_path: String,
    verbose: bool,
    config: &ReversibleVSAConfig,
) -> io::Result<()> {
    // Drop the live entry first; ingest re-adds the new content under the
    // same logical path.
    self.remove_file(&logical_path, false)?;
    if verbose {
        println!("Modifying file: {}", logical_path);
    }
    self.ingest_file(file_path, logical_path, verbose, config)
}
/// Rebuilds the engram in place, physically dropping files marked deleted
/// and renumbering the surviving chunks densely from zero.
///
/// Two phases per surviving file: (1) reconstruct its exact bytes from the
/// old engram — decode each chunk (with the old manifest's chunk size) and
/// apply stored corrections; (2) re-chunk with the current `chunk_size`
/// and re-encode exactly as `ingest_file` would, rebuilding root, codebook,
/// and corrections. Finally the new engram/manifest are swapped in.
pub fn compact(&mut self, verbose: bool, config: &ReversibleVSAConfig) -> io::Result<()> {
    if verbose {
        let deleted_count = self.manifest.files.iter().filter(|f| f.deleted).count();
        let total_count = self.manifest.files.len();
        println!(
            "Compacting engram: removing {} deleted files ({} remaining)",
            deleted_count,
            total_count - deleted_count
        );
    }
    let is_holographic = self.encoder.is_some();
    let chunk_size = self.chunk_size;
    // Fresh engram/manifest built up here and swapped in at the end.
    let mut new_engram = Engram {
        root: SparseVec::new(),
        codebook: HashMap::new(),
        corrections: CorrectionStore::new(),
    };
    let mut new_manifest = Manifest {
        files: Vec::new(),
        total_chunks: 0,
        chunk_size,
        holographic: is_holographic,
    };
    for old_file in &self.manifest.files {
        if old_file.deleted {
            continue;
        }
        // Phase 1: reconstruct the file's exact bytes from the old chunks.
        let mut file_data = Vec::new();
        let num_chunks = old_file.chunks.len();
        // Decode with the chunk size the old manifest was written with.
        let old_chunk_size = self.manifest.chunk_size;
        for (chunk_idx, &chunk_id) in old_file.chunks.iter().enumerate() {
            if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
                // The final chunk may be shorter than a full chunk.
                let this_chunk_size = if chunk_idx == num_chunks - 1 {
                    let remaining = old_file.size.saturating_sub(chunk_idx * old_chunk_size);
                    remaining.min(old_chunk_size)
                } else {
                    old_chunk_size
                };
                let decoded = if self.manifest.holographic {
                    if let Some(ref encoder) = self.encoder {
                        encoder.decode(chunk_vec, this_chunk_size)
                    } else {
                        chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                    }
                } else {
                    chunk_vec.decode_data(config, Some(&old_file.path), this_chunk_size)
                };
                // Apply stored byte corrections to make the decode exact.
                let chunk_data = if let Some(corrected) =
                    self.engram.corrections.apply(chunk_id as u64, &decoded)
                {
                    corrected
                } else {
                    decoded
                };
                file_data.extend_from_slice(&chunk_data);
            }
        }
        file_data.truncate(old_file.size);
        // Phase 2: re-chunk and re-encode, mirroring ingest_file.
        let mut new_chunks = Vec::new();
        for (i, chunk) in file_data.chunks(chunk_size).enumerate() {
            let new_chunk_id = new_manifest.total_chunks + i;
            let (chunk_vec, decoded) = if let Some(ref mut encoder) = self.encoder {
                let encoded = encoder.encode(chunk);
                let decoded = encoder.decode(&encoded, chunk.len());
                (encoded, decoded)
            } else {
                let encoded = SparseVec::encode_data(chunk, config, Some(&old_file.path));
                let decoded = encoded.decode_data(config, Some(&old_file.path), chunk.len());
                (encoded, decoded)
            };
            new_engram
                .corrections
                .add(new_chunk_id as u64, chunk, &decoded);
            new_engram.root = new_engram.root.bundle(&chunk_vec);
            new_engram.codebook.insert(new_chunk_id, chunk_vec);
            new_chunks.push(new_chunk_id);
        }
        if verbose {
            println!(
                " Recompacted: {} ({} chunks)",
                old_file.path,
                new_chunks.len()
            );
        }
        new_manifest.files.push(FileEntry {
            path: old_file.path.clone(),
            is_text: old_file.is_text,
            size: old_file.size,
            chunks: new_chunks.clone(),
            deleted: false,
        });
        new_manifest.total_chunks += new_chunks.len();
    }
    self.engram = new_engram;
    self.manifest = new_manifest;
    if verbose {
        let stats = self.engram.corrections.stats();
        println!(
            "Compaction complete: {} files, {} chunks ({:.1}% perfect, {:.2}% correction overhead)",
            self.manifest.files.len(),
            self.manifest.total_chunks,
            stats.perfect_ratio * 100.0,
            stats.correction_ratio * 100.0
        );
    }
    Ok(())
}
pub fn save_engram<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
let encoded = bincode::serialize(&self.engram).map_err(io::Error::other)?;
fs::write(path, encoded)?;
Ok(())
}
pub fn load_engram<P: AsRef<Path>>(path: P) -> io::Result<Engram> {
let data = fs::read(path)?;
bincode::deserialize(&data).map_err(io::Error::other)
}
pub fn save_manifest<P: AsRef<Path>>(&self, path: P) -> io::Result<()> {
let file = File::create(path)?;
serde_json::to_writer_pretty(file, &self.manifest)?;
Ok(())
}
pub fn load_manifest<P: AsRef<Path>>(path: P) -> io::Result<Manifest> {
let file = File::open(path)?;
let manifest = serde_json::from_reader(file)?;
Ok(manifest)
}
/// Loads a filesystem from an engram file plus its manifest, restoring the
/// holographic encoder when the manifest says one was used.
pub fn load<P: AsRef<Path>, Q: AsRef<Path>>(
    engram_path: P,
    manifest_path: Q,
) -> io::Result<Self> {
    let engram = Self::load_engram(engram_path)?;
    let manifest = Self::load_manifest(manifest_path)?;
    let chunk_size = manifest.chunk_size;
    // Holographic manifests need a reversible encoder for chunk decoding.
    let encoder = if manifest.holographic {
        Some(ReversibleVSAEncoder::new())
    } else {
        None
    };
    Ok(EmbrFS {
        manifest,
        engram,
        resonator: None,
        encoder,
        chunk_size,
    })
}
/// Decodes every non-deleted file described by `manifest` from `engram`
/// into `output_dir`, creating parent directories as needed.
///
/// Holographic manifests decode through a fresh `ReversibleVSAEncoder`;
/// legacy manifests use `SparseVec::decode_data` keyed by the logical path.
/// Stored corrections are applied per chunk for exact reconstruction, and
/// each output is truncated to its recorded size.
pub fn extract<P: AsRef<Path>>(
    engram: &Engram,
    manifest: &Manifest,
    output_dir: P,
    verbose: bool,
    config: &ReversibleVSAConfig,
) -> io::Result<()> {
    let output_dir = output_dir.as_ref();
    let chunk_size = manifest.chunk_size;
    let is_holographic = manifest.holographic;
    let encoder = if is_holographic {
        Some(ReversibleVSAEncoder::new())
    } else {
        None
    };
    if verbose {
        println!(
            "Extracting {} files to {} ({})",
            manifest.files.iter().filter(|f| !f.deleted).count(),
            output_dir.display(),
            if is_holographic {
                "holographic"
            } else {
                "legacy"
            }
        );
        let stats = engram.corrections.stats();
        println!(
            " Correction stats: {:.1}% perfect, {:.2}% overhead",
            stats.perfect_ratio * 100.0,
            stats.correction_ratio * 100.0
        );
    }
    for file_entry in &manifest.files {
        if file_entry.deleted {
            continue;
        }
        let file_path = output_dir.join(&file_entry.path);
        if let Some(parent) = file_path.parent() {
            fs::create_dir_all(parent)?;
        }
        let mut reconstructed = Vec::new();
        let num_chunks = file_entry.chunks.len();
        for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
            if let Some(chunk_vec) = engram.codebook.get(&chunk_id) {
                // The final chunk may be shorter than a full chunk.
                let this_chunk_size = if chunk_idx == num_chunks - 1 {
                    let remaining = file_entry.size.saturating_sub(chunk_idx * chunk_size);
                    remaining.min(chunk_size)
                } else {
                    chunk_size
                };
                let decoded = if let Some(ref enc) = encoder {
                    enc.decode(chunk_vec, this_chunk_size)
                } else {
                    chunk_vec.decode_data(config, Some(&file_entry.path), this_chunk_size)
                };
                // Apply stored byte corrections for exact reconstruction.
                let chunk_data = if let Some(corrected) =
                    engram.corrections.apply(chunk_id as u64, &decoded)
                {
                    corrected
                } else {
                    decoded
                };
                reconstructed.extend_from_slice(&chunk_data);
            }
        }
        reconstructed.truncate(file_entry.size);
        fs::write(&file_path, reconstructed)?;
        if verbose {
            println!("Extracted: {}", file_entry.path);
        }
    }
    Ok(())
}
pub fn extract_with_resonator<P: AsRef<Path>>(
&self,
output_dir: P,
verbose: bool,
config: &ReversibleVSAConfig,
) -> io::Result<()> {
if self.resonator.is_none() {
return Self::extract(&self.engram, &self.manifest, output_dir, verbose, config);
}
let _resonator = self
.resonator
.as_ref()
.expect("resonator is Some after is_none() check");
let output_dir = output_dir.as_ref();
if verbose {
println!(
"Extracting {} files with resonator enhancement to {}",
self.manifest.files.iter().filter(|f| !f.deleted).count(),
output_dir.display()
);
let stats = self.engram.corrections.stats();
println!(
" Correction stats: {:.1}% perfect, {:.2}% overhead",
stats.perfect_ratio * 100.0,
stats.correction_ratio * 100.0
);
}
for file_entry in &self.manifest.files {
if file_entry.deleted {
continue;
}
let file_path = output_dir.join(&file_entry.path);
if let Some(parent) = file_path.parent() {
fs::create_dir_all(parent)?;
}
let mut reconstructed = Vec::new();
let num_chunks = file_entry.chunks.len();
for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
let chunk_size = if chunk_idx == num_chunks - 1 {
let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
remaining.min(DEFAULT_CHUNK_SIZE)
} else {
DEFAULT_CHUNK_SIZE
};
let chunk_data = if let Some(vector) = self.engram.codebook.get(&chunk_id) {
let decoded = vector.decode_data(config, Some(&file_entry.path), chunk_size);
if let Some(corrected) =
self.engram.corrections.apply(chunk_id as u64, &decoded)
{
corrected
} else {
decoded
}
} else if let Some(resonator) = &self.resonator {
let query_vec = SparseVec::encode_data(&chunk_id.to_le_bytes(), config, None);
let recovered_vec = resonator.project(&query_vec);
let decoded =
recovered_vec.decode_data(config, Some(&file_entry.path), chunk_size);
if let Some(corrected) =
self.engram.corrections.apply(chunk_id as u64, &decoded)
{
corrected
} else {
decoded
}
} else {
return Err(io::Error::new(
io::ErrorKind::NotFound,
format!("Missing chunk {} and no resonator available", chunk_id),
));
};
reconstructed.extend_from_slice(&chunk_data);
}
reconstructed.truncate(file_entry.size);
fs::write(&file_path, reconstructed)?;
if verbose {
println!("Extracted with resonator: {}", file_entry.path);
}
}
Ok(())
}
/// Builds a hierarchical manifest over the current files; shorthand for
/// `bundle_hierarchically_with_options` with no per-node chunk cap.
pub fn bundle_hierarchically(
    &self,
    max_level_sparsity: usize,
    verbose: bool,
    _config: &ReversibleVSAConfig,
) -> io::Result<HierarchicalManifest> {
    self.bundle_hierarchically_with_options(max_level_sparsity, None, verbose, _config)
}
/// Builds a hierarchical manifest over the flat manifest: one level per
/// path-component depth, one sub-engram per distinct path prefix.
///
/// Per prefix: every file's chunk vectors are bundled, each file bundle is
/// permuted by a hash of the prefix (binding content to location), and the
/// permuted bundles are superposed into the node's root, thinned to at most
/// `max_level_sparsity` active components. Children link each node to the
/// next level's prefixes. When `max_chunks_per_node` is `Some(n)` with
/// `n > 0`, nodes holding more than `n` chunk ids are split into
/// `__shard_NNNN` leaf nodes with the parent acting as a pure router.
/// Prefixes, items, children, and chunk ids are sorted for determinism.
pub fn bundle_hierarchically_with_options(
    &self,
    max_level_sparsity: usize,
    max_chunks_per_node: Option<usize>,
    verbose: bool,
    _config: &ReversibleVSAConfig,
) -> io::Result<HierarchicalManifest> {
    let mut levels = Vec::new();
    let mut sub_engrams = HashMap::new();
    // level -> (path prefix at that depth -> files under that prefix).
    let mut level_prefixes: HashMap<usize, HashMap<String, Vec<&FileEntry>>> = HashMap::new();
    for file_entry in &self.manifest.files {
        let comps: Vec<&str> = file_entry.path.split('/').collect();
        let mut prefix = String::new();
        for (level, &comp) in comps.iter().enumerate() {
            if level == 0 {
                prefix.push_str(comp);
            } else {
                prefix.push('/');
                prefix.push_str(comp);
            }
            level_prefixes
                .entry(level)
                .or_default()
                .entry(prefix.clone())
                .or_default()
                .push(file_entry);
        }
    }
    let max_level = level_prefixes.keys().max().unwrap_or(&0);
    for level in 0..=*max_level {
        if verbose {
            let item_count = level_prefixes
                .get(&level)
                .map(|comps| comps.values().map(|files| files.len()).sum::<usize>())
                .unwrap_or(0);
            println!("Processing level {} with {} items", level, item_count);
        }
        // NOTE(review): level_bundle is accumulated and thinned below but is
        // never stored in the returned manifest — apparently dead work;
        // confirm before removing.
        let mut level_bundle = SparseVec::new();
        let mut manifest_items = Vec::new();
        if let Some(prefixes) = level_prefixes.get(&level) {
            // Sorted prefix order keeps sub-engram construction deterministic.
            let mut prefix_keys: Vec<&String> = prefixes.keys().collect();
            prefix_keys.sort();
            for prefix in prefix_keys {
                let mut files: Vec<&FileEntry> = prefixes
                    .get(prefix)
                    .expect("prefix key from keys() must exist in HashMap")
                    .to_vec();
                files.sort_by(|a, b| a.path.cmp(&b.path));
                // Deterministic permutation shift derived from the prefix.
                let shift = {
                    use std::collections::hash_map::DefaultHasher;
                    use std::hash::{Hash, Hasher};
                    let mut hasher = DefaultHasher::new();
                    prefix.hash(&mut hasher);
                    (hasher.finish() % (DIM as u64)) as usize
                };
                let mut component_bundle = SparseVec::new();
                let mut chunk_ids_set: HashSet<usize> = HashSet::new();
                for file_entry in &files {
                    // Bundle all of this file's chunk vectors together.
                    let mut file_bundle = SparseVec::new();
                    for &chunk_id in &file_entry.chunks {
                        if let Some(chunk_vec) = self.engram.codebook.get(&chunk_id) {
                            file_bundle = file_bundle.bundle(chunk_vec);
                            chunk_ids_set.insert(chunk_id);
                        }
                    }
                    // Bind content to location: permute by the prefix hash
                    // scaled by the (1-based) level.
                    let permuted_file = file_bundle.permute(shift * (level + 1));
                    component_bundle = component_bundle.bundle(&permuted_file);
                }
                // Keep the node vector within the sparsity budget.
                if component_bundle.pos.len() + component_bundle.neg.len() > max_level_sparsity
                {
                    component_bundle = component_bundle.thin(max_level_sparsity);
                }
                level_bundle = level_bundle.bundle(&component_bundle);
                let sub_id = format!("level_{}_prefix_{}", level, prefix);
                // Link this node to the next level's prefixes beneath it.
                let mut children_set: HashSet<String> = HashSet::new();
                if level < *max_level {
                    for file_entry in &files {
                        let comps: Vec<&str> = file_entry.path.split('/').collect();
                        if comps.len() <= level + 1 {
                            continue;
                        }
                        let child_prefix = comps[..=level + 1].join("/");
                        let child_id = format!("level_{}_prefix_{}", level + 1, child_prefix);
                        children_set.insert(child_id);
                    }
                }
                let mut children: Vec<String> = children_set.into_iter().collect();
                children.sort();
                let mut chunk_ids: Vec<usize> = chunk_ids_set.into_iter().collect();
                chunk_ids.sort_unstable();
                let chunk_count: usize = files.iter().map(|f| f.chunks.len()).sum();
                if let Some(max_chunks) = max_chunks_per_node.filter(|v| *v > 0) {
                    if chunk_ids.len() > max_chunks {
                        // Oversized node: split chunks into shard leaves and
                        // turn this node into a router over them.
                        let mut shard_ids: Vec<String> = Vec::new();
                        for (shard_idx, chunk_slice) in chunk_ids.chunks(max_chunks).enumerate()
                        {
                            let shard_id = format!("{}__shard_{:04}", sub_id, shard_idx);
                            shard_ids.push(shard_id.clone());
                            sub_engrams.insert(
                                shard_id.clone(),
                                SubEngram {
                                    id: shard_id,
                                    root: component_bundle.clone(),
                                    chunk_ids: chunk_slice.to_vec(),
                                    chunk_count: chunk_slice.len(),
                                    children: Vec::new(),
                                },
                            );
                        }
                        let mut router_children = shard_ids;
                        router_children.extend(children.clone());
                        router_children.sort();
                        router_children.dedup();
                        sub_engrams.insert(
                            sub_id.clone(),
                            SubEngram {
                                id: sub_id.clone(),
                                root: component_bundle,
                                chunk_ids: Vec::new(),
                                chunk_count,
                                children: router_children,
                            },
                        );
                    } else {
                        sub_engrams.insert(
                            sub_id.clone(),
                            SubEngram {
                                id: sub_id.clone(),
                                root: component_bundle,
                                chunk_ids,
                                chunk_count,
                                children,
                            },
                        );
                    }
                } else {
                    sub_engrams.insert(
                        sub_id.clone(),
                        SubEngram {
                            id: sub_id.clone(),
                            root: component_bundle,
                            chunk_ids,
                            chunk_count,
                            children,
                        },
                    );
                }
                manifest_items.push(ManifestItem {
                    path: prefix.clone(),
                    sub_engram_id: sub_id,
                });
            }
        }
        manifest_items.sort_by(|a, b| {
            a.path
                .cmp(&b.path)
                .then_with(|| a.sub_engram_id.cmp(&b.sub_engram_id))
        });
        if level_bundle.pos.len() + level_bundle.neg.len() > max_level_sparsity {
            level_bundle = level_bundle.thin(max_level_sparsity);
        }
        levels.push(ManifestLevel {
            level: level as u32,
            items: manifest_items,
        });
    }
    Ok(HierarchicalManifest {
        version: 1,
        levels,
        sub_engrams,
    })
}
pub fn extract_hierarchically<P: AsRef<Path>>(
&self,
hierarchical: &HierarchicalManifest,
output_dir: P,
verbose: bool,
config: &ReversibleVSAConfig,
) -> io::Result<()> {
let output_dir = output_dir.as_ref();
if verbose {
println!(
"Extracting hierarchical manifest with {} levels to {}",
hierarchical.levels.len(),
output_dir.display()
);
}
for file_entry in &self.manifest.files {
if file_entry.deleted {
continue;
}
let file_path = output_dir.join(&file_entry.path);
if let Some(parent) = file_path.parent() {
fs::create_dir_all(parent)?;
}
let mut reconstructed = Vec::new();
let num_chunks = file_entry.chunks.len();
for (chunk_idx, &chunk_id) in file_entry.chunks.iter().enumerate() {
if let Some(chunk_vector) = self.engram.codebook.get(&chunk_id) {
let chunk_size = if chunk_idx == num_chunks - 1 {
let remaining = file_entry.size - (chunk_idx * DEFAULT_CHUNK_SIZE);
remaining.min(DEFAULT_CHUNK_SIZE)
} else {
DEFAULT_CHUNK_SIZE
};
let decoded =
chunk_vector.decode_data(config, Some(&file_entry.path), chunk_size);
let chunk_data = if let Some(corrected) =
self.engram.corrections.apply(chunk_id as u64, &decoded)
{
corrected
} else {
decoded
};
reconstructed.extend_from_slice(&chunk_data);
}
}
reconstructed.truncate(file_entry.size);
fs::write(&file_path, reconstructed)?;
if verbose {
println!("Extracted hierarchical: {}", file_entry.path);
}
}
Ok(())
}
}
/// Heuristically classifies `data` as text.
///
/// Empty input counts as text. Otherwise up to the first 8192 bytes are
/// scanned: the data is text when it contains no NUL bytes and fewer than
/// 10% of the sampled bytes are control characters other than `\n`, `\r`,
/// and `\t`.
pub fn is_text_file(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }
    let sample = &data[..data.len().min(8192)];
    let mut nulls = 0usize;
    let mut controls = 0usize;
    for &byte in sample {
        match byte {
            0 => nulls += 1,
            b'\n' | b'\r' | b'\t' => {}
            b if b < 32 => controls += 1,
            _ => {}
        }
    }
    nulls == 0 && controls < sample.len() / 10
}