use crate::correction::ChunkCorrection;
use crate::versioned::{ChunkId, VersionedChunk, VersionedFileEntry};
use crate::versioned_embrfs::{
EmbrFSError, VersionedEmbrFS, DEFAULT_CHUNK_SIZE, ENCODING_FORMAT_REVERSIBLE_VSA,
};
use embeddenator_vsa::SparseVec;
use sha2::{Digest, Sha256};
/// Maximum number of chunk vectors bundled into a single engram level;
/// beyond this the handler switches to hierarchical encoding.
const MAX_BUNDLE_CAPACITY: usize = 100;

/// Normalized-entropy cutoffs (range 0.0..=1.0) for adaptive chunk sizing.
const LOW_ENTROPY_THRESHOLD: f64 = 0.3;
const MEDIUM_ENTROPY_THRESHOLD: f64 = 0.6;

/// Chunk sizes per entropy band: low-entropy (repetitive) data gets the
/// largest chunks, high-entropy data the smallest.
const LOW_ENTROPY_CHUNK_SIZE: usize = 16 * 1024;
const MEDIUM_ENTROPY_CHUNK_SIZE: usize = 8 * 1024;
const HIGH_ENTROPY_CHUNK_SIZE: usize = 4 * 1024;
/// A node in a hierarchical engram tree for very large files.
///
/// NOTE(review): this type is not constructed anywhere in this file —
/// `write_hierarchical` builds its bundle tree from raw `SparseVec`s.
/// Presumably consumed or produced elsewhere in the crate; confirm.
#[derive(Clone)]
pub struct HierarchicalSubEngram {
    /// Bundled VSA vector summarizing this sub-tree's chunks.
    pub root: SparseVec,
    /// Ids of the chunks covered by this sub-engram.
    pub chunk_ids: Vec<ChunkId>,
    /// Depth of this node in the hierarchy — TODO confirm whether 0 is leaf or root.
    pub level: usize,
}
/// Tuning options for [`LargeFileHandler`].
#[derive(Clone, Debug)]
pub struct LargeFileConfig {
    /// When true, the chunk size is derived from a byte-entropy estimate
    /// of the data instead of `DEFAULT_CHUNK_SIZE`.
    pub adaptive_chunking: bool,
    /// Maximum chunks per bundle; files with more chunks than this use
    /// hierarchical encoding (when `hierarchical` is enabled).
    pub max_bundle_size: usize,
    /// Permit multi-level (hierarchical) encoding for files with many chunks.
    pub hierarchical: bool,
    /// NOTE(review): never read in this file — presumably an acceptable
    /// correction-overhead ratio (cf. `LargeFileResult::is_acceptable_quality`);
    /// confirm against callers.
    pub correction_threshold: f64,
    /// NOTE(review): never read in this file — likely toggles parallel
    /// encoding elsewhere; confirm.
    pub parallel: bool,
}
impl Default for LargeFileConfig {
fn default() -> Self {
Self {
adaptive_chunking: true,
max_bundle_size: MAX_BUNDLE_CAPACITY,
hierarchical: true,
correction_threshold: 0.1,
parallel: true,
}
}
}
/// Writes large files into a [`VersionedEmbrFS`], choosing chunk sizes
/// adaptively and falling back to hierarchical engram encoding when a file
/// has more chunks than one bundle can hold.
pub struct LargeFileHandler<'a> {
    /// Borrowed filesystem; all chunk, correction, and manifest operations
    /// go through it.
    fs: &'a VersionedEmbrFS,
    /// Behavior knobs; see [`LargeFileConfig`].
    config: LargeFileConfig,
}
impl<'a> LargeFileHandler<'a> {
pub fn new(fs: &'a VersionedEmbrFS) -> Self {
Self {
fs,
config: LargeFileConfig::default(),
}
}
pub fn with_config(fs: &'a VersionedEmbrFS, config: LargeFileConfig) -> Self {
Self { fs, config }
}
pub fn write_large_file(
&self,
path: &str,
data: &[u8],
expected_version: Option<u64>,
) -> Result<LargeFileResult, EmbrFSError> {
let chunk_size = if self.config.adaptive_chunking {
self.calculate_optimal_chunk_size(data)
} else {
DEFAULT_CHUNK_SIZE
};
let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
let chunk_count = chunks.len();
let use_hierarchical =
self.config.hierarchical && chunk_count > self.config.max_bundle_size;
if use_hierarchical {
self.write_hierarchical(path, &chunks, expected_version, chunk_size)
} else {
self.write_flat(path, &chunks, expected_version, chunk_size)
}
}
fn calculate_optimal_chunk_size(&self, data: &[u8]) -> usize {
let entropy = self.estimate_entropy(data);
if entropy < LOW_ENTROPY_THRESHOLD {
LOW_ENTROPY_CHUNK_SIZE
} else if entropy < MEDIUM_ENTROPY_THRESHOLD {
MEDIUM_ENTROPY_CHUNK_SIZE
} else {
HIGH_ENTROPY_CHUNK_SIZE
}
}
fn estimate_entropy(&self, data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}
let sample_size = data.len().min(64 * 1024);
let sample = &data[0..sample_size];
let mut freq = [0u64; 256];
for &byte in sample {
freq[byte as usize] += 1;
}
let total = sample.len() as f64;
let mut entropy = 0.0;
for &count in &freq {
if count > 0 {
let p = count as f64 / total;
entropy -= p * p.log2();
}
}
entropy / 8.0
}
fn write_flat(
&self,
path: &str,
chunks: &[&[u8]],
expected_version: Option<u64>,
chunk_size: usize,
) -> Result<LargeFileResult, EmbrFSError> {
let mut chunk_ids = Vec::new();
let mut chunk_updates = Vec::new();
let mut corrections = Vec::new();
let mut total_correction_bytes = 0usize;
for chunk_data in chunks {
let chunk_id = self.fs.allocate_chunk_id();
let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));
let decoded = self
.fs
.decode_chunk(&chunk_vec, Some(path), chunk_data.len());
let mut hasher = Sha256::new();
hasher.update(chunk_data);
let hash = hasher.finalize();
let mut hash_bytes = [0u8; 8];
hash_bytes.copy_from_slice(&hash[0..8]);
let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
total_correction_bytes += correction.storage_size();
chunk_updates.push((
chunk_id,
VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
));
corrections.push((chunk_id as u64, correction));
chunk_ids.push(chunk_id);
}
self.fs.chunk_store.batch_insert_new(chunk_updates)?;
self.fs.corrections.batch_insert_new(corrections)?;
let total_size: usize = chunks.iter().map(|c| c.len()).sum();
let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
let mut file_entry =
VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());
if self.fs.is_holographic() {
file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
}
let version = if let Some(expected) = expected_version {
let existing = self
.fs
.manifest
.get_file(path)
.ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
if existing.0.version != expected {
return Err(EmbrFSError::VersionMismatch {
expected,
actual: existing.0.version,
});
}
self.fs.manifest.update_file(path, file_entry, expected)?;
expected + 1
} else {
self.fs.manifest.add_file(file_entry)?;
0
};
self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;
Ok(LargeFileResult {
path: path.to_string(),
total_bytes: total_size,
chunk_count: chunk_ids.len(),
version,
correction_bytes: total_correction_bytes,
hierarchy_levels: 1,
sub_engram_count: 1,
chunk_size_used: chunk_size,
})
}
fn write_hierarchical(
&self,
path: &str,
chunks: &[&[u8]],
expected_version: Option<u64>,
chunk_size: usize,
) -> Result<LargeFileResult, EmbrFSError> {
let mut chunk_ids = Vec::new();
let mut chunk_updates = Vec::new();
let mut corrections = Vec::new();
let mut total_correction_bytes = 0usize;
let mut level0_vectors: Vec<SparseVec> = Vec::new();
for chunk_data in chunks {
let chunk_id = self.fs.allocate_chunk_id();
let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));
let decoded = self
.fs
.decode_chunk(&chunk_vec, Some(path), chunk_data.len());
let mut hasher = Sha256::new();
hasher.update(chunk_data);
let hash = hasher.finalize();
let mut hash_bytes = [0u8; 8];
hash_bytes.copy_from_slice(&hash[0..8]);
let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
total_correction_bytes += correction.storage_size();
level0_vectors.push(chunk_vec.clone());
chunk_updates.push((
chunk_id,
VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
));
corrections.push((chunk_id as u64, correction));
chunk_ids.push(chunk_id);
}
let mut current_level = level0_vectors;
let mut hierarchy_levels = 1;
while current_level.len() > self.config.max_bundle_size {
let mut next_level = Vec::new();
for group in current_level.chunks(self.config.max_bundle_size) {
let mut sub_root = group[0].clone();
for vec in &group[1..] {
sub_root = sub_root.bundle(vec);
}
next_level.push(sub_root);
}
current_level = next_level;
hierarchy_levels += 1;
}
let sub_engram_count = current_level.len();
self.fs.chunk_store.batch_insert_new(chunk_updates)?;
self.fs.corrections.batch_insert_new(corrections)?;
let total_size: usize = chunks.iter().map(|c| c.len()).sum();
let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
let mut file_entry =
VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());
if self.fs.is_holographic() {
file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
}
let version = if let Some(expected) = expected_version {
let existing = self
.fs
.manifest
.get_file(path)
.ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
if existing.0.version != expected {
return Err(EmbrFSError::VersionMismatch {
expected,
actual: existing.0.version,
});
}
self.fs.manifest.update_file(path, file_entry, expected)?;
expected + 1
} else {
self.fs.manifest.add_file(file_entry)?;
0
};
self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;
Ok(LargeFileResult {
path: path.to_string(),
total_bytes: total_size,
chunk_count: chunk_ids.len(),
version,
correction_bytes: total_correction_bytes,
hierarchy_levels,
sub_engram_count,
chunk_size_used: chunk_size,
})
}
}
/// Summary of a completed large-file write.
#[derive(Debug, Clone)]
pub struct LargeFileResult {
    /// Path the file was written under.
    pub path: String,
    /// Total payload bytes written.
    pub total_bytes: usize,
    /// Number of chunks the payload was split into.
    pub chunk_count: usize,
    /// Manifest version assigned by the write (0 for a brand-new file).
    pub version: u64,
    /// Bytes of correction data stored alongside the encoded chunks.
    pub correction_bytes: usize,
    /// Bundle levels used (1 = flat encoding).
    pub hierarchy_levels: usize,
    /// Sub-engrams at the top level (1 for flat encoding).
    pub sub_engram_count: usize,
    /// Chunk size in bytes actually used to split the payload.
    pub chunk_size_used: usize,
}
impl LargeFileResult {
    /// Correction overhead as a fraction of the total payload size.
    /// An empty file reports 0.0 rather than dividing by zero.
    pub fn correction_ratio(&self) -> f64 {
        match self.total_bytes {
            0 => 0.0,
            n => self.correction_bytes as f64 / n as f64,
        }
    }

    /// True when correction overhead stays under 10% of the payload.
    pub fn is_acceptable_quality(&self) -> bool {
        self.correction_ratio() < 0.1
    }
}
/// Heuristic text detection: inspects at most the first 8 KiB and treats the
/// data as text when fewer than 5% of sampled bytes are control characters
/// (bytes < 0x20 other than '\n', '\r', '\t'). Empty input counts as text.
///
/// NOTE(review): bytes >= 0x20 — including DEL (0x7F) and all high bytes —
/// are never counted as non-printable, so high-byte binary data can
/// classify as text; confirm whether that is intended.
fn is_text_data_sample(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }
    let sample = &data[..data.len().min(8192)];
    let mut control_count = 0usize;
    for &b in sample {
        if b < 32 && !matches!(b, b'\n' | b'\r' | b'\t') {
            control_count += 1;
        }
    }
    (control_count as f64 / sample.len() as f64) < 0.05
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_entropy_calculation() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        // Cycling through all 256 byte values approximates a uniform distribution.
        let uniform_bytes: Vec<u8> = (0..256).cycle().take(1000).map(|x| x as u8).collect();
        assert!(
            handler.estimate_entropy(&uniform_bytes) > 0.9,
            "Uniform data should have high entropy"
        );

        // A constant byte stream has (near-)zero entropy.
        let constant_bytes = vec![0u8; 1000];
        assert!(
            handler.estimate_entropy(&constant_bytes) < 0.1,
            "Repetitive data should have low entropy"
        );

        // English text sits between the two extremes.
        let english = b"The quick brown fox jumps over the lazy dog. ".repeat(20);
        let text_entropy = handler.estimate_entropy(&english);
        assert!(
            text_entropy > 0.3 && text_entropy < 0.8,
            "Text should have medium entropy"
        );
    }

    #[test]
    fn test_adaptive_chunk_sizing() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        // Constant data -> low entropy -> largest chunk size.
        let constant = vec![42u8; 10000];
        assert_eq!(
            handler.calculate_optimal_chunk_size(&constant),
            LOW_ENTROPY_CHUNK_SIZE
        );

        // Pseudo-random spread over all byte values -> smallest chunk size.
        let varied: Vec<u8> = (0..10000).map(|i| (i * 7 % 256) as u8).collect();
        assert_eq!(
            handler.calculate_optimal_chunk_size(&varied),
            HIGH_ENTROPY_CHUNK_SIZE
        );
    }

    #[test]
    fn test_small_file_flat_encoding() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);
        let payload = b"Small file content";

        let result = handler.write_large_file("small.txt", payload, None).unwrap();
        assert_eq!(result.total_bytes, payload.len());
        assert_eq!(result.hierarchy_levels, 1);
        assert_eq!(result.sub_engram_count, 1);

        // Round-trip: the stored file must decode back to the original bytes.
        let (stored, _) = fs.read_file("small.txt").unwrap();
        assert_eq!(&stored[..], payload);
    }

    #[test]
    fn test_large_file_hierarchical_encoding() {
        let fs = VersionedEmbrFS::new();
        // A tiny bundle size forces the hierarchical path on a modest payload.
        let config = LargeFileConfig {
            max_bundle_size: 10,
            ..Default::default()
        };
        let handler = LargeFileHandler::with_config(&fs, config);
        let payload: Vec<u8> = (0..50000).map(|i| (i % 256) as u8).collect();

        let result = handler.write_large_file("large.bin", &payload, None).unwrap();
        assert_eq!(result.total_bytes, payload.len());
        assert!(
            result.hierarchy_levels > 1,
            "Should use hierarchical encoding"
        );
        assert!(result.chunk_count > 10);

        let (stored, _) = fs.read_file("large.bin").unwrap();
        assert_eq!(stored, payload);
    }
}