1use crate::correction::ChunkCorrection;
45use crate::versioned::{ChunkId, VersionedChunk, VersionedFileEntry};
46use crate::versioned_embrfs::{
47 EmbrFSError, VersionedEmbrFS, DEFAULT_CHUNK_SIZE, ENCODING_FORMAT_REVERSIBLE_VSA,
48};
49use embeddenator_vsa::SparseVec;
50use sha2::{Digest, Sha256};
51
52const MAX_BUNDLE_CAPACITY: usize = 100;
54
55const LOW_ENTROPY_THRESHOLD: f64 = 0.3;
57const MEDIUM_ENTROPY_THRESHOLD: f64 = 0.6;
58
59const LOW_ENTROPY_CHUNK_SIZE: usize = 16 * 1024; const MEDIUM_ENTROPY_CHUNK_SIZE: usize = 8 * 1024; const HIGH_ENTROPY_CHUNK_SIZE: usize = 4 * 1024; #[derive(Clone)]
69pub struct HierarchicalSubEngram {
70 pub root: SparseVec,
72 pub chunk_ids: Vec<ChunkId>,
74 pub level: usize,
76}
77
/// Tuning knobs for [`LargeFileHandler`].
#[derive(Clone, Debug)]
pub struct LargeFileConfig {
    /// When true, pick the chunk size from a sampled entropy estimate of
    /// the input instead of using `DEFAULT_CHUNK_SIZE`.
    pub adaptive_chunking: bool,
    /// Maximum number of chunk vectors bundled per engram level; files with
    /// more chunks than this use hierarchical encoding (when enabled).
    pub max_bundle_size: usize,
    /// Enable multi-level (hierarchical) bundling for many-chunk files.
    pub hierarchical: bool,
    /// NOTE(review): not read anywhere in this file — presumably an upper
    /// bound on the acceptable correction ratio; confirm against callers.
    pub correction_threshold: f64,
    /// NOTE(review): not read anywhere in this file — presumably toggles
    /// parallel chunk encoding elsewhere; confirm against callers.
    pub parallel: bool,
}
92
93impl Default for LargeFileConfig {
94 fn default() -> Self {
95 Self {
96 adaptive_chunking: true,
97 max_bundle_size: MAX_BUNDLE_CAPACITY,
98 hierarchical: true,
99 correction_threshold: 0.1,
100 parallel: true,
101 }
102 }
103}
104
/// Writes large files into a [`VersionedEmbrFS`], choosing between flat and
/// hierarchical chunk bundling based on configuration and input size.
pub struct LargeFileHandler<'a> {
    /// Target filesystem; borrowed for the handler's lifetime.
    fs: &'a VersionedEmbrFS,
    /// Behavior configuration (see [`LargeFileConfig`]).
    config: LargeFileConfig,
}
110
111impl<'a> LargeFileHandler<'a> {
112 pub fn new(fs: &'a VersionedEmbrFS) -> Self {
114 Self {
115 fs,
116 config: LargeFileConfig::default(),
117 }
118 }
119
120 pub fn with_config(fs: &'a VersionedEmbrFS, config: LargeFileConfig) -> Self {
122 Self { fs, config }
123 }
124
125 pub fn write_large_file(
129 &self,
130 path: &str,
131 data: &[u8],
132 expected_version: Option<u64>,
133 ) -> Result<LargeFileResult, EmbrFSError> {
134 let chunk_size = if self.config.adaptive_chunking {
136 self.calculate_optimal_chunk_size(data)
137 } else {
138 DEFAULT_CHUNK_SIZE
139 };
140
141 let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
143 let chunk_count = chunks.len();
144
145 let use_hierarchical =
147 self.config.hierarchical && chunk_count > self.config.max_bundle_size;
148
149 if use_hierarchical {
150 self.write_hierarchical(path, &chunks, expected_version, chunk_size)
151 } else {
152 self.write_flat(path, &chunks, expected_version, chunk_size)
153 }
154 }
155
156 fn calculate_optimal_chunk_size(&self, data: &[u8]) -> usize {
158 let entropy = self.estimate_entropy(data);
159
160 if entropy < LOW_ENTROPY_THRESHOLD {
161 LOW_ENTROPY_CHUNK_SIZE
162 } else if entropy < MEDIUM_ENTROPY_THRESHOLD {
163 MEDIUM_ENTROPY_CHUNK_SIZE
164 } else {
165 HIGH_ENTROPY_CHUNK_SIZE
166 }
167 }
168
169 fn estimate_entropy(&self, data: &[u8]) -> f64 {
171 if data.is_empty() {
172 return 0.0;
173 }
174
175 let sample_size = data.len().min(64 * 1024);
177 let sample = &data[0..sample_size];
178
179 let mut freq = [0u64; 256];
181 for &byte in sample {
182 freq[byte as usize] += 1;
183 }
184
185 let total = sample.len() as f64;
187 let mut entropy = 0.0;
188
189 for &count in &freq {
190 if count > 0 {
191 let p = count as f64 / total;
192 entropy -= p * p.log2();
193 }
194 }
195
196 entropy / 8.0
198 }
199
200 fn write_flat(
202 &self,
203 path: &str,
204 chunks: &[&[u8]],
205 expected_version: Option<u64>,
206 chunk_size: usize,
207 ) -> Result<LargeFileResult, EmbrFSError> {
208 let mut chunk_ids = Vec::new();
209 let mut chunk_updates = Vec::new();
210 let mut corrections = Vec::new();
211 let mut total_correction_bytes = 0usize;
212
213 for chunk_data in chunks {
214 let chunk_id = self.fs.allocate_chunk_id();
215
216 let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));
218
219 let decoded = self
221 .fs
222 .decode_chunk(&chunk_vec, Some(path), chunk_data.len());
223
224 let mut hasher = Sha256::new();
226 hasher.update(chunk_data);
227 let hash = hasher.finalize();
228 let mut hash_bytes = [0u8; 8];
229 hash_bytes.copy_from_slice(&hash[0..8]);
230
231 let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
233 total_correction_bytes += correction.storage_size();
234
235 chunk_updates.push((
236 chunk_id,
237 VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
238 ));
239 corrections.push((chunk_id as u64, correction));
240 chunk_ids.push(chunk_id);
241 }
242
243 self.fs.chunk_store.batch_insert_new(chunk_updates)?;
245 self.fs.corrections.batch_insert_new(corrections)?;
246
247 let total_size: usize = chunks.iter().map(|c| c.len()).sum();
249 let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
250 let mut file_entry =
251 VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());
252
253 if self.fs.is_holographic() {
255 file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
256 }
257
258 let version = if let Some(expected) = expected_version {
259 let existing = self
260 .fs
261 .manifest
262 .get_file(path)
263 .ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
264 if existing.0.version != expected {
265 return Err(EmbrFSError::VersionMismatch {
266 expected,
267 actual: existing.0.version,
268 });
269 }
270 self.fs.manifest.update_file(path, file_entry, expected)?;
271 expected + 1
272 } else {
273 self.fs.manifest.add_file(file_entry)?;
274 0
275 };
276
277 self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;
279
280 Ok(LargeFileResult {
281 path: path.to_string(),
282 total_bytes: total_size,
283 chunk_count: chunk_ids.len(),
284 version,
285 correction_bytes: total_correction_bytes,
286 hierarchy_levels: 1,
287 sub_engram_count: 1,
288 chunk_size_used: chunk_size,
289 })
290 }
291
292 fn write_hierarchical(
294 &self,
295 path: &str,
296 chunks: &[&[u8]],
297 expected_version: Option<u64>,
298 chunk_size: usize,
299 ) -> Result<LargeFileResult, EmbrFSError> {
300 let mut chunk_ids = Vec::new();
301 let mut chunk_updates = Vec::new();
302 let mut corrections = Vec::new();
303 let mut total_correction_bytes = 0usize;
304
305 let mut level0_vectors: Vec<SparseVec> = Vec::new();
307
308 for chunk_data in chunks {
309 let chunk_id = self.fs.allocate_chunk_id();
310
311 let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));
313
314 let decoded = self
316 .fs
317 .decode_chunk(&chunk_vec, Some(path), chunk_data.len());
318
319 let mut hasher = Sha256::new();
321 hasher.update(chunk_data);
322 let hash = hasher.finalize();
323 let mut hash_bytes = [0u8; 8];
324 hash_bytes.copy_from_slice(&hash[0..8]);
325
326 let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
328 total_correction_bytes += correction.storage_size();
329
330 level0_vectors.push(chunk_vec.clone());
331 chunk_updates.push((
332 chunk_id,
333 VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
334 ));
335 corrections.push((chunk_id as u64, correction));
336 chunk_ids.push(chunk_id);
337 }
338
339 let mut current_level = level0_vectors;
341 let mut hierarchy_levels = 1;
342
343 while current_level.len() > self.config.max_bundle_size {
344 let mut next_level = Vec::new();
345
346 for group in current_level.chunks(self.config.max_bundle_size) {
348 let mut sub_root = group[0].clone();
350 for vec in &group[1..] {
351 sub_root = sub_root.bundle(vec);
352 }
353 next_level.push(sub_root);
354 }
355
356 current_level = next_level;
357 hierarchy_levels += 1;
358 }
359
360 let sub_engram_count = current_level.len();
361
362 self.fs.chunk_store.batch_insert_new(chunk_updates)?;
364 self.fs.corrections.batch_insert_new(corrections)?;
365
366 let total_size: usize = chunks.iter().map(|c| c.len()).sum();
368 let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
369 let mut file_entry =
370 VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());
371
372 if self.fs.is_holographic() {
374 file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
375 }
376
377 let version = if let Some(expected) = expected_version {
378 let existing = self
379 .fs
380 .manifest
381 .get_file(path)
382 .ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
383 if existing.0.version != expected {
384 return Err(EmbrFSError::VersionMismatch {
385 expected,
386 actual: existing.0.version,
387 });
388 }
389 self.fs.manifest.update_file(path, file_entry, expected)?;
390 expected + 1
391 } else {
392 self.fs.manifest.add_file(file_entry)?;
393 0
394 };
395
396 self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;
398
399 Ok(LargeFileResult {
400 path: path.to_string(),
401 total_bytes: total_size,
402 chunk_count: chunk_ids.len(),
403 version,
404 correction_bytes: total_correction_bytes,
405 hierarchy_levels,
406 sub_engram_count,
407 chunk_size_used: chunk_size,
408 })
409 }
410}
411
/// Summary statistics for one large-file write.
#[derive(Debug, Clone)]
pub struct LargeFileResult {
    /// Path the file was written under.
    pub path: String,
    /// Total payload size in bytes.
    pub total_bytes: usize,
    /// Number of chunks the payload was split into.
    pub chunk_count: usize,
    /// Manifest version assigned by this write (0 for a newly added file).
    pub version: u64,
    /// Total bytes of correction data stored alongside the chunks.
    pub correction_bytes: usize,
    /// Number of bundling levels used (1 = flat encoding).
    pub hierarchy_levels: usize,
    /// Number of sub-engrams at the top bundling level (1 for flat).
    pub sub_engram_count: usize,
    /// Chunk size in bytes chosen for this write.
    pub chunk_size_used: usize,
}
432
433impl LargeFileResult {
434 pub fn correction_ratio(&self) -> f64 {
436 if self.total_bytes == 0 {
437 0.0
438 } else {
439 self.correction_bytes as f64 / self.total_bytes as f64
440 }
441 }
442
443 pub fn is_acceptable_quality(&self) -> bool {
445 self.correction_ratio() < 0.1
446 }
447}
448
/// Heuristically classifies `data` as text by sampling up to its first
/// 8 KiB: it is "text" when fewer than 5% of the sampled bytes are control
/// characters other than `\n`, `\r`, and `\t`. Empty input counts as text.
fn is_text_data_sample(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    // Cap the inspected prefix at 8 KiB.
    let sample = &data[..data.len().min(8192)];

    // Count control bytes, excluding common whitespace controls.
    let control_count = sample
        .iter()
        .copied()
        .filter(|&b| b < 32 && !matches!(b, b'\n' | b'\r' | b'\t'))
        .count();

    (control_count as f64 / sample.len() as f64) < 0.05
}
465
#[cfg(test)]
mod tests {
    use super::*;

    /// Entropy estimates for uniform, constant, and English-text inputs.
    #[test]
    fn test_entropy_calculation() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        // Near-uniform byte distribution -> entropy close to 1.0.
        let uniform: Vec<u8> = (0..256).cycle().take(1000).map(|x| x as u8).collect();
        assert!(
            handler.estimate_entropy(&uniform) > 0.9,
            "Uniform data should have high entropy"
        );

        // A single repeated byte -> entropy close to 0.0.
        let constant = vec![0u8; 1000];
        assert!(
            handler.estimate_entropy(&constant) < 0.1,
            "Repetitive data should have low entropy"
        );

        // Natural-language text lands between the extremes.
        let prose = b"The quick brown fox jumps over the lazy dog. ".repeat(20);
        let prose_entropy = handler.estimate_entropy(&prose);
        assert!(
            prose_entropy > 0.3 && prose_entropy < 0.8,
            "Text should have medium entropy"
        );
    }

    /// Chunk size selection tracks the entropy estimate.
    #[test]
    fn test_adaptive_chunk_sizing() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        let constant_data = vec![42u8; 10000];
        assert_eq!(
            handler.calculate_optimal_chunk_size(&constant_data),
            LOW_ENTROPY_CHUNK_SIZE
        );

        let varied_data: Vec<u8> = (0..10000).map(|i| (i * 7 % 256) as u8).collect();
        assert_eq!(
            handler.calculate_optimal_chunk_size(&varied_data),
            HIGH_ENTROPY_CHUNK_SIZE
        );
    }

    /// A file smaller than one chunk round-trips via the flat path.
    #[test]
    fn test_small_file_flat_encoding() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        let payload = b"Small file content";
        let result = handler
            .write_large_file("small.txt", payload, None)
            .unwrap();

        assert_eq!(result.total_bytes, payload.len());
        assert_eq!(result.hierarchy_levels, 1);
        assert_eq!(result.sub_engram_count, 1);

        let (content, _) = fs.read_file("small.txt").unwrap();
        assert_eq!(&content[..], payload);
    }

    /// Many chunks with a tiny bundle cap forces multi-level bundling.
    #[test]
    fn test_large_file_hierarchical_encoding() {
        let fs = VersionedEmbrFS::new();
        let config = LargeFileConfig {
            max_bundle_size: 10,
            ..Default::default()
        };
        let handler = LargeFileHandler::with_config(&fs, config);

        let payload: Vec<u8> = (0..50000).map(|i| (i % 256) as u8).collect();
        let result = handler
            .write_large_file("large.bin", &payload, None)
            .unwrap();

        assert_eq!(result.total_bytes, payload.len());
        assert!(
            result.hierarchy_levels > 1,
            "Should use hierarchical encoding"
        );
        assert!(result.chunk_count > 10);

        let (content, _) = fs.read_file("large.bin").unwrap();
        assert_eq!(content, payload);
    }
}