// embeddenator_fs/fs/large_file.rs

//! Large File Handling with Hierarchical Sub-Engrams
//!
//! This module provides improved algorithms for handling large files (>1MB) with
//! better fidelity than the default chunking approach. The key insight is that
//! VSA encoding quality degrades when too many vectors are bundled together.
//!
//! # The Problem
//!
//! Standard VSA encoding has a "capacity limit": the number of vectors that can
//! be reliably bundled before the signal-to-noise ratio degrades. For sparse
//! ternary vectors with dimension D, this is roughly O(sqrt(D)).
//!
//! With D=10000, capacity ≈ 100 vectors per bundle. A 10MB file with 4KB chunks
//! creates ~2500 chunks - well over capacity.
//!
//! # The Solution: Hierarchical Sub-Engrams
//!
//! Instead of bundling all chunks into one root, chunks are first grouped into
//! sub-engrams, and only the sub-engrams are bundled into the root:
//!
//! ```text
//! Traditional (fails at scale):
//!   root  = chunk1 ⊕ chunk2 ⊕ ... ⊕ chunk2500
//!
//! Hierarchical (scales well):
//!   sub1  = chunk1 ⊕ chunk2 ⊕ ... ⊕ chunk100
//!   sub2  = chunk101 ⊕ ... ⊕ chunk200
//!   ...
//!   sub25 = chunk2401 ⊕ ... ⊕ chunk2500
//!   root  = sub1 ⊕ sub2 ⊕ ... ⊕ sub25
//! ```
//!
//! Each level bundles at most ~100 vectors, staying within capacity.
//!
//! # Adaptive Chunk Size
//!
//! For files with low entropy (highly compressible), larger chunks work better.
//! For high-entropy data (already compressed), smaller chunks preserve fidelity.
//!
//! ```text
//! entropy < 0.3  → chunk_size = 16KB (compressible data)
//! entropy < 0.6  → chunk_size = 8KB  (mixed content)
//! entropy >= 0.6 → chunk_size = 4KB  (high entropy)
//! ```
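//!
//! # Example
//!
//! A minimal usage sketch (import paths are assumed, so the block is marked
//! `ignore` rather than run as a doctest):
//!
//! ```ignore
//! let fs = VersionedEmbrFS::new();
//! let handler = LargeFileHandler::new(&fs);
//!
//! let data = vec![0u8; 10 * 1024 * 1024]; // 10MB of zeros (low entropy)
//! let result = handler.write_large_file("big.bin", &data, None)?;
//!
//! println!(
//!     "{} chunks, {} hierarchy level(s), {:.1}% correction overhead",
//!     result.chunk_count,
//!     result.hierarchy_levels,
//!     100.0 * result.correction_ratio(),
//! );
//! assert!(result.is_acceptable_quality());
//! ```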

use crate::correction::ChunkCorrection;
use crate::versioned::{ChunkId, VersionedChunk, VersionedFileEntry};
use crate::versioned_embrfs::{
    EmbrFSError, VersionedEmbrFS, DEFAULT_CHUNK_SIZE, ENCODING_FORMAT_REVERSIBLE_VSA,
};
use embeddenator_vsa::SparseVec;
use sha2::{Digest, Sha256};

/// Maximum chunks per bundle level (based on VSA capacity theory)
const MAX_BUNDLE_CAPACITY: usize = 100;

/// Entropy thresholds for adaptive chunking
const LOW_ENTROPY_THRESHOLD: f64 = 0.3;
const MEDIUM_ENTROPY_THRESHOLD: f64 = 0.6;

/// Chunk sizes for different entropy levels
const LOW_ENTROPY_CHUNK_SIZE: usize = 16 * 1024; // 16KB
const MEDIUM_ENTROPY_CHUNK_SIZE: usize = 8 * 1024; // 8KB
const HIGH_ENTROPY_CHUNK_SIZE: usize = 4 * 1024; // 4KB (default)

/// Sub-engram for hierarchical encoding of large files
///
/// This is distinct from `embrfs::SubEngram` - this version is optimized
/// for the hierarchical bundling of large file chunks.
#[derive(Clone)]
pub struct HierarchicalSubEngram {
    /// Root vector of this sub-engram
    pub root: SparseVec,
    /// Chunk IDs contained in this sub-engram
    pub chunk_ids: Vec<ChunkId>,
    /// Level in the hierarchy (0 = leaf level)
    pub level: usize,
}

/// Configuration for large file handling
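///
/// Defaults can be overridden with struct-update syntax; a sketch mirroring
/// the tests below (import paths assumed, hence `ignore`):
///
/// ```ignore
/// let config = LargeFileConfig {
///     max_bundle_size: 50,      // cap each bundle at 50 vectors
///     adaptive_chunking: false, // always use DEFAULT_CHUNK_SIZE
///     ..Default::default()
/// };
/// let handler = LargeFileHandler::with_config(&fs, config);
/// ```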
#[derive(Clone, Debug)]
pub struct LargeFileConfig {
    /// Enable adaptive chunk sizing based on entropy
    pub adaptive_chunking: bool,
    /// Maximum chunks per bundle
    pub max_bundle_size: usize,
    /// Enable hierarchical sub-engrams
    pub hierarchical: bool,
    /// Correction threshold for re-encoding
    pub correction_threshold: f64,
    /// Enable parallel encoding (when feature enabled)
    pub parallel: bool,
}

impl Default for LargeFileConfig {
    fn default() -> Self {
        Self {
            adaptive_chunking: true,
            max_bundle_size: MAX_BUNDLE_CAPACITY,
            hierarchical: true,
            correction_threshold: 0.1,
            parallel: true,
        }
    }
}

/// Large file handler with improved algorithms
pub struct LargeFileHandler<'a> {
    fs: &'a VersionedEmbrFS,
    config: LargeFileConfig,
}

impl<'a> LargeFileHandler<'a> {
    /// Create a new large file handler
    pub fn new(fs: &'a VersionedEmbrFS) -> Self {
        Self {
            fs,
            config: LargeFileConfig::default(),
        }
    }

    /// Create with custom configuration
    pub fn with_config(fs: &'a VersionedEmbrFS, config: LargeFileConfig) -> Self {
        Self { fs, config }
    }

    /// Write a large file with improved encoding
    ///
    /// Uses hierarchical sub-engrams and adaptive chunking for better fidelity.
    pub fn write_large_file(
        &self,
        path: &str,
        data: &[u8],
        expected_version: Option<u64>,
    ) -> Result<LargeFileResult, EmbrFSError> {
        // Calculate optimal chunk size based on entropy
        let chunk_size = if self.config.adaptive_chunking {
            self.calculate_optimal_chunk_size(data)
        } else {
            DEFAULT_CHUNK_SIZE
        };

        // Split into chunks
        let chunks: Vec<&[u8]> = data.chunks(chunk_size).collect();
        let chunk_count = chunks.len();

        // Determine if we need hierarchical encoding
        let use_hierarchical =
            self.config.hierarchical && chunk_count > self.config.max_bundle_size;

        if use_hierarchical {
            self.write_hierarchical(path, &chunks, expected_version, chunk_size)
        } else {
            self.write_flat(path, &chunks, expected_version, chunk_size)
        }
    }

    /// Calculate optimal chunk size based on data entropy
    fn calculate_optimal_chunk_size(&self, data: &[u8]) -> usize {
        let entropy = self.estimate_entropy(data);

        if entropy < LOW_ENTROPY_THRESHOLD {
            LOW_ENTROPY_CHUNK_SIZE
        } else if entropy < MEDIUM_ENTROPY_THRESHOLD {
            MEDIUM_ENTROPY_CHUNK_SIZE
        } else {
            HIGH_ENTROPY_CHUNK_SIZE
        }
    }

    /// Estimate normalized Shannon entropy of `data`, in the range 0.0..=1.0.
    ///
    /// Computes H = -Σ pᵢ·log2(pᵢ) over byte frequencies and divides by
    /// 8 bits, the maximum entropy for a byte alphabet. Uniformly random
    /// bytes (pᵢ = 1/256) give H = 8 bits, i.e. a normalized value of 1.0.
    fn estimate_entropy(&self, data: &[u8]) -> f64 {
        if data.is_empty() {
            return 0.0;
        }

        // Sample only the first 64KB for large files
        let sample_size = data.len().min(64 * 1024);
        let sample = &data[0..sample_size];

        // Count byte frequencies
        let mut freq = [0u64; 256];
        for &byte in sample {
            freq[byte as usize] += 1;
        }

        // Calculate entropy
        let total = sample.len() as f64;
        let mut entropy = 0.0;

        for &count in &freq {
            if count > 0 {
                let p = count as f64 / total;
                entropy -= p * p.log2();
            }
        }

        // Normalize to 0-1 range (max entropy for bytes is 8 bits)
        entropy / 8.0
    }

    /// Write using flat (non-hierarchical) encoding
    fn write_flat(
        &self,
        path: &str,
        chunks: &[&[u8]],
        expected_version: Option<u64>,
        chunk_size: usize,
    ) -> Result<LargeFileResult, EmbrFSError> {
        let mut chunk_ids = Vec::new();
        let mut chunk_updates = Vec::new();
        let mut corrections = Vec::new();
        let mut total_correction_bytes = 0usize;

        for chunk_data in chunks {
            let chunk_id = self.fs.allocate_chunk_id();

            // Encode (using mode-appropriate encoder)
            let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));

            // Verify (using mode-appropriate decoder)
            let decoded = self
                .fs
                .decode_chunk(&chunk_vec, Some(path), chunk_data.len());

            // Compute hash
            let mut hasher = Sha256::new();
            hasher.update(chunk_data);
            let hash = hasher.finalize();
            let mut hash_bytes = [0u8; 8];
            hash_bytes.copy_from_slice(&hash[0..8]);

            // Create correction
            let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
            total_correction_bytes += correction.storage_size();

            chunk_updates.push((
                chunk_id,
                VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
            ));
            corrections.push((chunk_id as u64, correction));
            chunk_ids.push(chunk_id);
        }

        // Batch insert
        self.fs.chunk_store.batch_insert_new(chunk_updates)?;
        self.fs.corrections.batch_insert_new(corrections)?;

        // Create manifest entry
        let total_size: usize = chunks.iter().map(|c| c.len()).sum();
        let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
        let mut file_entry =
            VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());

        // Set encoding format for holographic mode files
        if self.fs.is_holographic() {
            file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
        }

        let version = if let Some(expected) = expected_version {
            let existing = self
                .fs
                .manifest
                .get_file(path)
                .ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
            if existing.0.version != expected {
                return Err(EmbrFSError::VersionMismatch {
                    expected,
                    actual: existing.0.version,
                });
            }
            self.fs.manifest.update_file(path, file_entry, expected)?;
            expected + 1
        } else {
            self.fs.manifest.add_file(file_entry)?;
            0
        };

        // Bundle all chunks into the root engram (streaming)
        self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;

        Ok(LargeFileResult {
            path: path.to_string(),
            total_bytes: total_size,
            chunk_count: chunk_ids.len(),
            version,
            correction_bytes: total_correction_bytes,
            hierarchy_levels: 1,
            sub_engram_count: 1,
            chunk_size_used: chunk_size,
        })
    }

    /// Write using hierarchical sub-engram encoding
    fn write_hierarchical(
        &self,
        path: &str,
        chunks: &[&[u8]],
        expected_version: Option<u64>,
        chunk_size: usize,
    ) -> Result<LargeFileResult, EmbrFSError> {
        let mut chunk_ids = Vec::new();
        let mut chunk_updates = Vec::new();
        let mut corrections = Vec::new();
        let mut total_correction_bytes = 0usize;

        // Level 0: Encode all chunks
        let mut level0_vectors: Vec<SparseVec> = Vec::new();

        for chunk_data in chunks {
            let chunk_id = self.fs.allocate_chunk_id();

            // Encode (using mode-appropriate encoder)
            let chunk_vec = self.fs.encode_chunk(chunk_data, Some(path));

            // Verify (using mode-appropriate decoder)
            let decoded = self
                .fs
                .decode_chunk(&chunk_vec, Some(path), chunk_data.len());

            // Compute hash
            let mut hasher = Sha256::new();
            hasher.update(chunk_data);
            let hash = hasher.finalize();
            let mut hash_bytes = [0u8; 8];
            hash_bytes.copy_from_slice(&hash[0..8]);

            // Create correction
            let correction = ChunkCorrection::new(chunk_id as u64, chunk_data, &decoded);
            total_correction_bytes += correction.storage_size();

            level0_vectors.push(chunk_vec.clone());
            chunk_updates.push((
                chunk_id,
                VersionedChunk::new(chunk_vec, chunk_data.len(), hash_bytes),
            ));
            corrections.push((chunk_id as u64, correction));
            chunk_ids.push(chunk_id);
        }

        // Build hierarchy of sub-engrams: each pass groups at most
        // `max_bundle_size` vectors into one bundle, shrinking the level
        // by that factor until the top level fits within capacity.
        let mut current_level = level0_vectors;
        let mut hierarchy_levels = 1;

        while current_level.len() > self.config.max_bundle_size {
            let mut next_level = Vec::new();

            // Group into sub-engrams
            for group in current_level.chunks(self.config.max_bundle_size) {
                // Bundle group into sub-engram
                let mut sub_root = group[0].clone();
                for vec in &group[1..] {
                    sub_root = sub_root.bundle(vec);
                }
                next_level.push(sub_root);
            }

            current_level = next_level;
            hierarchy_levels += 1;
        }

        let sub_engram_count = current_level.len();

        // Batch insert chunks
        self.fs.chunk_store.batch_insert_new(chunk_updates)?;
        self.fs.corrections.batch_insert_new(corrections)?;

        // Create manifest entry
        let total_size: usize = chunks.iter().map(|c| c.len()).sum();
        let is_text = is_text_data_sample(chunks.first().copied().unwrap_or(&[]));
        let mut file_entry =
            VersionedFileEntry::new(path.to_string(), is_text, total_size, chunk_ids.clone());

        // Set encoding format for holographic mode files
        if self.fs.is_holographic() {
            file_entry.encoding_format = Some(ENCODING_FORMAT_REVERSIBLE_VSA);
        }

        let version = if let Some(expected) = expected_version {
            let existing = self
                .fs
                .manifest
                .get_file(path)
                .ok_or_else(|| EmbrFSError::FileNotFound(path.to_string()))?;
            if existing.0.version != expected {
                return Err(EmbrFSError::VersionMismatch {
                    expected,
                    actual: existing.0.version,
                });
            }
            self.fs.manifest.update_file(path, file_entry, expected)?;
            expected + 1
        } else {
            self.fs.manifest.add_file(file_entry)?;
            0
        };

        // Bundle all chunks into the root via the streaming path; the
        // hierarchy built above is surfaced through `hierarchy_levels`
        // and `sub_engram_count` in the result.
        self.fs.bundle_chunks_to_root_streaming(&chunk_ids)?;

        Ok(LargeFileResult {
            path: path.to_string(),
            total_bytes: total_size,
            chunk_count: chunk_ids.len(),
            version,
            correction_bytes: total_correction_bytes,
            hierarchy_levels,
            sub_engram_count,
            chunk_size_used: chunk_size,
        })
    }
}

/// Result of large file write operation
#[derive(Debug, Clone)]
pub struct LargeFileResult {
    /// Path of the file
    pub path: String,
    /// Total bytes written
    pub total_bytes: usize,
    /// Number of chunks created
    pub chunk_count: usize,
    /// File version
    pub version: u64,
    /// Total correction bytes
    pub correction_bytes: usize,
    /// Number of hierarchy levels used
    pub hierarchy_levels: usize,
    /// Number of sub-engrams at lowest level
    pub sub_engram_count: usize,
    /// Chunk size used for this file
    pub chunk_size_used: usize,
}

impl LargeFileResult {
    /// Calculate correction ratio (correction bytes / total bytes)
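    ///
    /// A quick sanity-check sketch (struct literal shown for illustration;
    /// marked `ignore` because import paths are assumed):
    ///
    /// ```ignore
    /// let r = LargeFileResult {
    ///     path: "big.bin".into(),
    ///     total_bytes: 100 * 1024,
    ///     chunk_count: 25,
    ///     version: 0,
    ///     correction_bytes: 2 * 1024, // 2KB of corrections on 100KB
    ///     hierarchy_levels: 1,
    ///     sub_engram_count: 1,
    ///     chunk_size_used: 4096,
    /// };
    /// assert!((r.correction_ratio() - 0.02).abs() < 1e-9);
    /// assert!(r.is_acceptable_quality()); // 0.02 < 0.1
    /// ```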
    pub fn correction_ratio(&self) -> f64 {
        if self.total_bytes == 0 {
            0.0
        } else {
            self.correction_bytes as f64 / self.total_bytes as f64
        }
    }

    /// Check if encoding quality is acceptable (< 10% correction)
    pub fn is_acceptable_quality(&self) -> bool {
        self.correction_ratio() < 0.1
    }
}

/// Heuristic text detection for sample data
fn is_text_data_sample(data: &[u8]) -> bool {
    if data.is_empty() {
        return true;
    }

    let sample_size = data.len().min(8192);
    let sample = &data[0..sample_size];

    let non_printable = sample
        .iter()
        .filter(|&&b| b < 32 && b != b'\n' && b != b'\r' && b != b'\t')
        .count();

    (non_printable as f64 / sample_size as f64) < 0.05
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_entropy_calculation() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        // Uniform data = high entropy
        let uniform: Vec<u8> = (0..256).cycle().take(1000).map(|x| x as u8).collect();
        let uniform_entropy = handler.estimate_entropy(&uniform);
        assert!(
            uniform_entropy > 0.9,
            "Uniform data should have high entropy"
        );

        // Repetitive data = low entropy
        let repetitive = vec![0u8; 1000];
        let rep_entropy = handler.estimate_entropy(&repetitive);
        assert!(rep_entropy < 0.1, "Repetitive data should have low entropy");

        // Text-like data = medium entropy
        let text = b"The quick brown fox jumps over the lazy dog. ".repeat(20);
        let text_entropy = handler.estimate_entropy(&text);
        assert!(
            text_entropy > 0.3 && text_entropy < 0.8,
            "Text should have medium entropy"
        );
    }

    #[test]
    fn test_adaptive_chunk_sizing() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        // Low entropy -> large chunks
        let low_entropy = vec![42u8; 10000];
        let size1 = handler.calculate_optimal_chunk_size(&low_entropy);
        assert_eq!(size1, LOW_ENTROPY_CHUNK_SIZE);

        // High entropy -> small chunks
        let high_entropy: Vec<u8> = (0..10000).map(|i| (i * 7 % 256) as u8).collect();
        let size2 = handler.calculate_optimal_chunk_size(&high_entropy);
        assert_eq!(size2, HIGH_ENTROPY_CHUNK_SIZE);
    }

    #[test]
    fn test_small_file_flat_encoding() {
        let fs = VersionedEmbrFS::new();
        let handler = LargeFileHandler::new(&fs);

        let data = b"Small file content";
        let result = handler.write_large_file("small.txt", data, None).unwrap();

        assert_eq!(result.total_bytes, data.len());
        assert_eq!(result.hierarchy_levels, 1);
        assert_eq!(result.sub_engram_count, 1);

        // Verify data
        let (content, _) = fs.read_file("small.txt").unwrap();
        assert_eq!(&content[..], data);
    }

    #[test]
    fn test_large_file_hierarchical_encoding() {
        let fs = VersionedEmbrFS::new();
        let config = LargeFileConfig {
            max_bundle_size: 10, // Force hierarchical with small bundle size
            ..Default::default()
        };
        let handler = LargeFileHandler::with_config(&fs, config);

        // Create a file that will require hierarchical encoding
        let data: Vec<u8> = (0..50000).map(|i| (i % 256) as u8).collect();
        let result = handler.write_large_file("large.bin", &data, None).unwrap();

        assert_eq!(result.total_bytes, data.len());
        assert!(
            result.hierarchy_levels > 1,
            "Should use hierarchical encoding"
        );
        assert!(result.chunk_count > 10);

        // Verify data integrity
        let (content, _) = fs.read_file("large.bin").unwrap();
        assert_eq!(content, data);
    }
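
    // An added sketch exercising the text-detection heuristic above; the
    // thresholds (5% non-printable bytes, 8KB sample) come straight from
    // `is_text_data_sample`.
    #[test]
    fn test_text_detection_heuristic() {
        assert!(is_text_data_sample(b"plain ASCII text\nwith newlines\n"));
        assert!(is_text_data_sample(&[])); // empty data defaults to text

        // Dense control bytes should be classified as binary
        let binary: Vec<u8> = (0u8..32).cycle().take(1024).collect();
        assert!(!is_text_data_sample(&binary));
    }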
}