engram_rs/archive/
format.rs

1use crate::error::{EngramError, Result};
2use std::io::{Read, Write};
3
/// File signature: 0x89 'E' 'N' 'G' 0x0D 0x0A 0x1A 0x0A.
///
/// Follows the PNG signature pattern for corruption detection.
pub const MAGIC_NUMBER: [u8; 8] = *b"\x89ENG\r\n\x1a\n";

/// Major component of the current format version (v1.0 with LOCA, ENDR, and frame
/// compression).
pub const FORMAT_VERSION_MAJOR: u16 = 1;
/// Minor component of the current format version.
pub const FORMAT_VERSION_MINOR: u16 = 0;

/// Header size in bytes (64).
///
/// Spelled out as the serialized field widths: magic (8), version major and minor
/// (2 each), header CRC (4), central directory offset and size (8 each), entry count (4),
/// content version (4), flags (4) and reserved padding (20).
pub const HEADER_SIZE: usize = 8 + 2 + 2 + 4 + 8 + 8 + 4 + 4 + 4 + 20;

/// Central Directory entry size in bytes (320).
///
/// Spelled out as the serialized field widths: signature (4), data offset (8),
/// uncompressed size (8), compressed size (8), CRC-32 (4), modified time (8),
/// compression byte (1), flags byte (1), path length (2), path field (256) and
/// reserved padding (20).
pub const CD_ENTRY_SIZE: usize = 4 + 8 + 8 + 8 + 4 + 8 + 1 + 1 + 2 + 256 + 20;

/// Maximum path length in bytes (UTF-8).
pub const MAX_PATH_LENGTH: usize = 255;
20
/// Compression methods supported.
///
/// The enum is `#[repr(u8)]`, so each variant casts to the single byte shown below.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CompressionMethod {
    /// No compression.
    None = 0x00,
    /// LZ4 compression.
    Lz4 = 0x01,
    /// Zstd compression.
    Zstd = 0x02,
}
29
/// Encryption modes.
///
/// The enum is `#[repr(u8)]`; each discriminant is the two-bit value shown below.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EncryptionMode {
    /// No encryption
    None = 0,
    /// Entire archive encrypted (for backups/secure storage)
    Archive = 1,
    /// Each file encrypted individually (for queryable archives)
    PerFile = 2,
}
41
42impl CompressionMethod {
43    pub fn from_u8(value: u8) -> Result<Self> {
44        match value {
45            0 => Ok(Self::None),
46            1 => Ok(Self::Lz4),
47            2 => Ok(Self::Zstd),
48            _ => Err(EngramError::InvalidCompression(value)),
49        }
50    }
51
52    /// Choose best compression based on file type and size
53    pub fn choose_for_file(path: &str, size: u64) -> Self {
54        // Don't compress small files
55        if size < 4096 {
56            return Self::None;
57        }
58
59        // Check file extension
60        let path_lower = path.to_lowercase();
61
62        // Already compressed formats
63        if path_lower.ends_with(".png")
64            || path_lower.ends_with(".jpg")
65            || path_lower.ends_with(".jpeg")
66            || path_lower.ends_with(".gif")
67            || path_lower.ends_with(".mp3")
68            || path_lower.ends_with(".mp4")
69            || path_lower.ends_with(".zip")
70            || path_lower.ends_with(".gz")
71            || path_lower.ends_with(".7z")
72        {
73            return Self::None;
74        }
75
76        // Use Zstd for text/structured data (better compression)
77        if path_lower.ends_with(".txt")
78            || path_lower.ends_with(".md")
79            || path_lower.ends_with(".json")
80            || path_lower.ends_with(".toml")
81            || path_lower.ends_with(".html")
82            || path_lower.ends_with(".css")
83        {
84            return Self::Zstd;
85        }
86
87        // TODO: BytePunch future support (semantic tokenization)
88        if path_lower.ends_with(".cml")
89            || path_lower.ends_with(".xml")
90            || path_lower.ends_with(".js")
91        {
92            // TODO: Implement BytePunch compression (CML -> BytePunch)
93            return Self::None;
94        }
95
96        if path_lower.ends_with(".card") {
97            // TODO: Implement DataSpool compression (BytePunch -> Card)
98            return Self::None;
99        }
100
101        // Use LZ4 for binary data (faster)
102        if path_lower.ends_with(".db")
103            || path_lower.ends_with(".sqlite")
104            || path_lower.ends_with(".wasm")
105        {
106            return Self::Lz4;
107        }
108
109        // Default: Zstd for good compression
110        Self::Zstd
111    }
112}
113
114impl EncryptionMode {
115    /// Extract encryption mode from flags field
116    pub fn from_flags(flags: u32) -> Self {
117        match flags & 0b11 {
118            0b00 => Self::None,
119            0b01 => Self::Archive,
120            0b10 => Self::PerFile,
121            _ => Self::None, // Reserved, treat as None
122        }
123    }
124
125    /// Convert encryption mode to flags bits
126    pub fn to_flags(self) -> u32 {
127        self as u32
128    }
129}
130
131/// File header at the beginning of the archive
132#[derive(Debug, Clone)]
133pub struct FileHeader {
134    pub version_major: u16,
135    pub version_minor: u16,
136    pub header_crc: u32,
137    pub central_directory_offset: u64,
138    pub central_directory_size: u64,
139    pub entry_count: u32,
140    pub content_version: u32,
141    pub flags: u32,
142}
143
144impl FileHeader {
145    pub fn new() -> Self {
146        Self {
147            version_major: FORMAT_VERSION_MAJOR,
148            version_minor: FORMAT_VERSION_MINOR,
149            header_crc: 0,
150            central_directory_offset: 0,
151            central_directory_size: 0,
152            entry_count: 0,
153            content_version: 0,
154            flags: 0,
155        }
156    }
157
158    /// Set encryption mode in flags
159    pub fn set_encryption_mode(&mut self, mode: EncryptionMode) {
160        self.flags = (self.flags & !0b11) | mode.to_flags();
161    }
162
163    /// Get encryption mode from flags
164    pub fn encryption_mode(&self) -> EncryptionMode {
165        EncryptionMode::from_flags(self.flags)
166    }
167
168    /// Write header to a writer
169    pub fn write_to<W: Write>(&self, mut writer: W) -> Result<()> {
170        writer.write_all(&MAGIC_NUMBER)?;
171        writer.write_all(&self.version_major.to_le_bytes())?;
172        writer.write_all(&self.version_minor.to_le_bytes())?;
173        writer.write_all(&self.header_crc.to_le_bytes())?;
174        writer.write_all(&self.central_directory_offset.to_le_bytes())?;
175        writer.write_all(&self.central_directory_size.to_le_bytes())?;
176        writer.write_all(&self.entry_count.to_le_bytes())?;
177        writer.write_all(&self.content_version.to_le_bytes())?;
178        writer.write_all(&self.flags.to_le_bytes())?;
179
180        // Write reserved bytes (20 bytes of zeros - was 24, now 20 due to flags)
181        writer.write_all(&[0u8; 20])?;
182
183        Ok(())
184    }
185
186    /// Read header from a reader
187    pub fn read_from<R: Read>(mut reader: R) -> Result<Self> {
188        let mut magic = [0u8; 8];
189        reader.read_exact(&mut magic)?;
190
191        if magic != MAGIC_NUMBER {
192            return Err(EngramError::InvalidMagic);
193        }
194
195        let version_major = read_u16(&mut reader)?;
196        let version_minor = read_u16(&mut reader)?;
197        let header_crc = read_u32(&mut reader)?;
198        let central_directory_offset = read_u64(&mut reader)?;
199        let central_directory_size = read_u64(&mut reader)?;
200        let entry_count = read_u32(&mut reader)?;
201        let content_version = read_u32(&mut reader)?;
202
203        // Read flags (v1.0+ or v0.4+)
204        let flags = if version_major >= 1 || version_minor >= 4 {
205            read_u32(&mut reader)?
206        } else {
207            // v0.3 compatibility: skip 4 bytes, flags = 0 (no encryption)
208            let mut skip = [0u8; 4];
209            reader.read_exact(&mut skip)?;
210            0
211        };
212
213        // Skip remaining reserved bytes (20 bytes for v0.4+, 20 bytes for v0.3)
214        let mut reserved = [0u8; 20];
215        reader.read_exact(&mut reserved)?;
216
217        Ok(Self {
218            version_major,
219            version_minor,
220            header_crc,
221            central_directory_offset,
222            central_directory_size,
223            entry_count,
224            content_version,
225            flags,
226        })
227    }
228
229    /// Validate version compatibility
230    pub fn validate_version(&self) -> Result<()> {
231        if self.version_major > FORMAT_VERSION_MAJOR {
232            return Err(EngramError::UnsupportedVersion(
233                self.version_major << 8 | self.version_minor,
234            ));
235        }
236        Ok(())
237    }
238}
239
240impl Default for FileHeader {
241    fn default() -> Self {
242        Self::new()
243    }
244}
245
246/// Central Directory entry metadata
247#[derive(Debug, Clone)]
248pub struct EntryInfo {
249    pub path: String,
250    pub data_offset: u64,
251    pub uncompressed_size: u64,
252    pub compressed_size: u64,
253    pub crc32: u32,
254    pub modified_time: u64,
255    pub compression: CompressionMethod,
256    pub flags: u8,
257}
258
259impl EntryInfo {
260    /// Write entry to central directory
261    pub fn write_to<W: Write>(&self, mut writer: W) -> Result<()> {
262        // Signature "CENT" (0x43454E54)
263        writer.write_all(&[0x43, 0x45, 0x4E, 0x54])?;
264
265        writer.write_all(&self.data_offset.to_le_bytes())?;
266        writer.write_all(&self.uncompressed_size.to_le_bytes())?;
267        writer.write_all(&self.compressed_size.to_le_bytes())?;
268        writer.write_all(&self.crc32.to_le_bytes())?;
269        writer.write_all(&self.modified_time.to_le_bytes())?;
270        writer.write_all(&[self.compression as u8])?;
271        writer.write_all(&[self.flags])?;
272
273        // Path length and path
274        let path_bytes = self.path.as_bytes();
275        if path_bytes.len() > MAX_PATH_LENGTH {
276            return Err(EngramError::PathError(format!(
277                "Path too long: {} bytes (max {})",
278                path_bytes.len(),
279                MAX_PATH_LENGTH
280            )));
281        }
282
283        let path_len = path_bytes.len() as u16;
284        writer.write_all(&path_len.to_le_bytes())?;
285
286        // Path buffer (256 bytes, null-terminated)
287        let mut path_buf = [0u8; 256];
288        path_buf[..path_bytes.len()].copy_from_slice(path_bytes);
289        writer.write_all(&path_buf)?;
290
291        // Reserved (20 bytes)
292        writer.write_all(&[0u8; 20])?;
293
294        Ok(())
295    }
296
297    /// Read entry from central directory
298    pub fn read_from<R: Read>(mut reader: R) -> Result<Self> {
299        // Read and verify signature
300        let mut sig = [0u8; 4];
301        reader.read_exact(&mut sig)?;
302        if sig != [0x43, 0x45, 0x4E, 0x54] {
303            return Err(EngramError::InvalidFormat(
304                "Invalid central directory entry signature".to_string(),
305            ));
306        }
307
308        let data_offset = read_u64(&mut reader)?;
309        let uncompressed_size = read_u64(&mut reader)?;
310        let compressed_size = read_u64(&mut reader)?;
311        let crc32 = read_u32(&mut reader)?;
312        let modified_time = read_u64(&mut reader)?;
313
314        let mut compression_byte = [0u8; 1];
315        reader.read_exact(&mut compression_byte)?;
316        let compression = CompressionMethod::from_u8(compression_byte[0])?;
317
318        let mut flags = [0u8; 1];
319        reader.read_exact(&mut flags)?;
320
321        let path_len = read_u16(&mut reader)?;
322
323        let mut path_buf = [0u8; 256];
324        reader.read_exact(&mut path_buf)?;
325
326        let path = String::from_utf8(path_buf[..path_len as usize].to_vec())
327            .map_err(|e| EngramError::PathError(format!("Invalid UTF-8 in path: {}", e)))?;
328
329        // Skip reserved bytes
330        let mut reserved = [0u8; 20];
331        reader.read_exact(&mut reserved)?;
332
333        Ok(Self {
334            path,
335            data_offset,
336            uncompressed_size,
337            compressed_size,
338            crc32,
339            modified_time,
340            compression,
341            flags: flags[0],
342        })
343    }
344}
345
346// Helper functions for reading primitive types
347fn read_u16<R: Read>(mut reader: R) -> Result<u16> {
348    let mut buf = [0u8; 2];
349    reader.read_exact(&mut buf)?;
350    Ok(u16::from_le_bytes(buf))
351}
352
353fn read_u32<R: Read>(mut reader: R) -> Result<u32> {
354    let mut buf = [0u8; 4];
355    reader.read_exact(&mut buf)?;
356    Ok(u32::from_le_bytes(buf))
357}
358
359fn read_u64<R: Read>(mut reader: R) -> Result<u64> {
360    let mut buf = [0u8; 8];
361    reader.read_exact(&mut buf)?;
362    Ok(u64::from_le_bytes(buf))
363}
364
#[cfg(test)]
mod tests {
    //! Unit tests for the on-disk format types: compression selection, encryption
    //! flag bits, and header / central-directory entry serialization.

    use super::*;

    #[test]
    fn test_compression_method_from_u8() {
        assert_eq!(
            CompressionMethod::from_u8(0).unwrap(),
            CompressionMethod::None
        );
        assert_eq!(
            CompressionMethod::from_u8(1).unwrap(),
            CompressionMethod::Lz4
        );
        assert_eq!(
            CompressionMethod::from_u8(2).unwrap(),
            CompressionMethod::Zstd
        );
        assert!(CompressionMethod::from_u8(3).is_err());
        assert!(CompressionMethod::from_u8(99).is_err());
    }

    #[test]
    fn test_compression_choice() {
        // Files >= 4096 bytes with text extensions should use Zstd
        assert_eq!(
            CompressionMethod::choose_for_file("test.txt", 5000),
            CompressionMethod::Zstd
        );
        assert_eq!(
            CompressionMethod::choose_for_file("test.json", 5000),
            CompressionMethod::Zstd
        );
        // Binary files use LZ4
        assert_eq!(
            CompressionMethod::choose_for_file("test.db", 10000),
            CompressionMethod::Lz4
        );
        // Already compressed files should not be compressed
        assert_eq!(
            CompressionMethod::choose_for_file("test.png", 5000),
            CompressionMethod::None
        );
        // Small files (< 4096 bytes) should not be compressed
        assert_eq!(
            CompressionMethod::choose_for_file("test.txt", 2000),
            CompressionMethod::None
        );
        assert_eq!(
            CompressionMethod::choose_for_file("test.txt", 500),
            CompressionMethod::None
        );
    }

    #[test]
    fn test_compression_choice_boundaries() {
        // The size threshold is exclusive: 4095 is "small", 4096 is not.
        assert_eq!(
            CompressionMethod::choose_for_file("test.txt", 4095),
            CompressionMethod::None
        );
        assert_eq!(
            CompressionMethod::choose_for_file("test.txt", 4096),
            CompressionMethod::Zstd
        );
        // Extension matching ignores case.
        assert_eq!(
            CompressionMethod::choose_for_file("PHOTO.PNG", 5000),
            CompressionMethod::None
        );
        // Unknown or missing extensions fall back to Zstd.
        assert_eq!(
            CompressionMethod::choose_for_file("data.bin", 5000),
            CompressionMethod::Zstd
        );
        assert_eq!(
            CompressionMethod::choose_for_file("README", 5000),
            CompressionMethod::Zstd
        );
    }

    #[test]
    fn test_encryption_mode_flags() {
        for mode in [
            EncryptionMode::None,
            EncryptionMode::Archive,
            EncryptionMode::PerFile,
        ]
        .iter()
        {
            assert_eq!(EncryptionMode::from_flags(mode.to_flags()), *mode);
        }
        // Reserved pattern 0b11 is treated as None; bits above the low two are ignored.
        assert_eq!(EncryptionMode::from_flags(0b11), EncryptionMode::None);
        assert_eq!(
            EncryptionMode::from_flags(0xFFFF_FF00 | 0b10),
            EncryptionMode::PerFile
        );

        // Setting the mode on a header must not disturb the other flag bits.
        let mut header = FileHeader::new();
        header.flags = 0x100;
        header.set_encryption_mode(EncryptionMode::Archive);
        assert_eq!(header.flags, 0x101);
        assert_eq!(header.encryption_mode(), EncryptionMode::Archive);
    }

    #[test]
    fn test_file_header_roundtrip() {
        let header = FileHeader {
            version_major: 0,
            version_minor: 4,
            header_crc: 0x12345678,
            central_directory_offset: 1024,
            central_directory_size: 512,
            entry_count: 10,
            content_version: 1,
            // Non-zero so that a dropped or misplaced flags field is detected.
            flags: 0b10,
        };

        let mut buf = Vec::new();
        header.write_to(&mut buf).unwrap();

        assert_eq!(buf.len(), HEADER_SIZE);

        let parsed = FileHeader::read_from(&buf[..]).unwrap();
        assert_eq!(parsed.version_major, header.version_major);
        assert_eq!(parsed.version_minor, header.version_minor);
        assert_eq!(parsed.header_crc, header.header_crc);
        assert_eq!(
            parsed.central_directory_offset,
            header.central_directory_offset
        );
        assert_eq!(parsed.central_directory_size, header.central_directory_size);
        assert_eq!(parsed.entry_count, header.entry_count);
        assert_eq!(parsed.content_version, header.content_version);
        assert_eq!(parsed.flags, header.flags);
        assert_eq!(parsed.encryption_mode(), EncryptionMode::PerFile);
    }

    #[test]
    fn test_file_header_rejects_bad_magic() {
        let buf = [0u8; HEADER_SIZE];
        assert!(FileHeader::read_from(&buf[..]).is_err());
    }

    #[test]
    fn test_entry_info_roundtrip() {
        let entry = EntryInfo {
            path: "test/file.txt".to_string(),
            data_offset: 1024,
            uncompressed_size: 5000,
            compressed_size: 2000,
            crc32: 0xDEADBEEF,
            modified_time: 1699999999,
            compression: CompressionMethod::Zstd,
            // Non-zero so that a dropped or misplaced flags byte is detected.
            flags: 0x01,
        };

        let mut buf = Vec::new();
        entry.write_to(&mut buf).unwrap();

        assert_eq!(buf.len(), CD_ENTRY_SIZE);

        let parsed = EntryInfo::read_from(&buf[..]).unwrap();
        assert_eq!(parsed.path, entry.path);
        assert_eq!(parsed.data_offset, entry.data_offset);
        assert_eq!(parsed.uncompressed_size, entry.uncompressed_size);
        assert_eq!(parsed.compressed_size, entry.compressed_size);
        assert_eq!(parsed.crc32, entry.crc32);
        assert_eq!(parsed.modified_time, entry.modified_time);
        assert_eq!(parsed.compression, entry.compression);
        assert_eq!(parsed.flags, entry.flags);
    }

    #[test]
    fn test_entry_info_path_too_long() {
        let entry = EntryInfo {
            path: "x".repeat(MAX_PATH_LENGTH + 1),
            data_offset: 0,
            uncompressed_size: 0,
            compressed_size: 0,
            crc32: 0,
            modified_time: 0,
            compression: CompressionMethod::None,
            flags: 0,
        };

        let mut buf = Vec::new();
        assert!(entry.write_to(&mut buf).is_err());
    }
}
473}