// parcode/format.rs

1//! Defines the physical binary layout of Parcode V4 files.
2//!
3//! This module specifies the on-disk representation of Parcode files, including the
4//! global header, chunk structure, and metadata encoding. Understanding this format
5//! is essential for implementing readers in other languages or debugging file corruption.
6//!
7//! ## File Format Overview (V4)
8//!
9//! Parcode V4 uses a "bottom-up" layout strategy where children are written before their
10//! parents. This enables streaming writes and allows the root chunk to contain a complete
11//! table of contents for the entire file.
12//!
13//! ### High-Level Structure
14//!
15//! ```text
16//! ┌──────────────────────────────────┐
17//! │ Chunk 0 (Leaf)                   │
18//! ├──────────────────────────────────┤
19//! │ Chunk 1 (Leaf)                   │
20//! ├──────────────────────────────────┤
21//! │ ...                              │
22//! ├──────────────────────────────────┤
23//! │ Chunk N (Parent)                 │
24//! ├──────────────────────────────────┤
25//! │ Root Chunk                       │
26//! ├──────────────────────────────────┤
27//! │ Global Header (26 bytes)         │
28//! └──────────────────────────────────┘
29//! ```
30//!
31//! ## Chunk Anatomy
32//!
33//! Each chunk is self-contained and consists of three parts:
34//!
35//! ```text
36//! ┌─────────────────────────────────────────────────────────┐
37//! │ Compressed Payload (Variable Length)                    │
38//! │   - Contains the actual data (serialized with bincode)  │
39//! │   - May be compressed (LZ4, etc.) based on MetaByte     │
40//! ├─────────────────────────────────────────────────────────┤
41//! │ Children Table (Optional, only if is_chunkable = true)  │
42//! │   - Array of ChildRef structures (16 bytes each)        │
43//! │   - Count stored as u32 LE (4 bytes) at the end         │
44//! ├─────────────────────────────────────────────────────────┤
45//! │ MetaByte (1 byte)                                       │
46//! │   - Bit 0: is_chunkable (has children)                  │
47//! │   - Bits 1-3: compression_method (0-7)                  │
48//! │   - Bits 4-7: Reserved for future use                   │
49//! └─────────────────────────────────────────────────────────┘
50//! ```
51//!
52//! ### Reading a Chunk
53//!
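//! To read a chunk at offset `O` with total length `L` (in bytes):
//!
//! 1. Read the `MetaByte` at `O + L - 1`.
//! 2. If `is_chunkable` is set, read the child count `C` at `O + L - 5` (u32 LE).
//! 3. Read the children table (if present) working backwards from the count: its `C`
//!    `ChildRef` entries (16 bytes each) end where the count begins, so the array
//!    starts at `O + L - 5 - 16 * C`.
//! 4. The payload starts at `O` and ends before the children table (or before the
//!    `MetaByte`, at `O + L - 1`, if there are no children).
//! 5. Decompress the payload based on the compression method stored in the `MetaByte`.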
61//!
62//! ## Global Header
63//!
64//! The global header is always located at the end of the file and has a fixed size of 26 bytes:
65//!
66//! ```text
67//! Offset | Size | Field         | Description
68//! -------|------|---------------|----------------------------------------
69//! 0      | 4    | magic         | Magic bytes: "PAR4" (0x50 0x41 0x52 0x34)
70//! 4      | 2    | version       | Format version (u16 LE, currently 4)
71//! 6      | 8    | root_offset   | Absolute offset of root chunk (u64 LE)
72//! 14     | 8    | root_length   | Total length of root chunk (u64 LE)
73//! 22     | 4    | checksum      | Reserved for CRC32 (u32 LE, currently 0)
74//! ```
75//!
76//! ## Design Rationale
77//!
78//! ### Why Bottom-Up Layout?
79//!
80//! - **Streaming Writes:** Children can be written as soon as they're ready, without
81//!   knowing the final file size
82//! - **Parallel Execution:** Multiple threads can write chunks concurrently without
83//!   coordination beyond the sequential writer mutex
84//! - **Self-Describing Root:** The root chunk contains all necessary metadata to
85//!   navigate the entire file
86//!
87//! ### Why `MetaByte` at the End?
88//!
89//! - **Backward Reading:** When navigating from a parent to children, we can read
90//!   the `MetaByte` first to determine the chunk structure
91//! - **Alignment:** Placing metadata at the end avoids alignment issues with the payload
92//!
93//! ### Why Fixed-Size `ChildRef`?
94//!
95//! - **Random Access:** Fixed-size references enable O(1) indexing into the children array
96//! - **Simplicity:** No need for variable-length encoding or delimiters
97//!
98//! ## Compatibility
99//!
100//! - **Endianness:** All multi-byte integers use little-endian encoding
101//! - **Alignment:** No special alignment requirements (can be read from any offset)
102//! - **Version Detection:** Readers should check the magic bytes and version before parsing
103
104use crate::error::{ParcodeError, Result};
105
/// Magic bytes identifying the file format: ASCII "PAR4" (`0x50 0x41 0x52 0x34`).
pub const MAGIC_BYTES: [u8; 4] = *b"PAR4";

/// The fixed size, in bytes, of the serialized Global Header.
///
/// Magic(4) + Version(2) + RootOffset(8) + RootLength(8) + Checksum(4) = 26
///
/// The value is derived from the field widths rather than written as a literal so
/// that it cannot silently disagree with the layout it describes.
pub const GLOBAL_HEADER_SIZE: usize = MAGIC_BYTES.len() // magic
    + std::mem::size_of::<u16>() // version
    + 2 * std::mem::size_of::<u64>() // root_offset + root_length
    + std::mem::size_of::<u32>(); // checksum
112
/// Configuration flags for a specific chunk, stored in the last byte of the chunk.
///
/// Bit layout (bit 0 is the least significant bit):
///
/// - Bit 0: `is_chunkable` (the chunk has children)
/// - Bits 1-3: compression method ID (0-7)
/// - Bits 4-7: not interpreted by this type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MetaByte(u8);

impl MetaByte {
    /// Bit 0: set when the chunk references children.
    const CHUNKABLE_MASK: u8 = 0b0000_0001;
    /// Bits 1-3: the compression method ID.
    const COMPRESSION_MASK: u8 = 0b0000_1110;
    /// Position of the lowest compression bit, derived from the mask so the two
    /// can never disagree.
    const COMPRESSION_SHIFT: u32 = Self::COMPRESSION_MASK.trailing_zeros();

    /// Creates a new `MetaByte` from its logical fields.
    ///
    /// Only the low three bits of `compression_id` are stored; larger values are
    /// truncated (e.g. `9` is stored as `1`), so callers must pass an ID in `0..=7`.
    /// Bits 4-7 of the result are always zero.
    #[must_use]
    pub const fn new(is_chunkable: bool, compression_id: u8) -> Self {
        let chunkable_bit = if is_chunkable { Self::CHUNKABLE_MASK } else { 0 };
        // Shifting first and masking afterwards drops every bit that does not fit
        // into the three-bit compression field.
        let shifted = compression_id << Self::COMPRESSION_SHIFT;
        Self(chunkable_bit | (shifted & Self::COMPRESSION_MASK))
    }

    /// Wraps a raw byte without validating it.
    ///
    /// All eight bits are kept as-is, including bits 4-7, and are returned
    /// unchanged by [`MetaByte::as_u8`].
    #[must_use]
    pub const fn from_byte(byte: u8) -> Self {
        Self(byte)
    }

    /// Returns true if the chunk contains references to children.
    #[must_use]
    pub const fn is_chunkable(&self) -> bool {
        self.0 & Self::CHUNKABLE_MASK != 0
    }

    /// Returns the compression algorithm ID (0-7).
    #[must_use]
    pub const fn compression_method(&self) -> u8 {
        (self.0 & Self::COMPRESSION_MASK) >> Self::COMPRESSION_SHIFT
    }

    /// Returns the raw byte representation.
    #[must_use]
    pub const fn as_u8(&self) -> u8 {
        self.0
    }
}
152
153/// Represents a reference to a child chunk stored within a parent chunk.
154/// This allows the reader to locate dependencies without deserializing the payload.
155#[derive(Debug, Clone, Copy)]
156pub struct ChildRef {
157    /// Absolute offset in the file where the child chunk starts.
158    pub offset: u64,
159    /// Total length of the child chunk (including meta-byte).
160    pub length: u64,
161}
162
163impl ChildRef {
164    /// The size in bytes of a serialized `ChildRef`.
165    pub const SIZE: usize = 16; // 8 bytes offset + 8 bytes length
166
167    /// Serializes to a fixed-size byte array (Little Endian).
168    pub fn to_bytes(&self) -> [u8; Self::SIZE] {
169        let mut buf = [0u8; Self::SIZE];
170        buf[0..8].copy_from_slice(&self.offset.to_le_bytes());
171        buf[8..16].copy_from_slice(&self.length.to_le_bytes());
172        buf
173    }
174
175    /// Deserializes from a fixed-size byte array.
176    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
177        if bytes.len() < Self::SIZE {
178            return Err(ParcodeError::Format("Buffer too small for ChildRef".into()));
179        }
180
181        let offset_bytes = bytes.get(0..8).ok_or_else(|| {
182            ParcodeError::Format("Failed to read offset from ChildRef buffer".into())
183        })?;
184        let length_bytes = bytes.get(8..16).ok_or_else(|| {
185            ParcodeError::Format("Failed to read length from ChildRef buffer".into())
186        })?;
187
188        let offset = u64::from_le_bytes(
189            offset_bytes
190                .try_into()
191                .map_err(|_| ParcodeError::Format("Invalid offset bytes".into()))?,
192        );
193        let length = u64::from_le_bytes(
194            length_bytes
195                .try_into()
196                .map_err(|_| ParcodeError::Format("Invalid length bytes".into()))?,
197        );
198
199        Ok(Self { offset, length })
200    }
201}
202
203/// The Global Header located at the very end of the file (Tail).
204/// It points to the Root Chunk, which is the entry point for the graph.
205#[derive(Debug, Clone, Copy)]
206pub struct GlobalHeader {
207    /// The magic bytes identifying the file format.
208    pub magic: [u8; 4],
209    /// The version of the file format (currently 4).
210    pub version: u16,
211    /// Pointer to the final Root Chunk.
212    pub root_offset: u64,
213    /// The total length of the Root Chunk.
214    pub root_length: u64,
215    /// Reserved for CRC/Checksum of the header itself.
216    pub checksum: u32,
217}
218
219impl GlobalHeader {
220    /// Creates a new `GlobalHeader`.
221    pub fn new(root_offset: u64, root_length: u64) -> Self {
222        Self {
223            magic: MAGIC_BYTES,
224            version: 4,
225            root_offset,
226            root_length,
227            checksum: 0,
228        }
229    }
230
231    /// Serializes the header to bytes.
232    pub fn to_bytes(&self) -> [u8; GLOBAL_HEADER_SIZE] {
233        let mut buf = [0u8; GLOBAL_HEADER_SIZE];
234        buf[0..4].copy_from_slice(&self.magic);
235        buf[4..6].copy_from_slice(&self.version.to_le_bytes());
236        buf[6..14].copy_from_slice(&self.root_offset.to_le_bytes());
237        buf[14..22].copy_from_slice(&self.root_length.to_le_bytes());
238        buf[22..26].copy_from_slice(&self.checksum.to_le_bytes());
239        buf
240    }
241}