parcode/format.rs
1//! Defines the physical binary layout of Parcode V4 files.
2//!
3//! This module specifies the on-disk representation of Parcode files, including the
4//! global header, chunk structure, and metadata encoding. Understanding this format
5//! is essential for implementing readers in other languages or debugging file corruption.
6//!
7//! ## File Format Overview (V4)
8//!
9//! Parcode V4 uses a "bottom-up" layout strategy where children are written before their
10//! parents. This enables streaming writes and allows the root chunk to contain a complete
11//! table of contents for the entire file.
12//!
13//! ### High-Level Structure
14//!
15//! ```text
16//! ┌──────────────────────────────────┐
17//! │ Chunk 0 (Leaf) │
18//! ├──────────────────────────────────┤
19//! │ Chunk 1 (Leaf) │
20//! ├──────────────────────────────────┤
21//! │ ... │
22//! ├──────────────────────────────────┤
23//! │ Chunk N (Parent) │
24//! ├──────────────────────────────────┤
25//! │ Root Chunk │
26//! ├──────────────────────────────────┤
27//! │ Global Header (26 bytes) │
28//! └──────────────────────────────────┘
29//! ```
30//!
31//! ## Chunk Anatomy
32//!
33//! Each chunk is self-contained and consists of three parts:
34//!
35//! ```text
36//! ┌─────────────────────────────────────────────────────────┐
37//! │ Compressed Payload (Variable Length) │
38//! │ - Contains the actual data (serialized with bincode) │
39//! │ - May be compressed (LZ4, etc.) based on MetaByte │
40//! ├─────────────────────────────────────────────────────────┤
41//! │ Children Table (Optional, only if is_chunkable = true) │
42//! │ - Array of ChildRef structures (16 bytes each) │
43//! │ - Count stored as u32 LE (4 bytes) at the end │
44//! ├─────────────────────────────────────────────────────────┤
45//! │ MetaByte (1 byte) │
46//! │ - Bit 0: is_chunkable (has children) │
47//! │ - Bits 1-3: compression_method (0-7) │
48//! │ - Bits 4-7: Reserved for future use │
49//! └─────────────────────────────────────────────────────────┘
50//! ```
51//!
52//! ### Reading a Chunk
53//!
54//! To read a chunk at offset `O` with length `L`:
55//!
56//! 1. Read the `MetaByte` at `O + L - 1`
57//! 2. If `is_chunkable`, read the child count at `O + L - 5` (u32 LE)
//! 3. Read the children table (if present): `count` entries of 16 bytes each, stored
//!    immediately before the count, i.e. starting at `O + L - 5 - 16 * count`
59//! 4. The payload starts at `O` and ends before the children table (or `MetaByte` if no children)
60//! 5. Decompress the payload based on the compression method
61//!
62//! ## Global Header
63//!
64//! The global header is always located at the end of the file and has a fixed size of 26 bytes:
65//!
66//! ```text
67//! Offset | Size | Field | Description
68//! -------|------|---------------|----------------------------------------
69//! 0 | 4 | magic | Magic bytes: "PAR4" (0x50 0x41 0x52 0x34)
70//! 4 | 2 | version | Format version (u16 LE, currently 4)
71//! 6 | 8 | root_offset | Absolute offset of root chunk (u64 LE)
72//! 14 | 8 | root_length | Total length of root chunk (u64 LE)
73//! 22 | 4 | checksum | Reserved for CRC32 (u32 LE, currently 0)
74//! ```
75//!
76//! ## Design Rationale
77//!
78//! ### Why Bottom-Up Layout?
79//!
80//! - **Streaming Writes:** Children can be written as soon as they're ready, without
81//! knowing the final file size
82//! - **Parallel Execution:** Multiple threads can write chunks concurrently without
83//! coordination beyond the sequential writer mutex
84//! - **Self-Describing Root:** The root chunk contains all necessary metadata to
85//! navigate the entire file
86//!
87//! ### Why `MetaByte` at the End?
88//!
89//! - **Backward Reading:** When navigating from a parent to children, we can read
90//! the `MetaByte` first to determine the chunk structure
91//! - **Alignment:** Placing metadata at the end avoids alignment issues with the payload
92//!
93//! ### Why Fixed-Size `ChildRef`?
94//!
95//! - **Random Access:** Fixed-size references enable O(1) indexing into the children array
96//! - **Simplicity:** No need for variable-length encoding or delimiters
97//!
98//! ## Compatibility
99//!
100//! - **Endianness:** All multi-byte integers use little-endian encoding
101//! - **Alignment:** No special alignment requirements (can be read from any offset)
102//! - **Version Detection:** Readers should check the magic bytes and version before parsing
103
104use crate::error::{ParcodeError, Result};
105
/// File-format signature stored in the first four bytes of the global header.
///
/// Spells the ASCII string "PAR4" (0x50 0x41 0x52 0x34).
pub const MAGIC_BYTES: [u8; 4] = [b'P', b'A', b'R', b'4'];
108
/// Byte length of the serialized Global Header (26).
///
/// The terms below are the field widths in on-disk order:
/// magic, version, root offset, root length, checksum.
pub const GLOBAL_HEADER_SIZE: usize = 4 + 2 + 8 + 8 + 4;
112
/// Per-chunk configuration flags, packed into the final byte of every chunk.
///
/// Bit 0 records whether the chunk carries a children table and bits 1-3 hold
/// the compression algorithm ID. The upper four bits are stored verbatim but
/// are not interpreted by any accessor on this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MetaByte(u8);

impl MetaByte {
    const CHUNKABLE_MASK: u8 = 0b0000_0001; // Bit 0
    const COMPRESSION_MASK: u8 = 0b0000_1110; // Bits 1-3

    /// Packs the two settings into a `MetaByte`.
    ///
    /// Only the low three bits of `compression_id` are kept; any higher bits
    /// are silently discarded.
    pub fn new(is_chunkable: bool, compression_id: u8) -> Self {
        let chunkable_bits = if is_chunkable {
            Self::CHUNKABLE_MASK
        } else {
            0
        };
        // Move the ID into bits 1-3; the mask drops whatever was shifted above bit 3.
        let compression_bits = (compression_id << 1) & Self::COMPRESSION_MASK;
        MetaByte(chunkable_bits | compression_bits)
    }

    /// Wraps a raw byte read from a chunk, keeping all eight bits unchanged.
    pub fn from_byte(byte: u8) -> Self {
        MetaByte(byte)
    }

    /// Reports whether bit 0 is set, i.e. the chunk contains references to children.
    pub fn is_chunkable(&self) -> bool {
        self.0 & Self::CHUNKABLE_MASK == Self::CHUNKABLE_MASK
    }

    /// Extracts the compression algorithm ID (0-7) from bits 1-3.
    pub fn compression_method(&self) -> u8 {
        (self.0 >> 1) & (Self::COMPRESSION_MASK >> 1)
    }

    /// Returns the underlying byte exactly as stored.
    pub fn as_u8(&self) -> u8 {
        let MetaByte(raw) = *self;
        raw
    }
}
152
153/// Represents a reference to a child chunk stored within a parent chunk.
154/// This allows the reader to locate dependencies without deserializing the payload.
155#[derive(Debug, Clone, Copy)]
156pub struct ChildRef {
157 /// Absolute offset in the file where the child chunk starts.
158 pub offset: u64,
159 /// Total length of the child chunk (including meta-byte).
160 pub length: u64,
161}
162
163impl ChildRef {
164 /// The size in bytes of a serialized `ChildRef`.
165 pub const SIZE: usize = 16; // 8 bytes offset + 8 bytes length
166
167 /// Serializes to a fixed-size byte array (Little Endian).
168 pub fn to_bytes(&self) -> [u8; Self::SIZE] {
169 let mut buf = [0u8; Self::SIZE];
170 buf[0..8].copy_from_slice(&self.offset.to_le_bytes());
171 buf[8..16].copy_from_slice(&self.length.to_le_bytes());
172 buf
173 }
174
175 /// Deserializes from a fixed-size byte array.
176 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
177 if bytes.len() < Self::SIZE {
178 return Err(ParcodeError::Format("Buffer too small for ChildRef".into()));
179 }
180
181 let offset_bytes = bytes.get(0..8).ok_or_else(|| {
182 ParcodeError::Format("Failed to read offset from ChildRef buffer".into())
183 })?;
184 let length_bytes = bytes.get(8..16).ok_or_else(|| {
185 ParcodeError::Format("Failed to read length from ChildRef buffer".into())
186 })?;
187
188 let offset = u64::from_le_bytes(
189 offset_bytes
190 .try_into()
191 .map_err(|_| ParcodeError::Format("Invalid offset bytes".into()))?,
192 );
193 let length = u64::from_le_bytes(
194 length_bytes
195 .try_into()
196 .map_err(|_| ParcodeError::Format("Invalid length bytes".into()))?,
197 );
198
199 Ok(Self { offset, length })
200 }
201}
202
203/// The Global Header located at the very end of the file (Tail).
204/// It points to the Root Chunk, which is the entry point for the graph.
205#[derive(Debug, Clone, Copy)]
206pub struct GlobalHeader {
207 /// The magic bytes identifying the file format.
208 pub magic: [u8; 4],
209 /// The version of the file format (currently 4).
210 pub version: u16,
211 /// Pointer to the final Root Chunk.
212 pub root_offset: u64,
213 /// The total length of the Root Chunk.
214 pub root_length: u64,
215 /// Reserved for CRC/Checksum of the header itself.
216 pub checksum: u32,
217}
218
219impl GlobalHeader {
220 /// Creates a new `GlobalHeader`.
221 pub fn new(root_offset: u64, root_length: u64) -> Self {
222 Self {
223 magic: MAGIC_BYTES,
224 version: 4,
225 root_offset,
226 root_length,
227 checksum: 0,
228 }
229 }
230
231 /// Serializes the header to bytes.
232 pub fn to_bytes(&self) -> [u8; GLOBAL_HEADER_SIZE] {
233 let mut buf = [0u8; GLOBAL_HEADER_SIZE];
234 buf[0..4].copy_from_slice(&self.magic);
235 buf[4..6].copy_from_slice(&self.version.to_le_bytes());
236 buf[6..14].copy_from_slice(&self.root_offset.to_le_bytes());
237 buf[14..22].copy_from_slice(&self.root_length.to_le_bytes());
238 buf[22..26].copy_from_slice(&self.checksum.to_le_bytes());
239 buf
240 }
241}