// Source: datacortex_core/dcx.rs

1//! .dcx file format — v3 header with CRC-32 integrity.
2//!
3//! Layout (32-byte fixed header + variable metadata + compressed data):
4//!   Bytes  0-3:  Magic "DCX\x03"
5//!   Byte   4:    Version (3)
6//!   Byte   5:    Mode (0=Max, 1=Balanced, 2=Fast)
7//!   Byte   6:    Format hint (0-10)
8//!   Byte   7:    Flags (bit 0: has_transform_metadata, bit 1: has_zstd_dictionary, bit 2: metadata is zstd-compressed, bit 3: brotli entropy coder, bit 4: metadata embedded in compressed stream)
9//!   Bytes  8-15: Original size (u64 LE)
10//!   Bytes 16-23: Compressed data size (u64 LE)
11//!   Bytes 24-27: CRC-32 of original data (u32 LE)
12//!   Bytes 28-31: Transform metadata length (u32 LE)
13//!   [Transform metadata: variable]
14//!   [Compressed data]
15
16use std::io::{self, Read, Write};
17
/// File signature found in bytes 0-3: ASCII "DCX" followed by the byte 0x03.
const MAGIC: [u8; 4] = *b"DCX\x03";
/// Format version expected in byte 4 of the header.
const VERSION: u8 = 3;
/// Size of the fixed header, written as the sum of its field widths:
/// magic + version + mode + format hint + flags + original size
/// + compressed size + CRC-32 + transform metadata length.
const HEADER_SIZE: usize = 4 + 1 + 1 + 1 + 1 + 8 + 8 + 4 + 4;
21
/// Compression mode, stored in the header as a single byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum Mode {
    /// On-disk value 0.
    Max = 0,
    /// On-disk value 1.
    Balanced = 1,
    /// On-disk value 2.
    Fast = 2,
}

impl Mode {
    /// Every mode, laid out so that the array index equals the on-disk value.
    const BY_TAG: [Mode; 3] = [Mode::Max, Mode::Balanced, Mode::Fast];

    /// Decode a `Mode` from the byte stored on disk.
    ///
    /// # Errors
    ///
    /// Returns `io::ErrorKind::InvalidData` when `v` is not 0, 1 or 2.
    pub fn from_u8(v: u8) -> io::Result<Self> {
        Self::BY_TAG.get(usize::from(v)).copied().ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidData, format!("unknown mode: {v}"))
        })
    }

    /// Lowercase label for this mode (`"max"`, `"balanced"` or `"fast"`).
    pub fn name(&self) -> &'static str {
        match *self {
            Mode::Fast => "fast",
            Mode::Balanced => "balanced",
            Mode::Max => "max",
        }
    }
}

impl std::fmt::Display for Mode {
    /// Writes the label returned by [`Mode::name`] verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.name();
        f.write_str(label)
    }
}
60
/// File format recorded in the header as a single byte, either detected or declared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FormatHint {
    Generic = 0,
    Json = 1,
    // Legacy variants retained so that older files still decode.
    Markdown = 2,
    Ndjson = 3,
    // Legacy variants retained so that older files still decode.
    Csv = 4,
    Code = 5,
    Log = 6,
    Logfmt = 7,
    Prometheus = 8,
    Yaml = 9,
    Xml = 10,
}

impl FormatHint {
    /// Every format, laid out so that the array index equals the on-disk value.
    const BY_TAG: [FormatHint; 11] = [
        FormatHint::Generic,
        FormatHint::Json,
        FormatHint::Markdown,
        FormatHint::Ndjson,
        FormatHint::Csv,
        FormatHint::Code,
        FormatHint::Log,
        FormatHint::Logfmt,
        FormatHint::Prometheus,
        FormatHint::Yaml,
        FormatHint::Xml,
    ];

    /// Decode a `FormatHint` from the byte stored on disk.
    ///
    /// # Errors
    ///
    /// Returns `io::ErrorKind::InvalidData` when `v` is greater than 10.
    pub fn from_u8(v: u8) -> io::Result<Self> {
        Self::BY_TAG.get(usize::from(v)).copied().ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidData, format!("unknown format: {v}"))
        })
    }

    /// Lowercase label for this format (for example `"json"` or `"logfmt"`).
    pub fn name(&self) -> &'static str {
        match *self {
            FormatHint::Xml => "xml",
            FormatHint::Yaml => "yaml",
            FormatHint::Prometheus => "prometheus",
            FormatHint::Logfmt => "logfmt",
            FormatHint::Log => "log",
            FormatHint::Code => "code",
            FormatHint::Csv => "csv",
            FormatHint::Ndjson => "ndjson",
            FormatHint::Markdown => "markdown",
            FormatHint::Json => "json",
            FormatHint::Generic => "generic",
        }
    }
}

impl std::fmt::Display for FormatHint {
    /// Writes the label returned by [`FormatHint::name`] verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.name();
        f.write_str(label)
    }
}
125
/// Bit 0 of the flags byte: transform metadata is present.
pub const FLAG_HAS_TRANSFORM: u8 = 0b0000_0001;
/// Bit 1 of the flags byte: the compressed payload embeds a zstd dictionary (Fast mode).
pub const FLAG_HAS_DICT: u8 = 0b0000_0010;
/// Bit 2 of the flags byte: the transform metadata is zstd-compressed.
pub const FLAG_META_COMPRESSED: u8 = 0b0000_0100;
/// Bit 3 of the flags byte: the payload is compressed with brotli instead of zstd.
pub const FLAG_BROTLI: u8 = 0b0000_1000;
/// Bit 4 of the flags byte: the transform metadata is embedded inside the compressed stream.
/// When set, `transform_metadata_len` in the header is 0 (there is no separate metadata),
/// and the decompressed stream starts with `meta_len:u32 LE` + raw metadata + preprocessed data.
pub const FLAG_META_EMBEDDED: u8 = 0b0001_0000;
138
139/// .dcx file header.
140#[derive(Debug, Clone)]
141pub struct DcxHeader {
142    pub mode: Mode,
143    pub format_hint: FormatHint,
144    pub original_size: u64,
145    pub compressed_size: u64,
146    pub crc32: u32,
147    pub transform_metadata: Vec<u8>,
148    /// True if the compressed payload embeds a zstd dictionary.
149    pub has_dict: bool,
150    /// True if transform_metadata is zstd-compressed (bit 2 of flags).
151    pub meta_compressed: bool,
152    /// True if compressed with brotli instead of zstd (bit 3 of flags).
153    pub use_brotli: bool,
154    /// True if transform metadata is embedded in the compressed stream (bit 4 of flags).
155    /// When set, the header's transform_metadata is empty; metadata lives inside the
156    /// decompressed payload as `[meta_len:u32 LE][raw_metadata][preprocessed_data]`.
157    pub meta_embedded: bool,
158}
159
160impl DcxHeader {
161    /// Serialize header to writer.
162    pub fn write_to<W: Write>(&self, w: &mut W) -> io::Result<()> {
163        w.write_all(&MAGIC)?;
164        w.write_all(&[VERSION])?;
165        w.write_all(&[self.mode as u8])?;
166        w.write_all(&[self.format_hint as u8])?;
167        let mut flags: u8 = 0;
168        if !self.transform_metadata.is_empty() {
169            flags |= FLAG_HAS_TRANSFORM;
170        }
171        if self.has_dict {
172            flags |= FLAG_HAS_DICT;
173        }
174        if self.meta_compressed {
175            flags |= FLAG_META_COMPRESSED;
176        }
177        if self.use_brotli {
178            flags |= FLAG_BROTLI;
179        }
180        if self.meta_embedded {
181            flags |= FLAG_META_EMBEDDED;
182        }
183        w.write_all(&[flags])?;
184        w.write_all(&self.original_size.to_le_bytes())?;
185        w.write_all(&self.compressed_size.to_le_bytes())?;
186        w.write_all(&self.crc32.to_le_bytes())?;
187        w.write_all(&(self.transform_metadata.len() as u32).to_le_bytes())?;
188        if !self.transform_metadata.is_empty() {
189            w.write_all(&self.transform_metadata)?;
190        }
191        Ok(())
192    }
193
194    /// Deserialize header from reader.
195    pub fn read_from<R: Read>(r: &mut R) -> io::Result<Self> {
196        let mut buf = [0u8; HEADER_SIZE];
197        r.read_exact(&mut buf)?;
198
199        if buf[0..4] != MAGIC {
200            return Err(io::Error::new(
201                io::ErrorKind::InvalidData,
202                "not a .dcx file",
203            ));
204        }
205        if buf[4] != VERSION {
206            return Err(io::Error::new(
207                io::ErrorKind::InvalidData,
208                format!("unsupported .dcx version: {} (expected {VERSION})", buf[4]),
209            ));
210        }
211
212        let mode = Mode::from_u8(buf[5])?;
213        let format_hint = FormatHint::from_u8(buf[6])?;
214        let flags = buf[7];
215        let has_dict = flags & FLAG_HAS_DICT != 0;
216        let meta_compressed = flags & FLAG_META_COMPRESSED != 0;
217        let use_brotli = flags & FLAG_BROTLI != 0;
218        let meta_embedded = flags & FLAG_META_EMBEDDED != 0;
219        let original_size = u64::from_le_bytes(buf[8..16].try_into().expect("8-byte slice"));
220        let compressed_size = u64::from_le_bytes(buf[16..24].try_into().expect("8-byte slice"));
221        let crc32 = u32::from_le_bytes(buf[24..28].try_into().expect("4-byte slice"));
222        let transform_metadata_len =
223            u32::from_le_bytes(buf[28..32].try_into().expect("4-byte slice")) as usize;
224
225        let transform_metadata = if flags & FLAG_HAS_TRANSFORM != 0 && transform_metadata_len > 0 {
226            let mut meta = vec![0u8; transform_metadata_len];
227            r.read_exact(&mut meta)?;
228            meta
229        } else {
230            Vec::new()
231        };
232
233        Ok(DcxHeader {
234            mode,
235            format_hint,
236            original_size,
237            compressed_size,
238            crc32,
239            transform_metadata,
240            has_dict,
241            meta_compressed,
242            use_brotli,
243            meta_embedded,
244        })
245    }
246
247    /// Total header size including transform metadata.
248    pub fn total_size(&self) -> usize {
249        HEADER_SIZE + self.transform_metadata.len()
250    }
251}
252
#[cfg(test)]
mod tests {
    use super::*;

    /// Header with zeroed sizes, no metadata and every flag cleared; each test
    /// overrides the fields it cares about with struct-update syntax.
    fn blank() -> DcxHeader {
        DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Json,
            original_size: 0,
            compressed_size: 0,
            crc32: 0,
            transform_metadata: Vec::new(),
            has_dict: false,
            meta_compressed: false,
            use_brotli: false,
            meta_embedded: false,
        }
    }

    /// Serialize `header` into a fresh byte vector.
    fn encode(header: &DcxHeader) -> Vec<u8> {
        let mut bytes = Vec::new();
        header.write_to(&mut bytes).unwrap();
        bytes
    }

    /// Parse a header from the start of `bytes`.
    fn decode(bytes: &[u8]) -> io::Result<DcxHeader> {
        DcxHeader::read_from(&mut io::Cursor::new(bytes))
    }

    #[test]
    fn header_roundtrip() {
        let bytes = encode(&DcxHeader {
            mode: Mode::Balanced,
            format_hint: FormatHint::Json,
            original_size: 12345,
            compressed_size: 6789,
            crc32: 0xDEADBEEF,
            ..blank()
        });
        assert_eq!(bytes.len(), HEADER_SIZE);

        let parsed = decode(&bytes).unwrap();
        assert_eq!(parsed.mode, Mode::Balanced);
        assert_eq!(parsed.format_hint, FormatHint::Json);
        assert_eq!(parsed.original_size, 12345);
        assert_eq!(parsed.compressed_size, 6789);
        assert_eq!(parsed.crc32, 0xDEADBEEF);
        assert!(parsed.transform_metadata.is_empty());
    }

    #[test]
    fn header_with_metadata() {
        let meta = vec![1, 2, 3, 4, 5];
        let bytes = encode(&DcxHeader {
            mode: Mode::Max,
            format_hint: FormatHint::Ndjson,
            original_size: 999,
            compressed_size: 500,
            crc32: 0x12345678,
            transform_metadata: meta.clone(),
            ..blank()
        });
        assert_eq!(bytes.len(), HEADER_SIZE + 5);

        let parsed = decode(&bytes).unwrap();
        assert_eq!(parsed.transform_metadata, meta);
        assert_eq!(parsed.total_size(), HEADER_SIZE + 5);
    }

    #[test]
    fn header_dict_flag_roundtrip() {
        let bytes = encode(&DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Ndjson,
            original_size: 5000,
            compressed_size: 2000,
            crc32: 0xCAFEBABE,
            transform_metadata: vec![10, 20],
            has_dict: true,
            ..blank()
        });

        // The flags byte lives at offset 7; bits 0 and 1 must both be set.
        assert_eq!(bytes[7], FLAG_HAS_TRANSFORM | FLAG_HAS_DICT);

        let parsed = decode(&bytes).unwrap();
        assert!(parsed.has_dict);
        assert_eq!(parsed.transform_metadata, vec![10, 20]);
    }

    #[test]
    fn header_meta_compressed_flag_roundtrip() {
        let meta = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let bytes = encode(&DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Json,
            original_size: 10000,
            compressed_size: 5000,
            crc32: 0xAABBCCDD,
            transform_metadata: meta.clone(),
            meta_compressed: true,
            ..blank()
        });

        // Offset 7 must carry bit 0 (transform) together with bit 2 (meta_compressed).
        assert_eq!(bytes[7], FLAG_HAS_TRANSFORM | FLAG_META_COMPRESSED);

        let parsed = decode(&bytes).unwrap();
        assert!(parsed.meta_compressed);
        assert!(!parsed.has_dict);
        assert_eq!(parsed.transform_metadata, meta);
    }

    #[test]
    fn header_old_file_no_meta_compressed() {
        // Stand-in for an old file where bit 2 is clear: meta_compressed must decode as false.
        let bytes = encode(&DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Json,
            original_size: 1000,
            compressed_size: 500,
            crc32: 0x11223344,
            transform_metadata: vec![42],
            ..blank()
        });

        // Bit 0 alone is expected; bit 2 must stay clear.
        assert_eq!(bytes[7], FLAG_HAS_TRANSFORM);

        let parsed = decode(&bytes).unwrap();
        assert!(!parsed.meta_compressed);
        assert_eq!(parsed.transform_metadata, vec![42]);
    }

    #[test]
    fn header_all_flags_roundtrip() {
        // Every one of the five flags switched on at once.
        let bytes = encode(&DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Ndjson,
            original_size: 9000,
            compressed_size: 4000,
            crc32: 0xDEADC0DE,
            transform_metadata: vec![1, 2, 3],
            has_dict: true,
            meta_compressed: true,
            use_brotli: true,
            meta_embedded: true,
        });

        let every_flag = FLAG_HAS_TRANSFORM
            | FLAG_HAS_DICT
            | FLAG_META_COMPRESSED
            | FLAG_BROTLI
            | FLAG_META_EMBEDDED;
        assert_eq!(bytes[7], every_flag);

        let parsed = decode(&bytes).unwrap();
        assert!(parsed.has_dict);
        assert!(parsed.meta_compressed);
        assert!(parsed.use_brotli);
        assert!(parsed.meta_embedded);
        assert_eq!(parsed.transform_metadata, vec![1, 2, 3]);
    }

    #[test]
    fn header_meta_embedded_flag_roundtrip() {
        // Bit 4 (meta_embedded) with brotli and no metadata in the header itself.
        let bytes = encode(&DcxHeader {
            mode: Mode::Fast,
            format_hint: FormatHint::Json,
            original_size: 5000,
            compressed_size: 2000,
            crc32: 0xFACEFEED,
            use_brotli: true,
            meta_embedded: true,
            ..blank()
        });

        // Expect bit 3 (brotli) + bit 4 (meta_embedded); bit 0 stays clear because
        // the header metadata is empty.
        assert_eq!(bytes[7], FLAG_BROTLI | FLAG_META_EMBEDDED);

        let parsed = decode(&bytes).unwrap();
        assert!(parsed.use_brotli);
        assert!(parsed.meta_embedded);
        assert!(!parsed.has_dict);
        assert!(!parsed.meta_compressed);
        assert!(parsed.transform_metadata.is_empty());
    }

    #[test]
    fn bad_magic_rejected() {
        let zeroed = [0u8; HEADER_SIZE];
        assert!(decode(&zeroed).is_err());
    }

    #[test]
    fn bad_version_rejected() {
        let mut bytes = [0u8; HEADER_SIZE];
        bytes[0..4].copy_from_slice(&MAGIC);
        bytes[4] = 99; // not the supported version
        assert!(decode(&bytes).is_err());
    }
}
480}