Skip to main content

cjc_snap/
encode.rs

1//! Canonical binary encoding for CJC `Value` types.
2//!
3//! Encoding is deterministic: the same logical value always produces the same
4//! byte sequence regardless of HashMap iteration order, Rc pointer identity,
5//! etc. This is critical for content-addressable hashing.
6
7use cjc_runtime::Value;
8
9// ---------------------------------------------------------------------------
10// Tag bytes -- one byte identifies each variant
11// ---------------------------------------------------------------------------
12
/// Tag byte for [`Value::Void`]. No payload follows the tag.
pub const TAG_VOID: u8 = 0x00;
/// Tag byte for [`Value::Int`]. Followed by 8 little-endian bytes.
pub const TAG_INT: u8 = 0x01;
/// Tag byte for [`Value::Float`]. Followed by the 8-byte little-endian bit
/// pattern, with NaN canonicalized.
pub const TAG_FLOAT: u8 = 0x02;
/// Tag byte for [`Value::Bool`]. Followed by `0x01` (true) or `0x00` (false).
pub const TAG_BOOL: u8 = 0x03;
/// Tag byte for [`Value::String`].
pub const TAG_STRING: u8 = 0x04;
/// Tag byte for [`Value::Array`].
pub const TAG_ARRAY: u8 = 0x05;
/// Tag byte for [`Value::Tuple`].
pub const TAG_TUPLE: u8 = 0x06;
/// Tag byte for [`Value::Struct`].
pub const TAG_STRUCT: u8 = 0x07;
/// Tag byte for [`Value::Tensor`] (f64 data with shape).
pub const TAG_TENSOR: u8 = 0x08;
/// Tag byte for [`Value::Enum`].
pub const TAG_ENUM: u8 = 0x09;
/// Tag byte for [`Value::Bytes`] (mutable byte buffer).
pub const TAG_BYTES: u8 = 0x0A;
/// Tag byte for [`Value::ByteSlice`] (immutable byte slice).
pub const TAG_BYTESLICE: u8 = 0x0B;
/// Tag byte for [`Value::StrView`] (UTF-8 validated byte slice).
pub const TAG_STRVIEW: u8 = 0x0C;
/// Tag byte for [`Value::U8`] (single unsigned byte).
pub const TAG_U8: u8 = 0x0D;
/// Tag byte for [`Value::Bf16`] (brain float 16-bit).
pub const TAG_BF16: u8 = 0x0E;
/// Tag byte for [`Value::F16`] (IEEE 754 half-precision float).
pub const TAG_F16: u8 = 0x0F;
/// Tag byte for [`Value::Complex`] (complex f64).
pub const TAG_COMPLEX: u8 = 0x10;
/// Tag byte for [`Value::Map`] (deterministic key-value map).
pub const TAG_MAP: u8 = 0x11;
/// Tag byte for a typed tensor with explicit dtype metadata.
///
/// Written by [`encode_typed_tensor`].
pub const TAG_TYPED_TENSOR: u8 = 0x12;
/// Tag byte for a chunked tensor with per-chunk SHA-256 hashes.
///
/// Written by [`encode_chunked_tensor`].
pub const TAG_CHUNKED_TENSOR: u8 = 0x13;
/// Tag byte for a sparse CSR (Compressed Sparse Row) matrix.
///
/// Written by [`encode_sparse_csr`], which is also the path taken when
/// encoding a `Value::SparseTensor`.
pub const TAG_SPARSE_CSR: u8 = 0x14;
/// Tag byte for a categorical column (levels + integer codes).
///
/// Written by [`encode_categorical`].
pub const TAG_CATEGORICAL: u8 = 0x15;
/// Tag byte for a schema definition (field names + type tags).
///
/// Written by [`encode_schema`].
pub const TAG_SCHEMA: u8 = 0x16;
/// Tag byte for a columnar DataFrame.
///
/// Written by [`encode_dataframe`].
pub const TAG_DATAFRAME: u8 = 0x17;
/// Tag byte for [`Value::Na`] (missing / not available). No payload follows
/// the tag.
pub const TAG_NA: u8 = 0x18;

/// Snap v2 format magic bytes (`CJS\x01`), used to distinguish v2 payloads
/// from v1 during auto-detection in [`snap_decode_v2`](crate::snap_decode_v2).
///
/// The trailing `\x01` is part of the magic itself; the format version is a
/// separate byte ([`SNAP_VERSION`]) written immediately after these four.
pub const SNAP_MAGIC: &[u8; 4] = b"CJS\x01";

/// Snap v2 payload format version number.
///
/// Written by [`snap_encode_v2`] as the byte directly after [`SNAP_MAGIC`].
pub const SNAP_VERSION: u8 = 2;

/// Canonical NaN representation for f64 (quiet NaN with no payload).
///
/// The f64 encoders in this file write this pattern in place of whatever
/// bits a NaN actually carries, so that all NaNs encode identically.
const CANONICAL_NAN_BITS: u64 = 0x7FF8_0000_0000_0000;
73
74// ---------------------------------------------------------------------------
75// Encoder
76// ---------------------------------------------------------------------------
77
78/// Encode a CJC [`Value`] into a canonical byte representation (v1 format).
79///
80/// The encoding is fully deterministic:
81/// - Struct fields are sorted alphabetically by name.
82/// - Floats: NaN is canonicalized to a single quiet-NaN bit pattern.
83/// - Integers: little-endian 8 bytes.
84/// - Strings: 8-byte little-endian length prefix + UTF-8 bytes.
85/// - Map entries are sorted by the canonical encoding of their keys.
86///
87/// # Arguments
88///
89/// * `value` - The [`Value`] to encode. Must be a data-bearing variant
90///   (see [`is_snappable`](crate::is_snappable)).
91///
92/// # Returns
93///
94/// A `Vec<u8>` containing the canonical binary encoding.
95///
96/// # Panics
97///
98/// Panics if `value` is a runtime-only variant (`Fn`, `Closure`,
99/// `ClassRef`, `Scratchpad`, `GradGraph`, etc.) that cannot be
100/// meaningfully serialized.
101pub fn snap_encode(value: &Value) -> Vec<u8> {
102    let mut buf = Vec::with_capacity(256);
103    encode_value(value, &mut buf);
104    buf
105}
106
107fn encode_value(value: &Value, buf: &mut Vec<u8>) {
108    match value {
109        Value::Void => {
110            buf.push(TAG_VOID);
111        }
112        Value::Na => {
113            buf.push(TAG_NA);
114        }
115        Value::Int(v) => {
116            buf.push(TAG_INT);
117            buf.extend_from_slice(&v.to_le_bytes());
118        }
119        Value::Float(v) => {
120            buf.push(TAG_FLOAT);
121            let bits = if v.is_nan() {
122                CANONICAL_NAN_BITS
123            } else {
124                v.to_bits()
125            };
126            buf.extend_from_slice(&bits.to_le_bytes());
127        }
128        Value::Bool(v) => {
129            buf.push(TAG_BOOL);
130            buf.push(if *v { 0x01 } else { 0x00 });
131        }
132        Value::String(s) => {
133            buf.push(TAG_STRING);
134            encode_string(s.as_str(), buf);
135        }
136        Value::Bytes(b) => {
137            buf.push(TAG_BYTES);
138            let data = b.borrow();
139            let len = data.len() as u64;
140            buf.extend_from_slice(&len.to_le_bytes());
141            buf.extend_from_slice(&data);
142        }
143        Value::ByteSlice(b) => {
144            buf.push(TAG_BYTESLICE);
145            let len = b.len() as u64;
146            buf.extend_from_slice(&len.to_le_bytes());
147            buf.extend_from_slice(b);
148        }
149        Value::StrView(b) => {
150            buf.push(TAG_STRVIEW);
151            let len = b.len() as u64;
152            buf.extend_from_slice(&len.to_le_bytes());
153            buf.extend_from_slice(b);
154        }
155        Value::U8(v) => {
156            buf.push(TAG_U8);
157            buf.push(*v);
158        }
159        Value::Array(arr) => {
160            buf.push(TAG_ARRAY);
161            let len = arr.len() as u64;
162            buf.extend_from_slice(&len.to_le_bytes());
163            for elem in arr.iter() {
164                encode_value(elem, buf);
165            }
166        }
167        Value::Tuple(elems) => {
168            buf.push(TAG_TUPLE);
169            let len = elems.len() as u64;
170            buf.extend_from_slice(&len.to_le_bytes());
171            for elem in elems.iter() {
172                encode_value(elem, buf);
173            }
174        }
175        Value::Struct { name, fields } => {
176            buf.push(TAG_STRUCT);
177            // Encode struct name
178            encode_string(name, buf);
179            // Sort fields by name for determinism
180            let mut sorted_fields: Vec<(&String, &Value)> = fields.iter().collect();
181            sorted_fields.sort_by_key(|(k, _)| *k);
182            // Encode field count
183            let count = sorted_fields.len() as u64;
184            buf.extend_from_slice(&count.to_le_bytes());
185            // Encode each field: name + value
186            for (key, val) in sorted_fields {
187                encode_string(key, buf);
188                encode_value(val, buf);
189            }
190        }
191        Value::Tensor(t) => {
192            buf.push(TAG_TENSOR);
193            let shape = t.shape();
194            let ndim = shape.len() as u64;
195            buf.extend_from_slice(&ndim.to_le_bytes());
196            for &dim in shape {
197                buf.extend_from_slice(&(dim as u64).to_le_bytes());
198            }
199            // Write contiguous f64 data
200            let data = t.to_vec();
201            for &val in &data {
202                let bits = if val.is_nan() {
203                    CANONICAL_NAN_BITS
204                } else {
205                    val.to_bits()
206                };
207                buf.extend_from_slice(&bits.to_le_bytes());
208            }
209        }
210        Value::Enum {
211            enum_name,
212            variant,
213            fields,
214        } => {
215            buf.push(TAG_ENUM);
216            encode_string(enum_name, buf);
217            encode_string(variant, buf);
218            let count = fields.len() as u64;
219            buf.extend_from_slice(&count.to_le_bytes());
220            for field in fields {
221                encode_value(field, buf);
222            }
223        }
224        Value::Bf16(v) => {
225            buf.push(TAG_BF16);
226            buf.extend_from_slice(&v.0.to_le_bytes());
227        }
228        Value::F16(v) => {
229            buf.push(TAG_F16);
230            buf.extend_from_slice(&v.0.to_le_bytes());
231        }
232        Value::Complex(z) => {
233            buf.push(TAG_COMPLEX);
234            let re_bits = if z.re.is_nan() {
235                CANONICAL_NAN_BITS
236            } else {
237                z.re.to_bits()
238            };
239            let im_bits = if z.im.is_nan() {
240                CANONICAL_NAN_BITS
241            } else {
242                z.im.to_bits()
243            };
244            buf.extend_from_slice(&re_bits.to_le_bytes());
245            buf.extend_from_slice(&im_bits.to_le_bytes());
246        }
247        Value::Map(m) => {
248            buf.push(TAG_MAP);
249            let map = m.borrow();
250            // DetMap preserves insertion order, but for canonical encoding
251            // we sort entries by their encoded key representation.
252            let entries: Vec<_> = map.iter().collect();
253            // Sort by key's canonical encoding for determinism
254            let mut sorted: Vec<(Vec<u8>, &Value, &Value)> = entries
255                .iter()
256                .map(|(k, v)| {
257                    let mut key_buf = Vec::new();
258                    encode_value(k, &mut key_buf);
259                    (key_buf, *k, *v)
260                })
261                .collect();
262            sorted.sort_by(|(a, _, _), (b, _, _)| a.cmp(b));
263
264            let count = sorted.len() as u64;
265            buf.extend_from_slice(&count.to_le_bytes());
266            for (key_bytes, _, val) in &sorted {
267                buf.extend_from_slice(key_bytes);
268                encode_value(val, buf);
269            }
270        }
271
272        Value::SparseTensor(s) => {
273            encode_sparse_csr(s.nrows, s.ncols, &s.row_offsets, &s.col_indices, &s.values, buf);
274        }
275
276        // Runtime-only variants that cannot be meaningfully serialized:
277        Value::ClassRef(_)
278        | Value::Fn(_)
279        | Value::Closure { .. }
280        | Value::Regex { .. }
281        | Value::Scratchpad(_)
282        | Value::PagedKvCache(_)
283        | Value::AlignedBytes(_)
284        | Value::GradGraph(_)
285        | Value::OptimizerState(_)
286        | Value::TidyView(_)
287        | Value::GroupedTidyView(_)
288        | Value::VizorPlot(_)
289        | Value::QuantumState(_) => {
290            panic!(
291                "snap_encode: cannot serialize runtime-only variant: {}",
292                value.type_name()
293            );
294        }
295    }
296}
297
298/// Encode a CJC [`Value`] into the v2 snap format with magic header and version byte.
299///
300/// Format: `[MAGIC: "CJS\x01"][version: u8][flags: u8][payload...]`
301///
302/// The 6-byte header is followed by the same tag-based payload produced by
303/// [`snap_encode`]. The flags byte is currently `0x00` (uncompressed).
304///
305/// # Arguments
306///
307/// * `value` - The [`Value`] to encode.
308///
309/// # Returns
310///
311/// A `Vec<u8>` containing the v2-format binary encoding.
312///
313/// # Panics
314///
315/// Panics if `value` is a runtime-only variant.
316pub fn snap_encode_v2(value: &Value) -> Vec<u8> {
317    let mut buf = Vec::with_capacity(256);
318    buf.extend_from_slice(SNAP_MAGIC);
319    buf.push(SNAP_VERSION);
320    buf.push(0x00); // flags: uncompressed
321    encode_value(value, &mut buf);
322    buf
323}
324
325/// Encode a typed tensor with explicit dtype metadata into `buf`.
326///
327/// Wire format: `[TAG_TYPED_TENSOR][dtype: u8][ndim: u64][shape...][byte_len: u64][raw_bytes]`
328///
329/// The `dtype_tag` identifies the element type (0 = F64, 1 = F32, 2 = I64,
330/// 3 = I32, 4 = U8). The decoder converts all types back to f64 tensors.
331///
332/// # Arguments
333///
334/// * `dtype_tag` - Numeric type identifier for the tensor elements.
335/// * `shape` - Dimension sizes of the tensor.
336/// * `raw_bytes` - The raw element data in little-endian byte order.
337/// * `buf` - Output buffer to append the encoded bytes to.
338pub fn encode_typed_tensor(
339    dtype_tag: u8,
340    shape: &[usize],
341    raw_bytes: &[u8],
342    buf: &mut Vec<u8>,
343) {
344    buf.push(TAG_TYPED_TENSOR);
345    buf.push(dtype_tag);
346    let ndim = shape.len() as u64;
347    buf.extend_from_slice(&ndim.to_le_bytes());
348    for &dim in shape {
349        buf.extend_from_slice(&(dim as u64).to_le_bytes());
350    }
351    let byte_len = raw_bytes.len() as u64;
352    buf.extend_from_slice(&byte_len.to_le_bytes());
353    buf.extend_from_slice(raw_bytes);
354}
355
356/// Encode a sparse CSR (Compressed Sparse Row) matrix into `buf`.
357///
358/// Wire format:
359/// ```text
360/// [TAG_SPARSE_CSR][dtype: u8][nrows: u64][ncols: u64][nnz: u64]
361/// [row_ptr: (nrows+1) × u64][col_idx: nnz × u64][values: nnz × f64]
362/// ```
363///
364/// NaN values in `values` are canonicalized to a single quiet-NaN bit
365/// pattern for deterministic hashing.
366///
367/// # Arguments
368///
369/// * `nrows` - Number of rows in the matrix.
370/// * `ncols` - Number of columns in the matrix.
371/// * `row_ptr` - Row pointer array of length `nrows + 1`.
372/// * `col_idx` - Column index array of length `nnz`.
373/// * `values` - Non-zero value array of length `nnz`.
374/// * `buf` - Output buffer to append the encoded bytes to.
375pub fn encode_sparse_csr(
376    nrows: usize,
377    ncols: usize,
378    row_ptr: &[usize],
379    col_idx: &[usize],
380    values: &[f64],
381    buf: &mut Vec<u8>,
382) {
383    buf.push(TAG_SPARSE_CSR);
384    buf.push(0x00); // dtype = f64
385    buf.extend_from_slice(&(nrows as u64).to_le_bytes());
386    buf.extend_from_slice(&(ncols as u64).to_le_bytes());
387    let nnz = values.len() as u64;
388    buf.extend_from_slice(&nnz.to_le_bytes());
389    // row_ptr: (nrows+1) entries
390    for &rp in row_ptr {
391        buf.extend_from_slice(&(rp as u64).to_le_bytes());
392    }
393    // col_idx: nnz entries
394    for &ci in col_idx {
395        buf.extend_from_slice(&(ci as u64).to_le_bytes());
396    }
397    // values: nnz f64s
398    for &v in values {
399        let bits = if v.is_nan() { CANONICAL_NAN_BITS } else { v.to_bits() };
400        buf.extend_from_slice(&bits.to_le_bytes());
401    }
402}
403
404/// Encode a categorical column (factor variable) into `buf`.
405///
406/// Wire format:
407/// ```text
408/// [TAG_CATEGORICAL][n_levels: u32][level_strings...][n_rows: u64][codes: n_rows × u32]
409/// ```
410///
411/// Each level string is encoded as a length-prefixed UTF-8 string. Codes
412/// are zero-based indices into the levels array.
413///
414/// # Arguments
415///
416/// * `levels` - The unique category labels (e.g., `["cat", "dog", "fish"]`).
417/// * `codes` - Per-row integer codes referencing `levels`.
418/// * `buf` - Output buffer to append the encoded bytes to.
419pub fn encode_categorical(
420    levels: &[String],
421    codes: &[u32],
422    buf: &mut Vec<u8>,
423) {
424    buf.push(TAG_CATEGORICAL);
425    let n_levels = levels.len() as u32;
426    buf.extend_from_slice(&n_levels.to_le_bytes());
427    for level in levels {
428        encode_string(level, buf);
429    }
430    let n_rows = codes.len() as u64;
431    buf.extend_from_slice(&n_rows.to_le_bytes());
432    for &c in codes {
433        buf.extend_from_slice(&c.to_le_bytes());
434    }
435}
436
437/// Encode a schema definition (field names and type tags) into `buf`.
438///
439/// Wire format: `[TAG_SCHEMA][n_fields: u32][name: str, type_tag: u8]...`
440///
441/// Decodes back to a [`Value::Struct`] named `"Schema"` where each field
442/// maps a column name to its type tag as an integer.
443///
444/// # Arguments
445///
446/// * `fields` - Pairs of `(field_name, type_tag)`.
447/// * `buf` - Output buffer to append the encoded bytes to.
448pub fn encode_schema(
449    fields: &[(String, u8)],
450    buf: &mut Vec<u8>,
451) {
452    buf.push(TAG_SCHEMA);
453    let n_fields = fields.len() as u32;
454    buf.extend_from_slice(&n_fields.to_le_bytes());
455    for (name, type_tag) in fields {
456        encode_string(name, buf);
457        buf.push(*type_tag);
458    }
459}
460
/// Default chunk size for [`encode_chunked_tensor`]: 4 MB.
///
/// Used when that function is called with `chunk_size == 0`.
pub const DEFAULT_CHUNK_SIZE: usize = 4 * 1024 * 1024;

// Column type tags for `encode_dataframe`, passed in its `column_types`
// argument and written as the one-byte `col_type` of each column.

/// DataFrame column type tag: 64-bit signed integer.
pub const COL_TYPE_INT: u8 = 0;
/// DataFrame column type tag: 64-bit float (NaN canonicalized).
pub const COL_TYPE_FLOAT: u8 = 1;
/// DataFrame column type tag: length-prefixed UTF-8 string.
pub const COL_TYPE_STR: u8 = 2;
/// DataFrame column type tag: boolean (0x00 / 0x01).
pub const COL_TYPE_BOOL: u8 = 3;
/// DataFrame column type tag: categorical (levels + codes).
pub const COL_TYPE_CATEGORICAL: u8 = 4;
/// DataFrame column type tag: datetime as epoch milliseconds (i64).
pub const COL_TYPE_DATETIME: u8 = 5;
476
477/// Encode a tensor as chunked format with per-chunk SHA-256 integrity hashes.
478///
479/// Wire format:
480/// ```text
481/// [TAG_CHUNKED_TENSOR][dtype: u8][ndim: u64][shape...]
482/// [chunk_size: u64][n_chunks: u64]
483/// [chunk_0_len: u64][chunk_0_hash: 32 bytes][chunk_0_bytes...]
484/// [chunk_1_len: u64][chunk_1_hash: 32 bytes][chunk_1_bytes...]
485/// ...
486/// ```
487///
488/// Each chunk is independently content-addressed, enabling streaming
489/// verification, resumable I/O, and per-chunk deduplication.
490///
491/// # Arguments
492///
493/// * `dtype_tag` - Numeric type identifier for tensor elements.
494/// * `shape` - Dimension sizes of the tensor.
495/// * `raw_bytes` - The raw element data in little-endian byte order.
496/// * `chunk_size` - Maximum bytes per chunk (0 defaults to
497///   [`DEFAULT_CHUNK_SIZE`]).
498/// * `buf` - Output buffer to append the encoded bytes to.
499pub fn encode_chunked_tensor(
500    dtype_tag: u8,
501    shape: &[usize],
502    raw_bytes: &[u8],
503    chunk_size: usize,
504    buf: &mut Vec<u8>,
505) {
506    buf.push(TAG_CHUNKED_TENSOR);
507    buf.push(dtype_tag);
508    let ndim = shape.len() as u64;
509    buf.extend_from_slice(&ndim.to_le_bytes());
510    for &dim in shape {
511        buf.extend_from_slice(&(dim as u64).to_le_bytes());
512    }
513
514    let cs = if chunk_size == 0 { DEFAULT_CHUNK_SIZE } else { chunk_size };
515    buf.extend_from_slice(&(cs as u64).to_le_bytes());
516
517    // Calculate number of chunks
518    let n_chunks = if raw_bytes.is_empty() {
519        0usize
520    } else {
521        (raw_bytes.len() + cs - 1) / cs
522    };
523    buf.extend_from_slice(&(n_chunks as u64).to_le_bytes());
524
525    // Encode each chunk: [len][sha256][bytes]
526    for i in 0..n_chunks {
527        let start = i * cs;
528        let end = (start + cs).min(raw_bytes.len());
529        let chunk = &raw_bytes[start..end];
530        let chunk_len = chunk.len() as u64;
531        let chunk_hash = crate::sha256(chunk);
532
533        buf.extend_from_slice(&chunk_len.to_le_bytes());
534        buf.extend_from_slice(&chunk_hash);
535        buf.extend_from_slice(chunk);
536    }
537}
538
539/// Encode a DataFrame as columnar binary format into `buf`.
540///
541/// Wire format:
542/// ```text
543/// [TAG_DATAFRAME][n_cols: u32][n_rows: u64]
544/// [col_name: str][col_type: u8][col_data...]...
545/// ```
546///
547/// Per-column data encoding depends on column type:
548/// - **Int** (`COL_TYPE_INT`): `[i64 x n_rows]`
549/// - **Float** (`COL_TYPE_FLOAT`): `[f64 bits x n_rows]` (NaN canonicalized)
550/// - **Str** (`COL_TYPE_STR`): `[length-prefixed string x n_rows]`
551/// - **Bool** (`COL_TYPE_BOOL`): `[u8 x n_rows]` (0x00 / 0x01)
552/// - **Categorical** (`COL_TYPE_CATEGORICAL`): `[n_levels: u32][level_strings...][codes: u32 x n_rows]`
553/// - **DateTime** (`COL_TYPE_DATETIME`): `[i64 x n_rows]` (epoch millis)
554///
555/// Decodes back to a [`Value::Struct`] named `"DataFrame"` with metadata
556/// fields `__columns` and `__nrows`, plus one array field per column.
557///
558/// # Arguments
559///
560/// * `column_names` - Names for each column.
561/// * `column_types` - Type tag for each column (see `COL_TYPE_*` constants).
562/// * `column_data` - Typed column data matching the type tags.
563/// * `n_rows` - Number of rows in the DataFrame.
564/// * `buf` - Output buffer to append the encoded bytes to.
565pub fn encode_dataframe(
566    column_names: &[&str],
567    column_types: &[u8],
568    column_data: &[DataFrameColumnData<'_>],
569    n_rows: usize,
570    buf: &mut Vec<u8>,
571) {
572    buf.push(TAG_DATAFRAME);
573    let n_cols = column_names.len() as u32;
574    buf.extend_from_slice(&n_cols.to_le_bytes());
575    buf.extend_from_slice(&(n_rows as u64).to_le_bytes());
576
577    for i in 0..column_names.len() {
578        encode_string(column_names[i], buf);
579        buf.push(column_types[i]);
580
581        match &column_data[i] {
582            DataFrameColumnData::Int(vals) => {
583                for &v in vals.iter() {
584                    buf.extend_from_slice(&v.to_le_bytes());
585                }
586            }
587            DataFrameColumnData::Float(vals) => {
588                for &v in vals.iter() {
589                    let bits = if v.is_nan() { CANONICAL_NAN_BITS } else { v.to_bits() };
590                    buf.extend_from_slice(&bits.to_le_bytes());
591                }
592            }
593            DataFrameColumnData::Str(vals) => {
594                for s in vals.iter() {
595                    encode_string(s, buf);
596                }
597            }
598            DataFrameColumnData::Bool(vals) => {
599                for &b in vals.iter() {
600                    buf.push(if b { 0x01 } else { 0x00 });
601                }
602            }
603            DataFrameColumnData::Categorical { levels, codes } => {
604                let n_levels = levels.len() as u32;
605                buf.extend_from_slice(&n_levels.to_le_bytes());
606                for level in levels.iter() {
607                    encode_string(level, buf);
608                }
609                for &c in codes.iter() {
610                    buf.extend_from_slice(&c.to_le_bytes());
611                }
612            }
613            DataFrameColumnData::DateTime(vals) => {
614                for &v in vals.iter() {
615                    buf.extend_from_slice(&v.to_le_bytes());
616                }
617            }
618        }
619    }
620}
621
/// Typed column data for [`encode_dataframe`].
///
/// Each variant borrows the underlying column data for zero-copy encoding.
/// Because every variant holds only shared slices, the enum itself is cheap
/// to copy; equality compares the borrowed contents element by element.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DataFrameColumnData<'a> {
    /// Column of 64-bit signed integers.
    Int(&'a [i64]),
    /// Column of 64-bit floats (NaN values are canonicalized during encoding).
    Float(&'a [f64]),
    /// Column of UTF-8 strings.
    Str(&'a [String]),
    /// Column of boolean values.
    Bool(&'a [bool]),
    /// Categorical column with unique level labels and per-row integer codes.
    Categorical {
        /// The unique category labels.
        levels: &'a [String],
        /// Zero-based indices into `levels`, one per row.
        codes: &'a [u32],
    },
    /// Column of datetime values stored as epoch milliseconds (i64).
    DateTime(&'a [i64]),
}
644
/// Append `s` to `buf` as an 8-byte little-endian byte length followed by
/// its UTF-8 bytes.
///
/// The length counts bytes, not characters.
fn encode_string(s: &str, buf: &mut Vec<u8>) {
    let byte_len = s.len() as u64;
    buf.extend_from_slice(&byte_len.to_le_bytes());
    buf.extend_from_slice(s.as_bytes());
}
652
#[cfg(test)]
mod tests {
    //! Byte-level checks of the v1 payload and the v2 header.

    use super::*;
    use std::collections::BTreeMap;
    use std::rc::Rc;

    #[test]
    fn test_encode_void() {
        let bytes = snap_encode(&Value::Void);
        assert_eq!(bytes, vec![TAG_VOID]);
    }

    #[test]
    fn test_encode_int() {
        let bytes = snap_encode(&Value::Int(42));
        assert_eq!(bytes[0], TAG_INT);
        assert_eq!(bytes.len(), 9);
        let val = i64::from_le_bytes(bytes[1..9].try_into().unwrap());
        assert_eq!(val, 42);
    }

    #[test]
    fn test_encode_negative_int() {
        let bytes = snap_encode(&Value::Int(-1));
        let val = i64::from_le_bytes(bytes[1..9].try_into().unwrap());
        assert_eq!(val, -1);
    }

    #[test]
    fn test_encode_float() {
        // 2.5 is exactly representable and is not mistaken for a truncated
        // math constant by lints.
        let bytes = snap_encode(&Value::Float(2.5));
        assert_eq!(bytes[0], TAG_FLOAT);
        assert_eq!(bytes.len(), 9);
        let bits = u64::from_le_bytes(bytes[1..9].try_into().unwrap());
        // Compare bit patterns: the encoding is defined on bits.
        assert_eq!(bits, 2.5f64.to_bits());
    }

    #[test]
    fn test_encode_nan_canonicalized() {
        let nan1 = snap_encode(&Value::Float(f64::NAN));
        let nan2 = snap_encode(&Value::Float(-f64::NAN));
        let nan3 = snap_encode(&Value::Float(f64::from_bits(0x7FF8_0000_0000_0001)));
        // Sign and payload differences must not reach the encoding.
        assert_eq!(nan1, nan2);
        assert_eq!(nan1, nan3);
        let bits = u64::from_le_bytes(nan1[1..9].try_into().unwrap());
        assert_eq!(bits, CANONICAL_NAN_BITS);
    }

    #[test]
    fn test_encode_bool() {
        let t = snap_encode(&Value::Bool(true));
        let f = snap_encode(&Value::Bool(false));
        assert_eq!(t, vec![TAG_BOOL, 0x01]);
        assert_eq!(f, vec![TAG_BOOL, 0x00]);
    }

    #[test]
    fn test_encode_string() {
        let val = Value::String(Rc::new("hello".to_string()));
        let bytes = snap_encode(&val);
        assert_eq!(bytes[0], TAG_STRING);
        let len = u64::from_le_bytes(bytes[1..9].try_into().unwrap());
        assert_eq!(len, 5);
        assert_eq!(&bytes[9..14], b"hello");
        assert_eq!(bytes.len(), 14);
    }

    #[test]
    fn test_encode_array() {
        let val = Value::Array(Rc::new(vec![Value::Int(1), Value::Int(2)]));
        let bytes = snap_encode(&val);
        assert_eq!(bytes[0], TAG_ARRAY);
        let len = u64::from_le_bytes(bytes[1..9].try_into().unwrap());
        assert_eq!(len, 2);

        // Elements follow the count, each as [TAG_INT][8 bytes], in order.
        assert_eq!(bytes.len(), 1 + 8 + 2 * 9);
        assert_eq!(bytes[9], TAG_INT);
        assert_eq!(i64::from_le_bytes(bytes[10..18].try_into().unwrap()), 1);
        assert_eq!(bytes[18], TAG_INT);
        assert_eq!(i64::from_le_bytes(bytes[19..27].try_into().unwrap()), 2);
    }

    #[test]
    fn test_encode_struct_sorted_fields() {
        // Two structs built with different insertion orders must encode
        // identically, with fields in ascending name order.
        let mut fields = BTreeMap::new();
        fields.insert("z".to_string(), Value::Int(3));
        fields.insert("a".to_string(), Value::Int(1));
        fields.insert("m".to_string(), Value::Int(2));
        let val = Value::Struct {
            name: "Test".to_string(),
            fields,
        };
        let bytes1 = snap_encode(&val);

        let mut fields2 = BTreeMap::new();
        fields2.insert("m".to_string(), Value::Int(2));
        fields2.insert("a".to_string(), Value::Int(1));
        fields2.insert("z".to_string(), Value::Int(3));
        let val2 = Value::Struct {
            name: "Test".to_string(),
            fields: fields2,
        };
        let bytes2 = snap_encode(&val2);

        assert_eq!(bytes1, bytes2, "struct encoding must be deterministic regardless of insertion order");

        // Layout: tag(1) + name(8 + 4) + count(8) = 21 bytes of header, then
        // three 18-byte fields of [key len(8)][key(1)][TAG_INT][value(8)].
        assert_eq!(bytes1.len(), 21 + 3 * 18);
        assert_eq!(bytes1[29], b'a');
        assert_eq!(bytes1[47], b'm');
        assert_eq!(bytes1[65], b'z');
    }

    #[test]
    fn test_encode_deterministic() {
        let v1 = Value::Float(1.0);
        let v2 = Value::Float(1.0);
        assert_eq!(snap_encode(&v1), snap_encode(&v2));
    }

    #[test]
    fn test_encode_v2_header() {
        let bytes = snap_encode_v2(&Value::Void);
        assert_eq!(
            bytes,
            vec![b'C', b'J', b'S', 0x01, SNAP_VERSION, 0x00, TAG_VOID]
        );
    }
}