1use cjc_runtime::Value;
8
// One-byte type tags. Every encoded value begins with exactly one of these bytes,
// which identifies how the bytes that follow it are laid out.

/// Tag written for `Value::Void`; no payload follows.
pub const TAG_VOID: u8 = 0x00;
/// Tag written for `Value::Int`; followed by the integer's little-endian bytes.
pub const TAG_INT: u8 = 0x01;
/// Tag written for `Value::Float`; followed by the little-endian bit pattern (NaN canonicalized).
pub const TAG_FLOAT: u8 = 0x02;
/// Tag written for `Value::Bool`; followed by a single `0x00`/`0x01` byte.
pub const TAG_BOOL: u8 = 0x03;
/// Tag written for `Value::String`; followed by a `u64` length and the UTF-8 bytes.
pub const TAG_STRING: u8 = 0x04;
/// Tag written for `Value::Array`; followed by a `u64` element count and the encoded elements.
pub const TAG_ARRAY: u8 = 0x05;
/// Tag written for `Value::Tuple`; followed by a `u64` element count and the encoded elements.
pub const TAG_TUPLE: u8 = 0x06;
/// Tag written for `Value::Struct`; followed by the name, a field count and name-sorted fields.
pub const TAG_STRUCT: u8 = 0x07;
/// Tag written for `Value::Tensor`; followed by the shape and the element bit patterns.
pub const TAG_TENSOR: u8 = 0x08;
/// Tag written for `Value::Enum`; followed by enum name, variant name and the payload fields.
pub const TAG_ENUM: u8 = 0x09;
/// Tag written for `Value::Bytes`; followed by a `u64` length and the raw bytes.
pub const TAG_BYTES: u8 = 0x0A;
/// Tag written for `Value::ByteSlice`; followed by a `u64` length and the raw bytes.
pub const TAG_BYTESLICE: u8 = 0x0B;
/// Tag written for `Value::StrView`; followed by a `u64` length and the raw bytes.
pub const TAG_STRVIEW: u8 = 0x0C;
/// Tag written for `Value::U8`; followed by the byte itself.
pub const TAG_U8: u8 = 0x0D;
/// Tag written for `Value::Bf16`; followed by the little-endian bytes of the wrapped field.
pub const TAG_BF16: u8 = 0x0E;
/// Tag written for `Value::F16`; followed by the little-endian bytes of the wrapped field.
pub const TAG_F16: u8 = 0x0F;
/// Tag written for `Value::Complex`; followed by the real then imaginary bit patterns.
pub const TAG_COMPLEX: u8 = 0x10;
/// Tag written for `Value::Map`; followed by an entry count and entries sorted by encoded key.
pub const TAG_MAP: u8 = 0x11;
/// Tag written by [`encode_typed_tensor`].
pub const TAG_TYPED_TENSOR: u8 = 0x12;
/// Tag written by [`encode_chunked_tensor`].
pub const TAG_CHUNKED_TENSOR: u8 = 0x13;
/// Tag written by [`encode_sparse_csr`] (also used for `Value::SparseTensor`).
pub const TAG_SPARSE_CSR: u8 = 0x14;
/// Tag written by [`encode_categorical`].
pub const TAG_CATEGORICAL: u8 = 0x15;
/// Tag written by [`encode_schema`].
pub const TAG_SCHEMA: u8 = 0x16;
/// Tag written by [`encode_dataframe`].
pub const TAG_DATAFRAME: u8 = 0x17;
/// Tag written for `Value::Na`; no payload follows.
pub const TAG_NA: u8 = 0x18;
63
/// Four-byte magic prefix that [`snap_encode_v2`] writes at the very start of its output.
pub const SNAP_MAGIC: &[u8; 4] = b"CJS\x01";

/// Format version byte that [`snap_encode_v2`] writes immediately after [`SNAP_MAGIC`].
pub const SNAP_VERSION: u8 = 2;

/// The single bit pattern stored for every NaN (sign bit clear, quiet bit set, zero payload).
///
/// The encoders substitute this value whenever `is_nan()` is true, so NaNs that differ in
/// sign or payload still produce identical bytes.
const CANONICAL_NAN_BITS: u64 = 0x7FF8_0000_0000_0000;
73
74pub fn snap_encode(value: &Value) -> Vec<u8> {
102 let mut buf = Vec::with_capacity(256);
103 encode_value(value, &mut buf);
104 buf
105}
106
107fn encode_value(value: &Value, buf: &mut Vec<u8>) {
108 match value {
109 Value::Void => {
110 buf.push(TAG_VOID);
111 }
112 Value::Na => {
113 buf.push(TAG_NA);
114 }
115 Value::Int(v) => {
116 buf.push(TAG_INT);
117 buf.extend_from_slice(&v.to_le_bytes());
118 }
119 Value::Float(v) => {
120 buf.push(TAG_FLOAT);
121 let bits = if v.is_nan() {
122 CANONICAL_NAN_BITS
123 } else {
124 v.to_bits()
125 };
126 buf.extend_from_slice(&bits.to_le_bytes());
127 }
128 Value::Bool(v) => {
129 buf.push(TAG_BOOL);
130 buf.push(if *v { 0x01 } else { 0x00 });
131 }
132 Value::String(s) => {
133 buf.push(TAG_STRING);
134 encode_string(s.as_str(), buf);
135 }
136 Value::Bytes(b) => {
137 buf.push(TAG_BYTES);
138 let data = b.borrow();
139 let len = data.len() as u64;
140 buf.extend_from_slice(&len.to_le_bytes());
141 buf.extend_from_slice(&data);
142 }
143 Value::ByteSlice(b) => {
144 buf.push(TAG_BYTESLICE);
145 let len = b.len() as u64;
146 buf.extend_from_slice(&len.to_le_bytes());
147 buf.extend_from_slice(b);
148 }
149 Value::StrView(b) => {
150 buf.push(TAG_STRVIEW);
151 let len = b.len() as u64;
152 buf.extend_from_slice(&len.to_le_bytes());
153 buf.extend_from_slice(b);
154 }
155 Value::U8(v) => {
156 buf.push(TAG_U8);
157 buf.push(*v);
158 }
159 Value::Array(arr) => {
160 buf.push(TAG_ARRAY);
161 let len = arr.len() as u64;
162 buf.extend_from_slice(&len.to_le_bytes());
163 for elem in arr.iter() {
164 encode_value(elem, buf);
165 }
166 }
167 Value::Tuple(elems) => {
168 buf.push(TAG_TUPLE);
169 let len = elems.len() as u64;
170 buf.extend_from_slice(&len.to_le_bytes());
171 for elem in elems.iter() {
172 encode_value(elem, buf);
173 }
174 }
175 Value::Struct { name, fields } => {
176 buf.push(TAG_STRUCT);
177 encode_string(name, buf);
179 let mut sorted_fields: Vec<(&String, &Value)> = fields.iter().collect();
181 sorted_fields.sort_by_key(|(k, _)| *k);
182 let count = sorted_fields.len() as u64;
184 buf.extend_from_slice(&count.to_le_bytes());
185 for (key, val) in sorted_fields {
187 encode_string(key, buf);
188 encode_value(val, buf);
189 }
190 }
191 Value::Tensor(t) => {
192 buf.push(TAG_TENSOR);
193 let shape = t.shape();
194 let ndim = shape.len() as u64;
195 buf.extend_from_slice(&ndim.to_le_bytes());
196 for &dim in shape {
197 buf.extend_from_slice(&(dim as u64).to_le_bytes());
198 }
199 let data = t.to_vec();
201 for &val in &data {
202 let bits = if val.is_nan() {
203 CANONICAL_NAN_BITS
204 } else {
205 val.to_bits()
206 };
207 buf.extend_from_slice(&bits.to_le_bytes());
208 }
209 }
210 Value::Enum {
211 enum_name,
212 variant,
213 fields,
214 } => {
215 buf.push(TAG_ENUM);
216 encode_string(enum_name, buf);
217 encode_string(variant, buf);
218 let count = fields.len() as u64;
219 buf.extend_from_slice(&count.to_le_bytes());
220 for field in fields {
221 encode_value(field, buf);
222 }
223 }
224 Value::Bf16(v) => {
225 buf.push(TAG_BF16);
226 buf.extend_from_slice(&v.0.to_le_bytes());
227 }
228 Value::F16(v) => {
229 buf.push(TAG_F16);
230 buf.extend_from_slice(&v.0.to_le_bytes());
231 }
232 Value::Complex(z) => {
233 buf.push(TAG_COMPLEX);
234 let re_bits = if z.re.is_nan() {
235 CANONICAL_NAN_BITS
236 } else {
237 z.re.to_bits()
238 };
239 let im_bits = if z.im.is_nan() {
240 CANONICAL_NAN_BITS
241 } else {
242 z.im.to_bits()
243 };
244 buf.extend_from_slice(&re_bits.to_le_bytes());
245 buf.extend_from_slice(&im_bits.to_le_bytes());
246 }
247 Value::Map(m) => {
248 buf.push(TAG_MAP);
249 let map = m.borrow();
250 let entries: Vec<_> = map.iter().collect();
253 let mut sorted: Vec<(Vec<u8>, &Value, &Value)> = entries
255 .iter()
256 .map(|(k, v)| {
257 let mut key_buf = Vec::new();
258 encode_value(k, &mut key_buf);
259 (key_buf, *k, *v)
260 })
261 .collect();
262 sorted.sort_by(|(a, _, _), (b, _, _)| a.cmp(b));
263
264 let count = sorted.len() as u64;
265 buf.extend_from_slice(&count.to_le_bytes());
266 for (key_bytes, _, val) in &sorted {
267 buf.extend_from_slice(key_bytes);
268 encode_value(val, buf);
269 }
270 }
271
272 Value::SparseTensor(s) => {
273 encode_sparse_csr(s.nrows, s.ncols, &s.row_offsets, &s.col_indices, &s.values, buf);
274 }
275
276 Value::ClassRef(_)
278 | Value::Fn(_)
279 | Value::Closure { .. }
280 | Value::Regex { .. }
281 | Value::Scratchpad(_)
282 | Value::PagedKvCache(_)
283 | Value::AlignedBytes(_)
284 | Value::GradGraph(_)
285 | Value::OptimizerState(_)
286 | Value::TidyView(_)
287 | Value::GroupedTidyView(_)
288 | Value::VizorPlot(_)
289 | Value::QuantumState(_) => {
290 panic!(
291 "snap_encode: cannot serialize runtime-only variant: {}",
292 value.type_name()
293 );
294 }
295 }
296}
297
298pub fn snap_encode_v2(value: &Value) -> Vec<u8> {
317 let mut buf = Vec::with_capacity(256);
318 buf.extend_from_slice(SNAP_MAGIC);
319 buf.push(SNAP_VERSION);
320 buf.push(0x00); encode_value(value, &mut buf);
322 buf
323}
324
325pub fn encode_typed_tensor(
339 dtype_tag: u8,
340 shape: &[usize],
341 raw_bytes: &[u8],
342 buf: &mut Vec<u8>,
343) {
344 buf.push(TAG_TYPED_TENSOR);
345 buf.push(dtype_tag);
346 let ndim = shape.len() as u64;
347 buf.extend_from_slice(&ndim.to_le_bytes());
348 for &dim in shape {
349 buf.extend_from_slice(&(dim as u64).to_le_bytes());
350 }
351 let byte_len = raw_bytes.len() as u64;
352 buf.extend_from_slice(&byte_len.to_le_bytes());
353 buf.extend_from_slice(raw_bytes);
354}
355
356pub fn encode_sparse_csr(
376 nrows: usize,
377 ncols: usize,
378 row_ptr: &[usize],
379 col_idx: &[usize],
380 values: &[f64],
381 buf: &mut Vec<u8>,
382) {
383 buf.push(TAG_SPARSE_CSR);
384 buf.push(0x00); buf.extend_from_slice(&(nrows as u64).to_le_bytes());
386 buf.extend_from_slice(&(ncols as u64).to_le_bytes());
387 let nnz = values.len() as u64;
388 buf.extend_from_slice(&nnz.to_le_bytes());
389 for &rp in row_ptr {
391 buf.extend_from_slice(&(rp as u64).to_le_bytes());
392 }
393 for &ci in col_idx {
395 buf.extend_from_slice(&(ci as u64).to_le_bytes());
396 }
397 for &v in values {
399 let bits = if v.is_nan() { CANONICAL_NAN_BITS } else { v.to_bits() };
400 buf.extend_from_slice(&bits.to_le_bytes());
401 }
402}
403
404pub fn encode_categorical(
420 levels: &[String],
421 codes: &[u32],
422 buf: &mut Vec<u8>,
423) {
424 buf.push(TAG_CATEGORICAL);
425 let n_levels = levels.len() as u32;
426 buf.extend_from_slice(&n_levels.to_le_bytes());
427 for level in levels {
428 encode_string(level, buf);
429 }
430 let n_rows = codes.len() as u64;
431 buf.extend_from_slice(&n_rows.to_le_bytes());
432 for &c in codes {
433 buf.extend_from_slice(&c.to_le_bytes());
434 }
435}
436
437pub fn encode_schema(
449 fields: &[(String, u8)],
450 buf: &mut Vec<u8>,
451) {
452 buf.push(TAG_SCHEMA);
453 let n_fields = fields.len() as u32;
454 buf.extend_from_slice(&n_fields.to_le_bytes());
455 for (name, type_tag) in fields {
456 encode_string(name, buf);
457 buf.push(*type_tag);
458 }
459}
460
/// Chunk size in bytes (4 MiB) that [`encode_chunked_tensor`] uses when its `chunk_size`
/// argument is `0`.
pub const DEFAULT_CHUNK_SIZE: usize = 4 * 1024 * 1024;

// Column type codes. NOTE(review): these are not referenced by the encoders in this part of
// the file; presumably they are the values callers pass in `encode_dataframe`'s
// `column_types` (and `encode_schema`'s type tags) — confirm against callers/decoder.

/// Column type code for integer columns.
pub const COL_TYPE_INT: u8 = 0;
/// Column type code for floating-point columns.
pub const COL_TYPE_FLOAT: u8 = 1;
/// Column type code for string columns.
pub const COL_TYPE_STR: u8 = 2;
/// Column type code for boolean columns.
pub const COL_TYPE_BOOL: u8 = 3;
/// Column type code for categorical columns.
pub const COL_TYPE_CATEGORICAL: u8 = 4;
/// Column type code for date-time columns.
pub const COL_TYPE_DATETIME: u8 = 5;
476
477pub fn encode_chunked_tensor(
500 dtype_tag: u8,
501 shape: &[usize],
502 raw_bytes: &[u8],
503 chunk_size: usize,
504 buf: &mut Vec<u8>,
505) {
506 buf.push(TAG_CHUNKED_TENSOR);
507 buf.push(dtype_tag);
508 let ndim = shape.len() as u64;
509 buf.extend_from_slice(&ndim.to_le_bytes());
510 for &dim in shape {
511 buf.extend_from_slice(&(dim as u64).to_le_bytes());
512 }
513
514 let cs = if chunk_size == 0 { DEFAULT_CHUNK_SIZE } else { chunk_size };
515 buf.extend_from_slice(&(cs as u64).to_le_bytes());
516
517 let n_chunks = if raw_bytes.is_empty() {
519 0usize
520 } else {
521 (raw_bytes.len() + cs - 1) / cs
522 };
523 buf.extend_from_slice(&(n_chunks as u64).to_le_bytes());
524
525 for i in 0..n_chunks {
527 let start = i * cs;
528 let end = (start + cs).min(raw_bytes.len());
529 let chunk = &raw_bytes[start..end];
530 let chunk_len = chunk.len() as u64;
531 let chunk_hash = crate::sha256(chunk);
532
533 buf.extend_from_slice(&chunk_len.to_le_bytes());
534 buf.extend_from_slice(&chunk_hash);
535 buf.extend_from_slice(chunk);
536 }
537}
538
539pub fn encode_dataframe(
566 column_names: &[&str],
567 column_types: &[u8],
568 column_data: &[DataFrameColumnData<'_>],
569 n_rows: usize,
570 buf: &mut Vec<u8>,
571) {
572 buf.push(TAG_DATAFRAME);
573 let n_cols = column_names.len() as u32;
574 buf.extend_from_slice(&n_cols.to_le_bytes());
575 buf.extend_from_slice(&(n_rows as u64).to_le_bytes());
576
577 for i in 0..column_names.len() {
578 encode_string(column_names[i], buf);
579 buf.push(column_types[i]);
580
581 match &column_data[i] {
582 DataFrameColumnData::Int(vals) => {
583 for &v in vals.iter() {
584 buf.extend_from_slice(&v.to_le_bytes());
585 }
586 }
587 DataFrameColumnData::Float(vals) => {
588 for &v in vals.iter() {
589 let bits = if v.is_nan() { CANONICAL_NAN_BITS } else { v.to_bits() };
590 buf.extend_from_slice(&bits.to_le_bytes());
591 }
592 }
593 DataFrameColumnData::Str(vals) => {
594 for s in vals.iter() {
595 encode_string(s, buf);
596 }
597 }
598 DataFrameColumnData::Bool(vals) => {
599 for &b in vals.iter() {
600 buf.push(if b { 0x01 } else { 0x00 });
601 }
602 }
603 DataFrameColumnData::Categorical { levels, codes } => {
604 let n_levels = levels.len() as u32;
605 buf.extend_from_slice(&n_levels.to_le_bytes());
606 for level in levels.iter() {
607 encode_string(level, buf);
608 }
609 for &c in codes.iter() {
610 buf.extend_from_slice(&c.to_le_bytes());
611 }
612 }
613 DataFrameColumnData::DateTime(vals) => {
614 for &v in vals.iter() {
615 buf.extend_from_slice(&v.to_le_bytes());
616 }
617 }
618 }
619 }
620}
621
/// Borrowed cell data for one data-frame column, as consumed by [`encode_dataframe`].
///
/// Each variant only borrows its slices for `'a`; nothing is copied until encoding.
pub enum DataFrameColumnData<'a> {
    /// Integer cells, each written as a little-endian `i64`.
    Int(&'a [i64]),
    /// Floating-point cells, written as `u64` bit patterns with NaN canonicalized.
    Float(&'a [f64]),
    /// String cells, each written as a length-prefixed string.
    Str(&'a [String]),
    /// Boolean cells, each written as a single `0x00`/`0x01` byte.
    Bool(&'a [bool]),
    /// Dictionary-encoded cells.
    Categorical {
        /// The distinct labels, written first with a `u32` count.
        levels: &'a [String],
        /// One `u32` per row (presumably an index into `levels` — not validated by the encoder).
        codes: &'a [u32],
    },
    /// Date-time cells stored as `i64`, written like `Int`.
    /// NOTE(review): the unit/epoch of these values is not visible here — confirm with callers.
    DateTime(&'a [i64]),
}
644
/// Appends `s` to `buf` as a length-prefixed string: the UTF-8 byte length as a
/// little-endian `u64`, followed by the UTF-8 bytes themselves. No tag byte is written.
fn encode_string(s: &str, buf: &mut Vec<u8>) {
    let length_prefix = (s.len() as u64).to_le_bytes();
    buf.extend_from_slice(&length_prefix);
    buf.extend_from_slice(s.as_bytes());
}
652
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;
    use std::rc::Rc;

    /// Returns the eight bytes that directly follow the tag byte of `encoded`.
    fn word_after_tag(encoded: &[u8]) -> [u8; 8] {
        encoded[1..9].try_into().unwrap()
    }

    /// Builds a `Value::Struct` named "Test" whose fields are inserted in the given order.
    fn struct_with_insertion_order(order: &[(&str, i64)]) -> Value {
        let mut fields = BTreeMap::new();
        for &(key, number) in order {
            fields.insert(key.to_string(), Value::Int(number));
        }
        Value::Struct {
            name: "Test".to_string(),
            fields,
        }
    }

    /// `Void` encodes to its tag alone.
    #[test]
    fn test_encode_void() {
        assert_eq!(snap_encode(&Value::Void), vec![TAG_VOID]);
    }

    /// An integer is the tag plus eight little-endian bytes.
    #[test]
    fn test_encode_int() {
        let encoded = snap_encode(&Value::Int(42));
        assert_eq!(encoded[0], TAG_INT);
        assert_eq!(encoded.len(), 9);
        assert_eq!(i64::from_le_bytes(word_after_tag(&encoded)), 42);
    }

    /// Negative integers survive the byte round trip.
    #[test]
    fn test_encode_negative_int() {
        let encoded = snap_encode(&Value::Int(-1));
        assert_eq!(i64::from_le_bytes(word_after_tag(&encoded)), -1);
    }

    /// A float is the tag plus its eight-byte bit pattern.
    #[test]
    fn test_encode_float() {
        let encoded = snap_encode(&Value::Float(3.14));
        assert_eq!(encoded[0], TAG_FLOAT);
        assert_eq!(encoded.len(), 9);
        let stored = f64::from_bits(u64::from_le_bytes(word_after_tag(&encoded)));
        assert_eq!(stored, 3.14);
    }

    /// NaNs with different sign bits encode to the same canonical pattern.
    #[test]
    fn test_encode_nan_canonicalized() {
        let positive = snap_encode(&Value::Float(f64::NAN));
        let negative = snap_encode(&Value::Float(-f64::NAN));
        assert_eq!(positive, negative);
        assert_eq!(
            u64::from_le_bytes(word_after_tag(&positive)),
            CANONICAL_NAN_BITS
        );
    }

    /// Booleans are the tag plus a single 0/1 byte.
    #[test]
    fn test_encode_bool() {
        assert_eq!(snap_encode(&Value::Bool(true)), vec![TAG_BOOL, 0x01]);
        assert_eq!(snap_encode(&Value::Bool(false)), vec![TAG_BOOL, 0x00]);
    }

    /// Strings are the tag, a u64 byte length, then the UTF-8 bytes.
    #[test]
    fn test_encode_string() {
        let encoded = snap_encode(&Value::String(Rc::new(String::from("hello"))));
        assert_eq!(encoded[0], TAG_STRING);
        assert_eq!(u64::from_le_bytes(word_after_tag(&encoded)), 5);
        assert_eq!(&encoded[9..14], b"hello");
    }

    /// Arrays start with the tag and a u64 element count.
    #[test]
    fn test_encode_array() {
        let items = vec![Value::Int(1), Value::Int(2)];
        let encoded = snap_encode(&Value::Array(Rc::new(items)));
        assert_eq!(encoded[0], TAG_ARRAY);
        assert_eq!(u64::from_le_bytes(word_after_tag(&encoded)), 2);
    }

    /// The same fields inserted in a different order produce identical bytes.
    #[test]
    fn test_encode_struct_sorted_fields() {
        let first = struct_with_insertion_order(&[("z", 3), ("a", 1), ("m", 2)]);
        let second = struct_with_insertion_order(&[("m", 2), ("a", 1), ("z", 3)]);

        let bytes1 = snap_encode(&first);
        let bytes2 = snap_encode(&second);

        assert_eq!(bytes1, bytes2, "struct encoding must be deterministic regardless of insertion order");
    }

    /// Encoding equal values twice yields equal bytes.
    #[test]
    fn test_encode_deterministic() {
        let left = snap_encode(&Value::Float(1.0));
        let right = snap_encode(&Value::Float(1.0));
        assert_eq!(left, right);
    }
}