Skip to main content

sochdb_core/
soch_codec.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! TOON Format Codec
19//!
20//! This module implements the TOON (Token-Optimized Object Notation) format
21//! specification using the official `toon-format` crate.
22//!
23//! ## TOON Format Grammar (Simplified)
24//!
25//! ```text
26//! document     ::= top_level_value
27//! value        ::= simple_object | array | primitive
28//! simple_object::= (key ":" value newline)+
29//! array        ::= header newline item*
30//! header       ::= name "[" count "]" ( "{" fields "}" )? ":"
31//! item         ::= "-" value newline | row newline
32//! ```
33
34use crate::soch::SochValue; // Use shared types from soch.rs
35use std::collections::HashMap;
36use toon_format::types::KeyFoldingMode;
37use toon_format::{self, DecodeOptions, Delimiter, EncodeOptions, Indent};
38
39// ============================================================================
40// TOON Value Types
41// ============================================================================
42
/// One-byte type tags for the TOON binary encoding.
///
/// The enum is `#[repr(u8)]`, so `tag as u8` is the tag byte itself. For the
/// three "fix" families (`PosFixint`, `NegFixint`, `FixStr`) the discriminant
/// is the first byte of a 16-value range whose low nibble carries the payload,
/// as noted on each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum SochTypeTag {
    // ---- Singletons -------------------------------------------------------
    /// The null value.
    Null = 0x00,
    /// Boolean `false`.
    False = 0x01,
    /// Boolean `true`.
    True = 0x02,

    // ---- Integers ---------------------------------------------------------
    /// Small non-negative integer 0..=15 stored in the low nibble (0x10-0x1F).
    PosFixint = 0x10,
    /// Small negative integer -16..=-1 stored in the low nibble (0x20-0x2F).
    NegFixint = 0x20,
    /// Signed integer, 8 bits wide.
    Int8 = 0x30,
    /// Signed integer, 16 bits wide.
    Int16 = 0x31,
    /// Signed integer, 32 bits wide.
    Int32 = 0x32,
    /// Signed integer, 64 bits wide.
    Int64 = 0x33,

    // ---- Floating point ---------------------------------------------------
    /// Floating-point number, 32 bits wide.
    Float32 = 0x40,
    /// Floating-point number, 64 bits wide.
    Float64 = 0x41,

    // ---- Strings ----------------------------------------------------------
    /// Short string whose length (0-15) sits in the low nibble (0x50-0x5F).
    FixStr = 0x50,
    /// String preceded by an 8-bit length.
    Str8 = 0x60,
    /// String preceded by a 16-bit length.
    Str16 = 0x61,
    /// String preceded by a 32-bit length.
    Str32 = 0x62,

    // ---- Containers and special values ------------------------------------
    /// Array of values.
    Array = 0x70,
    /// Reference to a row in another table.
    Ref = 0x80,
    /// Object (map from string keys to values).
    Object = 0x90,
    /// Raw binary data.
    Binary = 0xA0,
    /// Unsigned integer encoded as a varint.
    UInt = 0xB0,
}
88
89// ============================================================================
90// TOON Document Structure
91// ============================================================================
92
93/// TOON document
94#[derive(Debug, Clone)]
95pub struct SochDocument {
96    /// Root value
97    pub root: SochValue,
98    /// Schema version
99    pub version: u32,
100}
101
102impl SochDocument {
103    /// Create a new TOON document from a value
104    pub fn new(root: SochValue) -> Self {
105        Self { root, version: 1 }
106    }
107
108    /// Create a table-like document (legacy helper)
109    pub fn new_table(
110        _name: impl Into<String>,
111        fields: Vec<String>,
112        rows: Vec<Vec<SochValue>>,
113    ) -> Self {
114        // Convert to Array of Objects for canonical representation
115        let fields_str: Vec<String> = fields;
116        let mut array = Vec::new();
117        for row in rows {
118            let mut obj = HashMap::new();
119            for (i, val) in row.into_iter().enumerate() {
120                if i < fields_str.len() {
121                    obj.insert(fields_str[i].clone(), val);
122                }
123            }
124            array.push(SochValue::Object(obj));
125        }
126
127        Self {
128            root: SochValue::Array(array),
129            version: 1,
130        }
131    }
132}
133
134// ============================================================================
135// Text Format (Human-Readable)
136// ============================================================================
137
138/// TOON text format encoder (wraps toon-format crate)
139pub struct SochTextEncoder;
140
141impl SochTextEncoder {
142    /// Encode a document to TOON text format
143    pub fn encode(doc: &SochDocument) -> String {
144        // Use default options for now, can be sophisticated later
145        let options = EncodeOptions::new()
146            .with_indent(Indent::Spaces(2))
147            .with_delimiter(Delimiter::Comma)
148            .with_key_folding(KeyFoldingMode::Safe);
149
150        // Use toon_format to encode the SochValue
151        // SochValue implements Serialize, so this works directly.
152        toon_format::encode(&doc.root, &options)
153            .unwrap_or_else(|e| format!("Error encoding TOON: {}", e))
154    }
155}
156
157/// TOON text format decoder/parser (wraps toon-format crate)
158pub struct SochTextParser;
159
160impl SochTextParser {
161    pub fn parse(input: &str) -> Result<SochDocument, SochParseError> {
162        Self::parse_with_options(input, DecodeOptions::default())
163    }
164
165    pub fn parse_with_options(
166        input: &str,
167        options: DecodeOptions,
168    ) -> Result<SochDocument, SochParseError> {
169        let root: SochValue =
170            toon_format::decode(input, &options).map_err(|e| SochParseError::RowError {
171                line: 0,
172                cause: e.to_string(),
173            })?;
174
175        Ok(SochDocument::new(root))
176    }
177
178    // Legacy helper kept for compatibility if needed, but useless now
179    pub fn parse_header(_line: &str) -> Result<(String, usize, Vec<String>), SochParseError> {
180        Err(SochParseError::InvalidHeader)
181    }
182}
183
184/// Token counter (dummy implementation for now)
185pub struct SochTokenCounter;
186impl SochTokenCounter {
187    pub fn count(_doc: &SochDocument) -> usize {
188        0
189    }
190}
191
/// Errors reported while parsing TOON text or decoding TOON binary data.
#[derive(Debug, Clone)]
pub enum SochParseError {
    /// The input contained nothing to parse.
    EmptyInput,
    /// A header was missing or malformed (for binary data: input too short
    /// or wrong magic bytes).
    InvalidHeader,
    /// A declared row count could not be interpreted.
    InvalidRowCount,
    /// A value was malformed or unsupported (for binary data this also
    /// covers a checksum mismatch).
    InvalidValue,
    /// The number of rows found differs from the number declared.
    RowCountMismatch { expected: usize, actual: usize },
    /// The number of fields found differs from the number declared.
    FieldCountMismatch { expected: usize, actual: usize },
    /// A failure attributed to input line `line`, described by `cause`.
    RowError { line: usize, cause: String },
}

/// The `Display` text is the `Debug` rendering, produced with default
/// formatting flags regardless of the flags on the outer formatter.
impl std::fmt::Display for SochParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("{:?}", self))
    }
}

impl std::error::Error for SochParseError {}
210
211// ============================================================================
212// Binary Format (Compact)
213// ============================================================================
214
/// Magic bytes that open every TOON binary document: ASCII `TOON`
/// (0x54 0x4F 0x4F 0x4E).
pub const TOON_MAGIC: [u8; 4] = *b"TOON";
217
218/// TOON binary codec (Renamed from SochBinaryCodec to SochDbBinaryCodec)
219pub struct SochDbBinaryCodec;
220
221impl SochDbBinaryCodec {
222    /// Encode a document to binary format
223    pub fn encode(doc: &SochDocument) -> Vec<u8> {
224        let mut buf = Vec::new();
225        buf.extend_from_slice(&TOON_MAGIC);
226        // Version
227        Self::write_varint(&mut buf, doc.version as u64);
228        // Root value
229        Self::write_value(&mut buf, &doc.root);
230        // Checksum
231        let checksum = crc32fast::hash(&buf);
232        buf.extend_from_slice(&checksum.to_le_bytes());
233        buf
234    }
235
236    /// Decode binary format to document
237    pub fn decode(data: &[u8]) -> Result<SochDocument, SochParseError> {
238        if data.len() < 8 {
239            return Err(SochParseError::InvalidHeader);
240        }
241        if data[0..4] != TOON_MAGIC {
242            return Err(SochParseError::InvalidHeader);
243        }
244
245        // Verify checksum
246        let stored_checksum = u32::from_le_bytes(data[data.len() - 4..].try_into().unwrap());
247        let computed_checksum = crc32fast::hash(&data[..data.len() - 4]);
248        if stored_checksum != computed_checksum {
249            return Err(SochParseError::InvalidValue);
250        }
251
252        let data = &data[..data.len() - 4];
253        let mut cursor = 4;
254
255        let (version, bytes) = Self::read_varint(&data[cursor..])?;
256        cursor += bytes;
257
258        let (root, _) = Self::read_value(&data[cursor..])?;
259
260        Ok(SochDocument {
261            root,
262            version: version as u32,
263        })
264    }
265
266    fn write_varint(buf: &mut Vec<u8>, mut n: u64) {
267        while n > 127 {
268            buf.push((n as u8 & 0x7F) | 0x80);
269            n >>= 7;
270        }
271        buf.push(n as u8 & 0x7F);
272    }
273
274    fn read_varint(data: &[u8]) -> Result<(u64, usize), SochParseError> {
275        let mut result: u64 = 0;
276        let mut shift = 0;
277        let mut i = 0;
278        while i < data.len() {
279            let byte = data[i];
280            result |= ((byte & 0x7F) as u64) << shift;
281            i += 1;
282            if byte & 0x80 == 0 {
283                return Ok((result, i));
284            }
285            shift += 7;
286        }
287        Err(SochParseError::InvalidValue)
288    }
289
290    fn read_string(data: &[u8]) -> Result<(String, usize), SochParseError> {
291        let (len, varint_bytes) = Self::read_varint(data)?;
292        let len = len as usize;
293        if data.len() < varint_bytes + len {
294            return Err(SochParseError::InvalidValue);
295        }
296        let s = std::str::from_utf8(&data[varint_bytes..varint_bytes + len])
297            .map_err(|_| SochParseError::InvalidValue)?
298            .to_string();
299        Ok((s, varint_bytes + len))
300    }
301
302    fn write_value(buf: &mut Vec<u8>, value: &SochValue) {
303        match value {
304            SochValue::Null => buf.push(SochTypeTag::Null as u8),
305            SochValue::Bool(true) => buf.push(SochTypeTag::True as u8),
306            SochValue::Bool(false) => buf.push(SochTypeTag::False as u8),
307            SochValue::Int(n) => {
308                // Optimization: FixInts
309                buf.push(SochTypeTag::Int64 as u8);
310                buf.extend_from_slice(&n.to_le_bytes());
311            }
312            SochValue::UInt(n) => {
313                buf.push(SochTypeTag::UInt as u8);
314                Self::write_varint(buf, *n);
315            }
316            SochValue::Float(f) => {
317                buf.push(SochTypeTag::Float64 as u8);
318                buf.extend_from_slice(&f.to_le_bytes());
319            }
320            SochValue::Text(s) => {
321                buf.push(SochTypeTag::Str32 as u8);
322                let bytes = s.as_bytes();
323                buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
324                buf.extend_from_slice(bytes);
325            }
326            SochValue::Binary(b) => {
327                buf.push(SochTypeTag::Binary as u8);
328                Self::write_varint(buf, b.len() as u64);
329                buf.extend_from_slice(b);
330            }
331            SochValue::Array(arr) => {
332                buf.push(SochTypeTag::Array as u8);
333                Self::write_varint(buf, arr.len() as u64);
334                for item in arr {
335                    Self::write_value(buf, item);
336                }
337            }
338            SochValue::Object(map) => {
339                buf.push(SochTypeTag::Object as u8);
340                Self::write_varint(buf, map.len() as u64);
341                for (k, v) in map {
342                    // Key string
343                    let k_bytes = k.as_bytes();
344                    Self::write_varint(buf, k_bytes.len() as u64);
345                    buf.extend_from_slice(k_bytes);
346                    // Value
347                    Self::write_value(buf, v);
348                }
349            }
350            SochValue::Ref { table, id } => {
351                buf.push(SochTypeTag::Ref as u8);
352                // table name
353                let t_bytes = table.as_bytes();
354                Self::write_varint(buf, t_bytes.len() as u64);
355                buf.extend_from_slice(t_bytes);
356                // id
357                Self::write_varint(buf, *id);
358            }
359        }
360    }
361
362    fn read_value(data: &[u8]) -> Result<(SochValue, usize), SochParseError> {
363        if data.is_empty() {
364            return Err(SochParseError::InvalidValue);
365        }
366        let tag = data[0];
367        let mut cursor = 1;
368
369        match tag {
370            0x00 => Ok((SochValue::Null, 1)),
371            0x01 => Ok((SochValue::Bool(false), 1)),
372            0x02 => Ok((SochValue::Bool(true), 1)),
373            0x33 => {
374                // Int64
375                if data.len() < cursor + 8 {
376                    return Err(SochParseError::InvalidValue);
377                }
378                let n = i64::from_le_bytes(data[cursor..cursor + 8].try_into().unwrap());
379                Ok((SochValue::Int(n), cursor + 8))
380            }
381            0x41 => {
382                // Float64
383                if data.len() < cursor + 8 {
384                    return Err(SochParseError::InvalidValue);
385                }
386                let f = f64::from_le_bytes(data[cursor..cursor + 8].try_into().unwrap());
387                Ok((SochValue::Float(f), cursor + 8))
388            }
389            0x62 => {
390                // Str32
391                if data.len() < cursor + 4 {
392                    return Err(SochParseError::InvalidValue);
393                }
394                let len = u32::from_le_bytes(data[cursor..cursor + 4].try_into().unwrap()) as usize;
395                cursor += 4;
396                if data.len() < cursor + len {
397                    return Err(SochParseError::InvalidValue);
398                }
399                let s = std::str::from_utf8(&data[cursor..cursor + len])
400                    .unwrap()
401                    .to_string();
402                Ok((SochValue::Text(s), cursor + len))
403            }
404            0x70 => {
405                // Array
406                let (len, bytes) = Self::read_varint(&data[cursor..])?;
407                cursor += bytes;
408                let mut arr = Vec::new();
409                for _ in 0..len {
410                    let (val, bytes_read) = Self::read_value(&data[cursor..])?;
411                    cursor += bytes_read;
412                    arr.push(val);
413                }
414                Ok((SochValue::Array(arr), cursor))
415            }
416            0xB0 => {
417                // UInt
418                let (n, bytes) = Self::read_varint(&data[cursor..])?;
419                Ok((SochValue::UInt(n), cursor + bytes))
420            }
421            0x80 => {
422                // Ref
423                let (table, table_bytes) = Self::read_string(&data[cursor..])?;
424                cursor += table_bytes;
425                let (id, id_bytes) = Self::read_varint(&data[cursor..])?;
426                Ok((SochValue::Ref { table, id }, cursor + id_bytes))
427            }
428            0x90 => {
429                // Object
430                let (len, bytes_read) = Self::read_varint(&data[cursor..])?;
431                cursor += bytes_read;
432                let mut map = HashMap::new();
433                for _ in 0..len {
434                    let (k, k_bytes) = Self::read_string(&data[cursor..])?;
435                    cursor += k_bytes;
436                    let (v, v_bytes) = Self::read_value(&data[cursor..])?;
437                    cursor += v_bytes;
438                    map.insert(k, v);
439                }
440                Ok((SochValue::Object(map), cursor))
441            }
442            // Add other cases as needed
443            _ => Err(SochParseError::InvalidValue),
444        }
445    }
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `doc` with the binary codec and decodes the bytes again.
    fn binary_round_trip(doc: &SochDocument) -> SochDocument {
        let bytes = SochDbBinaryCodec::encode(doc);
        SochDbBinaryCodec::decode(&bytes).unwrap()
    }

    #[test]
    fn test_simple_object() {
        let fields: HashMap<String, SochValue> = vec![
            ("id".to_string(), SochValue::Int(1)),
            ("name".to_string(), SochValue::Text("Alice".to_string())),
        ]
        .into_iter()
        .collect();
        let doc = SochDocument::new(SochValue::Object(fields));

        // Text output goes through the canonical encoder, whose exact layout
        // (key order, spacing) is not pinned here; only check that every key
        // and value shows up somewhere.
        let text = SochTextEncoder::encode(&doc);
        for needle in &["id", "1", "name", "Alice"] {
            assert!(text.contains(*needle));
        }

        // The binary codec must give back exactly what was put in.
        match binary_round_trip(&doc).root {
            SochValue::Object(map) => {
                assert_eq!(map.get("id"), Some(&SochValue::Int(1)));
                assert_eq!(map.get("name"), Some(&SochValue::Text("Alice".to_string())));
            }
            _ => panic!("Expected object"),
        }
    }

    #[test]
    fn test_array() {
        let doc = SochDocument::new(SochValue::Array(vec![
            SochValue::Int(1),
            SochValue::Int(2),
        ]));

        // Both elements must appear in the text rendering.
        let text = SochTextEncoder::encode(&doc);
        for needle in &["1", "2"] {
            assert!(text.contains(*needle));
        }

        match binary_round_trip(&doc).root {
            SochValue::Array(items) => {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], SochValue::Int(1));
            }
            _ => panic!("Expected array"),
        }
    }
}