Skip to main content

sochdb_core/
soch_codec.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! TOON Format Codec
19//!
20//! This module implements the TOON (Token-Optimized Object Notation) format
21//! specification using the official `toon-format` crate.
22//!
23//! ## TOON Format Grammar (Simplified)
24//!
25//! ```text
26//! document     ::= top_level_value
27//! value        ::= simple_object | array | primitive
28//! simple_object::= (key ":" value newline)+ 
29//! array        ::= header newline item*
30//! header       ::= name "[" count "]" ( "{" fields "}" )? ":"
31//! item         ::= "-" value newline | row newline
32//! ```
33
34use crate::soch::{SochValue}; // Use shared types from soch.rs
35use std::collections::HashMap;
36use toon_format::{self, EncodeOptions, DecodeOptions, Delimiter, Indent};
37use toon_format::types::KeyFoldingMode;
38
39// ============================================================================
40// TOON Value Types
41// ============================================================================
42
/// One-byte type tags of the TOON binary encoding.
///
/// The enum is `#[repr(u8)]`, so `tag as u8` yields the discriminant listed
/// next to each variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum SochTypeTag {
    /// The null value.
    Null = 0x00,
    /// The boolean `false`.
    False = 0x01,
    /// The boolean `true`.
    True = 0x02,
    /// Small non-negative integer 0..=15 carried in the low nibble (0x10-0x1F).
    PosFixint = 0x10,
    /// Small negative integer -16..=-1 carried in the tag byte (0x20-0x2F).
    NegFixint = 0x20,
    /// Signed integer, 8 bits wide.
    Int8 = 0x30,
    /// Signed integer, 16 bits wide.
    Int16 = 0x31,
    /// Signed integer, 32 bits wide.
    Int32 = 0x32,
    /// Signed integer, 64 bits wide.
    Int64 = 0x33,
    /// Floating-point number, 32 bits wide.
    Float32 = 0x40,
    /// Floating-point number, 64 bits wide.
    Float64 = 0x41,
    /// Short string of 0-15 chars whose length sits in the low 4 bits (0x50-0x5F).
    FixStr = 0x50,
    /// String preceded by an 8-bit length.
    Str8 = 0x60,
    /// String preceded by a 16-bit length.
    Str16 = 0x61,
    /// String preceded by a 32-bit length.
    Str32 = 0x62,
    /// Array of values.
    Array = 0x70,
    /// Reference to a row of another table.
    Ref = 0x80,
    /// Object (map of keys to values).
    Object = 0x90,
    /// Raw binary data.
    Binary = 0xA0,
    /// Unsigned integer stored as a varint.
    UInt = 0xB0,
}
88
89// ============================================================================
90// TOON Document Structure
91// ============================================================================
92
93/// TOON document
94#[derive(Debug, Clone)]
95pub struct SochDocument {
96    /// Root value
97    pub root: SochValue,
98    /// Schema version
99    pub version: u32,
100}
101
102impl SochDocument {
103    /// Create a new TOON document from a value
104    pub fn new(root: SochValue) -> Self {
105        Self {
106            root,
107            version: 1,
108        }
109    }
110
111    /// Create a table-like document (legacy helper)
112    pub fn new_table(_name: impl Into<String>, fields: Vec<String>, rows: Vec<Vec<SochValue>>) -> Self {
113        // Convert to Array of Objects for canonical representation
114        let fields_str: Vec<String> = fields;
115        let mut array = Vec::new();
116        for row in rows {
117            let mut obj = HashMap::new();
118            for (i, val) in row.into_iter().enumerate() {
119                if i < fields_str.len() {
120                    obj.insert(fields_str[i].clone(), val);
121                }
122            }
123            array.push(SochValue::Object(obj));
124        }
125        
126        Self {
127            root: SochValue::Array(array),
128            version: 1,
129        }
130    }
131}
132
133// ============================================================================
134// Text Format (Human-Readable)
135// ============================================================================
136
137/// TOON text format encoder (wraps toon-format crate)
138pub struct SochTextEncoder;
139
140impl SochTextEncoder {
141    /// Encode a document to TOON text format
142    pub fn encode(doc: &SochDocument) -> String {
143        // Use default options for now, can be sophisticated later
144        let options = EncodeOptions::new()
145            .with_indent(Indent::Spaces(2))
146            .with_delimiter(Delimiter::Comma)
147            .with_key_folding(KeyFoldingMode::Safe);
148        
149        // Use toon_format to encode the SochValue
150        // SochValue implements Serialize, so this works directly.
151        toon_format::encode(&doc.root, &options).unwrap_or_else(|e| format!("Error encoding TOON: {}", e))
152    }
153}
154
155/// TOON text format decoder/parser (wraps toon-format crate)
156pub struct SochTextParser;
157
158impl SochTextParser {
159    pub fn parse(input: &str) -> Result<SochDocument, SochParseError> {
160         Self::parse_with_options(input, DecodeOptions::default())
161    }
162    
163    pub fn parse_with_options(input: &str, options: DecodeOptions) -> Result<SochDocument, SochParseError> {
164        let root: SochValue = toon_format::decode(input, &options)
165            .map_err(|e| SochParseError::RowError { line: 0, cause: e.to_string() })?;
166            
167        Ok(SochDocument::new(root))
168    }
169    
170    // Legacy helper kept for compatibility if needed, but useless now
171    pub fn parse_header(_line: &str) -> Result<(String, usize, Vec<String>), SochParseError> {
172        Err(SochParseError::InvalidHeader)
173    }
174}
175
176/// Token counter (dummy implementation for now)
177pub struct SochTokenCounter;
178impl SochTokenCounter {
179    pub fn count(_doc: &SochDocument) -> usize {
180        0
181    }
182}
183
184
/// Errors reported by the TOON text parser and the binary codec.
///
/// `Display` produces a short human-readable message; use `Debug` (`{:?}`)
/// for the structural form.
#[derive(Debug, Clone)]
pub enum SochParseError {
    /// The input was empty.
    EmptyInput,
    /// The header was rejected. The binary decoder returns this for a buffer
    /// that is too short or has the wrong magic bytes; `parse_header` always
    /// returns it.
    InvalidHeader,
    /// The row count is invalid.
    InvalidRowCount,
    /// A value could not be decoded. The binary decoder returns this for a
    /// checksum mismatch, truncated data or an unknown type tag.
    InvalidValue,
    /// The number of rows differs from the expected count.
    RowCountMismatch { expected: usize, actual: usize },
    /// The number of fields differs from the expected count.
    FieldCountMismatch { expected: usize, actual: usize },
    /// A failure described by `cause`. The text parser reports every decoder
    /// error through this variant with `line` set to 0, i.e. no line known.
    RowError { line: usize, cause: String },
}

impl std::fmt::Display for SochParseError {
    /// Write a human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EmptyInput => f.write_str("empty input"),
            Self::InvalidHeader => f.write_str("invalid header"),
            Self::InvalidRowCount => f.write_str("invalid row count"),
            Self::InvalidValue => f.write_str("invalid value"),
            Self::RowCountMismatch { expected, actual } => {
                write!(f, "row count mismatch: expected {}, got {}", expected, actual)
            }
            Self::FieldCountMismatch { expected, actual } => {
                write!(f, "field count mismatch: expected {}, got {}", expected, actual)
            }
            // Line 0 is what the text parser uses when no position is available,
            // so it is left out of the message rather than printed as "line 0".
            Self::RowError { line: 0, cause } => write!(f, "parse error: {}", cause),
            Self::RowError { line, cause } => {
                write!(f, "parse error at line {}: {}", line, cause)
            }
        }
    }
}

impl std::error::Error for SochParseError {}
203
204
205// ============================================================================
206// Binary Format (Compact)
207// ============================================================================
208
/// Magic bytes that open every TOON binary document: ASCII "TOON"
/// (0x54 0x4F 0x4F 0x4E).
pub const TOON_MAGIC: [u8; 4] = *b"TOON";
211
212/// TOON binary codec (Renamed from SochBinaryCodec to SochDbBinaryCodec)
213pub struct SochDbBinaryCodec;
214
215impl SochDbBinaryCodec {
216    /// Encode a document to binary format
217    pub fn encode(doc: &SochDocument) -> Vec<u8> {
218        let mut buf = Vec::new();
219        buf.extend_from_slice(&TOON_MAGIC);
220        // Version
221        Self::write_varint(&mut buf, doc.version as u64);
222        // Root value
223        Self::write_value(&mut buf, &doc.root);
224        // Checksum
225        let checksum = crc32fast::hash(&buf);
226        buf.extend_from_slice(&checksum.to_le_bytes());
227        buf
228    }
229
230    /// Decode binary format to document
231    pub fn decode(data: &[u8]) -> Result<SochDocument, SochParseError> {
232        if data.len() < 8 { return Err(SochParseError::InvalidHeader); }
233        if data[0..4] != TOON_MAGIC { return Err(SochParseError::InvalidHeader); }
234        
235        // Verify checksum
236        let stored_checksum = u32::from_le_bytes(data[data.len() - 4..].try_into().unwrap());
237        let computed_checksum = crc32fast::hash(&data[..data.len() - 4]);
238        if stored_checksum != computed_checksum { return Err(SochParseError::InvalidValue); }
239        
240        let data = &data[..data.len() - 4];
241        let mut cursor = 4;
242        
243        let (version, bytes) = Self::read_varint(&data[cursor..])?;
244        cursor += bytes;
245        
246        let (root, _) = Self::read_value(&data[cursor..])?;
247        
248        Ok(SochDocument {
249            root,
250            version: version as u32,
251        })
252    }
253    
254    fn write_varint(buf: &mut Vec<u8>, mut n: u64) {
255        while n > 127 {
256            buf.push((n as u8 & 0x7F) | 0x80);
257            n >>= 7;
258        }
259        buf.push(n as u8 & 0x7F);
260    }
261    
262    fn read_varint(data: &[u8]) -> Result<(u64, usize), SochParseError> {
263        let mut result: u64 = 0;
264        let mut shift = 0;
265        let mut i = 0;
266        while i < data.len() {
267            let byte = data[i];
268            result |= ((byte & 0x7F) as u64) << shift;
269            i += 1;
270            if byte & 0x80 == 0 { return Ok((result, i)); }
271            shift += 7;
272        }
273        Err(SochParseError::InvalidValue)
274    }
275
276    fn read_string(data: &[u8]) -> Result<(String, usize), SochParseError> {
277        let (len, varint_bytes) = Self::read_varint(data)?;
278        let len = len as usize;
279        if data.len() < varint_bytes + len { return Err(SochParseError::InvalidValue); }
280        let s = std::str::from_utf8(&data[varint_bytes..varint_bytes+len]).map_err(|_| SochParseError::InvalidValue)?.to_string();
281        Ok((s, varint_bytes + len))
282    }
283    
284    fn write_value(buf: &mut Vec<u8>, value: &SochValue) {
285        match value {
286            SochValue::Null => buf.push(SochTypeTag::Null as u8),
287            SochValue::Bool(true) => buf.push(SochTypeTag::True as u8),
288            SochValue::Bool(false) => buf.push(SochTypeTag::False as u8),
289            SochValue::Int(n) => {
290                 // Optimization: FixInts
291                 buf.push(SochTypeTag::Int64 as u8);
292                 buf.extend_from_slice(&n.to_le_bytes());
293            },
294            SochValue::UInt(n) => {
295                 buf.push(SochTypeTag::UInt as u8);
296                 Self::write_varint(buf, *n);
297            },
298            SochValue::Float(f) => {
299                 buf.push(SochTypeTag::Float64 as u8);
300                 buf.extend_from_slice(&f.to_le_bytes());
301            },
302            SochValue::Text(s) => {
303                 buf.push(SochTypeTag::Str32 as u8);
304                 let bytes = s.as_bytes();
305                 buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
306                 buf.extend_from_slice(bytes);
307            },
308            SochValue::Binary(b) => {
309                 buf.push(SochTypeTag::Binary as u8);
310                 Self::write_varint(buf, b.len() as u64);
311                 buf.extend_from_slice(b);
312            },
313            SochValue::Array(arr) => {
314                 buf.push(SochTypeTag::Array as u8);
315                 Self::write_varint(buf, arr.len() as u64);
316                 for item in arr { Self::write_value(buf, item); }
317            },
318            SochValue::Object(map) => {
319                 buf.push(SochTypeTag::Object as u8);
320                 Self::write_varint(buf, map.len() as u64);
321                 for (k, v) in map {
322                     // Key string
323                     let k_bytes = k.as_bytes();
324                     Self::write_varint(buf, k_bytes.len() as u64);
325                     buf.extend_from_slice(k_bytes);
326                     // Value
327                     Self::write_value(buf, v);
328                 }
329            },
330            SochValue::Ref { table, id } => {
331                 buf.push(SochTypeTag::Ref as u8);
332                 // table name
333                 let t_bytes = table.as_bytes();
334                 Self::write_varint(buf, t_bytes.len() as u64);
335                 buf.extend_from_slice(t_bytes);
336                 // id
337                 Self::write_varint(buf, *id);
338            }
339        }
340    }
341    
342    fn read_value(data: &[u8]) -> Result<(SochValue, usize), SochParseError> {
343        if data.is_empty() { return Err(SochParseError::InvalidValue); }
344        let tag = data[0];
345        let mut cursor = 1;
346        
347        match tag {
348            0x00 => Ok((SochValue::Null, 1)),
349            0x01 => Ok((SochValue::Bool(false), 1)),
350            0x02 => Ok((SochValue::Bool(true), 1)),
351            0x33 => { // Int64
352                 if data.len() < cursor + 8 { return Err(SochParseError::InvalidValue); }
353                 let n = i64::from_le_bytes(data[cursor..cursor+8].try_into().unwrap());
354                 Ok((SochValue::Int(n), cursor+8))
355            },
356            0x41 => { // Float64
357                 if data.len() < cursor + 8 { return Err(SochParseError::InvalidValue); }
358                 let f = f64::from_le_bytes(data[cursor..cursor+8].try_into().unwrap());
359                 Ok((SochValue::Float(f), cursor+8))
360            },
361            0x62 => { // Str32
362                 if data.len() < cursor + 4 { return Err(SochParseError::InvalidValue); }
363                 let len = u32::from_le_bytes(data[cursor..cursor+4].try_into().unwrap()) as usize;
364                 cursor += 4;
365                 if data.len() < cursor + len { return Err(SochParseError::InvalidValue); }
366                 let s = std::str::from_utf8(&data[cursor..cursor+len]).unwrap().to_string();
367                 Ok((SochValue::Text(s), cursor+len))
368            },
369            0x70 => { // Array
370                 let (len, bytes) = Self::read_varint(&data[cursor..])?;
371                 cursor += bytes;
372                 let mut arr = Vec::new();
373                 for _ in 0..len {
374                     let (val, bytes_read) = Self::read_value(&data[cursor..])?;
375                     cursor += bytes_read;
376                     arr.push(val);
377                 }
378                 Ok((SochValue::Array(arr), cursor))
379            },
380            0xB0 => { // UInt
381                 let (n, bytes) = Self::read_varint(&data[cursor..])?;
382                 Ok((SochValue::UInt(n), cursor+bytes))
383            },
384            0x80 => { // Ref
385                 let (table, table_bytes) = Self::read_string(&data[cursor..])?;
386                 cursor += table_bytes;
387                 let (id, id_bytes) = Self::read_varint(&data[cursor..])?;
388                 Ok((SochValue::Ref { table, id }, cursor+id_bytes))
389            },
390            0x90 => { // Object
391                 let (len, bytes_read) = Self::read_varint(&data[cursor..])?;
392                 cursor += bytes_read;
393                 let mut map = HashMap::new();
394                 for _ in 0..len {
395                     let (k, k_bytes) = Self::read_string(&data[cursor..])?;
396                     cursor += k_bytes;
397                     let (v, v_bytes) = Self::read_value(&data[cursor..])?;
398                     cursor += v_bytes;
399                     map.insert(k, v);
400                 }
401                 Ok((SochValue::Object(map), cursor))
402            },
403            // Add other cases as needed
404            _ => Err(SochParseError::InvalidValue)
405        }
406    }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// An object is rendered to text containing its keys and values, and
    /// survives a binary encode/decode round trip.
    #[test]
    fn test_simple_object() {
        let fields: HashMap<String, SochValue> = vec![
            ("id".to_string(), SochValue::Int(1)),
            ("name".to_string(), SochValue::Text("Alice".to_string())),
        ]
        .into_iter()
        .collect();
        let doc = SochDocument::new(SochValue::Object(fields));

        // Text output layout is up to the canonical encoder (e.g. key order),
        // so only the presence of keys and values is checked.
        let text = SochTextEncoder::encode(&doc);
        assert!(text.contains("id"));
        assert!(text.contains("1"));
        assert!(text.contains("name"));
        assert!(text.contains("Alice"));

        // Binary round trip through the renamed codec.
        let bytes = SochDbBinaryCodec::encode(&doc);
        let decoded = SochDbBinaryCodec::decode(&bytes).unwrap();
        match decoded.root {
            SochValue::Object(map) => {
                assert_eq!(map.get("id"), Some(&SochValue::Int(1)));
                assert_eq!(map.get("name"), Some(&SochValue::Text("Alice".to_string())));
            }
            _ => panic!("Expected object"),
        }
    }

    /// An array is rendered to text containing its items, and survives a
    /// binary encode/decode round trip.
    #[test]
    fn test_array() {
        let items = vec![SochValue::Int(1), SochValue::Int(2)];
        let doc = SochDocument::new(SochValue::Array(items));

        let text = SochTextEncoder::encode(&doc);
        assert!(text.contains("1"));
        assert!(text.contains("2"));

        let bytes = SochDbBinaryCodec::encode(&doc);
        let decoded = SochDbBinaryCodec::decode(&bytes).unwrap();
        match decoded.root {
            SochValue::Array(arr) => {
                assert_eq!(arr.len(), 2);
                assert_eq!(arr[0], SochValue::Int(1));
            }
            _ => panic!("Expected array"),
        }
    }
}
463}