sochdb_core/
soch_codec.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! TOON Format Codec
16//!
17//! This module implements the TOON (Token-Optimized Object Notation) format
18//! specification using the official `toon-format` crate.
19//!
20//! ## TOON Format Grammar (Simplified)
21//!
22//! ```text
23//! document     ::= top_level_value
24//! value        ::= simple_object | array | primitive
25//! simple_object::= (key ":" value newline)+ 
26//! array        ::= header newline item*
27//! header       ::= name "[" count "]" ( "{" fields "}" )? ":"
28//! item         ::= "-" value newline | row newline
29//! ```
30
31use crate::soch::{SochValue}; // Use shared types from soch.rs
32use std::collections::HashMap;
33use toon_format::{self, EncodeOptions, DecodeOptions, Delimiter, Indent};
34use toon_format::types::KeyFoldingMode;
35
36// ============================================================================
37// TOON Value Types
38// ============================================================================
39
/// One-byte type tags that introduce each value in the TOON binary encoding.
///
/// The discriminant of every variant is the tag byte itself (`#[repr(u8)]`), so
/// `SochTypeTag::X as u8` yields the byte to write.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SochTypeTag {
    /// The null value.
    Null = 0x00,
    /// The boolean `false`.
    False = 0x01,
    /// The boolean `true`.
    True = 0x02,
    /// Small non-negative integer 0–15 carried in the low nibble (tag range 0x10–0x1F).
    PosFixint = 0x10,
    /// Small negative integer -16 to -1 carried in the tag (tag range 0x20–0x2F).
    NegFixint = 0x20,
    /// Signed integer, 8 bits wide.
    Int8 = 0x30,
    /// Signed integer, 16 bits wide.
    Int16 = 0x31,
    /// Signed integer, 32 bits wide.
    Int32 = 0x32,
    /// Signed integer, 64 bits wide.
    Int64 = 0x33,
    /// Floating-point number, 32 bits wide.
    Float32 = 0x40,
    /// Floating-point number, 64 bits wide.
    Float64 = 0x41,
    /// Short string of 0–15 chars whose length sits in the low 4 bits (tag range 0x50–0x5F).
    FixStr = 0x50,
    /// String preceded by an 8-bit length.
    Str8 = 0x60,
    /// String preceded by a 16-bit length.
    Str16 = 0x61,
    /// String preceded by a 32-bit length.
    Str32 = 0x62,
    /// Sequence of values.
    Array = 0x70,
    /// Pointer to a row in another table.
    Ref = 0x80,
    /// Key/value map.
    Object = 0x90,
    /// Raw byte string.
    Binary = 0xA0,
    /// Unsigned integer stored as a varint.
    UInt = 0xB0,
}
85
86// ============================================================================
87// TOON Document Structure
88// ============================================================================
89
90/// TOON document
91#[derive(Debug, Clone)]
92pub struct SochDocument {
93    /// Root value
94    pub root: SochValue,
95    /// Schema version
96    pub version: u32,
97}
98
99impl SochDocument {
100    /// Create a new TOON document from a value
101    pub fn new(root: SochValue) -> Self {
102        Self {
103            root,
104            version: 1,
105        }
106    }
107
108    /// Create a table-like document (legacy helper)
109    pub fn new_table(_name: impl Into<String>, fields: Vec<String>, rows: Vec<Vec<SochValue>>) -> Self {
110        // Convert to Array of Objects for canonical representation
111        let fields_str: Vec<String> = fields;
112        let mut array = Vec::new();
113        for row in rows {
114            let mut obj = HashMap::new();
115            for (i, val) in row.into_iter().enumerate() {
116                if i < fields_str.len() {
117                    obj.insert(fields_str[i].clone(), val);
118                }
119            }
120            array.push(SochValue::Object(obj));
121        }
122        
123        Self {
124            root: SochValue::Array(array),
125            version: 1,
126        }
127    }
128}
129
130// ============================================================================
131// Text Format (Human-Readable)
132// ============================================================================
133
134/// TOON text format encoder (wraps toon-format crate)
135pub struct SochTextEncoder;
136
137impl SochTextEncoder {
138    /// Encode a document to TOON text format
139    pub fn encode(doc: &SochDocument) -> String {
140        // Use default options for now, can be sophisticated later
141        let options = EncodeOptions::new()
142            .with_indent(Indent::Spaces(2))
143            .with_delimiter(Delimiter::Comma)
144            .with_key_folding(KeyFoldingMode::Safe);
145        
146        // Use toon_format to encode the SochValue
147        // SochValue implements Serialize, so this works directly.
148        toon_format::encode(&doc.root, &options).unwrap_or_else(|e| format!("Error encoding TOON: {}", e))
149    }
150}
151
152/// TOON text format decoder/parser (wraps toon-format crate)
153pub struct SochTextParser;
154
155impl SochTextParser {
156    pub fn parse(input: &str) -> Result<SochDocument, SochParseError> {
157         Self::parse_with_options(input, DecodeOptions::default())
158    }
159    
160    pub fn parse_with_options(input: &str, options: DecodeOptions) -> Result<SochDocument, SochParseError> {
161        let root: SochValue = toon_format::decode(input, &options)
162            .map_err(|e| SochParseError::RowError { line: 0, cause: e.to_string() })?;
163            
164        Ok(SochDocument::new(root))
165    }
166    
167    // Legacy helper kept for compatibility if needed, but useless now
168    pub fn parse_header(_line: &str) -> Result<(String, usize, Vec<String>), SochParseError> {
169        Err(SochParseError::InvalidHeader)
170    }
171}
172
173/// Token counter (dummy implementation for now)
174pub struct SochTokenCounter;
175impl SochTokenCounter {
176    pub fn count(_doc: &SochDocument) -> usize {
177        0
178    }
179}
180
181
/// Errors produced while parsing TOON text or decoding TOON binary data.
#[derive(Debug, Clone)]
pub enum SochParseError {
    /// The input contained nothing to parse.
    EmptyInput,
    /// The header was missing or malformed.
    InvalidHeader,
    /// A row count could not be interpreted.
    InvalidRowCount,
    /// A value was malformed or could not be decoded.
    InvalidValue,
    /// The number of rows found differs from the number announced.
    RowCountMismatch { expected: usize, actual: usize },
    /// The number of fields found differs from the number announced.
    FieldCountMismatch { expected: usize, actual: usize },
    /// A failure tied to a specific line, with a free-form description.
    RowError { line: usize, cause: String },
}

impl std::error::Error for SochParseError {}

impl std::fmt::Display for SochParseError {
    /// Writes the same text as the derived `Debug` representation.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // A fresh format call is used on purpose: flags set on `formatter` (width, `#`, ...)
        // are not forwarded to the Debug rendering.
        formatter.write_fmt(format_args!("{:?}", self))
    }
}
200
201
202// ============================================================================
203// Binary Format (Compact)
204// ============================================================================
205
/// Four-byte signature (ASCII `TOON`) placed at the start of every binary-encoded document.
pub const TOON_MAGIC: [u8; 4] = *b"TOON";
208
209/// TOON binary codec (Renamed from SochBinaryCodec to SochDbBinaryCodec)
210pub struct SochDbBinaryCodec;
211
212impl SochDbBinaryCodec {
213    /// Encode a document to binary format
214    pub fn encode(doc: &SochDocument) -> Vec<u8> {
215        let mut buf = Vec::new();
216        buf.extend_from_slice(&TOON_MAGIC);
217        // Version
218        Self::write_varint(&mut buf, doc.version as u64);
219        // Root value
220        Self::write_value(&mut buf, &doc.root);
221        // Checksum
222        let checksum = crc32fast::hash(&buf);
223        buf.extend_from_slice(&checksum.to_le_bytes());
224        buf
225    }
226
227    /// Decode binary format to document
228    pub fn decode(data: &[u8]) -> Result<SochDocument, SochParseError> {
229        if data.len() < 8 { return Err(SochParseError::InvalidHeader); }
230        if data[0..4] != TOON_MAGIC { return Err(SochParseError::InvalidHeader); }
231        
232        // Verify checksum
233        let stored_checksum = u32::from_le_bytes(data[data.len() - 4..].try_into().unwrap());
234        let computed_checksum = crc32fast::hash(&data[..data.len() - 4]);
235        if stored_checksum != computed_checksum { return Err(SochParseError::InvalidValue); }
236        
237        let data = &data[..data.len() - 4];
238        let mut cursor = 4;
239        
240        let (version, bytes) = Self::read_varint(&data[cursor..])?;
241        cursor += bytes;
242        
243        let (root, _) = Self::read_value(&data[cursor..])?;
244        
245        Ok(SochDocument {
246            root,
247            version: version as u32,
248        })
249    }
250    
251    fn write_varint(buf: &mut Vec<u8>, mut n: u64) {
252        while n > 127 {
253            buf.push((n as u8 & 0x7F) | 0x80);
254            n >>= 7;
255        }
256        buf.push(n as u8 & 0x7F);
257    }
258    
259    fn read_varint(data: &[u8]) -> Result<(u64, usize), SochParseError> {
260        let mut result: u64 = 0;
261        let mut shift = 0;
262        let mut i = 0;
263        while i < data.len() {
264            let byte = data[i];
265            result |= ((byte & 0x7F) as u64) << shift;
266            i += 1;
267            if byte & 0x80 == 0 { return Ok((result, i)); }
268            shift += 7;
269        }
270        Err(SochParseError::InvalidValue)
271    }
272
273    fn read_string(data: &[u8]) -> Result<(String, usize), SochParseError> {
274        let (len, varint_bytes) = Self::read_varint(data)?;
275        let len = len as usize;
276        if data.len() < varint_bytes + len { return Err(SochParseError::InvalidValue); }
277        let s = std::str::from_utf8(&data[varint_bytes..varint_bytes+len]).map_err(|_| SochParseError::InvalidValue)?.to_string();
278        Ok((s, varint_bytes + len))
279    }
280    
281    fn write_value(buf: &mut Vec<u8>, value: &SochValue) {
282        match value {
283            SochValue::Null => buf.push(SochTypeTag::Null as u8),
284            SochValue::Bool(true) => buf.push(SochTypeTag::True as u8),
285            SochValue::Bool(false) => buf.push(SochTypeTag::False as u8),
286            SochValue::Int(n) => {
287                 // Optimization: FixInts
288                 buf.push(SochTypeTag::Int64 as u8);
289                 buf.extend_from_slice(&n.to_le_bytes());
290            },
291            SochValue::UInt(n) => {
292                 buf.push(SochTypeTag::UInt as u8);
293                 Self::write_varint(buf, *n);
294            },
295            SochValue::Float(f) => {
296                 buf.push(SochTypeTag::Float64 as u8);
297                 buf.extend_from_slice(&f.to_le_bytes());
298            },
299            SochValue::Text(s) => {
300                 buf.push(SochTypeTag::Str32 as u8);
301                 let bytes = s.as_bytes();
302                 buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
303                 buf.extend_from_slice(bytes);
304            },
305            SochValue::Binary(b) => {
306                 buf.push(SochTypeTag::Binary as u8);
307                 Self::write_varint(buf, b.len() as u64);
308                 buf.extend_from_slice(b);
309            },
310            SochValue::Array(arr) => {
311                 buf.push(SochTypeTag::Array as u8);
312                 Self::write_varint(buf, arr.len() as u64);
313                 for item in arr { Self::write_value(buf, item); }
314            },
315            SochValue::Object(map) => {
316                 buf.push(SochTypeTag::Object as u8);
317                 Self::write_varint(buf, map.len() as u64);
318                 for (k, v) in map {
319                     // Key string
320                     let k_bytes = k.as_bytes();
321                     Self::write_varint(buf, k_bytes.len() as u64);
322                     buf.extend_from_slice(k_bytes);
323                     // Value
324                     Self::write_value(buf, v);
325                 }
326            },
327            SochValue::Ref { table, id } => {
328                 buf.push(SochTypeTag::Ref as u8);
329                 // table name
330                 let t_bytes = table.as_bytes();
331                 Self::write_varint(buf, t_bytes.len() as u64);
332                 buf.extend_from_slice(t_bytes);
333                 // id
334                 Self::write_varint(buf, *id);
335            }
336        }
337    }
338    
339    fn read_value(data: &[u8]) -> Result<(SochValue, usize), SochParseError> {
340        if data.is_empty() { return Err(SochParseError::InvalidValue); }
341        let tag = data[0];
342        let mut cursor = 1;
343        
344        match tag {
345            0x00 => Ok((SochValue::Null, 1)),
346            0x01 => Ok((SochValue::Bool(false), 1)),
347            0x02 => Ok((SochValue::Bool(true), 1)),
348            0x33 => { // Int64
349                 if data.len() < cursor + 8 { return Err(SochParseError::InvalidValue); }
350                 let n = i64::from_le_bytes(data[cursor..cursor+8].try_into().unwrap());
351                 Ok((SochValue::Int(n), cursor+8))
352            },
353            0x41 => { // Float64
354                 if data.len() < cursor + 8 { return Err(SochParseError::InvalidValue); }
355                 let f = f64::from_le_bytes(data[cursor..cursor+8].try_into().unwrap());
356                 Ok((SochValue::Float(f), cursor+8))
357            },
358            0x62 => { // Str32
359                 if data.len() < cursor + 4 { return Err(SochParseError::InvalidValue); }
360                 let len = u32::from_le_bytes(data[cursor..cursor+4].try_into().unwrap()) as usize;
361                 cursor += 4;
362                 if data.len() < cursor + len { return Err(SochParseError::InvalidValue); }
363                 let s = std::str::from_utf8(&data[cursor..cursor+len]).unwrap().to_string();
364                 Ok((SochValue::Text(s), cursor+len))
365            },
366            0x70 => { // Array
367                 let (len, bytes) = Self::read_varint(&data[cursor..])?;
368                 cursor += bytes;
369                 let mut arr = Vec::new();
370                 for _ in 0..len {
371                     let (val, bytes_read) = Self::read_value(&data[cursor..])?;
372                     cursor += bytes_read;
373                     arr.push(val);
374                 }
375                 Ok((SochValue::Array(arr), cursor))
376            },
377            0xB0 => { // UInt
378                 let (n, bytes) = Self::read_varint(&data[cursor..])?;
379                 Ok((SochValue::UInt(n), cursor+bytes))
380            },
381            0x80 => { // Ref
382                 let (table, table_bytes) = Self::read_string(&data[cursor..])?;
383                 cursor += table_bytes;
384                 let (id, id_bytes) = Self::read_varint(&data[cursor..])?;
385                 Ok((SochValue::Ref { table, id }, cursor+id_bytes))
386            },
387            0x90 => { // Object
388                 let (len, bytes_read) = Self::read_varint(&data[cursor..])?;
389                 cursor += bytes_read;
390                 let mut map = HashMap::new();
391                 for _ in 0..len {
392                     let (k, k_bytes) = Self::read_string(&data[cursor..])?;
393                     cursor += k_bytes;
394                     let (v, v_bytes) = Self::read_value(&data[cursor..])?;
395                     cursor += v_bytes;
396                     map.insert(k, v);
397                 }
398                 Ok((SochValue::Object(map), cursor))
399            },
400            // Add other cases as needed
401            _ => Err(SochParseError::InvalidValue)
402        }
403    }
404}
405
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `doc` with the binary codec and decodes the result, panicking on failure.
    fn binary_roundtrip(doc: &SochDocument) -> SochDocument {
        let bytes = SochDbBinaryCodec::encode(doc);
        SochDbBinaryCodec::decode(&bytes).unwrap()
    }

    #[test]
    fn test_simple_object() {
        let fields = HashMap::from([
            ("id".to_string(), SochValue::Int(1)),
            ("name".to_string(), SochValue::Text("Alice".to_string())),
        ]);
        let doc = SochDocument::new(SochValue::Object(fields));

        // Text output comes from the canonical encoder; key order is not fixed, so only
        // check that every key and value appears somewhere.
        let text = SochTextEncoder::encode(&doc);
        for needle in ["id", "1", "name", "Alice"] {
            assert!(text.contains(needle));
        }

        // Binary round trip must give back the same entries.
        match binary_roundtrip(&doc).root {
            SochValue::Object(map) => {
                assert_eq!(map.get("id"), Some(&SochValue::Int(1)));
                assert_eq!(map.get("name"), Some(&SochValue::Text("Alice".to_string())));
            }
            _ => panic!("Expected object"),
        }
    }

    #[test]
    fn test_array() {
        let doc = SochDocument::new(SochValue::Array(vec![SochValue::Int(1), SochValue::Int(2)]));

        // Both elements must show up in the text form.
        let text = SochTextEncoder::encode(&doc);
        for needle in ["1", "2"] {
            assert!(text.contains(needle));
        }

        match binary_roundtrip(&doc).root {
            SochValue::Array(items) => {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], SochValue::Int(1));
            }
            _ => panic!("Expected array"),
        }
    }
}