Skip to main content

nxs/
compiler.rs

1use crate::consts::{
2    FLAG_SCHEMA_EMBEDDED, MAGIC_FILE, MAGIC_FOOTER, MAGIC_LIST, MAGIC_OBJ, SIGIL_BINARY,
3    SIGIL_BOOL, SIGIL_FLOAT, SIGIL_INT, SIGIL_KEYWORD, SIGIL_LINK, SIGIL_NULL, SIGIL_STR,
4    SIGIL_TIME, VERSION,
5};
6use crate::error::{NxsError, Result};
7use crate::parser::{Field, Value};
8use std::collections::HashMap;
9
/// Builds the binary output for a parsed field tree.
///
/// Keys are interned into a dictionary; each dictionary index ("slot") also
/// carries the sigil recorded for the values written under that key.
#[derive(Debug, Default)]
pub struct Compiler {
    /// Key index → key name.
    dict: Vec<String>,
    /// Key name → key index (inverse of `dict`).
    key_map: HashMap<String, usize>,
    /// Per-slot TypeManifest sigil (0 = unset → defaults to SIGIL_STR in schema).
    slot_sigils: Vec<u8>,
}
16
17impl Compiler {
18    pub fn new() -> Self {
19        Compiler {
20            dict: Vec::new(),
21            key_map: HashMap::new(),
22            slot_sigils: Vec::new(),
23        }
24    }
25
26    // First pass: collect all unique keys into the global dictionary
27    pub fn collect_keys(&mut self, fields: &[Field]) {
28        for field in fields {
29            self.intern_key(&field.key);
30            self.collect_keys_from_value(&field.value);
31        }
32    }
33
34    fn collect_keys_from_value(&mut self, value: &Value) {
35        match value {
36            Value::Object(fields) => {
37                for field in fields {
38                    self.intern_key(&field.key);
39                    self.collect_keys_from_value(&field.value);
40                }
41            }
42            Value::List(elems) => {
43                for e in elems {
44                    self.collect_keys_from_value(e);
45                }
46            }
47            _ => {}
48        }
49    }
50
51    fn intern_key(&mut self, key: &str) -> usize {
52        if let Some(&idx) = self.key_map.get(key) {
53            return idx;
54        }
55        let idx = self.dict.len();
56        self.dict.push(key.to_string());
57        self.slot_sigils.push(0);
58        self.key_map.insert(key.to_string(), idx);
59        idx
60    }
61
62    fn mark_slot_sigil(&mut self, slot: usize, sigil: u8) {
63        if slot >= self.slot_sigils.len() {
64            return;
65        }
66        let cur = self.slot_sigils[slot];
67        if cur == 0 || (cur == SIGIL_NULL && sigil != SIGIL_NULL) {
68            self.slot_sigils[slot] = sigil;
69        }
70    }
71
72    pub fn compile(&mut self, fields: &[Field]) -> Result<Vec<u8>> {
73        self.collect_keys(fields);
74
75        let mut data_sector: Vec<u8> = Vec::new();
76        // Top-level fields are wrapped into a single root object
77        let root_bytes = self.encode_object(fields)?;
78        data_sector.extend_from_slice(&root_bytes);
79
80        let schema_bytes = self.encode_schema();
81        let tail_ptr: u64 = 32 + schema_bytes.len() as u64 + data_sector.len() as u64;
82        let tail_index = self.encode_tail_index(32 + schema_bytes.len() as u64, tail_ptr);
83        let dict_hash = murmur3_64(&schema_bytes);
84
85        let preamble = self.encode_preamble(dict_hash, FLAG_SCHEMA_EMBEDDED);
86
87        let mut out = Vec::new();
88        out.extend_from_slice(&preamble);
89        out.extend_from_slice(&schema_bytes);
90        out.extend_from_slice(&data_sector);
91        out.extend_from_slice(&tail_index);
92        Ok(out)
93    }
94
95    fn encode_preamble(&self, dict_hash: u64, flags: u16) -> Vec<u8> {
96        let mut b = Vec::with_capacity(32);
97        b.extend_from_slice(&MAGIC_FILE.to_le_bytes()); // 0..4
98        b.extend_from_slice(&VERSION.to_le_bytes()); // 4..6
99        b.extend_from_slice(&flags.to_le_bytes()); // 6..8
100        b.extend_from_slice(&dict_hash.to_le_bytes()); // 8..16
101                                                       // v1.1 streamable format always writes tail_ptr=0 here; the actual
102                                                       // tail pointer is stored in the footer FooterTailPtr field instead.
103        b.extend_from_slice(&0u64.to_le_bytes()); // 16..24 tail_ptr (always 0)
104        b.extend_from_slice(&0u64.to_le_bytes()); // 24..32 reserved
105        b
106    }
107
108    fn encode_schema(&self) -> Vec<u8> {
109        let mut b = Vec::new();
110        let key_count = self.dict.len() as u16;
111        b.extend_from_slice(&key_count.to_le_bytes());
112
113        for (i, _) in self.dict.iter().enumerate() {
114            let s = self.slot_sigils.get(i).copied().unwrap_or(0);
115            b.push(if s == 0 { SIGIL_STR } else { s });
116        }
117
118        // StringPool: null-terminated names
119        for key in &self.dict {
120            b.extend_from_slice(key.as_bytes());
121            b.push(0x00);
122        }
123
124        // Pad to 8-byte boundary
125        while b.len() % 8 != 0 {
126            b.push(0x00);
127        }
128        b
129    }
130
131    fn encode_object(&mut self, fields: &[Field]) -> Result<Vec<u8>> {
132        // Resolve macro fields first
133        let resolved: Vec<(usize, Value)> = fields
134            .iter()
135            .map(|f| {
136                let idx = *self
137                    .key_map
138                    .get(&f.key)
139                    .ok_or_else(|| NxsError::ParseError(format!("key not in dict: {}", f.key)))?;
140                let v = resolve_macro(&f.value, fields)?;
141                Ok((idx, v))
142            })
143            .collect::<Result<Vec<_>>>()?;
144
145        // Build bitmask
146        let mask = build_bitmask(
147            &resolved.iter().map(|(i, _)| *i).collect::<Vec<_>>(),
148            self.dict.len(),
149        );
150
151        // Encode each value
152        let mut value_bufs: Vec<Vec<u8>> = Vec::new();
153        for (slot, v) in &resolved {
154            self.mark_slot_sigil(*slot, value_sigil_byte(v));
155            value_bufs.push(encode_value(v)?);
156        }
157
158        // Build offset table — offsets relative to object start (Magic byte)
159        // Object structure: [Magic 4][Length 4][Bitmask N][OffsetTable M*2][values...]
160        let header_size = 4 + 4; // magic + length
161        let bitmask_size = mask.len();
162        let offset_table_size = resolved.len() * 2; // normal mode: u16 each
163        let data_start = header_size + bitmask_size + offset_table_size;
164
165        // Align data_start to 8
166        let data_start_aligned = align8(data_start);
167        let align_padding = data_start_aligned - data_start;
168
169        let mut offsets: Vec<u16> = Vec::new();
170        let mut cursor = data_start_aligned;
171        for buf in &value_bufs {
172            offsets.push(cursor as u16);
173            cursor += buf.len();
174        }
175
176        let total_len = cursor;
177
178        let mut obj = Vec::with_capacity(total_len);
179        obj.extend_from_slice(&MAGIC_OBJ.to_le_bytes());
180        obj.extend_from_slice(&(total_len as u32).to_le_bytes());
181        obj.extend_from_slice(&mask);
182        for off in &offsets {
183            obj.extend_from_slice(&off.to_le_bytes());
184        }
185        for _ in 0..align_padding {
186            obj.push(0x00);
187        }
188        for buf in &value_bufs {
189            obj.extend_from_slice(buf);
190        }
191        Ok(obj)
192    }
193
194    fn encode_tail_index(&self, data_sector_start: u64, tail_ptr: u64) -> Vec<u8> {
195        // For the root object there is exactly one top-level record
196        let mut b = Vec::new();
197        let entry_count: u32 = 1;
198        b.extend_from_slice(&entry_count.to_le_bytes());
199        // KeyID 0 (root), absolute offset = data_sector_start
200        b.extend_from_slice(&0u16.to_le_bytes());
201        b.extend_from_slice(&data_sector_start.to_le_bytes());
202        b.extend_from_slice(&tail_ptr.to_le_bytes());
203        b.extend_from_slice(&MAGIC_FOOTER.to_le_bytes());
204        b
205    }
206}
207
208// --- Encoding helpers ---
209
210fn encode_value(v: &Value) -> Result<Vec<u8>> {
211    match v {
212        Value::Int(n) => {
213            let mut b = Vec::with_capacity(8);
214            b.extend_from_slice(&n.to_le_bytes());
215            Ok(b)
216        }
217        Value::Float(f) => {
218            let mut b = Vec::with_capacity(8);
219            b.extend_from_slice(&f.to_le_bytes());
220            Ok(b)
221        }
222        Value::Bool(bl) => {
223            let mut b = vec![if *bl { 0x01u8 } else { 0x00u8 }];
224            // 7 bytes padding to maintain 8-byte alignment for next field
225            b.extend_from_slice(&[0u8; 7]);
226            Ok(b)
227        }
228        Value::Keyword(_) => Err(NxsError::UnsupportedFieldType),
229        Value::Str(s) => {
230            let bytes = s.as_bytes();
231            let len = bytes.len() as u32;
232            let mut b = Vec::new();
233            b.extend_from_slice(&len.to_le_bytes());
234            b.extend_from_slice(bytes);
235            pad_to_8(&mut b);
236            Ok(b)
237        }
238        Value::Time(ns) => {
239            let mut b = Vec::with_capacity(8);
240            b.extend_from_slice(&ns.to_le_bytes());
241            Ok(b)
242        }
243        Value::Binary(raw) => {
244            let len = raw.len() as u32;
245            let mut b = Vec::new();
246            b.extend_from_slice(&len.to_le_bytes());
247            b.extend_from_slice(raw);
248            pad_to_8(&mut b);
249            Ok(b)
250        }
251        Value::Link(off) => {
252            let mut b = Vec::with_capacity(8);
253            b.extend_from_slice(&off.to_le_bytes());
254            b.extend_from_slice(&[0u8; 4]); // pad to 8
255            Ok(b)
256        }
257        Value::Null => {
258            // Null is zero-width: the bitmask bit and offset-table slot are sufficient
259            // to distinguish explicit Null from an absent field.  No payload bytes are
260            // emitted.  (An earlier draft of the spec incorrectly said "offset points
261            // to a single 0x00 byte" — see SPEC.md §5.4 conformance note.)
262            Ok(vec![])
263        }
264        Value::Object(fields) => {
265            // Nested object: recursively compile with a fresh compiler that shares the parent dict
266            // For POC we use a standalone compiler — a real impl would share the global dict
267            let mut inner = Compiler::new();
268            inner.collect_keys(fields);
269            // Copy parent dict entries
270            inner.dict = fields.iter().map(|f| f.key.clone()).collect();
271            inner.key_map = inner
272                .dict
273                .iter()
274                .cloned()
275                .enumerate()
276                .map(|(i, k)| (k, i))
277                .collect();
278            inner.encode_object(fields)
279        }
280        Value::List(elems) => encode_list(elems),
281        Value::Macro(_) => Err(NxsError::MacroUnresolved(
282            "unresolved macro in encode".into(),
283        )),
284    }
285}
286
287fn encode_list(elems: &[Value]) -> Result<Vec<u8>> {
288    if elems.is_empty() {
289        let mut b = Vec::new();
290        b.extend_from_slice(&MAGIC_LIST.to_le_bytes()); // 4
291        b.extend_from_slice(&16u32.to_le_bytes()); // length=16
292        b.push(0x00); // sigil (none)
293        b.extend_from_slice(&0u32.to_le_bytes()); // ElemCount
294        b.extend_from_slice(&[0u8; 3]); // padding
295        return Ok(b);
296    }
297
298    let sigil_byte = value_sigil_byte(elems.first().unwrap());
299
300    let mut elem_bufs: Vec<Vec<u8>> = elems
301        .iter()
302        .map(|e| {
303            if value_sigil_byte(e) != sigil_byte {
304                return Err(NxsError::ListTypeMismatch);
305            }
306            encode_value(e)
307        })
308        .collect::<Result<Vec<_>>>()?;
309
310    // List header is 16 bytes: Magic(4) + Length(4) + ElemSigil(1) + ElemCount(4) + Padding(3)
311    let data_len: usize = elem_bufs.iter().map(|b| b.len()).sum();
312    let total_len = 16 + data_len;
313
314    let mut b = Vec::with_capacity(total_len);
315    b.extend_from_slice(&MAGIC_LIST.to_le_bytes());
316    b.extend_from_slice(&(total_len as u32).to_le_bytes());
317    b.push(sigil_byte);
318    b.extend_from_slice(&(elems.len() as u32).to_le_bytes());
319    b.extend_from_slice(&[0u8; 3]); // padding to align data to offset 16
320    for buf in &mut elem_bufs {
321        b.append(buf);
322    }
323    Ok(b)
324}
325
326fn value_sigil_byte(v: &Value) -> u8 {
327    match v {
328        Value::Int(_) => SIGIL_INT,
329        Value::Float(_) => SIGIL_FLOAT,
330        Value::Bool(_) => SIGIL_BOOL,
331        Value::Keyword(_) => SIGIL_KEYWORD,
332        Value::Str(_) => SIGIL_STR,
333        Value::Time(_) => SIGIL_TIME,
334        Value::Binary(_) => SIGIL_BINARY,
335        Value::Link(_) => SIGIL_LINK,
336        Value::Null => SIGIL_NULL,
337        Value::Object(_) => b'O',
338        Value::List(_) => b'L',
339        Value::Macro(_) => b'!',
340    }
341}
342
/// Appends zero bytes to `b` until its length is a multiple of 8.
fn pad_to_8(b: &mut Vec<u8>) {
    let remainder = b.len() % 8;
    if remainder != 0 {
        let padded_len = b.len() + (8 - remainder);
        b.resize(padded_len, 0x00);
    }
}
348
/// Rounds `n` up to the nearest multiple of 8 (multiples of 8 are returned unchanged).
fn align8(n: usize) -> usize {
    match n % 8 {
        0 => n,
        remainder => n + (8 - remainder),
    }
}
352
/// Builds the key-presence bitmask for an object.
///
/// Each output byte carries 7 presence bits (bit 0 = lowest key index of the
/// group) and uses its top bit as a LEB128-style continuation flag, set on
/// every byte except the last. Indices `>= total_keys` are ignored. With
/// `total_keys == 0` the result is a single `0x00` byte.
fn build_bitmask(present_indices: &[usize], total_keys: usize) -> Vec<u8> {
    const BITS_PER_BYTE: usize = 7;
    const CONTINUATION: u8 = 0x80;

    if total_keys == 0 {
        return vec![0x00];
    }

    // One byte per started group of 7 keys
    let byte_count = (total_keys + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
    let mut mask = vec![0u8; byte_count];

    for &key_idx in present_indices.iter().filter(|&&k| k < total_keys) {
        mask[key_idx / BITS_PER_BYTE] |= 1 << (key_idx % BITS_PER_BYTE);
    }

    // Every byte but the last announces that more bytes follow
    for byte in &mut mask[..byte_count - 1] {
        *byte |= CONTINUATION;
    }
    mask
}
384
385// Minimal macro resolution: handle @key references and string concatenation
386fn resolve_macro(value: &Value, scope: &[Field]) -> Result<Value> {
387    match value {
388        Value::Macro(expr) => eval_macro(expr, scope),
389        other => Ok(other.clone()),
390    }
391}
392
393fn eval_macro(expr: &str, scope: &[Field]) -> Result<Value> {
394    let expr = expr.trim();
395
396    // @key reference
397    if let Some(key) = expr.strip_prefix('@') {
398        return scope
399            .iter()
400            .find(|f| f.key == key)
401            .map(|f| f.value.clone())
402            .ok_or_else(|| NxsError::MacroUnresolved(format!("@{key} not found in scope")));
403    }
404
405    // now() built-in
406    if expr == "now()" {
407        // Return 0 for deterministic output in POC; real impl would use SystemTime
408        return Ok(Value::Time(0));
409    }
410
411    // String/int literal passthrough
412    if expr.starts_with('"') && expr.ends_with('"') {
413        let inner = &expr[1..expr.len() - 1];
414        return Ok(Value::Str(inner.to_string()));
415    }
416    if let Ok(n) = expr.parse::<i64>() {
417        return Ok(Value::Int(n));
418    }
419    if let Ok(f) = expr.parse::<f64>() {
420        return Ok(Value::Float(f));
421    }
422
423    // String concatenation: split on ` + `
424    if expr.contains(" + ") {
425        let parts: Vec<&str> = expr.splitn(2, " + ").collect();
426        let left = eval_macro(parts[0].trim(), scope)?;
427        let right = eval_macro(parts[1].trim(), scope)?;
428        return match (left, right) {
429            (Value::Str(a), Value::Str(b)) => Ok(Value::Str(a + &b)),
430            (Value::Int(a), Value::Int(b)) => {
431                a.checked_add(b).map(Value::Int).ok_or(NxsError::Overflow)
432            }
433            (Value::Float(a), Value::Float(b)) => Ok(Value::Float(a + b)),
434            _ => Err(NxsError::MacroUnresolved(format!(
435                "incompatible types in +: {expr}"
436            ))),
437        };
438    }
439
440    Err(NxsError::MacroUnresolved(format!(
441        "cannot evaluate: {expr}"
442    )))
443}
444
/// 64-bit hash of `data` (simplified, finalizer-based MurmurHash3-style mix for
/// the POC — not the reference MurmurHash3 algorithm).
///
/// The input is consumed in 8-byte little-endian words (the last word is
/// zero-extended), each mixed into the running state; the input length is
/// folded in before a final avalanche.
fn murmur3_64(data: &[u8]) -> u64 {
    const SEED: u64 = 0x9368_1D62_5531_3A99;
    const MIX_A: u64 = 0xFF51AFD7ED558CCD;
    const MIX_B: u64 = 0xC4CEB9FE1A85EC53;

    let state = data.chunks(8).fold(SEED, |acc, chunk| {
        // Zero-extend a short trailing chunk to a full little-endian word
        let mut word = [0u8; 8];
        word[..chunk.len()].copy_from_slice(chunk);

        let scrambled = u64::from_le_bytes(word).wrapping_mul(MIX_A);
        let scrambled = scrambled ^ (scrambled >> 33);

        let next = (acc ^ scrambled).wrapping_mul(MIX_B);
        next ^ (next >> 33)
    });

    // Fold in the length, then run the final avalanche
    let with_len = state ^ data.len() as u64;
    let folded = (with_len ^ (with_len >> 33)).wrapping_mul(MIX_A);
    folded ^ (folded >> 33)
}