Skip to main content

sim_codec/implementation/
portable.rs

1//! Codec-neutral, lossless text form for the data subset of `Expr`.
2//!
3//! Domain codecs (`codec:scene`, `codec:intent`, ...) must round-trip arbitrary
4//! data values without borrowing a general codec's grammar or losing
5//! information. This module provides exactly that: a small self-delimiting
6//! textual form that round-trips the data subset of `Expr` exactly -- maps,
7//! lists, vectors, sets, and the atoms (nil, bool, number, symbol, string,
8//! bytes). Eval-only `Expr` forms (calls, infix/prefix/postfix, quotes, blocks,
9//! locals, annotations, extensions) are not data and are rejected by
10//! [`encode_portable`], which is how a domain codec fails closed.
11//!
12//! Tag bytes: `_` nil, `T`/`F` bool, `N` number, `S` symbol, `R` string,
13//! `B` bytes, `(` list, `[` vector, `{` map, `%(` set. Symbol payloads start
14//! with `Q` (qualified) or `U` (unqualified); quoted strings are delimited by
15//! `"` with `\\ \" \n \r \t` escapes.
16
17use sim_kernel::{CodecId, Error, Expr, NumberLiteral, Result, Symbol};
18
19/// Serialize a data-subset `Expr` into the codec-neutral portable text form.
20///
21/// `codec` is used only to tag any error with the calling codec's id.
22///
23/// # Examples
24///
25/// ```
26/// use sim_codec::{decode_portable, encode_portable};
27/// use sim_kernel::{CodecId, Expr};
28///
29/// let expr = Expr::List(vec![Expr::Nil, Expr::Bool(true), Expr::String("hi".into())]);
30/// let text = encode_portable(CodecId(0), &expr).unwrap();
31/// assert_eq!(decode_portable(CodecId(0), &text).unwrap(), expr);
32///
33/// // Eval-only forms are not data and fail closed.
34/// assert!(encode_portable(CodecId(0), &Expr::Block(vec![])).is_err());
35/// ```
36pub fn encode_portable(codec: CodecId, expr: &Expr) -> Result<String> {
37    let mut out = String::new();
38    write_value(codec, expr, &mut out)?;
39    Ok(out)
40}
41
42/// Parse codec-neutral portable text back into an `Expr`, failing closed on any
43/// malformed input rather than panicking.
44pub fn decode_portable(codec: CodecId, source: &str) -> Result<Expr> {
45    let mut parser = Parser {
46        bytes: source.as_bytes(),
47        pos: 0,
48        codec,
49    };
50    let expr = parser.parse_value()?;
51    parser.skip_ws();
52    if parser.pos != parser.bytes.len() {
53        return Err(parser.error("trailing input after value"));
54    }
55    Ok(expr)
56}
57
58fn unsupported(codec: CodecId, form: &str) -> Error {
59    Error::CodecError {
60        codec,
61        message: format!("portable text cannot encode a non-data expression form: {form}"),
62    }
63}
64
65fn write_value(codec: CodecId, expr: &Expr, out: &mut String) -> Result<()> {
66    match expr {
67        Expr::Nil => out.push('_'),
68        Expr::Bool(true) => out.push('T'),
69        Expr::Bool(false) => out.push('F'),
70        Expr::Number(number) => {
71            out.push('N');
72            write_symbol_payload(&number.domain, out);
73            write_qstr(&number.canonical, out);
74        }
75        Expr::Symbol(symbol) => {
76            out.push('S');
77            write_symbol_payload(symbol, out);
78        }
79        Expr::String(text) => {
80            out.push('R');
81            write_qstr(text, out);
82        }
83        Expr::Bytes(bytes) => {
84            out.push('B');
85            write_qstr(&hex_encode(bytes), out);
86        }
87        Expr::List(items) => write_seq(codec, '(', ')', items, out)?,
88        Expr::Vector(items) => write_seq(codec, '[', ']', items, out)?,
89        Expr::Set(items) => {
90            out.push('%');
91            write_seq(codec, '(', ')', items, out)?;
92        }
93        Expr::Map(entries) => {
94            out.push('{');
95            for (key, value) in entries {
96                out.push(' ');
97                write_value(codec, key, out)?;
98                out.push(' ');
99                write_value(codec, value, out)?;
100            }
101            out.push_str(" }");
102        }
103        Expr::Local(_) => return Err(unsupported(codec, "local")),
104        Expr::Call { .. } => return Err(unsupported(codec, "call")),
105        Expr::Infix { .. } => return Err(unsupported(codec, "infix")),
106        Expr::Prefix { .. } => return Err(unsupported(codec, "prefix")),
107        Expr::Postfix { .. } => return Err(unsupported(codec, "postfix")),
108        Expr::Block(_) => return Err(unsupported(codec, "block")),
109        Expr::Quote { .. } => return Err(unsupported(codec, "quote")),
110        Expr::Annotated { .. } => return Err(unsupported(codec, "annotated")),
111        Expr::Extension { .. } => return Err(unsupported(codec, "extension")),
112    }
113    Ok(())
114}
115
116fn write_seq(
117    codec: CodecId,
118    open: char,
119    close: char,
120    items: &[Expr],
121    out: &mut String,
122) -> Result<()> {
123    out.push(open);
124    for item in items {
125        out.push(' ');
126        write_value(codec, item, out)?;
127    }
128    out.push(' ');
129    out.push(close);
130    Ok(())
131}
132
133fn write_symbol_payload(symbol: &Symbol, out: &mut String) {
134    match &symbol.namespace {
135        Some(namespace) => {
136            out.push('Q');
137            write_qstr(namespace, out);
138            write_qstr(&symbol.name, out);
139        }
140        None => {
141            out.push('U');
142            write_qstr(&symbol.name, out);
143        }
144    }
145}
146
/// Appends `text` to `out` as a double-quoted string.
///
/// Backslash, double quote, newline, carriage return and tab are written as
/// the two-character escapes `\\`, `\"`, `\n`, `\r` and `\t`. Every other
/// character is copied through unchanged.
fn write_qstr(text: &str, out: &mut String) {
    // Two delimiters plus at least one output byte per input byte.
    out.reserve(text.len() + 2);
    out.push('"');
    for ch in text.chars() {
        let escape = match ch {
            '\\' => "\\\\",
            '"' => "\\\"",
            '\n' => "\\n",
            '\r' => "\\r",
            '\t' => "\\t",
            plain => {
                out.push(plain);
                continue;
            }
        };
        out.push_str(escape);
    }
    out.push('"');
}
161
/// Encodes `bytes` as lowercase hexadecimal, two digits per byte, high nibble
/// first.
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let digit = |nibble: u8| char::from(DIGITS[usize::from(nibble)]);

    bytes
        .iter()
        .fold(String::with_capacity(bytes.len() * 2), |mut hex, &byte| {
            hex.push(digit(byte >> 4));
            hex.push(digit(byte & 0x0f));
            hex
        })
}
171
172struct Parser<'a> {
173    bytes: &'a [u8],
174    pos: usize,
175    codec: CodecId,
176}
177
178impl Parser<'_> {
179    fn error(&self, message: impl Into<String>) -> Error {
180        Error::CodecError {
181            codec: self.codec,
182            message: format!(
183                "portable text decode error at byte {}: {}",
184                self.pos,
185                message.into()
186            ),
187        }
188    }
189
190    fn skip_ws(&mut self) {
191        while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_whitespace() {
192            self.pos += 1;
193        }
194    }
195
196    fn peek(&self) -> Option<u8> {
197        self.bytes.get(self.pos).copied()
198    }
199
200    fn bump(&mut self) -> Option<u8> {
201        let byte = self.peek()?;
202        self.pos += 1;
203        Some(byte)
204    }
205
206    fn expect(&mut self, byte: u8) -> Result<()> {
207        if self.bump() == Some(byte) {
208            Ok(())
209        } else {
210            Err(self.error(format!("expected '{}'", byte as char)))
211        }
212    }
213
214    fn parse_value(&mut self) -> Result<Expr> {
215        self.skip_ws();
216        match self.peek() {
217            Some(b'_') => {
218                self.pos += 1;
219                Ok(Expr::Nil)
220            }
221            Some(b'T') => {
222                self.pos += 1;
223                Ok(Expr::Bool(true))
224            }
225            Some(b'F') => {
226                self.pos += 1;
227                Ok(Expr::Bool(false))
228            }
229            Some(b'N') => {
230                self.pos += 1;
231                let domain = self.parse_symbol_payload()?;
232                let canonical = self.parse_qstr()?;
233                Ok(Expr::Number(NumberLiteral { domain, canonical }))
234            }
235            Some(b'S') => {
236                self.pos += 1;
237                Ok(Expr::Symbol(self.parse_symbol_payload()?))
238            }
239            Some(b'R') => {
240                self.pos += 1;
241                Ok(Expr::String(self.parse_qstr()?))
242            }
243            Some(b'B') => {
244                self.pos += 1;
245                let hex = self.parse_qstr()?;
246                Ok(Expr::Bytes(self.parse_hex(&hex)?))
247            }
248            Some(b'(') => Ok(Expr::List(self.parse_seq(b'(', b')')?)),
249            Some(b'[') => Ok(Expr::Vector(self.parse_seq(b'[', b']')?)),
250            Some(b'%') => {
251                self.pos += 1;
252                Ok(Expr::Set(self.parse_seq(b'(', b')')?))
253            }
254            Some(b'{') => self.parse_map(),
255            Some(other) => Err(self.error(format!("unexpected tag byte '{}'", other as char))),
256            None => Err(self.error("unexpected end of input")),
257        }
258    }
259
260    fn parse_seq(&mut self, open: u8, close: u8) -> Result<Vec<Expr>> {
261        self.expect(open)?;
262        let mut items = Vec::new();
263        loop {
264            self.skip_ws();
265            match self.peek() {
266                Some(byte) if byte == close => {
267                    self.pos += 1;
268                    return Ok(items);
269                }
270                None => return Err(self.error("unterminated sequence")),
271                _ => items.push(self.parse_value()?),
272            }
273        }
274    }
275
276    fn parse_map(&mut self) -> Result<Expr> {
277        self.expect(b'{')?;
278        let mut entries = Vec::new();
279        loop {
280            self.skip_ws();
281            match self.peek() {
282                Some(b'}') => {
283                    self.pos += 1;
284                    return Ok(Expr::Map(entries));
285                }
286                None => return Err(self.error("unterminated map")),
287                _ => {
288                    let key = self.parse_value()?;
289                    let value = self.parse_value()?;
290                    entries.push((key, value));
291                }
292            }
293        }
294    }
295
296    fn parse_symbol_payload(&mut self) -> Result<Symbol> {
297        match self.bump() {
298            Some(b'Q') => {
299                let namespace = self.parse_qstr()?;
300                let name = self.parse_qstr()?;
301                Ok(Symbol::qualified(namespace, name))
302            }
303            Some(b'U') => Ok(Symbol::new(self.parse_qstr()?)),
304            _ => Err(self.error("expected symbol payload tag 'Q' or 'U'")),
305        }
306    }
307
308    fn parse_qstr(&mut self) -> Result<String> {
309        self.expect(b'"')?;
310        let mut bytes = Vec::new();
311        loop {
312            match self.bump() {
313                Some(b'"') => {
314                    return String::from_utf8(bytes)
315                        .map_err(|err| self.error(format!("invalid utf-8 in string: {err}")));
316                }
317                Some(b'\\') => match self.bump() {
318                    Some(b'\\') => bytes.push(b'\\'),
319                    Some(b'"') => bytes.push(b'"'),
320                    Some(b'n') => bytes.push(b'\n'),
321                    Some(b'r') => bytes.push(b'\r'),
322                    Some(b't') => bytes.push(b'\t'),
323                    _ => return Err(self.error("invalid escape sequence")),
324                },
325                Some(byte) => bytes.push(byte),
326                None => return Err(self.error("unterminated string")),
327            }
328        }
329    }
330
331    fn parse_hex(&self, hex: &str) -> Result<Vec<u8>> {
332        if !hex.len().is_multiple_of(2) {
333            return Err(self.error("byte literal has odd hex length"));
334        }
335        let bytes = hex.as_bytes();
336        let mut out = Vec::with_capacity(hex.len() / 2);
337        let mut index = 0;
338        while index < bytes.len() {
339            let hi = hex_digit(bytes[index]).ok_or_else(|| self.error("invalid hex digit"))?;
340            let lo = hex_digit(bytes[index + 1]).ok_or_else(|| self.error("invalid hex digit"))?;
341            out.push((hi << 4) | lo);
342            index += 2;
343        }
344        Ok(out)
345    }
346}
347
/// Returns the value (0-15) of an ASCII hex digit, or `None` when `byte` is
/// not one. Both `a`-`f` and `A`-`F` are accepted.
fn hex_digit(byte: u8) -> Option<u8> {
    // `base` is the byte that maps to zero within the matched range.
    let base = match byte {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(byte - base)
}
355}