Skip to main content

sim_lib_lang_scheme/
reader.rs

1use sim_codec::{DecodeBudget, Input, ReadCx};
2use sim_kernel::{
3    CodecId, Error, Expr, LocatedExprTree, NumberLiteral, Origin, Result, SourceId, Span, Symbol,
4};
5
6/// Decodes Scheme surface text into a located `Expr` tree under codec budgets.
7///
8/// Interns the source, enforces the decode limits, and returns the single
9/// top-level form as a [`LocatedExprTree`].
10pub fn decode_scheme_tree(
11    cx: &mut ReadCx<'_>,
12    source_id: impl Into<String>,
13    input: Input,
14) -> Result<LocatedExprTree> {
15    let source = input_text(cx.codec, input)?;
16    let mut budget = DecodeBudget::new(cx.limits);
17    budget.check_input_bytes(cx.codec, source.len())?;
18    let source_id = SourceId(source_id.into());
19    cx.cx.sources_mut().intern_text(source_id.clone(), &source);
20    let tree = parse_scheme_source(cx.codec, source_id, &source, &mut budget)?;
21    budget.check_tokens(cx.codec, tree_size(&tree))?;
22    Ok(tree)
23}
24
25/// Parses one top-level Scheme form from source text into a located `Expr` tree.
26///
27/// Lower-level entry point behind [`decode_scheme_tree`]; the caller supplies
28/// the codec id, interned source id, and a [`DecodeBudget`]. Errors if the input
29/// holds more than one top-level expression.
30pub fn parse_scheme_source(
31    codec: CodecId,
32    source_id: SourceId,
33    source: &str,
34    budget: &mut DecodeBudget,
35) -> Result<LocatedExprTree> {
36    let mut parser = Parser {
37        codec,
38        source_id,
39        source,
40        bytes: source.as_bytes(),
41        index: 0,
42        budget,
43    };
44    let tree = parser.read_expr(0)?;
45    parser.skip_ws_and_comments();
46    if !parser.is_eof() {
47        return parser.err("expected exactly one top-level expression");
48    }
49    Ok(tree)
50}
51
52struct Parser<'a, 'b> {
53    codec: CodecId,
54    source_id: SourceId,
55    source: &'a str,
56    bytes: &'a [u8],
57    index: usize,
58    budget: &'b mut DecodeBudget,
59}
60
61impl Parser<'_, '_> {
62    fn read_expr(&mut self, depth: usize) -> Result<LocatedExprTree> {
63        self.skip_ws_and_comments();
64        self.budget.enter_node(self.codec, depth)?;
65        let start = self.index;
66        let Some(byte) = self.peek() else {
67            return self.err("expected expression");
68        };
69        match byte {
70            b'(' => self.read_list(depth, start),
71            b')' => self.err("unexpected close parenthesis"),
72            b'\'' => self.read_quote(depth, start),
73            b'"' => self.read_string(start),
74            b'#' => self.read_hash_atom(start),
75            _ => self.read_atom(start),
76        }
77    }
78
79    fn read_list(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
80        self.index += 1;
81        let mut children = Vec::new();
82        loop {
83            self.skip_ws_and_comments();
84            match self.peek() {
85                Some(b')') => {
86                    self.index += 1;
87                    break;
88                }
89                Some(_) => children.push(self.read_expr(depth + 1)?),
90                None => return self.err("unterminated list"),
91            }
92        }
93        self.budget
94            .check_collection_len(self.codec, children.len())?;
95        let expr = Expr::List(children.iter().map(|child| child.expr.clone()).collect());
96        Ok(self.tree(expr, start, self.index, children))
97    }
98
99    fn read_quote(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
100        self.index += 1;
101        let quoted = self.read_expr(depth + 1)?;
102        let quote = self.tree(
103            Expr::Symbol(Symbol::new("quote")),
104            start,
105            start + 1,
106            Vec::new(),
107        );
108        let end = quoted
109            .origin
110            .as_ref()
111            .map(|origin| origin.span.end)
112            .unwrap_or(self.index);
113        Ok(self.tree(
114            Expr::List(vec![quote.expr.clone(), quoted.expr.clone()]),
115            start,
116            end,
117            vec![quote, quoted],
118        ))
119    }
120
121    fn read_string(&mut self, start: usize) -> Result<LocatedExprTree> {
122        self.index += 1;
123        let mut out = String::new();
124        while let Some(byte) = self.peek() {
125            self.index += 1;
126            match byte {
127                b'"' => {
128                    self.budget.check_string_bytes(self.codec, out.len())?;
129                    return Ok(self.tree(Expr::String(out), start, self.index, Vec::new()));
130                }
131                b'\\' => {
132                    let Some(escaped) = self.peek() else {
133                        return self.err("unterminated string escape");
134                    };
135                    self.index += 1;
136                    out.push(match escaped {
137                        b'n' => '\n',
138                        b'r' => '\r',
139                        b't' => '\t',
140                        b'"' => '"',
141                        b'\\' => '\\',
142                        other => other as char,
143                    });
144                }
145                other => out.push(other as char),
146            }
147        }
148        self.err("unterminated string")
149    }
150
151    fn read_hash_atom(&mut self, start: usize) -> Result<LocatedExprTree> {
152        let atom = self.take_atom();
153        let expr = match atom.as_str() {
154            "#t" | "#true" => Expr::Bool(true),
155            "#f" | "#false" => Expr::Bool(false),
156            _ => {
157                return Err(Error::CodecError {
158                    codec: self.codec,
159                    message: format!("unsupported Scheme hash token {atom}"),
160                });
161            }
162        };
163        Ok(self.tree(expr, start, self.index, Vec::new()))
164    }
165
166    fn read_atom(&mut self, start: usize) -> Result<LocatedExprTree> {
167        let atom = self.take_atom();
168        if atom.is_empty() {
169            return self.err("expected atom");
170        }
171        let expr = if let Some(number) = number_literal(&atom) {
172            Expr::Number(number)
173        } else {
174            Expr::Symbol(Symbol::new(atom))
175        };
176        Ok(self.tree(expr, start, self.index, Vec::new()))
177    }
178
179    fn take_atom(&mut self) -> String {
180        let start = self.index;
181        while let Some(byte) = self.peek() {
182            if byte.is_ascii_whitespace() || matches!(byte, b'(' | b')' | b'"' | b';') {
183                break;
184            }
185            self.index += 1;
186        }
187        self.source[start..self.index].to_owned()
188    }
189
190    fn skip_ws_and_comments(&mut self) {
191        loop {
192            while self.peek().is_some_and(|byte| byte.is_ascii_whitespace()) {
193                self.index += 1;
194            }
195            if self.peek() != Some(b';') {
196                return;
197            }
198            while let Some(byte) = self.peek() {
199                self.index += 1;
200                if byte == b'\n' {
201                    break;
202                }
203            }
204        }
205    }
206
207    fn tree(
208        &self,
209        expr: Expr,
210        start: usize,
211        end: usize,
212        children: Vec<LocatedExprTree>,
213    ) -> LocatedExprTree {
214        LocatedExprTree {
215            expr,
216            origin: Some(Origin {
217                codec: self.codec,
218                source: self.source_id.clone(),
219                span: Span { start, end },
220                trivia: Vec::new(),
221            }),
222            children,
223        }
224    }
225
226    fn peek(&self) -> Option<u8> {
227        self.bytes.get(self.index).copied()
228    }
229
230    fn is_eof(&self) -> bool {
231        self.index >= self.bytes.len()
232    }
233
234    fn err<T>(&self, message: impl Into<String>) -> Result<T> {
235        Err(Error::CodecError {
236            codec: self.codec,
237            message: message.into(),
238        })
239    }
240}
241
242fn number_literal(raw: &str) -> Option<NumberLiteral> {
243    let is_integer = raw
244        .strip_prefix(['+', '-'])
245        .unwrap_or(raw)
246        .chars()
247        .all(|ch| ch.is_ascii_digit());
248    if !is_integer || raw == "+" || raw == "-" {
249        return None;
250    }
251    Some(NumberLiteral {
252        domain: Symbol::qualified("numbers", "i64"),
253        canonical: raw.to_owned(),
254    })
255}
256
257fn tree_size(tree: &LocatedExprTree) -> usize {
258    1 + tree.children.iter().map(tree_size).sum::<usize>()
259}
260
261fn input_text(codec: CodecId, input: Input) -> Result<String> {
262    match input {
263        Input::Text(text) => Ok(text),
264        Input::Bytes(bytes) => String::from_utf8(bytes).map_err(|err| Error::CodecError {
265            codec,
266            message: format!("Scheme input is not valid UTF-8: {err}"),
267        }),
268    }
269}