1use sim_codec::{DecodeBudget, Input, ReadCx};
2use sim_kernel::{
3 CodecId, Error, Expr, LocatedExprTree, NumberLiteral, Origin, Result, SourceId, Span, Symbol,
4};
5
6pub fn decode_scheme_tree(
11 cx: &mut ReadCx<'_>,
12 source_id: impl Into<String>,
13 input: Input,
14) -> Result<LocatedExprTree> {
15 let source = input_text(cx.codec, input)?;
16 let mut budget = DecodeBudget::new(cx.limits);
17 budget.check_input_bytes(cx.codec, source.len())?;
18 let source_id = SourceId(source_id.into());
19 cx.cx.sources_mut().intern_text(source_id.clone(), &source);
20 let tree = parse_scheme_source(cx.codec, source_id, &source, &mut budget)?;
21 budget.check_tokens(cx.codec, tree_size(&tree))?;
22 Ok(tree)
23}
24
25pub fn parse_scheme_source(
31 codec: CodecId,
32 source_id: SourceId,
33 source: &str,
34 budget: &mut DecodeBudget,
35) -> Result<LocatedExprTree> {
36 let mut parser = Parser {
37 codec,
38 source_id,
39 source,
40 bytes: source.as_bytes(),
41 index: 0,
42 budget,
43 };
44 let tree = parser.read_expr(0)?;
45 parser.skip_ws_and_comments();
46 if !parser.is_eof() {
47 return parser.err("expected exactly one top-level expression");
48 }
49 Ok(tree)
50}
51
52struct Parser<'a, 'b> {
53 codec: CodecId,
54 source_id: SourceId,
55 source: &'a str,
56 bytes: &'a [u8],
57 index: usize,
58 budget: &'b mut DecodeBudget,
59}
60
61impl Parser<'_, '_> {
62 fn read_expr(&mut self, depth: usize) -> Result<LocatedExprTree> {
63 self.skip_ws_and_comments();
64 self.budget.enter_node(self.codec, depth)?;
65 let start = self.index;
66 let Some(byte) = self.peek() else {
67 return self.err("expected expression");
68 };
69 match byte {
70 b'(' => self.read_list(depth, start),
71 b')' => self.err("unexpected close parenthesis"),
72 b'\'' => self.read_quote(depth, start),
73 b'"' => self.read_string(start),
74 b'#' => self.read_hash_atom(start),
75 _ => self.read_atom(start),
76 }
77 }
78
79 fn read_list(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
80 self.index += 1;
81 let mut children = Vec::new();
82 loop {
83 self.skip_ws_and_comments();
84 match self.peek() {
85 Some(b')') => {
86 self.index += 1;
87 break;
88 }
89 Some(_) => children.push(self.read_expr(depth + 1)?),
90 None => return self.err("unterminated list"),
91 }
92 }
93 self.budget
94 .check_collection_len(self.codec, children.len())?;
95 let expr = Expr::List(children.iter().map(|child| child.expr.clone()).collect());
96 Ok(self.tree(expr, start, self.index, children))
97 }
98
99 fn read_quote(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
100 self.index += 1;
101 let quoted = self.read_expr(depth + 1)?;
102 let quote = self.tree(
103 Expr::Symbol(Symbol::new("quote")),
104 start,
105 start + 1,
106 Vec::new(),
107 );
108 let end = quoted
109 .origin
110 .as_ref()
111 .map(|origin| origin.span.end)
112 .unwrap_or(self.index);
113 Ok(self.tree(
114 Expr::List(vec![quote.expr.clone(), quoted.expr.clone()]),
115 start,
116 end,
117 vec![quote, quoted],
118 ))
119 }
120
121 fn read_string(&mut self, start: usize) -> Result<LocatedExprTree> {
122 self.index += 1;
123 let mut out = String::new();
124 while let Some(byte) = self.peek() {
125 self.index += 1;
126 match byte {
127 b'"' => {
128 self.budget.check_string_bytes(self.codec, out.len())?;
129 return Ok(self.tree(Expr::String(out), start, self.index, Vec::new()));
130 }
131 b'\\' => {
132 let Some(escaped) = self.peek() else {
133 return self.err("unterminated string escape");
134 };
135 self.index += 1;
136 out.push(match escaped {
137 b'n' => '\n',
138 b'r' => '\r',
139 b't' => '\t',
140 b'"' => '"',
141 b'\\' => '\\',
142 other => other as char,
143 });
144 }
145 other => out.push(other as char),
146 }
147 }
148 self.err("unterminated string")
149 }
150
151 fn read_hash_atom(&mut self, start: usize) -> Result<LocatedExprTree> {
152 let atom = self.take_atom();
153 let expr = match atom.as_str() {
154 "#t" | "#true" => Expr::Bool(true),
155 "#f" | "#false" => Expr::Bool(false),
156 _ => {
157 return Err(Error::CodecError {
158 codec: self.codec,
159 message: format!("unsupported Scheme hash token {atom}"),
160 });
161 }
162 };
163 Ok(self.tree(expr, start, self.index, Vec::new()))
164 }
165
166 fn read_atom(&mut self, start: usize) -> Result<LocatedExprTree> {
167 let atom = self.take_atom();
168 if atom.is_empty() {
169 return self.err("expected atom");
170 }
171 let expr = if let Some(number) = number_literal(&atom) {
172 Expr::Number(number)
173 } else {
174 Expr::Symbol(Symbol::new(atom))
175 };
176 Ok(self.tree(expr, start, self.index, Vec::new()))
177 }
178
179 fn take_atom(&mut self) -> String {
180 let start = self.index;
181 while let Some(byte) = self.peek() {
182 if byte.is_ascii_whitespace() || matches!(byte, b'(' | b')' | b'"' | b';') {
183 break;
184 }
185 self.index += 1;
186 }
187 self.source[start..self.index].to_owned()
188 }
189
190 fn skip_ws_and_comments(&mut self) {
191 loop {
192 while self.peek().is_some_and(|byte| byte.is_ascii_whitespace()) {
193 self.index += 1;
194 }
195 if self.peek() != Some(b';') {
196 return;
197 }
198 while let Some(byte) = self.peek() {
199 self.index += 1;
200 if byte == b'\n' {
201 break;
202 }
203 }
204 }
205 }
206
207 fn tree(
208 &self,
209 expr: Expr,
210 start: usize,
211 end: usize,
212 children: Vec<LocatedExprTree>,
213 ) -> LocatedExprTree {
214 LocatedExprTree {
215 expr,
216 origin: Some(Origin {
217 codec: self.codec,
218 source: self.source_id.clone(),
219 span: Span { start, end },
220 trivia: Vec::new(),
221 }),
222 children,
223 }
224 }
225
226 fn peek(&self) -> Option<u8> {
227 self.bytes.get(self.index).copied()
228 }
229
230 fn is_eof(&self) -> bool {
231 self.index >= self.bytes.len()
232 }
233
234 fn err<T>(&self, message: impl Into<String>) -> Result<T> {
235 Err(Error::CodecError {
236 codec: self.codec,
237 message: message.into(),
238 })
239 }
240}
241
242fn number_literal(raw: &str) -> Option<NumberLiteral> {
243 let is_integer = raw
244 .strip_prefix(['+', '-'])
245 .unwrap_or(raw)
246 .chars()
247 .all(|ch| ch.is_ascii_digit());
248 if !is_integer || raw == "+" || raw == "-" {
249 return None;
250 }
251 Some(NumberLiteral {
252 domain: Symbol::qualified("numbers", "i64"),
253 canonical: raw.to_owned(),
254 })
255}
256
257fn tree_size(tree: &LocatedExprTree) -> usize {
258 1 + tree.children.iter().map(tree_size).sum::<usize>()
259}
260
261fn input_text(codec: CodecId, input: Input) -> Result<String> {
262 match input {
263 Input::Text(text) => Ok(text),
264 Input::Bytes(bytes) => String::from_utf8(bytes).map_err(|err| Error::CodecError {
265 codec,
266 message: format!("Scheme input is not valid UTF-8: {err}"),
267 }),
268 }
269}