Skip to main content

sim_lib_lang_clojure/
reader.rs

1use sim_codec::{DecodeBudget, Input, ReadCx};
2use sim_kernel::{
3    CodecId, Error, Expr, LocatedExprTree, NumberLiteral, Origin, Result, SourceId, Span, Symbol,
4};
5
6/// Decodes EDN source into a [`LocatedExprTree`], interning source text and enforcing decode budgets.
7///
8/// Entry point used by [`ClojureEdnCodec`](crate::ClojureEdnCodec) to map surface
9/// syntax onto the located [`Expr`] tree.
10pub fn decode_clojure_edn_tree(
11    cx: &mut ReadCx<'_>,
12    source_id: impl Into<String>,
13    input: Input,
14) -> Result<LocatedExprTree> {
15    let source = input_text(cx.codec, input)?;
16    let mut budget = DecodeBudget::new(cx.limits);
17    budget.check_input_bytes(cx.codec, source.len())?;
18    let source_id = SourceId(source_id.into());
19    cx.cx.sources_mut().intern_text(source_id.clone(), &source);
20    let tree = parse_clojure_edn_source(cx.codec, source_id, &source, &mut budget)?;
21    budget.check_tokens(cx.codec, tree_size(&tree))?;
22    Ok(tree)
23}
24
25/// Parses a single top-level EDN value from source text into a [`LocatedExprTree`].
26///
27/// Fails closed if the input holds more than one top-level form. Spans are
28/// resolved against the given [`SourceId`] and counted against the decode budget.
29pub fn parse_clojure_edn_source(
30    codec: CodecId,
31    source_id: SourceId,
32    source: &str,
33    budget: &mut DecodeBudget,
34) -> Result<LocatedExprTree> {
35    let mut parser = Parser {
36        codec,
37        source_id,
38        source,
39        bytes: source.as_bytes(),
40        index: 0,
41        budget,
42    };
43    let tree = parser.read_expr(0)?;
44    parser.skip_ws_commas_and_comments();
45    if !parser.is_eof() {
46        return parser.err("expected exactly one top-level EDN value");
47    }
48    Ok(tree)
49}
50
51struct Parser<'a, 'b> {
52    codec: CodecId,
53    source_id: SourceId,
54    source: &'a str,
55    bytes: &'a [u8],
56    index: usize,
57    budget: &'b mut DecodeBudget,
58}
59
60impl Parser<'_, '_> {
61    fn read_expr(&mut self, depth: usize) -> Result<LocatedExprTree> {
62        self.skip_ws_commas_and_comments();
63        self.budget.enter_node(self.codec, depth)?;
64        let start = self.index;
65        let Some(byte) = self.peek() else {
66            return self.err("expected EDN value");
67        };
68        match byte {
69            b'(' => self.read_sequence(depth, start, b')', Expr::List),
70            b'[' => self.read_sequence(depth, start, b']', Expr::Vector),
71            b'{' => self.read_map(depth, start),
72            b'"' => self.read_string(start),
73            b'#' => self.read_dispatch(depth, start),
74            b')' | b']' | b'}' => self.err("unexpected EDN delimiter"),
75            _ => self.read_atom(start),
76        }
77    }
78
79    fn read_sequence(
80        &mut self,
81        depth: usize,
82        start: usize,
83        close: u8,
84        make_expr: fn(Vec<Expr>) -> Expr,
85    ) -> Result<LocatedExprTree> {
86        self.index += 1;
87        let mut children = Vec::new();
88        loop {
89            self.skip_ws_commas_and_comments();
90            match self.peek() {
91                Some(byte) if byte == close => {
92                    self.index += 1;
93                    break;
94                }
95                Some(_) => children.push(self.read_expr(depth + 1)?),
96                None => return self.err("unterminated EDN sequence"),
97            }
98        }
99        self.budget
100            .check_collection_len(self.codec, children.len())?;
101        let expr = make_expr(children.iter().map(|child| child.expr.clone()).collect());
102        Ok(self.tree(expr, start, self.index, children))
103    }
104
105    fn read_map(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
106        self.index += 1;
107        let mut children = Vec::new();
108        let mut entries = Vec::new();
109        loop {
110            self.skip_ws_commas_and_comments();
111            match self.peek() {
112                Some(b'}') => {
113                    self.index += 1;
114                    break;
115                }
116                Some(_) => {
117                    let key = self.read_expr(depth + 1)?;
118                    self.skip_ws_commas_and_comments();
119                    if self.peek() == Some(b'}') || self.is_eof() {
120                        return self.err("EDN map expects an even number of forms");
121                    }
122                    let value = self.read_expr(depth + 1)?;
123                    entries.push((key.expr.clone(), value.expr.clone()));
124                    children.push(key);
125                    children.push(value);
126                }
127                None => return self.err("unterminated EDN map"),
128            }
129        }
130        self.budget
131            .check_collection_len(self.codec, entries.len())?;
132        Ok(self.tree(Expr::Map(entries), start, self.index, children))
133    }
134
135    fn read_dispatch(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
136        self.index += 1;
137        match self.peek() {
138            Some(b'{') => self.read_set(depth, start),
139            _ => self.err("unsupported EDN dispatch token"),
140        }
141    }
142
143    fn read_set(&mut self, depth: usize, start: usize) -> Result<LocatedExprTree> {
144        self.index += 1;
145        let mut children = Vec::new();
146        loop {
147            self.skip_ws_commas_and_comments();
148            match self.peek() {
149                Some(b'}') => {
150                    self.index += 1;
151                    break;
152                }
153                Some(_) => children.push(self.read_expr(depth + 1)?),
154                None => return self.err("unterminated EDN set"),
155            }
156        }
157        self.budget
158            .check_collection_len(self.codec, children.len())?;
159        let expr = Expr::Set(children.iter().map(|child| child.expr.clone()).collect());
160        Ok(self.tree(expr, start, self.index, children))
161    }
162
163    fn read_string(&mut self, start: usize) -> Result<LocatedExprTree> {
164        self.index += 1;
165        let mut out = String::new();
166        while let Some(byte) = self.peek() {
167            self.index += 1;
168            match byte {
169                b'"' => {
170                    self.budget.check_string_bytes(self.codec, out.len())?;
171                    return Ok(self.tree(Expr::String(out), start, self.index, Vec::new()));
172                }
173                b'\\' => out.push(self.read_escape()?),
174                other => out.push(other as char),
175            }
176        }
177        self.err("unterminated EDN string")
178    }
179
180    fn read_escape(&mut self) -> Result<char> {
181        let Some(escaped) = self.peek() else {
182            return self.err("unterminated EDN string escape");
183        };
184        self.index += 1;
185        Ok(match escaped {
186            b'n' => '\n',
187            b'r' => '\r',
188            b't' => '\t',
189            b'"' => '"',
190            b'\\' => '\\',
191            other => other as char,
192        })
193    }
194
195    fn read_atom(&mut self, start: usize) -> Result<LocatedExprTree> {
196        let atom = self.take_atom();
197        if atom.is_empty() {
198            return self.err("expected EDN atom");
199        }
200        let expr = match atom.as_str() {
201            "nil" => Expr::Nil,
202            "true" => Expr::Bool(true),
203            "false" => Expr::Bool(false),
204            _ => number_literal(&atom)
205                .map(Expr::Number)
206                .unwrap_or_else(|| Expr::Symbol(symbol_atom(&atom))),
207        };
208        Ok(self.tree(expr, start, self.index, Vec::new()))
209    }
210
211    fn take_atom(&mut self) -> String {
212        let start = self.index;
213        while let Some(byte) = self.peek() {
214            if byte.is_ascii_whitespace()
215                || matches!(
216                    byte,
217                    b',' | b'(' | b')' | b'[' | b']' | b'{' | b'}' | b'"' | b';'
218                )
219            {
220                break;
221            }
222            self.index += 1;
223        }
224        self.source[start..self.index].to_owned()
225    }
226
227    fn skip_ws_commas_and_comments(&mut self) {
228        loop {
229            while self
230                .peek()
231                .is_some_and(|byte| byte.is_ascii_whitespace() || byte == b',')
232            {
233                self.index += 1;
234            }
235            if self.peek() != Some(b';') {
236                return;
237            }
238            while let Some(byte) = self.peek() {
239                self.index += 1;
240                if byte == b'\n' {
241                    break;
242                }
243            }
244        }
245    }
246
247    fn tree(
248        &self,
249        expr: Expr,
250        start: usize,
251        end: usize,
252        children: Vec<LocatedExprTree>,
253    ) -> LocatedExprTree {
254        LocatedExprTree {
255            expr,
256            origin: Some(Origin {
257                codec: self.codec,
258                source: self.source_id.clone(),
259                span: Span { start, end },
260                trivia: Vec::new(),
261            }),
262            children,
263        }
264    }
265
266    fn peek(&self) -> Option<u8> {
267        self.bytes.get(self.index).copied()
268    }
269
270    fn is_eof(&self) -> bool {
271        self.index >= self.bytes.len()
272    }
273
274    fn err<T>(&self, message: impl Into<String>) -> Result<T> {
275        Err(Error::CodecError {
276            codec: self.codec,
277            message: message.into(),
278        })
279    }
280}
281
282fn symbol_atom(atom: &str) -> Symbol {
283    if let Some(keyword) = atom.strip_prefix(':') {
284        return Symbol::qualified("keyword", keyword.to_owned());
285    }
286    if let Some((namespace, name)) = atom.split_once('/') {
287        return Symbol::qualified(namespace.to_owned(), name.to_owned());
288    }
289    Symbol::new(atom.to_owned())
290}
291
292fn number_literal(raw: &str) -> Option<NumberLiteral> {
293    let is_integer = raw
294        .strip_prefix(['+', '-'])
295        .unwrap_or(raw)
296        .chars()
297        .all(|ch| ch.is_ascii_digit());
298    if !is_integer || raw == "+" || raw == "-" {
299        return None;
300    }
301    Some(NumberLiteral {
302        domain: Symbol::qualified("numbers", "i64"),
303        canonical: raw.to_owned(),
304    })
305}
306
307fn tree_size(tree: &LocatedExprTree) -> usize {
308    1 + tree.children.iter().map(tree_size).sum::<usize>()
309}
310
311fn input_text(codec: CodecId, input: Input) -> Result<String> {
312    match input {
313        Input::Text(text) => Ok(text),
314        Input::Bytes(bytes) => String::from_utf8(bytes).map_err(|err| Error::CodecError {
315            codec,
316            message: format!("EDN input is not valid UTF-8: {err}"),
317        }),
318    }
319}