Skip to main content

engawa_lisp/
parse.rs

//! Minimal s-expression tokenizer + parser.
//!
//! Supports:
//! * `()` lists, arbitrary nesting
//! * symbols (`[a-zA-Z_-][a-zA-Z0-9_-]*`)
//! * `"double-quoted strings"` with `\"`, `\\`, `\n`, `\t`, `\r` escapes
//! * numbers (`[+-]?[0-9]+(\.[0-9]+)?`)
//! * `; line comment` to end of line
//! * Whitespace separation
//!
//! Does NOT support: quasiquote, backtick, splice, character
//! literals, multi-line strings. Add as the lisp surface grows.
13
14use thiserror::Error;
15
16use crate::sexpr::{Sexpr, SexprKind};
17
/// Location of a parsed node inside the original source string.
///
/// `start..end` is a half-open byte range: `start` is the offset of the
/// node's first byte and `end` is the offset one past its last byte, so
/// `&source[start..end]` is exactly the node's text. `line` and `column`
/// are the 1-based position of `start`; the column is counted in bytes
/// since the last newline. Intended for operator-facing error messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the first byte of the node.
    pub start: usize,
    /// Byte offset one past the last byte of the node.
    pub end: usize,
    /// 1-based line on which the node starts.
    pub line: usize,
    /// 1-based byte column at which the node starts.
    pub column: usize,
}
28
29#[derive(Debug, Error, Clone, PartialEq)]
30pub enum ParseError {
31    #[error("unexpected character {ch:?} at line {line}, column {column}")]
32    UnexpectedChar {
33        ch: char,
34        line: usize,
35        column: usize,
36    },
37    #[error("unterminated string starting at line {line}, column {column}")]
38    UnterminatedString { line: usize, column: usize },
39    #[error("unterminated list (missing ')') opened at line {line}, column {column}")]
40    UnterminatedList { line: usize, column: usize },
41    #[error("unexpected ')' at line {line}, column {column}")]
42    UnexpectedRParen { line: usize, column: usize },
43    #[error("invalid escape sequence \\{ch} at line {line}, column {column}")]
44    InvalidEscape {
45        ch: char,
46        line: usize,
47        column: usize,
48    },
49}
50
51/// Parse a source string into a vector of top-level sexprs.
52pub fn parse(source: &str) -> Result<Vec<Sexpr>, ParseError> {
53    let mut p = Parser::new(source);
54    let mut out = Vec::new();
55    loop {
56        p.skip_whitespace_and_comments();
57        if p.eof() {
58            return Ok(out);
59        }
60        out.push(p.parse_one()?);
61    }
62}
63
/// Cursor over the raw bytes of the source text, with line/column
/// bookkeeping for diagnostics.
struct Parser<'src> {
    /// The input being parsed, viewed as bytes.
    source: &'src [u8],
    /// Byte offset of the next unread byte.
    pos: usize,
    /// 1-based line number of the next unread byte.
    line: usize,
    /// 1-based column (counted in bytes) of the next unread byte.
    col: usize,
}
70
71impl<'a> Parser<'a> {
72    fn new(source: &'a str) -> Self {
73        Self {
74            source: source.as_bytes(),
75            pos: 0,
76            line: 1,
77            col: 1,
78        }
79    }
80
81    fn eof(&self) -> bool {
82        self.pos >= self.source.len()
83    }
84
85    fn peek(&self) -> Option<u8> {
86        self.source.get(self.pos).copied()
87    }
88
89    fn advance(&mut self) -> Option<u8> {
90        let b = self.peek()?;
91        self.pos += 1;
92        if b == b'\n' {
93            self.line += 1;
94            self.col = 1;
95        } else {
96            self.col += 1;
97        }
98        Some(b)
99    }
100
101    fn skip_whitespace_and_comments(&mut self) {
102        while let Some(b) = self.peek() {
103            match b {
104                b' ' | b'\t' | b'\r' | b'\n' => {
105                    self.advance();
106                }
107                b';' => {
108                    while let Some(b) = self.peek() {
109                        if b == b'\n' {
110                            break;
111                        }
112                        self.advance();
113                    }
114                }
115                _ => break,
116            }
117        }
118    }
119
120    fn parse_one(&mut self) -> Result<Sexpr, ParseError> {
121        self.skip_whitespace_and_comments();
122        let start = self.pos;
123        let line = self.line;
124        let col = self.col;
125        let Some(b) = self.peek() else {
126            return Err(ParseError::UnexpectedChar {
127                ch: '\0',
128                line,
129                column: col,
130            });
131        };
132        match b {
133            b'(' => self.parse_list(),
134            b')' => Err(ParseError::UnexpectedRParen { line, column: col }),
135            b'"' => self.parse_string(),
136            b if b.is_ascii_digit() || b == b'-' || b == b'+' => {
137                // Negative numbers OR symbols starting with - .
138                // Disambiguate by peeking the second byte.
139                if (b == b'-' || b == b'+')
140                    && self
141                        .source
142                        .get(self.pos + 1)
143                        .is_none_or(|c| !c.is_ascii_digit())
144                {
145                    self.parse_symbol(start, line, col)
146                } else {
147                    self.parse_number(start, line, col)
148                }
149            }
150            b if b.is_ascii_alphabetic() || b == b'_' => {
151                self.parse_symbol(start, line, col)
152            }
153            _ => Err(ParseError::UnexpectedChar {
154                ch: b as char,
155                line,
156                column: col,
157            }),
158        }
159    }
160
161    fn parse_list(&mut self) -> Result<Sexpr, ParseError> {
162        let start = self.pos;
163        let line = self.line;
164        let col = self.col;
165        self.advance(); // consume '('
166        let mut items = Vec::new();
167        loop {
168            self.skip_whitespace_and_comments();
169            match self.peek() {
170                None => {
171                    return Err(ParseError::UnterminatedList { line, column: col });
172                }
173                Some(b')') => {
174                    self.advance();
175                    let end = self.pos;
176                    return Ok(Sexpr {
177                        kind: SexprKind::List(items),
178                        span: Span {
179                            start,
180                            end,
181                            line,
182                            column: col,
183                        },
184                    });
185                }
186                _ => {
187                    items.push(self.parse_one()?);
188                }
189            }
190        }
191    }
192
193    fn parse_string(&mut self) -> Result<Sexpr, ParseError> {
194        let start = self.pos;
195        let line = self.line;
196        let col = self.col;
197        self.advance(); // consume '"'
198        let mut s = String::new();
199        loop {
200            let Some(b) = self.peek() else {
201                return Err(ParseError::UnterminatedString { line, column: col });
202            };
203            match b {
204                b'"' => {
205                    self.advance();
206                    let end = self.pos;
207                    return Ok(Sexpr {
208                        kind: SexprKind::String(s),
209                        span: Span {
210                            start,
211                            end,
212                            line,
213                            column: col,
214                        },
215                    });
216                }
217                b'\\' => {
218                    self.advance();
219                    let Some(esc) = self.peek() else {
220                        return Err(ParseError::UnterminatedString { line, column: col });
221                    };
222                    let ch = match esc {
223                        b'"' => '"',
224                        b'\\' => '\\',
225                        b'n' => '\n',
226                        b't' => '\t',
227                        b'r' => '\r',
228                        other => {
229                            return Err(ParseError::InvalidEscape {
230                                ch: other as char,
231                                line: self.line,
232                                column: self.col,
233                            });
234                        }
235                    };
236                    s.push(ch);
237                    self.advance();
238                }
239                _ => {
240                    s.push(b as char);
241                    self.advance();
242                }
243            }
244        }
245    }
246
247    fn parse_symbol(
248        &mut self,
249        start: usize,
250        line: usize,
251        col: usize,
252    ) -> Result<Sexpr, ParseError> {
253        while let Some(b) = self.peek() {
254            if b.is_ascii_alphanumeric() || b == b'_' || b == b'-' {
255                self.advance();
256            } else {
257                break;
258            }
259        }
260        let end = self.pos;
261        let s = std::str::from_utf8(&self.source[start..end])
262            .expect("source is utf8")
263            .to_string();
264        Ok(Sexpr {
265            kind: SexprKind::Symbol(s),
266            span: Span {
267                start,
268                end,
269                line,
270                column: col,
271            },
272        })
273    }
274
275    fn parse_number(
276        &mut self,
277        start: usize,
278        line: usize,
279        col: usize,
280    ) -> Result<Sexpr, ParseError> {
281        // Optional sign already at start; consume it + digits +
282        // optional .digits.
283        self.advance();
284        while let Some(b) = self.peek() {
285            if b.is_ascii_digit() || b == b'.' {
286                self.advance();
287            } else {
288                break;
289            }
290        }
291        let end = self.pos;
292        let s = std::str::from_utf8(&self.source[start..end])
293            .expect("source is utf8")
294            .to_string();
295        Ok(Sexpr {
296            kind: SexprKind::Number(s),
297            span: Span {
298                start,
299                end,
300                line,
301                column: col,
302            },
303        })
304    }
305}