Skip to main content

lambda_throw_cat/
lexer.rs

1//! Tokenizer for the object-extended lambda calculus surface syntax.
2//!
3//! Spike 3 adds four tokens beyond spike 2: `LBrace` (`{`), `RBrace` (`}`),
4//! `Comma` (`,`), and the `KwExtend` keyword.
5
6use crate::error::Error;
7use crate::syntax::{Position, VarName};
8
9/// A token paired with its source position.
10#[derive(Debug, Clone, PartialEq, Eq)]
11pub struct Token {
12    kind: TokenKind,
13    at: Position,
14}
15
16impl Token {
17    /// The token's syntactic kind.
18    #[must_use]
19    pub fn kind(&self) -> &TokenKind {
20        &self.kind
21    }
22
23    /// Byte offset where the token begins in the source.
24    #[must_use]
25    pub fn at(&self) -> Position {
26        self.at
27    }
28
29    fn new(kind: TokenKind, at: Position) -> Self {
30        Self { kind, at }
31    }
32}
33
34/// The syntactic kind of a token.
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub enum TokenKind {
37    /// An identifier that is not a reserved keyword.
38    Ident(VarName),
39    /// The `let` keyword.
40    KwLet,
41    /// The `in` keyword.
42    KwIn,
43    /// The `fix` keyword.
44    KwFix,
45    /// The `ref` keyword (allocation prefix).
46    KwRef,
47    /// The `extend` keyword (prototype-bearing object literal).
48    KwExtend,
49    /// The `throw` keyword.
50    KwThrow,
51    /// The `try` keyword.
52    KwTry,
53    /// The `catch` keyword.
54    KwCatch,
55    /// A lambda head, written `\`.
56    Lambda,
57    /// A dot `.`.  Used both as the lambda head separator and as the field
58    /// access infix; the parser distinguishes by syntactic position.
59    Dot,
60    /// An equals sign `=` (let-binding or property assignment in a literal).
61    Equals,
62    /// An opening parenthesis `(`.
63    LParen,
64    /// A closing parenthesis `)`.
65    RParen,
66    /// An opening brace `{`.
67    LBrace,
68    /// A closing brace `}`.
69    RBrace,
70    /// A comma `,` (separator inside object literals).
71    Comma,
72    /// A bang `!` (dereference prefix).
73    Bang,
74    /// An assignment `:=`.
75    Assign,
76    /// A semicolon `;` (sequencing).
77    Semicolon,
78}
79
80impl std::fmt::Display for TokenKind {
81    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82        match self {
83            Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
84            Self::KwLet => f.write_str("keyword `let`"),
85            Self::KwIn => f.write_str("keyword `in`"),
86            Self::KwFix => f.write_str("keyword `fix`"),
87            Self::KwRef => f.write_str("keyword `ref`"),
88            Self::KwExtend => f.write_str("keyword `extend`"),
89            Self::KwThrow => f.write_str("keyword `throw`"),
90            Self::KwTry => f.write_str("keyword `try`"),
91            Self::KwCatch => f.write_str("keyword `catch`"),
92            Self::Lambda => f.write_str("`\\`"),
93            Self::Dot => f.write_str("`.`"),
94            Self::Equals => f.write_str("`=`"),
95            Self::LParen => f.write_str("`(`"),
96            Self::RParen => f.write_str("`)`"),
97            Self::LBrace => f.write_str("`{`"),
98            Self::RBrace => f.write_str("`}`"),
99            Self::Comma => f.write_str("`,`"),
100            Self::Bang => f.write_str("`!`"),
101            Self::Assign => f.write_str("`:=`"),
102            Self::Semicolon => f.write_str("`;`"),
103        }
104    }
105}
106
/// Outcome of looking at one position of the input.
enum Step {
    /// The position is at or beyond the end of the input.
    End,
    /// The byte found at the position.
    Byte(u8),
}
111
112fn peek(src: &[u8], pos: usize) -> Step {
113    src.get(pos).copied().map_or(Step::End, Step::Byte)
114}
115
116/// Lex the entire source string into a vector of tokens.
117///
118/// # Errors
119///
120/// Returns [`Error::UnexpectedChar`] on any non-ASCII byte or any character
121/// outside the grammar, or [`Error::UnexpectedEnd`] if a multi-byte token
122/// (such as `:=`) is truncated.
123///
124/// # Examples
125///
126/// ```
127/// # fn main() -> Result<(), lambda_throw_cat::error::Error> {
128/// use lambda_throw_cat::lexer::lex;
129///
130/// let tokens = lex("{ foo = bar }")?;
131/// assert_eq!(tokens.len(), 5);
132/// # Ok(())
133/// # }
134/// ```
135///
136/// [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar
137/// [`Error::UnexpectedEnd`]: crate::error::Error::UnexpectedEnd
138pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
139    step(src.as_bytes(), 0, Vec::new())
140}
141
142fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
143    match peek(src, pos) {
144        Step::End => Ok(acc),
145        Step::Byte(b) => take_token(src, pos, acc, b),
146    }
147}
148
149fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
150    match b {
151        b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
152        b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
153        b'.' => emit_single(src, pos, acc, TokenKind::Dot),
154        b'=' => emit_single(src, pos, acc, TokenKind::Equals),
155        b'(' => emit_single(src, pos, acc, TokenKind::LParen),
156        b')' => emit_single(src, pos, acc, TokenKind::RParen),
157        b'{' => emit_single(src, pos, acc, TokenKind::LBrace),
158        b'}' => emit_single(src, pos, acc, TokenKind::RBrace),
159        b',' => emit_single(src, pos, acc, TokenKind::Comma),
160        b'!' => emit_single(src, pos, acc, TokenKind::Bang),
161        b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
162        b':' => take_colon(src, pos, acc),
163        other if is_ident_start(other) => read_ident(src, pos, acc),
164        other => Err(Error::UnexpectedChar {
165            at: pos.into(),
166            ch: char::from(other),
167        }),
168    }
169}
170
171fn emit_single(
172    src: &[u8],
173    pos: usize,
174    acc: Vec<Token>,
175    kind: TokenKind,
176) -> Result<Vec<Token>, Error> {
177    step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
178}
179
180fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
181    match peek(src, pos + 1) {
182        Step::End => Err(Error::UnexpectedEnd {
183            expected: "`=` after `:`",
184        }),
185        Step::Byte(b'=') => step(
186            src,
187            pos + 2,
188            push(acc, Token::new(TokenKind::Assign, pos.into())),
189        ),
190        Step::Byte(other) => Err(Error::UnexpectedChar {
191            at: (pos + 1).into(),
192            ch: char::from(other),
193        }),
194    }
195}
196
197fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
198    acc.into_iter().chain(std::iter::once(token)).collect()
199}
200
201fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
202    let end = scan_ident(src, start);
203    let slice = src.get(start..end).unwrap_or(&[]);
204    let token = classify_ident(slice, start);
205    step(src, end, push(acc, token))
206}
207
208fn scan_ident(src: &[u8], pos: usize) -> usize {
209    src.get(pos)
210        .copied()
211        .filter(|b| is_ident_continue(*b))
212        .map_or(pos, |_| scan_ident(src, pos + 1))
213}
214
215fn classify_ident(slice: &[u8], start: usize) -> Token {
216    let at = Position::from(start);
217    match slice {
218        b"let" => Token::new(TokenKind::KwLet, at),
219        b"in" => Token::new(TokenKind::KwIn, at),
220        b"fix" => Token::new(TokenKind::KwFix, at),
221        b"ref" => Token::new(TokenKind::KwRef, at),
222        b"extend" => Token::new(TokenKind::KwExtend, at),
223        b"throw" => Token::new(TokenKind::KwThrow, at),
224        b"try" => Token::new(TokenKind::KwTry, at),
225        b"catch" => Token::new(TokenKind::KwCatch, at),
226        bytes => Token::new(
227            TokenKind::Ident(VarName::from(
228                std::str::from_utf8(bytes).unwrap_or_default(),
229            )),
230            at,
231        ),
232    }
233}
234
/// Reports whether `b` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
238
/// Reports whether `b` may appear after the first byte of an identifier: an
/// ASCII letter, an ASCII digit, or `_`.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
242
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a boolean check into the `Result` these tests return; a failed
    /// check is reported as an `Error::UnexpectedEnd` carrying `label`.
    fn check(passed: bool, label: &'static str) -> Result<(), Error> {
        passed
            .then_some(())
            .ok_or(Error::UnexpectedEnd { expected: label })
    }

    /// Lexes `src` and checks that the token kinds equal `expected`, in order.
    /// A lexing error is propagated unchanged.
    fn expect_kinds(
        src: &str,
        expected: &[TokenKind],
        label: &'static str,
    ) -> Result<(), Error> {
        let tokens = lex(src)?;
        check(tokens.iter().map(Token::kind).eq(expected.iter()), label)
    }

    #[test]
    fn lex_object_literal() -> Result<(), Error> {
        expect_kinds(
            "{ foo = bar, baz = qux }",
            &[
                TokenKind::LBrace,
                TokenKind::Ident(VarName::from("foo")),
                TokenKind::Equals,
                TokenKind::Ident(VarName::from("bar")),
                TokenKind::Comma,
                TokenKind::Ident(VarName::from("baz")),
                TokenKind::Equals,
                TokenKind::Ident(VarName::from("qux")),
                TokenKind::RBrace,
            ],
            "object literal tokenization",
        )
    }

    #[test]
    fn lex_extend_keyword() -> Result<(), Error> {
        expect_kinds(
            "extend p {}",
            &[
                TokenKind::KwExtend,
                TokenKind::Ident(VarName::from("p")),
                TokenKind::LBrace,
                TokenKind::RBrace,
            ],
            "extend tokenization",
        )
    }

    #[test]
    fn lex_field_access_dot() -> Result<(), Error> {
        expect_kinds(
            "obj.field",
            &[
                TokenKind::Ident(VarName::from("obj")),
                TokenKind::Dot,
                TokenKind::Ident(VarName::from("field")),
            ],
            "field access tokenization",
        )
    }

    #[test]
    fn lex_assignment_operator() -> Result<(), Error> {
        expect_kinds(
            "r := !r",
            &[
                TokenKind::Ident(VarName::from("r")),
                TokenKind::Assign,
                TokenKind::Bang,
                TokenKind::Ident(VarName::from("r")),
            ],
            "assignment tokenization",
        )
    }

    #[test]
    fn lex_keywords_require_exact_match() -> Result<(), Error> {
        // `letter` merely starts with a keyword and must stay an identifier.
        expect_kinds(
            "let letter in fix ref throw try catch",
            &[
                TokenKind::KwLet,
                TokenKind::Ident(VarName::from("letter")),
                TokenKind::KwIn,
                TokenKind::KwFix,
                TokenKind::KwRef,
                TokenKind::KwThrow,
                TokenKind::KwTry,
                TokenKind::KwCatch,
            ],
            "keyword tokenization",
        )
    }

    #[test]
    fn lex_records_token_positions() -> Result<(), Error> {
        let tokens = lex("ab := c")?;
        let expected = [0_usize, 3, 6];
        check(
            tokens
                .iter()
                .map(Token::at)
                .eq(expected.iter().copied().map(Position::from)),
            "token start positions",
        )
    }

    #[test]
    fn lex_rejects_truncated_assign() -> Result<(), Error> {
        check(
            matches!(lex("x :"), Err(Error::UnexpectedEnd { .. })),
            "rejection of a trailing `:`",
        )
    }

    #[test]
    fn lex_rejects_character_outside_grammar() -> Result<(), Error> {
        check(
            matches!(lex("a # b"), Err(Error::UnexpectedChar { ch: '#', .. })),
            "rejection of `#`",
        )
    }
}