Skip to main content

lambda_ref_cat/
lexer.rs

1//! Tokenizer for the extended lambda calculus surface syntax.
2//!
3//! Spike 2 adds four tokens beyond spike 1: `Bang` (`!`), `Assign` (`:=`),
4//! `Semicolon` (`;`), and the `KwRef` keyword.  The lexer is otherwise the
5//! same recursive functional pipeline.
6//!
7//! Source must be ASCII; any non-ASCII byte is reported as
8//! [`Error::UnexpectedChar`].
9//!
10//! [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar
11
12use crate::error::Error;
13use crate::syntax::{Position, VarName};
14
15/// A token paired with its source position.
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct Token {
18    kind: TokenKind,
19    at: Position,
20}
21
22impl Token {
23    /// The token's syntactic kind.
24    #[must_use]
25    pub fn kind(&self) -> &TokenKind {
26        &self.kind
27    }
28
29    /// Byte offset where the token begins in the source.
30    #[must_use]
31    pub fn at(&self) -> Position {
32        self.at
33    }
34
35    fn new(kind: TokenKind, at: Position) -> Self {
36        Self { kind, at }
37    }
38}
39
40/// The syntactic kind of a token.
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub enum TokenKind {
43    /// An identifier that is not a reserved keyword.
44    Ident(VarName),
45    /// The `let` keyword.
46    KwLet,
47    /// The `in` keyword.
48    KwIn,
49    /// The `fix` keyword.
50    KwFix,
51    /// The `ref` keyword (allocation prefix).
52    KwRef,
53    /// A lambda head, written `\`.
54    Lambda,
55    /// A dot `.` separating a lambda head from its body.
56    Dot,
57    /// An equals sign `=` (let-binding).
58    Equals,
59    /// An opening parenthesis `(`.
60    LParen,
61    /// A closing parenthesis `)`.
62    RParen,
63    /// A bang `!` (dereference prefix).
64    Bang,
65    /// An assignment `:=`.
66    Assign,
67    /// A semicolon `;` (sequencing).
68    Semicolon,
69}
70
71impl std::fmt::Display for TokenKind {
72    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73        match self {
74            Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
75            Self::KwLet => f.write_str("keyword `let`"),
76            Self::KwIn => f.write_str("keyword `in`"),
77            Self::KwFix => f.write_str("keyword `fix`"),
78            Self::KwRef => f.write_str("keyword `ref`"),
79            Self::Lambda => f.write_str("`\\`"),
80            Self::Dot => f.write_str("`.`"),
81            Self::Equals => f.write_str("`=`"),
82            Self::LParen => f.write_str("`(`"),
83            Self::RParen => f.write_str("`)`"),
84            Self::Bang => f.write_str("`!`"),
85            Self::Assign => f.write_str("`:=`"),
86            Self::Semicolon => f.write_str("`;`"),
87        }
88    }
89}
90
91/// One step of the lexer: either a byte to dispatch on or end-of-input.
92enum Step {
93    End,
94    Byte(u8),
95}
96
97fn peek(src: &[u8], pos: usize) -> Step {
98    src.get(pos).copied().map_or(Step::End, Step::Byte)
99}
100
101/// Lex the entire source string into a vector of tokens.
102///
103/// # Errors
104///
105/// Returns [`Error::UnexpectedChar`] on any non-ASCII byte or any character
106/// outside the grammar, or [`Error::UnexpectedEnd`] if a multi-byte token
107/// (such as `:=`) is truncated.
108///
109/// # Examples
110///
111/// ```
112/// # fn main() -> Result<(), lambda_ref_cat::error::Error> {
113/// use lambda_ref_cat::lexer::lex;
114///
115/// let tokens = lex("ref (\\x. x)")?;
116/// assert_eq!(tokens.len(), 7);
117/// # Ok(())
118/// # }
119/// ```
120///
121/// [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar
122/// [`Error::UnexpectedEnd`]: crate::error::Error::UnexpectedEnd
123pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
124    step(src.as_bytes(), 0, Vec::new())
125}
126
127fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
128    match peek(src, pos) {
129        Step::End => Ok(acc),
130        Step::Byte(b) => take_token(src, pos, acc, b),
131    }
132}
133
134fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
135    match b {
136        b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
137        b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
138        b'.' => emit_single(src, pos, acc, TokenKind::Dot),
139        b'=' => emit_single(src, pos, acc, TokenKind::Equals),
140        b'(' => emit_single(src, pos, acc, TokenKind::LParen),
141        b')' => emit_single(src, pos, acc, TokenKind::RParen),
142        b'!' => emit_single(src, pos, acc, TokenKind::Bang),
143        b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
144        b':' => take_colon(src, pos, acc),
145        other if is_ident_start(other) => read_ident(src, pos, acc),
146        other => Err(Error::UnexpectedChar {
147            at: pos.into(),
148            ch: char::from(other),
149        }),
150    }
151}
152
153fn emit_single(
154    src: &[u8],
155    pos: usize,
156    acc: Vec<Token>,
157    kind: TokenKind,
158) -> Result<Vec<Token>, Error> {
159    step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
160}
161
162fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
163    match peek(src, pos + 1) {
164        Step::End => Err(Error::UnexpectedEnd {
165            expected: "`=` after `:`",
166        }),
167        Step::Byte(b'=') => step(
168            src,
169            pos + 2,
170            push(acc, Token::new(TokenKind::Assign, pos.into())),
171        ),
172        Step::Byte(other) => Err(Error::UnexpectedChar {
173            at: (pos + 1).into(),
174            ch: char::from(other),
175        }),
176    }
177}
178
179fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
180    acc.into_iter().chain(std::iter::once(token)).collect()
181}
182
183fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
184    let end = scan_ident(src, start);
185    let slice = src.get(start..end).unwrap_or(&[]);
186    let token = classify_ident(slice, start);
187    step(src, end, push(acc, token))
188}
189
190fn scan_ident(src: &[u8], pos: usize) -> usize {
191    src.get(pos)
192        .copied()
193        .filter(|b| is_ident_continue(*b))
194        .map_or(pos, |_| scan_ident(src, pos + 1))
195}
196
197fn classify_ident(slice: &[u8], start: usize) -> Token {
198    let at = Position::from(start);
199    match slice {
200        b"let" => Token::new(TokenKind::KwLet, at),
201        b"in" => Token::new(TokenKind::KwIn, at),
202        b"fix" => Token::new(TokenKind::KwFix, at),
203        b"ref" => Token::new(TokenKind::KwRef, at),
204        bytes => Token::new(
205            TokenKind::Ident(VarName::from(
206                std::str::from_utf8(bytes).unwrap_or_default(),
207            )),
208            at,
209        ),
210    }
211}
212
213fn is_ident_start(b: u8) -> bool {
214    b.is_ascii_alphabetic() || b == b'_'
215}
216
217fn is_ident_continue(b: u8) -> bool {
218    b.is_ascii_alphanumeric() || b == b'_'
219}
220
221#[cfg(test)]
222mod tests {
223    use super::*;
224
225    #[test]
226    fn lex_ref_and_assign() -> Result<(), Error> {
227        let tokens = lex("ref x := y")?;
228        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
229        let expected = vec![
230            TokenKind::KwRef,
231            TokenKind::Ident(VarName::from("x")),
232            TokenKind::Assign,
233            TokenKind::Ident(VarName::from("y")),
234        ];
235        (kinds == expected)
236            .then_some(())
237            .ok_or(Error::UnexpectedEnd {
238                expected: "ref/assign tokenization",
239            })
240    }
241
242    #[test]
243    fn lex_sequence_and_bang() -> Result<(), Error> {
244        let tokens = lex("!x ; y")?;
245        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
246        let expected = vec![
247            TokenKind::Bang,
248            TokenKind::Ident(VarName::from("x")),
249            TokenKind::Semicolon,
250            TokenKind::Ident(VarName::from("y")),
251        ];
252        (kinds == expected)
253            .then_some(())
254            .ok_or(Error::UnexpectedEnd {
255                expected: "bang/semi tokenization",
256            })
257    }
258
259    #[test]
260    fn bare_colon_is_error() -> Result<(), Error> {
261        let result = lex(":x");
262        match result {
263            Err(Error::UnexpectedChar { .. }) => Ok(()),
264            Err(other) => Err(other),
265            Ok(_) => Err(Error::UnexpectedEnd {
266                expected: "bare colon rejection",
267            }),
268        }
269    }
270}