lambda-ref-cat 0.1.0

Lambda calculus with mutable reference cells and a pure-functional mark-sweep garbage collector, built on comp-cat-rs. Spike 2 of a web-engine reformulation targeting Tauri.
Documentation
//! Tokenizer for the extended lambda calculus surface syntax.
//!
//! Spike 2 adds four tokens beyond spike 1: `Bang` (`!`), `Assign` (`:=`),
//! `Semicolon` (`;`), and the `KwRef` keyword.  The lexer is otherwise the
//! same recursive functional pipeline.
//!
//! Source must be ASCII; any non-ASCII byte is reported as
//! [`Error::UnexpectedChar`].
//!
//! [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar

use crate::error::Error;
use crate::syntax::{Position, VarName};

/// A token paired with its source position.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    kind: TokenKind,
    at: Position,
}

impl Token {
    /// The token's syntactic kind.
    #[must_use]
    pub fn kind(&self) -> &TokenKind {
        &self.kind
    }

    /// Byte offset where the token begins in the source.
    #[must_use]
    pub fn at(&self) -> Position {
        self.at
    }

    fn new(kind: TokenKind, at: Position) -> Self {
        Self { kind, at }
    }
}

/// The syntactic kind of a token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier that is not a reserved keyword.
    Ident(VarName),
    /// The `let` keyword.
    KwLet,
    /// The `in` keyword.
    KwIn,
    /// The `fix` keyword.
    KwFix,
    /// The `ref` keyword (allocation prefix).
    KwRef,
    /// A lambda head, written `\`.
    Lambda,
    /// A dot `.` separating a lambda head from its body.
    Dot,
    /// An equals sign `=` (let-binding).
    Equals,
    /// An opening parenthesis `(`.
    LParen,
    /// A closing parenthesis `)`.
    RParen,
    /// A bang `!` (dereference prefix).
    Bang,
    /// An assignment `:=`.
    Assign,
    /// A semicolon `;` (sequencing).
    Semicolon,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
            Self::KwLet => f.write_str("keyword `let`"),
            Self::KwIn => f.write_str("keyword `in`"),
            Self::KwFix => f.write_str("keyword `fix`"),
            Self::KwRef => f.write_str("keyword `ref`"),
            Self::Lambda => f.write_str("`\\`"),
            Self::Dot => f.write_str("`.`"),
            Self::Equals => f.write_str("`=`"),
            Self::LParen => f.write_str("`(`"),
            Self::RParen => f.write_str("`)`"),
            Self::Bang => f.write_str("`!`"),
            Self::Assign => f.write_str("`:=`"),
            Self::Semicolon => f.write_str("`;`"),
        }
    }
}

/// One step of the lexer: either a byte to dispatch on or end-of-input.
enum Step {
    End,
    Byte(u8),
}

fn peek(src: &[u8], pos: usize) -> Step {
    src.get(pos).copied().map_or(Step::End, Step::Byte)
}

/// Lex the entire source string into a vector of tokens.
///
/// # Errors
///
/// Returns [`Error::UnexpectedChar`] on any non-ASCII byte or any character
/// outside the grammar, or [`Error::UnexpectedEnd`] if a multi-byte token
/// (such as `:=`) is truncated.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), lambda_ref_cat::error::Error> {
/// use lambda_ref_cat::lexer::lex;
///
/// let tokens = lex("ref (\\x. x)")?;
/// assert_eq!(tokens.len(), 7);
/// # Ok(())
/// # }
/// ```
///
/// [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar
/// [`Error::UnexpectedEnd`]: crate::error::Error::UnexpectedEnd
pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
    step(src.as_bytes(), 0, Vec::new())
}

fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    match peek(src, pos) {
        Step::End => Ok(acc),
        Step::Byte(b) => take_token(src, pos, acc, b),
    }
}

fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
    match b {
        b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
        b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
        b'.' => emit_single(src, pos, acc, TokenKind::Dot),
        b'=' => emit_single(src, pos, acc, TokenKind::Equals),
        b'(' => emit_single(src, pos, acc, TokenKind::LParen),
        b')' => emit_single(src, pos, acc, TokenKind::RParen),
        b'!' => emit_single(src, pos, acc, TokenKind::Bang),
        b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
        b':' => take_colon(src, pos, acc),
        other if is_ident_start(other) => read_ident(src, pos, acc),
        other => Err(Error::UnexpectedChar {
            at: pos.into(),
            ch: char::from(other),
        }),
    }
}

fn emit_single(
    src: &[u8],
    pos: usize,
    acc: Vec<Token>,
    kind: TokenKind,
) -> Result<Vec<Token>, Error> {
    step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
}

fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    match peek(src, pos + 1) {
        Step::End => Err(Error::UnexpectedEnd {
            expected: "`=` after `:`",
        }),
        Step::Byte(b'=') => step(
            src,
            pos + 2,
            push(acc, Token::new(TokenKind::Assign, pos.into())),
        ),
        Step::Byte(other) => Err(Error::UnexpectedChar {
            at: (pos + 1).into(),
            ch: char::from(other),
        }),
    }
}

fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
    acc.into_iter().chain(std::iter::once(token)).collect()
}

fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    let end = scan_ident(src, start);
    let slice = src.get(start..end).unwrap_or(&[]);
    let token = classify_ident(slice, start);
    step(src, end, push(acc, token))
}

fn scan_ident(src: &[u8], pos: usize) -> usize {
    src.get(pos)
        .copied()
        .filter(|b| is_ident_continue(*b))
        .map_or(pos, |_| scan_ident(src, pos + 1))
}

fn classify_ident(slice: &[u8], start: usize) -> Token {
    let at = Position::from(start);
    match slice {
        b"let" => Token::new(TokenKind::KwLet, at),
        b"in" => Token::new(TokenKind::KwIn, at),
        b"fix" => Token::new(TokenKind::KwFix, at),
        b"ref" => Token::new(TokenKind::KwRef, at),
        bytes => Token::new(
            TokenKind::Ident(VarName::from(
                std::str::from_utf8(bytes).unwrap_or_default(),
            )),
            at,
        ),
    }
}

fn is_ident_start(b: u8) -> bool {
    b.is_ascii_alphabetic() || b == b'_'
}

fn is_ident_continue(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lex_ref_and_assign() -> Result<(), Error> {
        let tokens = lex("ref x := y")?;
        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
        let expected = vec![
            TokenKind::KwRef,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Assign,
            TokenKind::Ident(VarName::from("y")),
        ];
        (kinds == expected)
            .then_some(())
            .ok_or(Error::UnexpectedEnd {
                expected: "ref/assign tokenization",
            })
    }

    #[test]
    fn lex_sequence_and_bang() -> Result<(), Error> {
        let tokens = lex("!x ; y")?;
        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
        let expected = vec![
            TokenKind::Bang,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Semicolon,
            TokenKind::Ident(VarName::from("y")),
        ];
        (kinds == expected)
            .then_some(())
            .ok_or(Error::UnexpectedEnd {
                expected: "bang/semi tokenization",
            })
    }

    #[test]
    fn bare_colon_is_error() -> Result<(), Error> {
        let result = lex(":x");
        match result {
            Err(Error::UnexpectedChar { .. }) => Ok(()),
            Err(other) => Err(other),
            Ok(_) => Err(Error::UnexpectedEnd {
                expected: "bare colon rejection",
            }),
        }
    }
}