lambda-throw-cat 0.1.0

Lambda calculus with records, prototype chains, ref cells, GC, and non-local control flow via throw/try/catch. Outcome::Normal/Thrown is threaded purely-functionally through every reduction. Spike 4 of a web-engine reformulation targeting Tauri.
Documentation
//! Tokenizer for the object-extended lambda calculus surface syntax.
//!
//! Spike 3 adds four tokens beyond spike 2: `LBrace` (`{`), `RBrace` (`}`),
//! `Comma` (`,`), and the `KwExtend` keyword.

use crate::error::Error;
use crate::syntax::{Position, VarName};

/// A token paired with its source position.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    kind: TokenKind,
    at: Position,
}

impl Token {
    /// The token's syntactic kind.
    #[must_use]
    pub fn kind(&self) -> &TokenKind {
        &self.kind
    }

    /// Byte offset where the token begins in the source.
    #[must_use]
    pub fn at(&self) -> Position {
        self.at
    }

    fn new(kind: TokenKind, at: Position) -> Self {
        Self { kind, at }
    }
}

/// The syntactic kind of a token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier that is not a reserved keyword.
    Ident(VarName),
    /// The `let` keyword.
    KwLet,
    /// The `in` keyword.
    KwIn,
    /// The `fix` keyword.
    KwFix,
    /// The `ref` keyword (allocation prefix).
    KwRef,
    /// The `extend` keyword (prototype-bearing object literal).
    KwExtend,
    /// The `throw` keyword.
    KwThrow,
    /// The `try` keyword.
    KwTry,
    /// The `catch` keyword.
    KwCatch,
    /// A lambda head, written `\`.
    Lambda,
    /// A dot `.`.  Used both as the lambda head separator and as the field
    /// access infix; the parser distinguishes by syntactic position.
    Dot,
    /// An equals sign `=` (let-binding or property assignment in a literal).
    Equals,
    /// An opening parenthesis `(`.
    LParen,
    /// A closing parenthesis `)`.
    RParen,
    /// An opening brace `{`.
    LBrace,
    /// A closing brace `}`.
    RBrace,
    /// A comma `,` (separator inside object literals).
    Comma,
    /// A bang `!` (dereference prefix).
    Bang,
    /// An assignment `:=`.
    Assign,
    /// A semicolon `;` (sequencing).
    Semicolon,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Ident(name) => write!(f, "identifier {:?}", name.as_str()),
            Self::KwLet => f.write_str("keyword `let`"),
            Self::KwIn => f.write_str("keyword `in`"),
            Self::KwFix => f.write_str("keyword `fix`"),
            Self::KwRef => f.write_str("keyword `ref`"),
            Self::KwExtend => f.write_str("keyword `extend`"),
            Self::KwThrow => f.write_str("keyword `throw`"),
            Self::KwTry => f.write_str("keyword `try`"),
            Self::KwCatch => f.write_str("keyword `catch`"),
            Self::Lambda => f.write_str("`\\`"),
            Self::Dot => f.write_str("`.`"),
            Self::Equals => f.write_str("`=`"),
            Self::LParen => f.write_str("`(`"),
            Self::RParen => f.write_str("`)`"),
            Self::LBrace => f.write_str("`{`"),
            Self::RBrace => f.write_str("`}`"),
            Self::Comma => f.write_str("`,`"),
            Self::Bang => f.write_str("`!`"),
            Self::Assign => f.write_str("`:=`"),
            Self::Semicolon => f.write_str("`;`"),
        }
    }
}

enum Step {
    End,
    Byte(u8),
}

fn peek(src: &[u8], pos: usize) -> Step {
    src.get(pos).copied().map_or(Step::End, Step::Byte)
}

/// Lex the entire source string into a vector of tokens.
///
/// # Errors
///
/// Returns [`Error::UnexpectedChar`] on any non-ASCII byte or any character
/// outside the grammar, or [`Error::UnexpectedEnd`] if a multi-byte token
/// (such as `:=`) is truncated.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), lambda_throw_cat::error::Error> {
/// use lambda_throw_cat::lexer::lex;
///
/// let tokens = lex("{ foo = bar }")?;
/// assert_eq!(tokens.len(), 5);
/// # Ok(())
/// # }
/// ```
///
/// [`Error::UnexpectedChar`]: crate::error::Error::UnexpectedChar
/// [`Error::UnexpectedEnd`]: crate::error::Error::UnexpectedEnd
pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
    step(src.as_bytes(), 0, Vec::new())
}

fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    match peek(src, pos) {
        Step::End => Ok(acc),
        Step::Byte(b) => take_token(src, pos, acc, b),
    }
}

fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
    match b {
        b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
        b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
        b'.' => emit_single(src, pos, acc, TokenKind::Dot),
        b'=' => emit_single(src, pos, acc, TokenKind::Equals),
        b'(' => emit_single(src, pos, acc, TokenKind::LParen),
        b')' => emit_single(src, pos, acc, TokenKind::RParen),
        b'{' => emit_single(src, pos, acc, TokenKind::LBrace),
        b'}' => emit_single(src, pos, acc, TokenKind::RBrace),
        b',' => emit_single(src, pos, acc, TokenKind::Comma),
        b'!' => emit_single(src, pos, acc, TokenKind::Bang),
        b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
        b':' => take_colon(src, pos, acc),
        other if is_ident_start(other) => read_ident(src, pos, acc),
        other => Err(Error::UnexpectedChar {
            at: pos.into(),
            ch: char::from(other),
        }),
    }
}

fn emit_single(
    src: &[u8],
    pos: usize,
    acc: Vec<Token>,
    kind: TokenKind,
) -> Result<Vec<Token>, Error> {
    step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
}

fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    match peek(src, pos + 1) {
        Step::End => Err(Error::UnexpectedEnd {
            expected: "`=` after `:`",
        }),
        Step::Byte(b'=') => step(
            src,
            pos + 2,
            push(acc, Token::new(TokenKind::Assign, pos.into())),
        ),
        Step::Byte(other) => Err(Error::UnexpectedChar {
            at: (pos + 1).into(),
            ch: char::from(other),
        }),
    }
}

fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
    acc.into_iter().chain(std::iter::once(token)).collect()
}

fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
    let end = scan_ident(src, start);
    let slice = src.get(start..end).unwrap_or(&[]);
    let token = classify_ident(slice, start);
    step(src, end, push(acc, token))
}

fn scan_ident(src: &[u8], pos: usize) -> usize {
    src.get(pos)
        .copied()
        .filter(|b| is_ident_continue(*b))
        .map_or(pos, |_| scan_ident(src, pos + 1))
}

fn classify_ident(slice: &[u8], start: usize) -> Token {
    let at = Position::from(start);
    match slice {
        b"let" => Token::new(TokenKind::KwLet, at),
        b"in" => Token::new(TokenKind::KwIn, at),
        b"fix" => Token::new(TokenKind::KwFix, at),
        b"ref" => Token::new(TokenKind::KwRef, at),
        b"extend" => Token::new(TokenKind::KwExtend, at),
        b"throw" => Token::new(TokenKind::KwThrow, at),
        b"try" => Token::new(TokenKind::KwTry, at),
        b"catch" => Token::new(TokenKind::KwCatch, at),
        bytes => Token::new(
            TokenKind::Ident(VarName::from(
                std::str::from_utf8(bytes).unwrap_or_default(),
            )),
            at,
        ),
    }
}

fn is_ident_start(b: u8) -> bool {
    b.is_ascii_alphabetic() || b == b'_'
}

fn is_ident_continue(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lex_object_literal() -> Result<(), Error> {
        let tokens = lex("{ foo = bar, baz = qux }")?;
        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
        let expected = vec![
            TokenKind::LBrace,
            TokenKind::Ident(VarName::from("foo")),
            TokenKind::Equals,
            TokenKind::Ident(VarName::from("bar")),
            TokenKind::Comma,
            TokenKind::Ident(VarName::from("baz")),
            TokenKind::Equals,
            TokenKind::Ident(VarName::from("qux")),
            TokenKind::RBrace,
        ];
        (kinds == expected)
            .then_some(())
            .ok_or(Error::UnexpectedEnd {
                expected: "object literal tokenization",
            })
    }

    #[test]
    fn lex_extend_keyword() -> Result<(), Error> {
        let tokens = lex("extend p {}")?;
        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
        let expected = vec![
            TokenKind::KwExtend,
            TokenKind::Ident(VarName::from("p")),
            TokenKind::LBrace,
            TokenKind::RBrace,
        ];
        (kinds == expected)
            .then_some(())
            .ok_or(Error::UnexpectedEnd {
                expected: "extend tokenization",
            })
    }

    #[test]
    fn lex_field_access_dot() -> Result<(), Error> {
        let tokens = lex("obj.field")?;
        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind().clone()).collect();
        let expected = vec![
            TokenKind::Ident(VarName::from("obj")),
            TokenKind::Dot,
            TokenKind::Ident(VarName::from("field")),
        ];
        (kinds == expected)
            .then_some(())
            .ok_or(Error::UnexpectedEnd {
                expected: "field access tokenization",
            })
    }
}