use crate::error::Error;
use crate::syntax::{Position, VarName};
/// A single lexical token: what was recognised together with where it starts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// The category of the token (keyword, punctuation, identifier, ...).
    kind: TokenKind,
    /// The position the lexer recorded for the token's first byte.
    at: Position,
}
impl Token {
#[must_use]
pub fn kind(&self) -> &TokenKind {
&self.kind
}
#[must_use]
pub fn at(&self) -> Position {
self.at
}
fn new(kind: TokenKind, at: Position) -> Self {
Self { kind, at }
}
}
/// The category of a [`Token`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier that is not one of the keywords below; carries its name.
    Ident(VarName),
    /// The keyword `let`.
    KwLet,
    /// The keyword `in`.
    KwIn,
    /// The keyword `fix`.
    KwFix,
    /// The keyword `ref`.
    KwRef,
    /// A backslash, `\`.
    Lambda,
    /// A full stop, `.`.
    Dot,
    /// A single equals sign, `=`.
    Equals,
    /// An opening parenthesis, `(`.
    LParen,
    /// A closing parenthesis, `)`.
    RParen,
    /// An exclamation mark, `!`.
    Bang,
    /// The two-character lexeme `:=`.
    Assign,
    /// A semicolon, `;`.
    Semicolon,
}
impl std::fmt::Display for TokenKind {
    /// Writes a human-readable description of the token kind.
    ///
    /// Identifiers are rendered as `identifier "<name>"` (the name is debug-quoted); every other
    /// kind has a fixed description that is written verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let fixed = match self {
            // The only variant with a payload; it needs real formatting, so finish here.
            Self::Ident(name) => return write!(f, "identifier {:?}", name.as_str()),
            Self::KwLet => "keyword `let`",
            Self::KwIn => "keyword `in`",
            Self::KwFix => "keyword `fix`",
            Self::KwRef => "keyword `ref`",
            Self::Lambda => "`\\`",
            Self::Dot => "`.`",
            Self::Equals => "`=`",
            Self::LParen => "`(`",
            Self::RParen => "`)`",
            Self::Bang => "`!`",
            Self::Assign => "`:=`",
            Self::Semicolon => "`;`",
        };
        f.write_str(fixed)
    }
}
/// What the lexer sees when it looks at one offset of the input.
enum Step {
    /// The offset is past the last byte of the input.
    End,
    /// The byte found at the offset.
    Byte(u8),
}
/// Looks at the byte at offset `pos` of `src` without consuming anything.
///
/// Returns [`Step::End`] when `pos` is out of bounds.
fn peek(src: &[u8], pos: usize) -> Step {
    match src.get(pos) {
        Some(&byte) => Step::Byte(byte),
        None => Step::End,
    }
}
/// Splits `src` into tokens.
///
/// Works on the UTF-8 bytes of `src`, starting at offset 0 with an empty token list, and
/// delegates the actual scanning to `step`.
///
/// # Errors
///
/// Returns whatever error `step` reports for the input.
pub fn lex(src: &str) -> Result<Vec<Token>, Error> {
    let bytes = src.as_bytes();
    step(bytes, 0, Vec::new())
}
fn step(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
match peek(src, pos) {
Step::End => Ok(acc),
Step::Byte(b) => take_token(src, pos, acc, b),
}
}
fn take_token(src: &[u8], pos: usize, acc: Vec<Token>, b: u8) -> Result<Vec<Token>, Error> {
match b {
b' ' | b'\t' | b'\n' | b'\r' => step(src, pos + 1, acc),
b'\\' => emit_single(src, pos, acc, TokenKind::Lambda),
b'.' => emit_single(src, pos, acc, TokenKind::Dot),
b'=' => emit_single(src, pos, acc, TokenKind::Equals),
b'(' => emit_single(src, pos, acc, TokenKind::LParen),
b')' => emit_single(src, pos, acc, TokenKind::RParen),
b'!' => emit_single(src, pos, acc, TokenKind::Bang),
b';' => emit_single(src, pos, acc, TokenKind::Semicolon),
b':' => take_colon(src, pos, acc),
other if is_ident_start(other) => read_ident(src, pos, acc),
other => Err(Error::UnexpectedChar {
at: pos.into(),
ch: char::from(other),
}),
}
}
fn emit_single(
src: &[u8],
pos: usize,
acc: Vec<Token>,
kind: TokenKind,
) -> Result<Vec<Token>, Error> {
step(src, pos + 1, push(acc, Token::new(kind, pos.into())))
}
fn take_colon(src: &[u8], pos: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
match peek(src, pos + 1) {
Step::End => Err(Error::UnexpectedEnd {
expected: "`=` after `:`",
}),
Step::Byte(b'=') => step(
src,
pos + 2,
push(acc, Token::new(TokenKind::Assign, pos.into())),
),
Step::Byte(other) => Err(Error::UnexpectedChar {
at: (pos + 1).into(),
ch: char::from(other),
}),
}
}
fn push(acc: Vec<Token>, token: Token) -> Vec<Token> {
acc.into_iter().chain(std::iter::once(token)).collect()
}
fn read_ident(src: &[u8], start: usize, acc: Vec<Token>) -> Result<Vec<Token>, Error> {
let end = scan_ident(src, start);
let slice = src.get(start..end).unwrap_or(&[]);
let token = classify_ident(slice, start);
step(src, end, push(acc, token))
}
/// Returns the offset just past the run of identifier bytes that begins at `pos`.
///
/// When `pos` is out of bounds, or the byte at `pos` is not an identifier byte, `pos` itself is
/// returned. The run is measured with an iterator rather than one recursive call per byte, so
/// a very long identifier cannot exhaust the stack.
fn scan_ident(src: &[u8], pos: usize) -> usize {
    src.get(pos..).map_or(pos, |rest| {
        let run = rest
            .iter()
            .copied()
            .take_while(|byte| is_ident_continue(*byte))
            .count();
        pos + run
    })
}
/// Turns the bytes of a scanned word into a keyword token or an identifier token at `start`.
///
/// `let`, `in`, `fix` and `ref` become their keyword kinds; anything else becomes
/// `TokenKind::Ident`. Bytes that are not valid UTF-8 produce an identifier with an empty name.
fn classify_ident(slice: &[u8], start: usize) -> Token {
    let kind = match slice {
        b"let" => TokenKind::KwLet,
        b"in" => TokenKind::KwIn,
        b"fix" => TokenKind::KwFix,
        b"ref" => TokenKind::KwRef,
        name => {
            let text = std::str::from_utf8(name).unwrap_or_default();
            TokenKind::Ident(VarName::from(text))
        }
    };
    Token::new(kind, Position::from(start))
}
/// Reports whether `b` may begin an identifier: an ASCII letter or an underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
/// Reports whether `b` may appear inside an identifier: an ASCII letter, an ASCII digit or an
/// underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and keeps only the token kinds, in order.
    fn kinds_of(src: &str) -> Result<Vec<TokenKind>, Error> {
        let tokens = lex(src)?;
        Ok(tokens.iter().map(|token| token.kind().clone()).collect())
    }

    /// Passes when `ok` holds; otherwise fails the test with an `Error::UnexpectedEnd`
    /// carrying `expected` as the description of what was being checked.
    fn require(ok: bool, expected: &'static str) -> Result<(), Error> {
        if ok {
            Ok(())
        } else {
            Err(Error::UnexpectedEnd { expected })
        }
    }

    /// `ref` is a keyword and `:=` is a single token.
    #[test]
    fn lex_ref_and_assign() -> Result<(), Error> {
        let kinds = kinds_of("ref x := y")?;
        let expected = [
            TokenKind::KwRef,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Assign,
            TokenKind::Ident(VarName::from("y")),
        ];
        require(kinds == expected, "ref/assign tokenization")
    }

    /// `!` and `;` are lexed as their own tokens, with or without surrounding spaces.
    #[test]
    fn lex_sequence_and_bang() -> Result<(), Error> {
        let kinds = kinds_of("!x ; y")?;
        let expected = [
            TokenKind::Bang,
            TokenKind::Ident(VarName::from("x")),
            TokenKind::Semicolon,
            TokenKind::Ident(VarName::from("y")),
        ];
        require(kinds == expected, "bang/semi tokenization")
    }

    /// A `:` that is not followed by `=` is rejected as an unexpected character.
    #[test]
    fn bare_colon_is_error() -> Result<(), Error> {
        match lex(":x") {
            Ok(_) => Err(Error::UnexpectedEnd {
                expected: "bare colon rejection",
            }),
            Err(Error::UnexpectedChar { .. }) => Ok(()),
            Err(other) => Err(other),
        }
    }
}