#![allow(clippy::upper_case_acronyms)]
use claw_common::Source;
use logos::Logos;
use miette::{Diagnostic, SourceSpan};
use thiserror::Error;
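/// A lexed token together with the span of source text it came from.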
#[derive(Debug, PartialEq, Clone)]
pub struct TokenData {
    pub token: Token,
    pub span: SourceSpan,
}
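/// Error produced when the input contains a span that matches no token
/// rule. Carries the full source and the offending span so that miette can
/// render a labeled diagnostic.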
#[derive(Error, Debug, Diagnostic)]
#[error("Unable to tokenize input")]
#[diagnostic()]
pub struct LexerError {
    #[source_code]
    src: Source,
    #[label("Here")]
    span: SourceSpan,
}
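/// Tokenizes `contents`, producing a `TokenData` for each token or a
/// `LexerError` for the first span that matches no token rule.
///
/// A minimal usage sketch, assuming the `make_source` helper from
/// `claw_common` that the tests below use:
///
/// ```ignore
/// let contents = "let answer = 42;";
/// let src = claw_common::make_source("example", contents);
/// let tokens = tokenize(src, contents).expect("input should tokenize");
/// assert_eq!(tokens[0].token, Token::Let);
/// ```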
pub fn tokenize(src: Source, contents: &str) -> Result<Vec<TokenData>, LexerError> {
    let lexer = Token::lexer(contents);
    lexer
        .spanned()
        .map(|(token, span)| match token {
            Ok(token) => Ok(TokenData {
                token,
                span: SourceSpan::from(span),
            }),
            Err(_error) => Err(LexerError {
                src: src.clone(),
                span: span.into(),
            }),
        })
        .collect()
}
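/// The tokens of the Claw language, produced by [`tokenize`].
///
/// Whitespace and `//` line comments are skipped. An identifier is one or
/// more `-`-separated words, optionally prefixed with `%`, where each word
/// is either all-lowercase or all-uppercase alphanumerics starting with a
/// letter.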
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(error = ())]
#[logos(skip r"[ \t\r\n\f]+")]
#[logos(skip r"//[^\n]*")]
#[logos(subpattern word = r"[a-z][a-z0-9]*|[A-Z][A-Z0-9]*")]
#[logos(subpattern id = r"%?(?&word)(-(?&word))*")]
pub enum Token {
    // Literals and identifiers
    #[token("\"", parse_string_literal)]
    #[token("r", parse_raw_string_literal)]
    StringLiteral(String),
    #[regex(r"[0-9][_0-9]*", |lex| parse_decint_literal(lex.slice()))]
    #[regex(r"0b[01][_01]*", |lex| parse_bin_literal(lex.slice()))]
    #[regex(r"0x[0-9a-fA-F][_0-9a-fA-F]*", |lex| parse_hex_literal(lex.slice()))]
    IntLiteral(u64),
    #[regex(r"[0-9][_0-9]*\.[0-9][_0-9]*", |lex| parse_decfloat_literal(lex.slice()))]
    FloatLiteral(f64),
    #[regex(r"(?&id)", |lex| lex.slice().to_string())]
    Identifier(String),
    // Keywords and primitive type names
    #[token("export")]
    Export,
    #[token("import")]
    Import,
    #[token("from")]
    From,
    #[token("func")]
    Func,
    #[token("if")]
    If,
    #[token("for")]
    For,
    #[token("in")]
    In,
    #[token("loop")]
    Loop,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("result")]
    Result,
    #[token("string")]
    String,
    #[token("u8")]
    U8,
    #[token("u16")]
    U16,
    #[token("u32")]
    U32,
    #[token("u64")]
    U64,
    #[token("s8")]
    S8,
    #[token("s16")]
    S16,
    #[token("s32")]
    S32,
    #[token("s64")]
    S64,
    #[token("f32")]
    F32,
    #[token("f64")]
    F64,
    #[token("as")]
    As,
    #[token("at")]
    At,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("bool")]
    Bool,
    #[token("true")]
    True,
    #[token("false")]
    False,
    // Punctuation and delimiters
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    Range,
    #[token(":")]
    Colon,
    #[token(";")]
    Semicolon,
    #[token("=")]
    Assign,
    #[token("->")]
    Arrow,
    // Arithmetic, logical, and bitwise operators
    #[token("+")]
    Add,
    #[token("-")]
    Sub,
    #[token("*")]
    Mult,
    #[token("/")]
    Div,
    #[token("%")]
    Mod,
    #[token("!")]
    Invert,
    #[token("and")]
    LogicalAnd,
    #[token("or")]
    LogicalOr,
    #[token("|")]
    BitOr,
    #[token("&")]
    BitAnd,
    #[token("^")]
    BitXor,
    #[token("<<")]
    BitShiftL,
    #[token(">>")]
    BitShiftR,
    #[token(">>>")]
    ArithShiftR,
    #[token("|=")]
    BitOrAssign,
    #[token("&=")]
    BitAndAssign,
    #[token("^=")]
    BitXorAssign,
    #[token("+=")]
    AddAssign,
    #[token("-=")]
    SubAssign,
    #[token("*=")]
    StarAssign,
    #[token("/=")]
    DivAssign,
    // Comparison operators
    #[token("<")]
    LT,
    #[token("<=")]
    LTE,
    #[token(">")]
    GT,
    #[token(">=")]
    GTE,
    #[token("==")]
    EQ,
    #[token("!=")]
    NEQ,
}
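// Renders each token as it appears in source. Float literals use `{:?}` so
// that whole values like `1.0` keep their decimal point.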
impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::StringLiteral(s) => write!(f, "\"{}\"", s),
            Token::IntLiteral(i) => write!(f, "{}", i),
            Token::FloatLiteral(float) => write!(f, "{:?}", float),
            Token::Identifier(ident) => write!(f, "{}", ident),
            Token::Export => write!(f, "export"),
            Token::Import => write!(f, "import"),
            Token::From => write!(f, "from"),
            Token::Func => write!(f, "func"),
            Token::If => write!(f, "if"),
            Token::For => write!(f, "for"),
            Token::In => write!(f, "in"),
            Token::Loop => write!(f, "loop"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Result => write!(f, "result"),
            Token::String => write!(f, "string"),
            Token::U8 => write!(f, "u8"),
            Token::U16 => write!(f, "u16"),
            Token::U32 => write!(f, "u32"),
            Token::U64 => write!(f, "u64"),
            Token::S8 => write!(f, "s8"),
            Token::S16 => write!(f, "s16"),
            Token::S32 => write!(f, "s32"),
            Token::S64 => write!(f, "s64"),
            Token::F32 => write!(f, "f32"),
            Token::F64 => write!(f, "f64"),
            Token::As => write!(f, "as"),
            Token::At => write!(f, "at"),
            Token::Let => write!(f, "let"),
            Token::Mut => write!(f, "mut"),
            Token::Bool => write!(f, "bool"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::Range => write!(f, ".."),
            Token::Colon => write!(f, ":"),
            Token::Semicolon => write!(f, ";"),
            Token::Assign => write!(f, "="),
            Token::Arrow => write!(f, "->"),
            Token::Add => write!(f, "+"),
            Token::Sub => write!(f, "-"),
            Token::Mult => write!(f, "*"),
            Token::Div => write!(f, "/"),
            Token::Mod => write!(f, "%"),
            Token::Invert => write!(f, "!"),
            Token::LogicalAnd => write!(f, "and"),
            Token::LogicalOr => write!(f, "or"),
            Token::BitOr => write!(f, "|"),
            Token::BitAnd => write!(f, "&"),
            Token::BitXor => write!(f, "^"),
            Token::BitShiftL => write!(f, "<<"),
            Token::BitShiftR => write!(f, ">>"),
            Token::ArithShiftR => write!(f, ">>>"),
            Token::BitOrAssign => write!(f, "|="),
            Token::BitAndAssign => write!(f, "&="),
            Token::BitXorAssign => write!(f, "^="),
            Token::AddAssign => write!(f, "+="),
            Token::SubAssign => write!(f, "-="),
            Token::StarAssign => write!(f, "*="),
            Token::DivAssign => write!(f, "/="),
            Token::LT => write!(f, "<"),
            Token::LTE => write!(f, "<="),
            Token::GT => write!(f, ">"),
            Token::GTE => write!(f, ">="),
            Token::EQ => write!(f, "=="),
            Token::NEQ => write!(f, "!="),
        }
    }
}
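/// Logos callback for string literals, invoked after the opening `"` has
/// been matched. Consumes (and bumps the lexer past) characters until the
/// closing quote, decoding escape sequences along the way. Returns `None`
/// for an unterminated string or an invalid escape.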
fn parse_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let mut c_iter = lex.remainder().chars();
    let mut buf = String::new();
    while let Some(c) = c_iter.next() {
        if c == '"' {
            lex.bump(1);
            return Some(buf);
        }
        if c == '\\' {
            lex.bump(1);
            // An invalid escape fails the whole literal; silently skipping it
            // would desynchronize the lexer position from the iterator.
            let (c_esc, c_len) = parse_escaped_char(&mut c_iter)?;
            lex.bump(c_len);
            buf.push(c_esc);
        } else {
            lex.bump(c.len_utf8());
            buf.push(c);
        }
    }
    None
}
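/// Decodes the character after a backslash, returning the unescaped
/// character and the number of source bytes consumed (e.g. 5 for a
/// `\uXXXX` escape: the `u` plus four hex digits).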
fn parse_escaped_char(chars: &mut std::str::Chars) -> Option<(char, usize)> {
    let res = match chars.next()? {
        '\"' => ('\"', 1),
        '\\' => ('\\', 1),
        '/' => ('/', 1),
        'b' => ('\u{0008}', 1),
        'f' => ('\u{000C}', 1),
        'n' => ('\n', 1),
        'r' => ('\r', 1),
        't' => ('\t', 1),
        'u' => {
            let next_4: [Option<char>; 4] = [chars.next(), chars.next(), chars.next(), chars.next()];
            let next_4: Option<Vec<char>> = next_4.iter().copied().collect();
            let next_4: String = next_4?.into_iter().collect();
            let code_point = u32::from_str_radix(&next_4, 16).ok()?;
            let new_c: char = std::char::from_u32(code_point)?;
            // The `u` plus four hex digits: five bytes of source consumed.
            (new_c, 5)
        }
        _ => return None,
    };
    Some(res)
}
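/// Logos callback for raw string literals (`r"..."`, `r#"..."#`,
/// `r##"..."##`, ...), invoked after the leading `r` has been matched. The
/// literal ends at the first `"` followed by as many `#`s as it was opened
/// with; escape sequences are not processed. Returns `None` for a
/// malformed or unterminated literal.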
fn parse_raw_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let mut c_iter = lex.remainder().chars();
    let mut buf = String::new();
    // Count the `#`s between the `r` and the opening quote.
    let mut starting_hashes = 0;
    let mut starting_quote = false;
    while let Some(c) = c_iter.next() {
        lex.bump(c.len_utf8());
        if c == '"' {
            starting_quote = true;
            break;
        }
        if c == '#' {
            starting_hashes += 1;
        } else {
            return None;
        }
    }
    if !starting_quote {
        return None;
    }
    // Scan for a closing `"` followed by `starting_hashes` `#`s. A quote
    // followed by too few hashes is part of the string contents.
    let mut seen_quote = false;
    let mut hash_count = 0;
    while let Some(c) = c_iter.next() {
        lex.bump(c.len_utf8());
        if seen_quote && c == '#' {
            hash_count += 1;
            if hash_count == starting_hashes {
                return Some(buf);
            }
            continue;
        }
        // The pending quote and hashes were not a terminator after all;
        // emit them into the buffer before handling the current character.
        if seen_quote {
            buf.push('"');
        }
        seen_quote = false;
        for _ in 0..hash_count {
            buf.push('#');
        }
        hash_count = 0;
        if c == '"' {
            seen_quote = true;
            // With no leading hashes, the first quote terminates the literal.
            if starting_hashes == 0 {
                return Some(buf);
            }
        } else {
            buf.push(c);
        }
    }
    None
}
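/// Parses a decimal integer literal, ignoring `_` separators.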
fn parse_decint_literal(s: &str) -> Option<u64> {
    s.replace('_', "").parse().ok()
}
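/// Parses a decimal float literal, ignoring `_` separators.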
fn parse_decfloat_literal(s: &str) -> Option<f64> {
    s.replace('_', "").parse().ok()
}
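/// Parses a binary integer literal, stripping the `0b` prefix and ignoring
/// `_` separators.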
fn parse_bin_literal(s: &str) -> Option<u64> {
    u64::from_str_radix(&s[2..].replace('_', ""), 2).ok()
}
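/// Parses a hexadecimal integer literal, stripping the `0x` prefix and
/// ignoring `_` separators.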
fn parse_hex_literal(s: &str) -> Option<u64> {
    u64::from_str_radix(&s[2..].replace('_', ""), 16).ok()
}
#[cfg(test)]
mod test {
    use super::*;
    use claw_common::make_source;
    use pretty_assertions::assert_eq;
    #[test]
    fn tokenize_func_declaration() {
        let contents = "func test(a: u32) -> u32";
        let src = make_source("test", contents);
        let ident_test = Token::Identifier("test".to_owned());
        let ident_a = Token::Identifier("a".to_owned());
        let output = vec![
            (Token::Func, SourceSpan::from(0..4)),
            (ident_test, SourceSpan::from(5..9)),
            (Token::LParen, SourceSpan::from(9..10)),
            (ident_a, SourceSpan::from(10..11)),
            (Token::Colon, SourceSpan::from(11..12)),
            (Token::U32, SourceSpan::from(13..16)),
            (Token::RParen, SourceSpan::from(16..17)),
            (Token::Arrow, SourceSpan::from(18..20)),
            (Token::U32, SourceSpan::from(21..24)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();
        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }
    #[test]
    fn tokenize_let() {
        let contents = r#"let a = "asdf\"";"#;
        let src = make_source("test", contents);
        let ident_a = Token::Identifier("a".to_owned());
        let string_asdf = Token::StringLiteral(String::from(r#"asdf""#));
        let output = vec![
            (Token::Let, SourceSpan::from(0..3)),
            (ident_a, SourceSpan::from(4..5)),
            (Token::Assign, SourceSpan::from(6..7)),
            (string_asdf, SourceSpan::from(8..16)),
            (Token::Semicolon, SourceSpan::from(16..17)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();
        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }
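    // A sketch of a test exercising the raw-string path, assuming the raw
    // string token spans from the leading `r` through the closing `#` and
    // keeps its contents verbatim.
    #[test]
    fn tokenize_raw_string() {
        let contents = r##"let s = r#"a"#;"##;
        let src = make_source("test", contents);
        let ident_s = Token::Identifier("s".to_owned());
        let string_a = Token::StringLiteral(String::from("a"));
        let output = vec![
            (Token::Let, SourceSpan::from(0..3)),
            (ident_s, SourceSpan::from(4..5)),
            (Token::Assign, SourceSpan::from(6..7)),
            (string_a, SourceSpan::from(8..14)),
            (Token::Semicolon, SourceSpan::from(14..15)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();
        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }
    /// Builds a `TokenData` from a `(Token, SourceSpan)` pair.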
    fn to_token_data(d: (Token, SourceSpan)) -> TokenData {
        TokenData {
            token: d.0,
            span: d.1,
        }
    }
}