use crate::frontend::ast::Span;
use logos::{Lexer, Logos};
fn process_basic_escape(ch: char) -> Option<char> {
match ch {
'n' => Some('\n'),
't' => Some('\t'),
'r' => Some('\r'),
'\\' => Some('\\'),
'"' => Some('"'),
'\'' => Some('\''),
'0' => Some('\0'),
_ => None,
}
}
fn extract_unicode_hex(chars: &mut std::str::Chars) -> String {
chars.next(); let mut hex = String::with_capacity(6); for hex_char in chars.by_ref() {
if hex_char == '}' {
break;
}
hex.push(hex_char);
}
hex
}
fn process_unicode_escape(chars: &mut std::str::Chars) -> String {
let hex = extract_unicode_hex(chars);
u32::from_str_radix(&hex, 16)
.ok()
.and_then(char::from_u32)
.map_or_else(|| format!("\\u{{{hex}}}"), |c| c.to_string())
}
fn process_backslash_escape(chars: &mut std::str::Chars, result: &mut String) {
match chars.next() {
None => result.push('\\'), Some('u') if chars.as_str().starts_with('{') => {
result.push_str(&process_unicode_escape(chars));
}
Some(escape_ch) => {
if let Some(escaped) = process_basic_escape(escape_ch) {
result.push(escaped);
} else {
result.push('\\');
result.push(escape_ch);
}
}
}
}
fn process_escapes(s: &str) -> String {
let mut result = String::with_capacity(s.len());
let mut chars = s.chars();
while let Some(ch) = chars.next() {
if ch == '\\' {
process_backslash_escape(&mut chars, &mut result);
} else {
result.push(ch);
}
}
result
}
fn lex_nested_block_comment(lex: &mut Lexer<Token>) -> Option<String> {
let remainder = lex.remainder();
let bytes = remainder.as_bytes();
let mut depth = 1; let mut content = String::new();
let mut i = 0;
while i < bytes.len() {
if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
depth += 1;
content.push('/');
content.push('*');
i += 2;
} else if i + 1 < bytes.len() && bytes[i] == b'*' && bytes[i + 1] == b'/' {
depth -= 1;
if depth == 0 {
lex.bump(i + 2);
return Some(content);
}
content.push('*');
content.push('/');
i += 2;
} else {
let ch = remainder[i..].chars().next()?;
content.push(ch);
i += ch.len_utf8();
}
}
lex.bump(remainder.len());
Some(content)
}
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(skip r"[ \t\n\r\f]+")]
pub enum Token {
#[regex(r"///[^\n]*", |lex| lex.slice()[3..].to_string())]
DocComment(String),
#[regex(r"//[^\n]*", |lex| lex.slice()[2..].to_string())]
LineComment(String),
#[token("/*", lex_nested_block_comment)]
BlockComment(String),
#[regex(r"#(?:[^\[\n][^\n]*)?", |lex| {
let s = lex.slice();
if s.len() > 1 { s[1..].to_string() } else { String::new() }
})]
HashComment(String),
#[regex(r"0[xX][0-9a-fA-F]+(?:i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| {
let slice = lex.slice();
slice.to_string()
})]
HexInteger(String),
#[regex(r"[0-9]+(?:i8|i16|i32|i64|i128|isize|u8|u16|u32|u64|u128|usize)?", |lex| {
let slice = lex.slice();
// Parse type suffix and numeric value separately - store as string to preserve suffix
slice.to_string()
})]
Integer(String),
#[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
Float(f64),
#[regex(r#""([^"\\]|\\.)*""#, |lex| {
let s = lex.slice();
let inner = &s[1..s.len()-1];
Some(process_escapes(inner))
})]
#[regex(r"'(([^'\\>\n \t;},):]|\\.)([^'\\>\n \t;},):]|\\.)+|)'", |lex| {
let s = lex.slice();
let inner = &s[1..s.len()-1];
// Only match if it's NOT a single character (let Char handle that)
if inner.len() != 1 && !(inner.starts_with('\\') && inner.len() == 2) {
Some(process_escapes(inner))
} else {
None
}
})]
String(String),
#[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
let s = lex.slice();
// Remove f" prefix and " suffix
let inner = &s[2..s.len()-1];
Some(process_escapes(inner))
})]
FString(String),
#[regex(r####"r#"([^"]|"[^#])*"#"####, |lex| {
let s = lex.slice();
// Remove r#" prefix and "# suffix - no escape processing for raw strings
Some(s[3..s.len()-2].to_string())
})]
#[regex(r#"r"([^"])*""#, |lex| {
let s = lex.slice();
// Remove r" prefix and " suffix - no escape processing for raw strings
Some(s[2..s.len()-1].to_string())
})]
RawString(String),
#[regex(r"'([^'\\]|\\.)'", priority = 7, callback = |lex| {
let s = lex.slice();
let inner = &s[1..s.len()-1];
if inner.len() == 1 {
inner.chars().next()
} else if inner.starts_with('\\') && inner.len() == 2 {
match inner.chars().nth(1) {
Some('n') => Some('\n'),
Some('t') => Some('\t'),
Some('r') => Some('\r'),
Some('\\') => Some('\\'),
Some('\'') => Some('\''),
Some('0') => Some('\0'),
_ => None,
}
} else {
None
}
})]
Char(char),
#[regex(r"b'([^'\\]|\\.)'", |lex| {
let s = lex.slice();
let inner = &s[2..s.len()-1]; // Skip b' prefix
if inner.len() == 1 {
Some(inner.as_bytes()[0])
} else if inner.starts_with('\\') && inner.len() == 2 {
match inner.chars().nth(1) {
Some('n') => Some(b'\n'),
Some('t') => Some(b'\t'),
Some('r') => Some(b'\r'),
Some('\\') => Some(b'\\'),
Some('\'') => Some(b'\''),
Some('0') => Some(b'\0'),
_ => None,
}
} else {
None
}
})]
Byte(u8),
#[token("true", |_| true)]
#[token("false", |_| false)]
Bool(bool),
#[token("fun")]
Fun,
#[token("fn")]
Fn,
#[token("let")]
Let,
#[token("var")]
Var,
#[token("mod")]
Mod,
#[token("if")]
If,
#[token("else")]
Else,
#[token("match")]
Match,
#[token("for")]
For,
#[token("in")]
In,
#[token("while")]
While,
#[token("loop")]
Loop,
#[token("lazy")]
Lazy,
#[token("async")]
Async,
#[token("await")]
Await,
#[token("throw")]
Throw,
#[token("try")]
Try,
#[token("catch")]
Catch,
#[token("finally")]
Finally,
#[token("return")]
Return,
#[token("Ok")]
Ok,
#[token("Err")]
Err,
#[token("Some")]
Some,
#[token("None")]
None,
#[token("null")]
Null,
#[token("Result")]
Result,
#[token("Option")]
Option,
#[token("break")]
Break,
#[token("continue")]
Continue,
#[token("struct")]
Struct,
#[token("enum")]
Enum,
#[token("impl")]
Impl,
#[token("trait")]
Trait,
#[token("extend")]
Extend,
#[token("actor")]
Actor,
#[token("spawn")]
Spawn,
#[token("effect")]
Effect,
#[token("handle")]
Handle,
#[token("handler")]
Handler,
#[token("property")]
Property,
#[token("private")]
Private,
#[token("protected")]
Protected,
#[token("sealed")]
Sealed,
#[token("final")]
Final,
#[token("abstract")]
Abstract,
#[token("mixin")]
Mixin,
#[token("operator")]
Operator,
#[token("interface")]
Interface,
#[token("implements")]
Implements,
#[token("override")]
Override,
#[token("receive")]
Receive,
#[token("send")]
Send,
#[token("ask")]
Ask,
#[token("type")]
Type,
#[token("where")]
Where,
#[token("const", priority = 2)]
Const,
#[token("unsafe", priority = 2)]
Unsafe,
#[token("static")]
Static,
#[token("mut")]
Mut,
#[regex("@[a-zA-Z_][a-zA-Z0-9_]*", priority = 3, callback = |lex| lex.slice().to_string())]
Label(String),
#[regex(r":[a-zA-Z_][a-zA-Z0-9_]*", priority = 3, callback = |lex| lex.slice()[1..].to_string())]
Atom(String),
#[regex(r"'[a-zA-Z_][a-zA-Z0-9_]*", priority = 5, callback = |lex| lex.slice().to_string())]
Lifetime(String),
#[token("pub")]
Pub,
#[token("import")]
Import,
#[token("use")]
Use,
#[token("as")]
As,
#[token("with")]
With,
#[token("from")]
From,
#[token("module")]
Module,
#[token("export")]
Export,
#[token("default")]
Default,
#[token("class")]
Class,
#[token("self")]
Self_,
#[token("super")]
Super,
#[token("crate")]
Crate,
#[token("df", priority = 2)]
DataFrame,
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string(), priority = 1)]
Identifier(String),
#[token("+")]
Plus,
#[token("-")]
Minus,
#[token("*")]
Star,
#[token("/")]
Slash,
#[token("%")]
Percent,
#[token("**")]
Power,
#[token("==")]
EqualEqual,
#[token("!=")]
NotEqual,
#[token("<?")]
ActorQuery,
#[token("<-")]
LeftArrow,
#[token("<")]
Less,
#[token("<=")]
LessEqual,
#[token(">")]
Greater,
#[token(">=")]
GreaterEqual,
#[token("&&")]
AndAnd,
#[token("||")]
OrOr,
#[token("!")]
Bang,
#[token("&")]
Ampersand,
#[token("|")]
Pipe,
#[token("^")]
Caret,
#[token("@")]
At,
#[token("~")]
Tilde,
#[token("\\")]
Backslash,
#[token("<<")]
LeftShift,
#[token(">>")]
RightShift,
#[token("=")]
Equal,
#[token("+=")]
PlusEqual,
#[token("-=")]
MinusEqual,
#[token("*=")]
StarEqual,
#[token("/=")]
SlashEqual,
#[token("%=")]
PercentEqual,
#[token("**=")]
PowerEqual,
#[token("&=")]
AmpersandEqual,
#[token("|=")]
PipeEqual,
#[token("^=")]
CaretEqual,
#[token("<<=")]
LeftShiftEqual,
#[token("++")]
Increment,
#[token("--")]
Decrement,
#[token("|>")]
Pipeline,
#[token("->")]
Arrow,
#[token("=>")]
FatArrow,
#[token("..")]
DotDot,
#[token("..=")]
DotDotEqual,
#[token("...")]
DotDotDot,
#[token("??")]
NullCoalesce,
#[token("?")]
Question,
#[token("?.")]
SafeNav,
#[token("(")]
LeftParen,
#[token(")")]
RightParen,
#[token("[")]
LeftBracket,
#[token("]")]
RightBracket,
#[token("{")]
LeftBrace,
#[token("}")]
RightBrace,
#[token(",")]
Comma,
#[token(".")]
Dot,
#[token(":")]
Colon,
#[token("::")]
ColonColon,
#[token(";")]
Semicolon,
#[token("_", priority = 2)]
Underscore,
#[token("#[", priority = 3)]
AttributeStart,
}
impl Token {
#[must_use]
pub fn is_binary_op(&self) -> bool {
matches!(
self,
Token::Plus
| Token::Minus
| Token::Star
| Token::Slash
| Token::Percent
| Token::Power
| Token::EqualEqual
| Token::NotEqual
| Token::Less
| Token::LessEqual
| Token::Greater
| Token::GreaterEqual
| Token::AndAnd
| Token::OrOr
| Token::Ampersand
| Token::Pipe
| Token::Caret
| Token::LeftShift
)
}
#[must_use]
pub fn is_unary_op(&self) -> bool {
matches!(
self,
Token::Bang | Token::Minus | Token::Tilde | Token::Ampersand
)
}
#[must_use]
pub fn is_assignment_op(&self) -> bool {
matches!(
self,
Token::Equal
| Token::PlusEqual
| Token::MinusEqual
| Token::StarEqual
| Token::SlashEqual
| Token::PercentEqual
| Token::PowerEqual
| Token::AmpersandEqual
| Token::PipeEqual
| Token::CaretEqual
| Token::LeftShiftEqual
)
}
}
pub struct TokenStream<'a> {
lexer: Lexer<'a, Token>,
peeked: Option<(Token, Span)>,
input: &'a str,
current_position: usize,
}
#[derive(Clone)]
pub struct TokenStreamPosition<'a> {
lexer: Lexer<'a, Token>,
peeked: Option<(Token, Span)>,
current_position: usize,
}
impl<'a> TokenStream<'a> {
#[must_use]
pub fn source(&self) -> &'a str {
self.input
}
#[must_use]
pub fn new(input: &'a str) -> Self {
let processed_input = if input.starts_with("#!") {
if let Some(newline_pos) = input.find('\n') {
&input[newline_pos + 1..]
} else {
""
}
} else {
input
};
Self {
lexer: Token::lexer(processed_input),
peeked: None,
input,
current_position: 0,
}
}
pub fn current_position(&self) -> (usize, usize) {
let mut line = 1;
let mut col = 1;
for (i, ch) in self.input.chars().enumerate() {
if i >= self.current_position {
break;
}
if ch == '\n' {
line += 1;
col = 1;
} else {
col += 1;
}
}
(line, col)
}
pub fn get_context_string(&self) -> String {
let start = self.current_position.saturating_sub(20);
let end = (self.current_position + 20).min(self.input.len());
let context = &self.input[start..end];
format!("...{context}...")
}
#[must_use]
pub fn position(&self) -> TokenStreamPosition<'a> {
TokenStreamPosition {
lexer: self.lexer.clone(),
peeked: self.peeked.clone(),
current_position: self.current_position,
}
}
pub fn set_position(&mut self, pos: TokenStreamPosition<'a>) {
self.lexer = pos.lexer;
self.peeked = pos.peeked;
self.current_position = pos.current_position;
}
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Option<(Token, Span)> {
if let Some(peeked) = self.peeked.take() {
self.current_position = peeked.1.end;
return Some(peeked);
}
self.lexer.next().map(|result| {
let token = result.unwrap_or(Token::Bang); let span = Span::new(self.lexer.span().start, self.lexer.span().end);
self.current_position = span.end;
(token, span)
})
}
pub fn peek(&mut self) -> Option<&(Token, Span)> {
if self.peeked.is_none() {
self.peeked = self.next();
}
self.peeked.as_ref()
}
pub fn peek_ahead(&mut self, n: usize) -> Option<(Token, Span)> {
self.peek_nth(n)
}
pub fn peek_nth(&mut self, n: usize) -> Option<(Token, Span)> {
if n == 0 {
return self.peek().cloned();
}
let saved_peeked = self.peeked.clone();
let saved_lexer = self.lexer.clone();
for _ in 0..n {
let _ = self.peek();
self.advance();
}
let result = self.peek().cloned();
self.lexer = saved_lexer;
self.peeked = saved_peeked;
result
}
pub fn peek_nth_is_colon(&mut self, n: usize) -> bool {
if n == 0 {
self.peek().is_some_and(|(t, _)| matches!(t, Token::Colon))
} else {
self.peek_nth(n)
.is_some_and(|(t, _)| matches!(t, Token::Colon))
}
}
pub fn expect(&mut self, expected: &Token) -> anyhow::Result<Span> {
match self.next() {
Some((token, span)) if token == *expected => Ok(span),
Some((token, _)) => anyhow::bail!("Expected {expected:?}, found {token:?}"),
None => anyhow::bail!("Expected {expected:?}, found EOF"),
}
}
pub fn advance(&mut self) -> Option<(Token, Span)> {
self.next()
}
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[path = "lexer_tests.rs"]
mod tests;