use std::fmt;
/// Every kind of token the lexer can emit.
///
/// Punctuation and operator variants carry no payload; identifier and literal
/// variants carry the value that was read from the source text.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;` — also emitted for a newline that occurs inside braces but outside
    /// any parentheses or square brackets.
    Semi,
    /// A standalone `:` (one that was not absorbed into an identifier).
    Colon,
    /// `=>`
    Arrow,
    /// `=`
    Equals,
    /// `==`
    EqEq,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `?`
    Question,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    Le,
    /// `>=`
    Ge,
    /// A name made of ASCII letters, digits, `_` and `$` (plus `:` when the
    /// lexer is not inside square brackets).
    Identifier(String),
    /// An integer literal that fits in an `i32`.
    Number(i32),
    /// A numeric literal containing a decimal point and/or an exponent.
    FloatLiteral(f64),
    /// The contents of a quoted string, with escape sequences resolved.
    StringLiteral(String),
    /// A lone `_`.
    Placeholder,
    /// The keywords `true` / `false`.
    BoolLiteral(bool),
    /// `&&`
    AndAnd,
    /// `!`
    Bang,
    /// `!=`
    Ne,
    /// `&`
    Amp,
    /// `||`
    OrOr,
    /// `<<`
    Shl,
    /// `>>`
    Shr,
    /// `^`
    Caret,
    /// `^=`
    CaretEquals,
    /// `~`
    Tilde,
    /// `|`
    Pipe,
}
/// A single lexical token together with the source position the lexer
/// recorded for it.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// What was recognised.
    pub kind: TokenKind,
    /// Line number recorded by the lexer (the lexer starts counting at 1).
    pub line: usize,
    /// Column number recorded by the lexer (the lexer starts counting at 1).
    pub col: usize,
}
/// An error raised while tokenizing, carrying a human-readable message and
/// the line/column the lexer associated with the failure.
#[derive(Debug, Clone)]
pub struct LexError {
    /// Description of what went wrong.
    pub message: String,
    /// Line number attached to the error.
    pub line: usize,
    /// Column number attached to the error.
    pub col: usize,
}

impl fmt::Display for LexError {
    /// Renders the error as `LexError (line L, col C): message`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let LexError { message, line, col } = self;
        write!(f, "LexError (line {}, col {}): {}", line, col, message)
    }
}

// No source error to expose, so the default trait methods are sufficient.
impl std::error::Error for LexError {}
pub fn tokenize(source: &str, verbose: bool) -> Result<Vec<Token>, LexError> {
let mut lexer = Lexer::new(source, verbose);
lexer.lex_all()?;
if lexer.verbose {
eprintln!("[lexer] final tokens => {:?}", lexer.tokens);
}
Ok(lexer.tokens)
}
/// Internal scanner state used by `tokenize`.
struct Lexer {
    /// The whole input, decoded into characters up front.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    i: usize,
    /// Line of the next unread character; starts at 1.
    line: usize,
    /// Column of the next unread character; starts at 1 and resets to 1
    /// after each newline.
    col: usize,
    /// Tokens produced so far, in source order.
    tokens: Vec<Token>,
    /// When set, `tokenize` dumps the final token list to stderr.
    verbose: bool,
    /// Number of currently open `[`; also switches off `:` as an identifier
    /// character while non-zero.
    bracket_depth: usize,
    /// Number of currently open `(`.
    paren_depth: usize,
    /// Number of currently open `{`; newlines only become `Semi` tokens
    /// while this is non-zero (and the other two depths are zero).
    brace_depth: usize,
}
impl Lexer {
/// Creates a lexer positioned at the first character of `source`
/// (line 1, column 1) with no tokens and all nesting depths at zero.
fn new(source: &str, verbose: bool) -> Self {
    let chars: Vec<char> = source.chars().collect();
    Self {
        chars,
        verbose,
        tokens: Vec::new(),
        // Cursor: index 0, reported as line 1 / column 1.
        i: 0,
        line: 1,
        col: 1,
        // Nothing is open yet.
        paren_depth: 0,
        bracket_depth: 0,
        brace_depth: 0,
    }
}
/// Appends a token of the given `kind`, stamped with the lexer's line and
/// column at the moment of the call.
fn push_token(&mut self, kind: TokenKind) {
    let (line, col) = (self.line, self.col);
    self.tokens.push(Token { kind, line, col });
}
fn lex_all(&mut self) -> Result<(), LexError> {
while self.i < self.chars.len() {
let ch = self.chars[self.i];
if ch == '\n' {
self.next_char();
if self.brace_depth > 0 && self.paren_depth == 0 && self.bracket_depth == 0 {
self.push_token(TokenKind::Semi);
}
continue;
}
if ch.is_whitespace() {
self.next_char();
continue;
}
match ch {
'(' => { self.next_char(); self.paren_depth += 1; self.push_token(TokenKind::LParen); }
')' => { self.next_char(); if self.paren_depth > 0 { self.paren_depth -= 1; } self.push_token(TokenKind::RParen); }
'{' => { self.next_char(); self.brace_depth += 1; self.push_token(TokenKind::LBrace); }
'}' => { self.next_char(); if self.brace_depth > 0 { self.brace_depth -= 1; } self.push_token(TokenKind::RBrace); }
'[' => { self.next_char(); self.bracket_depth = self.bracket_depth.saturating_add(1); self.push_token(TokenKind::LBracket); }
']' => { self.next_char(); if self.bracket_depth > 0 { self.bracket_depth -= 1; } self.push_token(TokenKind::RBracket); }
',' => { self.next_char(); self.push_token(TokenKind::Comma); }
';' => { self.next_char(); self.push_token(TokenKind::Semi); }
':' => { self.next_char(); self.push_token(TokenKind::Colon); }
'?' => { self.next_char(); self.push_token(TokenKind::Question); }
'+' => { self.next_char(); self.push_token(TokenKind::Plus); }
'-' => { self.next_char(); self.push_token(TokenKind::Minus); }
'*' => { self.next_char(); self.push_token(TokenKind::Star); }
'%' => { self.next_char(); self.push_token(TokenKind::Percent); }
'=' => {
self.next_char();
if let Some('=') = self.peek_char(0) {
self.next_char();
self.push_token(TokenKind::EqEq);
} else if let Some('>') = self.peek_char(0) {
self.next_char();
self.push_token(TokenKind::Arrow);
} else {
self.push_token(TokenKind::Equals);
}
}
'<' => {
self.next_char();
match self.peek_char(0) {
Some('=') => { self.next_char(); self.push_token(TokenKind::Le); }
Some('<') => { self.next_char(); self.push_token(TokenKind::Shl); }
_ => { self.push_token(TokenKind::Lt); }
}
}
'>' => {
self.next_char();
match self.peek_char(0) {
Some('=') => { self.next_char(); self.push_token(TokenKind::Ge); }
Some('>') => { self.next_char(); self.push_token(TokenKind::Shr); }
_ => { self.push_token(TokenKind::Gt); }
}
}
'/' => {
if let Some('/') = self.peek_char(1) {
self.next_char();
self.next_char();
while let Some(c2) = self.peek_char(0) {
if c2 == '\n' { break; }
self.next_char();
}
} else if let Some('*') = self.peek_char(1) {
self.next_char();
self.next_char();
let mut prev_star = false;
loop {
match self.next_char() {
Some(c2) => {
if prev_star && c2 == '/' { break; }
prev_star = c2 == '*';
}
None => {
return Err(LexError {
message: "Unclosed block comment".to_string(),
line: self.line,
col: self.col,
});
}
}
}
} else {
self.next_char();
self.push_token(TokenKind::Slash);
}
}
'#' => {
self.next_char();
while let Some(c2) = self.peek_char(0) {
if c2 == '\n' { break; }
self.next_char();
}
}
'\'' | '"' | '`' => {
let delimiter = ch;
let start_line = self.line;
let start_col = self.col;
self.next_char();
let parsed = match self.parse_quoted_string(delimiter) {
Ok(s) => s,
Err(e) => {
return Err(LexError {
message: format!(
"String parse error (started at line {}, col {}): {}",
start_line, start_col, e
),
line: self.line,
col: self.col,
});
}
};
self.tokens.push(Token {
kind: TokenKind::StringLiteral(parsed),
line: start_line,
col: start_col,
});
}
c2 if c2.is_ascii_digit() => {
let (maybe_tok, maybe_err) = self.parse_number_literal();
if let Some(e) = maybe_err {
return Err(e);
}
if let Some(tok) = maybe_tok {
self.push_token(tok);
}
}
'_' => {
if let Some(nc) = self.peek_char(1) {
if Self::is_valid_ident_char(nc, self.bracket_depth) {
let ident = self.parse_identifier();
let kind = match ident.as_str() {
"true" => TokenKind::BoolLiteral(true),
"false" => TokenKind::BoolLiteral(false),
"_" => TokenKind::Placeholder,
_ => TokenKind::Identifier(ident),
};
self.push_token(kind);
} else {
self.next_char();
self.push_token(TokenKind::Placeholder);
}
} else {
self.next_char();
self.push_token(TokenKind::Placeholder);
}
}
c2 if c2.is_ascii_alphabetic() || c2 == '$' => {
let ident = self.parse_identifier();
let kind = match ident.as_str() {
"true" => TokenKind::BoolLiteral(true),
"false" => TokenKind::BoolLiteral(false),
_ => TokenKind::Identifier(ident),
};
self.push_token(kind);
}
'&' => {
if let Some('&') = self.peek_char(1) {
self.next_char();
self.next_char();
self.push_token(TokenKind::AndAnd);
} else {
self.next_char();
self.push_token(TokenKind::Amp);
}
}
'!' => {
self.next_char();
if let Some('=') = self.peek_char(0) {
self.next_char();
self.push_token(TokenKind::Ne);
} else {
self.push_token(TokenKind::Bang);
}
}
'^' => {
self.next_char();
if let Some('=') = self.peek_char(0) {
self.next_char();
self.push_token(TokenKind::CaretEquals);
} else {
self.push_token(TokenKind::Caret);
}
}
'~' => {
self.next_char();
self.push_token(TokenKind::Tilde);
}
'|' => {
if let Some('|') = self.peek_char(1) {
self.next_char();
self.next_char();
self.push_token(TokenKind::OrOr);
} else {
self.next_char();
self.push_token(TokenKind::Pipe);
}
}
other => {
return Err(LexError {
message: format!("Unexpected character '{}'", other),
line: self.line,
col: self.col,
});
}
}
}
Ok(())
}
/// Reports whether `ch` may appear inside an identifier.
///
/// ASCII letters, ASCII digits, `_` and `$` are always accepted. `:` is
/// accepted only when `bracket_depth` is zero, i.e. outside square brackets.
fn is_valid_ident_char(ch: char, bracket_depth: usize) -> bool {
    match ch {
        '_' | '$' => true,
        ':' => bracket_depth == 0,
        other => other.is_ascii_alphanumeric(),
    }
}
/// Consumes a numeric literal starting at the cursor.
///
/// Accepted shape: ASCII digits, at most one `.` (only before any exponent),
/// and at most one `e`/`E` exponent marker optionally followed by `+` or `-`.
/// Text containing a `.` or an exponent is parsed as `f64`
/// (`FloatLiteral`); anything else is parsed as `i32` (`Number`).
///
/// Returns `(Some(kind), None)` on success and `(None, Some(error))` when
/// the collected text does not parse; the error carries the position at
/// which the literal started.
fn parse_number_literal(&mut self) -> (Option<TokenKind>, Option<LexError>) {
    let (start_line, start_col) = (self.line, self.col);
    let mut text = String::new();
    let mut seen_dot = false;
    let mut seen_exp = false;

    loop {
        let c = match self.peek_char(0) {
            Some(c) => c,
            None => break,
        };

        let starts_exponent = (c == 'e' || c == 'E') && !seen_exp;
        let is_decimal_point = c == '.' && !seen_dot && !seen_exp;
        if starts_exponent {
            seen_exp = true;
        } else if is_decimal_point {
            seen_dot = true;
        } else if !c.is_ascii_digit() {
            break;
        }
        text.push(c);
        self.next_char();

        // An exponent marker may be followed directly by a sign.
        if starts_exponent {
            match self.peek_char(0) {
                Some(sign) if sign == '+' || sign == '-' => {
                    text.push(sign);
                    self.next_char();
                }
                _ => {}
            }
        }
    }

    let parsed: Result<TokenKind, String> = if seen_dot || seen_exp {
        text.parse::<f64>()
            .map(TokenKind::FloatLiteral)
            .map_err(|e| format!("Invalid float '{}': {}", text, e))
    } else {
        text.parse::<i32>()
            .map(TokenKind::Number)
            .map_err(|e| format!("Invalid integer '{}': {}", text, e))
    };

    match parsed {
        Ok(kind) => (Some(kind), None),
        Err(message) => (
            None,
            Some(LexError {
                message,
                line: start_line,
                col: start_col,
            }),
        ),
    }
}
/// Consumes the longest run of identifier characters starting at the cursor
/// and returns it. The result is empty if the character at the cursor is not
/// an identifier character.
fn parse_identifier(&mut self) -> String {
    // The bracket depth cannot change while scanning a name, so read it once.
    let depth = self.bracket_depth;
    let begin = self.i;
    while self
        .peek_char(0)
        .map_or(false, |c| Self::is_valid_ident_char(c, depth))
    {
        self.next_char();
    }
    self.chars[begin..self.i].iter().collect()
}
/// Reads the body of a string literal whose opening `delimiter` has already
/// been consumed, up to and including the matching closing delimiter.
///
/// Returns the literal's contents with escapes resolved. Recognised escapes
/// are `\n`, `\r`, `\t`, `\\`, `\'`, `\"` and `` \` ``. A raw newline is only
/// permitted when the delimiter is a backtick.
///
/// # Errors
///
/// Returns a description of the problem when the input ends before the
/// closing delimiter, a newline appears in a non-backtick string, a
/// backslash is the last character of the input, or an unknown escape is
/// used.
fn parse_quoted_string(&mut self, delimiter: char) -> Result<String, String> {
    let mut text = String::new();
    loop {
        let current = match self.peek_char(0) {
            Some(c) => c,
            None => {
                return Err(format!(
                    "Unterminated string literal (missing closing '{}')",
                    delimiter
                ));
            }
        };

        if current == delimiter {
            self.next_char();
            return Ok(text);
        }
        if current == '\n' && delimiter != '`' {
            return Err(format!(
                "Unterminated string literal (reached newline before closing '{}')",
                delimiter
            ));
        }
        if current != '\\' {
            // Ordinary character: copy it through.
            text.push(current);
            self.next_char();
            continue;
        }

        // Escape sequence: a backslash plus exactly one following character.
        let code = match self.peek_char(1) {
            Some(code) => code,
            None => {
                return Err("Unterminated string literal (ends with backslash)".to_string());
            }
        };
        // Both characters are consumed before the escape is validated.
        self.next_char();
        self.next_char();
        let resolved = match code {
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            '\\' | '\'' | '"' | '`' => code,
            _ => return Err(format!("Invalid escape sequence '\\{}'", code)),
        };
        text.push(resolved);
    }
}
/// Consumes and returns the character at the cursor, or returns `None`
/// (changing nothing) at end of input.
///
/// Consuming a newline moves to column 1 of the next line; consuming any
/// other character advances the column by one.
fn next_char(&mut self) -> Option<char> {
    let ch = *self.chars.get(self.i)?;
    self.i += 1;
    match ch {
        '\n' => {
            self.line += 1;
            self.col = 1;
        }
        _ => self.col += 1,
    }
    Some(ch)
}
/// Returns the character `offset` positions ahead of the cursor without
/// consuming anything (`offset == 0` is the character at the cursor), or
/// `None` if that position is past the end of the input.
fn peek_char(&self, offset: usize) -> Option<char> {
    self.chars.get(self.i + offset).copied()
}
}