use crate::error::LexerError;
use crate::token::{Span, Token, TokenKind};
/// Hand-written lexer that turns an input string into a stream of `Token`s,
/// tracking byte offsets and 1-based line/column positions for diagnostics.
pub struct Lexer<'a> {
    // Full source text; also used for non-consuming lookahead by byte offset
    // (see `peek_is_colon_ahead`).
    input: &'a str,
    // Character iterator paired with byte positions; `peekable` provides the
    // one-character lookahead the scanner needs.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    // Byte offset just past the last consumed character (used as span end).
    current_pos: usize,
    // 1-based line of the next character to be consumed.
    line: usize,
    // 1-based column of the next character to be consumed.
    column: usize,
    // True at the beginning of a line (leading spaces/tabs do not clear it);
    // gates directive and agent-id recognition in `next_token`.
    at_line_start: bool,
    // True immediately after an '=' token; enables recognition of directive
    // values such as ERROR/TRUNCATE. Cleared at each newline.
    after_equals: bool,
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str) -> Self {
Self {
input,
chars: input.char_indices().peekable(),
current_pos: 0,
line: 1,
column: 1,
at_line_start: true,
after_equals: false,
}
}
pub fn next_token(&mut self) -> Result<Token, LexerError> {
self.skip_comment();
let start_pos = self.current_pos;
let start_line = self.line;
let start_column = self.column;
let Some((_pos, ch)) = self.advance() else {
return Ok(Token::new(
TokenKind::Eof,
Span::new(start_pos, start_pos, start_line, start_column),
));
};
let kind = match ch {
' ' | '\t' => {
while let Some(&(_, c)) = self.chars.peek() {
if c == ' ' || c == '\t' {
self.advance();
} else {
break;
}
}
TokenKind::Space
}
'\n' => {
self.line += 1;
self.column = 1;
self.at_line_start = true;
self.after_equals = false; TokenKind::Newline
}
'\r' => {
if self.peek_char() == Some('\n') {
self.advance();
}
self.line += 1;
self.column = 1;
self.at_line_start = true;
self.after_equals = false; TokenKind::Newline
}
's' => {
self.at_line_start = false;
TokenKind::Straight
}
'r' => {
self.at_line_start = false;
TokenKind::Right
}
'l' => {
self.at_line_start = false;
TokenKind::Left
}
':' => {
self.at_line_start = false;
TokenKind::Colon
}
'(' => {
self.at_line_start = false;
TokenKind::LParen
}
')' => {
self.at_line_start = false;
TokenKind::RParen
}
',' => {
self.at_line_start = false;
TokenKind::Comma
}
'+' => {
self.at_line_start = false;
TokenKind::Plus
}
'-' => {
self.at_line_start = false;
TokenKind::Minus
}
'=' => {
self.at_line_start = false;
self.after_equals = true;
TokenKind::Equals
}
c if c.is_ascii_lowercase() => {
self.at_line_start = false;
TokenKind::Ident(c)
}
c if c.is_ascii_uppercase() => {
let was_at_line_start = self.at_line_start;
let was_after_equals = self.after_equals;
self.at_line_start = false;
self.after_equals = false;
let next_char = self.peek_char();
let could_be_directive = was_at_line_start
&& (next_char == Some('_')
|| next_char.map(|c| c.is_ascii_uppercase()).unwrap_or(false));
if could_be_directive {
let word = self.read_uppercase_word(c);
match word.as_str() {
"MAX_STEP" | "MAX_DEPTH" | "MAX_MEMORY" | "ON_LIMIT" => {
TokenKind::Directive(word)
}
_ => {
return Err(LexerError::new(
format!("Unknown directive '{}' (E009)", word),
start_line,
start_column,
));
}
}
} else if was_after_equals {
let next_char = self.peek_char();
if next_char.map(|c| c.is_ascii_uppercase()).unwrap_or(false) {
let word = self.read_uppercase_word(c);
match word.as_str() {
"ERROR" | "TRUNCATE" => TokenKind::DirectiveValue(word),
_ => {
return Err(LexerError::new(
format!("Unknown directive value '{}' (E009)", word),
start_line,
start_column,
));
}
}
} else {
TokenKind::Param(c)
}
} else {
TokenKind::Param(c)
}
}
c if c.is_ascii_digit() => {
let was_at_line_start = self.at_line_start;
self.after_equals = false;
self.at_line_start = false;
let num = self.read_number(c);
if was_at_line_start {
if self.peek_is_colon_ahead() {
TokenKind::AgentId(num)
} else {
TokenKind::Number(num as i32)
}
} else {
TokenKind::Number(num as i32)
}
}
_ => {
return Err(LexerError::new(
format!("Unexpected character '{}'", ch),
start_line,
start_column,
));
}
};
let span = Span::new(start_pos, self.current_pos, start_line, start_column);
Ok(Token::new(kind, span))
}
pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
let mut tokens = Vec::new();
loop {
let token = self.next_token()?;
let is_eof = token.kind == TokenKind::Eof;
tokens.push(token);
if is_eof {
break;
}
}
Ok(tokens)
}
fn skip_comment(&mut self) {
if let Some(&(_, ch)) = self.chars.peek() {
if ch == '#' {
while let Some(&(_, c)) = self.chars.peek() {
if c == '\n' || c == '\r' {
break;
}
self.advance();
}
} else if ch == '/' {
let saved_pos = self.current_pos;
let saved_line = self.line;
let saved_column = self.column;
self.advance();
if let Some(&(_, '/')) = self.chars.peek() {
while let Some(&(_, c)) = self.chars.peek() {
if c == '\n' || c == '\r' {
break;
}
self.advance();
}
} else {
self.current_pos = saved_pos;
self.line = saved_line;
self.column = saved_column;
self.chars = self.input.char_indices().peekable();
while let Some(&(pos, _)) = self.chars.peek() {
if pos >= saved_pos {
break;
}
self.chars.next();
}
}
}
}
}
fn peek_char(&mut self) -> Option<char> {
self.chars.peek().map(|&(_, ch)| ch)
}
fn peek_is_colon_ahead(&self) -> bool {
self.input[self.current_pos..].starts_with(':')
}
fn advance(&mut self) -> Option<(usize, char)> {
if let Some((pos, ch)) = self.chars.next() {
self.current_pos = pos + ch.len_utf8();
if ch != '\n' && ch != '\r' {
self.column += 1;
}
Some((pos, ch))
} else {
None
}
}
fn read_number(&mut self, first_digit: char) -> u32 {
let mut num = first_digit.to_digit(10).unwrap();
while let Some(&(_, ch)) = self.chars.peek() {
if let Some(digit) = ch.to_digit(10) {
self.advance();
num = num * 10 + digit;
} else {
break;
}
}
num
}
fn read_uppercase_word(&mut self, first_char: char) -> String {
let mut word = String::new();
word.push(first_char);
while let Some(&(_, ch)) = self.chars.peek() {
if ch.is_ascii_uppercase() || ch == '_' {
self.advance();
word.push(ch);
} else {
break;
}
}
word
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches the next token and returns only its kind, panicking on error.
    fn next_kind(lx: &mut Lexer) -> TokenKind {
        lx.next_token().unwrap().kind
    }

    /// Asserts that the lexer yields exactly `expected` as its next kinds.
    fn expect_kinds(lx: &mut Lexer, expected: &[TokenKind]) {
        for want in expected {
            assert_eq!(&next_kind(lx), want);
        }
    }

    #[test]
    fn test_basic_commands() {
        let mut lx = Lexer::new("srl");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Straight,
                TokenKind::Right,
                TokenKind::Left,
                TokenKind::Eof,
            ],
        );
    }

    #[test]
    fn test_agent_line() {
        let mut lx = Lexer::new("0: srl");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::AgentId(0),
                TokenKind::Colon,
                TokenKind::Space,
                TokenKind::Straight,
                TokenKind::Right,
                TokenKind::Left,
            ],
        );
    }

    #[test]
    fn test_multi_digit_agent_id() {
        let mut lx = Lexer::new("12: s");
        assert_eq!(next_kind(&mut lx), TokenKind::AgentId(12));
    }

    #[test]
    fn test_identifier() {
        let mut lx = Lexer::new("x:ss");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Ident('x'),
                TokenKind::Colon,
                TokenKind::Straight,
                TokenKind::Straight,
            ],
        );
    }

    #[test]
    fn test_function_definition() {
        let mut lx = Lexer::new("f(X):XXX");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Ident('f'),
                TokenKind::LParen,
                TokenKind::Param('X'),
                TokenKind::RParen,
                TokenKind::Colon,
                TokenKind::Param('X'),
                TokenKind::Param('X'),
                TokenKind::Param('X'),
            ],
        );
    }

    #[test]
    fn test_newline() {
        let mut lx = Lexer::new("s\nr");
        expect_kinds(
            &mut lx,
            &[TokenKind::Straight, TokenKind::Newline, TokenKind::Right],
        );
    }

    #[test]
    fn test_comment() {
        let mut lx = Lexer::new("s # comment\nr");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Straight,
                TokenKind::Space,
                TokenKind::Newline,
                TokenKind::Right,
            ],
        );
    }

    #[test]
    fn test_span_tracking() {
        let mut lx = Lexer::new("0: s");
        let tok = lx.next_token().unwrap();
        assert_eq!(tok.span.line, 1);
        assert_eq!(tok.span.column, 1);
    }

    #[test]
    fn test_number_in_function_call() {
        let mut lx = Lexer::new("0: a(4)");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::AgentId(0),
                TokenKind::Colon,
                TokenKind::Space,
                TokenKind::Ident('a'),
                TokenKind::LParen,
                TokenKind::Number(4),
                TokenKind::RParen,
            ],
        );
    }

    #[test]
    fn test_comma_and_operators() {
        let mut lx = Lexer::new("a(X,Y)");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Ident('a'),
                TokenKind::LParen,
                TokenKind::Param('X'),
                TokenKind::Comma,
                TokenKind::Param('Y'),
                TokenKind::RParen,
            ],
        );
        let mut lx = Lexer::new("a(X-1)");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Ident('a'),
                TokenKind::LParen,
                TokenKind::Param('X'),
                TokenKind::Minus,
                TokenKind::Number(1),
                TokenKind::RParen,
            ],
        );
    }

    #[test]
    fn test_single_slash_position_tracking() {
        let mut lx = Lexer::new("/s");
        let err = lx.next_token().unwrap_err();
        assert_eq!(err.line, 1);
        assert_eq!(err.column, 1);
    }

    #[test]
    fn test_single_slash_span_after_restore() {
        let mut lx = Lexer::new("s/r");
        let tok = lx.next_token().unwrap();
        assert_eq!(tok.kind, TokenKind::Straight);
        assert_eq!(tok.span.start, 0);
        assert_eq!(tok.span.end, 1);
        let err = lx.next_token().unwrap_err();
        assert_eq!(err.column, 2, "Error column should be 2");
        assert_eq!(err.line, 1);
    }

    #[test]
    fn test_position_after_single_slash_in_middle() {
        let mut lx = Lexer::new("ss/r");
        // Two 's' tokens at byte offsets 0 and 1.
        for start in 0..2 {
            let tok = lx.next_token().unwrap();
            assert_eq!(tok.kind, TokenKind::Straight);
            assert_eq!(tok.span.start, start);
            assert_eq!(tok.span.end, start + 1);
        }
        let err = lx.next_token().unwrap_err();
        assert_eq!(
            err.column, 3,
            "Error column should be 3 for '/' at position 2"
        );
    }

    #[test]
    fn test_span_corruption_after_slash_restore() {
        let mut lx = Lexer::new("sss/sss");
        for i in 0..3 {
            let tok = lx.next_token().unwrap();
            assert_eq!(tok.kind, TokenKind::Straight);
            assert_eq!(tok.span.start, i, "Token {} should start at {}", i, i);
            assert_eq!(tok.span.end, i + 1, "Token {} should end at {}", i, i + 1);
        }
        let _ = lx.next_token();
    }

    #[test]
    fn test_single_slash_followed_by_valid_code() {
        let mut lx = Lexer::new("s/r");
        let tok = lx.next_token().unwrap();
        assert_eq!(tok.kind, TokenKind::Straight);
        assert_eq!(tok.span.start, 0);
        assert_eq!(tok.span.end, 1);
        assert!(lx.next_token().is_err());
    }

    #[test]
    fn test_double_slash_comment() {
        let mut lx = Lexer::new("s // comment\nr");
        expect_kinds(
            &mut lx,
            &[TokenKind::Straight, TokenKind::Space, TokenKind::Newline],
        );
        let tok = lx.next_token().unwrap();
        assert_eq!(tok.kind, TokenKind::Right);
        assert_eq!(tok.span.line, 2);
    }

    #[test]
    fn test_span_end_position() {
        let mut lx = Lexer::new("srl");
        let expected = [
            (TokenKind::Straight, 0, 1),
            (TokenKind::Right, 1, 2),
            (TokenKind::Left, 2, 3),
        ];
        for (kind, start, end) in expected {
            let tok = lx.next_token().unwrap();
            assert_eq!(tok.kind, kind);
            assert_eq!(tok.span.start, start);
            assert_eq!(tok.span.end, end);
        }
    }

    #[test]
    fn test_directive_max_step() {
        let mut lx = Lexer::new("MAX_STEP=1000");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("MAX_STEP".to_string()),
                TokenKind::Equals,
                TokenKind::Number(1000),
            ],
        );
    }

    #[test]
    fn test_directive_on_limit_error() {
        let mut lx = Lexer::new("ON_LIMIT=ERROR");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("ON_LIMIT".to_string()),
                TokenKind::Equals,
                TokenKind::DirectiveValue("ERROR".to_string()),
            ],
        );
    }

    #[test]
    fn test_directive_on_limit_truncate() {
        let mut lx = Lexer::new("ON_LIMIT=TRUNCATE");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("ON_LIMIT".to_string()),
                TokenKind::Equals,
                TokenKind::DirectiveValue("TRUNCATE".to_string()),
            ],
        );
    }

    #[test]
    fn test_unknown_directive_error() {
        let mut lx = Lexer::new("MAX_STEPS=100");
        let err = lx.next_token().unwrap_err();
        assert!(err.message.contains("E009"));
    }

    #[test]
    fn test_directive_with_code() {
        let mut lx = Lexer::new("MAX_STEP=100\n0: srl");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("MAX_STEP".to_string()),
                TokenKind::Equals,
                TokenKind::Number(100),
                TokenKind::Newline,
                TokenKind::AgentId(0),
                TokenKind::Colon,
            ],
        );
    }

    #[test]
    fn test_agent_id_requires_colon() {
        let mut lx = Lexer::new("0: srl");
        expect_kinds(&mut lx, &[TokenKind::AgentId(0), TokenKind::Colon]);
    }

    #[test]
    fn test_line_start_number_without_colon_is_number() {
        let mut lx = Lexer::new("0\nsrl");
        expect_kinds(&mut lx, &[TokenKind::Number(0), TokenKind::Newline]);
    }

    #[test]
    fn test_space_before_colon_is_number() {
        let mut lx = Lexer::new("0 : srl");
        expect_kinds(
            &mut lx,
            &[TokenKind::Number(0), TokenKind::Space, TokenKind::Colon],
        );
    }

    #[test]
    fn test_multi_digit_agent_id_requires_colon() {
        let mut lx = Lexer::new("123: srl");
        assert_eq!(next_kind(&mut lx), TokenKind::AgentId(123));
    }

    #[test]
    fn test_multi_digit_number_without_colon() {
        let mut lx = Lexer::new("123\nsrl");
        assert_eq!(next_kind(&mut lx), TokenKind::Number(123));
    }

    #[test]
    fn test_after_equals_reset_on_newline() {
        let mut lx = Lexer::new("ON_LIMIT=\nERROR");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("ON_LIMIT".to_string()),
                TokenKind::Equals,
                TokenKind::Newline,
            ],
        );
        // After the newline, "ERROR" is treated as a line-start uppercase
        // word (directive candidate) and must fail as unknown.
        assert!(lx.next_token().is_err());
    }

    #[test]
    fn test_directive_value_same_line() {
        let mut lx = Lexer::new("ON_LIMIT=ERROR");
        expect_kinds(
            &mut lx,
            &[
                TokenKind::Directive("ON_LIMIT".to_string()),
                TokenKind::Equals,
                TokenKind::DirectiveValue("ERROR".to_string()),
            ],
        );
    }
}