use crate::ast::Span;
use crate::error::{Error, Result};
use crate::parser::{Token, TokenKind};
/// Streaming tokenizer over a configuration source string.
///
/// Tracks a byte offset plus 1-based line/column so every token and error
/// can be reported with an accurate source position.
pub struct Lexer<'a> {
/// Full source text being lexed (borrowed for the lexer's lifetime).
input: &'a str,
/// Byte offset of the next unread character.
pos: usize,
/// 1-based line number of the next unread character.
line: usize,
/// 1-based column of the next unread character (counted in characters).
col: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input` (line 1, column 1).
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            col: 1,
        }
    }

    /// Skips leading whitespace and lexes the next token.
    ///
    /// Once the input is exhausted, every call returns an `Eof` token.
    ///
    /// # Errors
    ///
    /// Returns a syntax error for an unexpected character, an unterminated
    /// string literal, or a malformed `$` variable reference.
    pub fn next_token(&mut self) -> Result<Token> {
        self.skip_whitespace();
        if self.is_eof() {
            return Ok(self.make_token(TokenKind::Eof));
        }
        // Remember where the token starts so its span covers the whole lexeme.
        let start_pos = self.pos;
        let start_line = self.line;
        let start_col = self.col;
        let ch = self.current_char();
        let kind = match ch {
            '#' => self.lex_comment(),
            '{' => {
                self.advance();
                TokenKind::LeftBrace
            }
            '}' => {
                self.advance();
                TokenKind::RightBrace
            }
            ';' => {
                self.advance();
                TokenKind::Semicolon
            }
            // A bare '=' is lexed as a one-character word token.
            '=' => {
                self.advance();
                TokenKind::Word("=".to_string())
            }
            '"' => self.lex_string('"')?,
            '\'' => self.lex_string('\'')?,
            '$' => self.lex_variable()?,
            _ if ch.is_ascii_digit() => self.lex_number(),
            _ if is_word_start(ch) => self.lex_word(),
            _ => {
                return Err(Error::syntax(
                    format!("unexpected character '{ch}'"),
                    self.line,
                    self.col,
                    Some("valid token".to_string()),
                    Some(format!("'{ch}'")),
                ));
            }
        };
        let span = Span::new(start_pos, self.pos, start_line, start_col);
        Ok(Token::new(kind, span))
    }

    /// Lexes the entire input into a vector of tokens.
    ///
    /// The returned vector always ends with an `Eof` token.
    ///
    /// # Errors
    ///
    /// Propagates the first error produced by [`Self::next_token`].
    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    /// Consumes consecutive whitespace characters.
    ///
    /// `advance` now maintains line/column bookkeeping for newlines, so no
    /// special case is needed here.
    fn skip_whitespace(&mut self) {
        while !self.is_eof() && self.current_char().is_whitespace() {
            self.advance();
        }
    }

    /// Lexes a `#` comment running to end of line; the leading `#` and
    /// surrounding whitespace are stripped from the stored text.
    fn lex_comment(&mut self) -> TokenKind {
        self.advance(); // skip '#'
        let start = self.pos;
        while !self.is_eof() && self.current_char() != '\n' {
            self.advance();
        }
        let comment = self.input[start..self.pos].trim().to_string();
        TokenKind::Comment(comment)
    }

    /// Lexes a string delimited by `quote`. Backslash escapes are skipped
    /// over but NOT decoded: the stored value keeps escape sequences verbatim.
    ///
    /// # Errors
    ///
    /// Fails on an unescaped newline or end of input before the closing quote.
    fn lex_string(&mut self, quote: char) -> Result<TokenKind> {
        self.advance(); // skip opening quote
        let start = self.pos;
        let mut escaped = false;
        while !self.is_eof() {
            let ch = self.current_char();
            if escaped {
                // Any character may follow a backslash, including the quote
                // and a newline (line continuation).
                escaped = false;
                self.advance();
                continue;
            }
            if ch == '\\' {
                escaped = true;
                self.advance();
                continue;
            }
            if ch == quote {
                let value = self.input[start..self.pos].to_string();
                self.advance(); // skip closing quote
                return Ok(TokenKind::String(value));
            }
            if ch == '\n' {
                return Err(Error::syntax(
                    "unterminated string literal",
                    self.line,
                    self.col,
                    Some("closing quote".to_string()),
                    Some("newline".to_string()),
                ));
            }
            self.advance();
        }
        Err(Error::unexpected_eof("closing quote", self.line))
    }

    /// Lexes `$name` or `${name}` into a `Variable` token (without the `$`).
    ///
    /// # Errors
    ///
    /// Fails when the braced form is missing its closing `}`, or when no
    /// word characters follow a bare `$`.
    fn lex_variable(&mut self) -> Result<TokenKind> {
        self.advance(); // skip '$'
        let start = self.pos;
        if !self.is_eof() && self.current_char() == '{' {
            self.advance(); // skip '{'
            let name_start = self.pos;
            while !self.is_eof() && self.current_char() != '}' {
                self.advance();
            }
            if self.is_eof() {
                return Err(Error::unexpected_eof("'}'", self.line));
            }
            let name = self.input[name_start..self.pos].to_string();
            self.advance(); // skip '}'
            return Ok(TokenKind::Variable(name));
        }
        while !self.is_eof() && is_word_char(self.current_char()) {
            self.advance();
        }
        let name = self.input[start..self.pos].to_string();
        if name.is_empty() {
            return Err(Error::syntax(
                "expected variable name after '$'",
                self.line,
                self.col,
                Some("variable name".to_string()),
                None,
            ));
        }
        Ok(TokenKind::Variable(name))
    }

    /// Lexes a run of ASCII digits and dots (accepts integers, decimals and
    /// dotted forms such as `1.2.3.4`); no numeric validation is performed.
    fn lex_number(&mut self) -> TokenKind {
        let start = self.pos;
        while !self.is_eof() && (self.current_char().is_ascii_digit() || self.current_char() == '.')
        {
            self.advance();
        }
        let number = self.input[start..self.pos].to_string();
        TokenKind::Number(number)
    }

    /// Lexes a bare word (directive names, paths, patterns, ...).
    fn lex_word(&mut self) -> TokenKind {
        let start = self.pos;
        while !self.is_eof() && is_word_char(self.current_char()) {
            self.advance();
        }
        let word = self.input[start..self.pos].to_string();
        TokenKind::Word(word)
    }

    /// Builds a zero-width token at the current position (used for `Eof`).
    fn make_token(&self, kind: TokenKind) -> Token {
        Token::new(kind, Span::new(self.pos, self.pos, self.line, self.col))
    }

    /// Returns the character at the current position, or `'\0'` at EOF.
    fn current_char(&self) -> char {
        self.input[self.pos..].chars().next().unwrap_or('\0')
    }

    /// True when the whole input has been consumed.
    fn is_eof(&self) -> bool {
        self.pos >= self.input.len()
    }

    /// Consumes one character, updating byte offset, line and column.
    ///
    /// Fix: a consumed '\n' now bumps `line` and resets `col`. Previously
    /// newlines were only handled inside `skip_whitespace`, so a newline
    /// consumed anywhere else (e.g. an escaped newline inside a string
    /// literal) left the reported line of every following token and error
    /// wrong.
    fn advance(&mut self) {
        if !self.is_eof() {
            let ch = self.current_char();
            self.pos += ch.len_utf8();
            if ch == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
        }
    }
}
/// Returns true if `ch` may begin a bare word token.
///
/// ASCII letters plus a set of punctuation characters that the word lexer
/// accepts in leading position ('_', '/', '.', '*', '^', '~', '\\').
fn is_word_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || matches!(ch, '_' | '/' | '.' | '*' | '^' | '~' | '\\')
}
/// Returns true if `ch` may appear inside a bare word token.
///
/// A superset of [`is_word_start`]: also allows digits, '-', ':', '=' and
/// '$', so words can continue through things like `key=value` or `a:b`.
fn is_word_char(ch: char) -> bool {
    if ch.is_ascii_alphanumeric() {
        return true;
    }
    matches!(
        ch,
        '_' | '-' | '/' | '.' | ':' | '=' | '*' | '^' | '~' | '\\' | '$'
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` to completion, panicking on any lexer error.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().unwrap()
    }

    /// Shorthand for building an owned `Word` token kind.
    fn word(s: &str) -> TokenKind {
        TokenKind::Word(s.to_string())
    }

    #[test]
    fn test_lex_simple_directive() {
        let tokens = lex("user nginx;");
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, word("user"));
        assert_eq!(tokens[1].kind, word("nginx"));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
        assert_eq!(tokens[3].kind, TokenKind::Eof);
    }

    #[test]
    fn test_lex_block() {
        let tokens = lex("server { listen 80; }");
        assert_eq!(tokens[0].kind, word("server"));
        assert_eq!(tokens[1].kind, TokenKind::LeftBrace);
        assert_eq!(tokens[2].kind, word("listen"));
        assert_eq!(tokens[3].kind, TokenKind::Number("80".to_string()));
        assert_eq!(tokens[4].kind, TokenKind::Semicolon);
        assert_eq!(tokens[5].kind, TokenKind::RightBrace);
    }

    #[test]
    fn test_lex_string() {
        let tokens = lex(r#"root "/var/www";"#);
        assert_eq!(tokens[0].kind, word("root"));
        assert_eq!(tokens[1].kind, TokenKind::String("/var/www".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
    }

    #[test]
    fn test_lex_variable() {
        let tokens = lex("set $host;");
        assert_eq!(tokens[0].kind, word("set"));
        assert_eq!(tokens[1].kind, TokenKind::Variable("host".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
    }

    #[test]
    fn test_lex_comment() {
        let tokens = lex("# This is a comment\nuser nginx;");
        assert_eq!(
            tokens[0].kind,
            TokenKind::Comment("This is a comment".to_string())
        );
        assert_eq!(tokens[1].kind, word("user"));
    }

    #[test]
    fn test_position_tracking() {
        let tokens = lex("server\n{\n listen 80;\n}");
        assert_eq!(tokens[0].span.line, 1);
        assert_eq!(tokens[1].span.line, 2);
        assert_eq!(tokens[2].span.line, 3);
    }

    #[test]
    fn test_unterminated_string() {
        // Missing closing quote must surface as an error, not a panic/hang.
        assert!(Lexer::new(r#"root "/var/www"#).tokenize().is_err());
    }
}