mod token;
pub use token::{Token, TokenKind};
use crate::ast::Span;
use crate::errors::Diagnostic;
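/// Tokenizes `source`, returning the token stream (terminated by an `Eof` token)
/// or every lexical error collected along the way.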
pub fn tokenize(source: &str, filename: &str) -> Result<Vec<Token>, Vec<Diagnostic>> {
let lexer = Lexer::new(source, filename);
lexer.tokenize()
}
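/// Single-pass lexer that walks the source with a peekable `CharIndices`
/// iterator while tracking byte offset, line, and column for spans.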
struct Lexer<'a> {
source: &'a str,
filename: &'a str,
chars: std::iter::Peekable<std::str::CharIndices<'a>>,
tokens: Vec<Token>,
errors: Vec<Diagnostic>,
current_pos: usize,
line: usize,
col: usize,
}
impl<'a> Lexer<'a> {
fn new(source: &'a str, filename: &'a str) -> Self {
Self {
source,
filename,
chars: source.char_indices().peekable(),
tokens: Vec::new(),
errors: Vec::new(),
current_pos: 0,
line: 1,
col: 1,
}
}
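    /// Main scanning loop: dispatches on the next character, skipping
    /// whitespace and comments, and always appends a final `Eof` token.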
fn tokenize(mut self) -> Result<Vec<Token>, Vec<Diagnostic>> {
while let Some((pos, ch)) = self.peek() {
match ch {
' ' | '\t' | '\r' => {
self.advance();
}
'\n' => {
self.advance();
self.line += 1;
self.col = 1;
}
'/' => {
self.advance();
if self.check('/') {
self.skip_line_comment();
} else if self.check('*') {
self.skip_block_comment();
} else {
self.add_token(TokenKind::Slash, pos, self.current_pos);
}
}
'+' => self.single_char_token(TokenKind::Plus, pos),
'*' => self.single_char_token(TokenKind::Star, pos),
'-' => self.single_char_token(TokenKind::Minus, pos),
'(' => self.single_char_token(TokenKind::LParen, pos),
')' => self.single_char_token(TokenKind::RParen, pos),
'{' => self.single_char_token(TokenKind::LBrace, pos),
'}' => self.single_char_token(TokenKind::RBrace, pos),
'[' => self.single_char_token(TokenKind::LBracket, pos),
']' => self.single_char_token(TokenKind::RBracket, pos),
',' => self.single_char_token(TokenKind::Comma, pos),
':' => self.single_char_token(TokenKind::Colon, pos),
'.' => self.single_char_token(TokenKind::Dot, pos),
'=' => {
self.advance();
if self.check('>') {
self.advance();
self.add_token(TokenKind::Arrow, pos, self.current_pos);
} else if self.check('=') {
self.advance();
self.add_token(TokenKind::EqEq, pos, self.current_pos);
} else {
self.add_token(TokenKind::Eq, pos, self.current_pos);
}
}
'<' => {
self.advance();
if self.check('=') {
self.advance();
self.add_token(TokenKind::LtEq, pos, self.current_pos);
} else {
self.add_token(TokenKind::Lt, pos, self.current_pos);
}
}
'>' => {
self.advance();
if self.check('=') {
self.advance();
self.add_token(TokenKind::GtEq, pos, self.current_pos);
} else {
self.add_token(TokenKind::Gt, pos, self.current_pos);
}
}
'!' => {
self.advance();
if self.check('=') {
self.advance();
self.add_token(TokenKind::BangEq, pos, self.current_pos);
} else {
self.add_token(TokenKind::Bang, pos, self.current_pos);
}
}
'&' => {
self.advance();
if self.check('&') {
self.advance();
self.add_token(TokenKind::And, pos, self.current_pos);
} else {
self.error_at(pos, "expected '&&', found single '&'");
}
}
'|' => {
self.advance();
if self.check('|') {
self.advance();
self.add_token(TokenKind::Or, pos, self.current_pos);
} else {
self.error_at(pos, "expected '||', found single '|'");
}
}
'"' => self.string(pos),
'0'..='9' => self.number(pos),
'a'..='z' | 'A'..='Z' | '_' => self.identifier(pos),
_ => {
self.error_at(pos, &format!("unexpected character '{}'", ch));
self.advance();
}
}
}
self.tokens.push(Token {
kind: TokenKind::Eof,
span: Span::point(self.current_pos as u32, self.line, self.col),
lexeme: "".to_string(),
});
if self.errors.is_empty() {
Ok(self.tokens)
} else {
Err(self.errors)
}
}
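    /// Looks at the next character (and its byte offset) without consuming it.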
fn peek(&mut self) -> Option<(usize, char)> {
self.chars.peek().copied()
}
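    /// Consumes one character, advancing the byte offset by its UTF-8 length
    /// and the column by one.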
fn advance(&mut self) -> Option<char> {
if let Some((pos, ch)) = self.chars.next() {
self.current_pos = pos + ch.len_utf8();
self.col += 1;
Some(ch)
} else {
None
}
}
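    /// Returns true if the next character equals `expected`, without consuming it.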
fn check(&mut self, expected: char) -> bool {
self.peek().is_some_and(|(_, ch)| ch == expected)
}
fn single_char_token(&mut self, kind: TokenKind, start: usize) {
self.advance();
self.add_token(kind, start, self.current_pos);
}
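    /// Pushes a token covering `start..end`; the start column is derived from
    /// the current column, which assumes the token sits on a single line.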
fn add_token(&mut self, kind: TokenKind, start: usize, end: usize) {
let lexeme = self.source[start..end].to_string();
let span = Span::new(
start as u32,
end as u32,
self.line,
self.col - (end - start),
);
self.tokens.push(Token { kind, span, lexeme });
}
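    /// Skips to (but not past) the end of the line; the newline itself is
    /// handled by the main loop so line counting stays correct.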
fn skip_line_comment(&mut self) {
while let Some((_, ch)) = self.peek() {
if ch == '\n' {
break;
}
self.advance();
}
}
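    /// Skips a `/* ... */` comment, allowing nesting via a depth counter and
    /// reporting an error if the comment is still open at end of input.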
fn skip_block_comment(&mut self) {
        self.advance();
        let mut depth = 1;
while depth > 0 {
match self.advance() {
Some('/') if self.check('*') => {
self.advance();
depth += 1;
}
Some('*') if self.check('/') => {
self.advance();
depth -= 1;
}
Some('\n') => {
self.line += 1;
self.col = 1;
}
Some(_) => {}
                None => {
                    self.error_at(self.current_pos, "unterminated block comment");
                    return;
                }
}
}
}
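    /// Scans a double-quoted string literal, decoding the supported escape
    /// sequences (\n, \t, \r, \\, \"). Strings may not span lines.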
fn string(&mut self, start: usize) {
let start_line = self.line;
let start_col = self.col;
self.advance();
let mut value = String::new();
loop {
match self.advance() {
Some('"') => break,
Some('\\') => {
match self.advance() {
Some('n') => value.push('\n'),
Some('t') => value.push('\t'),
Some('r') => value.push('\r'),
Some('\\') => value.push('\\'),
Some('"') => value.push('"'),
Some(ch) => {
self.error_at(
self.current_pos - 1,
&format!("unknown escape sequence '\\{}'", ch),
);
}
None => {
self.error_at(start, "unterminated string literal");
return;
}
}
}
Some('\n') => {
self.error_at(
start,
"unterminated string literal (strings cannot span lines)",
);
self.line += 1;
self.col = 1;
return;
}
Some(ch) => value.push(ch),
None => {
self.error_at(start, "unterminated string literal");
return;
}
}
}
let span = Span::new(start as u32, self.current_pos as u32, start_line, start_col);
self.tokens.push(Token {
kind: TokenKind::String(value),
span,
lexeme: self.source[start..self.current_pos].to_string(),
});
}
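    /// Scans a numeric literal: an integer part optionally followed by a
    /// fractional part. The '.' is only consumed when a digit follows it,
    /// so `1.foo` still lexes as a number, a dot, and an identifier.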
fn number(&mut self, start: usize) {
while let Some((_, ch)) = self.peek() {
if ch.is_ascii_digit() {
self.advance();
} else {
break;
}
}
if self.check('.') {
let mut chars = self.chars.clone();
            chars.next();
            if let Some((_, ch)) = chars.peek() {
if ch.is_ascii_digit() {
                    self.advance();
                    while let Some((_, ch)) = self.peek() {
if ch.is_ascii_digit() {
self.advance();
} else {
break;
}
}
}
}
}
let lexeme = &self.source[start..self.current_pos];
let value: f64 = lexeme.parse().unwrap_or_else(|_| {
self.error_at(start, &format!("invalid number '{}'", lexeme));
0.0
});
let span = Span::new(
start as u32,
self.current_pos as u32,
self.line,
self.col - (self.current_pos - start),
);
self.tokens.push(Token {
kind: TokenKind::Number(value),
span,
lexeme: lexeme.to_string(),
});
}
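    /// Scans an identifier or keyword ([A-Za-z_][A-Za-z0-9_]*) and maps
    /// reserved words to their dedicated token kinds.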
fn identifier(&mut self, start: usize) {
while let Some((_, ch)) = self.peek() {
if ch.is_ascii_alphanumeric() || ch == '_' {
self.advance();
} else {
break;
}
}
let lexeme = &self.source[start..self.current_pos];
let kind = match lexeme {
"val" => TokenKind::Val,
"import" => TokenKind::Import,
"as" => TokenKind::As,
"unsafe" => TokenKind::Unsafe,
"module" => TokenKind::Module,
"true" => TokenKind::True,
"false" => TokenKind::False,
"if" => TokenKind::If,
"then" => TokenKind::Then,
"else" => TokenKind::Else,
"assert" => TokenKind::Assert,
"hcl" => TokenKind::Hcl,
_ => TokenKind::Ident(lexeme.to_string()),
};
let span = Span::new(
start as u32,
self.current_pos as u32,
self.line,
self.col - (self.current_pos - start),
);
self.tokens.push(Token {
kind,
span,
lexeme: lexeme.to_string(),
});
}
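    /// Records a lexical error at the given byte offset using the current
    /// line and column.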
fn error_at(&mut self, pos: usize, message: &str) {
self.errors.push(Diagnostic::error_at(
message.to_string(),
Span::point(pos as u32, self.line, self.col),
self.filename.to_string(),
));
}
}
#[cfg(test)]
mod tests {
use super::*;
fn lex(source: &str) -> Vec<TokenKind> {
tokenize(source, "test.hk")
.unwrap()
.into_iter()
.map(|t| t.kind)
.collect()
}
#[test]
fn test_basic_tokens() {
let tokens = lex("val x = 42");
assert_eq!(
tokens,
vec![
TokenKind::Val,
TokenKind::Ident("x".to_string()),
TokenKind::Eq,
TokenKind::Number(42.0),
TokenKind::Eof,
]
);
}
#[test]
fn test_string() {
let tokens = lex(r#""hello world""#);
assert_eq!(
tokens,
            vec![TokenKind::String("hello world".to_string()), TokenKind::Eof]
);
}
#[test]
fn test_arrow() {
let tokens = lex("x => y");
assert_eq!(
tokens,
vec![
TokenKind::Ident("x".to_string()),
TokenKind::Arrow,
TokenKind::Ident("y".to_string()),
TokenKind::Eof,
]
);
}
#[test]
fn test_member_access() {
let tokens = lex("S3.createBucket");
assert_eq!(
tokens,
vec![
TokenKind::Ident("S3".to_string()),
TokenKind::Dot,
TokenKind::Ident("createBucket".to_string()),
TokenKind::Eof,
]
);
}
#[test]
fn test_comments() {
let tokens = lex("val x = 1 // comment\nval y = 2");
assert_eq!(
tokens,
vec![
TokenKind::Val,
TokenKind::Ident("x".to_string()),
TokenKind::Eq,
TokenKind::Number(1.0),
TokenKind::Val,
TokenKind::Ident("y".to_string()),
TokenKind::Eq,
TokenKind::Number(2.0),
TokenKind::Eof,
]
);
}
#[test]
fn test_operators() {
let tokens = lex("1 + 2 - 3 * 4 / 5");
assert!(tokens.contains(&TokenKind::Plus));
assert!(tokens.contains(&TokenKind::Minus));
assert!(tokens.contains(&TokenKind::Star));
assert!(tokens.contains(&TokenKind::Slash));
}
#[test]
fn test_comparison_operators() {
let tokens = lex("a == b != c < d > e <= f >= g");
assert!(tokens.contains(&TokenKind::EqEq));
assert!(tokens.contains(&TokenKind::BangEq));
assert!(tokens.contains(&TokenKind::Lt));
assert!(tokens.contains(&TokenKind::Gt));
assert!(tokens.contains(&TokenKind::LtEq));
assert!(tokens.contains(&TokenKind::GtEq));
}
#[test]
fn test_logical_operators() {
let tokens = lex("a && b || !c");
assert!(tokens.contains(&TokenKind::And));
assert!(tokens.contains(&TokenKind::Or));
assert!(tokens.contains(&TokenKind::Bang));
}
#[test]
fn test_boolean_literals() {
let tokens = lex("true false");
assert!(tokens.contains(&TokenKind::True));
assert!(tokens.contains(&TokenKind::False));
}
#[test]
fn test_keywords() {
let tokens = lex("val import as unsafe module if then else hcl");
assert!(tokens.contains(&TokenKind::Val));
assert!(tokens.contains(&TokenKind::Import));
assert!(tokens.contains(&TokenKind::As));
assert!(tokens.contains(&TokenKind::Unsafe));
assert!(tokens.contains(&TokenKind::Module));
assert!(tokens.contains(&TokenKind::If));
assert!(tokens.contains(&TokenKind::Then));
assert!(tokens.contains(&TokenKind::Else));
assert!(tokens.contains(&TokenKind::Hcl));
}
#[test]
fn test_brackets_and_braces() {
let tokens = lex("[] {} ()");
assert!(tokens.contains(&TokenKind::LBracket));
assert!(tokens.contains(&TokenKind::RBracket));
assert!(tokens.contains(&TokenKind::LBrace));
assert!(tokens.contains(&TokenKind::RBrace));
assert!(tokens.contains(&TokenKind::LParen));
assert!(tokens.contains(&TokenKind::RParen));
}
#[test]
fn test_float_number() {
let tokens = lex("3.14");
assert_eq!(tokens, vec![TokenKind::Number(3.14), TokenKind::Eof]);
}
}