use crate::tokens::{keyword_type, Token, TokenType};
/// Error reported by [`Lexer`] when the source text cannot be tokenized.
///
/// `line` and `column` identify the source position the error refers to; the
/// lexer starts counting both at 1.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Line the error refers to.
    pub line: u32,
    /// Column (counted in `char`s) the error refers to.
    pub column: u32,
}

impl std::fmt::Display for LexerError {
    /// Formats the error as `<message> (line <line>, column <column>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{} (line {}, column {})",
            self.message, self.line, self.column
        )
    }
}

// Lets callers propagate lexer failures with `?` into `Box<dyn Error>` and
// similar error-erasing return types.
impl std::error::Error for LexerError {}
/// Single-pass scanner that turns source text into a `Vec<Token>`.
///
/// Build one with `Lexer::new` and consume it with `tokenize` or
/// `tokenize_with`.
pub struct Lexer {
    /// The source text split into `char`s; indexed by `pos`.
    source: Vec<char>,
    /// File name passed to `new`; stored but not read anywhere in this file.
    _filename: String,
    /// Index into `source` of the next character to be consumed.
    pos: usize,
    /// Line of the next character; starts at 1 and grows on every `'\n'`.
    line: u32,
    /// Column of the next character; starts at 1 and resets to 1 after `'\n'`.
    column: u32,
    /// Tokens emitted so far, in source order.
    tokens: Vec<Token>,
    /// When `true`, comments are consumed but no comment tokens are emitted.
    strip_comments: bool,
}
impl Lexer {
    /// Creates a lexer over `source`, positioned at line 1, column 1.
    ///
    /// `filename` is stored on the lexer but is not consulted while lexing.
    pub fn new(source: &str, filename: &str) -> Self {
        Lexer {
            source: source.chars().collect(),
            _filename: filename.to_string(),
            pos: 0,
            line: 1,
            column: 1,
            tokens: Vec::new(),
            strip_comments: false,
        }
    }

    /// Tokenizes the whole source, keeping comment tokens in the output.
    ///
    /// Equivalent to `tokenize_with(false)`.
    ///
    /// # Errors
    /// Returns a [`LexerError`] for the first unrecognised or malformed input.
    pub fn tokenize(self) -> Result<Vec<Token>, LexerError> {
        self.tokenize_with(false)
    }

    /// Tokenizes the whole source and appends a final `Eof` token.
    ///
    /// When `strip_comments` is `true`, comments are still consumed (and an
    /// unterminated block comment is still an error) but no comment tokens
    /// are emitted.
    ///
    /// # Errors
    /// Returns a [`LexerError`] for an unterminated block comment or string,
    /// an unterminated escape sequence, a decimal point not followed by a
    /// digit, a lone `!`, or any character that starts no token.
    pub fn tokenize_with(mut self, strip_comments: bool) -> Result<Vec<Token>, LexerError> {
        self.strip_comments = strip_comments;
        loop {
            self.consume_trivia()?;
            if self.at_end() {
                break;
            }
            self.scan_token()?;
        }
        let (line, column) = (self.line, self.column);
        self.emit(TokenType::Eof, "", line, column);
        Ok(self.tokens)
    }

    /// Returns `true` once every character of the source has been consumed.
    fn at_end(&self) -> bool {
        self.pos >= self.source.len()
    }

    /// Returns the next unconsumed character, or `'\0'` at end of input.
    fn peek(&self) -> char {
        self.source.get(self.pos).copied().unwrap_or('\0')
    }

    /// Returns the character after the next one, or `'\0'` if there is none.
    fn peek_next(&self) -> char {
        self.source.get(self.pos + 1).copied().unwrap_or('\0')
    }

    /// Returns `true` when the lexer sits on a line terminator: `\n`, or the
    /// `\r` of a `\r\n` pair. A lone `\r` is not treated as a terminator.
    fn at_line_end(&self) -> bool {
        match self.peek() {
            '\n' => true,
            '\r' => self.peek_next() == '\n',
            _ => false,
        }
    }

    /// Consumes and returns the next character, updating `line`/`column`.
    ///
    /// Callers must ensure the lexer is not at end of input; indexing past
    /// the end of `source` panics.
    fn advance(&mut self) -> char {
        let ch = self.source[self.pos];
        self.pos += 1;
        if ch == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        ch
    }

    /// Consumes the next character only if it equals `expected`.
    fn match_char(&mut self, expected: char) -> bool {
        if self.source.get(self.pos) == Some(&expected) {
            self.advance();
            true
        } else {
            false
        }
    }

    /// Appends a token with the given kind, text and start position.
    fn emit(&mut self, ttype: TokenType, value: &str, line: u32, column: u32) {
        self.tokens.push(Token {
            ttype,
            value: value.to_string(),
            line,
            column,
        });
    }

    /// Emits `two` if the next character is `second` (consuming it),
    /// otherwise emits `one`. Used for operators such as `<` / `<=`.
    fn emit_one_or_two(
        &mut self,
        second: char,
        two: (TokenType, &str),
        one: (TokenType, &str),
        line: u32,
        col: u32,
    ) {
        let (ttype, text) = if self.match_char(second) { two } else { one };
        self.emit(ttype, text, line, col);
    }

    /// Skips whitespace and consumes comments until real token text (or end
    /// of input) is reached.
    fn consume_trivia(&mut self) -> Result<(), LexerError> {
        while !self.at_end() {
            match (self.peek(), self.peek_next()) {
                (' ' | '\t' | '\r' | '\n', _) => {
                    self.advance();
                }
                ('/', '/') => self.consume_line_comment(),
                ('/', '*') => self.consume_block_comment()?,
                _ => break,
            }
        }
        Ok(())
    }

    /// Consumes a `//` comment up to (not including) the line terminator.
    ///
    /// `///` (but not `////`) is an outer doc comment and `//!` an inner doc
    /// comment; everything else is a regular line comment. The emitted token
    /// value is the full comment text including its marker.
    fn consume_line_comment(&mut self) {
        let (line, col) = (self.line, self.column);
        self.advance(); // '/'
        self.advance(); // '/'
        let is_outer_doc = self.peek() == '/' && self.peek_next() != '/';
        let is_inner_doc = self.peek() == '!';
        if is_outer_doc || is_inner_doc {
            self.advance();
        }
        let body_start = self.pos;
        // Stop before "\r\n" as well as "\n" so that comment text from CRLF
        // sources does not carry a trailing '\r'; the terminator itself is
        // consumed later as ordinary whitespace.
        while !self.at_end() && !self.at_line_end() {
            self.advance();
        }
        if self.strip_comments {
            return;
        }
        let body: String = self.source[body_start..self.pos].iter().collect();
        let (ttype, marker) = if is_outer_doc {
            (TokenType::DocLineComment, "///")
        } else if is_inner_doc {
            (TokenType::InnerDocLineComment, "//!")
        } else {
            (TokenType::LineComment, "//")
        };
        self.emit(ttype, &format!("{marker}{body}"), line, col);
    }

    /// Consumes a `/* ... */` comment. Block comments do not nest.
    ///
    /// `/**` is an outer doc comment unless it is immediately followed by `/`
    /// (the empty comment `/**/`) or by another `*` (a `/***` banner, treated
    /// as regular just like `////` is for line comments). `/*!` is an inner
    /// doc comment. The emitted token value is the full comment text.
    ///
    /// # Errors
    /// Returns "Unterminated block comment", positioned at the opening `/*`,
    /// if the input ends before `*/`.
    fn consume_block_comment(&mut self) -> Result<(), LexerError> {
        let (line, col) = (self.line, self.column);
        self.advance(); // '/'
        self.advance(); // '*'
        let is_outer_doc = self.peek() == '*' && !matches!(self.peek_next(), '/' | '*');
        let is_inner_doc = self.peek() == '!';
        if is_outer_doc || is_inner_doc {
            self.advance();
        }
        let body_start = self.pos;
        while !self.at_end() {
            if self.peek() == '*' && self.peek_next() == '/' {
                let body_end = self.pos;
                self.advance(); // '*'
                self.advance(); // '/'
                if !self.strip_comments {
                    let body: String = self.source[body_start..body_end].iter().collect();
                    let (ttype, opener) = if is_outer_doc {
                        (TokenType::DocBlockComment, "/**")
                    } else if is_inner_doc {
                        (TokenType::InnerDocBlockComment, "/*!")
                    } else {
                        (TokenType::BlockComment, "/*")
                    };
                    self.emit(ttype, &format!("{opener}{body}*/"), line, col);
                }
                return Ok(());
            }
            self.advance();
        }
        Err(LexerError {
            message: "Unterminated block comment".to_string(),
            line,
            column: col,
        })
    }

    /// Scans exactly one token starting at the current position.
    ///
    /// Must only be called when the lexer is not at end of input.
    ///
    /// # Errors
    /// Returns a [`LexerError`] for a lone `!`, for a character that starts
    /// no token, or for any error from string/number scanning.
    fn scan_token(&mut self) -> Result<(), LexerError> {
        let line = self.line;
        let col = self.column;
        let ch = self.advance();
        match ch {
            '{' => self.emit(TokenType::LBrace, "{", line, col),
            '}' => self.emit(TokenType::RBrace, "}", line, col),
            '(' => self.emit(TokenType::LParen, "(", line, col),
            ')' => self.emit(TokenType::RParen, ")", line, col),
            '[' => self.emit(TokenType::LBracket, "[", line, col),
            ']' => self.emit(TokenType::RBracket, "]", line, col),
            ':' => self.emit(TokenType::Colon, ":", line, col),
            ',' => self.emit(TokenType::Comma, ",", line, col),
            '?' => self.emit(TokenType::Question, "?", line, col),
            '@' => self.emit(TokenType::At, "@", line, col),
            '+' => self.emit(TokenType::Plus, "+", line, col),
            '*' => self.emit(TokenType::Star, "*", line, col),
            // Comments were already consumed as trivia, so a '/' seen here is
            // always the operator.
            '/' => self.emit(TokenType::Slash, "/", line, col),
            '.' => self.emit_one_or_two(
                '.',
                (TokenType::DotDot, ".."),
                (TokenType::Dot, "."),
                line,
                col,
            ),
            '<' => self.emit_one_or_two(
                '=',
                (TokenType::Lte, "<="),
                (TokenType::Lt, "<"),
                line,
                col,
            ),
            '>' => self.emit_one_or_two(
                '=',
                (TokenType::Gte, ">="),
                (TokenType::Gt, ">"),
                line,
                col,
            ),
            '=' => self.emit_one_or_two(
                '=',
                (TokenType::Eq, "=="),
                (TokenType::Assign, "="),
                line,
                col,
            ),
            '-' => {
                if self.match_char('>') {
                    self.emit(TokenType::Arrow, "->", line, col);
                } else if self.peek().is_ascii_digit() {
                    // A '-' directly followed by a digit is lexed as the sign
                    // of a numeric literal; '\0' means "no first digit yet".
                    self.scan_number(line, col, '\0', true)?;
                } else {
                    self.emit(TokenType::Minus, "-", line, col);
                }
            }
            '!' => {
                if self.match_char('=') {
                    self.emit(TokenType::Neq, "!=", line, col);
                } else {
                    return Err(LexerError {
                        message: "Unexpected '!'. Did you mean '!='?".to_string(),
                        line,
                        column: col,
                    });
                }
            }
            '"' => self.scan_string(line, col)?,
            c if c.is_ascii_digit() => self.scan_number(line, col, c, false)?,
            c if c.is_alphabetic() || c == '_' => self.scan_identifier(line, col, c),
            c => {
                return Err(LexerError {
                    message: format!("Unexpected character {:?}", c),
                    line,
                    column: col,
                });
            }
        }
        Ok(())
    }

    /// Scans the remainder of a string literal whose opening `"` has already
    /// been consumed, and emits a `StringLit` holding the decoded contents.
    ///
    /// `\n` and `\t` decode to newline and tab; any other escaped character
    /// (including `\\` and `\"`) stands for itself. Raw newlines are allowed
    /// inside the literal.
    ///
    /// # Errors
    /// Returns "Unterminated string" (positioned at the opening quote) if the
    /// input ends before the closing `"`, or "Unterminated escape sequence"
    /// if the input ends right after a backslash.
    fn scan_string(&mut self, start_line: u32, start_col: u32) -> Result<(), LexerError> {
        let mut contents = String::new();
        loop {
            if self.at_end() {
                return Err(LexerError {
                    message: "Unterminated string".to_string(),
                    line: start_line,
                    column: start_col,
                });
            }
            match self.advance() {
                '"' => break,
                '\\' => {
                    if self.at_end() {
                        return Err(LexerError {
                            message: "Unterminated escape sequence".to_string(),
                            line: self.line,
                            column: self.column,
                        });
                    }
                    let decoded = match self.advance() {
                        'n' => '\n',
                        't' => '\t',
                        other => other,
                    };
                    contents.push(decoded);
                }
                other => contents.push(other),
            }
        }
        self.emit(TokenType::StringLit, &contents, start_line, start_col);
        Ok(())
    }

    /// Appends consecutive ASCII digits from the input to `out`.
    fn take_digits(&mut self, out: &mut String) {
        while self.peek().is_ascii_digit() {
            out.push(self.advance());
        }
    }

    /// Scans a numeric literal and emits `Integer`, `Float` or `Duration`.
    ///
    /// `first_char` is the already-consumed leading digit, or `'\0'` when
    /// none was consumed (the negative-literal path). `negative` prefixes the
    /// token text with `-`. A `.` starts a fractional part unless it is the
    /// first half of `..`. A number immediately followed by exactly one of
    /// the alphabetic suffixes `s`, `ms`, `m`, `h`, `d` becomes a `Duration`
    /// whose value includes the suffix; any other trailing letters are left
    /// in the input for the next token.
    ///
    /// # Errors
    /// Returns "Expected digit after decimal point" when a fractional `.` is
    /// not followed by a digit.
    fn scan_number(
        &mut self,
        start_line: u32,
        start_col: u32,
        first_char: char,
        negative: bool,
    ) -> Result<(), LexerError> {
        let mut text = String::new();
        if negative {
            text.push('-');
        }
        if first_char != '\0' {
            text.push(first_char);
        }
        self.take_digits(&mut text);

        let mut is_float = false;
        if self.peek() == '.' && self.peek_next() != '.' {
            is_float = true;
            text.push(self.advance()); // '.'
            if !self.peek().is_ascii_digit() {
                return Err(LexerError {
                    message: "Expected digit after decimal point".to_string(),
                    line: self.line,
                    column: self.column,
                });
            }
            self.take_digits(&mut text);
        }

        // Look ahead at the run of letters without consuming it, so nothing
        // has to be rolled back when it is not a duration unit.
        let suffix: String = self.source[self.pos..]
            .iter()
            .take_while(|c| c.is_alphabetic())
            .collect();
        if matches!(suffix.as_str(), "s" | "ms" | "m" | "h" | "d") {
            for _ in suffix.chars() {
                self.advance();
            }
            text.push_str(&suffix);
            self.emit(TokenType::Duration, &text, start_line, start_col);
            return Ok(());
        }

        let ttype = if is_float {
            TokenType::Float
        } else {
            TokenType::Integer
        };
        self.emit(ttype, &text, start_line, start_col);
        Ok(())
    }

    /// Scans an identifier or keyword whose first character (`first_char`)
    /// has already been consumed; the token kind comes from `keyword_type`.
    fn scan_identifier(&mut self, start_line: u32, start_col: u32, first_char: char) {
        let mut word = String::new();
        word.push(first_char);
        while !self.at_end() && (self.peek().is_alphanumeric() || self.peek() == '_') {
            word.push(self.advance());
        }
        let ttype = keyword_type(&word);
        self.emit(ttype, &word, start_line, start_col);
    }
}
#[cfg(test)]
mod fase_1_to_5_end_to_end {
use super::*;
fn kinds(source: &str) -> Vec<TokenType> {
Lexer::new(source, "<test>")
.tokenize()
.expect("lex ok")
.into_iter()
.map(|t| t.ttype)
.collect()
}
#[test]
fn resource_decl_tokenizes() {
let kinds = kinds("resource Db { kind: postgres lifetime: linear }");
assert!(kinds.contains(&TokenType::Resource));
assert!(kinds.contains(&TokenType::LBrace));
assert!(kinds.contains(&TokenType::RBrace));
}
#[test]
fn fabric_manifest_observe_tokenize() {
let src = r#"
fabric Vpc { provider: aws region: "us-east-1" zones: 2 }
manifest M { resources: [Db] fabric: Vpc }
observe O { sources: [M] quorum: 1 on_partition: degrade }
"#;
let k = kinds(src);
assert!(k.contains(&TokenType::Fabric));
assert!(k.contains(&TokenType::Manifest));
assert!(k.contains(&TokenType::Observe));
}
#[test]
fn reconcile_lease_ensemble_tokenize() {
let src = r#"
reconcile R { manifest: M observe: O max_retries: 3 period: "60s" }
lease L { resource: Db ttl: "30m" renewable: true }
ensemble E { daemons: [] quorum: 1 disagreement: degrade }
"#;
let k = kinds(src);
assert!(k.contains(&TokenType::Reconcile));
assert!(k.contains(&TokenType::Lease));
assert!(k.contains(&TokenType::Ensemble));
}
#[test]
fn topology_and_session_pi_calculus_tokenize() {
let src = r#"
session S {
client: [send Request end]
server: [receive Request end]
}
topology T { nodes: [A, B] edges: [A -> B : S] }
"#;
let k = kinds(src);
assert!(k.contains(&TokenType::Session));
assert!(k.contains(&TokenType::Send));
assert!(k.contains(&TokenType::Receive));
assert!(k.contains(&TokenType::End));
assert!(k.contains(&TokenType::Topology));
}
#[test]
fn immune_reflex_heal_tokenize() {
let src = r#"
immune I { sensitivity: 0.5 window: "1m" baseline: "7d" action: alert }
reflex Rf { on: drift action: throttle }
heal H { target: I max_patches: 3 rollback_on: divergence }
"#;
let k = kinds(src);
assert!(k.contains(&TokenType::Immune));
assert!(k.contains(&TokenType::Reflex));
assert!(k.contains(&TokenType::Heal));
}
#[test]
fn new_keywords_do_not_collide_with_identifiers() {
let k = kinds("resource_group manifested observer reconciled leased");
for tt in k.iter() {
assert!(
!matches!(
tt,
TokenType::Resource
| TokenType::Manifest
| TokenType::Observe
| TokenType::Reconcile
| TokenType::Lease
),
"near-match identifier wrongly classified as keyword: {tt:?}"
);
}
}
}
#[cfg(test)]
mod fase14a_trivia_tests {
use super::*;
fn lex(src: &str) -> Vec<Token> {
Lexer::new(src, "<test>").tokenize().expect("lex")
}
fn lex_strip(src: &str) -> Vec<Token> {
Lexer::new(src, "<test>").tokenize_with(true).expect("lex")
}
fn non_eof(toks: &[Token]) -> Vec<&Token> {
toks.iter().filter(|t| t.ttype != TokenType::Eof).collect()
}
#[test]
fn regular_line_comment_emitted_as_line_comment() {
let toks = lex("// hi");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body.len(), 1);
assert_eq!(body[0].ttype, TokenType::LineComment);
assert_eq!(body[0].value, "// hi");
}
#[test]
fn doc_line_comment_emitted_with_doc_kind() {
let toks = lex("/// docs");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body[0].ttype, TokenType::DocLineComment);
assert_eq!(body[0].value, "/// docs");
}
#[test]
fn four_slash_banner_is_regular_not_doc() {
let toks = lex("//// banner");
assert_eq!(non_eof(&toks)[0].ttype, TokenType::LineComment);
}
#[test]
fn regular_block_comment_emitted() {
let toks = lex("/* body */");
assert_eq!(non_eof(&toks)[0].ttype, TokenType::BlockComment);
assert_eq!(non_eof(&toks)[0].value, "/* body */");
}
#[test]
fn doc_block_comment_emitted() {
let toks = lex("/** docs */");
assert_eq!(non_eof(&toks)[0].ttype, TokenType::DocBlockComment);
}
#[test]
fn empty_block_is_regular_not_doc() {
let toks = lex("/**/");
assert_eq!(non_eof(&toks)[0].ttype, TokenType::BlockComment);
}
#[test]
fn strip_comments_opt_in_legacy() {
let src = "// dropped\nflow F() -> Out { }";
let toks = lex_strip(src);
for t in &toks {
assert!(
!matches!(
t.ttype,
TokenType::LineComment
| TokenType::BlockComment
| TokenType::DocLineComment
| TokenType::DocBlockComment
| TokenType::InnerDocLineComment
| TokenType::InnerDocBlockComment
),
"strip_comments=true must not emit any comment kind, got {:?}",
t.ttype
);
}
}
#[test]
fn inner_doc_line_comment_emitted_with_inner_doc_kind() {
let toks = lex("//! module docs");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body[0].ttype, TokenType::InnerDocLineComment);
assert_eq!(body[0].value, "//! module docs");
}
#[test]
fn inner_doc_block_comment_emitted_with_inner_doc_kind() {
let toks = lex("/*! module docs */");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body[0].ttype, TokenType::InnerDocBlockComment);
assert_eq!(body[0].value, "/*! module docs */");
}
#[test]
fn outer_and_inner_doc_distinguished() {
let toks = lex("/// outer\n//! inner\n// plain");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body.len(), 3);
assert_eq!(body[0].ttype, TokenType::DocLineComment);
assert_eq!(body[1].ttype, TokenType::InnerDocLineComment);
assert_eq!(body[2].ttype, TokenType::LineComment);
}
#[test]
fn block_outer_and_inner_doc_distinguished() {
let toks = lex("/** outer */\n/*! inner */\n/* plain */");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body.len(), 3);
assert_eq!(body[0].ttype, TokenType::DocBlockComment);
assert_eq!(body[1].ttype, TokenType::InnerDocBlockComment);
assert_eq!(body[2].ttype, TokenType::BlockComment);
}
#[test]
fn comment_loc_preserved_across_lines() {
let toks = lex("// a\n/// b\n/* c */");
let body: Vec<_> = non_eof(&toks);
assert_eq!(body[0].line, 1);
assert_eq!(body[1].line, 2);
assert_eq!(body[2].line, 3);
}
#[test]
fn unterminated_block_still_errors() {
let result = Lexer::new("/* never closes", "<test>").tokenize();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.message.contains("Unterminated"));
}
#[test]
fn trivia_helpers_strip_markers() {
use crate::tokens::{Trivia, TriviaKind};
let t = Trivia {
kind: TriviaKind::DocLine,
text: "/// hi".into(),
line: 1,
column: 1,
};
assert!(t.is_doc());
assert_eq!(t.stripped_text(), " hi");
let b = Trivia {
kind: TriviaKind::DocBlock,
text: "/** body */".into(),
line: 1,
column: 1,
};
assert!(b.is_doc());
assert_eq!(b.stripped_text(), " body ");
let r = Trivia {
kind: TriviaKind::Line,
text: "// regular".into(),
line: 1,
column: 1,
};
assert!(!r.is_doc());
assert_eq!(r.stripped_text(), " regular");
}
}