use crate::error::ParseError;
/// The lexical categories produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
// Reserved words (see the keyword table in `Lexer::read_ident`).
Pattern,
Stage,
Unless,
Between,
After,
Graph,
Now,
Temporal,
True,
False,
Compose, Sharing, Concurrent,
// Punctuation and operators. Note `Eq` covers both `=` and `==`
// (the lexer folds them into one kind; only the span length differs).
LBrace, RBrace, Dot, Arrow, Eq, Lt, Gt, Lte, Gte, Bang, At, DotDot, Question, GtGt, Pipe, Star, LParen, RParen, Comma, Plus, Minus, Colon, Semicolon,
// Value-carrying tokens.
Ident(String),
String(String),
Number(f64),
// Zero-length end-of-input marker appended by `Lexer::tokenize`.
Eof,
}
/// A single token together with its position in the source text.
#[derive(Debug, Clone)]
pub struct Token {
pub kind: TokenKind,
// 1-based line of the token's first byte.
pub line: usize,
// 1-based column of the token's first byte (counted in bytes, not chars).
pub column: usize,
// Byte offset of the token's first byte within the source string.
pub offset: usize,
// Length in bytes; for string literals this includes the quote delimiters.
pub len: usize,
}
impl Token {
    /// Returns the half-open byte range `[start, end)` covered by this token.
    pub fn span(&self) -> (usize, usize) {
        let start = self.offset;
        let end = start + self.len;
        (start, end)
    }
}
/// A byte-oriented lexer over a borrowed source string.
pub struct Lexer<'a> {
source: &'a str,
// The same data as `source`, kept as bytes for indexed scanning.
bytes: &'a [u8],
// Current byte offset into `bytes`.
pos: usize,
// 1-based current line.
line: usize,
// 1-based current column, counted in bytes.
col: usize,
}
impl<'a> Lexer<'a> {
pub fn new(source: &'a str) -> Self {
Self {
source,
bytes: source.as_bytes(),
pos: 0,
line: 1,
col: 1,
}
}
pub fn tokenize(&mut self) -> Result<Vec<Token>, ParseError> {
let mut tokens = Vec::new();
loop {
self.skip_whitespace_and_comments();
if self.pos >= self.bytes.len() {
tokens.push(Token {
kind: TokenKind::Eof,
line: self.line,
column: self.col,
offset: self.pos,
len: 0,
});
break;
}
tokens.push(self.next_token()?);
}
Ok(tokens)
}
fn skip_whitespace_and_comments(&mut self) {
loop {
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_whitespace() {
if self.bytes[self.pos] == b'\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
self.pos += 1;
}
if self.pos + 1 < self.bytes.len()
&& self.bytes[self.pos] == b'/'
&& self.bytes[self.pos + 1] == b'/'
{
while self.pos < self.bytes.len() && self.bytes[self.pos] != b'\n' {
self.pos += 1;
}
continue;
}
break;
}
}
fn next_token(&mut self) -> Result<Token, ParseError> {
let start = self.pos;
let line = self.line;
let col = self.col;
let ch = self.bytes[self.pos];
match ch {
b'{' => {
self.advance();
Ok(Token {
kind: TokenKind::LBrace,
line,
column: col,
offset: start,
len: 1,
})
}
b'}' => {
self.advance();
Ok(Token {
kind: TokenKind::RBrace,
line,
column: col,
offset: start,
len: 1,
})
}
b'@' => {
self.advance();
Ok(Token {
kind: TokenKind::At,
line,
column: col,
offset: start,
len: 1,
})
}
b'?' => {
self.advance();
Ok(Token {
kind: TokenKind::Question,
line,
column: col,
offset: start,
len: 1,
})
}
b'|' => {
self.advance();
Ok(Token {
kind: TokenKind::Pipe,
line,
column: col,
offset: start,
len: 1,
})
}
b'*' => {
self.advance();
Ok(Token {
kind: TokenKind::Star,
line,
column: col,
offset: start,
len: 1,
})
}
b'(' => {
self.advance();
Ok(Token {
kind: TokenKind::LParen,
line,
column: col,
offset: start,
len: 1,
})
}
b')' => {
self.advance();
Ok(Token {
kind: TokenKind::RParen,
line,
column: col,
offset: start,
len: 1,
})
}
b',' => {
self.advance();
Ok(Token {
kind: TokenKind::Comma,
line,
column: col,
offset: start,
len: 1,
})
}
b'!' => {
self.advance();
Ok(Token {
kind: TokenKind::Bang,
line,
column: col,
offset: start,
len: 1,
})
}
b'.' => {
self.advance();
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'.' {
self.advance();
Ok(Token {
kind: TokenKind::DotDot,
line,
column: col,
offset: start,
len: 2,
})
} else {
Ok(Token {
kind: TokenKind::Dot,
line,
column: col,
offset: start,
len: 1,
})
}
}
b'-' => {
self.advance();
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
self.advance();
Ok(Token {
kind: TokenKind::Arrow,
line,
column: col,
offset: start,
len: 2,
})
} else {
Ok(Token {
kind: TokenKind::Minus,
line,
column: col,
offset: start,
len: 1,
})
}
}
b'+' => {
self.advance();
Ok(Token {
kind: TokenKind::Plus,
line,
column: col,
offset: start,
len: 1,
})
}
b':' => {
self.advance();
Ok(Token {
kind: TokenKind::Colon,
line,
column: col,
offset: start,
len: 1,
})
}
b';' => {
self.advance();
Ok(Token {
kind: TokenKind::Semicolon,
line,
column: col,
offset: start,
len: 1,
})
}
b'=' => {
self.advance();
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
self.advance();
}
Ok(Token {
kind: TokenKind::Eq,
line,
column: col,
offset: start,
len: self.pos - start,
})
}
b'<' => {
self.advance();
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
self.advance();
Ok(Token {
kind: TokenKind::Lte,
line,
column: col,
offset: start,
len: 2,
})
} else {
Ok(Token {
kind: TokenKind::Lt,
line,
column: col,
offset: start,
len: 1,
})
}
}
b'>' => {
self.advance();
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'=' {
self.advance();
Ok(Token {
kind: TokenKind::Gte,
line,
column: col,
offset: start,
len: 2,
})
} else if self.pos < self.bytes.len() && self.bytes[self.pos] == b'>' {
self.advance();
Ok(Token {
kind: TokenKind::GtGt,
line,
column: col,
offset: start,
len: 2,
})
} else {
Ok(Token {
kind: TokenKind::Gt,
line,
column: col,
offset: start,
len: 1,
})
}
}
b'"' => self.read_string(line, col),
b'0'..=b'9' => self.read_number(start, line, col),
b'a'..=b'z' | b'A'..=b'Z' | b'_' => self.read_ident(start, line, col),
_ => Err(self.error_at(
line,
col,
start,
&format!("unexpected character '{}'", ch as char),
)),
}
}
fn advance(&mut self) {
self.pos += 1;
self.col += 1;
}
fn read_string(&mut self, line: usize, col: usize) -> Result<Token, ParseError> {
let start = self.pos;
self.advance();
if self.pos + 1 < self.bytes.len()
&& self.bytes[self.pos] == b'"'
&& self.bytes[self.pos + 1] == b'"'
{
self.advance(); self.advance(); return self.read_triple_string(start, line, col);
}
let content_start = self.pos;
while self.pos < self.bytes.len() && self.bytes[self.pos] != b'"' {
if self.bytes[self.pos] == b'\n' {
return Err(self.error_at(line, col, start, "unterminated string literal"));
}
self.pos += 1;
self.col += 1;
}
if self.pos >= self.bytes.len() {
return Err(self.error_at(line, col, start, "unterminated string literal"));
}
let s = self.source[content_start..self.pos].to_string();
self.advance(); Ok(Token {
kind: TokenKind::String(s),
line,
column: col,
offset: start,
len: self.pos - start,
})
}
fn read_triple_string(
&mut self,
start: usize,
line: usize,
col: usize,
) -> Result<Token, ParseError> {
let content_start = self.pos;
loop {
if self.pos >= self.bytes.len() {
return Err(self.error_at(line, col, start, "unterminated triple-quoted string"));
}
if self.pos + 2 < self.bytes.len()
&& self.bytes[self.pos] == b'"'
&& self.bytes[self.pos + 1] == b'"'
&& self.bytes[self.pos + 2] == b'"'
{
let s = self.source[content_start..self.pos].to_string();
self.advance(); self.advance(); self.advance(); return Ok(Token {
kind: TokenKind::String(s),
line,
column: col,
offset: start,
len: self.pos - start,
});
}
if self.bytes[self.pos] == b'\n' {
self.line += 1;
self.col = 1;
self.pos += 1;
} else {
self.pos += 1;
self.col += 1;
}
}
}
fn read_number(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
self.pos += 1;
self.col += 1;
}
if self.pos + 1 < self.bytes.len()
&& self.bytes[self.pos] == b'.'
&& self.bytes[self.pos + 1] != b'.'
&& self.bytes[self.pos + 1].is_ascii_digit()
{
self.pos += 1;
self.col += 1;
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
self.pos += 1;
self.col += 1;
}
}
let num_str = &self.source[start..self.pos];
let val: f64 = num_str.parse().map_err(|_| {
self.error_at(line, col, start, &format!("invalid number '{}'", num_str))
})?;
Ok(Token {
kind: TokenKind::Number(val),
line,
column: col,
offset: start,
len: self.pos - start,
})
}
fn read_ident(&mut self, start: usize, line: usize, col: usize) -> Result<Token, ParseError> {
while self.pos < self.bytes.len()
&& (self.bytes[self.pos].is_ascii_alphanumeric() || self.bytes[self.pos] == b'_')
{
self.pos += 1;
self.col += 1;
}
let word = &self.source[start..self.pos];
let kind = match word {
"pattern" => TokenKind::Pattern,
"stage" => TokenKind::Stage,
"unless" => TokenKind::Unless,
"between" => TokenKind::Between,
"after" => TokenKind::After,
"graph" => TokenKind::Graph,
"now" => TokenKind::Now,
"temporal" => TokenKind::Temporal,
"true" => TokenKind::True,
"false" => TokenKind::False,
"compose" => TokenKind::Compose,
"sharing" => TokenKind::Sharing,
"concurrent" => TokenKind::Concurrent,
_ => TokenKind::Ident(word.to_string()),
};
Ok(Token {
kind,
line,
column: col,
offset: start,
len: self.pos - start,
})
}
fn error_at(&self, line: usize, col: usize, offset: usize, msg: &str) -> ParseError {
ParseError {
line,
column: col,
span: (offset, self.pos.max(offset + 1)),
message: msg.to_string(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// Keywords, identifiers, and braces in a basic pattern declaration.
#[test]
fn tokenize_simple_pattern() {
let src = r#"pattern test { stage e1 { e1.eventType = "enter" } }"#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Pattern));
assert!(matches!(tokens[1].kind, TokenKind::Ident(ref s) if s == "test"));
assert!(matches!(tokens[2].kind, TokenKind::LBrace));
assert!(matches!(tokens[3].kind, TokenKind::Stage));
}
// `@` node markers and numbers inside a graph body.
#[test]
fn tokenize_graph() {
let src = r#"graph { @1 ev.type = "enter" @2..5 ev2.type = "siege" }"#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Graph));
assert!(matches!(tokens[2].kind, TokenKind::At));
assert!(matches!(tokens[3].kind, TokenKind::Number(n) if n == 1.0));
}
// `//` line comments are skipped entirely.
#[test]
fn tokenize_comments() {
let src = "// this is a comment\npattern test {}";
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Pattern));
}
// `->` is one Arrow token; `?` is its own Question token.
#[test]
fn tokenize_arrow_and_question() {
let src = "e1.actor -> ?guest";
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Ident(ref s) if s == "e1"));
assert!(matches!(tokens[1].kind, TokenKind::Dot));
assert!(matches!(tokens[2].kind, TokenKind::Ident(ref s) if s == "actor"));
assert!(matches!(tokens[3].kind, TokenKind::Arrow));
assert!(matches!(tokens[4].kind, TokenKind::Question));
assert!(matches!(tokens[5].kind, TokenKind::Ident(ref s) if s == "guest"));
}
// Single-character arithmetic/punctuation tokens.
#[test]
fn tokenize_new_symbols() {
let src = "+ - : ;";
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Plus));
assert!(matches!(tokens[1].kind, TokenKind::Minus));
assert!(matches!(tokens[2].kind, TokenKind::Colon));
assert!(matches!(tokens[3].kind, TokenKind::Semicolon));
}
// The lexer emits Minus + Number; negation is the parser's job.
#[test]
fn tokenize_minus_not_folded_into_number() {
let src = "-5";
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Minus));
assert!(matches!(tokens[1].kind, TokenKind::Number(n) if n == 5.0));
}
// `-` followed by `>` still forms Arrow even next to a number.
#[test]
fn tokenize_arrow_still_works() {
let src = "-> -5";
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Arrow));
assert!(matches!(tokens[1].kind, TokenKind::Minus));
assert!(matches!(tokens[2].kind, TokenKind::Number(n) if n == 5.0));
}
// Triple-quoted strings may contain raw newlines.
#[test]
fn tokenize_triple_quoted_string() {
let src = r#""""hello
world""""#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::String(ref s) if s == "hello\nworld"));
}
// Six quotes in a row is an empty triple-quoted string.
#[test]
fn tokenize_triple_quoted_empty() {
let src = "\"\"\"\"\"\""; let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::String(ref s) if s.is_empty()));
}
// Single double-quotes inside a triple-quoted string are literal content.
#[test]
fn tokenize_triple_quoted_with_single_quotes_inside() {
let src = r#""""say "hello" to them""""#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(
matches!(tokens[0].kind, TokenKind::String(ref s) if s == r#"say "hello" to them"#)
);
}
// Runs of two double-quotes are also literal; only """ terminates.
#[test]
fn tokenize_triple_quoted_double_quotes_inside() {
let src = r#""""has ""two"" inside""""#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::String(ref s) if s == r#"has ""two"" inside"#));
}
// Colon/semicolon-separated clauses plus a trailing arithmetic expression.
#[test]
fn tokenize_salience_style() {
let src = r#"lifecycle: oneshot; priority: normal; adjust ?e2.depth + 1"#;
let tokens = Lexer::new(src).tokenize().unwrap();
assert!(matches!(tokens[0].kind, TokenKind::Ident(ref s) if s == "lifecycle"));
assert!(matches!(tokens[1].kind, TokenKind::Colon));
assert!(matches!(tokens[2].kind, TokenKind::Ident(ref s) if s == "oneshot"));
assert!(matches!(tokens[3].kind, TokenKind::Semicolon));
assert!(matches!(tokens[13].kind, TokenKind::Plus));
assert!(matches!(tokens[14].kind, TokenKind::Number(n) if n == 1.0));
}
}