/// Every kind of token the lexer in this file can produce.
///
/// Variants with a payload carry the value the lexer extracted for that
/// lexeme (identifier text, decoded string contents, parsed number, trimmed
/// comment text).
#[derive(Clone, Debug, PartialEq)]
pub enum TokenType {
    // ----- Control and trivia -----
    /// End of input.
    Eof,
    /// A run of one or more consecutive whitespace characters.
    Whitespace,
    /// A `//` line comment; the payload is the text after the slashes with
    /// leading and trailing whitespace trimmed.
    Comment(String),
    /// Input the lexer could not classify: an unrecognized character, a lone
    /// `/`, exactly two dots, an unterminated string, or a numeric lexeme
    /// that does not parse as `f64`.
    Unknown,

    // ----- Literals and names -----
    /// A name made of ASCII letters, ASCII digits and `_` (not starting with
    /// a digit) that is not one of the keywords below.
    Identifier(String),
    /// A double-quoted string with its escape sequences already decoded.
    String(String),
    /// A numeric literal parsed as `f64`.
    Number(f64),

    // ----- Keywords -----
    /// The keyword `true` or its alias `on`.
    True,
    /// The keyword `false` or its alias `off`.
    False,
    /// The keyword `null`.
    Null,
    /// The keyword `import`.
    Import,
    /// The keyword `from`.
    From,
    /// The keyword `as`.
    As,

    // ----- Punctuation -----
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `.`
    Dot,
    /// `=`
    Equals,
    /// `#`
    Hash,
    /// `$`
    Dollar,
    /// `&`
    Ampersand,
    /// `*`
    Asterisk,
    /// `...`
    Spread,
}
/// A single lexed token: what it is and which part of the input it covers.
///
/// The lexer in this file fills both offsets from its running position, which
/// it advances by each consumed character's UTF-8 length, so tokens it
/// produces carry byte offsets into the input.
#[derive(Clone, Debug)]
pub struct Token {
    /// The kind of token, including any payload.
    pub ttype: TokenType,
    /// Offset of the token's first character.
    pub pos_start: usize,
    /// Offset just past the token's last character (exclusive end).
    pub pos_end: usize,
}
impl Token {
#[must_use]
pub fn new(ttype: TokenType, pos_start: usize, pos_end: usize) -> Token {
Token {
ttype,
pos_start,
pos_end,
}
}
}
/// A tokenizer over a borrowed input string with one character of lookahead.
pub struct Lexer<'a> {
    /// The not-yet-consumed characters of the input.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    /// Byte offset of the next unread character: starts at 0 and grows by the
    /// UTF-8 length of every character consumed.
    position: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        let chars = input.chars().peekable();
        Self { chars, position: 0 }
    }

    /// Lexes the remaining input and returns every token in order.
    ///
    /// Whitespace and comments are returned as tokens too; nothing is
    /// filtered. The last element is always the [`TokenType::Eof`] token.
    pub fn lex(&mut self) -> Vec<Token> {
        let mut out = Vec::new();
        loop {
            let token = self.next_token();
            let reached_end = token.ttype == TokenType::Eof;
            out.push(token);
            if reached_end {
                return out;
            }
        }
    }

    /// Lexes and returns the next token.
    ///
    /// The token's span runs from the position before its first character to
    /// the position after its last. At end of input this returns an
    /// [`TokenType::Eof`] token with an empty span.
    pub fn next_token(&mut self) -> Token {
        let start = self.position;
        let kind = match self.advance() {
            Some(first) => self.scan(first),
            None => TokenType::Eof,
        };
        Token::new(kind, start, self.position)
    }

    /// Classifies the token that begins with the already-consumed `first`,
    /// consuming whatever further characters belong to it.
    fn scan(&mut self, first: char) -> TokenType {
        match first {
            '{' => TokenType::LBrace,
            '}' => TokenType::RBrace,
            '[' => TokenType::LBracket,
            ']' => TokenType::RBracket,
            '(' => TokenType::LParen,
            ')' => TokenType::RParen,
            ',' => TokenType::Comma,
            '#' => TokenType::Hash,
            '$' => TokenType::Dollar,
            '&' => TokenType::Ampersand,
            '*' => TokenType::Asterisk,
            '=' => TokenType::Equals,
            ':' => {
                if self.eat(':') {
                    TokenType::DoubleColon
                } else {
                    TokenType::Colon
                }
            }
            '.' => self.read_dots(),
            '/' => {
                // Only `//` starts a comment; a lone slash is not a token.
                if self.eat('/') {
                    self.read_comment()
                } else {
                    TokenType::Unknown
                }
            }
            '"' => self.read_string(),
            c if c.is_whitespace() => self.read_whitespace(),
            c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(c),
            c if c.is_ascii_digit() => self.read_number(c),
            // A minus sign only starts a number when a digit follows directly.
            '-' if self.peek().is_some_and(char::is_ascii_digit) => self.read_number('-'),
            _ => TokenType::Unknown,
        }
    }

    /// Consumes and returns the next character, moving `position` forward by
    /// its UTF-8 length. Returns `None` at end of input.
    fn advance(&mut self) -> Option<char> {
        let next = self.chars.next()?;
        self.position += next.len_utf8();
        Some(next)
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    /// Consumes and returns the next character only when `accept` approves
    /// it; otherwise leaves the input untouched and returns `None`.
    fn advance_if(&mut self, accept: impl FnOnce(char) -> bool) -> Option<char> {
        let next = self.chars.next_if(|&c| accept(c))?;
        self.position += next.len_utf8();
        Some(next)
    }

    /// Consumes the next character if it equals `expected`; reports whether
    /// it did.
    fn eat(&mut self, expected: char) -> bool {
        self.advance_if(|c| c == expected).is_some()
    }

    /// Finishes a token whose first `.` has been consumed: one dot is
    /// [`TokenType::Dot`], three are [`TokenType::Spread`], and exactly two
    /// are consumed together as [`TokenType::Unknown`].
    fn read_dots(&mut self) -> TokenType {
        if !self.eat('.') {
            return TokenType::Dot;
        }
        if self.eat('.') {
            TokenType::Spread
        } else {
            TokenType::Unknown
        }
    }

    /// Consumes the rest of a whitespace run whose first character has
    /// already been consumed.
    fn read_whitespace(&mut self) -> TokenType {
        while self.advance_if(char::is_whitespace).is_some() {}
        TokenType::Whitespace
    }

    /// Reads a line comment after both leading slashes have been consumed.
    ///
    /// Everything up to, but not including, the next `'\n'` (or end of input)
    /// is taken as the comment text, which is stored trimmed.
    fn read_comment(&mut self) -> TokenType {
        let mut text = String::new();
        while let Some(c) = self.advance_if(|c| c != '\n') {
            text.push(c);
        }
        TokenType::Comment(text.trim().to_string())
    }

    /// Reads a string literal after its opening quote has been consumed.
    ///
    /// Recognized escapes are `\"`, `\\`, `\n`, `\r` and `\t`; any other
    /// escaped character is kept together with its backslash. Reaching end of
    /// input before the closing quote yields [`TokenType::Unknown`].
    fn read_string(&mut self) -> TokenType {
        let mut text = String::new();
        while let Some(c) = self.advance() {
            match c {
                '"' => return TokenType::String(text),
                '\\' => {
                    let Some(escaped) = self.advance() else {
                        return TokenType::Unknown;
                    };
                    let decoded = match escaped {
                        'n' => '\n',
                        'r' => '\r',
                        't' => '\t',
                        '"' | '\\' => escaped,
                        other => {
                            // Unknown escape: keep the backslash verbatim.
                            text.push('\\');
                            other
                        }
                    };
                    text.push(decoded);
                }
                other => text.push(other),
            }
        }
        TokenType::Unknown
    }

    /// Reads the rest of an identifier that starts with `first_char` and maps
    /// the reserved words to their keyword tokens.
    fn read_identifier(&mut self, first_char: char) -> TokenType {
        let mut name = String::from(first_char);
        while let Some(c) = self.advance_if(|c| c.is_ascii_alphanumeric() || c == '_') {
            name.push(c);
        }
        match name.as_str() {
            "true" | "on" => TokenType::True,
            "false" | "off" => TokenType::False,
            "null" => TokenType::Null,
            "import" => TokenType::Import,
            "from" => TokenType::From,
            "as" => TokenType::As,
            _ => TokenType::Identifier(name),
        }
    }

    /// Reads the rest of a numeric literal that starts with `first_char`.
    ///
    /// Accepts digits, at most one `.`, and at most one `e`/`E` exponent
    /// marker optionally followed by a sign. The collected text is parsed as
    /// `f64`; text that does not parse yields [`TokenType::Unknown`].
    fn read_number(&mut self, first_char: char) -> TokenType {
        let mut text = String::from(first_char);
        let mut seen_dot = first_char == '.';
        let mut seen_exponent = false;
        while let Some(&next) = self.peek() {
            let is_first_dot = next == '.' && !seen_dot;
            let starts_exponent = matches!(next, 'e' | 'E') && !seen_exponent;
            if !(next.is_ascii_digit() || is_first_dot || starts_exponent) {
                break;
            }
            seen_dot |= is_first_dot;
            seen_exponent |= starts_exponent;
            text.push(next);
            self.advance();
            if starts_exponent {
                // The exponent marker may be followed directly by a sign.
                if let Some(sign) = self.advance_if(|c| c == '+' || c == '-') {
                    text.push(sign);
                }
            }
        }
        text.parse::<f64>()
            .map_or(TokenType::Unknown, TokenType::Number)
    }
}
/// Renders `tokens` as text, one token per line, each line being the token
/// kind's `Debug` form followed by its start and end positions, separated by
/// `", "`. An empty slice yields an empty string; there is no trailing newline.
#[allow(dead_code)]
pub(crate) fn tokens_to_pretty_string(tokens: &[Token]) -> String {
    tokens
        .iter()
        .map(|tok| format!("{:?}, {}, {}", tok.ttype, tok.pos_start, tok.pos_end,))
        .collect::<Vec<String>>()
        .join("\n")
}
#[cfg(test)]
// NOTE(review): nothing in this module obviously needs these two allows;
// confirm with clippy before removing them.
#[allow(clippy::needless_pass_by_value)]
#[allow(clippy::explicit_auto_deref)]
mod tests {
    use super::*;

    /// Lexes `input` and returns the kind of every token, trivia included.
    fn lex_kinds(input: &str) -> Vec<TokenType> {
        Lexer::new(input)
            .lex()
            .into_iter()
            .map(|t| t.ttype)
            .collect()
    }

    /// Lexes `input`, drops whitespace and comment tokens, and asserts that
    /// the remaining token kinds equal `expected`.
    fn assert_tokens(input: &str, expected: &[TokenType]) {
        let significant: Vec<TokenType> = lex_kinds(input)
            .into_iter()
            .filter(|t| !matches!(t, TokenType::Whitespace | TokenType::Comment(_)))
            .collect();
        assert_eq!(significant, expected);
    }

    #[test]
    fn test_eof() {
        assert_tokens("", &[TokenType::Eof]);
    }

    #[test]
    fn test_single_char_tokens() {
        let input = "{}[](),:#{new_string}*";
        let expected = [
            TokenType::LBrace,
            TokenType::RBrace,
            TokenType::LBracket,
            TokenType::RBracket,
            TokenType::LParen,
            TokenType::RParen,
            TokenType::Comma,
            TokenType::Colon,
            TokenType::Hash,
            TokenType::LBrace,
            TokenType::Identifier("new_string".to_string()),
            TokenType::RBrace,
            TokenType::Asterisk,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_multi_char_operators() {
        let input = ":: ...";
        let expected = [TokenType::DoubleColon, TokenType::Spread, TokenType::Eof];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_keywords() {
        let input = "true on false off null import from as";
        let expected = [
            TokenType::True,
            TokenType::True,
            TokenType::False,
            TokenType::False,
            TokenType::Null,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers() {
        let input = "foo bar_123 _baz";
        let expected = [
            TokenType::Identifier("foo".to_string()),
            TokenType::Identifier("bar_123".to_string()),
            TokenType::Identifier("_baz".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_numbers() {
        let input = "123 45.67 -10 0.5";
        let expected = [
            TokenType::Number(123.0),
            TokenType::Number(45.67),
            TokenType::Number(-10.0),
            TokenType::Number(0.5),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_comments_and_whitespace() {
        // Unfiltered: trivia tokens must appear in the stream, in order.
        let input = " // this is a comment\n key: value // another one";
        let expected = [
            TokenType::Whitespace,
            TokenType::Comment("this is a comment".to_string()),
            TokenType::Whitespace,
            TokenType::Identifier("key".to_string()),
            TokenType::Colon,
            TokenType::Whitespace,
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("another one".to_string()),
            TokenType::Eof,
        ];
        assert_eq!(lex_kinds(input), expected);
    }

    #[test]
    fn test_complex_mon_structure() {
        let input = r#"
{
// Config settings
service_name: "My App",
port: 8080,
is_enabled: on,
&default_user: {
permissions: ["READ", "WRITE"],
},
admin :: User = {
...*default_user,
name: "Admin",
}
}
"#;
        let expected = [
            TokenType::LBrace,
            TokenType::Identifier("service_name".to_string()),
            TokenType::Colon,
            TokenType::String("My App".to_string()),
            TokenType::Comma,
            TokenType::Identifier("port".to_string()),
            TokenType::Colon,
            TokenType::Number(8080.0),
            TokenType::Comma,
            TokenType::Identifier("is_enabled".to_string()),
            TokenType::Colon,
            TokenType::True,
            TokenType::Comma,
            TokenType::Ampersand,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Colon,
            TokenType::LBrace,
            TokenType::Identifier("permissions".to_string()),
            TokenType::Colon,
            TokenType::LBracket,
            TokenType::String("READ".to_string()),
            TokenType::Comma,
            TokenType::String("WRITE".to_string()),
            TokenType::RBracket,
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::Comma,
            TokenType::Identifier("admin".to_string()),
            TokenType::DoubleColon,
            TokenType::Identifier("User".to_string()),
            TokenType::Equals,
            TokenType::LBrace,
            TokenType::Spread,
            TokenType::Asterisk,
            TokenType::Identifier("default_user".to_string()),
            TokenType::Comma,
            TokenType::Identifier("name".to_string()),
            TokenType::Colon,
            TokenType::String("Admin".to_string()),
            TokenType::Comma,
            TokenType::RBrace,
            TokenType::RBrace,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_unclosed_string() {
        let input = r#"{ key: "unclosed }"#;
        let has_unknown = lex_kinds(input)
            .iter()
            .any(|t| matches!(t, TokenType::Unknown));
        assert!(has_unknown, "Should have Unknown token for unclosed string");
    }

    #[test]
    fn test_string_with_escapes() {
        let input = r#""hello\nworld\t\"test\"""#;
        let token = Lexer::new(input).next_token();
        assert_eq!(
            token.ttype,
            TokenType::String("hello\nworld\t\"test\"".to_string())
        );
    }

    #[test]
    fn test_invalid_escape_at_eof() {
        let input = r#""test\"#;
        let token = Lexer::new(input).next_token();
        assert_eq!(token.ttype, TokenType::Unknown);
    }

    #[test]
    fn test_number_with_exponent() {
        let mut lexer = Lexer::new("1.23e10 4.5E-3");
        let first = lexer.next_token();
        assert!(matches!(first.ttype, TokenType::Number(n) if (n - 1.23e10).abs() < 1e-6));
        // The separator between the two numbers is its own token.
        let separator = lexer.next_token();
        assert_eq!(separator.ttype, TokenType::Whitespace);
        let second = lexer.next_token();
        assert!(matches!(second.ttype, TokenType::Number(n) if (n - 4.5e-3).abs() < 1e-9));
    }

    #[test]
    fn test_negative_numbers() {
        let input = "-42 -3.2";
        let expected = [
            TokenType::Number(-42.0),
            TokenType::Number(-3.2),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dotdot_not_spread() {
        let tok1 = Lexer::new("..").next_token();
        assert!(matches!(tok1.ttype, TokenType::Dot | TokenType::Unknown));
    }

    #[test]
    fn test_unknown_character() {
        let tokens = lex_kinds("{ @invalid }");
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_single_slash_not_comment() {
        let tokens = lex_kinds("test / value");
        assert!(tokens.iter().any(|t| matches!(t, TokenType::Unknown)));
    }

    #[test]
    fn test_escape_r() {
        // `\r` must decode to an actual carriage return.
        let input = r#""test\rvalue""#;
        let token = Lexer::new(input).next_token();
        assert_eq!(token.ttype, TokenType::String("test\rvalue".to_string()));
    }

    #[test]
    fn test_escape_backslash() {
        // `\\` must decode to a single backslash.
        let input = r#""test\\value""#;
        let token = Lexer::new(input).next_token();
        assert_eq!(token.ttype, TokenType::String("test\\value".to_string()));
    }

    #[test]
    fn test_unknown_escape_preserved() {
        // An unrecognized escape keeps both the backslash and the character.
        let input = r#""test\xvalue""#;
        let token = Lexer::new(input).next_token();
        assert_eq!(token.ttype, TokenType::String("test\\xvalue".to_string()));
    }

    #[test]
    fn test_zero_number() {
        assert_tokens("0", &[TokenType::Number(0.0), TokenType::Eof]);
    }

    #[test]
    fn test_decimal_point_only() {
        assert_tokens("3.69", &[TokenType::Number(3.69), TokenType::Eof]);
    }

    #[test]
    fn test_leading_decimal() {
        // A leading dot is lexed on its own; it is not part of the number.
        let mut lexer = Lexer::new(".5");
        let tok1 = lexer.next_token();
        let tok2 = lexer.next_token();
        assert_eq!(tok1.ttype, TokenType::Dot);
        assert_eq!(tok2.ttype, TokenType::Number(5.0));
    }

    #[test]
    fn test_multiline_comment() {
        let input = "// line 1\n// line 2\nvalue";
        assert_tokens(
            input,
            &[TokenType::Identifier("value".to_string()), TokenType::Eof],
        );
    }

    #[test]
    fn test_comment_at_eof() {
        // A comment with no trailing newline must still end cleanly at Eof.
        let expected = [
            TokenType::Identifier("value".to_string()),
            TokenType::Whitespace,
            TokenType::Comment("comment at end".to_string()),
            TokenType::Eof,
        ];
        assert_eq!(lex_kinds("value // comment at end"), expected);
    }

    #[test]
    fn test_all_keywords() {
        let input = "true false null on off import from as";
        let expected = [
            TokenType::True,
            TokenType::False,
            TokenType::Null,
            TokenType::True,
            TokenType::False,
            TokenType::Import,
            TokenType::From,
            TokenType::As,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_identifiers_with_underscores() {
        let input = "my_var _private __dunder";
        let expected = [
            TokenType::Identifier("my_var".to_string()),
            TokenType::Identifier("_private".to_string()),
            TokenType::Identifier("__dunder".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_mixed_operators() {
        let input = ":: = ...";
        let expected = [
            TokenType::DoubleColon,
            TokenType::Equals,
            TokenType::Spread,
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_adjacent_tokens_no_whitespace() {
        // Unfiltered comparison: no whitespace token may be invented between
        // adjacent tokens.
        let expected = [
            TokenType::LBracket,
            TokenType::Number(1.0),
            TokenType::Comma,
            TokenType::Number(2.0),
            TokenType::Comma,
            TokenType::Number(3.0),
            TokenType::RBracket,
            TokenType::Eof,
        ];
        assert_eq!(lex_kinds("[1,2,3]"), expected);
    }

    #[test]
    fn test_hash_token() {
        let input = "#struct";
        let expected = [
            TokenType::Hash,
            TokenType::Identifier("struct".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_dollar_token() {
        let input = "$Status.Active";
        let expected = [
            TokenType::Dollar,
            TokenType::Identifier("Status".to_string()),
            TokenType::Dot,
            TokenType::Identifier("Active".to_string()),
            TokenType::Eof,
        ];
        assert_tokens(input, &expected);
    }

    #[test]
    fn test_empty_string() {
        let input = r#""""#;
        let token = Lexer::new(input).next_token();
        assert_eq!(token.ttype, TokenType::String(String::new()));
    }

    #[test]
    fn test_token_byte_positions() {
        // Spans are byte offsets: the two-byte `é` makes the quoted string
        // four bytes long.
        let spans: Vec<(usize, usize)> = Lexer::new("ab :: \"é\"")
            .lex()
            .iter()
            .map(|t| (t.pos_start, t.pos_end))
            .collect();
        assert_eq!(
            spans,
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 10), (10, 10)]
        );
    }
}