/// Source location of a token: byte offset plus 1-based line/column.
#[derive(Debug, Clone, PartialEq)]
pub struct Span {
    /// Byte offset from the start of the input.
    pub offset: usize,
    /// 1-based line number (incremented at each `'\n'`).
    pub line: usize,
    /// 1-based column within the current line.
    pub column: usize,
}
/// A token paired with the source location where it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    pub token: Token,
    pub span: Span,
}
/// A lexical token of the feature-definition language.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The `Feature` keyword.
    Feature,
    /// The `Struct` keyword.
    Struct,
    /// The `Boolean` type keyword.
    BooleanType,
    /// The `Integer` type keyword.
    IntegerType,
    /// The `Float` type keyword.
    FloatType,
    /// The `String` type keyword.
    StringType,
    /// `true` or `false`.
    BoolLit(bool),
    /// A numeric literal; integers and floats share one `f64` representation.
    NumberLit(f64),
    /// A double-quoted string literal with escape sequences already decoded.
    StringLit(String),
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `=`
    Equals,
    /// `:`
    Colon,
    /// Any other identifier word (`[A-Za-z_][A-Za-z0-9_]*`).
    Ident(String),
}
/// A lexing failure: a human-readable message plus the span where it occurred.
#[derive(Debug, Clone, PartialEq)]
pub struct LexError {
    pub message: String,
    pub span: Span,
}
/// Compute the 1-based line/column `Span` for byte `offset` into `full_input`.
///
/// `offset` must lie on a UTF-8 character boundary (token starts produced by
/// `lex` always do; a mid-character offset would panic on the slice below).
///
/// Columns are counted in *characters* from the start of the current line,
/// not in bytes, so multi-byte characters earlier on the line (e.g. inside a
/// string literal) do not inflate the reported column.
fn compute_span(full_input: &str, offset: usize) -> Span {
    let consumed = &full_input[..offset];
    // Lines are 1-based: one more than the number of newlines consumed.
    let line = consumed.bytes().filter(|&b| b == b'\n').count() + 1;
    // Start of the current line: just past the last newline, or 0.
    let line_start = consumed.rfind('\n').map_or(0, |pos| pos + 1);
    // Previously this was byte-based (`offset - pos`), which reported wrong
    // columns whenever a multi-byte char preceded `offset` on the same line.
    let column = consumed[line_start..].chars().count() + 1;
    Span {
        offset,
        line,
        column,
    }
}
/// Advance past any run of ASCII whitespace (space, tab, LF, CR) starting at
/// `pos` and return the index of the first non-whitespace byte (or the end
/// of input). Positions past the end are returned unchanged.
fn skip_whitespace(input: &str, pos: usize) -> usize {
    let run = input
        .as_bytes()
        .get(pos..)
        .unwrap_or_default()
        .iter()
        .take_while(|&&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .count();
    pos + run
}
/// Scan a word (`[A-Za-z0-9_]*`) starting at `pos`.
///
/// Returns the index just past the word together with the matched slice,
/// which is empty when the byte at `pos` cannot start a word.
fn lex_word(input: &str, pos: usize) -> (usize, &str) {
    let len = input.as_bytes()[pos..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'_')
        .count();
    (pos + len, &input[pos..pos + len])
}
/// Lex a numeric literal matching `-?[0-9]+(\.[0-9]+)?` starting at `pos`.
///
/// Returns the index just past the literal and its parsed `f64` value, or an
/// error message when a required digit run is missing.
fn lex_number_token(input: &str, pos: usize) -> Result<(usize, f64), String> {
    let bytes = input.as_bytes();
    // Length of the run of ASCII digits beginning at `from`.
    let digit_run =
        |from: usize| bytes[from..].iter().take_while(|b| b.is_ascii_digit()).count();

    let mut end = pos;
    if bytes.get(end) == Some(&b'-') {
        end += 1;
    }
    let int_digits = digit_run(end);
    if int_digits == 0 {
        return Err("expected digit".to_string());
    }
    end += int_digits;

    // Optional fractional part: a '.' must be followed by at least one digit.
    if bytes.get(end) == Some(&b'.') {
        end += 1;
        let frac_digits = digit_run(end);
        if frac_digits == 0 {
            return Err("expected digit after decimal point".to_string());
        }
        end += frac_digits;
    }

    let value: f64 = input[pos..end].parse().map_err(|e| format!("{}", e))?;
    Ok((end, value))
}
/// Lex a double-quoted string literal; `pos` must point at the opening `"`.
///
/// Supports the escapes `\n`, `\t`, `\\` and `\"`. Any other escaped
/// character is kept verbatim, preceded by its backslash.
///
/// Returns the index just past the closing quote and the decoded contents,
/// or `"unterminated string"` if the input ends before a closing quote.
fn lex_string_token(input: &str, pos: usize) -> Result<(usize, String), String> {
    let mut i = pos + 1;
    let mut result = String::new();
    let bytes = input.as_bytes();
    loop {
        if i >= bytes.len() {
            return Err("unterminated string".to_string());
        }
        match bytes[i] {
            b'"' => {
                return Ok((i + 1, result));
            }
            b'\\' => {
                i += 1;
                if i >= bytes.len() {
                    return Err("unterminated string".to_string());
                }
                match bytes[i] {
                    b'n' => {
                        result.push('\n');
                        i += 1;
                    }
                    b't' => {
                        result.push('\t');
                        i += 1;
                    }
                    b'\\' => {
                        result.push('\\');
                        i += 1;
                    }
                    b'"' => {
                        result.push('"');
                        i += 1;
                    }
                    _ => {
                        // Unknown escape: keep the backslash plus the full
                        // (possibly multi-byte) character. The old code cast
                        // the single byte with `c as char` and advanced by 1,
                        // which garbled non-ASCII chars and left `i` mid
                        // character, panicking on the next `input[i..]` slice.
                        let ch = input[i..].chars().next().unwrap();
                        result.push('\\');
                        result.push(ch);
                        i += ch.len_utf8();
                    }
                }
            }
            _ => {
                // Ordinary character: copy it through, advancing by its
                // full UTF-8 width.
                let ch = input[i..].chars().next().unwrap();
                result.push(ch);
                i += ch.len_utf8();
            }
        }
    }
}
/// Tokenize `input`, returning each token tagged with its source span.
///
/// ASCII whitespace (space, tab, CR, LF) separates tokens and is discarded.
///
/// # Errors
///
/// Returns a `LexError` — message plus the span of the offending token — on
/// an unterminated or malformed string, a malformed number, or any byte that
/// cannot begin a token.
pub fn lex(input: &str) -> Result<Vec<SpannedToken>, LexError> {
    let mut tokens = Vec::new();
    let mut pos = 0;
    loop {
        pos = skip_whitespace(input, pos);
        if pos >= input.len() {
            break;
        }
        // Span where the upcoming token starts; reused for error reporting.
        let span = compute_span(input, pos);
        let byte = input.as_bytes()[pos];
        match byte {
            b'{' => {
                tokens.push(SpannedToken {
                    token: Token::LBrace,
                    span,
                });
                pos += 1;
            }
            b'}' => {
                tokens.push(SpannedToken {
                    token: Token::RBrace,
                    span,
                });
                pos += 1;
            }
            b'=' => {
                tokens.push(SpannedToken {
                    token: Token::Equals,
                    span,
                });
                pos += 1;
            }
            b':' => {
                tokens.push(SpannedToken {
                    token: Token::Colon,
                    span,
                });
                pos += 1;
            }
            b'"' => match lex_string_token(input, pos) {
                Ok((new_pos, s)) => {
                    tokens.push(SpannedToken {
                        token: Token::StringLit(s),
                        span,
                    });
                    pos = new_pos;
                }
                Err(msg) => {
                    return Err(LexError { message: msg, span });
                }
            },
            // Accept a leading '-' so negative literals are lexed:
            // lex_number_token already supports an optional minus sign, but
            // previously nothing ever dispatched '-' to it, so `-5` was
            // rejected as an unexpected character.
            b'-' | b'0'..=b'9' => match lex_number_token(input, pos) {
                Ok((new_pos, n)) => {
                    tokens.push(SpannedToken {
                        token: Token::NumberLit(n),
                        span,
                    });
                    pos = new_pos;
                }
                Err(msg) => {
                    return Err(LexError { message: msg, span });
                }
            },
            // Keywords and identifiers start with a letter or underscore;
            // unmatched words fall through to Token::Ident.
            c if c.is_ascii_alphabetic() || c == b'_' => {
                let (new_pos, word) = lex_word(input, pos);
                let token = match word {
                    "Feature" => Token::Feature,
                    "Struct" => Token::Struct,
                    "Boolean" => Token::BooleanType,
                    "Integer" => Token::IntegerType,
                    "Float" => Token::FloatType,
                    "String" => Token::StringType,
                    "true" => Token::BoolLit(true),
                    "false" => Token::BoolLit(false),
                    _ => Token::Ident(word.to_string()),
                };
                tokens.push(SpannedToken { token, span });
                pos = new_pos;
            }
            _ => {
                return Err(LexError {
                    message: format!("unexpected character: {:?}", byte as char),
                    span,
                });
            }
        }
    }
    Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- Keywords ---------------------------------------------------------

    #[test]
    fn lex_feature_keyword() {
        let tokens = lex("Feature").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::Feature);
        // First token starts at offset 0, line 1, column 1.
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
    }

    #[test]
    fn lex_type_keywords() {
        let tokens = lex("Boolean Integer Float String").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].token, Token::BooleanType);
        assert_eq!(tokens[1].token, Token::IntegerType);
        assert_eq!(tokens[2].token, Token::FloatType);
        assert_eq!(tokens[3].token, Token::StringType);
    }

    // --- Literals ---------------------------------------------------------

    #[test]
    fn lex_bool_literals() {
        let tokens = lex("true false").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::BoolLit(true));
        assert_eq!(tokens[1].token, Token::BoolLit(false));
    }

    #[test]
    fn lex_number_literals() {
        // Integers and floats both lex to NumberLit(f64).
        let tokens = lex("42 3.14").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::NumberLit(42.0));
        assert_eq!(tokens[1].token, Token::NumberLit(3.14));
    }

    #[test]
    fn lex_string_literal() {
        let tokens = lex(r#""hello""#).unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::StringLit("hello".to_string()));
    }

    #[test]
    fn lex_string_with_escapes() {
        // `\n` in the source literal decodes to an actual newline.
        let tokens = lex(r#""hello\nworld""#).unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(
            tokens[0].token,
            Token::StringLit("hello\nworld".to_string())
        );
    }

    // --- Whole blocks -----------------------------------------------------

    #[test]
    fn lex_complete_feature_block() {
        let input = r#"1: Feature Checkout = {
    1: enabled Boolean = true
    2: max_items Integer = 50
    3: header_text String = "Complete your purchase"
}"#;
        let tokens = lex(input).unwrap();
        assert_eq!(tokens.len(), 25);
        // Header: `1: Feature Checkout = {`
        assert_eq!(tokens[0].token, Token::NumberLit(1.0));
        assert_eq!(tokens[1].token, Token::Colon);
        assert_eq!(tokens[2].token, Token::Feature);
        assert_eq!(tokens[3].token, Token::Ident("Checkout".to_string()));
        assert_eq!(tokens[4].token, Token::Equals);
        assert_eq!(tokens[5].token, Token::LBrace);
        // First field: `1: enabled Boolean = true`
        assert_eq!(tokens[6].token, Token::NumberLit(1.0));
        assert_eq!(tokens[7].token, Token::Colon);
        assert_eq!(tokens[8].token, Token::Ident("enabled".to_string()));
        assert_eq!(tokens[9].token, Token::BooleanType);
        assert_eq!(tokens[10].token, Token::Equals);
        assert_eq!(tokens[11].token, Token::BoolLit(true));
        // Closing brace is the last token.
        assert_eq!(tokens[24].token, Token::RBrace);
    }

    // --- Errors -----------------------------------------------------------

    #[test]
    fn lex_error_unterminated_string() {
        let result = lex(r#""hello"#);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.message, "unterminated string");
    }

    #[test]
    fn lex_error_invalid_character() {
        let result = lex("@");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.message.contains("unexpected character"));
    }

    #[test]
    fn lex_struct_keyword() {
        let tokens = lex("Struct").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::Struct);
    }

    #[test]
    fn lex_struct_block() {
        let input = r#"1: Struct Theme = {
    1: dark_mode Boolean = false
}"#;
        let tokens = lex(input).unwrap();
        assert_eq!(tokens[0].token, Token::NumberLit(1.0));
        assert_eq!(tokens[1].token, Token::Colon);
        assert_eq!(tokens[2].token, Token::Struct);
        assert_eq!(tokens[3].token, Token::Ident("Theme".to_string()));
        assert_eq!(tokens[4].token, Token::Equals);
        assert_eq!(tokens[5].token, Token::LBrace);
        assert_eq!(tokens[6].token, Token::NumberLit(1.0));
        assert_eq!(tokens[7].token, Token::Colon);
    }

    // --- Spans ------------------------------------------------------------

    #[test]
    fn lex_span_info_multiline() {
        let input = "Feature\n  Checkout";
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
        // "Checkout" starts after the newline and two spaces: byte 10,
        // second line, third column.
        assert_eq!(
            tokens[1].span,
            Span {
                offset: 10,
                line: 2,
                column: 3
            }
        );
    }
}