use super::error::ParseError;
/// A lexical token produced by the tokenizer.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// A bare word that is neither a boolean nor a number.
    Ident(String),
    /// Contents of a `"..."` string, with escape sequences already processed.
    QuotedString(String),
    /// Contents of a `'...'` string, taken verbatim.
    SingleQuotedString(String),
    /// An integer literal.
    Int(i64),
    /// A floating-point literal.
    Float(f64),
    /// The word `true` or `false`.
    Bool(bool),
    /// Body of a `/.../` regex literal (only recognised directly after a colon).
    Regex(String),
    /// `:`
    Colon,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `|`
    Pipe,
    /// `-` at the start of a line or followed by a space.
    Dash,
    /// `$name`; the payload is the name without the leading `$`.
    Variable(String),
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `>=`
    Gte,
    /// `<=`
    Lte,
    /// Marks the end of an input line (also emitted for blank and comment lines).
    Newline,
    /// Uninterpreted text: a line inside a multiline block, or the remainder of
    /// a line starting at a character the tokenizer does not recognise.
    Raw(String),
}
/// A value annotated with the source position it was read from.
#[derive(Clone, Debug)]
pub struct Located<T> {
    /// The wrapped value.
    pub value: T,
    /// 1-based line number.
    pub line: usize,
    /// 1-based column number.
    pub col: usize,
}
/// A [`Token`] together with the line and column it was read from.
pub type LocatedToken = Located<Token>;
/// Splits `input` into a flat stream of located tokens.
///
/// The input is processed one line at a time:
///
/// * Blank lines, and lines whose first non-whitespace character is `#`,
///   produce a single [`Token::Newline`] at column 1.
/// * A line that contains a [`Token::Pipe`] opens a multiline block. Each
///   following non-blank line indented deeper than that line is emitted as one
///   [`Token::Raw`] holding the trimmed line text instead of being tokenized.
///   The first non-blank, non-comment line at or below the opening line's
///   indentation closes the block and is tokenized normally.
/// * Every line is terminated by a [`Token::Newline`].
///
/// Line numbers are 1-based. Token columns are 1-based positions within the
/// original line, i.e. they include the line's leading indentation.
///
/// # Errors
///
/// Returns the [`ParseError`] reported by the per-line tokenizer (for example
/// an unterminated string or regex).
pub fn tokenize(input: &str) -> Result<Vec<LocatedToken>, ParseError> {
    let mut tokens = Vec::new();
    let mut in_multiline = false;
    let mut multiline_base_indent: usize = 0;

    for (line_idx, line) in input.lines().enumerate() {
        let line_num = line_idx + 1;
        let trimmed = line.trim();

        if trimmed.is_empty() || trimmed.starts_with('#') {
            tokens.push(LocatedToken {
                value: Token::Newline,
                line: line_num,
                col: 1,
            });
            continue;
        }

        let indent = line.len() - line.trim_start().len();

        if in_multiline {
            if indent > multiline_base_indent {
                tokens.push(LocatedToken {
                    value: Token::Raw(trimmed.to_string()),
                    line: line_num,
                    col: indent + 1,
                });
                tokens.push(LocatedToken {
                    value: Token::Newline,
                    line: line_num,
                    col: line.len() + 1,
                });
                continue;
            }
            // Dedented back to (or past) the opening line: the block is over.
            in_multiline = false;
        }

        // Keep the leading whitespace so the columns reported for tokens (and
        // for errors) refer to the original line, consistent with the `Raw`
        // and `Newline` tokens above. The line tokenizer skips whitespace
        // itself, so the token values are unaffected.
        let line_tokens = tokenize_line(line.trim_end(), line_num)?;

        if line_tokens.iter().any(|t| matches!(t.value, Token::Pipe)) {
            in_multiline = true;
            multiline_base_indent = indent;
        }

        tokens.extend(line_tokens);
        tokens.push(LocatedToken {
            value: Token::Newline,
            line: line_num,
            col: line.len() + 1,
        });
    }

    Ok(tokens)
}
fn tokenize_line(line: &str, line_num: usize) -> Result<Vec<LocatedToken>, ParseError> {
let mut tokens = Vec::new();
let chars: Vec<char> = line.chars().collect();
let mut i = 0;
while i < chars.len() {
if chars[i].is_whitespace() {
i += 1;
continue;
}
if chars[i] == '#' {
break;
}
let col = i + 1;
match chars[i] {
'{' => {
tokens.push(LocatedToken {
value: Token::OpenBrace,
line: line_num,
col,
});
i += 1;
}
'}' => {
tokens.push(LocatedToken {
value: Token::CloseBrace,
line: line_num,
col,
});
i += 1;
}
'|' => {
tokens.push(LocatedToken {
value: Token::Pipe,
line: line_num,
col,
});
i += 1;
}
':' => {
tokens.push(LocatedToken {
value: Token::Colon,
line: line_num,
col,
});
i += 1;
}
'>' => {
if i + 1 < chars.len() && chars[i + 1] == '=' {
tokens.push(LocatedToken {
value: Token::Gte,
line: line_num,
col,
});
i += 2;
} else {
tokens.push(LocatedToken {
value: Token::Gt,
line: line_num,
col,
});
i += 1;
}
}
'<' => {
if i + 1 < chars.len() && chars[i + 1] == '=' {
tokens.push(LocatedToken {
value: Token::Lte,
line: line_num,
col,
});
i += 2;
} else {
tokens.push(LocatedToken {
value: Token::Lt,
line: line_num,
col,
});
i += 1;
}
}
'-' => {
if tokens.is_empty() || (i + 1 < chars.len() && chars[i + 1] == ' ') {
let rest: String = chars[i + 1..].iter().collect();
let rest_trimmed = rest.trim_start();
if rest_trimmed.starts_with('"') || rest_trimmed.starts_with('\'') {
tokens.push(LocatedToken {
value: Token::Dash,
line: line_num,
col,
});
i += 1;
continue;
}
tokens.push(LocatedToken {
value: Token::Dash,
line: line_num,
col,
});
i += 1;
continue;
}
let word = read_word(&chars, &mut i);
tokens.push(classify_word(word, line_num, col));
}
'"' => {
let s = read_double_quoted_string(&chars, &mut i, line_num)?;
tokens.push(LocatedToken {
value: Token::QuotedString(s),
line: line_num,
col,
});
}
'\'' => {
let prev_is_alpha = i > 0 && chars[i - 1].is_alphanumeric();
if prev_is_alpha {
let word = read_word(&chars, &mut i);
let combined = format!("'{}", word);
tokens.push(LocatedToken {
value: Token::Ident(combined),
line: line_num,
col,
});
} else {
let s = read_single_quoted_string(&chars, &mut i, line_num)?;
tokens.push(LocatedToken {
value: Token::SingleQuotedString(s),
line: line_num,
col,
});
}
}
'/' => {
let is_regex_ctx = tokens
.last()
.map_or(false, |t| matches!(t.value, Token::Colon));
if is_regex_ctx {
let s = read_regex(&chars, &mut i, line_num)?;
tokens.push(LocatedToken {
value: Token::Regex(s),
line: line_num,
col,
});
} else {
let word = read_word(&chars, &mut i);
tokens.push(classify_word(word, line_num, col));
}
}
'$' => {
i += 1;
let name = read_ident_chars(&chars, &mut i);
if name.is_empty() {
return Err(ParseError::new(
"expected variable name after $",
line_num,
col,
));
}
tokens.push(LocatedToken {
value: Token::Variable(name),
line: line_num,
col,
});
}
_ if chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '.' => {
let word = read_word(&chars, &mut i);
tokens.push(classify_word(word, line_num, col));
}
_ => {
let remaining: String = chars[i..].iter().collect();
tokens.push(LocatedToken {
value: Token::Raw(remaining.trim().to_string()),
line: line_num,
col,
});
break;
}
}
}
Ok(tokens)
}
/// Reads the longest run of word characters starting at `chars[*i]`.
///
/// Word characters are alphanumerics plus `_`, `-`, `.` and `/`. On return
/// `*i` points just past the run; if there is no word character at `*i` the
/// result is empty and `*i` is left untouched.
fn read_word(chars: &[char], i: &mut usize) -> String {
    let tail = chars.get(*i..).unwrap_or(&[]);
    let len = tail
        .iter()
        .take_while(|c| c.is_alphanumeric() || matches!(**c, '_' | '-' | '.' | '/'))
        .count();
    *i += len;
    tail[..len].iter().collect()
}
/// Reads the longest run of identifier characters starting at `chars[*i]`.
///
/// Identifier characters are alphanumerics plus `_` and `-`. On return `*i`
/// points just past the run; if nothing matches, the result is empty and `*i`
/// is unchanged.
fn read_ident_chars(chars: &[char], i: &mut usize) -> String {
    let tail = chars.get(*i..).unwrap_or(&[]);
    let len = tail
        .iter()
        .take_while(|c| c.is_alphanumeric() || matches!(**c, '_' | '-'))
        .count();
    *i += len;
    tail[..len].iter().collect()
}
/// Turns a bare word into the most specific token it can represent.
///
/// In order of precedence: `true`/`false` become [`Token::Bool`], anything
/// that parses as `i64` becomes [`Token::Int`], a numeric word that parses as
/// `f64` becomes [`Token::Float`], and everything else is a [`Token::Ident`].
/// The resulting token is located at `line`/`col`.
fn classify_word(word: String, line: usize, col: usize) -> LocatedToken {
    let value = match word.as_str() {
        "true" => Token::Bool(true),
        "false" => Token::Bool(false),
        _ => {
            // `f64::from_str` also accepts the digit-free spellings `inf`,
            // `infinity` and `nan` (in any case). Those are ordinary
            // identifiers here, so only words containing a digit may be
            // treated as floats.
            let has_digit = word.bytes().any(|b| b.is_ascii_digit());
            if let Ok(n) = word.parse::<i64>() {
                Token::Int(n)
            } else {
                match word.parse::<f64>() {
                    Ok(f) if has_digit => Token::Float(f),
                    _ => Token::Ident(word),
                }
            }
        }
    };
    LocatedToken { value, line, col }
}
/// Reads a double-quoted string whose opening quote is at `chars[*i]`.
///
/// The escapes `\n`, `\t`, `\"` and `\\` are translated; any other backslash
/// pair is kept as written (backslash included). On success `*i` points just
/// past the closing quote and the unquoted contents are returned.
///
/// # Errors
///
/// Returns an "unterminated string" [`ParseError`] on `line` if the input ends
/// before a closing quote is found.
fn read_double_quoted_string(
    chars: &[char],
    i: &mut usize,
    line: usize,
) -> Result<String, ParseError> {
    // Step over the opening quote.
    *i += 1;
    let mut out = String::new();

    while let Some(&c) = chars.get(*i) {
        match (c, chars.get(*i + 1).copied()) {
            ('\\', Some(escaped)) => {
                match escaped {
                    'n' => out.push('\n'),
                    't' => out.push('\t'),
                    '"' | '\\' => out.push(escaped),
                    _ => {
                        out.push('\\');
                        out.push(escaped);
                    }
                }
                *i += 2;
            }
            ('"', _) => {
                *i += 1;
                return Ok(out);
            }
            _ => {
                out.push(c);
                *i += 1;
            }
        }
    }

    Err(ParseError::new("unterminated string", line, *i + 1))
}
/// Reads a single-quoted string whose opening quote is at `chars[*i]`.
///
/// No escape processing is done: everything up to the next `'` is returned
/// verbatim. On success `*i` points just past the closing quote.
///
/// # Errors
///
/// Returns an "unterminated string" [`ParseError`] on `line` if the input ends
/// before a closing quote is found.
fn read_single_quoted_string(
    chars: &[char],
    i: &mut usize,
    line: usize,
) -> Result<String, ParseError> {
    // Contents begin right after the opening quote.
    let start = *i + 1;
    let rest = chars.get(start..).unwrap_or(&[]);

    match rest.iter().position(|&c| c == '\'') {
        Some(len) => {
            *i = start + len + 1;
            Ok(rest[..len].iter().collect())
        }
        None => {
            // The scan ran off the end of the input.
            *i = start.max(chars.len());
            Err(ParseError::new("unterminated string", line, *i + 1))
        }
    }
}
/// Reads a `/.../` regex literal whose opening slash is at `chars[*i]`.
///
/// The sequence `\/` yields a literal `/`; every other character, including
/// other backslashes, is copied through unchanged. On success `*i` points just
/// past the closing slash and the pattern body is returned.
///
/// # Errors
///
/// Returns an "unterminated regex" [`ParseError`] on `line` if the input ends
/// before a closing slash is found.
fn read_regex(chars: &[char], i: &mut usize, line: usize) -> Result<String, ParseError> {
    // Step over the opening slash.
    *i += 1;
    let mut pattern = String::new();

    while let Some(&c) = chars.get(*i) {
        match (c, chars.get(*i + 1).copied()) {
            ('\\', Some('/')) => {
                pattern.push('/');
                *i += 2;
            }
            ('/', _) => {
                *i += 1;
                return Ok(pattern);
            }
            _ => {
                pattern.push(c);
                *i += 1;
            }
        }
    }

    Err(ParseError::new("unterminated regex", line, *i + 1))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` and returns every token value in order, panicking
    /// with the parse error if tokenization fails.
    fn lex(input: &str) -> Vec<Token> {
        tokenize(input)
            .unwrap_or_else(|e| panic!("parse error: {e}"))
            .into_iter()
            .map(|t| t.value)
            .collect()
    }

    /// Like [`lex`], but with the `Newline` tokens dropped.
    fn lex_significant(input: &str) -> Vec<Token> {
        lex(input)
            .into_iter()
            .filter(|t| !matches!(t, Token::Newline))
            .collect()
    }

    /// Shorthand for an expected identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.to_string())
    }

    #[test]
    fn test_basic_tokens() {
        let values = lex(r#"blueprint "Test" {"#);
        assert_eq!(values[0], ident("blueprint"));
        assert_eq!(values[1], Token::QuotedString("Test".to_string()));
        assert_eq!(values[2], Token::OpenBrace);
    }

    #[test]
    fn test_property_line() {
        let values = lex("slug: build-http-server");
        assert_eq!(values[0], ident("slug"));
        assert_eq!(values[1], Token::Colon);
        assert_eq!(values[2], ident("build-http-server"));
    }

    #[test]
    fn test_variable_token() {
        let values = lex("$container_id");
        assert_eq!(values[0], Token::Variable("container_id".to_string()));
    }

    #[test]
    fn test_regex_token() {
        let non_nl = lex_significant(r"matches: /^[a-f0-9]{64}$/");
        assert_eq!(non_nl[0], ident("matches"));
        assert_eq!(non_nl[1], Token::Colon);
        assert_eq!(non_nl[2], Token::Regex("^[a-f0-9]{64}$".to_string()));
    }

    #[test]
    fn test_slash_as_path() {
        let non_nl = lex_significant("GET /echo/hello");
        assert_eq!(non_nl[0], ident("GET"));
        assert_eq!(non_nl[1], ident("/echo/hello"));
    }

    #[test]
    fn test_comparison_operators() {
        let values = lex_significant("> < >= <=");
        assert_eq!(
            values,
            vec![Token::Gt, Token::Lt, Token::Gte, Token::Lte]
        );
    }

    #[test]
    fn test_comment_skipped() {
        let non_newline = lex_significant("# this is a comment\nslug: test");
        assert_eq!(non_newline[0], ident("slug"));
    }

    #[test]
    fn test_numeric_tokens() {
        let values = lex_significant("port: 4221");
        assert_eq!(values[2], Token::Int(4221));
    }

    #[test]
    fn test_bool_tokens() {
        let values = lex_significant("is_published: true");
        assert_eq!(values[2], Token::Bool(true));
    }
}