/// A lexical unit produced by [`tokenize`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Token {
    /// A bare word, or the text found between a pair of double quotes.
    Term(String),
    /// The `=` character.
    Eq,
    /// The `{` character.
    Open,
    /// The `}` character.
    Close,
    /// End-of-input marker; `tokenize` appends it as the final token on success.
    Eof,
}
/// Scanner mode used by [`tokenize`] while it walks the input one character at a time.
#[derive(PartialEq)]
enum State {
    /// Between tokens: whitespace is skipped and the next significant character
    /// decides which mode comes next.
    Whitespace,
    /// After a `#`; everything up to the next newline is ignored.
    Comment,
    /// Accumulating the characters of a bare (unquoted) term.
    Term,
    /// Inside a double-quoted string; characters are collected until the closing `"`.
    Quoted,
}
/// Splits `input` into a flat token stream, pairing every token with a 1-based line number.
///
/// Scanning rules:
/// - space, tab and `\r` separate tokens; `\n` additionally advances the line counter;
/// - `#` starts a comment that runs to the end of the line;
/// - between tokens, `"` opens a quoted term that runs to the next `"` and may
///   span several lines (no escape sequences are interpreted);
/// - `=`, `{` and `}` are single-character tokens;
/// - any other character starts a bare term, which `flush_term_char` extends
///   until it sees whitespace, `#`, `=`, `{` or `}`.
///
/// Terms carry the line on which they started; punctuation carries the line on
/// which it appears. A [`Token::Eof`] tagged with the final line number is
/// appended on success.
///
/// # Errors
///
/// Returns `Error::Parse` if the input ends inside a quoted string, or if the
/// line counter would exceed `u32::MAX`.
pub fn tokenize(input: &str) -> crate::error::Result<Vec<(Token, u32)>> {
    let mut out: Vec<(Token, u32)> = Vec::new();
    let mut pending = String::new();
    let mut mode = State::Whitespace;
    // `cursor_line` follows the scanner; `token_line` remembers where the term
    // currently being collected began.
    let mut cursor_line: u32 = 1;
    let mut token_line: u32 = 1;

    for ch in input.chars() {
        match mode {
            State::Comment => {
                if ch == '\n' {
                    cursor_line = inc_line(cursor_line)?;
                    mode = State::Whitespace;
                }
            }
            State::Term => flush_term_char(
                ch,
                &mut pending,
                &mut out,
                &mut cursor_line,
                &mut token_line,
                &mut mode,
            )?,
            State::Quoted => {
                if ch == '"' {
                    // Closing quote: emit the collected text verbatim (no trimming).
                    let text = pending.clone();
                    pending.clear();
                    out.push((Token::Term(text), token_line));
                    mode = State::Whitespace;
                    continue;
                }
                // Newlines are part of the quoted text but still count as lines.
                if ch == '\n' {
                    cursor_line = inc_line(cursor_line)?;
                }
                pending.push(ch);
            }
            State::Whitespace => match ch {
                ' ' | '\t' | '\r' => {}
                '\n' => cursor_line = inc_line(cursor_line)?,
                '#' => mode = State::Comment,
                '=' | '{' | '}' => {
                    let punct = match ch {
                        '=' => Token::Eq,
                        '{' => Token::Open,
                        _ => Token::Close,
                    };
                    out.push((punct, cursor_line));
                }
                '"' => {
                    token_line = cursor_line;
                    mode = State::Quoted;
                }
                first => {
                    // First character of a bare term.
                    token_line = cursor_line;
                    pending.push(first);
                    mode = State::Term;
                }
            },
        }
    }

    // Deal with whatever was still in progress when the input ran out.
    match mode {
        State::Quoted => {
            return Err(crate::error::Error::Parse(format!(
                "line {token_line}: unterminated quoted string"
            )));
        }
        State::Term => {
            let tail = pending.trim();
            if !tail.is_empty() {
                out.push((Token::Term(tail.to_owned()), token_line));
            }
        }
        State::Whitespace | State::Comment => {}
    }

    out.push((Token::Eof, cursor_line));
    Ok(out)
}
fn inc_line(line: u32) -> crate::error::Result<u32> {
line.checked_add(1).ok_or_else(|| {
crate::error::Error::Parse("file exceeds maximum line count (u32::MAX)".to_owned())
})
}
/// Handles one character while the scanner is inside a bare term.
///
/// Ordinary characters are appended to `buf`. A terminator (`\n`, space, tab,
/// `\r`, `#`, `=`, `{` or `}`) ends the term: the buffered text is flushed as a
/// `Token::Term` tagged with `*token_line`, any punctuation token is pushed
/// tagged with `*line`, and `*state` is switched to `State::Comment` (for `#`)
/// or `State::Whitespace` (for everything else). A `\n` also advances `*line`.
///
/// `token_line` is only read here.
///
/// # Errors
///
/// Propagates the error from `inc_line` when a newline would push the line
/// counter past `u32::MAX`; in that case `*state` is left untouched.
fn flush_term_char(
    ch: char,
    buf: &mut String,
    tokens: &mut Vec<(Token, u32)>,
    line: &mut u32,
    token_line: &mut u32,
    state: &mut State,
) -> crate::error::Result<()> {
    // Classify the character: which punctuation token (if any) it produces and
    // which state follows. Anything that is not a terminator just extends the term.
    let (punct, next_state) = match ch {
        '\n' | ' ' | '\t' | '\r' => (None, State::Whitespace),
        '#' => (None, State::Comment),
        '=' => (Some(Token::Eq), State::Whitespace),
        '{' => (Some(Token::Open), State::Whitespace),
        '}' => (Some(Token::Close), State::Whitespace),
        _ => {
            buf.push(ch);
            return Ok(());
        }
    };

    // The term ends here, so it is emitted before the punctuation that ended it.
    flush_buf(buf, tokens, *token_line);
    if let Some(tok) = punct {
        tokens.push((tok, *line));
    }
    if ch == '\n' {
        *line = inc_line(*line)?;
    }
    *state = next_state;
    Ok(())
}
/// Emits the buffered text as a `Token::Term` tagged with `token_line`, then empties `buf`.
///
/// Surrounding whitespace is trimmed first; if nothing remains, no token is
/// pushed. `buf` is cleared in either case.
fn flush_buf(buf: &mut String, tokens: &mut Vec<(Token, u32)>, token_line: u32) {
    match buf.trim() {
        "" => {}
        text => tokens.push((Token::Term(text.to_owned()), token_line)),
    }
    buf.clear();
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `(Token::Term, line)` pair the tokenizer is expected to emit.
    fn term(text: &str, line: u32) -> (Token, u32) {
        (Token::Term(text.into()), line)
    }

    /// Tokenizes `src`, panicking if the tokenizer rejects it.
    fn lex(src: &str) -> Vec<(Token, u32)> {
        tokenize(src).unwrap()
    }

    // A plain `key = value` pair on a single line.
    #[test]
    fn basic_kv() {
        let expected = vec![
            term("key", 1),
            (Token::Eq, 1),
            term("value", 1),
            (Token::Eof, 1),
        ];
        assert_eq!(lex("key = value"), expected);
    }

    // Quoted text keeps its inner whitespace and loses the quotes.
    #[test]
    fn quoted_value() {
        let expected = vec![
            term("key", 1),
            (Token::Eq, 1),
            term("hello world", 1),
            (Token::Eof, 1),
        ];
        assert_eq!(lex(r#"key = "hello world""#), expected);
    }

    // A comment line yields no tokens but still advances the line counter.
    #[test]
    fn comment_stripped() {
        let expected = vec![
            term("key", 2),
            (Token::Eq, 2),
            term("val", 2),
            (Token::Eof, 2),
        ];
        assert_eq!(lex("# comment\nkey = val"), expected);
    }

    // Braces become Open/Close with the enclosed terms in between.
    #[test]
    fn array() {
        let expected = vec![
            term("k", 1),
            (Token::Eq, 1),
            (Token::Open, 1),
            term("1", 1),
            term("2", 1),
            term("3", 1),
            (Token::Close, 1),
            (Token::Eof, 1),
        ];
        assert_eq!(lex("k = { 1 2 3 }"), expected);
    }

    // Every token is tagged with its own line; Eof lands after the trailing newline.
    #[test]
    fn multiline_line_numbers() {
        let mut expected = Vec::new();
        for (idx, (key, value)) in [("a", "1"), ("b", "2"), ("c", "3")].iter().enumerate() {
            let line = idx as u32 + 1;
            expected.push(term(key, line));
            expected.push((Token::Eq, line));
            expected.push(term(value, line));
        }
        expected.push((Token::Eof, 4));
        assert_eq!(lex("a = 1\nb = 2\nc = 3\n"), expected);
    }

    #[test]
    fn line_overflow_returns_error() {
        let bumped = inc_line(u32::MAX);
        assert!(bumped.is_err());
    }

    #[test]
    fn unterminated_quoted_string_reports_line_number() {
        let msg = tokenize("key = \"unterminated").unwrap_err().to_string();
        assert!(
            msg.contains("unterminated quoted string"),
            "expected 'unterminated quoted string' in error: {msg}"
        );
        assert!(
            msg.contains("line 1"),
            "expected line number in error: {msg}"
        );
    }

    // The reported line is where the quote opened, not line 1.
    #[test]
    fn unterminated_quoted_string_multiline_reports_correct_line() {
        let msg = tokenize("a = 1\nb = \"unterminated")
            .unwrap_err()
            .to_string();
        assert!(msg.contains("line 2"), "expected 'line 2' in error: {msg}");
    }
}