use crate::error::{Error, ErrorKind};
/// Syntactic category assigned to each significant line by `classify_line`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineKind {
/// Line whose content is exactly `-` or begins with `- `.
ListItem,
/// Line of the form `key: value` or `key:` (see `try_parse_dict_item`).
DictItem,
/// Line whose content is exactly `>` or begins with `> `.
StringItem,
/// Line whose content is exactly `:` or begins with `: `.
KeyItem,
/// Line whose content begins with `{`.
InlineDict,
/// Line whose content begins with `[`.
InlineList,
/// Line that matches none of the other categories.
Unrecognized,
}
/// One classified input line, as produced by `classify_line` for `Lexer::new`.
#[derive(Debug, Clone)]
pub struct Line {
/// Syntactic category of the line.
pub kind: LineKind,
/// Zero-based index of the line in the input (assigned via `enumerate` in `Lexer::new`).
pub lineno: usize,
/// Number of leading space characters before the content.
pub depth: usize,
/// Dictionary key; `classify_line` sets this only for `LineKind::DictItem`.
pub key: Option<String>,
/// Payload text of the line; `classify_line` leaves it `None` for `LineKind::Unrecognized`.
pub value: Option<String>,
/// The full original line, indentation included.
pub text: String,
}
/// Forward-only cursor over the classified lines of one input document.
#[derive(Debug)]
pub struct Lexer {
// Classified lines in input order; `Lexer::new` does not store blank or comment lines.
lines: Vec<Line>,
// Index into `lines` of the line that `peek` / `next_line` will return next.
pos: usize,
}
impl Lexer {
    /// Splits `input` into lines and classifies every significant one.
    ///
    /// A leading byte-order mark is dropped and `\r\n` / `\r` line endings are
    /// normalised to `\n` before splitting. Line numbers are zero-based.
    ///
    /// Blank lines (empty or whitespace only) and comment lines (first
    /// non-whitespace character is `#`) are skipped *before* the indentation
    /// check: their indentation carries no structure, so a stray tab on such a
    /// line must not abort lexing.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `check_indentation` when the indentation
    /// of a significant line contains a tab or another non-space whitespace
    /// character, or any error returned by `classify_line`.
    pub fn new(input: &str) -> Result<Self, Error> {
        let input = input.strip_prefix('\u{feff}').unwrap_or(input);
        let normalized = input.replace("\r\n", "\n").replace('\r', "\n");

        let mut lines = Vec::new();
        for (lineno, raw) in normalized.split('\n').enumerate() {
            // Decide "blank or comment" on the whitespace-stripped content so
            // that these lines are ignored regardless of how they are indented.
            let content = raw.trim_start();
            if content.is_empty() || content.starts_with('#') {
                continue;
            }

            let depth = count_leading_spaces(raw);
            if let Some(err) = check_indentation(raw, depth, lineno) {
                return Err(err);
            }

            // `depth` counts ASCII spaces only, so it is a valid byte offset.
            lines.push(classify_line(&raw[depth..], depth, lineno, raw)?);
        }

        Ok(Lexer { lines, pos: 0 })
    }

    /// Returns the next line without consuming it, or `None` at end of input.
    pub fn peek(&self) -> Option<&Line> {
        self.lines.get(self.pos)
    }

    /// Returns the next line and advances the cursor, or `None` at end of input.
    pub fn next_line(&mut self) -> Option<&Line> {
        let line = self.lines.get(self.pos)?;
        self.pos += 1;
        Some(line)
    }

    /// Reports whether the next line exists and has exactly this `depth` and `kind`.
    pub fn next_is(&self, depth: usize, kind: LineKind) -> bool {
        matches!(self.peek(), Some(line) if line.depth == depth && line.kind == kind)
    }
}
/// Returns how many ASCII space characters `line` starts with.
///
/// Only `' '` counts; tabs and other whitespace end the run.
fn count_leading_spaces(line: &str) -> usize {
    // A space is one byte, so the length difference equals the space count.
    let without_indent = line.trim_start_matches(' ');
    line.len() - without_indent.len()
}
fn check_indentation(raw: &str, _space_count: usize, lineno: usize) -> Option<Error> {
for (i, ch) in raw.chars().enumerate() {
if ch == ' ' {
continue;
}
if ch == '\t' {
return Some(
Error::new(
ErrorKind::TabInIndentation,
"invalid character in indentation: '\\t'.",
)
.with_lineno(lineno)
.with_colno(i)
.with_line(raw.to_string()),
);
}
if ch.is_whitespace() && ch != ' ' {
let desc = match ch {
'\u{a0}' => "'\\xa0' (NO-BREAK SPACE).".to_string(),
_ => format!("'\\u{:04x}'.", ch as u32),
};
return Some(
Error::new(
ErrorKind::InvalidIndentation,
format!("invalid character in indentation: {}", desc),
)
.with_lineno(lineno)
.with_colno(i)
.with_line(raw.to_string()),
);
}
break;
}
None
}
fn classify_line(rest: &str, depth: usize, lineno: usize, raw: &str) -> Result<Line, Error> {
let text = raw.to_string();
if rest.starts_with("- ") || rest == "-" {
let value = if rest == "-" {
String::new()
} else {
rest[2..].to_string()
};
return Ok(Line {
kind: LineKind::ListItem,
lineno,
depth,
key: None,
value: Some(value),
text,
});
}
if rest.starts_with("> ") || rest == ">" {
let value = if rest == ">" {
String::new()
} else {
rest[2..].to_string()
};
return Ok(Line {
kind: LineKind::StringItem,
lineno,
depth,
key: None,
value: Some(value),
text,
});
}
if rest.starts_with(": ") || rest == ":" {
let value = if rest == ":" {
String::new()
} else {
rest[2..].to_string()
};
return Ok(Line {
kind: LineKind::KeyItem,
lineno,
depth,
key: None,
value: Some(value),
text,
});
}
if rest.starts_with('[') {
return Ok(Line {
kind: LineKind::InlineList,
lineno,
depth,
key: None,
value: Some(rest.to_string()),
text,
});
}
if rest.starts_with('{') {
return Ok(Line {
kind: LineKind::InlineDict,
lineno,
depth,
key: None,
value: Some(rest.to_string()),
text,
});
}
if let Some((key, value)) = try_parse_dict_item(rest) {
return Ok(Line {
kind: LineKind::DictItem,
lineno,
depth,
key: Some(key),
value: Some(value),
text,
});
}
Ok(Line {
kind: LineKind::Unrecognized,
lineno,
depth,
key: None,
value: None,
text,
})
}
/// Tries to read `rest` as a dictionary item and returns `(key, value)`.
///
/// Two shapes are accepted:
///
/// * `key: value` — split at the first `": "`; the value is everything after
///   it, unmodified.
/// * `key:` — a trailing colon with nothing after it; the value is empty.
///
/// Whitespace between the key and the colon is removed. Returns `None` when
/// neither shape matches or when the key is empty after trimming, so a
/// whitespace-only key is rejected in both shapes.
fn try_parse_dict_item(rest: &str) -> Option<(String, String)> {
    let (key, value) = match rest.split_once(": ") {
        Some(parts) => parts,
        // No `": "` anywhere: only a colon at the very end can end the key.
        None => (rest.strip_suffix(':')?, ""),
    };

    // Trim before the emptiness check so both shapes treat a blank key alike.
    let key = key.trim_end();
    if key.is_empty() {
        return None;
    }
    Some((key.to_string(), value.to_string()))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Blank lines and comments (indented or not) produce no `Line`.
    #[test]
    fn test_blank_and_comments_skipped() {
        let input = "# comment\n\n # indented comment\n \nkey: value";
        let lexer = Lexer::new(input).unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::DictItem);
    }

    /// `- text` and a bare `-` are both list items.
    #[test]
    fn test_list_item() {
        let lexer = Lexer::new("- hello\n-").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `> text` and a bare `>` are both string items.
    #[test]
    fn test_string_item() {
        let lexer = Lexer::new("> hello\n>").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `: text` and a bare `:` are both key items.
    #[test]
    fn test_key_item() {
        let lexer = Lexer::new(": hello\n:").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::KeyItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::KeyItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `key: value` and `key:` are dict items with the expected key and value.
    #[test]
    fn test_dict_item() {
        let lexer = Lexer::new("name: John\nage:").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::DictItem);
        assert_eq!(lexer.lines[0].key.as_deref(), Some("name"));
        assert_eq!(lexer.lines[0].value.as_deref(), Some("John"));
        assert_eq!(lexer.lines[1].kind, LineKind::DictItem);
        assert_eq!(lexer.lines[1].key.as_deref(), Some("age"));
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// A line starting with `[` is kept whole as an inline list.
    #[test]
    fn test_inline_list() {
        let lexer = Lexer::new("[a, b, c]").unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::InlineList);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("[a, b, c]"));
    }

    /// A line starting with `{` is kept whole as an inline dict.
    #[test]
    fn test_inline_dict() {
        let lexer = Lexer::new("{k: v}").unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::InlineDict);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("{k: v}"));
    }

    /// `depth` is the number of leading spaces; the fixture indents by two.
    #[test]
    fn test_indentation() {
        let lexer = Lexer::new("key:\n  - item1\n  - item2").unwrap();
        assert_eq!(lexer.lines[0].depth, 0);
        assert_eq!(lexer.lines[1].depth, 2);
        assert_eq!(lexer.lines[2].depth, 2);
    }

    /// A tab in the indentation of a content line is rejected.
    #[test]
    fn test_tab_in_indentation() {
        let result = Lexer::new("\tkey: value");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind, ErrorKind::TabInIndentation);
    }

    /// Non-space, non-tab whitespace in the indentation is rejected as well.
    #[test]
    fn test_nbsp_in_indentation() {
        let result = Lexer::new("\u{a0}key: value");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind, ErrorKind::InvalidIndentation);
    }

    /// Tagged lines win over the dict-item rule even if they contain `": "`.
    #[test]
    fn test_dict_key_no_tag_start() {
        let lexer = Lexer::new("- not a dict\n> not a dict\n: not a dict").unwrap();
        assert_eq!(lexer.lines[0].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[1].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[2].kind, LineKind::KeyItem);
    }

    /// Text that matches no rule is reported as unrecognized, not as an error.
    #[test]
    fn test_unrecognized() {
        let lexer = Lexer::new("hello world").unwrap();
        assert_eq!(lexer.lines[0].kind, LineKind::Unrecognized);
    }

    /// A BOM is dropped and `\r\n` / `\r` both terminate a line.
    #[test]
    fn test_bom_and_line_endings() {
        let lexer = Lexer::new("\u{feff}a: 1\r\nb: 2\rc: 3").unwrap();
        assert_eq!(lexer.lines.len(), 3);
        assert_eq!(lexer.lines[0].key.as_deref(), Some("a"));
        assert_eq!(lexer.lines[1].value.as_deref(), Some("2"));
        assert_eq!(lexer.lines[2].lineno, 2);
    }

    /// `peek` does not advance, `next_line` does, and both end with `None`.
    #[test]
    fn test_cursor() {
        let mut lexer = Lexer::new("a: 1\n- b").unwrap();
        assert!(lexer.next_is(0, LineKind::DictItem));
        assert!(!lexer.next_is(0, LineKind::ListItem));
        assert!(!lexer.next_is(2, LineKind::DictItem));
        assert_eq!(lexer.peek().map(|l| l.lineno), Some(0));
        assert_eq!(lexer.next_line().map(|l| l.kind), Some(LineKind::DictItem));
        assert_eq!(lexer.next_line().map(|l| l.kind), Some(LineKind::ListItem));
        assert!(lexer.next_line().is_none());
        assert!(lexer.peek().is_none());
    }
}