/// A whitespace or formatting problem noticed while lexing.
///
/// Values of this type are collected by [`lex_with_validation_config`] next to
/// the token stream; the offending text is still tokenized.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WhitespaceError {
/// Human-readable description of the problem.
pub message: String,
/// Byte offsets into the lexed input that the problem applies to.
pub range: std::ops::Range<usize>,
/// Machine-readable classification of the problem.
pub category: WhitespaceErrorCategory,
}
/// Classification of a [`WhitespaceError`].
///
/// The enum carries no data, so it is `Copy` and `Hash` in addition to the
/// comparison traits; callers can pass it by value and use it as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WhitespaceErrorCategory {
    /// A tab character appears in the leading whitespace of a line.
    TabIndentation,
    /// A line is longer than the configured maximum.
    LineTooLong,
    /// A line ending differs from the first line ending seen in the input.
    MixedLineEndings,
    /// Indentation is otherwise invalid.
    ///
    /// NOTE(review): not produced by the lexer in this file; presumably
    /// reported by a later stage — confirm.
    InvalidIndentation,
}
/// Every token and node kind used for the YAML syntax tree.
///
/// The enum is `repr(u16)` and the `From<SyntaxKind> for rowan::SyntaxKind`
/// impl forwards the discriminant unchanged, so the order of the variants
/// determines the raw kind values.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u16)]
// Variants use SCREAMING_SNAKE_CASE names, which these lints would otherwise flag.
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
pub enum SyntaxKind {
// Not emitted by the lexer in this file; presumably composite node kinds
// assigned by the parser — confirm against the parser.
ROOT = 0,
DOCUMENT,
SEQUENCE,
MAPPING,
SCALAR,
ALIAS,
TAGGED_NODE,
ERROR,
// Indicator kinds. The lexer in this file emits DASH, PLUS, COLON, QUESTION,
// the bracket/brace kinds, COMMA, PIPE, GREATER, AMPERSAND and ASTERISK;
// EXCLAMATION, PERCENT, AT, BACKTICK, QUOTE and SINGLE_QUOTE are not emitted here.
DASH,
PLUS,
COLON,
QUESTION,
LEFT_BRACKET,
RIGHT_BRACKET,
LEFT_BRACE,
RIGHT_BRACE,
COMMA,
PIPE,
GREATER,
AMPERSAND,
ASTERISK,
EXCLAMATION,
PERCENT,
AT,
BACKTICK,
QUOTE,
SINGLE_QUOTE,
// Document markers: `---` and `...`.
DOC_START,
DOC_END,
// Not emitted by the lexer in this file; presumably node kinds used by the parser.
KEY,
VALUE,
MAPPING_ENTRY,
SEQUENCE_ENTRY,
// Scalar tokens. Quoted text is STRING (or UNTERMINATED_STRING when the closing
// quote is missing); unquoted text is classified by `classify_scalar`.
STRING,
UNTERMINATED_STRING,
INT,
FLOAT,
BOOL,
NULL,
// `!tag`, `&anchor`, `*reference`, `<<` and `%directive` tokens.
TAG,
ANCHOR,
REFERENCE,
MERGE_KEY,
DIRECTIVE,
// Trivia: inline spaces/tabs, line endings, leading spaces/tabs of a line,
// `#` comments and a leading byte-order mark.
WHITESPACE,
NEWLINE,
INDENT,
COMMENT,
BOM,
// Not emitted by the lexer in this file.
EOF,
}
/// Maps a [`SyntaxKind`] onto rowan's raw kind by reusing its `u16` discriminant.
impl From<SyntaxKind> for rowan::SyntaxKind {
    fn from(kind: SyntaxKind) -> Self {
        let raw = kind as u16;
        rowan::SyntaxKind(raw)
    }
}
/// Consumes a run of plain-scalar characters and returns it as a slice of `input`.
///
/// Characters are taken from `chars` until the next one is whitespace or is a
/// YAML special character not listed in `exclude_chars`; that terminating
/// character is left unconsumed. The returned slice starts at byte offset
/// `start_idx` and is empty when nothing was consumed.
fn read_scalar_from<'a>(
    chars: &mut std::iter::Peekable<std::str::CharIndices<'a>>,
    input: &'a str,
    start_idx: usize,
    exclude_chars: &str,
) -> &'a str {
    let mut stop = start_idx;
    // `next_if` only advances the iterator while the predicate accepts the
    // peeked character, so the terminator stays available to the caller.
    while let Some((pos, c)) = chars
        .next_if(|&(_, c)| !c.is_whitespace() && !is_yaml_special_except(c, exclude_chars))
    {
        stop = pos + c.len_utf8();
    }
    &input[start_idx..stop]
}
/// Tokenizes `input`, discarding any whitespace diagnostics.
///
/// This is [`lex_with_validation`] with only the token half of its result.
pub fn lex(input: &str) -> Vec<(SyntaxKind, &str)> {
    lex_with_validation(input).0
}
/// Settings for the whitespace checks performed while lexing.
///
/// `Default` enables both checks: a maximum line length of 120 and consistent
/// line endings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationConfig {
    /// Longest line allowed before a `LineTooLong` diagnostic is reported;
    /// `None` turns the check off.
    pub max_line_length: Option<usize>,
    /// When `true`, a line ending that differs from the first one seen in the
    /// input is reported as `MixedLineEndings`.
    pub enforce_consistent_line_endings: bool,
}
impl Default for ValidationConfig {
fn default() -> Self {
Self {
max_line_length: Some(120), enforce_consistent_line_endings: true,
}
}
}
/// Tokenizes `input` and reports whitespace diagnostics using the default
/// [`ValidationConfig`].
pub fn lex_with_validation(input: &str) -> (Vec<(SyntaxKind, &str)>, Vec<WhitespaceError>) {
    let defaults = ValidationConfig::default();
    lex_with_validation_config(input, &defaults)
}
/// Tokenizes `input` and collects whitespace diagnostics according to `config`.
///
/// Returns the tokens in source order, each paired with its text, together
/// with the [`WhitespaceError`]s found along the way. Diagnostics never stop
/// lexing.
///
/// Checks performed:
/// - lines longer than `config.max_line_length` (when it is `Some`);
/// - a line ending that differs from the first one seen (when
///   `config.enforce_consistent_line_endings` is set);
/// - tabs in the leading whitespace of a line, reported regardless of `config`.
///
/// NOTE(review): line lengths are differences of byte offsets, although the
/// diagnostic message says "characters"; the two differ for non-ASCII lines.
pub fn lex_with_validation_config<'a>(
input: &'a str,
config: &ValidationConfig,
) -> (Vec<(SyntaxKind, &'a str)>, Vec<WhitespaceError>) {
use SyntaxKind::*;
// Pre-size the token vector at one slot per 8 input bytes.
let mut tokens = Vec::with_capacity(input.len() / 8); let mut chars = input.char_indices().peekable();
let mut whitespace_errors = Vec::new();
let bytes = input.as_bytes();
// Byte offset at which the current line starts; used for the line-length check.
let mut current_line_start = 0;
// First line-ending style seen ("\n", "\r\n" or "\r"); later ones are compared to it.
let mut detected_line_ending: Option<&str> = None;
// Nesting depth of `[`/`{` flow collections; it changes how `:`, `%` and
// flow indicators inside plain scalars are lexed.
let mut flow_depth: u32 = 0;
// A byte-order mark at offset 0 becomes its own token.
if let Some((0, '\u{FEFF}')) = chars.peek() {
chars.next(); tokens.push((BOM, "\u{FEFF}"));
}
while let Some((start_idx, ch)) = chars.next() {
let token_start = start_idx;
match ch {
'-' => {
// `---` is a document start; `--` without a third dash yields two DASH tokens.
if let Some((_, '-')) = chars.peek() {
chars.next(); if let Some((_, '-')) = chars.peek() {
chars.next(); tokens.push((DOC_START, &input[token_start..start_idx + 3]));
} else {
tokens.push((DASH, &input[token_start..start_idx + 1]));
tokens.push((DASH, &input[start_idx + 1..start_idx + 2]));
}
} else {
// A lone dash is a sequence marker only when it is followed by whitespace
// or end of input AND either nothing but spaces/tabs precedes it on the
// line or the previous non-whitespace token was `?` or `:`.
let line_start_pos = input[..token_start]
.rfind(['\n', '\r'])
.map(|pos| pos + 1)
.unwrap_or(0);
let before_dash = &input[line_start_pos..token_start];
let only_whitespace_before = before_dash.chars().all(|c| c == ' ' || c == '\t');
let after_value_indicator = tokens
.iter()
.rev()
.find(|(kind, _)| !matches!(kind, WHITESPACE | INDENT))
.is_some_and(|(kind, _)| matches!(kind, QUESTION | COLON));
let followed_by_whitespace_or_end = chars
.peek()
.map_or(true, |(_, next_ch)| next_ch.is_whitespace());
let is_sequence_marker = (only_whitespace_before || after_value_indicator)
&& followed_by_whitespace_or_end;
if is_sequence_marker {
tokens.push((DASH, &input[token_start..start_idx + 1]));
} else {
// Otherwise the dash starts a plain scalar (e.g. `-40`, `-value`);
// further dashes are allowed inside it.
let text = read_scalar_from(&mut chars, input, start_idx + 1, "-");
let full_text = &input[token_start..token_start + 1 + text.len()];
let token_kind = classify_scalar(full_text);
tokens.push((token_kind, full_text));
}
}
}
'+' => tokens.push((PLUS, &input[token_start..start_idx + 1])),
':' => {
// Inside a flow collection `:` is always an indicator. Elsewhere it is an
// indicator only when followed by whitespace or end of input; otherwise it
// begins a plain scalar that may contain further colons.
if flow_depth > 0 {
tokens.push((COLON, &input[token_start..start_idx + 1]));
} else if let Some((_, next_ch)) = chars.peek() {
if next_ch.is_whitespace() {
tokens.push((COLON, &input[token_start..start_idx + 1]));
} else {
let mut end_idx = start_idx + 1;
while let Some((idx, next_ch)) = chars.peek() {
if next_ch.is_whitespace() {
break;
}
if is_yaml_special_except(*next_ch, ":") {
break;
}
end_idx = *idx + next_ch.len_utf8();
chars.next();
}
let text = &input[token_start..end_idx];
tokens.push((classify_scalar(text), text));
}
} else {
tokens.push((COLON, &input[token_start..start_idx + 1]));
}
}
'?' => tokens.push((QUESTION, &input[token_start..start_idx + 1])),
// Brackets and braces track the flow nesting depth; closing saturates at
// zero so unbalanced closers cannot underflow.
'[' => {
flow_depth += 1;
tokens.push((LEFT_BRACKET, &input[token_start..start_idx + 1]));
}
']' => {
flow_depth = flow_depth.saturating_sub(1);
tokens.push((RIGHT_BRACKET, &input[token_start..start_idx + 1]));
}
'{' => {
flow_depth += 1;
tokens.push((LEFT_BRACE, &input[token_start..start_idx + 1]));
}
'}' => {
flow_depth = flow_depth.saturating_sub(1);
tokens.push((RIGHT_BRACE, &input[token_start..start_idx + 1]));
}
',' => tokens.push((COMMA, &input[token_start..start_idx + 1])),
'|' => tokens.push((PIPE, &input[token_start..start_idx + 1])),
'>' => tokens.push((GREATER, &input[token_start..start_idx + 1])),
'<' => {
// `<<` is the merge key; a single `<` starts an ordinary plain scalar.
if let Some((_, '<')) = chars.peek() {
chars.next(); tokens.push((MERGE_KEY, &input[token_start..start_idx + 2]));
} else {
let mut end_idx = start_idx + 1;
while let Some((idx, ch)) = chars.peek() {
if ch.is_whitespace() || is_yaml_special(*ch) {
break;
}
end_idx = *idx + ch.len_utf8();
chars.next();
}
let text = &input[token_start..end_idx];
let token_kind = classify_scalar(text);
tokens.push((token_kind, text));
}
}
'&' => {
// `&name` is a single ANCHOR token; a bare `&` is AMPERSAND.
let name = read_scalar_from(&mut chars, input, start_idx + 1, "");
if !name.is_empty() {
tokens.push((ANCHOR, &input[token_start..start_idx + 1 + name.len()]));
} else {
tokens.push((AMPERSAND, &input[token_start..start_idx + 1]));
}
}
'*' => {
// `*name` is a single REFERENCE token; a bare `*` is ASTERISK.
let name = read_scalar_from(&mut chars, input, start_idx + 1, "");
if !name.is_empty() {
tokens.push((REFERENCE, &input[token_start..start_idx + 1 + name.len()]));
} else {
tokens.push((ASTERISK, &input[token_start..start_idx + 1]));
}
}
'"' => {
// Double-quoted string: a backslash makes the next character literal, so an
// escaped quote does not close the string. The token includes both quotes.
let mut end_idx = start_idx + 1;
let mut escaped = false;
let mut found_closing = false;
while let Some((idx, ch)) = chars.peek() {
let current_idx = *idx;
let current_ch = *ch;
if escaped {
escaped = false;
end_idx = current_idx + current_ch.len_utf8();
chars.next();
continue;
}
if current_ch == '\\' {
escaped = true;
end_idx = current_idx + current_ch.len_utf8();
chars.next();
} else if current_ch == '"' {
end_idx = current_idx + current_ch.len_utf8();
chars.next();
found_closing = true;
break;
} else {
end_idx = current_idx + current_ch.len_utf8();
chars.next();
}
}
// Reaching end of input without a closing quote yields UNTERMINATED_STRING.
if found_closing {
tokens.push((STRING, &input[token_start..end_idx]));
} else {
tokens.push((UNTERMINATED_STRING, &input[token_start..end_idx]));
}
}
'\'' => {
// Single-quoted string: a doubled `''` is an escaped quote and does not
// close the string; a quote not followed by another quote closes it.
let mut end_idx = start_idx + 1;
let mut found_closing = false;
while let Some((idx, ch)) = chars.peek() {
let current_idx = *idx;
let current_ch = *ch;
if current_ch == '\'' {
end_idx = current_idx + current_ch.len_utf8();
chars.next();
if let Some((next_idx, '\'')) = chars.peek() {
end_idx = *next_idx + 1;
chars.next();
} else {
found_closing = true;
break;
}
} else {
end_idx = current_idx + current_ch.len_utf8();
chars.next();
}
}
if found_closing {
tokens.push((STRING, &input[token_start..end_idx]));
} else {
tokens.push((UNTERMINATED_STRING, &input[token_start..end_idx]));
}
}
'.' => {
// `...` is a document end; any other text starting with `.` or `..` is a
// plain scalar (which lets values such as `.inf` reach `classify_scalar`).
if chars.peek() == Some(&(start_idx + 1, '.')) {
chars.next(); if chars.peek() == Some(&(start_idx + 2, '.')) {
chars.next(); tokens.push((DOC_END, &input[token_start..start_idx + 3]));
} else {
let rest = read_scalar_from(&mut chars, input, start_idx + 2, "");
let text = &input[token_start..start_idx + 2 + rest.len()];
let token_kind = classify_scalar(text);
tokens.push((token_kind, text));
}
} else {
let rest = read_scalar_from(&mut chars, input, start_idx + 1, "");
let text = &input[token_start..start_idx + 1 + rest.len()];
let token_kind = classify_scalar(text);
tokens.push((token_kind, text));
}
}
'#' => {
// A comment runs to the end of the line; the line ending is not part of it.
let mut end_idx = start_idx + 1;
while let Some((idx, ch)) = chars.peek() {
if *ch == '\n' || *ch == '\r' {
break;
}
end_idx = *idx + ch.len_utf8();
chars.next();
}
tokens.push((COMMENT, &input[token_start..end_idx]));
}
'!' => {
// A tag is `!` or `!!` followed by everything up to the next whitespace or
// YAML special character.
let mut end_idx = start_idx + 1;
if let Some((_, '!')) = chars.peek() {
chars.next(); end_idx = start_idx + 2;
}
while let Some((idx, ch)) = chars.peek() {
if ch.is_whitespace() || is_yaml_special(*ch) {
break;
}
end_idx = *idx + ch.len_utf8();
chars.next();
}
tokens.push((TAG, &input[token_start..end_idx]));
}
'%' => {
// Inside a flow collection `%` starts a plain scalar; elsewhere the rest of
// the line is taken as a DIRECTIVE token.
if flow_depth > 0 {
let mut end_idx = start_idx + 1;
while let Some((idx, next_ch)) = chars.peek() {
if next_ch.is_whitespace() {
break;
}
if is_yaml_special_except(*next_ch, "%") {
break;
}
end_idx = *idx + next_ch.len_utf8();
chars.next();
}
let text = &input[token_start..end_idx];
tokens.push((classify_scalar(text), text));
} else {
let mut end_idx = start_idx + 1;
while let Some((idx, ch)) = chars.peek() {
if *ch == '\n' || *ch == '\r' {
break;
}
end_idx = *idx + ch.len_utf8();
chars.next();
}
tokens.push((DIRECTIVE, &input[token_start..end_idx]));
}
}
'\n' => {
// End of a line: check the length of the line just finished (byte offsets).
if let Some(max_len) = config.max_line_length {
let line_length = start_idx - current_line_start;
if line_length > max_len {
whitespace_errors.push(WhitespaceError {
message: format!(
"Line too long ({} > {} characters)",
line_length, max_len
),
range: current_line_start..start_idx,
category: WhitespaceErrorCategory::LineTooLong,
});
}
}
let line_ending = "\n";
// The first line ending seen sets the expected style; each later ending
// that differs from it is reported.
if config.enforce_consistent_line_endings {
if let Some(detected) = detected_line_ending {
if detected != line_ending {
whitespace_errors.push(WhitespaceError {
message: "Inconsistent line endings detected".to_string(),
range: token_start..start_idx + 1,
category: WhitespaceErrorCategory::MixedLineEndings,
});
}
} else {
detected_line_ending = Some(line_ending);
}
}
tokens.push((NEWLINE, &input[token_start..start_idx + 1]));
current_line_start = start_idx + 1;
}
'\r' => {
// Same checks as for '\n'; the line length excludes the line ending itself.
if let Some(max_len) = config.max_line_length {
let line_length = start_idx - current_line_start;
if line_length > max_len {
whitespace_errors.push(WhitespaceError {
message: format!(
"Line too long ({} > {} characters)",
line_length, max_len
),
range: current_line_start..start_idx,
category: WhitespaceErrorCategory::LineTooLong,
});
}
}
// "\r\n" is consumed as a single NEWLINE token; a lone "\r" is its own ending.
let (line_ending, end_pos) = if let Some((_, '\n')) = chars.peek() {
chars.next();
("\r\n", start_idx + 2)
} else {
("\r", start_idx + 1)
};
if config.enforce_consistent_line_endings {
if let Some(detected) = detected_line_ending {
if detected != line_ending {
whitespace_errors.push(WhitespaceError {
message: "Inconsistent line endings detected".to_string(),
range: token_start..end_pos,
category: WhitespaceErrorCategory::MixedLineEndings,
});
}
} else {
detected_line_ending = Some(line_ending);
}
}
tokens.push((NEWLINE, &input[token_start..end_pos]));
current_line_start = end_pos;
}
' ' | '\t' => {
// Collapse a run of spaces/tabs into one token, remembering whether it
// contains a tab.
let mut end_idx = start_idx + 1;
let mut has_tabs = ch == '\t';
while let Some((idx, ch)) = chars.peek() {
if *ch != ' ' && *ch != '\t' {
break;
}
if *ch == '\t' {
has_tabs = true;
}
end_idx = *idx + 1;
chars.next();
}
// The run is indentation when it starts the input or directly follows a
// '\n' or '\r' byte; otherwise it is inline whitespace.
let is_indentation = token_start == 0
|| (token_start > 0
&& (bytes[token_start - 1] == b'\n' || bytes[token_start - 1] == b'\r'));
if is_indentation {
// This check does not consult `config`, so it is always active.
if has_tabs {
whitespace_errors.push(WhitespaceError {
message: "Tab character used for indentation (forbidden in YAML)"
.to_string(),
range: token_start..end_idx,
category: WhitespaceErrorCategory::TabIndentation,
});
}
tokens.push((INDENT, &input[token_start..end_idx]));
} else {
tokens.push((WHITESPACE, &input[token_start..end_idx]));
}
}
_ => {
// Anything else starts a plain scalar that extends to the next whitespace
// or terminating indicator.
let mut end_idx = start_idx + ch.len_utf8();
while let Some((idx, next_ch)) = chars.peek() {
if next_ch.is_whitespace() {
break;
}
// A colon ends the scalar only when it is the last character of the input
// or is followed by whitespace; otherwise it is part of the scalar
// (e.g. `http://host:8080`).
if *next_ch == ':' {
let next_idx = *idx + next_ch.len_utf8();
if next_idx >= input.len() {
break;
} else if let Some(after) = input[next_idx..].chars().next() {
if after.is_whitespace() {
break;
}
}
end_idx = *idx + next_ch.len_utf8();
chars.next();
continue;
}
// Other special characters end the scalar, except that outside a flow
// collection the flow indicators `[ ] { } ,` are kept as scalar text.
if is_yaml_special_except(*next_ch, "-:") {
if flow_depth == 0 && matches!(*next_ch, '[' | ']' | '{' | '}' | ',') {
} else {
break;
}
}
// NOTE(review): `before_hyphen` runs from the line start up to the hyphen and
// so includes this scalar's own leading character, which is never a space or
// tab in this arm; the break below therefore appears unreachable — confirm.
if *next_ch == '-' {
let line_start = input[..(*idx)].rfind('\n').map(|p| p + 1).unwrap_or(0);
let before_hyphen = &input[line_start..*idx];
if before_hyphen.chars().all(|c| c == ' ' || c == '\t') && *idx == end_idx {
break;
}
}
end_idx = *idx + next_ch.len_utf8();
chars.next();
}
let text = &input[token_start..end_idx];
tokens.push((classify_scalar(text), text));
}
}
}
// The last line has no terminating line ending to trigger the length check,
// so it is checked here.
if let Some(max_len) = config.max_line_length {
let final_line_length = input.len() - current_line_start;
if final_line_length > max_len && final_line_length > 0 {
whitespace_errors.push(WhitespaceError {
message: format!(
"Line too long ({} > {} characters)",
final_line_length, max_len
),
range: current_line_start..input.len(),
category: WhitespaceErrorCategory::LineTooLong,
});
}
}
(tokens, whitespace_errors)
}
/// Decides which scalar token kind an unquoted piece of text represents.
///
/// Checks run in this order:
/// 1. the boolean and null keywords → `BOOL` / `NULL`;
/// 2. anything `ScalarValue::parse_integer` accepts → `INT`;
/// 3. the YAML spellings of infinity and not-a-number (`.inf`, `-.INF`,
///    `.nan`, ...) → `FLOAT`;
/// 4. bare `inf` / `infinity` / `nan` words, optionally signed, in any letter
///    case → `STRING`;
/// 5. anything `f64::from_str` accepts → `FLOAT`;
/// 6. everything else → `STRING`.
fn classify_scalar(text: &str) -> SyntaxKind {
    use SyntaxKind::*;

    match text {
        "true" | "false" | "True" | "False" | "TRUE" | "FALSE" => return BOOL,
        "null" | "Null" | "NULL" | "~" => return NULL,
        _ => {}
    }

    if crate::scalar::ScalarValue::parse_integer(text).is_some() {
        return INT;
    }

    if matches!(
        text,
        ".inf"
            | ".Inf"
            | ".INF"
            | "+.inf"
            | "+.Inf"
            | "+.INF"
            | "-.inf"
            | "-.Inf"
            | "-.INF"
            | ".nan"
            | ".NaN"
            | ".NAN"
    ) {
        return FLOAT;
    }

    // `f64::from_str` accepts "inf", "infinity" and "nan" case-insensitively
    // and with an optional sign, but YAML only treats the dotted forms above
    // as floats. Reject every such spelling here (not just a fixed list of
    // casings) so that e.g. "iNf" or "-nan" stay plain strings.
    let unsigned = text.strip_prefix(['+', '-']).unwrap_or(text);
    let is_bare_float_word = unsigned.eq_ignore_ascii_case("inf")
        || unsigned.eq_ignore_ascii_case("infinity")
        || unsigned.eq_ignore_ascii_case("nan");
    if is_bare_float_word {
        return STRING;
    }

    if text.parse::<f64>().is_ok() {
        FLOAT
    } else {
        STRING
    }
}
/// Characters the lexer treats as YAML indicators; by default they terminate
/// a plain scalar (callers can exempt some via `is_yaml_special_except`).
const YAML_SPECIAL_CHARS: &str = ":+-?[]{},'|>&*!%\"#";
/// Returns `true` when `ch` is one of the characters in `YAML_SPECIAL_CHARS`.
fn is_yaml_special(ch: char) -> bool {
    YAML_SPECIAL_CHARS.chars().any(|special| special == ch)
}
/// Returns `true` when `ch` is in `YAML_SPECIAL_CHARS` and is not exempted by
/// appearing in `exclude`.
fn is_yaml_special_except(ch: char, exclude: &str) -> bool {
    if exclude.contains(ch) {
        return false;
    }
    YAML_SPECIAL_CHARS.contains(ch)
}
// Unit tests for the lexer: token kinds/text produced by `lex` and the
// diagnostics produced by the `lex_with_validation*` functions.
#[cfg(test)]
mod tests {
use super::*;
// `key: value` lexes to exactly STRING, COLON, WHITESPACE, STRING.
#[test]
fn test_simple_mapping() {
let input = "key: value";
let tokens = lex(input);
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[3], (SyntaxKind::STRING, "value"));
}
// Plain values are classified as INT, FLOAT, BOOL or NULL where applicable.
#[test]
fn test_scalar_types() {
let tokens = lex("age: 42");
assert_eq!(tokens[0], (SyntaxKind::STRING, "age"));
assert_eq!(tokens[3], (SyntaxKind::INT, "42"));
let tokens = lex("pi: 3.14");
assert_eq!(tokens[0], (SyntaxKind::STRING, "pi"));
assert_eq!(tokens[3], (SyntaxKind::FLOAT, "3.14"));
let tokens = lex("enabled: true");
assert_eq!(tokens[0], (SyntaxKind::STRING, "enabled"));
assert_eq!(tokens[3], (SyntaxKind::BOOL, "true"));
let tokens = lex("disabled: false");
assert_eq!(tokens[3], (SyntaxKind::BOOL, "false"));
let tokens = lex("value: null");
assert_eq!(tokens[3], (SyntaxKind::NULL, "null"));
let tokens = lex("value: ~");
assert_eq!(tokens[3], (SyntaxKind::NULL, "~"));
}
// A dash at the start of a line followed by a space is a DASH token.
#[test]
fn test_sequences() {
let input = "- item1\n- item2";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[2], (SyntaxKind::STRING, "item1"));
assert_eq!(tokens[3], (SyntaxKind::NEWLINE, "\n"));
assert_eq!(tokens[4], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[5], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[6], (SyntaxKind::STRING, "item2"));
}
// A hyphen inside a value stays part of a single STRING token.
#[test]
fn test_hyphen_in_scalars() {
let input = "Name: example-project";
let tokens = lex(input);
println!("Hyphen test tokens:");
for (i, (kind, text)) in tokens.iter().enumerate() {
println!(" {}: {:?} = {:?}", i, kind, text);
}
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], (SyntaxKind::STRING, "Name"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[3], (SyntaxKind::STRING, "example-project"));
}
// The leading dash of a sequence item is a DASH; hyphens inside keys and
// values are not.
#[test]
fn test_hyphen_sequence_vs_scalar() {
let sequence_input = "- example-item";
let tokens = lex(sequence_input);
println!("Sequence hyphen tokens:");
for (i, (kind, text)) in tokens.iter().enumerate() {
println!(" {}: {:?} = {:?}", i, kind, text);
}
assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[2], (SyntaxKind::STRING, "example-item"));
let scalar_input = "package-name: my-awesome-package";
let tokens = lex(scalar_input);
println!("Package hyphen tokens:");
for (i, (kind, text)) in tokens.iter().enumerate() {
println!(" {}: {:?} = {:?}", i, kind, text);
}
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], (SyntaxKind::STRING, "package-name"));
assert_eq!(tokens[3], (SyntaxKind::STRING, "my-awesome-package"));
}
// Flow sequences and mappings produce bracket/brace, comma and colon tokens.
#[test]
fn test_flow_style() {
let tokens = lex("[1, 2, 3]");
assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACKET, "["));
assert_eq!(tokens[1], (SyntaxKind::INT, "1"));
assert_eq!(tokens[2], (SyntaxKind::COMMA, ","));
assert_eq!(tokens[3], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[4], (SyntaxKind::INT, "2"));
assert_eq!(tokens[5], (SyntaxKind::COMMA, ","));
assert_eq!(tokens[6], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[7], (SyntaxKind::INT, "3"));
assert_eq!(tokens[8], (SyntaxKind::RIGHT_BRACKET, "]"));
let tokens = lex("{a: 1, b: 2}");
assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACE, "{"));
assert_eq!(tokens[1], (SyntaxKind::STRING, "a"));
assert_eq!(tokens[2], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[3], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[4], (SyntaxKind::INT, "1"));
}
// Trailing and full-line comments each become one COMMENT token without the
// line ending.
#[test]
fn test_comments() {
let input = "key: value # this is a comment\n# full line comment";
let tokens = lex(input);
let comments: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::COMMENT)
.collect();
assert_eq!(comments.len(), 2);
assert_eq!(comments[0].1, "# this is a comment");
assert_eq!(comments[1].1, "# full line comment");
}
// Leading whitespace on a continuation line is a single INDENT token.
#[test]
fn test_multiline_scalar() {
let input = "key: value\n continued";
let tokens = lex(input);
let indents: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::INDENT)
.collect();
assert_eq!(indents.len(), 1);
assert_eq!(indents[0].1, " ");
}
// Single- and double-quoted strings are STRING tokens that keep their quotes.
#[test]
fn test_quoted_strings() {
let input = r#"single: 'quoted'
double: "quoted""#;
let tokens = lex(input);
let quoted_strings: Vec<_> = tokens
.iter()
.filter(|(kind, text)| {
*kind == SyntaxKind::STRING && (text.starts_with('\'') || text.starts_with('"'))
})
.collect();
assert_eq!(quoted_strings.len(), 2);
let quoted_texts: Vec<&str> = {
let mut v: Vec<&str> = quoted_strings.iter().map(|(_, t)| *t).collect();
v.sort();
v
};
assert_eq!(quoted_texts, ["\"quoted\"", "'quoted'"]);
}
// `---` and `...` are recognised as DOC_START and DOC_END.
#[test]
fn test_document_markers() {
let input = "---\nkey: value\n...";
let tokens = lex(input);
println!("Document tokens:");
for (i, (kind, text)) in tokens.iter().enumerate() {
println!(" {}: {:?} = {:?}", i, kind, text);
}
let doc_start_count = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::DOC_START)
.count();
let doc_end_count = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::DOC_END)
.count();
assert_eq!(doc_start_count, 1);
assert_eq!(doc_end_count, 1);
}
// Empty input yields no tokens.
#[test]
fn test_empty_input() {
let input = "";
let tokens = lex(input);
println!("Empty input tokens: {:?}", tokens);
assert_eq!(tokens.len(), 0);
}
// `&name` / `*name` are ANCHOR / REFERENCE; bare `&` / `*` are AMPERSAND /
// ASTERISK.
#[test]
fn test_anchors_and_aliases() {
let input = "key: &anchor_name value";
let tokens = lex(input);
println!("Anchor tokens: {:?}", tokens);
let anchors: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::ANCHOR)
.collect();
assert_eq!(anchors.len(), 1);
assert_eq!(anchors[0].1, "&anchor_name");
let input = "key: *reference_name";
let tokens = lex(input);
println!("Reference tokens: {:?}", tokens);
let references: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::REFERENCE)
.collect();
assert_eq!(references.len(), 1);
assert_eq!(references[0].1, "*reference_name");
let input = "key: & *";
let tokens = lex(input);
let ampersands: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::AMPERSAND)
.collect();
assert_eq!(ampersands.len(), 1);
let asterisks: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::ASTERISK)
.collect();
assert_eq!(asterisks.len(), 1);
}
// `<<` is a MERGE_KEY; a single `<` is not.
#[test]
fn test_merge_key_token() {
let input = "<<: *defaults";
let tokens = lex(input);
let merge_keys: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::MERGE_KEY)
.collect();
assert_eq!(merge_keys.len(), 1);
assert_eq!(merge_keys[0].1, "<<");
let input2 = "key: < value";
let tokens2 = lex(input2);
let merge_keys2: Vec<_> = tokens2
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::MERGE_KEY)
.collect();
assert_eq!(merge_keys2.len(), 0, "Single < should not be a merge key");
}
// `+` after a block scalar indicator is its own PLUS token.
#[test]
fn test_plus_token() {
let input = "key: |+ value";
let tokens = lex(input);
let plus_tokens: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::PLUS)
.collect();
assert_eq!(plus_tokens.len(), 1);
assert_eq!(plus_tokens[0].1, "+");
}
// Block scalar headers split into PIPE/GREATER plus separate indicator
// tokens; a `-` chomping indicator comes out as STRING "-".
#[test]
fn test_block_scalar_indicators() {
let input1 = "key: |+ content";
let tokens1 = lex(input1);
assert!(tokens1
.iter()
.any(|(kind, text)| *kind == SyntaxKind::PIPE && *text == "|"));
assert!(tokens1
.iter()
.any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
let input2 = "key: >- content";
let tokens2 = lex(input2);
assert!(tokens2
.iter()
.any(|(kind, text)| *kind == SyntaxKind::GREATER && *text == ">"));
assert!(tokens2
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-"));
let input3 = "key: |2+ content";
let tokens3 = lex(input3);
assert!(tokens3
.iter()
.any(|(kind, text)| *kind == SyntaxKind::PIPE && *text == "|"));
assert!(tokens3
.iter()
.any(|(kind, text)| *kind == SyntaxKind::INT && *text == "2"));
assert!(tokens3
.iter()
.any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
}
// Stand-alone `-`, `+` and `:` in the middle of a line: the dash is a STRING,
// the others are PLUS and COLON.
#[test]
fn test_special_characters_in_block_content() {
let input = "line with - and + and : characters";
let tokens = lex(input);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::COLON && *text == ":"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "line"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "with"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "and"));
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "characters"));
}
// Token counts for a multi-line input mixing a block header with indicator
// characters in the content.
#[test]
fn test_token_recognition() {
let input = "key: |2+ \n content with - and : and > chars\n more content";
let tokens = lex(input);
println!("Comprehensive tokens:");
for (i, (kind, text)) in tokens.iter().enumerate() {
println!(" {}: {:?} = {:?}", i, kind, text);
}
let count = |k: SyntaxKind| tokens.iter().filter(|(kind, _)| *kind == k).count();
assert_eq!(count(SyntaxKind::COLON), 2);
assert_eq!(count(SyntaxKind::PIPE), 1); assert_eq!(count(SyntaxKind::INT), 1); assert_eq!(count(SyntaxKind::PLUS), 1); assert_eq!(count(SyntaxKind::GREATER), 1); assert_eq!(count(SyntaxKind::NEWLINE), 2); assert_eq!(count(SyntaxKind::INDENT), 2); assert!(count(SyntaxKind::STRING) >= 1, "expected STRING tokens");
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
.count(),
1
);
}
// Runs of dashes: `---` is DOC_START, `--` is two DASH tokens, and a fourth
// dash after `---` is a STRING.
#[test]
fn test_dash_handling() {
let input = "---\nkey: value";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
let input = "---";
let tokens = lex(input);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
let input = "--";
let tokens = lex(input);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[1], (SyntaxKind::DASH, "-"));
let input = "----";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
assert_eq!(tokens[1], (SyntaxKind::STRING, "-"));
}
// Dashes in kebab-case names, UUIDs, CLI flags, negative numbers and ranges.
#[test]
fn test_dash_in_different_scalar_contexts() {
let input = "package-name: my-awesome-package-v2";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "package-name"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[3], (SyntaxKind::STRING, "my-awesome-package-v2"));
let input = "id: 123e4567-e89b-12d3-a456-426614174000";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "id"));
assert_eq!(
tokens[3],
(SyntaxKind::STRING, "123e4567-e89b-12d3-a456-426614174000")
);
let input = "args: --verbose --log-level=debug";
let tokens = lex(input);
assert_eq!(
tokens
.windows(3)
.filter(|w| {
w[0] == (SyntaxKind::DASH, "-")
&& w[1] == (SyntaxKind::DASH, "-")
&& w[2] == (SyntaxKind::STRING, "verbose")
})
.count(),
1
);
let input = "temperature: -40";
let tokens = lex(input);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::INT && *text == "-40")
.count(),
1
);
let input = "range: 1-10";
let tokens = lex(input);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "1-10")
.count(),
1
);
}
// Sequence markers are recognised after indentation and in nested lists.
#[test]
fn test_sequence_markers_with_indentation() {
let input = "- item1\n- item2";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[2], (SyntaxKind::STRING, "item1"));
let input = " - item1\n - item2";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::INDENT, " "));
assert_eq!(tokens[1], (SyntaxKind::DASH, "-"));
let input = "- item1\n - nested1\n - nested2\n- item2";
let tokens = lex(input);
let dash_tokens: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::DASH)
.collect();
assert_eq!(dash_tokens.len(), 4);
let input = "- first-item\n- second-item";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[2], (SyntaxKind::STRING, "first-item"));
assert_eq!(tokens[4], (SyntaxKind::DASH, "-"));
assert_eq!(tokens[6], (SyntaxKind::STRING, "second-item"));
}
// Without a space after the colon the whole text is one scalar; with a space,
// `-value` is a separate STRING.
#[test]
fn test_dash_after_colon() {
let input = "key:-value";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key:-value"));
let input = "key: -value";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[3], (SyntaxKind::STRING, "-value"));
}
// A colon is an indicator only when followed by whitespace; URLs and
// colon-separated values stay a single STRING.
#[test]
fn test_yaml_spec_compliant_colon_handling() {
let input = "http://example.com:8080/path";
let tokens = lex(input);
assert_eq!(tokens.len(), 1);
assert_eq!(
tokens[0],
(SyntaxKind::STRING, "http://example.com:8080/path")
);
let input = "2024:12:31:23:59:59";
let tokens = lex(input);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], (SyntaxKind::STRING, "2024:12:31:23:59:59"));
let input = "key: value";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(tokens[3], (SyntaxKind::STRING, "value"));
let input = "key:value";
let tokens = lex(input);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key:value"));
let input = "a:b:c:d";
let tokens = lex(input);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], (SyntaxKind::STRING, "a:b:c:d"));
}
// Block scalar headers with chomping indicators (`|-`, `|+`, `>-`, `|2-`).
#[test]
fn test_block_scalar_with_chomping() {
let count_kind = |toks: &[(SyntaxKind, &str)], k: SyntaxKind| {
toks.iter().filter(|(kind, _)| *kind == k).count()
};
let input = "text: |-\n content";
let tokens = lex(input);
assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
.count(),
1
);
let input = "text: |+\n content";
let tokens = lex(input);
assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
assert_eq!(count_kind(&tokens, SyntaxKind::PLUS), 1);
let input = "text: >-\n content";
let tokens = lex(input);
assert_eq!(count_kind(&tokens, SyntaxKind::GREATER), 1);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
.count(),
1
);
let input = "text: |2-\n content";
let tokens = lex(input);
assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
let has_2_token = tokens.iter().any(|(kind, text)| {
(*kind == SyntaxKind::STRING || *kind == SyntaxKind::INT) && text.contains("2")
});
assert!(has_2_token, "expected a token containing '2'");
}
// Leading, trailing and repeated dashes inside plain scalars.
#[test]
fn test_dash_edge_cases() {
let input = "value-";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "value-"));
let input = "-value";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "-value"));
let input = "key: a---b";
let tokens = lex(input);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "a---b")
.count(),
1
);
let input = "key: value-\nnext: item";
let tokens = lex(input);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "value-"));
let input = "snake_case-with-dash_mix";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::STRING, "snake_case-with-dash_mix"));
}
// A tab used as indentation yields one TabIndentation error and still an
// INDENT token.
#[test]
fn test_whitespace_validation_tab_indentation() {
let input_with_tabs = "key: value\n\tindented_key: indented_value";
let (tokens, errors) = lex_with_validation(input_with_tabs);
assert_eq!(errors.len(), 1);
assert_eq!(errors[0].category, WhitespaceErrorCategory::TabIndentation);
assert_eq!(
errors[0].message,
"Tab character used for indentation (forbidden in YAML)"
);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::INDENT && text.contains('\t')));
}
// Mixing "\n", "\r\n" and "\r" is reported, and each ending is one NEWLINE
// token.
#[test]
fn test_whitespace_validation_line_endings() {
let input_mixed = "line1\nline2\r\nline3\rline4";
let config = ValidationConfig {
enforce_consistent_line_endings: true,
max_line_length: None,
};
let (tokens, errors) = lex_with_validation_config(input_mixed, &config);
assert!(errors
.iter()
.any(|e| e.category == WhitespaceErrorCategory::MixedLineEndings));
let newlines: Vec<_> = tokens
.iter()
.filter(|(kind, _)| *kind == SyntaxKind::NEWLINE)
.collect();
assert_eq!(newlines.len(), 3); assert_eq!(newlines[0].1, "\n");
assert_eq!(newlines[1].1, "\r\n");
assert_eq!(newlines[2].1, "\r");
}
// A final line longer than the limit is reported even without a trailing
// newline.
#[test]
fn test_whitespace_validation_line_length() {
let long_line = format!("key: {}", "a".repeat(150));
let config = ValidationConfig {
enforce_consistent_line_endings: false,
max_line_length: Some(120),
};
let (_, errors) = lex_with_validation_config(&long_line, &config);
assert_eq!(errors.len(), 1);
assert_eq!(errors[0].category, WhitespaceErrorCategory::LineTooLong);
assert_eq!(errors[0].message, "Line too long (155 > 120 characters)");
}
// With both configurable checks off, tab indentation is still reported.
#[test]
fn test_whitespace_validation_disabled() {
let input_with_issues = "key: value\n\tindented: with_tabs\n";
let config = ValidationConfig {
enforce_consistent_line_endings: false,
max_line_length: None,
};
let (tokens, errors) = lex_with_validation_config(input_with_issues, &config);
assert_eq!(errors.len(), 1);
assert_eq!(errors[0].category, WhitespaceErrorCategory::TabIndentation);
assert!(!tokens.is_empty());
}
// Hyphenated words inside flow collections stay single STRING tokens.
#[test]
fn test_dash_in_flow_collections() {
let input = "[item-one, item-two]";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACKET, "["));
assert_eq!(tokens[1], (SyntaxKind::STRING, "item-one"));
assert_eq!(tokens[2], (SyntaxKind::COMMA, ","));
assert_eq!(tokens[4], (SyntaxKind::STRING, "item-two"));
assert_eq!(tokens[5], (SyntaxKind::RIGHT_BRACKET, "]"));
let input = "{kebab-key: kebab-value}";
let tokens = lex(input);
assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACE, "{"));
assert_eq!(tokens[1], (SyntaxKind::STRING, "kebab-key"));
assert_eq!(tokens[2], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[4], (SyntaxKind::STRING, "kebab-value"));
assert_eq!(tokens[5], (SyntaxKind::RIGHT_BRACE, "}"));
}
// A dash inside a quoted string is part of the STRING token.
#[test]
fn test_dash_with_quotes() {
let input = r#"key: "- not a sequence marker""#;
let tokens = lex(input);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| {
*kind == SyntaxKind::STRING && *text == "\"- not a sequence marker\""
})
.count(),
1
);
let input = r#"key: '- also not a sequence marker'"#;
let tokens = lex(input);
assert_eq!(
tokens
.iter()
.filter(|(kind, text)| {
*kind == SyntaxKind::STRING && *text == "'- also not a sequence marker'"
})
.count(),
1
);
}
// A trailing hyphen before a line break stays in the scalar; an indented
// `- ` on the next line is lexed as INDENT followed by DASH.
#[test]
fn test_dash_in_multiline_values() {
let input = "description: This is a multi-\n line value with dashes";
let tokens = lex(input);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "multi-"));
let input = "text: value\n - but this is not a sequence";
let tokens = lex(input);
let indent_dash: Vec<_> = tokens
.windows(2)
.filter(|w| w[0].0 == SyntaxKind::INDENT && w[1].0 == SyntaxKind::DASH)
.collect();
assert_eq!(indent_dash.len(), 1);
}
// Dates, timestamps and semver strings containing dashes are single STRING
// tokens.
#[test]
fn test_dash_special_yaml_values() {
let input = "date: 2024-01-15";
let tokens = lex(input);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "2024-01-15"));
let input = "timestamp: 2024-01-15T10:30:00-05:00";
let tokens = lex(input);
assert!(tokens.iter().any(
|(kind, text)| *kind == SyntaxKind::STRING && *text == "2024-01-15T10:30:00-05:00"
));
let input = "version: 1.0.0-beta.1";
let tokens = lex(input);
assert!(tokens
.iter()
.any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "1.0.0-beta.1"));
}
// Outside a flow collection, `{` and `}` inside a plain scalar do not split it.
#[test]
fn test_flow_indicators_in_block_scalar() {
let input = "key: unix:///Users/${metadata.username}/path";
let tokens = lex(input);
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
assert_eq!(
tokens[3],
(
SyntaxKind::STRING,
"unix:///Users/${metadata.username}/path"
)
);
}
}