use memchr::{memchr, memchr2};
use std::borrow::Cow;
use crate::error::Error;
use crate::lines::LineBuffer;
use crate::pos::{Pos, Span};
use super::{Lexer, is_blank_or_comment, is_marker};
use crate::chars::{is_c_indicator, is_ns_char};
use crate::lines::pos_after_line;
impl<'input> Lexer<'input> {
pub fn try_consume_plain_scalar(
&mut self,
parent_indent: usize,
) -> Option<(Cow<'input, str>, Span)> {
if let Some(inline) = self.inline_scalar.take() {
return Some(inline);
}
let (leading_spaces, scalar_start_pos, first_value_len) =
peek_plain_scalar_first_line(&self.buf)?;
let Some(consumed_first) = self.buf.consume_next() else {
unreachable!("peek returned Some but consume returned None")
};
self.current_pos = pos_after_line(&consumed_first);
let Some(first_value_ref): Option<&'input str> = consumed_first
.content
.get(leading_spaces..leading_spaces + first_value_len)
else {
unreachable!("scalar slice out of bounds")
};
let after_scalar_start = leading_spaces + first_value_len;
if let Some(suffix) = consumed_first.content.get(after_scalar_start..) {
if let Some(comment_text) = extract_trailing_comment(suffix) {
let hash_byte_in_line = after_scalar_start + suffix.len() - comment_text.len() - 1;
let hash_col_in_line =
crate::pos::column_at(consumed_first.content, hash_byte_in_line);
let hash_pos = Pos {
byte_offset: consumed_first.pos.byte_offset + hash_byte_in_line,
line: consumed_first.pos.line,
column: consumed_first.pos.column + hash_col_in_line,
};
let span_end = crate::pos::advance_within_line(hash_pos.advance('#'), comment_text);
if let Some(bad_i) = memchr(b'\0', comment_text.as_bytes()) {
let bad_char_i = comment_text[..bad_i].chars().count();
let bad_pos = Pos {
byte_offset: hash_pos.byte_offset + 1 + bad_i,
line: hash_pos.line,
column: hash_pos.column + 1 + bad_char_i,
};
self.plain_scalar_suffix_error = Some(Error {
pos: bad_pos,
message: "invalid character U+0000 in comment".to_owned(),
});
} else {
self.trailing_comment = Some((
comment_text,
Span {
start: hash_pos,
end: span_end,
},
));
}
} else if let Some((bad_i, bad_ch)) = suffix
.char_indices()
.find(|(_, c)| matches!(*c, '\0' | '\u{FEFF}'))
{
let bad_col_offset =
crate::pos::column_at(consumed_first.content, after_scalar_start + bad_i);
let bad_pos = Pos {
byte_offset: consumed_first.pos.byte_offset + after_scalar_start + bad_i,
line: consumed_first.pos.line,
column: consumed_first.pos.column + bad_col_offset,
};
self.plain_scalar_suffix_error = Some(Error {
pos: bad_pos,
message: format!("invalid character U+{:04X} in plain scalar", bad_ch as u32),
});
}
}
let extra = if self.trailing_comment.is_some() || self.plain_scalar_suffix_error.is_some() {
None
} else {
self.collect_plain_continuations(first_value_ref, parent_indent, consumed_first.indent)
};
let span_end = self.current_pos;
Some(extra.map_or_else(
|| {
let end_pos = crate::pos::advance_within_line(scalar_start_pos, first_value_ref);
(
Cow::Borrowed(first_value_ref),
Span {
start: scalar_start_pos,
end: end_pos,
},
)
},
|owned| {
(
Cow::Owned(owned),
Span {
start: scalar_start_pos,
end: span_end,
},
)
},
))
}
fn collect_plain_continuations(
&mut self,
first_value_ref: &str,
parent_indent: usize,
scalar_indent: usize,
) -> Option<String> {
let mut pending_blanks: usize = 0;
let mut result: Option<String> = None;
loop {
let Some(next) = self.buf.peek_next() else {
break;
};
let trimmed = next.content.trim_start_matches([' ', '\t']);
if trimmed.is_empty() {
pending_blanks += 1;
let Some(consumed) = self.buf.consume_next() else {
unreachable!("consume blank line failed")
};
self.current_pos = pos_after_line(&consumed);
continue;
}
if is_marker(next.content, b'-') || is_marker(next.content, b'.') {
break;
}
let n0_exception = parent_indent == 0 && scalar_indent == 0;
let tab_exception = parent_indent == 0 && next.content.starts_with('\t');
if next.indent <= parent_indent && !n0_exception && !tab_exception {
break;
}
let cont_value = scan_plain_line_block(trimmed);
if cont_value.is_empty() {
break;
}
let after_cont = trimmed[cont_value.len()..].trim_start_matches([' ', '\t']);
if after_cont.starts_with(": ") || after_cont == ":" {
break;
}
let has_trailing_comment = after_cont.starts_with('#');
let buf = result.get_or_insert_with(|| String::from(first_value_ref));
if pending_blanks > 0 {
buf.extend(std::iter::repeat_n('\n', pending_blanks));
pending_blanks = 0;
} else {
buf.push(' ');
}
buf.push_str(cont_value);
let Some(consumed) = self.buf.consume_next() else {
unreachable!("consume cont line failed")
};
self.current_pos = pos_after_line(&consumed);
if has_trailing_comment {
break;
}
}
result
}
}
pub(super) fn peek_plain_scalar_first_line(buf: &LineBuffer<'_>) -> Option<(usize, Pos, usize)> {
let first = buf.peek_next()?;
if is_blank_or_comment(first) {
return None;
}
let content_trimmed = first.content.trim_start_matches([' ', '\t']);
if content_trimmed.is_empty() {
return None;
}
let first_char = content_trimmed.chars().next()?;
if !ns_plain_first_block(first_char, content_trimmed) {
return None;
}
let first_value = scan_plain_line_block(content_trimmed);
if first_value.is_empty() {
return None;
}
let leading_bytes = first.content.len() - content_trimmed.len();
let leading_chars = crate::pos::column_at(first.content, leading_bytes);
let scalar_start_pos = Pos {
byte_offset: first.offset + leading_bytes,
line: first.pos.line,
column: first.pos.column + leading_chars,
};
Some((leading_bytes, scalar_start_pos, first_value.len()))
}
pub fn ns_plain_first_block(ch: char, rest: &str) -> bool {
if is_c_indicator(ch) {
if matches!(ch, '?' | ':' | '-') {
let after = &rest[ch.len_utf8()..];
if let Some(next) = after.chars().next() {
return ns_plain_safe_block(next);
}
}
return false;
}
is_ns_char(ch)
}
const fn ns_plain_safe_block(ch: char) -> bool {
is_ns_char(ch)
}
fn ns_plain_char_block(prev_was_ws: bool, ch: char, next: Option<char>) -> bool {
if ch == '#' {
return !prev_was_ws;
}
if ch == ':' {
return next.is_some_and(ns_plain_safe_block);
}
ns_plain_safe_block(ch)
}
pub fn scan_plain_line_block(content: &str) -> &str {
let bytes = content.as_bytes();
let len = bytes.len();
let mut pos = 0;
let mut committed_end: usize = 0;
let mut prev_was_ws = false;
while pos < len {
let candidate =
memchr2(b':', b'#', bytes.get(pos..).unwrap_or_default()).map(|off| pos + off);
let end = candidate.unwrap_or(len);
while pos < end {
let Some(&b) = bytes.get(pos) else { break };
if b >= 0x80 {
let Some(ch) = content.get(pos..).and_then(|s| s.chars().next()) else {
break;
};
let ch_len = ch.len_utf8();
let next_ch = content.get(pos + ch_len..).and_then(|s| s.chars().next());
if !ns_plain_char_block(prev_was_ws, ch, next_ch) {
return &content[..committed_end];
}
committed_end = pos + ch_len;
prev_was_ws = false;
pos += ch_len;
} else {
match b {
b' ' | b'\t' => {
prev_was_ws = true;
pos += 1;
}
0x00..=0x1F | 0x7F => {
return &content[..committed_end];
}
_ => {
committed_end = pos + 1;
prev_was_ws = false;
pos += 1;
}
}
}
}
let Some(hit) = candidate else { break };
let Some(&b) = bytes.get(hit) else { break };
if b == b'#' {
if prev_was_ws {
break;
}
} else {
let next_ch = content.get(hit + 1..).and_then(|s| s.chars().next());
if !next_ch.is_some_and(ns_plain_safe_block) {
break;
}
}
committed_end = hit + 1;
prev_was_ws = false;
pos = hit + 1;
}
&content[..committed_end]
}
pub fn scan_plain_line_flow(content: &str) -> &str {
let bytes = content.as_bytes();
let len = bytes.len();
let mut pos = 0;
let mut committed_end: usize = 0;
let mut prev_was_ws = false;
while pos < len {
let candidate =
memchr2(b':', b'#', bytes.get(pos..).unwrap_or_default()).map(|off| pos + off);
let end = candidate.unwrap_or(len);
while pos < end {
let Some(&b) = bytes.get(pos) else { break };
if b >= 0x80 {
let Some(ch) = content.get(pos..).and_then(|s| s.chars().next()) else {
break;
};
let ch_len = ch.len_utf8();
if !ns_plain_safe_block(ch) {
return &content[..committed_end];
}
committed_end = pos + ch_len;
prev_was_ws = false;
pos += ch_len;
} else {
match b {
b' ' | b'\t' => {
prev_was_ws = true;
pos += 1;
}
b',' | b'[' | b']' | b'{' | b'}' | 0x00..=0x1F | 0x7F => {
return &content[..committed_end];
}
_ => {
committed_end = pos + 1;
prev_was_ws = false;
pos += 1;
}
}
}
}
let Some(hit) = candidate else { break };
let Some(&b) = bytes.get(hit) else { break };
if b == b'#' {
if prev_was_ws {
break;
}
} else {
match bytes.get(hit + 1).copied() {
None | Some(b' ' | b'\t' | b',' | b'[' | b']' | b'{' | b'}') => break,
Some(_) => {}
}
}
committed_end = hit + 1;
prev_was_ws = false;
pos = hit + 1;
}
&content[..committed_end]
}
pub fn extract_trailing_comment(suffix: &str) -> Option<&str> {
let bytes = suffix.as_bytes();
let mut search_from = 0;
while let Some(rel) = memchr(b'#', bytes.get(search_from..).unwrap_or_default()) {
let i = search_from + rel;
let preceded_by_ws = i == 0 || matches!(bytes.get(i - 1), Some(b' ' | b'\t'));
if preceded_by_ws {
return Some(&suffix[i + 1..]);
}
search_from = i + 1;
}
None
}
#[cfg(test)]
mod tests {
use rstest::rstest;
use super::*;
use std::borrow::Cow;
fn make_lexer(input: &str) -> super::super::Lexer<'_> {
super::super::Lexer::new(input)
}
#[rstest]
#[case::empty_input_returns_empty("", "")]
#[case::single_ascii_safe_char("a", "a")]
#[case::plain_word_no_terminators("hello", "hello")]
#[case::trailing_whitespace_excluded("abc ", "abc")]
#[case::colon_followed_by_space_terminates("key: rest", "key")]
#[case::colon_at_eol_terminates("key:", "key")]
#[case::colon_followed_by_alnum_is_content("a:b", "a:b")]
#[case::hash_after_space_terminates("foo # comment", "foo")]
#[case::hash_without_preceding_space_is_content("a#b", "a#b")]
#[case::url_with_colon_slash_slash_is_content("http://example.com", "http://example.com")]
fn scan_plain_line_block_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_block(input), expected);
}
#[rstest]
#[case::pure_cjk_no_terminators("日本語", "日本語")]
#[case::mixed_ascii_then_multibyte_no_terminator("hello日本語", "hello日本語")]
#[case::mixed_multibyte_then_ascii_no_terminator("日本語hello", "日本語hello")]
#[case::colon_followed_by_multibyte_is_content("key:値", "key:値")]
#[case::colon_followed_by_two_byte_char_is_content("key:ñ", "key:ñ")]
#[case::colon_followed_by_four_byte_char_is_content("key:😀", "key:😀")]
#[case::hash_after_multibyte_not_whitespace_preceded_is_content("日#本", "日#本")]
#[case::hash_after_space_in_multibyte_context_terminates("日本 #note", "日本")]
fn scan_plain_line_block_multibyte_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_block(input), expected);
}
#[rstest]
#[case::nul_byte_mid_scalar_terminates_before_nul("hello\0world", "hello")]
#[case::nul_byte_at_start_returns_empty("\0abc", "")]
#[case::bom_mid_scalar_terminates_before_bom("hello\u{FEFF}world", "hello")]
#[case::bom_at_start_returns_empty("\u{FEFF}abc", "")]
fn scan_plain_line_block_terminator_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_block(input), expected);
}
#[rstest]
#[case::tab_between_words_included("abc\tdef", "abc\tdef")]
#[case::multiple_spaces_between_words_included("foo bar", "foo bar")]
#[case::trailing_tab_excluded("abc\t", "abc")]
fn scan_plain_line_block_whitespace_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_block(input), expected);
}
#[rstest]
#[case::pure_cjk_returns_full("日本語", "日本語")]
#[case::colon_followed_by_multibyte_is_content("key:値", "key:値")]
#[case::colon_followed_by_flow_indicator_terminates("key:]rest", "key")]
#[case::nul_mid_scalar_terminates("hello\0world", "hello")]
#[case::bom_mid_scalar_terminates("hello\u{FEFF}world", "hello")]
#[case::mixed_ascii_cjk_no_terminator("abc中文", "abc中文")]
fn scan_plain_line_flow_multibyte_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_flow(input), expected);
}
#[rstest]
#[case::block_two_byte_char_correct_slice(scan_plain_line_block as fn(&str) -> &str, "café", "café")]
#[case::block_three_byte_char_correct_slice(scan_plain_line_block as fn(&str) -> &str, "中文abc", "中文abc")]
#[case::block_four_byte_char_correct_slice(scan_plain_line_block as fn(&str) -> &str, "😀abc", "😀abc")]
#[case::block_colon_then_multibyte(scan_plain_line_block as fn(&str) -> &str, "a:b中文", "a:b中文")]
#[case::block_termination_before_multibyte(scan_plain_line_block as fn(&str) -> &str, "foo: 日本語", "foo")]
fn scan_plain_line_slice_no_panic_cases(
#[case] scan_fn: fn(&str) -> &str,
#[case] input: &str,
#[case] expected: &str,
) {
let result = scan_fn(input);
assert_eq!(result, expected);
let _ = result.chars().count(); }
#[test]
fn flow_slice_valid_utf8_after_flow_indicator() {
let result = scan_plain_line_flow("日本語]rest");
assert_eq!(result, "日本語");
assert_eq!(result.chars().count(), 3);
}
#[rstest]
#[case::single_word("hello", "hello")]
#[case::multi_word("hello world", "hello world")]
#[case::with_url("http://x.com", "http://x.com")]
#[case::with_hash_no_preceding_space("a#b", "a#b")]
#[case::terminated_by_colon_space("key: value", "key")]
#[case::terminated_by_hash_with_space("foo # comment", "foo")]
#[case::trailing_whitespace_stripped("foo ", "foo")]
#[case::multiline_folds_single_break_to_space("foo\n bar\n baz", "foo bar baz")]
#[case::multiline_blank_line_folds_to_newline("foo\n\nbar", "foo\nbar")]
#[case::minus_followed_by_safe_char_is_valid("-a", "-a")]
#[case::colon_followed_by_safe_char_is_valid(":a", ":a")]
#[case::forbidden_continuation_stops_at_marker("foo\n---\nbar", "foo")]
#[case::with_multibyte_utf8("中文", "中文")]
#[case::with_backslashes("plain\\value\\with\\backslashes", "plain\\value\\with\\backslashes")]
fn try_consume_plain_scalar_value_cases(#[case] input: &str, #[case] expected: &str) {
let mut lex = make_lexer(input);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, expected);
}
#[rstest]
#[case::cow_borrowed_for_single_line("hello", true)]
#[case::cow_owned_for_multiline("foo\n bar", false)]
fn try_consume_plain_scalar_cow_cases(#[case] input: &str, #[case] expect_borrowed: bool) {
let mut lex = make_lexer(input);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
if expect_borrowed {
assert!(matches!(val, Cow::Borrowed(_)), "expected Cow::Borrowed");
} else {
assert!(matches!(val, Cow::Owned(_)), "expected Cow::Owned");
}
}
#[rstest]
#[case::empty_input_returns_none("")]
#[case::blank_line_returns_none(" ")]
#[case::comment_line_returns_none("# comment")]
#[case::dash_space_returns_none("- ")]
#[case::question_space_returns_none("? ")]
#[case::colon_space_returns_none(": ")]
fn try_consume_plain_scalar_returns_none_cases(#[case] input: &str) {
let mut lex = make_lexer(input);
assert!(lex.try_consume_plain_scalar(0).is_none());
}
#[test]
fn plain_scalar_indicator_chars_return_none() {
for indicator in &[
"[", "{", "&", "!", "*", ":", "?", "-", "|", ">", "'", "\"", "#", "%", ",", "]", "}",
] {
let mut lex = make_lexer(indicator);
let result = lex.try_consume_plain_scalar(0);
assert!(
result.is_none(),
"indicator '{indicator}' should not start a plain scalar"
);
}
}
#[rstest]
#[case::ascii_word_span("hello", 0, Some("hello"), 0, 5)]
#[case::indented_start_span(" hello", 0, Some("hello"), 2, 7)]
#[case::multibyte_utf8_span("中文", 0, Some("中文"), 0, 6)]
#[case::multibyte_with_leading_space_span(" 中", 0, Some("中"), 2, 5)]
fn try_consume_plain_scalar_span_cases(
#[case] input: &str,
#[case] parent_indent: usize,
#[case] expected_val: Option<&str>,
#[case] expected_start: usize,
#[case] expected_end: usize,
) {
let mut lex = make_lexer(input);
let (val, span) = lex
.try_consume_plain_scalar(parent_indent)
.unwrap_or_else(|| unreachable!("should parse"));
if let Some(ev) = expected_val {
assert_eq!(val, ev);
}
assert_eq!(span.start.byte_offset, expected_start);
assert_eq!(span.end.byte_offset, expected_end);
}
#[test]
fn plain_scalar_dedented_continuation_stops() {
let mut lex = make_lexer(" foo\nbar");
let (val, _) = lex
.try_consume_plain_scalar(2)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, "foo");
}
#[rstest]
#[case::colon_tab_terminates("key:\tvalue", "key")]
#[case::colon_eof_terminates("key:", "key")]
fn try_consume_plain_scalar_colon_edge_cases(#[case] input: &str, #[case] expected: &str) {
let mut lex = make_lexer(input);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, expected);
}
#[test]
fn plain_scalar_hash_preceded_by_tab_terminates() {
let mut lex = make_lexer("foo\t# comment");
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, "foo");
}
#[rstest]
#[case::two_blank_lines_fold_to_two_newlines("foo\n\n\nbar", "foo\n\nbar")]
#[case::continuation_trailing_space_stripped("foo\nbar \nbaz", "foo bar baz")]
fn try_consume_plain_scalar_multiline_folding_cases(
#[case] input: &str,
#[case] expected: &str,
) {
let mut lex = make_lexer(input);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, expected);
}
#[test]
fn plain_scalar_multiline_two_blank_lines_fold_to_two_newlines_indented() {
let mut lex = make_lexer("foo\n\n\n bar");
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert!(matches!(val, Cow::Owned(_)), "multi-line must be Owned");
assert_eq!(val, "foo\n\nbar");
}
#[rstest]
#[case::dots_terminated_by_document_end_marker("foo\n...\nbar", "foo")]
#[case::dash_dash_dash_word_attached_is_not_forbidden("foo\n---word", "foo ---word")]
fn try_consume_plain_scalar_forbidden_marker_cases(
#[case] input: &str,
#[case] expected: &str,
) {
let mut lex = make_lexer(input);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse"));
assert_eq!(val, expected);
}
#[test]
fn plain_scalar_inline_after_marker_is_extracted() {
let mut lex = make_lexer("--- text");
lex.consume_marker_line(false);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse inline scalar"));
assert_eq!(val, "text");
}
#[test]
fn plain_scalar_inline_after_marker_is_cow_borrowed() {
let mut lex = make_lexer("--- text");
lex.consume_marker_line(false);
let (val, _) = lex
.try_consume_plain_scalar(0)
.unwrap_or_else(|| unreachable!("should parse inline scalar"));
assert!(
matches!(val, Cow::Borrowed(_)),
"inline scalar from marker line must be Cow::Borrowed"
);
}
#[rstest]
#[case::terminates_at_close_bracket("abc]rest", "abc")]
#[case::terminates_at_close_brace("abc}rest", "abc")]
#[case::terminates_at_comma("abc,rest", "abc")]
#[case::terminates_at_open_bracket("abc[rest", "abc")]
#[case::terminates_at_open_brace("abc{rest", "abc")]
#[case::returns_full_when_no_terminator("hello", "hello")]
#[case::empty_input_returns_empty("", "")]
#[case::hash_after_space_starts_comment("abc # comment", "abc")]
#[case::hash_without_preceding_space_is_content("abc#def", "abc#def")]
#[case::colon_space_terminates("key: rest", "key")]
#[case::colon_flow_indicator_terminates("key:}rest", "key")]
#[case::colon_at_eol_terminates("key:", "key")]
#[case::colon_followed_by_alnum_is_content("a:b", "a:b")]
#[case::trailing_whitespace_excluded("abc ", "abc")]
fn scan_plain_line_flow_cases(#[case] input: &str, #[case] expected: &str) {
assert_eq!(scan_plain_line_flow(input), expected);
}
}