use std::borrow::Cow;
use crate::error::Error;
use crate::lines::{Line, LineBuffer, pos_after_line};
use crate::pos::{Pos, Span};
mod block;
mod comment;
pub mod plain;
mod quoted;
pub use crate::chars::is_ns_char;
pub use plain::scan_plain_line_flow;
use block::parse_block_header;
use plain::scan_plain_line_block;
/// Line-oriented lexer state over a borrowed input string.
///
/// The lexer walks the input one line at a time through a [`LineBuffer`] and
/// records side information (inline scalars, trailing comments, deferred
/// errors) in the optional fields below for callers to pick up.
pub struct Lexer<'input> {
/// Source of lines; every peek/consume in this type goes through it.
pub(super) buf: LineBuffer<'input>,
/// Position just past the most recently consumed line (`pos_after_line`);
/// starts at `Pos::ORIGIN`.
pub(super) current_pos: Pos,
/// Scalar text found after a marker on the marker's own line, with its span.
/// Set by `consume_marker_line`, cleared by `drain_inline_scalar`.
pub(super) inline_scalar: Option<(Cow<'input, str>, Span)>,
/// Comment text (without the leading `#`) and the span starting at the `#`.
/// Set by `consume_marker_line` when a marker is followed by a comment.
pub trailing_comment: Option<(&'input str, Span)>,
/// NOTE(review): not read or written in the code visible here; presumably
/// maintained by the scalar scanners in the submodules — confirm.
pub pending_multiline_tail: Option<(&'input str, Pos)>,
/// NOTE(review): not read or written in the code visible here; presumably
/// set by the plain-scalar scanner — confirm.
pub plain_scalar_suffix_error: Option<Error>,
/// Error recorded by `consume_marker_line` when the content following a
/// marker on the same line is not acceptable.
pub marker_inline_error: Option<Error>,
}
impl<'input> Lexer<'input> {
/// Creates a lexer over `input`.
///
/// The position starts at `Pos::ORIGIN` and every optional side-channel field
/// (inline scalar, trailing comment, multiline tail, deferred errors) starts
/// out as `None`.
#[must_use]
pub fn new(input: &'input str) -> Self {
Self {
buf: LineBuffer::new(input),
current_pos: Pos::ORIGIN,
inline_scalar: None,
trailing_comment: None,
pending_multiline_tail: None,
plain_scalar_suffix_error: None,
marker_inline_error: None,
}
}
/// Consumes every leading line that contains only spaces and tabs (or
/// nothing at all), stopping at the first line with any other content.
///
/// Comment lines are *not* skipped; they are left in the buffer for the
/// caller. Returns the lexer position after the last consumed line, or the
/// unchanged current position when nothing was skipped.
pub fn skip_empty_lines(&mut self) -> Pos {
    while self
        .buf
        .peek_next()
        .is_some_and(|next| is_blank_not_comment(next))
    {
        if let Some(blank) = self.buf.consume_next() {
            self.current_pos = pos_after_line(&blank);
        }
    }
    self.current_pos
}
/// Consumes whitespace-only (or empty) lines until a line with other content
/// — including a comment line — or the end of input is reached.
///
/// Returns the lexer position after the last consumed line; if no line was
/// blank the current position is returned unchanged.
pub fn skip_blank_lines_between_docs(&mut self) -> Pos {
    loop {
        // Stop as soon as the next line is missing or carries real content.
        match self.buf.peek_next() {
            Some(next) if is_blank_not_comment(next) => {}
            _ => break self.current_pos,
        }
        if let Some(blank) = self.buf.consume_next() {
            self.current_pos = pos_after_line(&blank);
        }
    }
}
/// Reports whether the next line begins with `%`.
///
/// The check is made on the very first character of the line content; leading
/// whitespace is not skipped. Returns `false` when no line is available.
#[must_use]
pub fn is_directive_line(&self) -> bool {
    matches!(self.buf.peek_next(), Some(next) if next.content.starts_with('%'))
}
/// Consumes the next line if, and only if, it begins with `%`.
///
/// On success returns the full line content (including the `%`) together with
/// the position of the line's start, and advances the lexer past the line.
/// Returns `None` without consuming anything when there is no next line or it
/// does not start with `%`.
pub fn try_consume_directive_line(&mut self) -> Option<(&'input str, Pos)> {
    // Copy out what we need from the peeked line so the borrow ends before
    // the buffer is mutated.
    let (text, start) = match self.buf.peek_next() {
        Some(next) if next.content.starts_with('%') => (next.content, next.pos),
        _ => return None,
    };
    match self.buf.consume_next() {
        Some(taken) => self.current_pos = pos_after_line(&taken),
        None => unreachable!(
            "try_consume_directive_line: peek returned Some but consume returned None"
        ),
    }
    Some((text, start))
}
/// Reports whether the next line, after any leading spaces and tabs, starts
/// with `#`. Returns `false` when no line is available.
#[must_use]
pub fn is_comment_line(&self) -> bool {
    let Some(next) = self.buf.peek_next() else {
        return false;
    };
    next.content
        .trim_start_matches([' ', '\t'])
        .starts_with('#')
}
/// Reports whether the next line is a `---` marker line as defined by
/// [`is_marker`]. Returns `false` when no line is available.
#[must_use]
pub fn is_directives_end(&self) -> bool {
    match self.buf.peek_next() {
        Some(next) => is_marker(next.content, b'-'),
        None => false,
    }
}
/// Reports whether the next line is a `...` marker line as defined by
/// [`is_marker`]. Returns `false` when no line is available.
#[must_use]
pub fn is_document_end(&self) -> bool {
    matches!(self.buf.peek_next(), Some(next) if is_marker(next.content, b'.'))
}
/// Reports whether the next line holds something other than whitespace or a
/// comment. Only the immediately next line is inspected; returns `false`
/// when no line is available.
#[must_use]
pub fn has_content(&self) -> bool {
    let Some(next) = self.buf.peek_next() else {
        return false;
    };
    !is_blank_or_comment(next)
}
/// Consumes the next line, which the caller is expected to have identified as
/// a `---` / `...` marker line, and classifies whatever follows the marker on
/// that same line.
///
/// Depending on the text after the marker, at most one of these is recorded:
/// - nothing but spaces/tabs: no field is touched;
/// - text starting with `#`: stored in `trailing_comment` (without the `#`);
/// - any other text with `reject_all_inline == true`: `marker_inline_error`
///   is set with the document-end message;
/// - otherwise the text is scanned as a plain scalar and stored in
///   `inline_scalar`, or `marker_inline_error` is set if that fails.
///
/// Returns `(marker_pos, after)`: the start position of the consumed line and
/// the position just past it. `current_pos` is updated to `after`.
///
/// # Panics
///
/// Panics (via `unreachable!`) if there is no line left to consume.
pub fn consume_marker_line(&mut self, reject_all_inline: bool) -> (Pos, Pos) {
let Some(line) = self.buf.consume_next() else {
unreachable!("consume_marker_line called at EOF")
};
let marker_pos = line.pos;
let after = pos_after_line(&line);
self.current_pos = after;
// Skip the 3-byte marker plus one separator byte, then any further
// spaces/tabs. NOTE(review): slicing at 4 relies on byte 3 being the single
// space/tab that `is_marker` requires — assumes callers checked the marker.
let inline = line
.content
.get(4..)
.unwrap_or("")
.trim_start_matches([' ', '\t']);
if !inline.is_empty() {
// Everything trimmed off the front (marker + separators), measured in
// bytes and in columns, locates where the inline text begins.
let prefix_bytes = line.content.len() - inline.len();
let prefix_chars = crate::pos::column_at(line.content, prefix_bytes);
let inline_start = Pos {
byte_offset: marker_pos.byte_offset + prefix_bytes,
line: marker_pos.line,
column: marker_pos.column + prefix_chars,
};
if let Some(comment_text) = inline.strip_prefix('#') {
// Comment after the marker: span starts at the `#`, text excludes it.
let comment_end =
crate::pos::advance_within_line(inline_start.advance('#'), comment_text);
self.trailing_comment = Some((
comment_text,
Span {
start: inline_start,
end: comment_end,
},
));
} else if reject_all_inline {
// Caller allows nothing but a comment after this marker.
self.marker_inline_error = Some(Error {
pos: inline_start,
message: "invalid content after document-end marker '...'".into(),
});
} else {
// A leading `|` or `>` is checked as a block-scalar header first; only
// the error component of `parse_block_header`'s result is used here.
if let Some(after_pipe) = inline
.strip_prefix('|')
.or_else(|| inline.strip_prefix('>'))
{
let (_, _, header_err) = parse_block_header(after_pipe, inline_start);
if let Some(e) = header_err {
self.marker_inline_error = Some(e);
return (marker_pos, after);
}
}
let scanned = scan_plain_line_block(inline);
if scanned.is_empty() {
// Nothing could be scanned as a plain scalar.
self.marker_inline_error = Some(Error {
pos: inline_start,
message: "invalid content after document-start marker '---'".into(),
});
} else {
// Whatever follows the scalar must be blank or a comment; such a
// comment is accepted but not stored in `trailing_comment` here.
let residual = inline[scanned.len()..].trim_start_matches([' ', '\t']);
if !residual.is_empty() && !residual.starts_with('#') {
self.marker_inline_error = Some(Error {
pos: inline_start,
message: "invalid content after document-start marker '---'".into(),
});
} else {
let inline_end = crate::pos::advance_within_line(inline_start, scanned);
self.inline_scalar = Some((
Cow::Borrowed(scanned),
Span {
start: inline_start,
end: inline_end,
},
));
}
}
}
}
(marker_pos, after)
}
/// Returns a reference to the next line without consuming it, or `None` when
/// the buffer has no next line.
#[must_use]
pub fn peek_next_line(&self) -> Option<&Line<'input>> {
self.buf.peek_next()
}
/// Returns the result of `LineBuffer::peek_second` — by value, unlike
/// [`Lexer::peek_next_line`] — without consuming anything.
/// NOTE(review): presumably the line after the next one; confirm against
/// `LineBuffer`.
#[must_use]
pub fn peek_second_line(&self) -> Option<Line<'input>> {
self.buf.peek_second()
}
/// Hands `line` to `LineBuffer::prepend_line`.
/// NOTE(review): presumably the line then becomes the next one returned by
/// peek/consume — confirm against `LineBuffer`.
pub fn prepend_inline_line(&mut self, line: Line<'input>) {
self.buf.prepend_line(line);
}
/// Consumes the next line, if any, and moves the current position to just
/// past it. Does nothing when the buffer yields no line.
pub fn consume_line(&mut self) {
    let Some(taken) = self.buf.consume_next() else {
        return;
    };
    self.current_pos = pos_after_line(&taken);
}
/// Reports whether the underlying line buffer is at end of input
/// (delegates to `LineBuffer::at_eof`).
#[must_use]
pub fn at_eof(&self) -> bool {
self.buf.at_eof()
}
/// Reports whether an inline scalar is currently stored on the lexer.
#[must_use]
pub const fn has_inline_scalar(&self) -> bool {
self.inline_scalar.is_some()
}
/// Returns the stored inline scalar's text and the start position of its
/// span, leaving it in place. Returns `None` when no inline scalar is stored.
#[must_use]
pub fn peek_inline_scalar(&self) -> Option<(&str, Pos)> {
    let (value, span) = self.inline_scalar.as_ref()?;
    Some((value.as_ref(), span.start))
}
/// Discards any stored inline scalar; the value is dropped, not returned.
pub fn drain_inline_scalar(&mut self) {
self.inline_scalar = None;
}
/// Test-only hook: stores `value` and `span` as the inline scalar,
/// replacing whatever was stored before.
#[cfg(test)]
pub fn set_inline_scalar_for_test(&mut self, value: Cow<'input, str>, span: crate::pos::Span) {
self.inline_scalar = Some((value, span));
}
/// Returns the result of `LineBuffer::is_next_synthetic`.
/// NOTE(review): presumably true for lines inserted through
/// `prepend_inline_line` rather than read from the input — confirm.
#[must_use]
pub fn is_next_line_synthetic(&self) -> bool {
self.buf.is_next_synthetic()
}
/// Returns the lexer's current position: the position just past the most
/// recently consumed line, or `Pos::ORIGIN` if nothing has been consumed.
#[must_use]
pub const fn current_pos(&self) -> Pos {
self.current_pos
}
/// Consumes every remaining line, updating the current position after each
/// one, and returns the final position. When no lines remain the current
/// position is returned unchanged.
pub fn drain_to_end(&mut self) -> Pos {
    loop {
        match self.buf.consume_next() {
            Some(taken) => self.current_pos = pos_after_line(&taken),
            None => break self.current_pos,
        }
    }
}
}
/// Returns `true` when the line consists solely of spaces and tabs (an empty
/// line included). A line holding a comment is therefore not blank.
fn is_blank_not_comment(line: &Line<'_>) -> bool {
    line.content.chars().all(|c| matches!(c, ' ' | '\t'))
}
/// Returns `true` when the line is blank (only spaces/tabs, or empty) or when
/// its first character after leading spaces/tabs is `#`.
pub fn is_blank_or_comment(line: &Line<'_>) -> bool {
    // First character that is not indentation, if the line has one.
    let first_visible = line.content.chars().find(|c| !matches!(c, ' ' | '\t'));
    matches!(first_visible, None | Some('#'))
}
/// Returns `true` when `content` starts with exactly three `ch` bytes that
/// are followed either by nothing or by a space or tab.
///
/// The check is byte-based and anchored at the start of `content`: leading
/// whitespace, a fourth `ch`, or any other character directly after the
/// three bytes makes it return `false`.
pub fn is_marker(content: &str, ch: u8) -> bool {
    match content.as_bytes() {
        [first, second, third, rest @ ..] if [*first, *second, *third] == [ch; 3] => {
            matches!(rest, [] | [b' ' | b'\t', ..])
        }
        _ => false,
    }
}
pub fn is_doc_marker_line(content: &str) -> bool {
is_marker(content, b'-') || is_marker(content, b'.')
}
// Unit tests for marker detection, blank-line skipping, marker-line
// consumption, content detection, and draining.
#[cfg(test)]
mod tests {
use rstest::rstest;
use super::*;
// Test-only predicate: blank, comment, or (after leading spaces/tabs) a line
// starting with `%`.
fn is_directive_or_blank_or_comment(line: &Line<'_>) -> bool {
if is_blank_or_comment(line) {
return true;
}
let trimmed = line.content.trim_start_matches([' ', '\t']);
trimmed.starts_with('%')
}
// Shorthand constructor used by every test below.
fn make_lexer(input: &str) -> Lexer<'_> {
Lexer::new(input)
}
// `---` alone or followed by a space/tab is a directives-end marker.
#[rstest]
#[case::exact_three_dashes("---")]
#[case::followed_by_space("--- ")]
#[case::followed_by_tab("---\t")]
fn directives_end_true(#[case] input: &str) {
assert!(make_lexer(input).is_directives_end());
}
// Attached text, too few dashes, or an empty input are not markers.
#[rstest]
#[case::word_attached("---word")]
#[case::partial_dashes("--")]
#[case::empty_line("")]
fn directives_end_false(#[case] input: &str) {
assert!(!make_lexer(input).is_directives_end());
}
// `...` alone or followed by a space is a document-end marker.
#[rstest]
#[case::exact_three_dots("...")]
#[case::followed_by_space("... ")]
fn document_end_true(#[case] input: &str) {
assert!(make_lexer(input).is_document_end());
}
#[rstest]
#[case::word_attached("...word")]
#[case::partial_dots("..")]
fn document_end_false(#[case] input: &str) {
assert!(!make_lexer(input).is_document_end());
}
#[test]
fn skip_empty_lines_advances_past_blank_line() {
let mut lex = make_lexer("\n---");
lex.skip_empty_lines();
assert!(lex.is_directives_end());
}
// Two one-byte blank lines are consumed, so the returned offset is 2.
#[test]
fn skip_empty_lines_returns_pos_after_consumed_lines() {
let mut lex = make_lexer("\n\n---");
let pos = lex.skip_empty_lines();
assert_eq!(pos.byte_offset, 2);
}
// Comment lines are not blank: skipping must leave them in place.
#[test]
fn skip_empty_lines_stops_at_comment_lines() {
let mut lex = make_lexer("# comment\n---");
lex.skip_empty_lines();
assert!(lex.is_comment_line(), "expected to stop at comment line");
assert!(!lex.is_directives_end());
}
#[test]
fn skip_empty_lines_on_empty_input_returns_origin_pos() {
let mut lex = make_lexer("");
let pos = lex.skip_empty_lines();
assert_eq!(pos, Pos::ORIGIN);
}
#[test]
fn skip_empty_lines_leaves_content_line_untouched() {
let mut lex = make_lexer("content");
lex.skip_empty_lines();
assert!(lex.has_content());
}
// The marker starts at offset 0; the position after `---\n` is offset 4.
#[test]
fn consume_marker_line_returns_marker_pos_and_after_pos() {
let mut lex = make_lexer("---\n");
let (marker_pos, after_pos) = lex.consume_marker_line(false);
assert_eq!(marker_pos.byte_offset, 0);
assert_eq!(after_pos.byte_offset, 4);
}
#[test]
fn consume_marker_line_advances_lexer_past_line() {
let mut lex = make_lexer("---\nnext");
lex.consume_marker_line(false);
assert!(lex.buf.peek_next().is_some_and(|l| l.content == "next"));
}
// `has_content` only inspects the next line, which here is absent or blank.
#[rstest]
#[case::empty_input("")]
#[case::blank_and_comment_lines("\n# comment\n \n")]
fn has_content_false(#[case] input: &str) {
assert!(!make_lexer(input).has_content());
}
#[test]
fn has_content_true_when_non_blank_line_present() {
let lex = make_lexer("foo");
assert!(lex.has_content());
}
#[test]
fn drain_to_end_returns_pos_after_last_byte() {
let mut lex = make_lexer("abc\n");
let pos = lex.drain_to_end();
assert_eq!(pos.byte_offset, 4);
}
// A `%` line counts as content for `is_blank_or_comment`...
#[test]
fn is_blank_or_comment_does_not_skip_directive_lines() {
let Some(line) = LineBuffer::new("%foo: bar").consume_next() else {
unreachable!("LineBuffer produced no line for non-empty input")
};
assert!(!is_blank_or_comment(&line));
}
// ...but is accepted by the test-only directive-aware predicate.
#[test]
fn is_directive_or_blank_or_comment_skips_directive_lines() {
let Some(line) = LineBuffer::new("%YAML 1.2").consume_next() else {
unreachable!("LineBuffer produced no line for non-empty input")
};
assert!(is_directive_or_blank_or_comment(&line));
}
}