use memchr::memchr;
use crate::chars::{find_non_c_printable, non_printable_error_message};
use crate::error::Error;
use crate::pos::{Pos, Span};
use super::Lexer;
use crate::lines::pos_after_line;
impl<'input> Lexer<'input> {
    /// Tries to consume the next buffered line as a whole-line comment.
    ///
    /// The next line is treated as a comment when the first character after
    /// any leading spaces and tabs is `#`. In that case the line is consumed,
    /// `current_pos` is set to the position after that line, and the comment
    /// body (everything after the `#`, borrowed from the input) is returned
    /// together with a span that starts at the `#` and ends after the last
    /// character of the body.
    ///
    /// Returns `Ok(None)` without consuming anything when there is no next
    /// line or the next line is not a comment line.
    ///
    /// # Errors
    ///
    /// * A syntax error positioned at the `#` when the body is longer than
    ///   `max_comment_len` bytes.
    /// * An invalid-character error positioned at the offending character
    ///   when `input_all_printable` is `false` and `find_non_c_printable`
    ///   reports a character inside the body.
    ///
    /// The line is left unconsumed whenever an error is returned.
    pub fn try_consume_comment(
        &mut self,
        max_comment_len: usize,
    ) -> Result<Option<(&'input str, Span)>, Error> {
        let line = match self.buf.peek_next() {
            Some(next) => next,
            None => return Ok(None),
        };

        // Only spaces and tabs may precede the `#` of a whole-line comment.
        let after_indent = line.content.trim_start_matches([' ', '\t']);
        if !after_indent.starts_with('#') {
            return Ok(None);
        }

        // The guard above guarantees that the first `#` on the line is the
        // comment marker, so a plain byte search locates it.
        let marker_offset = memchr(b'#', line.content.as_bytes()).unwrap_or(0);
        let hash_pos = Pos {
            byte_offset: line.pos.byte_offset + marker_offset,
            line: line.pos.line,
            column: line.pos.column + crate::pos::column_at(line.content, marker_offset),
        };

        // The body is everything after the one-byte `#` marker.
        let body: &'input str = &line.content[marker_offset + 1..];
        if body.len() > max_comment_len {
            return Err(Error::syntax(
                hash_pos,
                format!("comment exceeds maximum allowed length ({max_comment_len} bytes)"),
            ));
        }

        // The body scan is skipped when `input_all_printable` is set.
        let offender = if self.input_all_printable {
            None
        } else {
            find_non_c_printable(body.as_bytes())
        };
        if let Some((bad_i, bad_ch)) = offender {
            // `bad_i` is a byte index into the body; columns count characters.
            let chars_before = body[..bad_i].chars().count();
            return Err(Error::invalid_character(
                Pos {
                    byte_offset: hash_pos.byte_offset + 1 + bad_i,
                    line: hash_pos.line,
                    column: hash_pos.column + 1 + chars_before,
                },
                non_printable_error_message(bad_ch, "comment"),
            ));
        }

        let span_end = crate::pos::advance_within_line(hash_pos.advance('#'), body);
        let span = Span::from_pos(hash_pos, span_end);

        match self.buf.consume_next() {
            Some(consumed) => {
                self.current_pos = pos_after_line(&consumed);
                Ok(Some((body, span)))
            }
            None => {
                unreachable!("try_consume_comment: peek returned Some but consume returned None")
            }
        }
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for `Lexer::try_consume_comment`.

    use rstest::rstest;

    use super::*;
    use crate::error::ErrorKind;

    /// Builds a lexer over `input`.
    fn make_lexer(input: &str) -> Lexer<'_> {
        Lexer::new(input)
    }

    // ---- Lines that are not comments -------------------------------------

    /// Inputs whose first line is absent or is not a comment yield `Ok(None)`.
    #[rstest]
    #[case::returns_none_at_eof("")]
    #[case::returns_none_for_blank_line("\n")]
    #[case::returns_none_for_whitespace_only_line(" \n")]
    #[case::returns_none_for_content_line("key: value\n")]
    #[case::returns_none_for_directive_line("%YAML 1.2\n")]
    fn returns_none(#[case] input: &str) {
        let mut lex = make_lexer(input);
        assert_eq!(lex.try_consume_comment(1024), Ok(None));
    }

    // ---- Returned text ---------------------------------------------------

    /// The returned text is everything after the `#`, unmodified.
    #[rstest]
    #[case::plain_comment_returns_text_after_hash("# hello\n", " hello")]
    #[case::indented_comment_returns_text_after_hash(" # indented\n", " indented")]
    #[case::tab_indented_comment_returns_text("\t# tabbed\n", " tabbed")]
    #[case::empty_comment_body_returns_empty_text("#\n", "")]
    #[case::comment_with_hash_in_body_preserves_inner_hash("# foo # bar\n", " foo # bar")]
    #[case::unicode_body_text_is_slice_of_input("# 日本語\n", " 日本語")]
    fn happy_path_text(#[case] input: &str, #[case] expected: &str) {
        let mut lex = make_lexer(input);
        let Ok(Some((text, _))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        assert_eq!(text, expected);
    }

    // ---- Span ------------------------------------------------------------

    /// The span starts at the `#`.
    ///
    /// The indented case uses exactly three leading spaces so that the `#`
    /// sits at byte offset 3 / column 3, matching the expected values.
    #[rstest]
    #[case::span_start_byte_offset_at_hash("# comment\n", 0, 0, 1)]
    #[case::span_start_column_at_hash_after_leading_spaces("   # comment\n", 3, 3, 1)]
    fn span_start(
        #[case] input: &str,
        #[case] expected_byte_offset: u32,
        #[case] expected_column: u32,
        #[case] expected_line: u32,
    ) {
        let mut lex = make_lexer(input);
        let Ok(Some((_, span))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        let idx = crate::pos::LineIndex::new(input);
        assert_eq!(span.start, expected_byte_offset);
        let (line, col) = idx.line_column(span.start);
        assert_eq!(col, expected_column);
        assert_eq!(line, expected_line);
    }

    /// The span ends just past the last character of the body; the byte
    /// offset counts bytes while the column counts characters.
    #[rstest]
    #[case::span_end_byte_offset_past_last_char("# abc\n", 5, 5)]
    #[case::span_end_byte_offset_for_multibyte_body("# 日\n", 5, 3)]
    fn span_end(
        #[case] input: &str,
        #[case] expected_byte_offset: u32,
        #[case] expected_column: u32,
    ) {
        let mut lex = make_lexer(input);
        let Ok(Some((_, span))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        let idx = crate::pos::LineIndex::new(input);
        assert_eq!(span.end, expected_byte_offset);
        let (_, col) = idx.line_column(span.end);
        assert_eq!(col, expected_column);
    }

    /// A bare `#` produces a one-byte span covering only the marker.
    #[test]
    fn span_for_empty_comment_body() {
        let mut lex = make_lexer("#\n");
        let Ok(Some((_, span))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        assert_eq!(span.start, 0);
        assert_eq!(span.end, 1);
    }

    // ---- Consumption and lexer state ------------------------------------

    /// After a comment is consumed the lexer position is past its line break.
    #[test]
    fn lexer_position_advances_past_consumed_comment_line() {
        let mut lex = make_lexer("# c\nnext\n");
        let Ok(_) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok")
        };
        assert_eq!(lex.current_pos().byte_offset, 4);
    }

    /// The line following a consumed comment becomes the next line.
    #[test]
    fn next_line_is_available_after_comment_consumed() {
        let mut lex = make_lexer("# comment\nnext\n");
        let Ok(_) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok")
        };
        assert_eq!(lex.peek_next_line().map(|l| l.content), Some("next"));
    }

    /// A non-comment line is left in place when `Ok(None)` is returned.
    #[test]
    fn comment_not_consumed_on_none_return() {
        let mut lex = make_lexer("content\n# comment\n");
        let Ok(result) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok")
        };
        assert_eq!(result, None);
        assert_eq!(lex.peek_next_line().map(|l| l.content), Some("content"));
    }

    // ---- Length limit ----------------------------------------------------

    /// Bodies at or below the limit (measured in bytes after the `#`) pass.
    #[rstest]
    #[case::comment_within_limit_returns_ok("# ab\n", 3, " ab")]
    #[case::comment_exactly_at_limit_returns_ok("# abc\n", 4, " abc")]
    fn comment_len_ok(#[case] input: &str, #[case] limit: usize, #[case] expected_text: &str) {
        let mut lex = make_lexer(input);
        let Ok(Some((text, _))) = lex.try_consume_comment(limit) else {
            unreachable!("expected Ok(Some(...))")
        };
        assert_eq!(text, expected_text);
    }

    /// A body longer than the limit is rejected with a descriptive message.
    #[test]
    fn comment_exceeding_limit_returns_err() {
        let mut lex = make_lexer("# abc\n");
        let Err(err) = lex.try_consume_comment(3) else {
            unreachable!("expected Err")
        };
        assert!(err.message.contains("comment exceeds maximum allowed length"));
    }

    /// The length error is reported at the `#`.
    ///
    /// Two leading spaces put the `#` at byte offset 2 / column 2, matching
    /// the assertions below.
    #[test]
    fn error_pos_points_to_hash() {
        let mut lex = make_lexer("  # toolong\n");
        let Err(err) = lex.try_consume_comment(0) else {
            unreachable!("expected Err")
        };
        assert_eq!(err.pos.byte_offset, 2);
        assert_eq!(err.pos.column, 2);
    }

    // ---- Non-printable characters: rejected ------------------------------

    #[test]
    fn comment_rejects_nul_in_body() {
        let mut lex = make_lexer("# hello\x00world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for NUL, got: {:?}",
            err.kind
        );
    }

    #[test]
    fn comment_rejects_0x01_in_body() {
        let mut lex = make_lexer("# hello\x01world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for SOH, got: {:?}",
            err.kind
        );
    }

    #[test]
    fn comment_rejects_del_0x7f_in_body() {
        let mut lex = make_lexer("# hello\x7fworld\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for DEL, got: {:?}",
            err.kind
        );
    }

    #[test]
    fn comment_rejects_c1_control_0x80_in_body() {
        let mut lex = make_lexer("# hello\u{0080}world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for U+0080, got: {:?}",
            err.kind
        );
    }

    #[test]
    fn comment_rejects_0xfffe_in_body() {
        let mut lex = make_lexer("# hello\u{FFFE}world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for U+FFFE, got: {:?}",
            err.kind
        );
    }

    #[test]
    fn comment_rejects_0xffff_in_body() {
        let mut lex = make_lexer("# hello\u{FFFF}world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for U+FFFF, got: {:?}",
            err.kind
        );
    }

    /// The check also covers the character immediately after the `#`.
    #[test]
    fn comment_non_printable_as_first_body_char() {
        let mut lex = make_lexer("#\x07hello\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert_eq!(
            err.kind,
            ErrorKind::InvalidCharacter,
            "expected InvalidCharacter for BEL as first body char, got: {:?}",
            err.kind
        );
    }

    /// The error message names the offending code point in `U+XXXX` form.
    #[test]
    fn comment_error_message_contains_uplus_hex() {
        let mut lex = make_lexer("# hello\x07world\n");
        let Err(err) = lex.try_consume_comment(1024) else {
            unreachable!("expected Err, got Ok")
        };
        assert!(
            err.message.contains("U+0007"),
            "error message must contain U+0007, got: {}",
            err.message
        );
    }

    /// Goes through the public event API rather than the lexer: a comment
    /// trailing a value must also be checked for non-printable characters.
    #[test]
    fn trailing_comment_non_printable_in_body_produces_error() {
        let events: Vec<_> = crate::parse_events("key: value # comment\x07here\n").collect();
        let has_invalid_character_error = events.iter().any(|r| {
            r.as_ref()
                .err()
                .is_some_and(|e| e.kind == ErrorKind::InvalidCharacter)
        });
        assert!(
            has_invalid_character_error,
            "expected InvalidCharacter error for BEL in trailing comment"
        );
    }

    // ---- Non-printable characters: accepted ------------------------------

    #[test]
    fn comment_accepts_tab_in_body() {
        let mut lex = make_lexer("# col1\tcol2\n");
        let Ok(Some((text, _))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        assert!(text.contains('\t'), "TAB must be accepted in comment body");
    }

    /// Only asserts on the error kind: any outcome other than an
    /// `InvalidCharacter` error is accepted for NEL.
    #[test]
    fn comment_accepts_nel_0x85() {
        let mut lex = make_lexer("# val\u{0085}ue\n");
        if let Err(e) = lex.try_consume_comment(1024) {
            assert_ne!(
                e.kind,
                ErrorKind::InvalidCharacter,
                "NEL must not be rejected as InvalidCharacter in comment, got: {:?}",
                e.kind
            );
        }
    }

    #[test]
    fn comment_body_empty_no_error() {
        let mut lex = make_lexer("#\n");
        let Ok(Some((text, _))) = lex.try_consume_comment(1024) else {
            unreachable!("expected Ok(Some(...))")
        };
        assert_eq!(text, "", "empty comment body must produce empty text slice");
    }
}