use mago_span::Span;
use crate::error::ParseError;
use crate::internal::token::Token;
/// Splits the body of a `/** ... */` comment into per-line [`Token`]s.
///
/// `comment` must be the complete comment, delimiters included, and `span` is the
/// span the comment occupies; every produced token carries a sub-span of `span`
/// computed from byte offsets relative to the start of `comment`.
///
/// * **Single-line comments** (no `\n` between the delimiters) yield at most one
///   [`Token::Line`]. At most one leading and one trailing space are removed; if
///   nothing is left, no token is produced.
/// * **Multi-line comments** are processed line by line:
///   * lines containing only ASCII whitespace are skipped;
///   * a line that is just `*` (after indentation) yields a [`Token::EmptyLine`]
///     with a zero-width span at the start of that line;
///   * any other line yields a [`Token::Line`] whose content starts after the
///     indentation, an optional leading `*`, and at most one following ASCII
///     whitespace byte, and ends before any trailing ASCII whitespace
///     (including the `\r` of a CRLF line ending).
///
/// # Errors
///
/// Returns [`ParseError::InvalidComment`] when `comment` is shorter than five
/// bytes, does not start with `/**`, or does not end with `*/`.
#[inline]
pub fn tokenize<'src>(comment: &'src [u8], span: Span) -> Result<Vec<Token<'src>>, ParseError> {
    if comment.len() < 5 || !comment.starts_with(b"/**") || !comment.ends_with(b"*/") {
        return Err(ParseError::InvalidComment(span));
    }

    // Offset of the first byte after the opening `/**`, relative to `comment`.
    // Offsets are kept as `u32` because that is what `subspan` accepts.
    let content_start = 3u32;
    let content = &comment[3..(comment.len() - 2)];

    if !content.contains(&b'\n') {
        // Single-line form: peel off at most one padding space on each side and
        // move the reported span inwards accordingly.
        let mut text = content;
        let mut text_start = content_start;
        let mut text_end = (comment.len() - 2) as u32;

        if let Some(rest) = text.strip_prefix(b" ") {
            text = rest;
            text_start += 1;
        }
        if let Some(rest) = text.strip_suffix(b" ") {
            text = rest;
            text_end -= 1;
        }

        if text.is_empty() {
            return Ok(Vec::new());
        }

        return Ok(vec![Token::Line { content: text, span: span.subspan(text_start, text_end) }]);
    }

    let mut tokens = Vec::new();
    let mut line_start = 0usize;

    // Every `\n` terminates a line; the end of `content` terminates the last one.
    // Lines are handled as they are found rather than being collected first.
    let line_ends = memchr::memmem::find_iter(content, b"\n").chain(std::iter::once(content.len()));

    for line_end in line_ends {
        // Offset of this line's first byte, relative to the start of `comment`.
        let line_offset = content_start + line_start as u32;

        // `\r` counts as ASCII whitespace, so this also drops the CR of a CRLF ending.
        let line = content[line_start..line_end].trim_ascii_end();
        line_start = line_end + 1;

        if line.is_empty() {
            continue;
        }

        let indent = line.iter().position(|b| !b.is_ascii_whitespace()).unwrap_or(line.len());
        let after_indent = &line[indent..];

        // The leading `*` is optional; when present it is not part of the content.
        let (after_asterisk, mut text_start) = match after_indent.strip_prefix(b"*") {
            Some(rest) => (rest, indent + 1),
            None => (after_indent, indent),
        };

        let Some(first_byte) = after_asterisk.first() else {
            // Nothing but the asterisk: an intentionally empty docblock line.
            tokens.push(Token::EmptyLine { span: span.subspan(line_offset, line_offset) });
            continue;
        };

        // Only a single separator byte after the asterisk is dropped so that any
        // further indentation stays part of the line's content.
        if first_byte.is_ascii_whitespace() {
            text_start += 1;
        }

        let start = line_offset + text_start as u32;
        let end = line_offset + line.len() as u32;

        tokens.push(Token::Line { content: &comment[start as usize..end as usize], span: span.subspan(start, end) });
    }

    Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;
    use mago_database::file::FileId;
    use mago_span::Position;

    // Multi-line fixtures are assembled with `concat!` and explicit `\n` escapes so
    // that the exact bytes (in particular leading whitespace, which these tests are
    // sensitive to) are visible and do not depend on how this file is formatted.

    /// Tokenizes `comment` using a span that starts at offset zero and covers the
    /// whole input, panicking with `"{failure_context}: {error:?}"` on failure.
    fn lex<'a>(comment: &'a str, failure_context: &str) -> Vec<Token<'a>> {
        let span = Span::new(FileId::zero(), Position::new(0), Position::new(comment.len() as u32));

        match tokenize(comment.as_bytes(), span) {
            Ok(tokens) => tokens,
            Err(e) => panic!("{failure_context}: {e:?}"),
        }
    }

    /// Asserts that `token` is a `Token::Line` holding `expected`, and that its span
    /// slices exactly those bytes back out of `comment`.
    fn assert_line(comment: &str, token: &Token<'_>, expected: &[u8]) {
        let Token::Line { content, span } = token else {
            panic!("Expected a line, but got something else");
        };

        assert_eq!(*content, expected);
        assert!(comment.as_bytes()[span.start.offset as usize..span.end.offset as usize].eq(*content));
    }

    /// Asserts that `tokens` consists solely of `Token::Line`s matching `expected`, in order.
    fn assert_lines(comment: &str, tokens: &[Token<'_>], expected: &[&[u8]]) {
        assert_eq!(tokens.len(), expected.len());

        for (token, &expected_content) in tokens.iter().zip(expected) {
            assert_line(comment, token, expected_content);
        }
    }

    #[test]
    fn test_lex_empty_single_line_comment() {
        let comment = "/***/";
        let tokens = lex(comment, "Error parsing comment");

        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn test_lex_empty_multiline_line_comment() {
        let comment = "/**\n*/";
        let tokens = lex(comment, "Error parsing comment");

        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn test_lex_single_line_comment() {
        let comment = "/** This is a single-line comment */";
        let tokens = lex(comment, "Error parsing comment");

        assert_lines(comment, &tokens, &[b"This is a single-line comment" as &[u8]]);
    }

    #[test]
    fn test_lex_single_line_comment_missing_whitespace_front() {
        let comment = "/**This is a single-line comment */";
        let tokens = lex(comment, "Error parsing comment");

        assert_lines(comment, &tokens, &[b"This is a single-line comment" as &[u8]]);
    }

    #[test]
    fn test_lex_single_line_comment_missing_whitespace_back() {
        let comment = "/** This is a single-line comment*/";
        let tokens = lex(comment, "Error parsing comment");

        assert_lines(comment, &tokens, &[b"This is a single-line comment" as &[u8]]);
    }

    #[test]
    fn test_lex_multi_line_comment() {
        let comment = concat!(
            "/**\n",
            "* This is a multi-line comment.\n",
            "* It has multiple lines.\n",
            "* Each line starts with an asterisk.\n",
            "*/",
        );
        let tokens = lex(comment, "Error parsing comment");

        let expected_contents: [&[u8]; 3] = [
            b"This is a multi-line comment.",
            b"It has multiple lines.",
            b"Each line starts with an asterisk.",
        ];
        assert_lines(comment, &tokens, &expected_contents);
    }

    #[test]
    fn test_lex_multi_line_comment_indent() {
        // The `$foo`/`$bar` lines carry two spaces after the asterisk: the first is
        // the separator the tokenizer drops, the second is indentation that must be
        // preserved in the token content.
        let comment = concat!(
            "/**\n",
            "* This is a multi-line comment.\n",
            "* It has multiple lines.\n",
            "* Each line starts with an asterisk.\n",
            "*\n",
            "*  $foo = \"bar\";\n",
            "*  $bar = \"baz\";\n",
            "*/",
        );
        let tokens = lex(comment, "Error parsing comment");

        assert_eq!(tokens.len(), 6);
        let expected_contents: [&[u8]; 6] = [
            b"This is a multi-line comment.",
            b"It has multiple lines.",
            b"Each line starts with an asterisk.",
            b"",
            b" $foo = \"bar\";",
            b" $bar = \"baz\";",
        ];

        for (i, token) in tokens.iter().enumerate() {
            let expected_content = expected_contents[i];

            if expected_content.is_empty() {
                // A bare `*` line must surface as an empty line with a zero-width span.
                match token {
                    Token::EmptyLine { span } => {
                        assert_eq!(&comment[span.start.offset as usize..span.end.offset as usize], "");
                    }
                    _ => {
                        panic!("Expected an empty line, but got something else");
                    }
                }
            } else {
                assert_line(comment, token, expected_content);
            }
        }
    }

    #[test]
    fn test_lex_multi_line_comment_inconsistent_indentation() {
        // Each line is indented by a different amount; the indentation before the
        // asterisk must not leak into the content.
        let comment = concat!(
            "/**\n",
            "    * This is a multi-line comment.\n",
            "  * It has multiple lines.\n",
            "      * Each line starts with an asterisk.\n",
            " */",
        );
        let tokens = lex(comment, "Unexpected error");

        let expected_contents: [&[u8]; 3] = [
            b"This is a multi-line comment.",
            b"It has multiple lines.",
            b"Each line starts with an asterisk.",
        ];
        assert_lines(comment, &tokens, &expected_contents);
    }

    #[test]
    fn test_lex_multi_line_comment_missing_asterisk() {
        let comment = concat!(
            "/**\n",
            "* This is a multi-line comment.\n",
            "It has multiple lines.\n",
            "* Each line starts with an asterisk.\n",
            "*/",
        );
        let tokens = lex(comment, "Unexpected error");

        let expected_contents: [&[u8]; 3] = [
            b"This is a multi-line comment.",
            b"It has multiple lines.",
            b"Each line starts with an asterisk.",
        ];
        assert_lines(comment, &tokens, &expected_contents);
    }

    #[test]
    fn test_lex_multi_line_comment_missing_whitespace_after_asterisk() {
        let comment = concat!(
            "/**\n",
            "* This is a multi-line comment.\n",
            "*It has multiple lines.\n",
            "* Each line starts with an asterisk.\n",
            "*/",
        );
        let tokens = lex(comment, "Unexpected error");

        let expected_contents: [&[u8]; 3] = [
            b"This is a multi-line comment.",
            b"It has multiple lines.",
            b"Each line starts with an asterisk.",
        ];
        assert_lines(comment, &tokens, &expected_contents);
    }

    #[test]
    fn test_lex_multi_line_comment_crlf_with_multibyte_char() {
        // CRLF line endings combined with multi-byte UTF-8 content: the `\r` must be
        // dropped and the span must still slice on valid character boundaries.
        let comment = "/**\r\n * blah blah ‰©\r\n */";
        let tokens = lex(comment, "Failed to tokenize comment with CRLF endings");

        assert_eq!(tokens.len(), 1, "Should have parsed exactly one line of content");
        let Token::Line { content, span: token_span } = &tokens[0] else {
            panic!("Expected a Token::Line");
        };

        let expected_content = "blah blah ‰©".as_bytes();
        assert_eq!(*content, expected_content);

        let sliced = &comment[token_span.start.offset as usize..token_span.end.offset as usize];
        assert_eq!(sliced.as_bytes(), expected_content);
    }
}