use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
use super::document::{Directive, Document, ListItem, Paragraph, Section};
use crate::types::SourceFile;
/// Parses Markdown `text` into a [`Document`] attributed to `source`.
///
/// The parser runs with tables, footnotes, strikethrough and task lists
/// enabled and walks the event stream once, producing:
///
/// - **Sections** — every heading starts a new section whose title is the
///   trimmed heading text. Paragraphs seen before the first heading go into an
///   untitled section of depth 0. If nothing at all was collected, a single
///   empty untitled section is emitted so the section list is never empty.
/// - **Paragraphs** — the text of each paragraph, with soft breaks turned into
///   spaces and hard breaks into newlines. Inline code spans and the contents
///   of code blocks are not added to paragraph text.
/// - **List items** — one entry per list item with its nesting depth (at
///   least 1) and the line it starts on.
/// - **Directives** — suppression comments found in HTML:
///   - the `disable-next-line` form is attached to the line of the next
///     paragraph, heading or list item; if no such block follows, it is
///     dropped;
///   - the block form opens a scope at the `disable` comment and closes it at
///     a matching `enable` comment (an `enable` without a rule id closes every
///     open scope); scopes still open at the end extend to the last line.
///
/// All line numbers are 1-based.
#[must_use]
pub fn parse_markdown(text: &str, source: SourceFile) -> Document {
    let options = Options::ENABLE_TABLES
        | Options::ENABLE_FOOTNOTES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS;

    let mut sections: Vec<Section> = Vec::new();
    let mut directives: Vec<Directive> = Vec::new();
    // Rule ids from `disable-next-line` comments that still wait for the next
    // content block to learn which line they apply to.
    let mut pending_directive_rules: Vec<String> = Vec::new();
    // Block scopes that have been opened but not yet closed: (rule id, start line).
    let mut open_blocks: Vec<(String, u32)> = Vec::new();
    let mut list_items: Vec<ListItem> = Vec::new();
    let mut list_depth: u32 = 0;

    // Section currently being accumulated.
    let mut current_title: Option<String> = None;
    let mut current_depth: u32 = 0;
    let mut current_heading_line: Option<u32> = None;
    let mut current_paragraphs: Vec<Paragraph> = Vec::new();

    // Where in the event stream we are, and the text collected for the
    // heading or paragraph that is currently open.
    let mut in_heading: Option<HeadingLevel> = None;
    let mut in_paragraph = false;
    let mut in_code = false;
    let mut buf = String::new();
    let mut paragraph_start_line: u32 = 1;

    // The events are consumed lazily; there is no need to materialise them.
    for (event, range) in Parser::new_ext(text, options).into_offset_iter() {
        // Resolve pending line-form directives *before* handling the event so
        // they point at the block that is about to start.
        if !pending_directive_rules.is_empty() && is_content_block_start(&event) {
            let target_line = offset_to_line(text, range.start);
            for rule_id in std::mem::take(&mut pending_directive_rules) {
                directives.push(Directive::new(rule_id, target_line));
            }
        }

        match event {
            Event::Start(Tag::Heading { level, .. }) => {
                // A heading closes whatever came before it.
                finish_paragraph(
                    &mut in_paragraph,
                    &mut buf,
                    &mut current_paragraphs,
                    paragraph_start_line,
                );
                finish_section(
                    &mut sections,
                    &mut current_title,
                    &mut current_depth,
                    &mut current_heading_line,
                    &mut current_paragraphs,
                );
                in_heading = Some(level);
                current_heading_line = Some(offset_to_line(text, range.start));
                buf.clear();
            },
            Event::End(TagEnd::Heading(level)) => {
                // A heading without any text yields an untitled section.
                current_title = Some(buf.trim().to_string()).filter(|s| !s.is_empty());
                current_depth = heading_depth(level);
                current_paragraphs = Vec::new();
                in_heading = None;
                buf.clear();
            },
            Event::Start(Tag::Paragraph) => {
                in_paragraph = true;
                buf.clear();
                paragraph_start_line = offset_to_line(text, range.start);
            },
            Event::End(TagEnd::Paragraph) => {
                finish_paragraph(
                    &mut in_paragraph,
                    &mut buf,
                    &mut current_paragraphs,
                    paragraph_start_line,
                );
            },
            Event::Start(Tag::List(_)) => {
                list_depth = list_depth.saturating_add(1);
            },
            Event::End(TagEnd::List(_)) => {
                list_depth = list_depth.saturating_sub(1);
            },
            Event::Start(Tag::Item) => {
                list_items.push(ListItem::new(
                    list_depth.max(1),
                    offset_to_line(text, range.start),
                ));
            },
            Event::Start(Tag::CodeBlock(_)) => {
                in_code = true;
            },
            Event::End(TagEnd::CodeBlock) => {
                in_code = false;
            },
            Event::Code(_) => {
                // Inline code is deliberately left out of the paragraph text.
            },
            Event::Html(s) | Event::InlineHtml(s) => {
                let block_line = offset_to_line(text, range.start);
                for (parsed, line) in parse_all_directives_in_html(&s, block_line) {
                    match parsed {
                        ParsedDirective::LineForm { rule_id } => {
                            pending_directive_rules.push(rule_id);
                        },
                        ParsedDirective::BlockOpen { rule_id } => {
                            open_blocks.push((rule_id, line));
                        },
                        ParsedDirective::BlockClose { rule_id: Some(id) } => {
                            // Close the most recently opened scope for this
                            // rule; an `enable` without a matching open scope
                            // is ignored.
                            if let Some(pos) = open_blocks.iter().rposition(|(r, _)| r == &id) {
                                let (rule_id, start) = open_blocks.remove(pos);
                                directives.push(Directive::block(rule_id, start, line));
                            }
                        },
                        ParsedDirective::BlockClose { rule_id: None } => {
                            // A bare `enable` closes every open scope.
                            for (rule_id, start) in std::mem::take(&mut open_blocks) {
                                directives.push(Directive::block(rule_id, start, line));
                            }
                        },
                    }
                }
            },
            Event::Text(s) => {
                if in_code {
                    continue;
                }
                if in_heading.is_some() || in_paragraph {
                    buf.push_str(&s);
                }
            },
            Event::SoftBreak if in_heading.is_some() || in_paragraph => {
                buf.push(' ');
            },
            Event::HardBreak if in_heading.is_some() || in_paragraph => {
                buf.push('\n');
            },
            _ => {},
        }
    }

    // Flush whatever is still open at the end of the input.
    finish_paragraph(
        &mut in_paragraph,
        &mut buf,
        &mut current_paragraphs,
        paragraph_start_line,
    );
    finish_section(
        &mut sections,
        &mut current_title,
        &mut current_depth,
        &mut current_heading_line,
        &mut current_paragraphs,
    );
    if sections.is_empty() {
        sections.push(Section::new(None, 0, Vec::new()));
    }

    // Scopes that were never closed run to the last line of the document.
    if !open_blocks.is_empty() {
        let last_line = offset_to_line(text, text.len().saturating_sub(1));
        for (rule_id, start) in open_blocks {
            directives.push(Directive::block(rule_id, start, last_line.max(start)));
        }
    }

    Document::with_metadata(source, sections, directives, list_items)
}
/// Reports whether `event` opens a block that a pending `disable-next-line`
/// directive can attach to: a paragraph, a heading or a list item.
fn is_content_block_start(event: &Event<'_>) -> bool {
    if let Event::Start(tag) = event {
        matches!(tag, Tag::Paragraph | Tag::Heading { .. } | Tag::Item)
    } else {
        false
    }
}
/// A suppression directive recognised inside a single HTML comment.
#[derive(Debug, Clone, PartialEq, Eq)]
enum ParsedDirective {
    /// `<!-- lucid-lint disable-next-line <rule-id> -->`: applies to the next
    /// content block only.
    LineForm { rule_id: String },
    /// `<!-- lucid-lint-disable <rule-id> -->`: opens a block scope for the rule.
    BlockOpen { rule_id: String },
    /// `<!-- lucid-lint-enable [<rule-id>] -->`: closes the scope of the given
    /// rule, or every open scope when no rule id is given.
    BlockClose { rule_id: Option<String> },
}
/// Finds every directive comment inside one chunk of HTML.
///
/// `html` is scanned left to right for `<!-- … -->` comments; each comment is
/// handed to `parse_single_directive` and comments that are not directives are
/// skipped. An unterminated comment ends the scan.
///
/// `block_start_line` is the line on which `html` begins. Each returned
/// directive is paired with the line its comment starts on, i.e.
/// `block_start_line` plus the number of newlines that precede the comment
/// inside `html`. The addition saturates at `u32::MAX`.
fn parse_all_directives_in_html(html: &str, block_start_line: u32) -> Vec<(ParsedDirective, u32)> {
    const COMMENT_OPEN: &str = "<!--";
    const COMMENT_CLOSE: &str = "-->";

    let mut found = Vec::new();
    let mut cursor = 0usize;
    // Running count of the newlines in `html[..counted_to]`, so that each byte
    // of the chunk is inspected at most once however many directives it holds.
    let mut counted_to = 0usize;
    let mut newlines_before = 0usize;

    while let Some(open_rel) = html[cursor..].find(COMMENT_OPEN) {
        let open = cursor + open_rel;
        let Some(close_rel) = html[open..].find(COMMENT_CLOSE) else {
            break;
        };
        let close = open + close_rel + COMMENT_CLOSE.len();

        if let Some(parsed) = parse_single_directive(&html[open..close]) {
            // A plain byte scan is enough here; no need for a dedicated crate.
            #[allow(clippy::naive_bytecount)]
            let newly_seen = html.as_bytes()[counted_to..open]
                .iter()
                .filter(|&&b| b == b'\n')
                .count();
            newlines_before += newly_seen;
            counted_to = open;

            // Saturate instead of silently falling back to the first line.
            let line_offset = u32::try_from(newlines_before).unwrap_or(u32::MAX);
            found.push((parsed, block_start_line.saturating_add(line_offset)));
        }
        cursor = close;
    }
    found
}
/// Parses one HTML comment into a directive, if it is one.
///
/// Whitespace around the comment and around its content is ignored. The
/// recognised forms are:
///
/// - `<!-- lucid-lint-disable <rule-id> -->` → `BlockOpen`
/// - `<!-- lucid-lint-enable -->` → `BlockClose` without a rule id
/// - `<!-- lucid-lint-enable <rule-id> -->` → `BlockClose` for that rule
/// - `<!-- lucid-lint disable-next-line <rule-id> -->` → `LineForm`
///
/// In every form the rule id must be separated from the keyword by
/// whitespace, must not be empty and must satisfy `is_valid_rule_id`.
/// Anything else — including keywords that merely share a prefix, such as
/// `lucid-lint-enabled` — yields `None`.
fn parse_single_directive(html: &str) -> Option<ParsedDirective> {
    /// Extracts the rule id that follows a directive keyword: at least one
    /// whitespace character, then a non-empty, valid id.
    fn rule_id_argument(rest: &str) -> Option<&str> {
        let rule_id = rest.strip_prefix(|c: char| c.is_whitespace())?.trim();
        if rule_id.is_empty() || !is_valid_rule_id(rule_id) {
            None
        } else {
            Some(rule_id)
        }
    }

    let inner = html
        .trim()
        .strip_prefix("<!--")?
        .strip_suffix("-->")?
        .trim();

    if let Some(rest) = inner.strip_prefix("lucid-lint-disable") {
        let rule_id = rule_id_argument(rest)?;
        return Some(ParsedDirective::BlockOpen {
            rule_id: rule_id.to_string(),
        });
    }

    if let Some(rest) = inner.strip_prefix("lucid-lint-enable") {
        // `inner` is trimmed, so an empty remainder means the bare form.
        if rest.is_empty() {
            return Some(ParsedDirective::BlockClose { rule_id: None });
        }
        // Otherwise the keyword must end here: require the separator so that
        // e.g. `lucid-lint-enabled` is not read as an enable for rule `d`.
        let rule_id = rule_id_argument(rest)?;
        return Some(ParsedDirective::BlockClose {
            rule_id: Some(rule_id.to_string()),
        });
    }

    let rest = inner.strip_prefix("lucid-lint")?;
    let rest = rest.strip_prefix(|c: char| c.is_whitespace())?.trim_start();
    let rest = rest.strip_prefix("disable-next-line")?;
    let rule_id = rule_id_argument(rest)?;
    Some(ParsedDirective::LineForm {
        rule_id: rule_id.to_string(),
    })
}
/// Checks that `s` consists only of ASCII lowercase letters, ASCII digits,
/// `-` and `.`.
///
/// The empty string passes this check; callers reject it separately.
fn is_valid_rule_id(s: &str) -> bool {
    // Every permitted character is a single ASCII byte, and no byte of a
    // multi-byte UTF-8 sequence falls in these ranges, so a byte scan suffices.
    s.bytes()
        .all(|b| matches!(b, b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.'))
}
/// Closes the paragraph that is currently open, if any.
///
/// When `in_paragraph` is set, the trimmed contents of `buf` are stored as a
/// new paragraph starting at `start_line` (unless they are empty), `buf` is
/// emptied and the flag is cleared. When no paragraph is open, nothing is
/// touched.
fn finish_paragraph(
    in_paragraph: &mut bool,
    buf: &mut String,
    paragraphs: &mut Vec<Paragraph>,
    start_line: u32,
) {
    // Clears the flag and tells us in one step whether a paragraph was open.
    let was_open = std::mem::replace(in_paragraph, false);
    if !was_open {
        return;
    }

    let body = buf.trim();
    if !body.is_empty() {
        paragraphs.push(Paragraph::new(body.to_string(), start_line));
    }
    buf.clear();
}
/// Moves the section accumulated so far into `sections`.
///
/// A section is emitted only when it has a title or at least one paragraph;
/// otherwise every argument is left untouched. On emission the title, the
/// heading line and the paragraphs are taken out of their slots and `depth`
/// is reset to 0, so the caller starts the next section from a clean state.
/// The heading line, when one was recorded, is stored with the section.
fn finish_section(
    sections: &mut Vec<Section>,
    title: &mut Option<String>,
    depth: &mut u32,
    heading_line: &mut Option<u32>,
    paragraphs: &mut Vec<Paragraph>,
) {
    // Nothing was collected since the previous section was closed.
    if title.is_none() && paragraphs.is_empty() {
        return;
    }

    let heading = title.take();
    let body = std::mem::take(paragraphs);
    let section = if let Some(line) = heading_line.take() {
        Section::with_heading_line(heading, *depth, line, body)
    } else {
        Section::new(heading, *depth, body)
    };

    sections.push(section);
    *depth = 0;
}
/// Maps a heading level to its numeric depth: `H1` is 1 and `H6` is 6.
///
/// The match lists every level explicitly, so the result does not depend on
/// how `HeadingLevel` numbers its variants internally.
const fn heading_depth(level: HeadingLevel) -> u32 {
match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
}
}
/// Converts a byte `offset` into `text` to a 1-based line number.
///
/// The line number is one more than the number of `\n` bytes that precede
/// `offset`. Offsets past the end of `text` are clamped to its length, and a
/// count that does not fit into `u32` saturates at `u32::MAX`.
fn offset_to_line(text: &str, offset: usize) -> u32 {
    let end = offset.min(text.len());
    let prefix = &text.as_bytes()[..end];
    // Splitting on newlines yields one piece per line touched by the prefix
    // (a single empty piece for an empty prefix), which is exactly the
    // 1-based number of the line that `offset` falls on.
    let line = prefix.split(|&byte| byte == b'\n').count();
    u32::try_from(line).unwrap_or(u32::MAX)
}
// Unit tests for `parse_markdown`: section and paragraph extraction first,
// then the suppression directives found in HTML comments.
#[cfg(test)]
mod tests {
use super::*;
// Each heading opens its own section, and each section keeps its paragraph.
#[test]
fn parses_simple_markdown() {
let md = "# Title\n\nFirst paragraph.\n\n## Sub\n\nSecond paragraph.";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.sections.len(), 2);
assert_eq!(doc.sections[0].title.as_deref(), Some("Title"));
assert_eq!(doc.sections[0].paragraphs.len(), 1);
assert_eq!(doc.sections[1].title.as_deref(), Some("Sub"));
assert_eq!(doc.sections[1].paragraphs.len(), 1);
}
// Emphasis and strong markers are dropped but their text is kept; the text
// of inline code spans is left out of the paragraph.
#[test]
fn preserves_inline_emphasis_text() {
let md = "Some *emphasized* and **strong** and `code` text.";
let doc = parse_markdown(md, SourceFile::Anonymous);
let para = &doc.sections[0].paragraphs[0].text;
assert!(para.contains("emphasized"));
assert!(para.contains("strong"));
assert!(!para.contains("code"));
}
// The contents of a fenced code block never reach the paragraph list.
#[test]
fn excludes_fenced_code_blocks() {
let md = "Before.\n\n```\nignored code\n```\n\nAfter.";
let doc = parse_markdown(md, SourceFile::Anonymous);
let texts: Vec<_> = doc.sections[0]
.paragraphs
.iter()
.map(|p| p.text.clone())
.collect();
assert!(!texts.iter().any(|t| t.contains("ignored code")));
assert!(texts.iter().any(|t| t == "Before."));
assert!(texts.iter().any(|t| t == "After."));
}
// Section depth follows the heading level.
#[test]
fn extracts_heading_hierarchy() {
let md = "# H1\n\nIntro.\n\n## H2\n\nSubcontent.\n\n### H3\n\nDeep.";
let doc = parse_markdown(md, SourceFile::Anonymous);
let depths: Vec<u32> = doc.sections.iter().map(|s| s.depth).collect();
assert_eq!(depths, vec![1, 2, 3]);
}
// Empty input must not panic and must not produce any paragraph.
#[test]
fn handles_empty_markdown() {
let doc = parse_markdown("", SourceFile::Anonymous);
assert!(doc.sections.is_empty() || doc.sections[0].paragraphs.is_empty());
}
// Without headings, all paragraphs land in one untitled section.
#[test]
fn handles_markdown_with_no_headings() {
let md = "Just a paragraph.\n\nAnd another.";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.sections.len(), 1);
assert_eq!(doc.sections[0].title, None);
assert_eq!(doc.sections[0].paragraphs.len(), 2);
}
// The comment sits on line 3; the directive targets the paragraph on line 4.
#[test]
fn extracts_disable_next_line_directive() {
let md = "Intro.\n\n<!-- lucid-lint disable-next-line structure.sentence-too-long -->\n\
A long sentence that will be suppressed.\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.directives.len(), 1);
assert_eq!(doc.directives[0].rule_id, "structure.sentence-too-long");
assert_eq!(doc.directives[0].start_line, 4);
assert_eq!(doc.directives[0].end_line, 4);
}
// Ordinary HTML comments are not directives.
#[test]
fn ignores_non_directive_html_comments() {
let md = "Intro.\n\n<!-- just a regular comment -->\n\nAfter.";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert!(doc.directives.is_empty());
}
// Rule ids with uppercase letters or underscores are rejected.
#[test]
fn rejects_directive_with_invalid_rule_id() {
let md = "<!-- lucid-lint disable-next-line Bad_Rule -->\nText.\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert!(doc.directives.is_empty());
}
// A line-form directive with no content block after it has nothing to
// attach to and is discarded.
#[test]
fn directive_without_following_content_is_dropped() {
let md = "Body.\n\n<!-- lucid-lint disable-next-line structure.sentence-too-long -->\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert!(doc.directives.is_empty());
}
// The scope runs from the disable comment (line 3) to the enable comment
// (line 9); the paragraph on line 11 is outside of it.
#[test]
fn extracts_block_disable_and_enable_directive() {
let md = "Intro.\n\n\
<!-- lucid-lint-disable structure.sentence-too-long -->\n\n\
Inside block.\n\n\
More inside.\n\n\
<!-- lucid-lint-enable -->\n\n\
After.\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.directives.len(), 1);
let d = &doc.directives[0];
assert_eq!(d.rule_id, "structure.sentence-too-long");
assert_eq!(d.start_line, 3);
assert_eq!(d.end_line, 9);
assert!(d.covers(5));
assert!(d.covers(7));
assert!(!d.covers(11));
}
// An enable that names a rule closes only that rule's scope; the other
// scope stays open until the bare enable further down.
#[test]
fn block_enable_with_rule_id_closes_matching_scope_only() {
let md = "<!-- lucid-lint-disable structure.sentence-too-long -->\n\n\
<!-- lucid-lint-disable lexicon.weasel-words -->\n\n\
Between.\n\n\
<!-- lucid-lint-enable structure.sentence-too-long -->\n\n\
After.\n\n\
<!-- lucid-lint-enable -->\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.directives.len(), 2);
let sentence = doc
.directives
.iter()
.find(|d| d.rule_id == "structure.sentence-too-long")
.expect("sentence-too-long directive");
let weasel = doc
.directives
.iter()
.find(|d| d.rule_id == "lexicon.weasel-words")
.expect("weasel-words directive");
assert!(sentence.end_line < weasel.end_line);
}
// A scope that is never closed still produces a directive covering the
// content that follows it.
#[test]
fn unterminated_block_disable_extends_to_end_of_document() {
let md = "Intro.\n\n\
<!-- lucid-lint-disable structure.sentence-too-long -->\n\n\
Body.\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert_eq!(doc.directives.len(), 1);
let d = &doc.directives[0];
assert_eq!(d.rule_id, "structure.sentence-too-long");
assert!(d.end_line >= d.start_line);
assert!(d.covers(5));
}
// An enable without a preceding disable for that rule does nothing.
#[test]
fn enable_with_no_matching_open_scope_is_ignored() {
let md = "<!-- lucid-lint-enable structure.sentence-too-long -->\n\nText.\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert!(doc.directives.is_empty());
}
// An invalid rule id opens no scope, so the later enable has nothing to close.
#[test]
fn block_directive_with_invalid_rule_id_is_rejected() {
let md = "<!-- lucid-lint-disable Bad_Rule -->\n\nBody.\n\n\
<!-- lucid-lint-enable -->\n";
let doc = parse_markdown(md, SourceFile::Anonymous);
assert!(doc.directives.is_empty());
}
// Paragraph start lines are recorded in document order.
#[test]
fn preserves_paragraph_start_line() {
let md = "Line 1.\n\nLine 3.\n\nLine 5.";
let doc = parse_markdown(md, SourceFile::Anonymous);
let lines: Vec<u32> = doc.sections[0]
.paragraphs
.iter()
.map(|p| p.start_line)
.collect();
assert_eq!(lines.len(), 3);
assert!(lines[0] <= lines[1] && lines[1] <= lines[2]);
}
}