use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
/// One slice of a Markdown document, as produced by `extract_sections`.
///
/// A section normally starts at a heading and collects the paragraph text,
/// list items and code blocks that follow it.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct MarkdownSection {
    /// Trimmed heading text; empty when the section has no heading.
    pub heading: String,
    /// Heading depth (1 for `#` through 6 for `######`); 0 when the section has no heading.
    pub heading_level: u8,
    /// Text collected from the section's paragraphs.
    pub body_text: String,
    /// Trimmed text of each list item found in the section.
    pub list_items: Vec<String>,
    /// Whether the section's list was an ordered (numbered) list.
    pub is_ordered_list: bool,
    /// Code blocks as `(language, body)` pairs; the language is `None` for
    /// indented blocks and for fences without an info string.
    pub code_blocks: Vec<(Option<String>, String)>,
    /// 1-based source line on which the section starts.
    pub source_line_start: usize,
    /// 1-based source line on which the section ends (never before `source_line_start`).
    pub source_line_end: usize,
}
pub(crate) fn extract_sections(markdown: &str) -> Vec<MarkdownSection> {
let opts = Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
let parser = Parser::new_ext(markdown, opts);
let line_offsets = build_line_offsets(markdown);
let mut sections: Vec<MarkdownSection> = Vec::new();
let mut current: Option<MarkdownSection> = None;
let mut in_heading = false;
let mut heading_text = String::new();
let mut heading_level: u8 = 0;
let mut _in_list = false;
let mut in_list_item = false;
let mut list_item_text = String::new();
let mut is_ordered = false;
let mut in_code_block = false;
let mut code_lang: Option<String> = None;
let mut code_body = String::new();
let mut in_paragraph = false;
for (event, range) in parser.into_offset_iter() {
let line_num = offset_to_line(&line_offsets, range.start);
match event {
Event::Start(Tag::Heading { level, .. }) => {
if let Some(mut sec) = current.take() {
sec.source_line_end = line_num.saturating_sub(1).max(sec.source_line_start);
sections.push(sec);
}
in_heading = true;
heading_level = heading_level_to_u8(level);
heading_text.clear();
}
Event::End(TagEnd::Heading(_)) => {
in_heading = false;
current = Some(MarkdownSection {
heading: heading_text.trim().to_owned(),
heading_level,
body_text: String::new(),
list_items: Vec::new(),
is_ordered_list: false,
code_blocks: Vec::new(),
source_line_start: line_num,
source_line_end: line_num,
});
}
Event::Start(Tag::List(first_item)) => {
_in_list = true;
is_ordered = first_item.is_some();
}
Event::End(TagEnd::List(_)) => {
_in_list = false;
if let Some(ref mut sec) = current {
sec.is_ordered_list = is_ordered;
}
}
Event::Start(Tag::Item) => {
in_list_item = true;
list_item_text.clear();
}
Event::End(TagEnd::Item) => {
in_list_item = false;
if let Some(ref mut sec) = current {
sec.list_items.push(list_item_text.trim().to_owned());
}
}
Event::Start(Tag::CodeBlock(kind)) => {
in_code_block = true;
code_lang = match kind {
pulldown_cmark::CodeBlockKind::Fenced(lang) => {
let l = lang.trim().to_owned();
if l.is_empty() { None } else { Some(l) }
}
pulldown_cmark::CodeBlockKind::Indented => None,
};
code_body.clear();
}
Event::End(TagEnd::CodeBlock) => {
in_code_block = false;
if let Some(ref mut sec) = current {
sec.code_blocks
.push((code_lang.take(), code_body.trim_end().to_owned()));
} else {
let mut sec = MarkdownSection {
heading: String::new(),
heading_level: 0,
body_text: String::new(),
list_items: Vec::new(),
is_ordered_list: false,
code_blocks: vec![(code_lang.take(), code_body.trim_end().to_owned())],
source_line_start: line_num,
source_line_end: line_num,
};
sec.source_line_end = line_num;
current = Some(sec);
}
code_body.clear();
}
Event::Start(Tag::Paragraph) => {
in_paragraph = true;
}
Event::End(TagEnd::Paragraph) => {
in_paragraph = false;
}
Event::Text(text) | Event::Code(text) => {
if in_heading {
heading_text.push_str(&text);
} else if in_code_block {
code_body.push_str(&text);
} else if in_list_item {
list_item_text.push_str(&text);
} else if in_paragraph {
if let Some(ref mut sec) = current {
if !sec.body_text.is_empty() && !sec.body_text.ends_with('\n') {
sec.body_text.push(' ');
}
sec.body_text.push_str(&text);
}
}
}
Event::SoftBreak | Event::HardBreak => {
if in_heading {
heading_text.push(' ');
} else if in_list_item {
list_item_text.push(' ');
} else if let Some(ref mut sec) = current {
if in_paragraph {
sec.body_text.push(' ');
}
}
}
_ => {}
}
}
if let Some(mut sec) = current.take() {
let total_lines = markdown.lines().count();
sec.source_line_end = total_lines.max(sec.source_line_start);
sections.push(sec);
}
if sections.is_empty() && !markdown.trim().is_empty() {
sections.push(MarkdownSection {
heading: String::new(),
heading_level: 0,
body_text: markdown.to_owned(),
list_items: Vec::new(),
is_ordered_list: false,
code_blocks: Vec::new(),
source_line_start: 1,
source_line_end: markdown.lines().count().max(1),
});
}
sections
}
/// Returns the byte offset at which every line of `text` begins.
///
/// The first entry is always `0`; each `\n` byte contributes the offset of the
/// byte immediately after it, so the result is strictly increasing.
fn build_line_offsets(text: &str) -> Vec<usize> {
    let after_each_newline = text
        .match_indices('\n')
        .map(|(newline_at, _)| newline_at + 1);
    std::iter::once(0).chain(after_each_newline).collect()
}
/// Maps a byte `offset` to the 1-based number of the line containing it.
///
/// `line_offsets` must hold the start offset of each line in strictly
/// increasing order. The answer is the number of lines that start at or
/// before `offset`; an empty table yields `0`.
fn offset_to_line(line_offsets: &[usize], offset: usize) -> usize {
    line_offsets.partition_point(|&line_start| line_start <= offset)
}
/// Converts a parser heading level into its numeric depth (`H1` => 1 through `H6` => 6).
fn heading_level_to_u8(level: HeadingLevel) -> u8 {
    // Spelled out variant by variant so the mapping does not depend on how
    // the enum is represented.
    match level {
        HeadingLevel::H1 => 1,
        HeadingLevel::H2 => 2,
        HeadingLevel::H3 => 3,
        HeadingLevel::H4 => 4,
        HeadingLevel::H5 => 5,
        HeadingLevel::H6 => 6,
    }
}
/// Unit tests for `extract_sections`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading followed by a bullet list yields one section holding both items.
    #[test]
    fn test_extract_sections_single_heading_with_list() {
        let md = "## Login Flow\n\n- Step one\n- Step two\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].heading, "Login Flow");
        assert_eq!(sections[0].heading_level, 2);
        assert_eq!(sections[0].list_items.len(), 2);
        assert_eq!(sections[0].list_items[0], "Step one");
    }

    /// Each heading starts its own section.
    #[test]
    fn test_extract_sections_two_headings() {
        let md = "## First\n\nParagraph one.\n\n## Second\n\nParagraph two.\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, "First");
        assert_eq!(sections[1].heading, "Second");
    }

    /// A numbered list marks the section as ordered.
    #[test]
    fn test_extract_sections_ordered_list_detected() {
        let md = "## Steps\n\n1. Do this\n2. Do that\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 1);
        assert!(sections[0].is_ordered_list);
        assert_eq!(sections[0].list_items.len(), 2);
    }

    /// A fenced code block is captured together with its language tag.
    #[test]
    fn test_extract_sections_code_block_captured() {
        let md = "## Code Example\n\n```rust\nfn main() {}\n```\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].code_blocks.len(), 1);
        assert_eq!(sections[0].code_blocks[0].0.as_deref(), Some("rust"));
        assert!(sections[0].code_blocks[0].1.contains("fn main()"));
    }

    /// Input without headings falls back to a single heading-less section.
    #[test]
    fn test_extract_sections_no_headings_returns_single_section() {
        let md = "Just some text without any headings.\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 1);
        assert!(sections[0].heading.is_empty());
        assert!(sections[0].body_text.contains("Just some text"));
    }

    /// Empty input produces no sections at all.
    #[test]
    fn test_extract_sections_empty_input_returns_empty() {
        let sections = extract_sections("");
        assert!(sections.is_empty());
    }

    /// Lists, paragraphs and code blocks are attributed to the right section.
    #[test]
    fn test_extract_sections_mixed_content() {
        // The blank lines are spelled out explicitly: without one after the
        // list, the paragraph would be a lazy continuation of the last item
        // instead of body text.
        let md = concat!(
            "## Constraints\n",
            "\n",
            "- Must do X\n",
            "- Must not do Y\n",
            "\n",
            "Some explanatory text.\n",
            "\n",
            "## Flow\n",
            "\n",
            "1. Step one\n",
            "2. Step two\n",
            "\n",
            "```bash\n",
            "echo hello\n",
            "```\n",
        );
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].list_items.len(), 2);
        assert!(sections[0].body_text.contains("explanatory"));
        assert_eq!(sections[1].list_items.len(), 2);
        assert!(sections[1].is_ordered_list);
        assert_eq!(sections[1].code_blocks.len(), 1);
    }

    /// A deeper heading still opens a separate section with its own level.
    #[test]
    fn test_extract_sections_nested_headings() {
        let md = "## Parent\n\nText.\n\n### Child\n\nChild text.\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, "Parent");
        assert_eq!(sections[0].heading_level, 2);
        assert_eq!(sections[1].heading, "Child");
        assert_eq!(sections[1].heading_level, 3);
    }

    /// Start lines are 1-based and increase from one section to the next.
    #[test]
    fn test_extract_sections_source_line_numbers() {
        let md = "## First\n\nLine.\n\n## Second\n\nText.\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 2);
        assert!(sections[0].source_line_start >= 1);
        assert!(sections[1].source_line_start > sections[0].source_line_start);
    }

    /// Paragraph text following a heading ends up in `body_text`.
    #[test]
    fn test_extract_sections_body_text_paragraph() {
        let md = "## Heading\n\nFirst paragraph text.\n\nSecond paragraph text.\n";
        let sections = extract_sections(md);
        assert_eq!(sections.len(), 1);
        assert!(sections[0].body_text.contains("First paragraph"));
    }
}