1use std::collections::HashMap;
2use std::ops::Range;
3use std::path::Path;
4
5use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};
6
7use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
8use crate::Result;
9
/// Stateless extractor that splits Markdown source into typed blocks
/// (headings, paragraphs, list items, block quotes, code fences, table rows
/// and HTML blocks) via its [`Extractor`] implementation.
pub struct MarkdownExtractor;
11
12impl Extractor for MarkdownExtractor {
13 fn supports(&self) -> &[&str] {
14 &["md", "markdown", "mdown", "mkd"]
15 }
16
17 fn profile_key(&self) -> &'static str {
18 "md"
19 }
20
21 fn version(&self) -> u32 {
22 2
23 }
24
25 fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
26 let source = std::str::from_utf8(bytes).map_err(|err| {
27 kbolt_types::KboltError::InvalidInput(format!("non-utf8 markdown input: {err}"))
28 })?;
29
30 let mut blocks = Vec::new();
31 let mut heading_stack: Vec<String> = Vec::new();
32 let mut open_blocks: Vec<OpenBlock> = Vec::new();
33 let mut title: Option<String> = None;
34 let parser = Parser::new_ext(source, Options::all());
35
36 for (event, range) in parser.into_offset_iter() {
37 match event {
38 Event::Start(tag) => {
39 if let Some(open) =
40 open_block_for_tag(&tag, range.start, &heading_stack, &open_blocks)
41 {
42 open_blocks.push(open);
43 }
44 }
45 Event::End(tag_end) => {
46 let Some(index) = open_blocks
47 .iter()
48 .rposition(|open| open.matches_end(&tag_end))
49 else {
50 continue;
51 };
52 let open = open_blocks.remove(index);
53 let exclude_end = range.end.min(source.len());
54 for parent in &mut open_blocks {
55 if parent.start <= open.start {
56 parent.excluded_ranges.push(open.start..exclude_end);
57 }
58 }
59
60 let span_end = trim_trailing_newlines(source, open.start, range.end);
61 if span_end <= open.start {
62 continue;
63 }
64
65 let text = block_text(source, open.start, span_end, &open.excluded_ranges);
66 if text.trim().is_empty() {
67 continue;
68 }
69
70 if let OpenKind::Heading(level) = open.kind {
71 let heading = extract_heading_label(text.as_str());
72 if !heading.is_empty() {
73 apply_heading(&mut heading_stack, level, heading.clone());
74 if title.is_none() {
75 title = Some(heading);
76 }
77 }
78 }
79
80 let length = text.len();
81 blocks.push(ExtractedBlock {
82 text,
83 offset: open.start,
84 length,
85 kind: open.block_kind,
86 heading_path: open.heading_path,
87 attrs: open.attrs,
88 });
89 }
90 _ => {}
91 }
92 }
93
94 blocks.sort_by_key(|block| block.offset);
95
96 Ok(ExtractedDocument {
97 blocks,
98 metadata: HashMap::new(),
99 title,
100 })
101 }
102}
103
/// A block whose start tag has been seen but whose end tag has not yet
/// arrived.
#[derive(Debug, Clone)]
struct OpenBlock {
    /// Internal kind used to pair this block with its closing tag.
    kind: OpenKind,
    /// Kind reported on the emitted block.
    block_kind: BlockKind,
    /// Byte offset in the source at which the start tag's range begins.
    start: usize,
    /// Copy of the heading stack taken when the block was opened.
    heading_path: Vec<String>,
    /// Extra attributes; only code fences populate this (key `language`).
    attrs: HashMap<String, String>,
    /// Source byte ranges of already-closed child blocks, left out of this
    /// block's text.
    excluded_ranges: Vec<Range<usize>>,
}
113
114impl OpenBlock {
115 fn matches_end(&self, end: &TagEnd) -> bool {
116 match (&self.kind, end) {
117 (OpenKind::Heading(level), TagEnd::Heading(end_level)) => {
118 *level == heading_level(end_level)
119 }
120 (OpenKind::Paragraph, TagEnd::Paragraph) => true,
121 (OpenKind::ListItem, TagEnd::Item) => true,
122 (OpenKind::BlockQuote, TagEnd::BlockQuote(_)) => true,
123 (OpenKind::CodeFence, TagEnd::CodeBlock) => true,
124 (OpenKind::TableHeader, TagEnd::TableHead) => true,
125 (OpenKind::TableRow, TagEnd::TableRow) => true,
126 (OpenKind::HtmlBlock, TagEnd::HtmlBlock) => true,
127 _ => false,
128 }
129 }
130}
131
/// Kinds of Markdown elements that are tracked while open and emitted as
/// blocks when they close.
#[derive(Debug, Clone, Copy)]
enum OpenKind {
    /// Heading with its numeric level (1 through 6).
    Heading(usize),
    Paragraph,
    ListItem,
    BlockQuote,
    CodeFence,
    TableHeader,
    TableRow,
    HtmlBlock,
}
143
144fn open_block_for_tag(
145 tag: &Tag<'_>,
146 start: usize,
147 heading_path: &[String],
148 open_blocks: &[OpenBlock],
149) -> Option<OpenBlock> {
150 let (kind, block_kind, attrs) = match tag {
151 Tag::Heading { level, .. } => (
152 OpenKind::Heading(heading_level(level)),
153 BlockKind::Heading,
154 HashMap::new(),
155 ),
156 Tag::Paragraph if inside_list_or_quote(open_blocks) => return None,
157 Tag::Paragraph => (OpenKind::Paragraph, BlockKind::Paragraph, HashMap::new()),
158 Tag::Item => (OpenKind::ListItem, BlockKind::ListItem, HashMap::new()),
159 Tag::BlockQuote(_) => (OpenKind::BlockQuote, BlockKind::BlockQuote, HashMap::new()),
160 Tag::CodeBlock(kind) => {
161 let mut attrs = HashMap::new();
162 if let CodeBlockKind::Fenced(info) = kind {
163 if let Some(language) = info.split_whitespace().next() {
164 if !language.is_empty() {
165 attrs.insert("language".to_string(), language.to_string());
166 }
167 }
168 }
169 (OpenKind::CodeFence, BlockKind::CodeFence, attrs)
170 }
171 Tag::TableHead => (
172 OpenKind::TableHeader,
173 BlockKind::TableHeader,
174 HashMap::new(),
175 ),
176 Tag::TableRow => (OpenKind::TableRow, BlockKind::TableRow, HashMap::new()),
177 Tag::HtmlBlock => (OpenKind::HtmlBlock, BlockKind::HtmlBlock, HashMap::new()),
178 _ => return None,
179 };
180
181 Some(OpenBlock {
182 kind,
183 block_kind,
184 start,
185 heading_path: heading_path.to_vec(),
186 attrs,
187 excluded_ranges: Vec::new(),
188 })
189}
190
191fn inside_list_or_quote(open_blocks: &[OpenBlock]) -> bool {
192 open_blocks
193 .iter()
194 .any(|open| matches!(open.kind, OpenKind::ListItem | OpenKind::BlockQuote))
195}
196
197fn heading_level(level: &HeadingLevel) -> usize {
198 match level {
199 HeadingLevel::H1 => 1,
200 HeadingLevel::H2 => 2,
201 HeadingLevel::H3 => 3,
202 HeadingLevel::H4 => 4,
203 HeadingLevel::H5 => 5,
204 HeadingLevel::H6 => 6,
205 }
206}
207
/// Updates the heading stack for a newly seen heading.
///
/// Entries at depth `level` or deeper are discarded, then `heading` is
/// pushed, so after the call the stack holds at most `level` entries and
/// ends with `heading`.
///
/// A `level` of 0 is treated like level 1 (the stack is cleared before the
/// push) instead of looping forever on an empty stack.
///
/// Known limitation: the stack stores labels only, so when a document skips
/// levels (for example H1 followed directly by H3) the stack depth no longer
/// matches the heading level, and a later sibling H3 is pushed beneath the
/// earlier H3 rather than replacing it.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}
214
/// Derives the plain label of a heading from its raw Markdown text.
///
/// Only the first line is considered, which also covers setext headings
/// whose underline sits on the second line. An ATX opening run of `#` is
/// removed only when it is followed by whitespace (or ends the line), and a
/// closing run of `#` only when it is preceded by whitespace, as CommonMark
/// requires. Headings such as `# C#` therefore keep their trailing `#`, and
/// a setext heading like `#tag` keeps its leading `#`.
///
/// If nothing remains after stripping (for example the line is just `#`),
/// the trimmed first line is returned unchanged.
fn extract_heading_label(raw_markdown: &str) -> String {
    let line = raw_markdown.lines().next().unwrap_or("").trim();

    // `line` is already trimmed, so leading whitespace after the hashes can
    // only appear if at least one `#` was actually removed.
    let after_opener = line.trim_start_matches('#');
    let body = if after_opener.is_empty() || after_opener.starts_with(char::is_whitespace) {
        after_opener.trim_start()
    } else {
        line
    };

    // A trailing `#` run counts as a closing sequence only when separated
    // from the content by whitespace (or when there is no content at all).
    let before_closer = body.trim_end_matches('#');
    let label = if before_closer.is_empty() || before_closer.ends_with(char::is_whitespace) {
        before_closer.trim_end()
    } else {
        body
    };

    if label.is_empty() {
        line.to_string()
    } else {
        label.to_string()
    }
}
229
/// Returns `end` (clamped to the source length) moved backwards past any
/// trailing `\n` / `\r` bytes, never going below `start`.
///
/// When the clamped end is already at or before `start`, it is returned
/// as is.
fn trim_trailing_newlines(source: &str, start: usize, end: usize) -> usize {
    let upper = end.min(source.len());
    if upper <= start {
        return upper;
    }

    let trailing = source.as_bytes()[start..upper]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\n' || byte == b'\r')
        .count();
    upper - trailing
}
238
/// Returns the text of `source[start..end]` with every excluded range
/// removed.
///
/// With no excluded ranges the slice is returned verbatim. Otherwise the
/// ranges are clamped to `start..end`, overlapping ranges are tolerated, the
/// remaining pieces are concatenated in source order, and trailing spaces,
/// tabs and line endings are stripped from the result.
///
/// # Panics
///
/// Panics if a slice boundary is out of range or not on a UTF-8 character
/// boundary.
fn block_text(source: &str, start: usize, end: usize, excluded_ranges: &[Range<usize>]) -> String {
    if excluded_ranges.is_empty() {
        return source[start..end].to_owned();
    }

    // Clamp each hole into the block's span and drop the ones that vanish.
    let mut holes: Vec<Range<usize>> = Vec::with_capacity(excluded_ranges.len());
    for excluded in excluded_ranges {
        let lo = excluded.start.max(start).min(end);
        let hi = excluded.end.max(start).min(end);
        if lo < hi {
            holes.push(lo..hi);
        }
    }
    holes.sort_by_key(|hole| hole.start);

    let mut kept = String::new();
    let mut resume = start;
    for hole in &holes {
        if resume < hole.start {
            kept.push_str(&source[resume..hole.start]);
        }
        // `max` keeps the cursor from moving backwards on overlapping holes.
        resume = resume.max(hole.end);
    }
    if resume < end {
        kept.push_str(&source[resume..end]);
    }

    let trimmed_len = kept
        .trim_end_matches(|c: char| matches!(c, ' ' | '\t' | '\n' | '\r'))
        .len();
    kept.truncate(trimmed_len);
    kept
}
268
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::markdown::MarkdownExtractor;

    // Paragraphs carry the chain of headings that precede them, and the first
    // heading becomes the document title.
    #[test]
    fn extracts_heading_paths_for_nested_sections() {
        let extractor = MarkdownExtractor;
        assert_eq!(extractor.profile_key(), "md");
        let markdown = br#"# Title
Intro paragraph.

## Details
More text.
"#;

        let doc = extractor
            .extract(Path::new("docs/readme.md"), markdown)
            .expect("extract markdown");

        assert_eq!(doc.title.as_deref(), Some("Title"));
        assert!(
            doc.blocks
                .iter()
                .any(|block| block.kind == BlockKind::Heading),
            "expected heading blocks"
        );
        assert!(
            doc.blocks.iter().any(|block| {
                block.kind == BlockKind::Paragraph
                    && block.heading_path == vec!["Title".to_string(), "Details".to_string()]
            }),
            "expected paragraph to carry nested heading path"
        );
    }

    // List items, block quotes and fenced code are emitted as their own block
    // kinds; the fence's info string supplies the `language` attribute.
    #[test]
    fn emits_list_quote_and_code_blocks_with_attrs() {
        let extractor = MarkdownExtractor;
        let markdown = br#"# Guide
- first item

> quoted text

```rust
fn main() {}
```
"#;

        let doc = extractor
            .extract(Path::new("docs/guide.md"), markdown)
            .expect("extract markdown");

        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::ListItem));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::BlockQuote));
        let code = doc
            .blocks
            .iter()
            .find(|block| block.kind == BlockKind::CodeFence)
            .expect("code fence block");
        assert_eq!(code.attrs.get("language").map(String::as_str), Some("rust"));
    }

    // A nested list item must appear once, as its own block, and not again
    // inside its parent's text.
    #[test]
    fn nested_list_items_do_not_duplicate_child_text() {
        let extractor = MarkdownExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/list.md"),
                br#"- parent listtarget
  - child nestedtarget
"#,
            )
            .expect("extract markdown");

        let list_items = doc
            .blocks
            .iter()
            .filter(|block| block.kind == BlockKind::ListItem)
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>();
        assert_eq!(
            list_items,
            vec!["- parent listtarget", "- child nestedtarget"]
        );

        let canonical = list_items.join("\n\n");
        assert_eq!(canonical.matches("nestedtarget").count(), 1);
        assert!(!canonical.contains("listtarget\n - child"));
        // `length` describes the emitted text, not the original source span.
        assert!(doc
            .blocks
            .iter()
            .all(|block| block.length == block.text.len()));
    }

    // Input that is not valid UTF-8 is rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_markdown_bytes() {
        let extractor = MarkdownExtractor;
        let err = extractor
            .extract(Path::new("docs/readme.md"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 markdown input"));
    }
}