// kbolt_core/ingest/markdown.rs

use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;

use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};

use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
use crate::Result;

/// Stateless [`Extractor`] implementation that splits Markdown sources into blocks.
pub struct MarkdownExtractor;

12impl Extractor for MarkdownExtractor {
13    fn supports(&self) -> &[&str] {
14        &["md", "markdown", "mdown", "mkd"]
15    }
16
17    fn profile_key(&self) -> &'static str {
18        "md"
19    }
20
21    fn version(&self) -> u32 {
22        2
23    }
24
25    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
26        let source = std::str::from_utf8(bytes).map_err(|err| {
27            kbolt_types::KboltError::InvalidInput(format!("non-utf8 markdown input: {err}"))
28        })?;
29
30        let mut blocks = Vec::new();
31        let mut heading_stack: Vec<String> = Vec::new();
32        let mut open_blocks: Vec<OpenBlock> = Vec::new();
33        let mut title: Option<String> = None;
34        let parser = Parser::new_ext(source, Options::all());
35
36        for (event, range) in parser.into_offset_iter() {
37            match event {
38                Event::Start(tag) => {
39                    if let Some(open) =
40                        open_block_for_tag(&tag, range.start, &heading_stack, &open_blocks)
41                    {
42                        open_blocks.push(open);
43                    }
44                }
45                Event::End(tag_end) => {
46                    let Some(index) = open_blocks
47                        .iter()
48                        .rposition(|open| open.matches_end(&tag_end))
49                    else {
50                        continue;
51                    };
52                    let open = open_blocks.remove(index);
53                    let exclude_end = range.end.min(source.len());
54                    for parent in &mut open_blocks {
55                        if parent.start <= open.start {
56                            parent.excluded_ranges.push(open.start..exclude_end);
57                        }
58                    }
59
60                    let span_end = trim_trailing_newlines(source, open.start, range.end);
61                    if span_end <= open.start {
62                        continue;
63                    }
64
65                    let text = block_text(source, open.start, span_end, &open.excluded_ranges);
66                    if text.trim().is_empty() {
67                        continue;
68                    }
69
70                    if let OpenKind::Heading(level) = open.kind {
71                        let heading = extract_heading_label(text.as_str());
72                        if !heading.is_empty() {
73                            apply_heading(&mut heading_stack, level, heading.clone());
74                            if title.is_none() {
75                                title = Some(heading);
76                            }
77                        }
78                    }
79
80                    let length = text.len();
81                    blocks.push(ExtractedBlock {
82                        text,
83                        offset: open.start,
84                        length,
85                        kind: open.block_kind,
86                        heading_path: open.heading_path,
87                        attrs: open.attrs,
88                    });
89                }
90                _ => {}
91            }
92        }
93
94        blocks.sort_by_key(|block| block.offset);
95
96        Ok(ExtractedDocument {
97            blocks,
98            metadata: HashMap::new(),
99            title,
100        })
101    }
102}
103
104#[derive(Debug, Clone)]
105struct OpenBlock {
106    kind: OpenKind,
107    block_kind: BlockKind,
108    start: usize,
109    heading_path: Vec<String>,
110    attrs: HashMap<String, String>,
111    excluded_ranges: Vec<Range<usize>>,
112}
113
114impl OpenBlock {
115    fn matches_end(&self, end: &TagEnd) -> bool {
116        match (&self.kind, end) {
117            (OpenKind::Heading(level), TagEnd::Heading(end_level)) => {
118                *level == heading_level(end_level)
119            }
120            (OpenKind::Paragraph, TagEnd::Paragraph) => true,
121            (OpenKind::ListItem, TagEnd::Item) => true,
122            (OpenKind::BlockQuote, TagEnd::BlockQuote(_)) => true,
123            (OpenKind::CodeFence, TagEnd::CodeBlock) => true,
124            (OpenKind::TableHeader, TagEnd::TableHead) => true,
125            (OpenKind::TableRow, TagEnd::TableRow) => true,
126            (OpenKind::HtmlBlock, TagEnd::HtmlBlock) => true,
127            _ => false,
128        }
129    }
130}
131
/// The kinds of Markdown element that are tracked while open and later emitted
/// as blocks.
#[derive(Debug, Clone, Copy)]
enum OpenKind {
    /// Heading, with its numeric level (1 for `H1` through 6 for `H6`).
    Heading(usize),
    Paragraph,
    ListItem,
    BlockQuote,
    /// Any code block; only fenced ones can carry a `language` attribute.
    CodeFence,
    TableHeader,
    TableRow,
    HtmlBlock,
}

144fn open_block_for_tag(
145    tag: &Tag<'_>,
146    start: usize,
147    heading_path: &[String],
148    open_blocks: &[OpenBlock],
149) -> Option<OpenBlock> {
150    let (kind, block_kind, attrs) = match tag {
151        Tag::Heading { level, .. } => (
152            OpenKind::Heading(heading_level(level)),
153            BlockKind::Heading,
154            HashMap::new(),
155        ),
156        Tag::Paragraph if inside_list_or_quote(open_blocks) => return None,
157        Tag::Paragraph => (OpenKind::Paragraph, BlockKind::Paragraph, HashMap::new()),
158        Tag::Item => (OpenKind::ListItem, BlockKind::ListItem, HashMap::new()),
159        Tag::BlockQuote(_) => (OpenKind::BlockQuote, BlockKind::BlockQuote, HashMap::new()),
160        Tag::CodeBlock(kind) => {
161            let mut attrs = HashMap::new();
162            if let CodeBlockKind::Fenced(info) = kind {
163                if let Some(language) = info.split_whitespace().next() {
164                    if !language.is_empty() {
165                        attrs.insert("language".to_string(), language.to_string());
166                    }
167                }
168            }
169            (OpenKind::CodeFence, BlockKind::CodeFence, attrs)
170        }
171        Tag::TableHead => (
172            OpenKind::TableHeader,
173            BlockKind::TableHeader,
174            HashMap::new(),
175        ),
176        Tag::TableRow => (OpenKind::TableRow, BlockKind::TableRow, HashMap::new()),
177        Tag::HtmlBlock => (OpenKind::HtmlBlock, BlockKind::HtmlBlock, HashMap::new()),
178        _ => return None,
179    };
180
181    Some(OpenBlock {
182        kind,
183        block_kind,
184        start,
185        heading_path: heading_path.to_vec(),
186        attrs,
187        excluded_ranges: Vec::new(),
188    })
189}
190
191fn inside_list_or_quote(open_blocks: &[OpenBlock]) -> bool {
192    open_blocks
193        .iter()
194        .any(|open| matches!(open.kind, OpenKind::ListItem | OpenKind::BlockQuote))
195}
196
197fn heading_level(level: &HeadingLevel) -> usize {
198    match level {
199        HeadingLevel::H1 => 1,
200        HeadingLevel::H2 => 2,
201        HeadingLevel::H3 => 3,
202        HeadingLevel::H4 => 4,
203        HeadingLevel::H5 => 5,
204        HeadingLevel::H6 => 6,
205    }
206}
207
/// Updates the heading breadcrumb `stack` for a newly seen heading.
///
/// Keeps at most `level - 1` existing entries (the potential ancestors) and
/// then pushes `heading` as the innermost entry.
///
/// The stack is positional: it stores labels only, not their levels. When a
/// document skips levels (for example `H1` followed directly by `H3`), the `H3`
/// label sits at depth 2, and a following sibling `H3` is pushed beneath it
/// rather than replacing it.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    // `saturating_sub` makes level 0 behave like a top-level heading; a
    // `while stack.len() >= level` pop loop would never terminate for it.
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}

/// Extracts the human-readable label from the raw Markdown of a heading.
///
/// Only the first line of `raw_markdown` is considered. For an ATX heading
/// (1–6 leading `#` followed by a space, a tab, or the end of the line) the
/// opening marker is removed, and a trailing run of `#` is removed only when it
/// is preceded by whitespace, so hashes that are part of the text (`# C#`)
/// are kept. Any other first line (e.g. the text line of a setext heading) is
/// returned trimmed but otherwise untouched.
///
/// If stripping the markers leaves nothing, the trimmed first line itself is
/// returned (so `"#"` yields `"#"`).
fn extract_heading_label(raw_markdown: &str) -> String {
    let line = raw_markdown.lines().next().unwrap_or("").trim();

    // `#` is a single byte, so the count is also a valid slice index.
    let marker_len = line.bytes().take_while(|&byte| byte == b'#').count();
    let after_marker = &line[marker_len..];
    let is_atx = (1..=6).contains(&marker_len)
        && (after_marker.is_empty() || after_marker.starts_with([' ', '\t']));
    if !is_atx {
        return line.to_string();
    }

    let content = after_marker.trim();
    // A closing sequence only counts when whitespace separates it from the text
    // (or when the content is nothing but hashes).
    let without_closing = content.trim_end_matches('#');
    let label = if without_closing.is_empty() || without_closing.ends_with([' ', '\t']) {
        without_closing.trim_end()
    } else {
        content
    };

    if label.is_empty() {
        line.to_string()
    } else {
        label.to_string()
    }
}

/// Returns `end` moved left past any trailing `\n` / `\r` bytes of `source`.
///
/// `end` is first clamped to `source.len()`. The result never moves below
/// `start`, except that it equals the clamped `end` when that is already at or
/// below `start`.
fn trim_trailing_newlines(source: &str, start: usize, end: usize) -> usize {
    let bytes = source.as_bytes();
    let mut cursor = end.min(bytes.len());
    // Only ASCII bytes are skipped, so a cursor that started on a char boundary
    // stays on one.
    while cursor > start && matches!(bytes[cursor - 1], b'\n' | b'\r') {
        cursor -= 1;
    }
    cursor
}

/// Returns `source[start..end]` with every range in `excluded_ranges` cut out.
///
/// Excluded ranges are clamped to `start..end`, may overlap, and may come in
/// any order. When at least one excluded range is supplied, trailing spaces,
/// tabs and line breaks are also trimmed from the result; with no excluded
/// ranges the slice is returned verbatim.
///
/// # Panics
///
/// Panics if a slice boundary is out of range or not on a UTF-8 character
/// boundary of `source`.
fn block_text(source: &str, start: usize, end: usize, excluded_ranges: &[Range<usize>]) -> String {
    if excluded_ranges.is_empty() {
        return source[start..end].to_string();
    }

    // Restrict each excluded range to the block's own span and drop the ones
    // that end up empty.
    let clamp = |offset: usize| offset.max(start).min(end);
    let mut holes = excluded_ranges
        .iter()
        .map(|range| clamp(range.start)..clamp(range.end))
        .filter(|hole| hole.start < hole.end)
        .collect::<Vec<_>>();
    holes.sort_by_key(|hole| hole.start);

    let mut text = String::new();
    let mut cursor = start;
    for hole in holes {
        if hole.start > cursor {
            text.push_str(&source[cursor..hole.start]);
        }
        // `max` keeps the cursor from moving backwards when holes overlap.
        cursor = cursor.max(hole.end);
    }
    if cursor < end {
        text.push_str(&source[cursor..end]);
    }

    text.trim_end_matches([' ', '\t', '\n', '\r']).to_string()
}

/// Tests that run `MarkdownExtractor::extract` over small Markdown fixtures.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::markdown::MarkdownExtractor;

    /// A paragraph under `## Details` carries both enclosing heading labels.
    #[test]
    fn extracts_heading_paths_for_nested_sections() {
        let extractor = MarkdownExtractor;
        assert_eq!(extractor.profile_key(), "md");
        let markdown = br#"# Title
Intro paragraph.

## Details
More text.
"#;

        let doc = extractor
            .extract(Path::new("docs/readme.md"), markdown)
            .expect("extract markdown");

        assert_eq!(doc.title.as_deref(), Some("Title"));
        assert!(
            doc.blocks
                .iter()
                .any(|block| block.kind == BlockKind::Heading),
            "expected heading blocks"
        );
        assert!(
            doc.blocks.iter().any(|block| {
                block.kind == BlockKind::Paragraph
                    && block.heading_path == vec!["Title".to_string(), "Details".to_string()]
            }),
            "expected paragraph to carry nested heading path"
        );
    }

    /// List items, block quotes and fenced code are emitted, and the fence
    /// records its language.
    #[test]
    fn emits_list_quote_and_code_blocks_with_attrs() {
        let extractor = MarkdownExtractor;
        let markdown = br#"# Guide
- first item

> quoted text

```rust
fn main() {}
```
"#;

        let doc = extractor
            .extract(Path::new("docs/guide.md"), markdown)
            .expect("extract markdown");

        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::ListItem));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::BlockQuote));
        let code = doc
            .blocks
            .iter()
            .find(|block| block.kind == BlockKind::CodeFence)
            .expect("code fence block");
        assert_eq!(code.attrs.get("language").map(String::as_str), Some("rust"));
    }

    /// A nested list item's text appears only in its own block, not in the parent's.
    #[test]
    fn nested_list_items_do_not_duplicate_child_text() {
        let extractor = MarkdownExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/list.md"),
                br#"- parent listtarget
  - child nestedtarget
"#,
            )
            .expect("extract markdown");

        let list_items = doc
            .blocks
            .iter()
            .filter(|block| block.kind == BlockKind::ListItem)
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>();
        assert_eq!(
            list_items,
            vec!["- parent listtarget", "- child nestedtarget"]
        );

        let canonical = list_items.join("\n\n");
        assert_eq!(canonical.matches("nestedtarget").count(), 1);
        assert!(!canonical.contains("listtarget\n  - child"));
        assert!(doc
            .blocks
            .iter()
            .all(|block| block.length == block.text.len()));
    }

    /// Input that is not valid UTF-8 is rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_markdown_bytes() {
        let extractor = MarkdownExtractor;
        let err = extractor
            .extract(Path::new("docs/readme.md"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 markdown input"));
    }
}