// kbolt_core/ingest/markdown.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};
5
6use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
7use crate::Result;
8
/// Stateless extractor that splits Markdown source into structured blocks.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// either the unit literal `MarkdownExtractor` or `Default::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownExtractor;
10
11impl Extractor for MarkdownExtractor {
12    fn supports(&self) -> &[&str] {
13        &["md", "markdown", "mdown", "mkd"]
14    }
15
16    fn profile_key(&self) -> &'static str {
17        "md"
18    }
19
20    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
21        let source = std::str::from_utf8(bytes).map_err(|err| {
22            kbolt_types::KboltError::InvalidInput(format!("non-utf8 markdown input: {err}"))
23        })?;
24
25        let mut blocks = Vec::new();
26        let mut heading_stack: Vec<String> = Vec::new();
27        let mut open_blocks: Vec<OpenBlock> = Vec::new();
28        let mut title: Option<String> = None;
29        let parser = Parser::new_ext(source, Options::all());
30
31        for (event, range) in parser.into_offset_iter() {
32            match event {
33                Event::Start(tag) => {
34                    if let Some(open) =
35                        open_block_for_tag(&tag, range.start, &heading_stack, &open_blocks)
36                    {
37                        open_blocks.push(open);
38                    }
39                }
40                Event::End(tag_end) => {
41                    let Some(index) = open_blocks
42                        .iter()
43                        .rposition(|open| open.matches_end(&tag_end))
44                    else {
45                        continue;
46                    };
47                    let open = open_blocks.remove(index);
48                    let span_end = trim_trailing_newlines(source, open.start, range.end);
49                    if span_end <= open.start {
50                        continue;
51                    }
52
53                    let text = source[open.start..span_end].to_string();
54                    if text.trim().is_empty() {
55                        continue;
56                    }
57
58                    if let OpenKind::Heading(level) = open.kind {
59                        let heading = extract_heading_label(text.as_str());
60                        if !heading.is_empty() {
61                            apply_heading(&mut heading_stack, level, heading.clone());
62                            if title.is_none() {
63                                title = Some(heading);
64                            }
65                        }
66                    }
67
68                    blocks.push(ExtractedBlock {
69                        text,
70                        offset: open.start,
71                        length: span_end.saturating_sub(open.start),
72                        kind: open.block_kind,
73                        heading_path: open.heading_path,
74                        attrs: open.attrs,
75                    });
76                }
77                _ => {}
78            }
79        }
80
81        Ok(ExtractedDocument {
82            blocks,
83            metadata: HashMap::new(),
84            title,
85        })
86    }
87}
88
89#[derive(Debug, Clone)]
90struct OpenBlock {
91    kind: OpenKind,
92    block_kind: BlockKind,
93    start: usize,
94    heading_path: Vec<String>,
95    attrs: HashMap<String, String>,
96}
97
98impl OpenBlock {
99    fn matches_end(&self, end: &TagEnd) -> bool {
100        match (&self.kind, end) {
101            (OpenKind::Heading(level), TagEnd::Heading(end_level)) => {
102                *level == heading_level(end_level)
103            }
104            (OpenKind::Paragraph, TagEnd::Paragraph) => true,
105            (OpenKind::ListItem, TagEnd::Item) => true,
106            (OpenKind::BlockQuote, TagEnd::BlockQuote(_)) => true,
107            (OpenKind::CodeFence, TagEnd::CodeBlock) => true,
108            (OpenKind::TableHeader, TagEnd::TableHead) => true,
109            (OpenKind::TableRow, TagEnd::TableRow) => true,
110            (OpenKind::HtmlBlock, TagEnd::HtmlBlock) => true,
111            _ => false,
112        }
113    }
114}
115
/// Element kinds tracked while their start tag is still awaiting an end tag.
#[derive(Clone, Copy, Debug)]
enum OpenKind {
    /// Heading together with its numeric level.
    Heading(usize),
    /// Paragraph.
    Paragraph,
    /// Single list item.
    ListItem,
    /// Block quote.
    BlockQuote,
    /// Code block.
    CodeFence,
    /// Header section of a table.
    TableHeader,
    /// Table row.
    TableRow,
    /// Raw HTML block.
    HtmlBlock,
}
127
128fn open_block_for_tag(
129    tag: &Tag<'_>,
130    start: usize,
131    heading_path: &[String],
132    open_blocks: &[OpenBlock],
133) -> Option<OpenBlock> {
134    let (kind, block_kind, attrs) = match tag {
135        Tag::Heading { level, .. } => (
136            OpenKind::Heading(heading_level(level)),
137            BlockKind::Heading,
138            HashMap::new(),
139        ),
140        Tag::Paragraph if inside_list_or_quote(open_blocks) => return None,
141        Tag::Paragraph => (OpenKind::Paragraph, BlockKind::Paragraph, HashMap::new()),
142        Tag::Item => (OpenKind::ListItem, BlockKind::ListItem, HashMap::new()),
143        Tag::BlockQuote(_) => (OpenKind::BlockQuote, BlockKind::BlockQuote, HashMap::new()),
144        Tag::CodeBlock(kind) => {
145            let mut attrs = HashMap::new();
146            if let CodeBlockKind::Fenced(info) = kind {
147                if let Some(language) = info.split_whitespace().next() {
148                    if !language.is_empty() {
149                        attrs.insert("language".to_string(), language.to_string());
150                    }
151                }
152            }
153            (OpenKind::CodeFence, BlockKind::CodeFence, attrs)
154        }
155        Tag::TableHead => (
156            OpenKind::TableHeader,
157            BlockKind::TableHeader,
158            HashMap::new(),
159        ),
160        Tag::TableRow => (OpenKind::TableRow, BlockKind::TableRow, HashMap::new()),
161        Tag::HtmlBlock => (OpenKind::HtmlBlock, BlockKind::HtmlBlock, HashMap::new()),
162        _ => return None,
163    };
164
165    Some(OpenBlock {
166        kind,
167        block_kind,
168        start,
169        heading_path: heading_path.to_vec(),
170        attrs,
171    })
172}
173
174fn inside_list_or_quote(open_blocks: &[OpenBlock]) -> bool {
175    open_blocks
176        .iter()
177        .any(|open| matches!(open.kind, OpenKind::ListItem | OpenKind::BlockQuote))
178}
179
180fn heading_level(level: &HeadingLevel) -> usize {
181    match level {
182        HeadingLevel::H1 => 1,
183        HeadingLevel::H2 => 2,
184        HeadingLevel::H3 => 3,
185        HeadingLevel::H4 => 4,
186        HeadingLevel::H5 => 5,
187        HeadingLevel::H6 => 6,
188    }
189}
190
/// Places `heading` at position `level` (1-based) of the heading stack.
///
/// Every entry at depth `level` or deeper is discarded first, then `heading`
/// is pushed. If the stack is shallower than `level - 1`, nothing is discarded
/// and the heading is simply appended. A `level` of 0 is treated like 1: the
/// stack is cleared and `heading` becomes its only entry.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    // `saturating_sub` keeps level 0 well-defined; a `while len >= level` pop
    // loop would never terminate for it.
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}
197
/// Derives the heading label from the raw Markdown of a heading block.
///
/// Only the first line is considered. When that line is an ATX heading (one to
/// six `#` followed by whitespace or end of line) the opening marker is
/// removed, along with an optional closing run of `#` that is separated from
/// the text by whitespace. `#` characters that belong to the text itself, as in
/// `# C#` or a setext heading starting with `#tag`, are preserved.
///
/// Returns the trimmed first line unchanged when it is not an ATX heading or
/// when stripping the markers would leave nothing.
fn extract_heading_label(raw_markdown: &str) -> String {
    let line = raw_markdown.lines().next().unwrap_or("").trim();

    // `#` is ASCII, so the byte count is also a valid slice boundary.
    let marker_len = line.bytes().take_while(|byte| *byte == b'#').count();
    let after_marker = &line[marker_len..];
    let is_atx = (1..=6).contains(&marker_len)
        && (after_marker.is_empty() || after_marker.starts_with(char::is_whitespace));
    if !is_atx {
        return line.to_string();
    }

    let body = after_marker.trim();
    let without_closing = body.trim_end_matches('#');
    let has_closing_run = without_closing.len() != body.len();
    let label = if has_closing_run
        && (without_closing.is_empty() || without_closing.ends_with(char::is_whitespace))
    {
        without_closing.trim_end()
    } else {
        // Trailing `#` glued to the text is part of the label.
        body
    };

    if label.is_empty() {
        line.to_string()
    } else {
        label.to_string()
    }
}
212
/// Returns the end offset of `source[start..end]` once trailing `\n` and `\r`
/// bytes are excluded.
///
/// `end` is clamped to the length of `source`. The result never drops below
/// `start` unless the clamped `end` already was, in which case the clamped
/// value is returned as is.
fn trim_trailing_newlines(source: &str, start: usize, end: usize) -> usize {
    let limit = end.min(source.len());
    if limit <= start {
        return limit;
    }

    let window = &source.as_bytes()[start..limit];
    let kept = window
        .iter()
        .rposition(|byte| !matches!(byte, b'\n' | b'\r'))
        .map_or(0, |last| last + 1);
    start + kept
}
221
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::markdown::MarkdownExtractor;

    /// A paragraph under `## Details` must carry both enclosing headings.
    #[test]
    fn extracts_heading_paths_for_nested_sections() {
        let extractor = MarkdownExtractor;
        assert_eq!(extractor.profile_key(), "md");

        let markdown = br#"# Title
Intro paragraph.

## Details
More text.
"#;
        let doc = extractor
            .extract(Path::new("docs/readme.md"), markdown)
            .expect("extract markdown");

        assert_eq!(doc.title.as_deref(), Some("Title"));

        let heading_count = doc
            .blocks
            .iter()
            .filter(|block| block.kind == BlockKind::Heading)
            .count();
        assert!(heading_count > 0, "expected heading blocks");

        let expected_path = vec!["Title".to_string(), "Details".to_string()];
        let nested_paragraph = doc.blocks.iter().find(|block| {
            block.kind == BlockKind::Paragraph && block.heading_path == expected_path
        });
        assert!(
            nested_paragraph.is_some(),
            "expected paragraph to carry nested heading path"
        );
    }

    /// List items, block quotes and fenced code are emitted, and the fence
    /// language is exposed through `attrs`.
    #[test]
    fn emits_list_quote_and_code_blocks_with_attrs() {
        let markdown = br#"# Guide
- first item

> quoted text

```rust
fn main() {}
```
"#;
        let doc = MarkdownExtractor
            .extract(Path::new("docs/guide.md"), markdown)
            .expect("extract markdown");

        let has_kind = |kind: BlockKind| doc.blocks.iter().any(|block| block.kind == kind);
        assert!(has_kind(BlockKind::ListItem));
        assert!(has_kind(BlockKind::BlockQuote));

        let code = doc
            .blocks
            .iter()
            .find(|block| block.kind == BlockKind::CodeFence)
            .expect("code fence block");
        let language = code.attrs.get("language").map(String::as_str);
        assert_eq!(language, Some("rust"));
    }

    /// Bytes that are not valid UTF-8 are rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_markdown_bytes() {
        let invalid: [u8; 3] = [0xff, 0xfe, 0xfd];
        let err = MarkdownExtractor
            .extract(Path::new("docs/readme.md"), &invalid)
            .expect_err("invalid utf8 should fail");
        let message = err.to_string();
        assert!(message.contains("non-utf8 markdown input"));
    }
}