Skip to main content

kardo_core/parser/
markdown.rs

//! Markdown parsing via `pulldown-cmark`.
//!
//! Extracts headings, links, code blocks, word count, list count, and YAML
//! frontmatter keys from Markdown content.
4
5use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
6use serde::Serialize;
7
8#[derive(Debug, Clone, Serialize)]
9pub struct Heading {
10    pub level: u8,
11    pub text: String,
12    /// Slug-style anchor derived from heading text (e.g. "my-heading").
13    pub anchor: String,
14}
15
16#[derive(Debug, Clone, Serialize)]
17pub struct Link {
18    pub text: String,
19    pub url: String,
20    /// `true` if the link points to a relative file path (not http/https, not anchor).
21    pub is_internal: bool,
22    /// Whether the target file actually exists on disk. Set to `None` by the parser;
23    /// resolved later by the integrity checker.
24    pub target_exists: Option<bool>,
25}
26
27#[derive(Debug, Clone, Serialize)]
28pub struct CodeBlock {
29    pub language: Option<String>,
30    pub line_count: usize,
31}
32
33#[derive(Debug, Clone, Serialize)]
34pub struct ParsedDocument {
35    pub headings: Vec<Heading>,
36    pub links: Vec<Link>,
37    pub code_blocks: Vec<CodeBlock>,
38    pub list_count: usize,
39    pub word_count: usize,
40    pub has_frontmatter: bool,
41    pub frontmatter_keys: Vec<String>,
42}
43
44/// Parse Markdown content into structured data.
45///
46/// Extracts headings (with hierarchy level and anchor), links (internal/external),
47/// code blocks (with language and line count), list count, and word count.
48/// Frontmatter keys are extracted separately via `parse_frontmatter`.
49pub fn parse_markdown(content: &str) -> ParsedDocument {
50    let has_frontmatter = content.starts_with("---\n")
51        || content.starts_with("---\r\n");
52
53    let frontmatter_keys = if has_frontmatter {
54        extract_frontmatter_keys(content)
55    } else {
56        vec![]
57    };
58
59    // Strip frontmatter before parsing markdown
60    let md_content = strip_frontmatter(content);
61
62    let mut options = Options::empty();
63    options.insert(Options::ENABLE_TABLES);
64    options.insert(Options::ENABLE_STRIKETHROUGH);
65
66    let parser = Parser::new_ext(md_content, options);
67
68    let mut headings = Vec::new();
69    let mut links = Vec::new();
70    let mut code_blocks = Vec::new();
71    let mut word_count = 0usize;
72    let mut list_count = 0usize;
73
74    let mut in_heading = false;
75    let mut current_heading_level: u8 = 0;
76    let mut current_heading_text = String::new();
77
78    let mut in_link = false;
79    let mut current_link_url = String::new();
80    let mut current_link_text = String::new();
81
82    let mut in_code_block = false;
83    let mut current_code_lang: Option<String> = None;
84    let mut current_code_content = String::new();
85
86    for event in parser {
87        match event {
88            Event::Start(Tag::Heading { level, .. }) => {
89                in_heading = true;
90                current_heading_level = level as u8;
91                current_heading_text.clear();
92            }
93            Event::End(TagEnd::Heading(_)) => {
94                in_heading = false;
95                let text = current_heading_text.trim().to_string();
96                let anchor = slugify(&text);
97                headings.push(Heading {
98                    level: current_heading_level,
99                    text,
100                    anchor,
101                });
102                current_heading_text.clear();
103            }
104            Event::Start(Tag::Link { dest_url, .. }) => {
105                in_link = true;
106                current_link_url = dest_url.to_string();
107                current_link_text.clear();
108            }
109            Event::End(TagEnd::Link) => {
110                in_link = false;
111                let is_internal = is_internal_link(&current_link_url);
112                links.push(Link {
113                    url: current_link_url.clone(),
114                    text: current_link_text.trim().to_string(),
115                    is_internal,
116                    target_exists: None,
117                });
118                current_link_url.clear();
119                current_link_text.clear();
120            }
121            Event::Start(Tag::List(_)) => {
122                list_count += 1;
123            }
124            Event::Start(Tag::CodeBlock(cb_kind)) => {
125                in_code_block = true;
126                current_code_content.clear();
127                current_code_lang = match cb_kind {
128                    pulldown_cmark::CodeBlockKind::Fenced(lang) => {
129                        let lang = lang.trim().to_string();
130                        if lang.is_empty() { None } else { Some(lang) }
131                    }
132                    pulldown_cmark::CodeBlockKind::Indented => None,
133                };
134            }
135            Event::End(TagEnd::CodeBlock) => {
136                in_code_block = false;
137                let line_count = current_code_content.lines().count();
138                code_blocks.push(CodeBlock {
139                    language: current_code_lang.take(),
140                    line_count,
141                });
142                current_code_content.clear();
143            }
144            Event::Text(text) => {
145                if in_heading {
146                    current_heading_text.push_str(&text);
147                }
148                if in_link {
149                    current_link_text.push_str(&text);
150                }
151                if in_code_block {
152                    current_code_content.push_str(&text);
153                } else {
154                    word_count += count_words(&text);
155                }
156            }
157            Event::Code(code) => {
158                word_count += count_words(&code);
159                if in_heading {
160                    current_heading_text.push_str(&code);
161                }
162                if in_link {
163                    current_link_text.push_str(&code);
164                }
165            }
166            _ => {}
167        }
168    }
169
170    ParsedDocument {
171        headings,
172        links,
173        code_blocks,
174        list_count,
175        word_count,
176        has_frontmatter,
177        frontmatter_keys,
178    }
179}
180
/// Determine if a URL is an internal reference.
///
/// Internal: relative paths (`./docs/prd.md`, `../README.md`, `docs/a.md`),
/// fragment-only links (`#heading`) and the empty string.
/// External: anything that starts with a URI scheme (`http:`, `https:`,
/// `mailto:`, `ftp:`, `tel:`, ... matched case-insensitively) and
/// protocol-relative URLs (`//host/path`).
fn is_internal_link(url: &str) -> bool {
    // `//host/path` inherits the scheme of the referring page; it never names
    // a file relative to the document.
    if url.starts_with("//") {
        return false;
    }
    !has_uri_scheme(url)
}

/// Report whether `url` begins with a URI scheme as defined by RFC 3986:
/// a letter followed by letters, digits, `+`, `-` or `.`, then `:`.
///
/// Single-letter "schemes" are rejected so that Windows drive paths such as
/// `C:\docs\a.md` are not mistaken for URLs.
fn has_uri_scheme(url: &str) -> bool {
    match url.split_once(':') {
        Some((scheme, _)) => {
            let mut chars = scheme.chars();
            scheme.len() >= 2
                && chars.next().map_or(false, |c| c.is_ascii_alphabetic())
                && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
        }
        None => false,
    }
}
193
/// Build a slug-style anchor from heading text.
///
/// The text is lowercased and cut at every non-alphanumeric character; the
/// non-empty pieces are then joined with single hyphens, so runs of
/// punctuation or whitespace collapse to one `-` and the result never starts
/// or ends with a hyphen.
fn slugify(text: &str) -> String {
    let lowered = text.to_lowercase();
    lowered
        .split(|ch: char| !ch.is_alphanumeric())
        .filter(|piece| !piece.is_empty())
        .collect::<Vec<_>>()
        .join("-")
}
222
/// Strip a leading YAML frontmatter block from `content`.
///
/// Frontmatter is recognised only when the very first line is `---`
/// (terminated by `\n` or `\r\n`) and a later line consists solely of `---`
/// (trailing whitespace or `\r` allowed). Everything up to and including that
/// closing line and its line terminator is removed.
///
/// Returns `content` unchanged when there is no opening delimiter or no
/// closing delimiter line.
fn strip_frontmatter(content: &str) -> &str {
    let mut rest = match content
        .strip_prefix("---\r\n")
        .or_else(|| content.strip_prefix("---\n"))
    {
        Some(after_opener) => after_opener,
        None => return content,
    };

    // Walk the block line by line instead of searching for the substring
    // "\n---": the substring search misses an empty frontmatter block (closing
    // line directly after the opener) and wrongly accepts lines that merely
    // start with three dashes, such as `----` or `---note`.
    loop {
        let (line, tail) = match rest.split_once('\n') {
            Some((line, tail)) => (line, Some(tail)),
            None => (rest, None),
        };
        if line.trim_end() == "---" {
            // `tail` is `None` when the closing delimiter is the last line
            // without a trailing newline; nothing follows it.
            return tail.unwrap_or("");
        }
        match tail {
            Some(tail) => rest = tail,
            None => return content,
        }
    }
}
249
/// Extract frontmatter keys (simple YAML `key: value` lines).
///
/// The frontmatter block starts after an opening `---` first line and ends at
/// the first later line that consists solely of `---` (trailing whitespace or
/// `\r` allowed). Within the block, blank lines and lines starting with `#`
/// are skipped; for every other line containing a `:`, the trimmed text before
/// the first `:` is reported as a key. Indented lines are treated the same
/// way, so keys of nested mappings are included.
///
/// Returns an empty vector when the opening or the closing delimiter is
/// missing.
fn extract_frontmatter_keys(content: &str) -> Vec<String> {
    let body = match content
        .strip_prefix("---\r\n")
        .or_else(|| content.strip_prefix("---\n"))
    {
        Some(after_opener) => after_opener,
        None => return Vec::new(),
    };

    let mut keys = Vec::new();
    for raw_line in body.lines() {
        // The closer must be a whole line. Checking per line (rather than
        // searching for "\n---") also recognises an empty frontmatter block,
        // where the closer immediately follows the opener.
        if raw_line.trim_end() == "---" {
            return keys;
        }

        let line = raw_line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        if let Some((key, _)) = line.split_once(':') {
            let key = key.trim();
            if !key.is_empty() {
                keys.push(key.to_string());
            }
        }
    }

    // Ran out of input without seeing a closing delimiter: not frontmatter.
    Vec::new()
}
283
/// Count the whitespace-separated words in a text fragment.
///
/// Any run of Unicode whitespace acts as a single separator; leading and
/// trailing whitespace contribute no words.
fn count_words(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|word| !word.is_empty())
        .count()
}
288
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings are reported in document order with level, text and anchor.
    #[test]
    fn test_parse_headings() {
        let source = "# Title\n\n## Section 1\n\n### Subsection\n\n## Section 2\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.headings.len(), 4);

        let title = &parsed.headings[0];
        assert_eq!(title.level, 1);
        assert_eq!(title.text, "Title");
        assert_eq!(title.anchor, "title");

        let section = &parsed.headings[1];
        assert_eq!(section.level, 2);
        assert_eq!(section.text, "Section 1");
        assert_eq!(section.anchor, "section-1");

        let subsection = &parsed.headings[2];
        assert_eq!(subsection.level, 3);
        assert_eq!(subsection.anchor, "subsection");
    }

    /// `http://` and `https://` destinations are external and unresolved.
    #[test]
    fn test_parse_links_external() {
        let source = "Check [Google](https://google.com) and [Docs](http://docs.rs).\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.links.len(), 2);

        let first = &parsed.links[0];
        let second = &parsed.links[1];
        assert_eq!(first.url, "https://google.com");
        assert_eq!(first.text, "Google");
        assert!(!first.is_internal);
        assert!(!second.is_internal);
        assert!(first.target_exists.is_none());
    }

    /// Relative paths are internal and unresolved.
    #[test]
    fn test_parse_links_internal() {
        let source = "See [setup](./docs/setup.md) and [config](../config.yaml).\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.links.len(), 2);

        let first = &parsed.links[0];
        let second = &parsed.links[1];
        assert!(first.is_internal);
        assert_eq!(first.url, "./docs/setup.md");
        assert!(second.is_internal);
        assert!(first.target_exists.is_none());
    }

    /// Fragment-only links count as internal.
    #[test]
    fn test_parse_links_anchor() {
        let source = "Jump to [section](#overview).\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.links.len(), 1);

        let only = &parsed.links[0];
        assert!(only.is_internal); // fragment-only is internal
        assert_eq!(only.url, "#overview");
    }

    /// Fenced blocks report their language and line count.
    #[test]
    fn test_parse_code_blocks() {
        let source = "# Demo\n\n```rust\nfn main() {\n    println!(\"hi\");\n}\n```\n\n```\nplain\n```\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.code_blocks.len(), 2);

        let rust_block = &parsed.code_blocks[0];
        assert_eq!(rust_block.language.as_deref(), Some("rust"));
        assert_eq!(rust_block.line_count, 3);

        let plain_block = &parsed.code_blocks[1];
        assert!(plain_block.language.is_none());
        assert_eq!(plain_block.line_count, 1);
    }

    /// Words are counted across paragraphs.
    #[test]
    fn test_word_count() {
        let source = "Hello world. This is a test.\n\nAnother paragraph here.\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.word_count, 9);
    }

    /// Code block contents do not contribute to the word count.
    #[test]
    fn test_word_count_excludes_code_blocks() {
        let source = "One two three.\n\n```rust\nfn main() {}\n```\n\nFour five.\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.word_count, 5);
    }

    /// Each separate list (ordered or unordered) is counted once.
    #[test]
    fn test_list_count() {
        let source = "# Lists\n\n- item a\n- item b\n\n1. first\n2. second\n\nParagraph.\n\n- another list\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.list_count, 3);
    }

    /// Frontmatter is detected, its keys extracted, and the body still parsed.
    #[test]
    fn test_frontmatter_detected() {
        let source = "---\ntitle: Hello\nauthor: Alice\n---\n\n# Hello\n";
        let parsed = parse_markdown(source);
        assert!(parsed.has_frontmatter);
        assert_eq!(parsed.frontmatter_keys, vec!["title", "author"]);
        assert_eq!(parsed.headings.len(), 1);
        assert_eq!(parsed.headings[0].text, "Hello");
    }

    /// Documents without a leading `---` line have no frontmatter.
    #[test]
    fn test_no_frontmatter() {
        let source = "# Just a heading\n\nSome text.\n";
        let parsed = parse_markdown(source);
        assert!(!parsed.has_frontmatter);
        assert!(parsed.frontmatter_keys.is_empty());
    }

    /// An empty input yields an empty summary.
    #[test]
    fn test_empty_document() {
        let parsed = parse_markdown("");
        assert!(parsed.headings.is_empty());
        assert!(parsed.links.is_empty());
        assert!(parsed.code_blocks.is_empty());
        assert_eq!(parsed.word_count, 0);
        assert_eq!(parsed.list_count, 0);
        assert!(!parsed.has_frontmatter);
        assert!(parsed.frontmatter_keys.is_empty());
    }

    /// External, relative and fragment links are classified side by side.
    #[test]
    fn test_mixed_link_types() {
        let source = "[ext](https://example.com) [int](./file.md) [anc](#top)\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.links.len(), 3);

        let external = &parsed.links[0];
        let relative = &parsed.links[1];
        let fragment = &parsed.links[2];
        assert!(!external.is_internal); // external
        assert!(relative.is_internal); // relative path
        assert!(fragment.is_internal); // fragment-only anchor
    }

    /// Slugs are lowercase, hyphen-separated and trimmed.
    #[test]
    fn test_slugify() {
        assert_eq!(slugify("Hello World"), "hello-world");
        assert_eq!(slugify("API Reference (v2)"), "api-reference-v2");
        assert_eq!(slugify("  Leading Spaces  "), "leading-spaces");
    }

    /// `./` and `../` destinations are kept verbatim and flagged internal.
    #[test]
    fn test_internal_reference_detection() {
        let source = "See [PRD](./docs/prd.md) and [README](../README.md) for details.\n";
        let parsed = parse_markdown(source);
        assert_eq!(parsed.links.len(), 2);

        let prd = &parsed.links[0];
        let readme = &parsed.links[1];
        assert!(prd.is_internal);
        assert_eq!(prd.url, "./docs/prd.md");
        assert!(readme.is_internal);
        assert_eq!(readme.url, "../README.md");
    }
}