1use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
6use serde::Serialize;
7
/// A heading collected from a Markdown document.
#[derive(Debug, Clone, Serialize)]
pub struct Heading {
    /// Heading level as reported by the parser, cast to `u8`.
    pub level: u8,
    /// Heading text with surrounding whitespace trimmed.
    pub text: String,
    /// Anchor slug derived from `text` via `slugify`.
    pub anchor: String,
}
15
/// A link collected from a Markdown document.
#[derive(Debug, Clone, Serialize)]
pub struct Link {
    /// Visible link text with surrounding whitespace trimmed.
    pub text: String,
    /// Link destination exactly as reported by the parser.
    pub url: String,
    /// Classification produced by `is_internal_link` for `url`.
    pub is_internal: bool,
    /// `parse_markdown` always leaves this as `None`.
    /// NOTE(review): presumably filled in by a later link-checking pass — confirm.
    pub target_exists: Option<bool>,
}
26
/// A fenced or indented code block collected from a Markdown document.
#[derive(Debug, Clone, Serialize)]
pub struct CodeBlock {
    /// Trimmed info string of a fenced block; `None` when it is empty or the
    /// block is indented.
    pub language: Option<String>,
    /// Number of lines in the block's content, counted with `str::lines`.
    pub line_count: usize,
}
32
/// Structural summary of a Markdown document, as produced by `parse_markdown`.
#[derive(Debug, Clone, Serialize)]
pub struct ParsedDocument {
    /// Headings, in the order the parser finished them.
    pub headings: Vec<Heading>,
    /// Links, in the order the parser finished them.
    pub links: Vec<Link>,
    /// Code blocks, in the order the parser finished them.
    pub code_blocks: Vec<CodeBlock>,
    /// Incremented once for every list start event emitted by the parser.
    pub list_count: usize,
    /// Whitespace-separated words in text and inline code; code block
    /// content is not counted.
    pub word_count: usize,
    /// `true` when the raw content begins with a `---` line.
    pub has_frontmatter: bool,
    /// Keys returned by `extract_frontmatter_keys`; empty without front matter.
    pub frontmatter_keys: Vec<String>,
}
43
44pub fn parse_markdown(content: &str) -> ParsedDocument {
50 let has_frontmatter = content.starts_with("---\n")
51 || content.starts_with("---\r\n");
52
53 let frontmatter_keys = if has_frontmatter {
54 extract_frontmatter_keys(content)
55 } else {
56 vec![]
57 };
58
59 let md_content = strip_frontmatter(content);
61
62 let mut options = Options::empty();
63 options.insert(Options::ENABLE_TABLES);
64 options.insert(Options::ENABLE_STRIKETHROUGH);
65
66 let parser = Parser::new_ext(md_content, options);
67
68 let mut headings = Vec::new();
69 let mut links = Vec::new();
70 let mut code_blocks = Vec::new();
71 let mut word_count = 0usize;
72 let mut list_count = 0usize;
73
74 let mut in_heading = false;
75 let mut current_heading_level: u8 = 0;
76 let mut current_heading_text = String::new();
77
78 let mut in_link = false;
79 let mut current_link_url = String::new();
80 let mut current_link_text = String::new();
81
82 let mut in_code_block = false;
83 let mut current_code_lang: Option<String> = None;
84 let mut current_code_content = String::new();
85
86 for event in parser {
87 match event {
88 Event::Start(Tag::Heading { level, .. }) => {
89 in_heading = true;
90 current_heading_level = level as u8;
91 current_heading_text.clear();
92 }
93 Event::End(TagEnd::Heading(_)) => {
94 in_heading = false;
95 let text = current_heading_text.trim().to_string();
96 let anchor = slugify(&text);
97 headings.push(Heading {
98 level: current_heading_level,
99 text,
100 anchor,
101 });
102 current_heading_text.clear();
103 }
104 Event::Start(Tag::Link { dest_url, .. }) => {
105 in_link = true;
106 current_link_url = dest_url.to_string();
107 current_link_text.clear();
108 }
109 Event::End(TagEnd::Link) => {
110 in_link = false;
111 let is_internal = is_internal_link(¤t_link_url);
112 links.push(Link {
113 url: current_link_url.clone(),
114 text: current_link_text.trim().to_string(),
115 is_internal,
116 target_exists: None,
117 });
118 current_link_url.clear();
119 current_link_text.clear();
120 }
121 Event::Start(Tag::List(_)) => {
122 list_count += 1;
123 }
124 Event::Start(Tag::CodeBlock(cb_kind)) => {
125 in_code_block = true;
126 current_code_content.clear();
127 current_code_lang = match cb_kind {
128 pulldown_cmark::CodeBlockKind::Fenced(lang) => {
129 let lang = lang.trim().to_string();
130 if lang.is_empty() { None } else { Some(lang) }
131 }
132 pulldown_cmark::CodeBlockKind::Indented => None,
133 };
134 }
135 Event::End(TagEnd::CodeBlock) => {
136 in_code_block = false;
137 let line_count = current_code_content.lines().count();
138 code_blocks.push(CodeBlock {
139 language: current_code_lang.take(),
140 line_count,
141 });
142 current_code_content.clear();
143 }
144 Event::Text(text) => {
145 if in_heading {
146 current_heading_text.push_str(&text);
147 }
148 if in_link {
149 current_link_text.push_str(&text);
150 }
151 if in_code_block {
152 current_code_content.push_str(&text);
153 } else {
154 word_count += count_words(&text);
155 }
156 }
157 Event::Code(code) => {
158 word_count += count_words(&code);
159 if in_heading {
160 current_heading_text.push_str(&code);
161 }
162 if in_link {
163 current_link_text.push_str(&code);
164 }
165 }
166 _ => {}
167 }
168 }
169
170 ParsedDocument {
171 headings,
172 links,
173 code_blocks,
174 list_count,
175 word_count,
176 has_frontmatter,
177 frontmatter_keys,
178 }
179}
180
/// Returns `true` when `url` points inside the project (a relative path or a
/// `#fragment`) and `false` when it is an external reference.
///
/// A URL is external when it is protocol-relative (`//host/...`) or starts
/// with a URI scheme followed by `:` — for example `http:`, `HTTPS:`,
/// `mailto:`, `ftp:` or `tel:`. Scheme detection follows the RFC 3986 grammar
/// (an ASCII letter followed by letters, digits, `+`, `-` or `.`) and is
/// therefore case-insensitive.
///
/// Single-letter prefixes such as `C:` are deliberately not treated as
/// schemes so that Windows drive paths stay internal.
fn is_internal_link(url: &str) -> bool {
    if url.starts_with("//") {
        return false;
    }

    let has_scheme = url.split_once(':').map_or(false, |(scheme, _)| {
        let mut chars = scheme.chars();
        scheme.len() >= 2
            && chars.next().map_or(false, |c| c.is_ascii_alphabetic())
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
    });

    !has_scheme
}
193
/// Builds an anchor slug from heading text.
///
/// The text is lowercased, every run of non-alphanumeric characters becomes a
/// single `-`, and no hyphen is left at the start or end of the result.
fn slugify(text: &str) -> String {
    let lowered = text.to_lowercase();
    lowered
        // Each separator run yields empty fragments; dropping them collapses
        // the run and trims the edges in one step.
        .split(|ch: char| !ch.is_alphanumeric())
        .filter(|fragment| !fragment.is_empty())
        .collect::<Vec<_>>()
        .join("-")
}
222
/// Returns `content` with a leading front matter block removed.
///
/// Front matter must open with a first line of exactly `---` and close with a
/// later line that is exactly `---` (trailing whitespace is tolerated). The
/// returned slice starts right after the closing line's line ending.
///
/// `content` is returned unchanged when it does not open with `---` or when no
/// closing line exists. Lines that merely start with `---`, such as `----` or
/// `---foo`, do not close the block, and an empty block (`---` immediately
/// followed by `---`) is recognised.
fn strip_frontmatter(content: &str) -> &str {
    let body = match content
        .strip_prefix("---\n")
        .or_else(|| content.strip_prefix("---\r\n"))
    {
        Some(body) => body,
        None => return content,
    };

    // Walk the lines while tracking the byte offset just past each one, so the
    // remainder can be sliced once the closing delimiter is found.
    let mut offset = 0;
    for line in body.split_inclusive('\n') {
        offset += line.len();
        if line.trim_end() == "---" {
            return &body[offset..];
        }
    }

    // Unterminated front matter: treat the whole input as Markdown.
    content
}
249
/// Collects the keys of `key: value` lines in a leading front matter block.
///
/// The block must open with a first line of exactly `---` and close with a
/// later line that is exactly `---` (trailing whitespace is tolerated); an
/// empty vector is returned otherwise. Lines that merely start with `---`,
/// such as `----`, do not close the block.
///
/// Blank lines and lines whose first non-blank character is `#` are skipped.
/// For every other line containing a `:`, the trimmed text before the first
/// `:` is recorded if it is non-empty.
///
/// NOTE(review): this is a line scan, not a YAML parser — indented (nested)
/// entries and list items such as `- name: x` are reported like top-level keys.
fn extract_frontmatter_keys(content: &str) -> Vec<String> {
    let body = match content
        .strip_prefix("---\n")
        .or_else(|| content.strip_prefix("---\r\n"))
    {
        Some(body) => body,
        None => return Vec::new(),
    };

    let mut keys = Vec::new();
    for raw_line in body.lines() {
        if raw_line.trim_end() == "---" {
            // Closing delimiter reached: the block is well formed.
            return keys;
        }

        let line = raw_line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        if let Some((key, _)) = line.split_once(':') {
            let key = key.trim();
            if !key.is_empty() {
                keys.push(key.to_string());
            }
        }
    }

    // The block was never closed, so nothing seen so far was front matter.
    Vec::new()
}
283
/// Counts the whitespace-separated words in `text`.
fn count_words(text: &str) -> usize {
    text.split(char::is_whitespace)
        // Consecutive or edge whitespace produces empty pieces; they are not words.
        .filter(|piece| !piece.is_empty())
        .count()
}
288
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings are collected with their level, text and anchor.
    #[test]
    fn test_parse_headings() {
        let doc = parse_markdown("# Title\n\n## Section 1\n\n### Subsection\n\n## Section 2\n");
        assert_eq!(doc.headings.len(), 4);

        let title = &doc.headings[0];
        assert_eq!(title.level, 1);
        assert_eq!(title.text, "Title");
        assert_eq!(title.anchor, "title");

        let section = &doc.headings[1];
        assert_eq!(section.level, 2);
        assert_eq!(section.text, "Section 1");
        assert_eq!(section.anchor, "section-1");

        let subsection = &doc.headings[2];
        assert_eq!(subsection.level, 3);
        assert_eq!(subsection.anchor, "subsection");
    }

    /// `http`/`https` links are external and carry no existence information.
    #[test]
    fn test_parse_links_external() {
        let doc = parse_markdown("Check [Google](https://google.com) and [Docs](http://docs.rs).\n");
        assert_eq!(doc.links.len(), 2);

        let google = &doc.links[0];
        assert_eq!(google.url, "https://google.com");
        assert_eq!(google.text, "Google");
        assert!(!google.is_internal);
        assert!(!doc.links[1].is_internal);
        assert!(google.target_exists.is_none());
    }

    /// Relative paths are internal links.
    #[test]
    fn test_parse_links_internal() {
        let doc = parse_markdown("See [setup](./docs/setup.md) and [config](../config.yaml).\n");
        assert_eq!(doc.links.len(), 2);

        let setup = &doc.links[0];
        assert!(setup.is_internal);
        assert_eq!(setup.url, "./docs/setup.md");
        assert!(doc.links[1].is_internal);
        assert!(setup.target_exists.is_none());
    }

    /// Fragment-only links are internal links.
    #[test]
    fn test_parse_links_anchor() {
        let doc = parse_markdown("Jump to [section](#overview).\n");
        assert_eq!(doc.links.len(), 1);

        let link = &doc.links[0];
        assert!(link.is_internal);
        assert_eq!(link.url, "#overview");
    }

    /// Fenced blocks report their language tag and line count.
    #[test]
    fn test_parse_code_blocks() {
        let doc = parse_markdown(
            "# Demo\n\n```rust\nfn main() {\n println!(\"hi\");\n}\n```\n\n```\nplain\n```\n",
        );
        assert_eq!(doc.code_blocks.len(), 2);

        let tagged = &doc.code_blocks[0];
        assert_eq!(tagged.language.as_deref(), Some("rust"));
        assert_eq!(tagged.line_count, 3);

        let untagged = &doc.code_blocks[1];
        assert!(untagged.language.is_none());
        assert_eq!(untagged.line_count, 1);
    }

    /// Words are counted across paragraphs.
    #[test]
    fn test_word_count() {
        let doc = parse_markdown("Hello world. This is a test.\n\nAnother paragraph here.\n");
        assert_eq!(doc.word_count, 9);
    }

    /// Code block content does not contribute to the word count.
    #[test]
    fn test_word_count_excludes_code_blocks() {
        let doc = parse_markdown("One two three.\n\n```rust\nfn main() {}\n```\n\nFour five.\n");
        assert_eq!(doc.word_count, 5);
    }

    /// Each separate list is counted once.
    #[test]
    fn test_list_count() {
        let doc = parse_markdown(
            "# Lists\n\n- item a\n- item b\n\n1. first\n2. second\n\nParagraph.\n\n- another list\n",
        );
        assert_eq!(doc.list_count, 3);
    }

    /// Front matter is detected, its keys extracted, and the body still parsed.
    #[test]
    fn test_frontmatter_detected() {
        let doc = parse_markdown("---\ntitle: Hello\nauthor: Alice\n---\n\n# Hello\n");
        assert!(doc.has_frontmatter);
        assert_eq!(doc.frontmatter_keys, vec!["title", "author"]);
        assert_eq!(doc.headings.len(), 1);
        assert_eq!(doc.headings[0].text, "Hello");
    }

    /// Documents without a leading `---` line have no front matter.
    #[test]
    fn test_no_frontmatter() {
        let doc = parse_markdown("# Just a heading\n\nSome text.\n");
        assert!(!doc.has_frontmatter);
        assert!(doc.frontmatter_keys.is_empty());
    }

    /// An empty input yields an empty summary.
    #[test]
    fn test_empty_document() {
        let doc = parse_markdown("");
        assert!(doc.headings.is_empty());
        assert!(doc.links.is_empty());
        assert!(doc.code_blocks.is_empty());
        assert_eq!(doc.word_count, 0);
        assert_eq!(doc.list_count, 0);
        assert!(!doc.has_frontmatter);
        assert!(doc.frontmatter_keys.is_empty());
    }

    /// External, relative and fragment links are classified independently.
    #[test]
    fn test_mixed_link_types() {
        let doc = parse_markdown("[ext](https://example.com) [int](./file.md) [anc](#top)\n");
        assert_eq!(doc.links.len(), 3);

        let flags: Vec<bool> = doc.links.iter().map(|link| link.is_internal).collect();
        assert_eq!(flags, vec![false, true, true]);
    }

    /// Slugs are lowercase, hyphen-separated and free of edge hyphens.
    #[test]
    fn test_slugify() {
        let cases = [
            ("Hello World", "hello-world"),
            ("API Reference (v2)", "api-reference-v2"),
            (" Leading Spaces ", "leading-spaces"),
        ];
        for (input, expected) in cases {
            assert_eq!(slugify(input), expected);
        }
    }

    /// Relative document references keep their URL and are internal.
    #[test]
    fn test_internal_reference_detection() {
        let doc = parse_markdown("See [PRD](./docs/prd.md) and [README](../README.md) for details.\n");
        assert_eq!(doc.links.len(), 2);

        let prd = &doc.links[0];
        assert!(prd.is_internal);
        assert_eq!(prd.url, "./docs/prd.md");

        let readme = &doc.links[1];
        assert!(readme.is_internal);
        assert_eq!(readme.url, "../README.md");
    }
}