typstify_parser/
markdown.rs

1//! Markdown parser using pulldown-cmark.
2
3use std::path::Path;
4
5use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag, TagEnd};
6use thiserror::Error;
7use typstify_core::{
8    content::{ParsedContent, TocEntry},
9    frontmatter::parse_frontmatter,
10};
11
12use crate::syntax::SyntaxHighlighter;
13
/// Markdown parsing errors.
///
/// Returned (via [`Result`]) by [`MarkdownParser::parse`]. Rendering the
/// markdown body itself is infallible in this module, so the only failure
/// surfaced here comes from frontmatter parsing.
#[derive(Debug, Error)]
pub enum MarkdownError {
    /// Failed to parse frontmatter.
    ///
    /// Wraps the underlying core error; `#[from]` generates the `From`
    /// conversion that lets `parse` propagate it with `?`.
    #[error("frontmatter error: {0}")]
    Frontmatter(#[from] typstify_core::error::CoreError),
}
21
/// Result type for markdown operations.
///
/// Shorthand for `std::result::Result` with the error fixed to
/// [`MarkdownError`].
pub type Result<T> = std::result::Result<T, MarkdownError>;
24
/// Markdown parser with syntax highlighting support.
///
/// Holds the pulldown-cmark extension flags used for every parse together
/// with the highlighter that renders code blocks.
#[derive(Debug)]
pub struct MarkdownParser {
    // Renders code block contents; its theme can be changed via `with_theme`.
    highlighter: SyntaxHighlighter,
    // pulldown-cmark extension flags handed to `Parser::new_ext`.
    options: Options,
}
31
32impl Default for MarkdownParser {
33    fn default() -> Self {
34        Self::new()
35    }
36}
37
38impl MarkdownParser {
39    /// Create a new markdown parser with default options.
40    pub fn new() -> Self {
41        let mut options = Options::empty();
42        options.insert(Options::ENABLE_TABLES);
43        options.insert(Options::ENABLE_FOOTNOTES);
44        options.insert(Options::ENABLE_STRIKETHROUGH);
45        options.insert(Options::ENABLE_TASKLISTS);
46        options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
47
48        Self {
49            highlighter: SyntaxHighlighter::default(),
50            options,
51        }
52    }
53
54    /// Create a parser with a custom syntax theme.
55    pub fn with_theme(theme: &str) -> Self {
56        let mut parser = Self::new();
57        parser.highlighter.set_theme(theme);
58        parser
59    }
60
61    /// Parse markdown content with frontmatter.
62    pub fn parse(&self, content: &str, path: &Path) -> Result<ParsedContent> {
63        // Split frontmatter from body
64        let (frontmatter, body) = parse_frontmatter(content, path)?;
65
66        // Parse the markdown body
67        let (html, toc) = self.render_markdown(&body);
68
69        Ok(ParsedContent {
70            frontmatter,
71            html,
72            raw: body,
73            toc,
74        })
75    }
76
77    /// Parse markdown without frontmatter (body only).
78    pub fn parse_body(&self, body: &str) -> (String, Vec<TocEntry>) {
79        self.render_markdown(body)
80    }
81
82    /// Render markdown to HTML with TOC extraction.
83    fn render_markdown(&self, content: &str) -> (String, Vec<TocEntry>) {
84        let parser = Parser::new_ext(content, self.options);
85        let mut toc = Vec::new();
86        let mut html = String::new();
87        let mut current_heading: Option<(u8, String)> = None;
88        let mut code_block_lang: Option<String> = None;
89        let mut code_block_content = String::new();
90
91        for event in parser {
92            match event {
93                // Handle heading start
94                Event::Start(Tag::Heading { level, id, .. }) => {
95                    let lvl = level as u8;
96                    current_heading = Some((lvl, String::new()));
97                    let id_attr = id.map(|i| format!(" id=\"{i}\"")).unwrap_or_default();
98                    html.push_str(&format!("<h{lvl}{id_attr}>"));
99                }
100
101                // Handle heading end
102                Event::End(TagEnd::Heading(level)) => {
103                    let lvl = level as u8;
104                    if let Some((_, ref text)) = current_heading {
105                        let id = slugify(text);
106                        toc.push(TocEntry {
107                            level: lvl,
108                            text: text.clone(),
109                            id: id.clone(),
110                        });
111                    }
112                    html.push_str(&format!("</h{lvl}>"));
113                    current_heading = None;
114                }
115
116                // Handle code block start
117                Event::Start(Tag::CodeBlock(kind)) => {
118                    code_block_lang = match kind {
119                        CodeBlockKind::Fenced(lang) => {
120                            let lang = lang.to_string();
121                            if lang.is_empty() { None } else { Some(lang) }
122                        }
123                        CodeBlockKind::Indented => None,
124                    };
125                    code_block_content.clear();
126                }
127
128                // Handle code block end
129                Event::End(TagEnd::CodeBlock) => {
130                    let highlighted = self
131                        .highlighter
132                        .highlight(&code_block_content, code_block_lang.as_deref());
133                    html.push_str(&highlighted);
134                    code_block_lang = None;
135                    code_block_content.clear();
136                }
137
138                // Handle text inside code blocks
139                Event::Text(text)
140                    if code_block_lang.is_some() || !code_block_content.is_empty() =>
141                {
142                    code_block_content.push_str(&text);
143                }
144
145                // Handle regular text
146                Event::Text(text) => {
147                    if let Some((_, ref mut heading_text)) = current_heading {
148                        heading_text.push_str(&text);
149                    }
150                    html.push_str(&html_escape(&text));
151                }
152
153                // Handle code (inline)
154                Event::Code(code) => {
155                    if let Some((_, ref mut heading_text)) = current_heading {
156                        heading_text.push_str(&code);
157                    }
158                    html.push_str(&format!("<code>{}</code>", html_escape(&code)));
159                }
160
161                // Handle soft breaks
162                Event::SoftBreak => {
163                    html.push('\n');
164                }
165
166                // Handle hard breaks
167                Event::HardBreak => {
168                    html.push_str("<br />\n");
169                }
170
171                // Handle other start tags
172                Event::Start(tag) => {
173                    html.push_str(&tag_to_html_start(&tag));
174                }
175
176                // Handle other end tags
177                Event::End(tag) => {
178                    html.push_str(&tag_to_html_end(&tag));
179                }
180
181                // Handle HTML
182                Event::Html(raw) | Event::InlineHtml(raw) => {
183                    html.push_str(&raw);
184                }
185
186                // Handle footnote references
187                Event::FootnoteReference(name) => {
188                    html.push_str(&format!(
189                        "<sup class=\"footnote-ref\"><a href=\"#fn-{name}\">[{name}]</a></sup>"
190                    ));
191                }
192
193                // Handle rules
194                Event::Rule => {
195                    html.push_str("<hr />\n");
196                }
197
198                // Handle task list markers
199                Event::TaskListMarker(checked) => {
200                    let checkbox = if checked {
201                        "<input type=\"checkbox\" checked disabled />"
202                    } else {
203                        "<input type=\"checkbox\" disabled />"
204                    };
205                    html.push_str(checkbox);
206                }
207
208                Event::InlineMath(math) => {
209                    html.push_str(&format!("<span class=\"math inline\">\\({math}\\)</span>"));
210                }
211
212                Event::DisplayMath(math) => {
213                    html.push_str(&format!("<div class=\"math display\">\\[{math}\\]</div>"));
214                }
215            }
216        }
217
218        (html, toc)
219    }
220}
221
222/// Convert a pulldown-cmark tag to HTML opening tag.
223fn tag_to_html_start(tag: &Tag) -> String {
224    match tag {
225        Tag::Paragraph => "<p>".to_string(),
226        Tag::Heading { level, id, .. } => {
227            let id_attr = id
228                .as_ref()
229                .map(|i| format!(" id=\"{i}\""))
230                .unwrap_or_default();
231            format!("<h{}{id_attr}>", *level as u8)
232        }
233        Tag::BlockQuote(_) => "<blockquote>".to_string(),
234        Tag::CodeBlock(_) => String::new(), // Handled separately
235        Tag::List(Some(start)) => format!("<ol start=\"{start}\">"),
236        Tag::List(None) => "<ul>".to_string(),
237        Tag::Item => "<li>".to_string(),
238        Tag::FootnoteDefinition(name) => {
239            format!("<div class=\"footnote\" id=\"fn-{name}\">")
240        }
241        Tag::Table(alignments) => {
242            let _ = alignments; // Alignments handled per cell
243            "<table>".to_string()
244        }
245        Tag::TableHead => "<thead><tr>".to_string(),
246        Tag::TableRow => "<tr>".to_string(),
247        Tag::TableCell => "<td>".to_string(),
248        Tag::Emphasis => "<em>".to_string(),
249        Tag::Strong => "<strong>".to_string(),
250        Tag::Strikethrough => "<del>".to_string(),
251        Tag::Link {
252            dest_url, title, ..
253        } => {
254            let title_attr = if title.is_empty() {
255                String::new()
256            } else {
257                format!(" title=\"{}\"", html_escape(title))
258            };
259            format!("<a href=\"{}\"{}> ", html_escape(dest_url), title_attr)
260        }
261        Tag::Image {
262            dest_url, title, ..
263        } => {
264            let title_attr = if title.is_empty() {
265                String::new()
266            } else {
267                format!(" title=\"{}\"", html_escape(title))
268            };
269            // Add loading="lazy" and decoding="async" for performance
270            format!(
271                "<img src=\"{}\" loading=\"lazy\" decoding=\"async\"{}",
272                html_escape(dest_url),
273                title_attr
274            )
275        }
276        Tag::HtmlBlock => String::new(),
277        Tag::MetadataBlock(_) => String::new(),
278        Tag::DefinitionList => "<dl>".to_string(),
279        Tag::DefinitionListTitle => "<dt>".to_string(),
280        Tag::DefinitionListDefinition => "<dd>".to_string(),
281        Tag::Superscript => "<sup>".to_string(),
282        Tag::Subscript => "<sub>".to_string(),
283    }
284}
285
286/// Convert a pulldown-cmark tag end to HTML closing tag.
287fn tag_to_html_end(tag: &TagEnd) -> String {
288    match tag {
289        TagEnd::Paragraph => "</p>\n".to_string(),
290        TagEnd::Heading(level) => format!("</h{}>\n", *level as u8),
291        TagEnd::BlockQuote(_) => "</blockquote>\n".to_string(),
292        TagEnd::CodeBlock => String::new(), // Handled separately
293        TagEnd::List(ordered) => {
294            if *ordered {
295                "</ol>\n".to_string()
296            } else {
297                "</ul>\n".to_string()
298            }
299        }
300        TagEnd::Item => "</li>\n".to_string(),
301        TagEnd::FootnoteDefinition => "</div>\n".to_string(),
302        TagEnd::Table => "</table>\n".to_string(),
303        TagEnd::TableHead => "</tr></thead>\n".to_string(),
304        TagEnd::TableRow => "</tr>\n".to_string(),
305        TagEnd::TableCell => "</td>".to_string(),
306        TagEnd::Emphasis => "</em>".to_string(),
307        TagEnd::Strong => "</strong>".to_string(),
308        TagEnd::Strikethrough => "</del>".to_string(),
309        TagEnd::Link => "</a>".to_string(),
310        TagEnd::Image => " />".to_string(),
311        TagEnd::HtmlBlock => String::new(),
312        TagEnd::MetadataBlock(_) => String::new(),
313        TagEnd::DefinitionList => "</dl>\n".to_string(),
314        TagEnd::DefinitionListTitle => "</dt>\n".to_string(),
315        TagEnd::DefinitionListDefinition => "</dd>\n".to_string(),
316        TagEnd::Superscript => "</sup>".to_string(),
317        TagEnd::Subscript => "</sub>".to_string(),
318    }
319}
320
/// Escape HTML special characters.
///
/// Replaces `&`, `<`, `>` and `"` with their entity forms; every other
/// character is copied through unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
328
/// Convert text to a URL-safe slug.
///
/// The text is lowercased; alphanumeric characters are kept; runs of
/// whitespace, `-` and `_` collapse into a single `-`; everything else is
/// dropped. The result never starts or ends with `-`.
fn slugify(text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut slug = String::with_capacity(lowered.len());
    // Set when a separator was seen since the last kept character; the dash
    // is only written once another kept character follows.
    let mut separator_pending = false;

    for ch in lowered.chars() {
        if ch.is_alphanumeric() {
            if separator_pending && !slug.is_empty() {
                slug.push('-');
            }
            separator_pending = false;
            slug.push(ch);
        } else if ch.is_whitespace() || ch == '-' || ch == '_' {
            separator_pending = true;
        }
    }

    slug
}
349
// Unit tests for the markdown parser. They check for the presence of
// expected fragments in the output rather than comparing complete documents.
#[cfg(test)]
mod tests {
    use super::*;

    /// Frontmatter and body are both parsed from a single document.
    #[test]
    fn test_parse_simple_markdown() {
        let parser = MarkdownParser::new();
        let content = r#"---
title: "Test Post"
---

# Hello World

This is a test."#;

        let result = parser.parse(content, Path::new("test.md")).unwrap();

        assert_eq!(result.frontmatter.title, "Test Post");
        assert!(result.html.contains("<h1"));
        assert!(result.html.contains("Hello World"));
        assert!(result.html.contains("<p>"));
    }

    /// The source of a fenced code block with a language appears in the output.
    #[test]
    fn test_parse_code_block() {
        let parser = MarkdownParser::new();
        let (html, _) = parser.parse_body(
            r#"```rust
fn main() {
    println!("Hello");
}
```"#,
        );

        assert!(html.contains("fn"));
        assert!(html.contains("main"));
    }

    /// One TOC entry is produced per heading, carrying its level and text.
    #[test]
    fn test_toc_extraction() {
        let parser = MarkdownParser::new();
        let (_, toc) = parser.parse_body(
            r#"# Heading 1
## Heading 2
### Heading 3"#,
        );

        assert_eq!(toc.len(), 3);
        assert_eq!(toc[0].level, 1);
        assert_eq!(toc[0].text, "Heading 1");
        assert_eq!(toc[1].level, 2);
        assert_eq!(toc[2].level, 3);
    }

    /// Slugs are lowercase, dash-separated, and drop punctuation.
    #[test]
    fn test_slugify() {
        assert_eq!(slugify("Hello World"), "hello-world");
        assert_eq!(slugify("Test 123 Post"), "test-123-post");
        assert_eq!(slugify("Multiple   Spaces"), "multiple-spaces");
        assert_eq!(slugify("Special!@#Chars"), "specialchars");
    }

    /// A pipe table renders the table, head, row and cell elements.
    #[test]
    fn test_table_rendering() {
        let parser = MarkdownParser::new();
        let (html, _) = parser.parse_body(
            r#"| Header 1 | Header 2 |
|----------|----------|
| Cell 1   | Cell 2   |"#,
        );

        assert!(html.contains("<table>"));
        assert!(html.contains("<thead>"));
        assert!(html.contains("<tr>"));
        assert!(html.contains("<td>"));
    }

    /// Task list items render as checkboxes, with the done item checked.
    #[test]
    fn test_task_list() {
        let parser = MarkdownParser::new();
        let (html, _) = parser.parse_body(
            r#"- [x] Done
- [ ] Not done"#,
        );

        assert!(html.contains("checkbox"));
        assert!(html.contains("checked"));
    }

    /// A document without frontmatter still parses, leaving the title empty.
    #[test]
    fn test_no_frontmatter() {
        let parser = MarkdownParser::new();
        let content = "# Just Content\n\nNo frontmatter here.";
        let result = parser.parse(content, Path::new("test.md")).unwrap();

        assert!(result.frontmatter.title.is_empty());
        assert!(result.html.contains("Just Content"));
    }
}
448}