Skip to main content

rustpress_md/
lib.rs

1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6    html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Feature switches controlling how Markdown is rendered and indexed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Render fences tagged `mermaid` as `<pre class="mermaid">` instead of a code block.
    pub mermaid: bool,
    /// Run code blocks through the syntax highlighter instead of only escaping them.
    pub code_highlight: bool,
    /// Emit the line-number gutter next to rendered code blocks.
    pub code_line_numbers: bool,
    /// Give every heading an `id` attribute and a leading `#` anchor link.
    pub heading_anchors: bool,
    /// Include the contents of code blocks in the collected search text.
    pub index_code: bool,
}

impl Default for MarkdownOptions {
    /// All rendering features enabled; code is kept out of the search text.
    fn default() -> Self {
        MarkdownOptions {
            index_code: false,
            heading_anchors: true,
            code_line_numbers: true,
            code_highlight: true,
            mermaid: true,
        }
    }
}
36
37#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
38#[serde(default)]
39pub struct Frontmatter {
40    pub title: Option<String>,
41    pub layout: String,
42    pub sidebar: bool,
43    pub search: bool,
44    pub access: String,
45}
46
47impl Default for Frontmatter {
48    fn default() -> Self {
49        Self {
50            title: None,
51            layout: "doc".to_string(),
52            sidebar: true,
53            search: true,
54            access: "public".to_string(),
55        }
56    }
57}
58
/// One heading found in the document, in source order.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Heading {
    /// Heading depth, 1 for `h1` through 6 for `h6`.
    pub level: u8,
    /// Trimmed plain text of the heading (text and inline-code content).
    pub text: String,
    /// Slug used as the heading's `id`, de-duplicated within the document.
    pub anchor: String,
}
65
/// Result of parsing one Markdown source with `parse_markdown`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Frontmatter after normalisation (defaults when the source has none).
    pub frontmatter: Frontmatter,
    /// Frontmatter title, else the first heading's text, else `"Untitled"`.
    pub title: String,
    /// Rendered HTML body.
    pub html: String,
    /// Headings in the order they appear in the source.
    pub headings: Vec<Heading>,
    /// Whitespace-normalised plain text used for search.
    pub search_text: String,
}
74
75pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
76    let (frontmatter, markdown) = split_frontmatter(input)?;
77    let mut frontmatter = frontmatter;
78    normalize_frontmatter(&mut frontmatter);
79
80    let mut parser_options = Options::empty();
81    parser_options.insert(Options::ENABLE_TABLES);
82    parser_options.insert(Options::ENABLE_FOOTNOTES);
83    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
84    parser_options.insert(Options::ENABLE_TASKLISTS);
85    parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
86
87    let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
88    let headings = collect_headings(&events);
89    let html = render_html(events, &headings, &options);
90    let search_text = collect_search_text(markdown, options.index_code);
91    let title = frontmatter
92        .title
93        .clone()
94        .or_else(|| headings.first().map(|heading| heading.text.clone()))
95        .unwrap_or_else(|| "Untitled".to_string());
96
97    Ok(Document {
98        frontmatter,
99        title,
100        html,
101        headings,
102        search_text,
103    })
104}
105
106fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
107    let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
108    if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
109        return Ok((Frontmatter::default(), trimmed));
110    }
111
112    let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
113    let rest = &trimmed[body_start..];
114    for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
115        if let Some(index) = rest.find(marker) {
116            let yaml = &rest[..index];
117            let after_marker = &rest[index + marker.len()..];
118            let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
119            return Ok((frontmatter, after_marker));
120        }
121    }
122
123    anyhow::bail!("frontmatter starts with --- but has no closing marker")
124}
125
126fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
127    if frontmatter.layout.is_empty() {
128        frontmatter.layout = "doc".to_string();
129    }
130    if frontmatter.access != "public" && frontmatter.access != "masked" {
131        frontmatter.access = "public".to_string();
132    }
133}
134
135fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
136    let mut headings = Vec::new();
137    let mut current: Option<(u8, String)> = None;
138    let mut used = HashMap::<String, usize>::new();
139
140    for event in events {
141        match event {
142            Event::Start(Tag::Heading { level, .. }) => {
143                current = Some((heading_level(*level), String::new()));
144            }
145            Event::Text(text) | Event::Code(text) => {
146                if let Some((_, current_text)) = &mut current {
147                    current_text.push_str(text);
148                }
149            }
150            Event::End(TagEnd::Heading(_)) => {
151                if let Some((level, text)) = current.take() {
152                    let anchor = unique_slug(&slugify(&text), &mut used);
153                    headings.push(Heading {
154                        level,
155                        text: text.trim().to_string(),
156                        anchor,
157                    });
158                }
159            }
160            _ => {}
161        }
162    }
163
164    headings
165}
166
167fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
168    let mut out_events = Vec::with_capacity(events.len() + headings.len());
169    let mut heading_index = 0usize;
170    let mut in_code_block = false;
171    let mut code_lang: Option<String> = None;
172    let mut code_text = String::new();
173
174    for event in events {
175        match event {
176            Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
177                let anchor = headings
178                    .get(heading_index)
179                    .map(|heading| heading.anchor.as_str())
180                    .unwrap_or_default();
181                heading_index += 1;
182                out_events.push(Event::Html(CowStr::from(format!(
183                    "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
184                    heading_tag(level),
185                    escape_attr(anchor),
186                    escape_attr(anchor)
187                ))));
188            }
189            Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
190                out_events.push(Event::Html(CowStr::from(format!(
191                    "</{}>",
192                    heading_tag(level)
193                ))));
194            }
195            Event::Start(Tag::CodeBlock(kind)) => {
196                let lang = match &kind {
197                    CodeBlockKind::Fenced(value) => value
198                        .split_whitespace()
199                        .next()
200                        .filter(|value| !value.is_empty())
201                        .map(str::to_string),
202                    CodeBlockKind::Indented => None,
203                };
204
205                if options.mermaid && lang.as_deref() == Some("mermaid") {
206                    in_code_block = true;
207                    code_lang = lang;
208                    out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
209                } else {
210                    in_code_block = true;
211                    code_lang = lang;
212                    code_text.clear();
213                }
214            }
215            Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216                out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217            }
218            Event::Text(text) | Event::Code(text) if in_code_block => {
219                code_text.push_str(&text);
220            }
221            Event::End(TagEnd::CodeBlock) if in_code_block => {
222                if code_lang.as_deref() == Some("mermaid") {
223                    out_events.push(Event::Html(CowStr::from("</pre>")));
224                } else {
225                    out_events.push(Event::Html(CowStr::from(render_code_block(
226                        &code_text,
227                        code_lang.as_deref(),
228                        options.code_highlight,
229                        options.code_line_numbers,
230                    ))));
231                    code_text.clear();
232                }
233                in_code_block = false;
234                code_lang = None;
235            }
236            _ => out_events.push(event),
237        }
238    }
239
240    let mut rendered = String::new();
241    html::push_html(&mut rendered, out_events.into_iter());
242    rendered
243}
244
245fn render_code_block(
246    code: &str,
247    lang: Option<&str>,
248    highlight: bool,
249    code_line_numbers: bool,
250) -> String {
251    let code = trim_trailing_blank_lines(code);
252    let normalized_lang = lang
253        .map(normalize_code_lang)
254        .filter(|lang| !lang.is_empty());
255    let content = if highlight {
256        highlight_code(code, normalized_lang)
257    } else {
258        escape_html(code)
259    };
260    let lang_class = normalized_lang
261        .map(|lang| format!(" language-{}", escape_attr(lang)))
262        .unwrap_or_default();
263    let header = normalized_lang
264        .map(|lang| {
265            format!(
266                r#"<div class="rp-code-header"><span>{}</span></div>"#,
267                escape_html(lang)
268            )
269        })
270        .unwrap_or_default();
271
272    if code_line_numbers {
273        let line_count = LinesWithEndings::from(code).count().max(1);
274        let lines = (1..=line_count)
275            .map(|line| line.to_string())
276            .collect::<Vec<_>>()
277            .join("\n");
278        return format!(
279            r#"<div class="rp-code rp-code-line-numbers">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><span class="rp-code-lines" aria-hidden="true">{lines}</span><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
280        );
281    }
282
283    format!(
284        r#"<div class="rp-code">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
285    )
286}
287
/// Returns `code` without its trailing blank (empty or whitespace-only) lines.
///
/// Lines are split on `\n` with the terminator kept, so the newline ending
/// the last non-blank line is preserved. Blank lines in the middle of the
/// block are untouched.
fn trim_trailing_blank_lines(code: &str) -> &str {
    // Walk the lines from the end and total the bytes of the blank tail; no
    // intermediate collection is needed because the splitter is double-ended.
    let blank_tail: usize = code
        .split_inclusive('\n')
        .rev()
        .take_while(|line| line.trim().is_empty())
        .map(str::len)
        .sum();

    &code[..code.len() - blank_tail]
}
305
/// Reduces a fence language tag to the bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed,
/// and everything from the first `,` or `{` onward is dropped.
fn normalize_code_lang(lang: &str) -> &str {
    let without_prefix = lang.trim().trim_start_matches("language-");
    let name_end = without_prefix
        .find(|c: char| c == ',' || c == '{')
        .unwrap_or(without_prefix.len());
    without_prefix[..name_end].trim()
}
314
315fn highlight_code(code: &str, lang: Option<&str>) -> String {
316    let syntax = lang
317        .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
318        .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
319    let mut highlighter = HighlightLines::new(syntax, highlight_theme());
320    let mut html = String::new();
321
322    for line in LinesWithEndings::from(code) {
323        match highlighter
324            .highlight_line(line, &SYNTAX_SET)
325            .and_then(|regions| styled_line_to_highlighted_html(&regions, IncludeBackground::No))
326        {
327            Ok(line_html) => html.push_str(&line_html),
328            Err(_) => html.push_str(&escape_html(line)),
329        }
330    }
331
332    html
333}
334
/// Syntax definitions used by `highlight_code`, loaded on first use.
///
/// NOTE(review): presumably the `_newlines` loader is used because lines are
/// highlighted with their trailing `\n` (via `LinesWithEndings`) — confirm
/// against the syntect documentation.
static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
/// Default theme collection that `highlight_theme` selects from, loaded on first use.
static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
337
338fn highlight_theme() -> &'static Theme {
339    THEME_SET
340        .themes
341        .get("base16-ocean.dark")
342        .or_else(|| THEME_SET.themes.values().next())
343        .expect("syntect ships with default themes")
344}
345
346fn collect_search_text(markdown: &str, index_code: bool) -> String {
347    let mut parser_options = Options::empty();
348    parser_options.insert(Options::ENABLE_TABLES);
349    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
350    parser_options.insert(Options::ENABLE_TASKLISTS);
351
352    let mut text = String::new();
353    let mut in_code_block = false;
354
355    for event in Parser::new_ext(markdown, parser_options) {
356        match event {
357            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
358            Event::End(TagEnd::CodeBlock) => in_code_block = false,
359            Event::Text(value) | Event::Code(value) => {
360                if index_code || !in_code_block {
361                    if !text.is_empty() {
362                        text.push(' ');
363                    }
364                    text.push_str(&value);
365                }
366            }
367            _ => {}
368        }
369    }
370
371    normalize_space(&text)
372}
373
374fn heading_level(level: HeadingLevel) -> u8 {
375    match level {
376        HeadingLevel::H1 => 1,
377        HeadingLevel::H2 => 2,
378        HeadingLevel::H3 => 3,
379        HeadingLevel::H4 => 4,
380        HeadingLevel::H5 => 5,
381        HeadingLevel::H6 => 6,
382    }
383}
384
385fn heading_tag(level: HeadingLevel) -> &'static str {
386    match level {
387        HeadingLevel::H1 => "h1",
388        HeadingLevel::H2 => "h2",
389        HeadingLevel::H3 => "h3",
390        HeadingLevel::H4 => "h4",
391        HeadingLevel::H5 => "h5",
392        HeadingLevel::H6 => "h6",
393    }
394}
395
396fn slugify(text: &str) -> String {
397    static PUNCT: Lazy<Regex> =
398        Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
399    static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
400
401    let lower = text.trim().to_lowercase();
402    let without_punct = PUNCT.replace_all(&lower, "");
403    let slug = SPACE.replace_all(without_punct.trim(), "-");
404    if slug.is_empty() {
405        "section".to_string()
406    } else {
407        slug.to_string()
408    }
409}
410
/// Returns an anchor for `slug` that has not been handed out before.
///
/// The first request for a slug returns it unchanged. Later requests append
/// `-2`, `-3`, and so on, skipping any suffix that would collide with an
/// anchor already issued. For example, a literal heading slug `foo-2`
/// followed by two `foo` headings produces `foo-2`, `foo` and `foo-3`.
///
/// `used` maps each slug to the last counter consumed for it. Every anchor
/// that is returned is also recorded in `used`, so it cannot be issued twice.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    // Copy the counter out so the map borrow ends before the lookups below.
    let seen = {
        let count = used.entry(slug.to_string()).or_insert(0);
        *count += 1;
        *count
    };
    if seen == 1 {
        return slug.to_string();
    }

    let mut suffix = seen;
    loop {
        let candidate = format!("{slug}-{suffix}");
        if !used.contains_key(&candidate) {
            // Reserve the generated anchor and remember where to resume.
            used.insert(candidate.clone(), 1);
            used.insert(slug.to_string(), suffix);
            return candidate;
        }
        suffix += 1;
    }
}
420
/// Collapses every run of whitespace to one space and trims both ends.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
424
/// Escapes `&`, `<` and `>` for use in HTML text content.
///
/// Quotes are left alone; use `escape_attr` for attribute values.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
431
432fn escape_attr(input: &str) -> String {
433    escape_html(input).replace('"', "&quot;")
434}
435
// Unit tests for the public `parse_markdown` entry point and for
// `render_code_block`, which is exercised directly where exact output matters.
#[cfg(test)]
mod tests {
    use super::*;

    // Frontmatter title wins over the first heading; the heading still gets an anchor.
    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_markdown(
            "---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    // Non-ASCII letters survive slugification; only the space becomes a hyphen.
    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_markdown("# 中文 标题\n\ntext", MarkdownOptions::default()).unwrap();

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    // Mermaid fences bypass the code-block wrapper (no copy button, no gutter).
    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_markdown(
            "```mermaid\nflowchart LR\nA-->B\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        assert!(doc.html.contains("A--&gt;B"));
        assert!(!doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    // Default options: wrapper, copy button, gutter and highlighted spans are all present.
    #[test]
    fn fenced_code_has_copy_button_and_is_highlighted_with_syntect() {
        let doc = parse_markdown(
            "```rust\nfn main() {\n    println!(\"hi\");\n}\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("class=\"rp-code-copy\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("aria-label=\"Copy code\""));
        assert!(doc.html.contains("rp-code-line-numbers"));
        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\""));
        assert!(doc.html.contains("language-rust"));
        assert!(doc.html.contains("<span style="));
        assert!(doc.html.contains("println"));
    }

    // Turning line numbers off removes the gutter but keeps the copy button.
    #[test]
    fn code_line_numbers_can_be_disabled() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"hi\"); }\n```",
            MarkdownOptions {
                code_line_numbers: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    // With highlighting off the code is only HTML-escaped; the wrapper is unchanged.
    #[test]
    fn code_highlight_can_be_disabled_without_removing_copy_button() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"<hi>\"); }\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("class=\"rp-code-content language-rust\""));
        assert!(doc.html.contains("println!(\"&lt;hi&gt;\")"));
        assert!(!doc.html.contains("<span style="));
    }

    // The gutter counts lines after trailing blanks are trimmed; an empty block shows "1".
    #[test]
    fn code_line_numbers_match_multiline_trailing_and_empty_blocks() {
        let multiline = render_code_block("one\ntwo\n\n", None, false, true);
        assert!(
            multiline.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>")
        );
        assert!(multiline.contains("<code class=\"rp-code-content\">one\ntwo\n</code>"));

        let empty = render_code_block("", None, false, true);
        assert!(empty.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
    }

    // Whitespace-only trailing lines count as blank and are trimmed too.
    #[test]
    fn code_block_trims_trailing_whitespace_only_lines() {
        let html = render_code_block("one\n  \n\t\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one\n</code>"));
    }

    // Line numbers live in their own element, so the <code> content holds only the source.
    #[test]
    fn code_content_does_not_include_line_numbers() {
        let doc = parse_markdown(
            "```\nalpha\nbeta\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>"));
        assert_eq!(code_content(&doc.html), "alpha\nbeta\n");
    }

    // `index_code` defaults to false, so code-block text stays out of the search text.
    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_markdown(
            "Body\n\n```rust\nlet hidden = true;\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }

    // Test helper: returns the text between the first `rp-code-content` opening
    // tag and the following `</code>`. Panics if either is missing.
    fn code_content(html: &str) -> &str {
        let class_start = html.find("class=\"rp-code-content").unwrap();
        let content_start = class_start + html[class_start..].find('>').unwrap() + 1;
        let content_end = content_start + html[content_start..].find("</code>").unwrap();
        &html[content_start..content_end]
    }
}