Skip to main content

rustpress_md/
lib.rs

1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6    html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Feature switches that control how Markdown is rendered and indexed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Emit fenced `mermaid` blocks as `<pre class="mermaid">` instead of regular code blocks.
    pub mermaid: bool,
    /// Highlight code blocks with syntect; when `false` the code is only HTML-escaped.
    pub code_highlight: bool,
    /// Render a line-number gutter next to each code block.
    pub code_line_numbers: bool,
    /// Give headings an `id` attribute and a leading `#` anchor link.
    pub heading_anchors: bool,
    /// Include the text of code blocks in `Document::search_text`.
    pub index_code: bool,
}
24
impl Default for MarkdownOptions {
    /// Enables every rendering feature; only `index_code` is off by default.
    fn default() -> Self {
        Self {
            mermaid: true,
            code_highlight: true,
            code_line_numbers: true,
            heading_anchors: true,
            index_code: false,
        }
    }
}
36
/// Page metadata read from the YAML block at the top of a document.
///
/// `#[serde(default)]` means keys missing from the YAML take their value from
/// `Frontmatter::default()`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct Frontmatter {
    /// Explicit page title; when absent, `parse_markdown` falls back to the first heading.
    pub title: Option<String>,
    /// Layout name; an empty value is normalized to `"doc"`.
    pub layout: String,
    /// Sidebar flag (defaults to `true`). Not interpreted in this file —
    /// presumably consumed by the page renderer; verify against callers.
    pub sidebar: bool,
    /// Search flag (defaults to `true`). Not interpreted in this file —
    /// presumably consumed by the search indexer; verify against callers.
    pub search: bool,
    /// Access level. Only `"public"` and `"masked"` survive normalization;
    /// any other value is replaced with `"public"`.
    pub access: String,
}
46
47impl Default for Frontmatter {
48    fn default() -> Self {
49        Self {
50            title: None,
51            layout: "doc".to_string(),
52            sidebar: true,
53            search: true,
54            access: "public".to_string(),
55        }
56    }
57}
58
/// One heading found in the document, in source order.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Heading {
    /// Heading depth, 1 for `<h1>` through 6 for `<h6>`.
    pub level: u8,
    /// Plain text of the heading, trimmed of surrounding whitespace.
    pub text: String,
    /// Slug used as the heading's `id` and as the target of its anchor link.
    pub anchor: String,
}
65
/// Result of parsing one Markdown source with `parse_markdown`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Normalized frontmatter (defaults when the source has none).
    pub frontmatter: Frontmatter,
    /// Frontmatter title, else the first heading's text, else `"Untitled"`.
    pub title: String,
    /// Rendered HTML body.
    pub html: String,
    /// All headings in source order.
    pub headings: Vec<Heading>,
    /// Whitespace-normalized plain text for search indexing.
    pub search_text: String,
}
74
75pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
76    let (frontmatter, markdown) = split_frontmatter(input)?;
77    let mut frontmatter = frontmatter;
78    normalize_frontmatter(&mut frontmatter);
79
80    let mut parser_options = Options::empty();
81    parser_options.insert(Options::ENABLE_TABLES);
82    parser_options.insert(Options::ENABLE_FOOTNOTES);
83    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
84    parser_options.insert(Options::ENABLE_TASKLISTS);
85    parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
86
87    let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
88    let headings = collect_headings(&events);
89    let html = render_html(events, &headings, &options);
90    let search_text = collect_search_text(markdown, options.index_code);
91    let title = frontmatter
92        .title
93        .clone()
94        .or_else(|| headings.first().map(|heading| heading.text.clone()))
95        .unwrap_or_else(|| "Untitled".to_string());
96
97    Ok(Document {
98        frontmatter,
99        title,
100        html,
101        headings,
102        search_text,
103    })
104}
105
106fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
107    let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
108    if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
109        return Ok((Frontmatter::default(), trimmed));
110    }
111
112    let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
113    let rest = &trimmed[body_start..];
114    for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
115        if let Some(index) = rest.find(marker) {
116            let yaml = &rest[..index];
117            let after_marker = &rest[index + marker.len()..];
118            let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
119            return Ok((frontmatter, after_marker));
120        }
121    }
122
123    anyhow::bail!("frontmatter starts with --- but has no closing marker")
124}
125
126fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
127    if frontmatter.layout.is_empty() {
128        frontmatter.layout = "doc".to_string();
129    }
130    if frontmatter.access != "public" && frontmatter.access != "masked" {
131        frontmatter.access = "public".to_string();
132    }
133}
134
135fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
136    let mut headings = Vec::new();
137    let mut current: Option<(u8, String)> = None;
138    let mut used = HashMap::<String, usize>::new();
139
140    for event in events {
141        match event {
142            Event::Start(Tag::Heading { level, .. }) => {
143                current = Some((heading_level(*level), String::new()));
144            }
145            Event::Text(text) | Event::Code(text) => {
146                if let Some((_, current_text)) = &mut current {
147                    current_text.push_str(text);
148                }
149            }
150            Event::End(TagEnd::Heading(_)) => {
151                if let Some((level, text)) = current.take() {
152                    let anchor = unique_slug(&slugify(&text), &mut used);
153                    headings.push(Heading {
154                        level,
155                        text: text.trim().to_string(),
156                        anchor,
157                    });
158                }
159            }
160            _ => {}
161        }
162    }
163
164    headings
165}
166
167fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
168    let mut out_events = Vec::with_capacity(events.len() + headings.len());
169    let mut heading_index = 0usize;
170    let mut in_code_block = false;
171    let mut code_lang: Option<String> = None;
172    let mut code_text = String::new();
173
174    for event in events {
175        match event {
176            Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
177                let anchor = headings
178                    .get(heading_index)
179                    .map(|heading| heading.anchor.as_str())
180                    .unwrap_or_default();
181                heading_index += 1;
182                out_events.push(Event::Html(CowStr::from(format!(
183                    "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
184                    heading_tag(level),
185                    escape_attr(anchor),
186                    escape_attr(anchor)
187                ))));
188            }
189            Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
190                out_events.push(Event::Html(CowStr::from(format!(
191                    "</{}>",
192                    heading_tag(level)
193                ))));
194            }
195            Event::Start(Tag::CodeBlock(kind)) => {
196                let lang = match &kind {
197                    CodeBlockKind::Fenced(value) => value
198                        .split_whitespace()
199                        .next()
200                        .filter(|value| !value.is_empty())
201                        .map(str::to_string),
202                    CodeBlockKind::Indented => None,
203                };
204
205                if options.mermaid && lang.as_deref() == Some("mermaid") {
206                    in_code_block = true;
207                    code_lang = lang;
208                    out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
209                } else {
210                    in_code_block = true;
211                    code_lang = lang;
212                    code_text.clear();
213                }
214            }
215            Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216                out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217            }
218            Event::Text(text) | Event::Code(text) if in_code_block => {
219                code_text.push_str(&text);
220            }
221            Event::End(TagEnd::CodeBlock) if in_code_block => {
222                if code_lang.as_deref() == Some("mermaid") {
223                    out_events.push(Event::Html(CowStr::from("</pre>")));
224                } else {
225                    out_events.push(Event::Html(CowStr::from(render_code_block(
226                        &code_text,
227                        code_lang.as_deref(),
228                        options.code_highlight,
229                        options.code_line_numbers,
230                    ))));
231                    code_text.clear();
232                }
233                in_code_block = false;
234                code_lang = None;
235            }
236            _ => out_events.push(event),
237        }
238    }
239
240    let mut rendered = String::new();
241    html::push_html(&mut rendered, out_events.into_iter());
242    rendered
243}
244
245fn render_code_block(
246    code: &str,
247    lang: Option<&str>,
248    highlight: bool,
249    code_line_numbers: bool,
250) -> String {
251    let code = trim_trailing_blank_lines(code);
252    let normalized_lang = lang
253        .map(normalize_code_lang)
254        .filter(|lang| !lang.is_empty());
255    let content = if highlight {
256        highlight_code(code, normalized_lang)
257    } else {
258        escape_html(code)
259    };
260    let lang_class = normalized_lang
261        .map(|lang| format!(" language-{}", escape_attr(lang)))
262        .unwrap_or_default();
263    let header = normalized_lang
264        .map(|lang| {
265            format!(
266                r#"<div class="rp-code-header"><span>{}</span></div>"#,
267                escape_html(lang)
268            )
269        })
270        .unwrap_or_default();
271
272    if code_line_numbers {
273        let line_count = LinesWithEndings::from(code).count().max(1);
274        let lines = (1..=line_count)
275            .map(|line| line.to_string())
276            .collect::<Vec<_>>()
277            .join("\n");
278        return format!(
279            r#"<div class="rp-code rp-code-line-numbers">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><span class="rp-code-lines" aria-hidden="true">{lines}</span><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
280        );
281    }
282
283    format!(
284        r#"<div class="rp-code">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
285    )
286}
287
/// Drops trailing whitespace-only lines from `code`.
///
/// The result ends at the first line break after the last non-whitespace character, so
/// trailing spaces on the final content line are kept. Input without such a line break is
/// returned unchanged, and input with no non-whitespace characters yields `""`.
fn trim_trailing_blank_lines(code: &str) -> &str {
    // Byte offset just past the last non-whitespace character.
    let content_end = code.trim_end().len();
    if content_end == 0 {
        return "";
    }

    match code[content_end..].find(['\n', '\r']) {
        Some(offset) => &code[..content_end + offset],
        None => code,
    }
}
306
/// Reduces a fence info token to a bare language name.
///
/// Strips surrounding whitespace and any leading `language-` prefixes, then keeps only the
/// part before the first `,` or `{` (e.g. `rust,ignore` or `js{1,3}`).
fn normalize_code_lang(lang: &str) -> &str {
    let stripped = lang.trim().trim_start_matches("language-");
    let name_end = stripped.find([',', '{']).unwrap_or(stripped.len());
    stripped[..name_end].trim()
}
315
316fn highlight_code(code: &str, lang: Option<&str>) -> String {
317    let syntax = lang
318        .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
319        .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
320    let mut highlighter = HighlightLines::new(syntax, highlight_theme());
321    let mut html = String::new();
322
323    for line in LinesWithEndings::from(code) {
324        match highlighter
325            .highlight_line(line, &SYNTAX_SET)
326            .and_then(|regions| styled_line_to_highlighted_html(&regions, IncludeBackground::No))
327        {
328            Ok(line_html) => html.push_str(&line_html),
329            Err(_) => html.push_str(&escape_html(line)),
330        }
331    }
332
333    html
334}
335
/// syntect's bundled syntax definitions, loaded once on first access.
/// The `_newlines` variant is used together with `LinesWithEndings` in `highlight_code`.
static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
/// syntect's bundled themes, loaded once on first access.
static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
338
339fn highlight_theme() -> &'static Theme {
340    THEME_SET
341        .themes
342        .get("base16-ocean.dark")
343        .or_else(|| THEME_SET.themes.values().next())
344        .expect("syntect ships with default themes")
345}
346
347fn collect_search_text(markdown: &str, index_code: bool) -> String {
348    let mut parser_options = Options::empty();
349    parser_options.insert(Options::ENABLE_TABLES);
350    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
351    parser_options.insert(Options::ENABLE_TASKLISTS);
352
353    let mut text = String::new();
354    let mut in_code_block = false;
355
356    for event in Parser::new_ext(markdown, parser_options) {
357        match event {
358            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
359            Event::End(TagEnd::CodeBlock) => in_code_block = false,
360            Event::Text(value) | Event::Code(value) => {
361                if index_code || !in_code_block {
362                    if !text.is_empty() {
363                        text.push(' ');
364                    }
365                    text.push_str(&value);
366                }
367            }
368            _ => {}
369        }
370    }
371
372    normalize_space(&text)
373}
374
/// Converts a parser heading level into its numeric depth (1 for `H1` … 6 for `H6`).
fn heading_level(level: HeadingLevel) -> u8 {
    match level {
        HeadingLevel::H1 => 1,
        HeadingLevel::H2 => 2,
        HeadingLevel::H3 => 3,
        HeadingLevel::H4 => 4,
        HeadingLevel::H5 => 5,
        HeadingLevel::H6 => 6,
    }
}
385
/// Returns the HTML element name (`"h1"` … `"h6"`) for a parser heading level.
fn heading_tag(level: HeadingLevel) -> &'static str {
    match level {
        HeadingLevel::H1 => "h1",
        HeadingLevel::H2 => "h2",
        HeadingLevel::H3 => "h3",
        HeadingLevel::H4 => "h4",
        HeadingLevel::H5 => "h5",
        HeadingLevel::H6 => "h6",
    }
}
396
397fn slugify(text: &str) -> String {
398    static PUNCT: Lazy<Regex> =
399        Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
400    static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
401
402    let lower = text.trim().to_lowercase();
403    let without_punct = PUNCT.replace_all(&lower, "");
404    let slug = SPACE.replace_all(without_punct.trim(), "-");
405    if slug.is_empty() {
406        "section".to_string()
407    } else {
408        slug.to_string()
409    }
410}
411
/// Returns an anchor based on `slug` that has not been handed out before.
///
/// The first request for a slug returns it unchanged; later requests return `slug-2`,
/// `slug-3`, … Every returned anchor is recorded in `used`, and suffixed candidates that
/// are already taken are skipped. A generated `foo-2` therefore can never collide with a
/// heading whose own slug is literally `foo-2`, whichever of the two comes first.
///
/// `used` maps each base slug or issued anchor to the highest suffix issued for it
/// (1 meaning "issued without a suffix").
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let mut suffix = match used.get(slug).copied() {
        Some(last_suffix) => last_suffix,
        None => {
            used.insert(slug.to_string(), 1);
            return slug.to_string();
        }
    };

    loop {
        suffix += 1;
        let candidate = format!("{slug}-{suffix}");
        if !used.contains_key(&candidate) {
            // Remember where to resume for this base slug, and reserve the candidate.
            used.insert(slug.to_string(), suffix);
            used.insert(candidate.clone(), 1);
            return candidate;
        }
    }
}
421
/// Collapses every run of whitespace in `input` into a single space and trims both ends.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::new();
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
425
/// Escapes `&`, `<` and `>` so `input` can be placed in HTML text content.
///
/// Quotes are left untouched; use `escape_attr` for attribute values.
fn escape_html(input: &str) -> String {
    let mut escaped = String::new();
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
432
433fn escape_attr(input: &str) -> String {
434    escape_html(input).replace('"', "&quot;")
435}
436
// Unit tests covering frontmatter parsing, heading anchors, mermaid and code-block
// rendering, trailing-blank-line trimming, and search-text extraction.
#[cfg(test)]
mod tests {
    use super::*;

    // Frontmatter title wins over the first heading; the heading still gets its anchor.
    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_markdown(
            "---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    // Non-ASCII letters must survive slugification.
    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_markdown("# 中文 标题\n\ntext", MarkdownOptions::default()).unwrap();

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    // Mermaid fences bypass the regular code-block wrapper entirely.
    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_markdown(
            "```mermaid\nflowchart LR\nA-->B\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        assert!(doc.html.contains("A--&gt;B"));
        assert!(!doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    // Default options: wrapper, copy button, line-number gutter and highlighted spans.
    #[test]
    fn fenced_code_has_copy_button_and_is_highlighted_with_syntect() {
        let doc = parse_markdown(
            "```rust\nfn main() {\n    println!(\"hi\");\n}\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("class=\"rp-code-copy\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("aria-label=\"Copy code\""));
        assert!(doc.html.contains("rp-code-line-numbers"));
        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\""));
        assert!(doc.html.contains("language-rust"));
        assert!(doc.html.contains("<span style="));
        assert!(doc.html.contains("println"));
    }

    // Turning off line numbers removes the gutter but keeps the copy button.
    #[test]
    fn code_line_numbers_can_be_disabled() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"hi\"); }\n```",
            MarkdownOptions {
                code_line_numbers: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    // Without highlighting the code is escaped verbatim and no styled spans appear.
    #[test]
    fn code_highlight_can_be_disabled_without_removing_copy_button() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"<hi>\"); }\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("class=\"rp-code-content language-rust\""));
        assert!(doc.html.contains("println!(\"&lt;hi&gt;\")"));
        assert!(!doc.html.contains("<span style="));
    }

    // The gutter counts lines after trailing blank lines are trimmed; empty blocks show "1".
    #[test]
    fn code_line_numbers_match_multiline_trailing_and_empty_blocks() {
        let multiline = render_code_block("one\ntwo\n\n", None, false, true);
        assert!(
            multiline.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>")
        );
        assert!(multiline.contains("<code class=\"rp-code-content\">one\ntwo</code>"));

        let empty = render_code_block("", None, false, true);
        assert!(empty.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
    }

    // Lines containing only spaces or tabs count as blank when trimming.
    #[test]
    fn code_block_trims_trailing_whitespace_only_lines() {
        let html = render_code_block("one\n  \n\t\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one</code>"));
    }

    // Trailing spaces on the last content line are part of the code and must be kept.
    #[test]
    fn code_block_preserves_trailing_spaces_on_last_content_line() {
        let html = render_code_block("one  \n\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one  </code>"));
    }

    // Line numbers live in their own span, never inside the <code> element.
    #[test]
    fn code_content_does_not_include_line_numbers() {
        let doc = parse_markdown(
            "```\nalpha\nbeta\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>"));
        assert_eq!(code_content(&doc.html), "alpha\nbeta");
    }

    // With the default `index_code: false`, code-block text stays out of the search index.
    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_markdown(
            "Body\n\n```rust\nlet hidden = true;\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }

    // Returns the inner text of the first `<code class="rp-code-content…">` element.
    // Panics if the element is missing, which fails the calling test.
    fn code_content(html: &str) -> &str {
        let class_start = html.find("class=\"rp-code-content").unwrap();
        let content_start = class_start + html[class_start..].find('>').unwrap() + 1;
        let content_end = content_start + html[content_start..].find("</code>").unwrap();
        &html[content_start..content_end]
    }
}