Skip to main content

rustpress_md/
lib.rs

1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6    html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Rendering switches accepted by [`parse_markdown`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Render fenced `mermaid` blocks as `<pre class="mermaid">` instead of
    /// as regular code blocks.
    pub mermaid: bool,
    /// Run code blocks through the syntax highlighter; when `false` the code
    /// is only HTML-escaped.
    pub code_highlight: bool,
    /// Emit a line-number gutter next to code blocks.
    pub code_line_numbers: bool,
    /// Give every heading an `id` and a leading anchor link.
    pub heading_anchors: bool,
    /// Include the contents of code blocks in the search text.
    pub index_code: bool,
}

impl Default for MarkdownOptions {
    /// Everything is switched on except indexing of code blocks.
    fn default() -> Self {
        MarkdownOptions {
            index_code: false,
            mermaid: true,
            code_highlight: true,
            code_line_numbers: true,
            heading_anchors: true,
        }
    }
}
36
37#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
38#[serde(default)]
39pub struct Frontmatter {
40    pub title: Option<String>,
41    pub layout: String,
42    pub sidebar: bool,
43    pub search: bool,
44    pub access: String,
45}
46
47impl Default for Frontmatter {
48    fn default() -> Self {
49        Self {
50            title: None,
51            layout: "doc".to_string(),
52            sidebar: true,
53            search: true,
54            access: "public".to_string(),
55        }
56    }
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
60pub struct Heading {
61    pub level: u8,
62    pub text: String,
63    pub anchor: String,
64}
65
66#[derive(Debug, Clone, PartialEq, Eq)]
67pub struct Document {
68    pub frontmatter: Frontmatter,
69    pub title: String,
70    pub html: String,
71    pub headings: Vec<Heading>,
72    pub search_text: String,
73}
74
75pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
76    let (frontmatter, markdown) = split_frontmatter(input)?;
77    let mut frontmatter = frontmatter;
78    normalize_frontmatter(&mut frontmatter);
79
80    let mut parser_options = Options::empty();
81    parser_options.insert(Options::ENABLE_TABLES);
82    parser_options.insert(Options::ENABLE_FOOTNOTES);
83    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
84    parser_options.insert(Options::ENABLE_TASKLISTS);
85    parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
86
87    let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
88    let headings = collect_headings(&events);
89    let html = render_html(events, &headings, &options);
90    let search_text = collect_search_text(markdown, options.index_code);
91    let title = frontmatter
92        .title
93        .clone()
94        .or_else(|| headings.first().map(|heading| heading.text.clone()))
95        .unwrap_or_else(|| "Untitled".to_string());
96
97    Ok(Document {
98        frontmatter,
99        title,
100        html,
101        headings,
102        search_text,
103    })
104}
105
106fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
107    let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
108    if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
109        return Ok((Frontmatter::default(), trimmed));
110    }
111
112    let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
113    let rest = &trimmed[body_start..];
114    for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
115        if let Some(index) = rest.find(marker) {
116            let yaml = &rest[..index];
117            let after_marker = &rest[index + marker.len()..];
118            let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
119            return Ok((frontmatter, after_marker));
120        }
121    }
122
123    anyhow::bail!("frontmatter starts with --- but has no closing marker")
124}
125
126fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
127    if frontmatter.layout.is_empty() {
128        frontmatter.layout = "doc".to_string();
129    }
130    if frontmatter.access != "public" && frontmatter.access != "masked" {
131        frontmatter.access = "public".to_string();
132    }
133}
134
135fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
136    let mut headings = Vec::new();
137    let mut current: Option<(u8, String)> = None;
138    let mut used = HashMap::<String, usize>::new();
139
140    for event in events {
141        match event {
142            Event::Start(Tag::Heading { level, .. }) => {
143                current = Some((heading_level(*level), String::new()));
144            }
145            Event::Text(text) | Event::Code(text) => {
146                if let Some((_, current_text)) = &mut current {
147                    current_text.push_str(text);
148                }
149            }
150            Event::End(TagEnd::Heading(_)) => {
151                if let Some((level, text)) = current.take() {
152                    let anchor = unique_slug(&slugify(&text), &mut used);
153                    headings.push(Heading {
154                        level,
155                        text: text.trim().to_string(),
156                        anchor,
157                    });
158                }
159            }
160            _ => {}
161        }
162    }
163
164    headings
165}
166
167fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
168    let mut out_events = Vec::with_capacity(events.len() + headings.len());
169    let mut heading_index = 0usize;
170    let mut in_code_block = false;
171    let mut code_lang: Option<String> = None;
172    let mut code_text = String::new();
173
174    for event in events {
175        match event {
176            Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
177                let anchor = headings
178                    .get(heading_index)
179                    .map(|heading| heading.anchor.as_str())
180                    .unwrap_or_default();
181                heading_index += 1;
182                out_events.push(Event::Html(CowStr::from(format!(
183                    "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
184                    heading_tag(level),
185                    escape_attr(anchor),
186                    escape_attr(anchor)
187                ))));
188            }
189            Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
190                out_events.push(Event::Html(CowStr::from(format!(
191                    "</{}>",
192                    heading_tag(level)
193                ))));
194            }
195            Event::Start(Tag::CodeBlock(kind)) => {
196                let lang = match &kind {
197                    CodeBlockKind::Fenced(value) => value
198                        .split_whitespace()
199                        .next()
200                        .filter(|value| !value.is_empty())
201                        .map(str::to_string),
202                    CodeBlockKind::Indented => None,
203                };
204
205                if options.mermaid && lang.as_deref() == Some("mermaid") {
206                    in_code_block = true;
207                    code_lang = lang;
208                    out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
209                } else {
210                    in_code_block = true;
211                    code_lang = lang;
212                    code_text.clear();
213                }
214            }
215            Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216                out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217            }
218            Event::Text(text) | Event::Code(text) if in_code_block => {
219                code_text.push_str(&text);
220            }
221            Event::End(TagEnd::CodeBlock) if in_code_block => {
222                if code_lang.as_deref() == Some("mermaid") {
223                    out_events.push(Event::Html(CowStr::from("</pre>")));
224                } else {
225                    out_events.push(Event::Html(CowStr::from(render_code_block(
226                        &code_text,
227                        code_lang.as_deref(),
228                        options.code_highlight,
229                        options.code_line_numbers,
230                    ))));
231                    code_text.clear();
232                }
233                in_code_block = false;
234                code_lang = None;
235            }
236            _ => out_events.push(event),
237        }
238    }
239
240    let mut rendered = String::new();
241    html::push_html(&mut rendered, out_events.into_iter());
242    rendered
243}
244
245fn render_code_block(
246    code: &str,
247    lang: Option<&str>,
248    highlight: bool,
249    code_line_numbers: bool,
250) -> String {
251    let normalized_lang = lang
252        .map(normalize_code_lang)
253        .filter(|lang| !lang.is_empty());
254    let content = if highlight {
255        highlight_code(code, normalized_lang)
256    } else {
257        escape_html(code)
258    };
259    let lang_class = normalized_lang
260        .map(|lang| format!(" language-{}", escape_attr(lang)))
261        .unwrap_or_default();
262    let header = normalized_lang
263        .map(|lang| {
264            format!(
265                r#"<div class="rp-code-header"><span>{}</span></div>"#,
266                escape_html(lang)
267            )
268        })
269        .unwrap_or_default();
270
271    if code_line_numbers {
272        let line_count = LinesWithEndings::from(code).count().max(1);
273        let lines = (1..=line_count)
274            .map(|line| line.to_string())
275            .collect::<Vec<_>>()
276            .join("\n");
277        return format!(
278            r#"<div class="rp-code rp-code-line-numbers">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><span class="rp-code-lines" aria-hidden="true">{lines}</span><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
279        );
280    }
281
282    format!(
283        r#"<div class="rp-code">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
284    )
285}
286
/// Reduces a fence language token to a bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed,
/// then everything from the first `,` or `{` onward is dropped.
fn normalize_code_lang(lang: &str) -> &str {
    let without_prefix = lang.trim().trim_start_matches("language-");
    let cut = without_prefix
        .find(|ch: char| matches!(ch, ',' | '{'))
        .unwrap_or(without_prefix.len());
    without_prefix[..cut].trim()
}
295
296fn highlight_code(code: &str, lang: Option<&str>) -> String {
297    let syntax = lang
298        .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
299        .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
300    let mut highlighter = HighlightLines::new(syntax, highlight_theme());
301    let mut html = String::new();
302
303    for line in LinesWithEndings::from(code) {
304        match highlighter
305            .highlight_line(line, &SYNTAX_SET)
306            .and_then(|regions| styled_line_to_highlighted_html(&regions, IncludeBackground::No))
307        {
308            Ok(line_html) => html.push_str(&line_html),
309            Err(_) => html.push_str(&escape_html(line)),
310        }
311    }
312
313    html
314}
315
316static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
317static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
318
319fn highlight_theme() -> &'static Theme {
320    THEME_SET
321        .themes
322        .get("base16-ocean.dark")
323        .or_else(|| THEME_SET.themes.values().next())
324        .expect("syntect ships with default themes")
325}
326
327fn collect_search_text(markdown: &str, index_code: bool) -> String {
328    let mut parser_options = Options::empty();
329    parser_options.insert(Options::ENABLE_TABLES);
330    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
331    parser_options.insert(Options::ENABLE_TASKLISTS);
332
333    let mut text = String::new();
334    let mut in_code_block = false;
335
336    for event in Parser::new_ext(markdown, parser_options) {
337        match event {
338            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
339            Event::End(TagEnd::CodeBlock) => in_code_block = false,
340            Event::Text(value) | Event::Code(value) => {
341                if index_code || !in_code_block {
342                    if !text.is_empty() {
343                        text.push(' ');
344                    }
345                    text.push_str(&value);
346                }
347            }
348            _ => {}
349        }
350    }
351
352    normalize_space(&text)
353}
354
355fn heading_level(level: HeadingLevel) -> u8 {
356    match level {
357        HeadingLevel::H1 => 1,
358        HeadingLevel::H2 => 2,
359        HeadingLevel::H3 => 3,
360        HeadingLevel::H4 => 4,
361        HeadingLevel::H5 => 5,
362        HeadingLevel::H6 => 6,
363    }
364}
365
366fn heading_tag(level: HeadingLevel) -> &'static str {
367    match level {
368        HeadingLevel::H1 => "h1",
369        HeadingLevel::H2 => "h2",
370        HeadingLevel::H3 => "h3",
371        HeadingLevel::H4 => "h4",
372        HeadingLevel::H5 => "h5",
373        HeadingLevel::H6 => "h6",
374    }
375}
376
377fn slugify(text: &str) -> String {
378    static PUNCT: Lazy<Regex> =
379        Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
380    static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
381
382    let lower = text.trim().to_lowercase();
383    let without_punct = PUNCT.replace_all(&lower, "");
384    let slug = SPACE.replace_all(without_punct.trim(), "-");
385    if slug.is_empty() {
386        "section".to_string()
387    } else {
388        slug.to_string()
389    }
390}
391
/// Returns an anchor based on `slug` that has not been returned before for
/// the same `used` map.
///
/// The first request for a slug returns it unchanged. Later requests append
/// `-N`, starting from the occurrence count and skipping any value that is
/// already taken. Every returned anchor is itself recorded in `used`, so a
/// generated `foo-2` can never clash with a heading whose own slug is
/// `foo-2`.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let occurrences = {
        let count = used.entry(slug.to_string()).or_insert(0);
        *count += 1;
        *count
    };
    if occurrences == 1 {
        return slug.to_string();
    }

    let mut suffix = occurrences;
    loop {
        let candidate = format!("{slug}-{suffix}");
        if !used.contains_key(&candidate) {
            // Reserve the generated anchor and remember how far the suffix
            // advanced so the next duplicate continues from there.
            used.insert(candidate.clone(), 1);
            if let Some(count) = used.get_mut(slug) {
                *count = suffix;
            }
            return candidate;
        }
        suffix += 1;
    }
}
401
/// Collapses every run of whitespace in `input` to a single space and drops
/// leading and trailing whitespace.
fn normalize_space(input: &str) -> String {
    let mut words = input.split_whitespace();
    let mut normalized = String::with_capacity(input.len());

    if let Some(first) = words.next() {
        normalized.push_str(first);
    }
    for word in words {
        normalized.push(' ');
        normalized.push_str(word);
    }

    normalized
}
405
/// Escapes `&`, `<` and `>` so `input` can be placed in HTML text content.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
412
413fn escape_attr(input: &str) -> String {
414    escape_html(input).replace('"', "&quot;")
415}
416
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` with the given options, panicking on failure.
    fn parse_with(input: &str, options: MarkdownOptions) -> Document {
        parse_markdown(input, options).unwrap()
    }

    /// Parses `input` with the default options, panicking on failure.
    fn parse(input: &str) -> Document {
        parse_with(input, MarkdownOptions::default())
    }

    /// Returns the text between the `rp-code-content` `<code>` tag and its
    /// closing `</code>`.
    fn code_content(html: &str) -> &str {
        let class_start = html.find("class=\"rp-code-content").unwrap();
        let content_start = class_start + html[class_start..].find('>').unwrap() + 1;
        let content_end = content_start + html[content_start..].find("</code>").unwrap();
        &html[content_start..content_end]
    }

    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse("---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody");

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse("# 中文 标题\n\ntext");

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let html = parse("```mermaid\nflowchart LR\nA-->B\n```").html;

        assert!(html.contains("<pre class=\"mermaid\">"));
        assert!(html.contains("A--&gt;B"));
        assert!(!html.contains("data-rp-copy-code"));
        assert!(!html.contains("rp-code-line-numbers"));
        assert!(!html.contains("rp-code-lines"));
    }

    #[test]
    fn fenced_code_has_copy_button_and_is_highlighted_with_syntect() {
        let html = parse("```rust\nfn main() {\n    println!(\"hi\");\n}\n```").html;

        assert!(html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(html.contains("class=\"rp-code-copy\""));
        assert!(html.contains("data-rp-copy-code"));
        assert!(html.contains("aria-label=\"Copy code\""));
        assert!(html.contains("rp-code-line-numbers"));
        assert!(html.contains("class=\"rp-code-lines\" aria-hidden=\"true\""));
        assert!(html.contains("language-rust"));
        assert!(html.contains("<span style="));
        assert!(html.contains("println"));
    }

    #[test]
    fn code_line_numbers_can_be_disabled() {
        let options = MarkdownOptions {
            code_line_numbers: false,
            ..MarkdownOptions::default()
        };
        let html = parse_with("```rust\nfn main() { println!(\"hi\"); }\n```", options).html;

        assert!(html.contains("class=\"rp-code\""));
        assert!(html.contains("data-rp-copy-code"));
        assert!(!html.contains("rp-code-line-numbers"));
        assert!(!html.contains("rp-code-lines"));
    }

    #[test]
    fn code_highlight_can_be_disabled_without_removing_copy_button() {
        let options = MarkdownOptions {
            code_highlight: false,
            ..MarkdownOptions::default()
        };
        let html = parse_with("```rust\nfn main() { println!(\"<hi>\"); }\n```", options).html;

        assert!(html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(html.contains("data-rp-copy-code"));
        assert!(html.contains("class=\"rp-code-content language-rust\""));
        assert!(html.contains("println!(\"&lt;hi&gt;\")"));
        assert!(!html.contains("<span style="));
    }

    #[test]
    fn code_line_numbers_match_multiline_trailing_and_empty_blocks() {
        let multiline = render_code_block("one\ntwo\n\n", None, false, true);
        assert!(
            multiline.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1\n2\n3</span>")
        );

        let empty = render_code_block("", None, false, true);
        assert!(empty.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
    }

    #[test]
    fn code_content_does_not_include_line_numbers() {
        let options = MarkdownOptions {
            code_highlight: false,
            ..MarkdownOptions::default()
        };
        let html = parse_with("```\nalpha\nbeta\n```", options).html;

        assert!(html.contains("class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>"));
        assert_eq!(code_content(&html), "alpha\nbeta\n");
    }

    #[test]
    fn code_is_excluded_from_search_by_default() {
        let search_text = parse("Body\n\n```rust\nlet hidden = true;\n```").search_text;

        assert!(search_text.contains("Body"));
        assert!(!search_text.contains("hidden"));
    }
}