Skip to main content

rustpress_md/
lib.rs

1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6    html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Feature switches that control how a Markdown document is rendered and indexed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Emit fenced code blocks tagged `mermaid` as `<pre class="mermaid">` elements.
    pub mermaid: bool,
    /// Render the remaining code blocks through the syntax highlighter.
    pub code_highlight: bool,
    /// Give every heading an `id` attribute and a leading `#` anchor link.
    pub heading_anchors: bool,
    /// Include the contents of code blocks in the extracted search text.
    pub index_code: bool,
}

impl Default for MarkdownOptions {
    /// Enables every rendering feature and keeps code out of the search index.
    fn default() -> Self {
        MarkdownOptions {
            index_code: false,
            heading_anchors: true,
            code_highlight: true,
            mermaid: true,
        }
    }
}
34
35#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
36#[serde(default)]
37pub struct Frontmatter {
38    pub title: Option<String>,
39    pub layout: String,
40    pub sidebar: bool,
41    pub search: bool,
42    pub access: String,
43}
44
45impl Default for Frontmatter {
46    fn default() -> Self {
47        Self {
48            title: None,
49            layout: "doc".to_string(),
50            sidebar: true,
51            search: true,
52            access: "public".to_string(),
53        }
54    }
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
58pub struct Heading {
59    pub level: u8,
60    pub text: String,
61    pub anchor: String,
62}
63
64#[derive(Debug, Clone, PartialEq, Eq)]
65pub struct Document {
66    pub frontmatter: Frontmatter,
67    pub title: String,
68    pub html: String,
69    pub headings: Vec<Heading>,
70    pub search_text: String,
71}
72
73pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
74    let (frontmatter, markdown) = split_frontmatter(input)?;
75    let mut frontmatter = frontmatter;
76    normalize_frontmatter(&mut frontmatter);
77
78    let mut parser_options = Options::empty();
79    parser_options.insert(Options::ENABLE_TABLES);
80    parser_options.insert(Options::ENABLE_FOOTNOTES);
81    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
82    parser_options.insert(Options::ENABLE_TASKLISTS);
83    parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
84
85    let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
86    let headings = collect_headings(&events);
87    let html = render_html(events, &headings, &options);
88    let search_text = collect_search_text(markdown, options.index_code);
89    let title = frontmatter
90        .title
91        .clone()
92        .or_else(|| headings.first().map(|heading| heading.text.clone()))
93        .unwrap_or_else(|| "Untitled".to_string());
94
95    Ok(Document {
96        frontmatter,
97        title,
98        html,
99        headings,
100        search_text,
101    })
102}
103
104fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
105    let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
106    if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
107        return Ok((Frontmatter::default(), trimmed));
108    }
109
110    let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
111    let rest = &trimmed[body_start..];
112    for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
113        if let Some(index) = rest.find(marker) {
114            let yaml = &rest[..index];
115            let after_marker = &rest[index + marker.len()..];
116            let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
117            return Ok((frontmatter, after_marker));
118        }
119    }
120
121    anyhow::bail!("frontmatter starts with --- but has no closing marker")
122}
123
124fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
125    if frontmatter.layout.is_empty() {
126        frontmatter.layout = "doc".to_string();
127    }
128    if frontmatter.access != "public" && frontmatter.access != "masked" {
129        frontmatter.access = "public".to_string();
130    }
131}
132
133fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
134    let mut headings = Vec::new();
135    let mut current: Option<(u8, String)> = None;
136    let mut used = HashMap::<String, usize>::new();
137
138    for event in events {
139        match event {
140            Event::Start(Tag::Heading { level, .. }) => {
141                current = Some((heading_level(*level), String::new()));
142            }
143            Event::Text(text) | Event::Code(text) => {
144                if let Some((_, current_text)) = &mut current {
145                    current_text.push_str(text);
146                }
147            }
148            Event::End(TagEnd::Heading(_)) => {
149                if let Some((level, text)) = current.take() {
150                    let anchor = unique_slug(&slugify(&text), &mut used);
151                    headings.push(Heading {
152                        level,
153                        text: text.trim().to_string(),
154                        anchor,
155                    });
156                }
157            }
158            _ => {}
159        }
160    }
161
162    headings
163}
164
165fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
166    let mut out_events = Vec::with_capacity(events.len() + headings.len());
167    let mut heading_index = 0usize;
168    let mut in_code_block = false;
169    let mut code_lang: Option<String> = None;
170    let mut code_text = String::new();
171
172    for event in events {
173        match event {
174            Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
175                let anchor = headings
176                    .get(heading_index)
177                    .map(|heading| heading.anchor.as_str())
178                    .unwrap_or_default();
179                heading_index += 1;
180                out_events.push(Event::Html(CowStr::from(format!(
181                    "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
182                    heading_tag(level),
183                    escape_attr(anchor),
184                    escape_attr(anchor)
185                ))));
186            }
187            Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
188                out_events.push(Event::Html(CowStr::from(format!(
189                    "</{}>",
190                    heading_tag(level)
191                ))));
192            }
193            Event::Start(Tag::CodeBlock(kind)) => {
194                let lang = match &kind {
195                    CodeBlockKind::Fenced(value) => value
196                        .split_whitespace()
197                        .next()
198                        .filter(|value| !value.is_empty())
199                        .map(str::to_string),
200                    CodeBlockKind::Indented => None,
201                };
202
203                if options.mermaid && lang.as_deref() == Some("mermaid") {
204                    in_code_block = true;
205                    code_lang = lang;
206                    out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
207                } else if options.code_highlight {
208                    in_code_block = true;
209                    code_lang = lang;
210                    code_text.clear();
211                } else {
212                    out_events.push(Event::Start(Tag::CodeBlock(kind)));
213                }
214            }
215            Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216                out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217            }
218            Event::Text(text) | Event::Code(text) if in_code_block => {
219                code_text.push_str(&text);
220            }
221            Event::End(TagEnd::CodeBlock) if in_code_block => {
222                if code_lang.as_deref() == Some("mermaid") {
223                    out_events.push(Event::Html(CowStr::from("</pre>")));
224                } else {
225                    out_events.push(Event::Html(CowStr::from(render_code_block(
226                        &code_text,
227                        code_lang.as_deref(),
228                    ))));
229                    code_text.clear();
230                }
231                in_code_block = false;
232                code_lang = None;
233            }
234            _ => out_events.push(event),
235        }
236    }
237
238    let mut rendered = String::new();
239    html::push_html(&mut rendered, out_events.into_iter());
240    rendered
241}
242
243fn render_code_block(code: &str, lang: Option<&str>) -> String {
244    let normalized_lang = lang
245        .map(normalize_code_lang)
246        .filter(|lang| !lang.is_empty());
247    let highlighted = highlight_code(code, normalized_lang);
248    let lang_class = normalized_lang
249        .map(|lang| format!(" language-{}", escape_attr(lang)))
250        .unwrap_or_default();
251    let header = normalized_lang
252        .map(|lang| {
253            format!(
254                r#"<div class="rp-code-header"><span>{}</span></div>"#,
255                escape_html(lang)
256            )
257        })
258        .unwrap_or_default();
259
260    format!(
261        r#"<div class="rp-code">{header}<pre><code class="rp-code-content{lang_class}">{highlighted}</code></pre></div>"#
262    )
263}
264
/// Reduces a code-fence language tag to the bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed,
/// and everything from the first `,` or `{` onwards is dropped.
fn normalize_code_lang(lang: &str) -> &str {
    let stripped = lang.trim().trim_start_matches("language-");
    let end = stripped.find([',', '{']).unwrap_or(stripped.len());
    stripped[..end].trim()
}
273
274fn highlight_code(code: &str, lang: Option<&str>) -> String {
275    let syntax = lang
276        .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
277        .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
278    let mut highlighter = HighlightLines::new(syntax, highlight_theme());
279    let mut html = String::new();
280
281    for line in LinesWithEndings::from(code) {
282        match highlighter
283            .highlight_line(line, &SYNTAX_SET)
284            .and_then(|regions| styled_line_to_highlighted_html(&regions, IncludeBackground::No))
285        {
286            Ok(line_html) => html.push_str(&line_html),
287            Err(_) => html.push_str(&escape_html(line)),
288        }
289    }
290
291    html
292}
293
294static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
295static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
296
297fn highlight_theme() -> &'static Theme {
298    THEME_SET
299        .themes
300        .get("base16-ocean.dark")
301        .or_else(|| THEME_SET.themes.values().next())
302        .expect("syntect ships with default themes")
303}
304
305fn collect_search_text(markdown: &str, index_code: bool) -> String {
306    let mut parser_options = Options::empty();
307    parser_options.insert(Options::ENABLE_TABLES);
308    parser_options.insert(Options::ENABLE_STRIKETHROUGH);
309    parser_options.insert(Options::ENABLE_TASKLISTS);
310
311    let mut text = String::new();
312    let mut in_code_block = false;
313
314    for event in Parser::new_ext(markdown, parser_options) {
315        match event {
316            Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
317            Event::End(TagEnd::CodeBlock) => in_code_block = false,
318            Event::Text(value) | Event::Code(value) => {
319                if index_code || !in_code_block {
320                    if !text.is_empty() {
321                        text.push(' ');
322                    }
323                    text.push_str(&value);
324                }
325            }
326            _ => {}
327        }
328    }
329
330    normalize_space(&text)
331}
332
333fn heading_level(level: HeadingLevel) -> u8 {
334    match level {
335        HeadingLevel::H1 => 1,
336        HeadingLevel::H2 => 2,
337        HeadingLevel::H3 => 3,
338        HeadingLevel::H4 => 4,
339        HeadingLevel::H5 => 5,
340        HeadingLevel::H6 => 6,
341    }
342}
343
344fn heading_tag(level: HeadingLevel) -> &'static str {
345    match level {
346        HeadingLevel::H1 => "h1",
347        HeadingLevel::H2 => "h2",
348        HeadingLevel::H3 => "h3",
349        HeadingLevel::H4 => "h4",
350        HeadingLevel::H5 => "h5",
351        HeadingLevel::H6 => "h6",
352    }
353}
354
355fn slugify(text: &str) -> String {
356    static PUNCT: Lazy<Regex> =
357        Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
358    static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
359
360    let lower = text.trim().to_lowercase();
361    let without_punct = PUNCT.replace_all(&lower, "");
362    let slug = SPACE.replace_all(without_punct.trim(), "-");
363    if slug.is_empty() {
364        "section".to_string()
365    } else {
366        slug.to_string()
367    }
368}
369
/// Returns an anchor for `slug` that has not been handed out before.
///
/// The first request for a slug returns it unchanged. Later requests append a
/// numeric suffix starting at `-2`. A suffixed candidate already present in
/// `used` — because a heading literally slugs to it, or because it was
/// generated earlier — is skipped, so two calls never return the same anchor.
///
/// `used` maps each slug to the highest count or suffix consumed for it, and
/// also records every generated anchor.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let seen = {
        let count = used.entry(slug.to_string()).or_insert(0);
        *count += 1;
        *count
    };
    if seen == 1 {
        return slug.to_string();
    }

    // Advance the suffix past any anchor that is already taken.
    let mut suffix = seen;
    let anchor = loop {
        let candidate = format!("{slug}-{suffix}");
        if !used.contains_key(&candidate) {
            break candidate;
        }
        suffix += 1;
    };

    // Remember how far this slug's numbering got, and reserve the new anchor.
    used.insert(slug.to_string(), suffix);
    used.insert(anchor.clone(), 1);
    anchor
}
379
/// Collapses every run of whitespace to one space and trims both ends.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
383
/// Escapes `&`, `<` and `>` so the text can be placed in HTML element content.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
390
/// Escapes `&`, `<`, `>` and `"` so the text can be placed in a double-quoted
/// HTML attribute value.
fn escape_attr(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
394
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` with the default options, panicking when parsing fails.
    fn parse_default(input: &str) -> Document {
        parse_markdown(input, MarkdownOptions::default()).unwrap()
    }

    // The frontmatter title wins over the first heading, which is still rendered.
    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_default("---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody");

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    // Non-ASCII letters survive slugification.
    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_default("# 中文 标题\n\ntext");

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    // Mermaid fences become a dedicated <pre> with escaped source.
    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_default("```mermaid\nflowchart LR\nA-->B\n```");

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        assert!(doc.html.contains("A--&gt;B"));
    }

    // Other fences go through the highlighter and the rp-code wrapper.
    #[test]
    fn fenced_code_is_highlighted_with_syntect() {
        let doc = parse_default("```rust\nfn main() {\n    println!(\"hi\");\n}\n```");

        for expected in [
            "class=\"rp-code\"",
            "language-rust",
            "<span style=",
            "println",
        ] {
            assert!(doc.html.contains(expected));
        }
    }

    // With highlighting off, the parser's stock code-block markup is used.
    #[test]
    fn code_highlight_can_be_disabled() {
        let options = MarkdownOptions {
            code_highlight: false,
            ..MarkdownOptions::default()
        };
        let doc = parse_markdown("```rust\nfn main() {}\n```", options).unwrap();

        assert!(!doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("<pre><code class=\"language-rust\">"));
    }

    // Code-block contents stay out of the search text unless index_code is set.
    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_default("Body\n\n```rust\nlet hidden = true;\n```");

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }
}