1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6 html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Feature switches that control how [`parse_markdown`] renders a document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Render fenced blocks tagged `mermaid` as `<pre class="mermaid">`
    /// instead of as an ordinary code block.
    pub mermaid: bool,
    /// Run code blocks through the syntax highlighter; when off, the code is
    /// only HTML-escaped.
    pub code_highlight: bool,
    /// Emit a line-number gutter next to each code block.
    pub code_line_numbers: bool,
    /// Give every heading an `id` and a leading `#` anchor link.
    pub heading_anchors: bool,
    /// Include the text of code blocks in the document's search text.
    pub index_code: bool,
}

impl Default for MarkdownOptions {
    /// All rendering features are enabled; indexing code for search is opt-in.
    fn default() -> Self {
        const ON: bool = true;
        const OFF: bool = false;

        MarkdownOptions {
            mermaid: ON,
            code_highlight: ON,
            code_line_numbers: ON,
            heading_anchors: ON,
            index_code: OFF,
        }
    }
}
36
37#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
38#[serde(default)]
39pub struct Frontmatter {
40 pub title: Option<String>,
41 pub layout: String,
42 pub sidebar: bool,
43 pub search: bool,
44 pub access: String,
45}
46
47impl Default for Frontmatter {
48 fn default() -> Self {
49 Self {
50 title: None,
51 layout: "doc".to_string(),
52 sidebar: true,
53 search: true,
54 access: "public".to_string(),
55 }
56 }
57}
58
/// One heading found in a document, in the form used for tables of contents
/// and anchor links.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Heading {
    /// Heading depth, 1 for `<h1>` through 6 for `<h6>`.
    pub level: u8,
    /// Heading text with surrounding whitespace trimmed.
    pub text: String,
    /// Slug derived from the text and de-duplicated within the document; used
    /// as the heading's `id` when heading anchors are enabled.
    pub anchor: String,
}
65
/// The result of parsing one Markdown source with `parse_markdown`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Frontmatter after normalization (defaults applied, invalid values reset).
    pub frontmatter: Frontmatter,
    /// Frontmatter title, else the first heading's text, else `"Untitled"`.
    pub title: String,
    /// Rendered HTML for the Markdown body (frontmatter excluded).
    pub html: String,
    /// Every heading in document order.
    pub headings: Vec<Heading>,
    /// Whitespace-normalized plain text of the body, for search indexing.
    pub search_text: String,
}
74
75pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
76 let (frontmatter, markdown) = split_frontmatter(input)?;
77 let mut frontmatter = frontmatter;
78 normalize_frontmatter(&mut frontmatter);
79
80 let mut parser_options = Options::empty();
81 parser_options.insert(Options::ENABLE_TABLES);
82 parser_options.insert(Options::ENABLE_FOOTNOTES);
83 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
84 parser_options.insert(Options::ENABLE_TASKLISTS);
85 parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
86
87 let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
88 let headings = collect_headings(&events);
89 let html = render_html(events, &headings, &options);
90 let search_text = collect_search_text(markdown, options.index_code);
91 let title = frontmatter
92 .title
93 .clone()
94 .or_else(|| headings.first().map(|heading| heading.text.clone()))
95 .unwrap_or_else(|| "Untitled".to_string());
96
97 Ok(Document {
98 frontmatter,
99 title,
100 html,
101 headings,
102 search_text,
103 })
104}
105
106fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
107 let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
108 if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
109 return Ok((Frontmatter::default(), trimmed));
110 }
111
112 let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
113 let rest = &trimmed[body_start..];
114 for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
115 if let Some(index) = rest.find(marker) {
116 let yaml = &rest[..index];
117 let after_marker = &rest[index + marker.len()..];
118 let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
119 return Ok((frontmatter, after_marker));
120 }
121 }
122
123 anyhow::bail!("frontmatter starts with --- but has no closing marker")
124}
125
126fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
127 if frontmatter.layout.is_empty() {
128 frontmatter.layout = "doc".to_string();
129 }
130 if frontmatter.access != "public" && frontmatter.access != "masked" {
131 frontmatter.access = "public".to_string();
132 }
133}
134
135fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
136 let mut headings = Vec::new();
137 let mut current: Option<(u8, String)> = None;
138 let mut used = HashMap::<String, usize>::new();
139
140 for event in events {
141 match event {
142 Event::Start(Tag::Heading { level, .. }) => {
143 current = Some((heading_level(*level), String::new()));
144 }
145 Event::Text(text) | Event::Code(text) => {
146 if let Some((_, current_text)) = &mut current {
147 current_text.push_str(text);
148 }
149 }
150 Event::End(TagEnd::Heading(_)) => {
151 if let Some((level, text)) = current.take() {
152 let anchor = unique_slug(&slugify(&text), &mut used);
153 headings.push(Heading {
154 level,
155 text: text.trim().to_string(),
156 anchor,
157 });
158 }
159 }
160 _ => {}
161 }
162 }
163
164 headings
165}
166
167fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
168 let mut out_events = Vec::with_capacity(events.len() + headings.len());
169 let mut heading_index = 0usize;
170 let mut in_code_block = false;
171 let mut code_lang: Option<String> = None;
172 let mut code_text = String::new();
173
174 for event in events {
175 match event {
176 Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
177 let anchor = headings
178 .get(heading_index)
179 .map(|heading| heading.anchor.as_str())
180 .unwrap_or_default();
181 heading_index += 1;
182 out_events.push(Event::Html(CowStr::from(format!(
183 "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
184 heading_tag(level),
185 escape_attr(anchor),
186 escape_attr(anchor)
187 ))));
188 }
189 Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
190 out_events.push(Event::Html(CowStr::from(format!(
191 "</{}>",
192 heading_tag(level)
193 ))));
194 }
195 Event::Start(Tag::CodeBlock(kind)) => {
196 let lang = match &kind {
197 CodeBlockKind::Fenced(value) => value
198 .split_whitespace()
199 .next()
200 .filter(|value| !value.is_empty())
201 .map(str::to_string),
202 CodeBlockKind::Indented => None,
203 };
204
205 if options.mermaid && lang.as_deref() == Some("mermaid") {
206 in_code_block = true;
207 code_lang = lang;
208 out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
209 } else {
210 in_code_block = true;
211 code_lang = lang;
212 code_text.clear();
213 }
214 }
215 Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216 out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217 }
218 Event::Text(text) | Event::Code(text) if in_code_block => {
219 code_text.push_str(&text);
220 }
221 Event::End(TagEnd::CodeBlock) if in_code_block => {
222 if code_lang.as_deref() == Some("mermaid") {
223 out_events.push(Event::Html(CowStr::from("</pre>")));
224 } else {
225 out_events.push(Event::Html(CowStr::from(render_code_block(
226 &code_text,
227 code_lang.as_deref(),
228 options.code_highlight,
229 options.code_line_numbers,
230 ))));
231 code_text.clear();
232 }
233 in_code_block = false;
234 code_lang = None;
235 }
236 _ => out_events.push(event),
237 }
238 }
239
240 let mut rendered = String::new();
241 html::push_html(&mut rendered, out_events.into_iter());
242 rendered
243}
244
245fn render_code_block(
246 code: &str,
247 lang: Option<&str>,
248 highlight: bool,
249 code_line_numbers: bool,
250) -> String {
251 let code = trim_trailing_blank_lines(code);
252 let normalized_lang = lang
253 .map(normalize_code_lang)
254 .filter(|lang| !lang.is_empty());
255 let content = if highlight {
256 highlight_code(code, normalized_lang)
257 } else {
258 escape_html(code)
259 };
260 let lang_class = normalized_lang
261 .map(|lang| format!(" language-{}", escape_attr(lang)))
262 .unwrap_or_default();
263 let header = normalized_lang
264 .map(|lang| {
265 format!(
266 r#"<div class="rp-code-header"><span>{}</span></div>"#,
267 escape_html(lang)
268 )
269 })
270 .unwrap_or_default();
271
272 if code_line_numbers {
273 let line_count = LinesWithEndings::from(code).count().max(1);
274 let lines = (1..=line_count)
275 .map(|line| line.to_string())
276 .collect::<Vec<_>>()
277 .join("\n");
278 return format!(
279 r#"<div class="rp-code rp-code-line-numbers">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><span class="rp-code-lines" aria-hidden="true">{lines}</span><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
280 );
281 }
282
283 format!(
284 r#"<div class="rp-code">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
285 )
286}
287
/// Drops trailing whitespace-only lines from `code`.
///
/// Everything from the first line break after the last non-whitespace
/// character is removed, so trailing spaces on the last content line are
/// kept. Input with no line break after its content is returned unchanged,
/// and input that is entirely whitespace yields `""`.
fn trim_trailing_blank_lines(code: &str) -> &str {
    // Byte offset just past the last non-whitespace character.
    let content_end = code.trim_end().len();
    if content_end == 0 {
        return "";
    }

    match code[content_end..].find(|ch: char| ch == '\n' || ch == '\r') {
        Some(offset) => &code[..content_end + offset],
        None => code,
    }
}
306
/// Reduces a fence language token to a bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed,
/// and everything from the first `,` or `{` onwards is discarded.
fn normalize_code_lang(lang: &str) -> &str {
    let mut token = lang.trim();
    // Strip the prefix as many times as it repeats.
    while let Some(stripped) = token.strip_prefix("language-") {
        token = stripped;
    }

    let end = token
        .find(|ch: char| ch == ',' || ch == '{')
        .unwrap_or(token.len());
    token[..end].trim()
}
315
316fn highlight_code(code: &str, lang: Option<&str>) -> String {
317 let syntax = lang
318 .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
319 .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
320 let mut highlighter = HighlightLines::new(syntax, highlight_theme());
321 let mut html = String::new();
322
323 for line in LinesWithEndings::from(code) {
324 match highlighter
325 .highlight_line(line, &SYNTAX_SET)
326 .and_then(|regions| styled_line_to_highlighted_html(®ions, IncludeBackground::No))
327 {
328 Ok(line_html) => html.push_str(&line_html),
329 Err(_) => html.push_str(&escape_html(line)),
330 }
331 }
332
333 html
334}
335
/// Syntect's bundled syntax definitions (newline-aware variant), loaded on
/// first use.
static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
/// Syntect's bundled highlighting themes, loaded on first use.
static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
338
339fn highlight_theme() -> &'static Theme {
340 THEME_SET
341 .themes
342 .get("base16-ocean.dark")
343 .or_else(|| THEME_SET.themes.values().next())
344 .expect("syntect ships with default themes")
345}
346
347fn collect_search_text(markdown: &str, index_code: bool) -> String {
348 let mut parser_options = Options::empty();
349 parser_options.insert(Options::ENABLE_TABLES);
350 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
351 parser_options.insert(Options::ENABLE_TASKLISTS);
352
353 let mut text = String::new();
354 let mut in_code_block = false;
355
356 for event in Parser::new_ext(markdown, parser_options) {
357 match event {
358 Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
359 Event::End(TagEnd::CodeBlock) => in_code_block = false,
360 Event::Text(value) | Event::Code(value) => {
361 if index_code || !in_code_block {
362 if !text.is_empty() {
363 text.push(' ');
364 }
365 text.push_str(&value);
366 }
367 }
368 _ => {}
369 }
370 }
371
372 normalize_space(&text)
373}
374
375fn heading_level(level: HeadingLevel) -> u8 {
376 match level {
377 HeadingLevel::H1 => 1,
378 HeadingLevel::H2 => 2,
379 HeadingLevel::H3 => 3,
380 HeadingLevel::H4 => 4,
381 HeadingLevel::H5 => 5,
382 HeadingLevel::H6 => 6,
383 }
384}
385
386fn heading_tag(level: HeadingLevel) -> &'static str {
387 match level {
388 HeadingLevel::H1 => "h1",
389 HeadingLevel::H2 => "h2",
390 HeadingLevel::H3 => "h3",
391 HeadingLevel::H4 => "h4",
392 HeadingLevel::H5 => "h5",
393 HeadingLevel::H6 => "h6",
394 }
395}
396
397fn slugify(text: &str) -> String {
398 static PUNCT: Lazy<Regex> =
399 Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
400 static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
401
402 let lower = text.trim().to_lowercase();
403 let without_punct = PUNCT.replace_all(&lower, "");
404 let slug = SPACE.replace_all(without_punct.trim(), "-");
405 if slug.is_empty() {
406 "section".to_string()
407 } else {
408 slug.to_string()
409 }
410}
411
/// Returns an anchor for `slug` that has not been returned before for `used`.
///
/// The first request for a slug returns it unchanged; later requests append
/// `-2`, `-3`, and so on. A numbered candidate that is already taken — for
/// example because a heading's own slug is literally `foo-2` — is skipped, and
/// every anchor handed out is recorded in `used`, so no anchor is ever issued
/// twice.
///
/// `used` maps each slug to the highest number handed out for it and must be
/// shared across all calls for one document.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let seen = {
        let count = used.entry(slug.to_string()).or_insert(0);
        *count += 1;
        *count
    };
    if seen == 1 {
        return slug.to_string();
    }

    // Advance past any suffix that collides with an anchor already in use.
    let mut suffix = seen;
    let candidate = loop {
        let candidate = format!("{slug}-{suffix}");
        if !used.contains_key(&candidate) {
            break candidate;
        }
        suffix += 1;
    };

    // Reserve the generated anchor and remember how far this slug's numbering
    // got, so the next duplicate continues after it.
    used.insert(candidate.clone(), 1);
    used.insert(slug.to_string(), suffix);
    candidate
}
421
/// Collapses every run of whitespace in `input` to a single space and removes
/// leading and trailing whitespace.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
425
/// Escapes text for use as HTML element content.
///
/// `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`; every other
/// character, including quotes, is copied unchanged.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
432
/// Escapes text for use inside a double-quoted HTML attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`; every other character is copied unchanged.
fn escape_attr(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
436
/// Tests for Markdown parsing and rendering.
#[cfg(test)]
mod tests {
    use super::*;

    /// An explicit frontmatter title wins over the first heading, and the
    /// heading still receives its anchor id.
    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_markdown(
            "---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    /// Non-ASCII heading text is kept in the anchor.
    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_markdown("# 中文 标题\n\ntext", MarkdownOptions::default()).unwrap();

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    /// Mermaid fences bypass the code-block chrome entirely.
    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_markdown(
            "```mermaid\nflowchart LR\nA-->B\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        // The diagram source is HTML-escaped inside the <pre>.
        assert!(doc.html.contains("A--&gt;B"));
        assert!(!doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    /// Default options produce the full chrome plus inline-styled highlighting.
    #[test]
    fn fenced_code_has_copy_button_and_is_highlighted_with_syntect() {
        let doc = parse_markdown(
            "```rust\nfn main() {\n println!(\"hi\");\n}\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("class=\"rp-code-copy\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("aria-label=\"Copy code\""));
        assert!(doc.html.contains("rp-code-line-numbers"));
        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\""));
        assert!(doc.html.contains("language-rust"));
        assert!(doc.html.contains("<span style="));
        assert!(doc.html.contains("println"));
    }

    /// Turning line numbers off removes the gutter but keeps the copy button.
    #[test]
    fn code_line_numbers_can_be_disabled() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"hi\"); }\n```",
            MarkdownOptions {
                code_line_numbers: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    /// Turning highlighting off leaves escaped code inside the same chrome.
    #[test]
    fn code_highlight_can_be_disabled_without_removing_copy_button() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"<hi>\"); }\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("class=\"rp-code-content language-rust\""));
        // Angle brackets are escaped in element content; quotes are not.
        assert!(doc.html.contains("println!(\"&lt;hi&gt;\")"));
        assert!(!doc.html.contains("<span style="));
    }

    /// The gutter counts content lines only, and an empty block shows one line.
    #[test]
    fn code_line_numbers_match_multiline_trailing_and_empty_blocks() {
        let multiline = render_code_block("one\ntwo\n\n", None, false, true);
        assert!(
            multiline.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>")
        );
        assert!(multiline.contains("<code class=\"rp-code-content\">one\ntwo</code>"));

        let empty = render_code_block("", None, false, true);
        assert!(empty.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
    }

    /// Trailing lines that contain only whitespace are dropped.
    #[test]
    fn code_block_trims_trailing_whitespace_only_lines() {
        let html = render_code_block("one\n \n\t\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one</code>"));
    }

    /// Trailing spaces on the last content line survive the trimming.
    #[test]
    fn code_block_preserves_trailing_spaces_on_last_content_line() {
        let html = render_code_block("one \n\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one </code>"));
    }

    /// Line numbers live in the gutter element, never inside the code element.
    #[test]
    fn code_content_does_not_include_line_numbers() {
        let doc = parse_markdown(
            "```\nalpha\nbeta\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>"));
        assert_eq!(code_content(&doc.html), "alpha\nbeta");
    }

    /// With default options, code-block text stays out of the search text.
    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_markdown(
            "Body\n\n```rust\nlet hidden = true;\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }

    /// Returns the inner text of the first `rp-code-content` element in `html`.
    fn code_content(html: &str) -> &str {
        let class_start = html.find("class=\"rp-code-content").unwrap();
        let content_start = class_start + html[class_start..].find('>').unwrap() + 1;
        let content_end = content_start + html[content_start..].find("</code>").unwrap();
        &html[content_start..content_end]
    }
}