1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6 html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Feature switches that control how a markdown document is rendered and indexed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Render fenced blocks tagged `mermaid` as `<pre class="mermaid">` instead of code.
    pub mermaid: bool,
    /// Run code blocks through the syntax highlighter instead of only escaping them.
    pub code_highlight: bool,
    /// Emit a line-number gutter next to each rendered code block.
    pub code_line_numbers: bool,
    /// Give every heading an `id` and a clickable anchor link.
    pub heading_anchors: bool,
    /// Include the contents of code blocks in the search text.
    pub index_code: bool,
}

impl Default for MarkdownOptions {
    /// Every rendering feature is switched on; only code indexing is opt-in.
    fn default() -> Self {
        MarkdownOptions {
            // Opt-in: code blocks stay out of the search text unless requested.
            index_code: false,
            mermaid: true,
            code_highlight: true,
            code_line_numbers: true,
            heading_anchors: true,
        }
    }
}
36
37#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
38#[serde(default)]
39pub struct Frontmatter {
40 pub title: Option<String>,
41 pub layout: String,
42 pub sidebar: bool,
43 pub search: bool,
44 pub access: String,
45}
46
47impl Default for Frontmatter {
48 fn default() -> Self {
49 Self {
50 title: None,
51 layout: "doc".to_string(),
52 sidebar: true,
53 search: true,
54 access: "public".to_string(),
55 }
56 }
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
60pub struct Heading {
61 pub level: u8,
62 pub text: String,
63 pub anchor: String,
64}
65
66#[derive(Debug, Clone, PartialEq, Eq)]
67pub struct Document {
68 pub frontmatter: Frontmatter,
69 pub title: String,
70 pub html: String,
71 pub headings: Vec<Heading>,
72 pub search_text: String,
73}
74
75pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
76 let (frontmatter, markdown) = split_frontmatter(input)?;
77 let mut frontmatter = frontmatter;
78 normalize_frontmatter(&mut frontmatter);
79
80 let mut parser_options = Options::empty();
81 parser_options.insert(Options::ENABLE_TABLES);
82 parser_options.insert(Options::ENABLE_FOOTNOTES);
83 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
84 parser_options.insert(Options::ENABLE_TASKLISTS);
85 parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
86
87 let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
88 let headings = collect_headings(&events);
89 let html = render_html(events, &headings, &options);
90 let search_text = collect_search_text(markdown, options.index_code);
91 let title = frontmatter
92 .title
93 .clone()
94 .or_else(|| headings.first().map(|heading| heading.text.clone()))
95 .unwrap_or_else(|| "Untitled".to_string());
96
97 Ok(Document {
98 frontmatter,
99 title,
100 html,
101 headings,
102 search_text,
103 })
104}
105
106fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
107 let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
108 if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
109 return Ok((Frontmatter::default(), trimmed));
110 }
111
112 let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
113 let rest = &trimmed[body_start..];
114 for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
115 if let Some(index) = rest.find(marker) {
116 let yaml = &rest[..index];
117 let after_marker = &rest[index + marker.len()..];
118 let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
119 return Ok((frontmatter, after_marker));
120 }
121 }
122
123 anyhow::bail!("frontmatter starts with --- but has no closing marker")
124}
125
126fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
127 if frontmatter.layout.is_empty() {
128 frontmatter.layout = "doc".to_string();
129 }
130 if frontmatter.access != "public" && frontmatter.access != "masked" {
131 frontmatter.access = "public".to_string();
132 }
133}
134
135fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
136 let mut headings = Vec::new();
137 let mut current: Option<(u8, String)> = None;
138 let mut used = HashMap::<String, usize>::new();
139
140 for event in events {
141 match event {
142 Event::Start(Tag::Heading { level, .. }) => {
143 current = Some((heading_level(*level), String::new()));
144 }
145 Event::Text(text) | Event::Code(text) => {
146 if let Some((_, current_text)) = &mut current {
147 current_text.push_str(text);
148 }
149 }
150 Event::End(TagEnd::Heading(_)) => {
151 if let Some((level, text)) = current.take() {
152 let anchor = unique_slug(&slugify(&text), &mut used);
153 headings.push(Heading {
154 level,
155 text: text.trim().to_string(),
156 anchor,
157 });
158 }
159 }
160 _ => {}
161 }
162 }
163
164 headings
165}
166
167fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
168 let mut out_events = Vec::with_capacity(events.len() + headings.len());
169 let mut heading_index = 0usize;
170 let mut in_code_block = false;
171 let mut code_lang: Option<String> = None;
172 let mut code_text = String::new();
173
174 for event in events {
175 match event {
176 Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
177 let anchor = headings
178 .get(heading_index)
179 .map(|heading| heading.anchor.as_str())
180 .unwrap_or_default();
181 heading_index += 1;
182 out_events.push(Event::Html(CowStr::from(format!(
183 "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
184 heading_tag(level),
185 escape_attr(anchor),
186 escape_attr(anchor)
187 ))));
188 }
189 Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
190 out_events.push(Event::Html(CowStr::from(format!(
191 "</{}>",
192 heading_tag(level)
193 ))));
194 }
195 Event::Start(Tag::CodeBlock(kind)) => {
196 let lang = match &kind {
197 CodeBlockKind::Fenced(value) => value
198 .split_whitespace()
199 .next()
200 .filter(|value| !value.is_empty())
201 .map(str::to_string),
202 CodeBlockKind::Indented => None,
203 };
204
205 if options.mermaid && lang.as_deref() == Some("mermaid") {
206 in_code_block = true;
207 code_lang = lang;
208 out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
209 } else {
210 in_code_block = true;
211 code_lang = lang;
212 code_text.clear();
213 }
214 }
215 Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216 out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217 }
218 Event::Text(text) | Event::Code(text) if in_code_block => {
219 code_text.push_str(&text);
220 }
221 Event::End(TagEnd::CodeBlock) if in_code_block => {
222 if code_lang.as_deref() == Some("mermaid") {
223 out_events.push(Event::Html(CowStr::from("</pre>")));
224 } else {
225 out_events.push(Event::Html(CowStr::from(render_code_block(
226 &code_text,
227 code_lang.as_deref(),
228 options.code_highlight,
229 options.code_line_numbers,
230 ))));
231 code_text.clear();
232 }
233 in_code_block = false;
234 code_lang = None;
235 }
236 _ => out_events.push(event),
237 }
238 }
239
240 let mut rendered = String::new();
241 html::push_html(&mut rendered, out_events.into_iter());
242 rendered
243}
244
245fn render_code_block(
246 code: &str,
247 lang: Option<&str>,
248 highlight: bool,
249 code_line_numbers: bool,
250) -> String {
251 let code = trim_trailing_blank_lines(code);
252 let normalized_lang = lang
253 .map(normalize_code_lang)
254 .filter(|lang| !lang.is_empty());
255 let content = if highlight {
256 highlight_code(code, normalized_lang)
257 } else {
258 escape_html(code)
259 };
260 let lang_class = normalized_lang
261 .map(|lang| format!(" language-{}", escape_attr(lang)))
262 .unwrap_or_default();
263 let header = normalized_lang
264 .map(|lang| {
265 format!(
266 r#"<div class="rp-code-header"><span>{}</span></div>"#,
267 escape_html(lang)
268 )
269 })
270 .unwrap_or_default();
271
272 if code_line_numbers {
273 let line_count = LinesWithEndings::from(code).count().max(1);
274 let lines = (1..=line_count)
275 .map(|line| line.to_string())
276 .collect::<Vec<_>>()
277 .join("\n");
278 return format!(
279 r#"<div class="rp-code rp-code-line-numbers">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><span class="rp-code-lines" aria-hidden="true">{lines}</span><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
280 );
281 }
282
283 format!(
284 r#"<div class="rp-code">{header}<button class="rp-code-copy" type="button" data-rp-copy-code aria-label="Copy code" title="Copy code"><svg class="rp-code-copy-icon" viewBox="0 0 24 24" aria-hidden="true"><rect x="9" y="9" width="11" height="11" rx="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg><svg class="rp-code-copy-check" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 6 9 17l-5-5"></path></svg></button><pre><code class="rp-code-content{lang_class}">{content}</code></pre></div>"#
285 )
286}
287
/// Returns `code` without its trailing run of blank (whitespace-only) lines.
///
/// The line ending of the last non-blank line is kept, so `"one\n\n"` becomes
/// `"one\n"`. Blank lines in the middle of the text are untouched.
fn trim_trailing_blank_lines(code: &str) -> &str {
    // Walk the lines from the back and stop at the first one with content;
    // `split_inclusive` is double-ended, so nothing has to be collected.
    let trailing_blank_bytes: usize = code
        .split_inclusive('\n')
        .rev()
        .take_while(|line| line.trim().is_empty())
        .map(str::len)
        .sum();

    &code[..code.len() - trailing_blank_bytes]
}
305
/// Reduces a code-fence language token to the bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed,
/// and everything from the first `,` or `{` onwards is cut off
/// (`"rust,ignore"` and `"rust{1-3}"` both yield `"rust"`).
fn normalize_code_lang(lang: &str) -> &str {
    let without_prefix = lang.trim().trim_start_matches("language-");
    let name = match without_prefix.find([',', '{']) {
        Some(cut) => &without_prefix[..cut],
        None => without_prefix,
    };
    name.trim()
}
314
315fn highlight_code(code: &str, lang: Option<&str>) -> String {
316 let syntax = lang
317 .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
318 .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
319 let mut highlighter = HighlightLines::new(syntax, highlight_theme());
320 let mut html = String::new();
321
322 for line in LinesWithEndings::from(code) {
323 match highlighter
324 .highlight_line(line, &SYNTAX_SET)
325 .and_then(|regions| styled_line_to_highlighted_html(®ions, IncludeBackground::No))
326 {
327 Ok(line_html) => html.push_str(&line_html),
328 Err(_) => html.push_str(&escape_html(line)),
329 }
330 }
331
332 html
333}
334
335static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
336static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
337
338fn highlight_theme() -> &'static Theme {
339 THEME_SET
340 .themes
341 .get("base16-ocean.dark")
342 .or_else(|| THEME_SET.themes.values().next())
343 .expect("syntect ships with default themes")
344}
345
346fn collect_search_text(markdown: &str, index_code: bool) -> String {
347 let mut parser_options = Options::empty();
348 parser_options.insert(Options::ENABLE_TABLES);
349 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
350 parser_options.insert(Options::ENABLE_TASKLISTS);
351
352 let mut text = String::new();
353 let mut in_code_block = false;
354
355 for event in Parser::new_ext(markdown, parser_options) {
356 match event {
357 Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
358 Event::End(TagEnd::CodeBlock) => in_code_block = false,
359 Event::Text(value) | Event::Code(value) => {
360 if index_code || !in_code_block {
361 if !text.is_empty() {
362 text.push(' ');
363 }
364 text.push_str(&value);
365 }
366 }
367 _ => {}
368 }
369 }
370
371 normalize_space(&text)
372}
373
374fn heading_level(level: HeadingLevel) -> u8 {
375 match level {
376 HeadingLevel::H1 => 1,
377 HeadingLevel::H2 => 2,
378 HeadingLevel::H3 => 3,
379 HeadingLevel::H4 => 4,
380 HeadingLevel::H5 => 5,
381 HeadingLevel::H6 => 6,
382 }
383}
384
385fn heading_tag(level: HeadingLevel) -> &'static str {
386 match level {
387 HeadingLevel::H1 => "h1",
388 HeadingLevel::H2 => "h2",
389 HeadingLevel::H3 => "h3",
390 HeadingLevel::H4 => "h4",
391 HeadingLevel::H5 => "h5",
392 HeadingLevel::H6 => "h6",
393 }
394}
395
396fn slugify(text: &str) -> String {
397 static PUNCT: Lazy<Regex> =
398 Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
399 static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
400
401 let lower = text.trim().to_lowercase();
402 let without_punct = PUNCT.replace_all(&lower, "");
403 let slug = SPACE.replace_all(without_punct.trim(), "-");
404 if slug.is_empty() {
405 "section".to_string()
406 } else {
407 slug.to_string()
408 }
409}
410
/// Returns an anchor for `slug` that has not been handed out before.
///
/// The first request for a slug returns it unchanged; repeats get a numeric
/// suffix (`slug-2`, `slug-3`, …). Every returned anchor is recorded in
/// `used`, so a generated `foo-2` can never coincide with a heading whose own
/// slug is `foo-2` — whichever arrives second moves on to the next free name.
///
/// `used` maps each slug to the highest occurrence number consumed for it.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let mut occurrence = {
        let count = used.entry(slug.to_string()).or_insert(0);
        *count += 1;
        *count
    };
    if occurrence == 1 {
        return slug.to_string();
    }

    loop {
        let candidate = format!("{slug}-{occurrence}");
        if !used.contains_key(&candidate) {
            // Reserve the generated anchor and remember how far the numbering
            // for this slug has advanced.
            used.insert(candidate.clone(), 1);
            used.insert(slug.to_string(), occurrence);
            return candidate;
        }
        occurrence += 1;
    }
}
420
/// Collapses every run of whitespace into a single space and strips leading
/// and trailing whitespace.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
424
/// Escapes text for use as HTML element content.
///
/// `&`, `<` and `>` are replaced by their character references; every other
/// character is copied unchanged. The input is processed in a single pass, so
/// an ampersand that already belongs to an entity is escaped again
/// (`&amp;` → `&amp;amp;`), which keeps the rendered text identical to the input.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
431
/// Escapes text for use inside a double-quoted HTML attribute value.
///
/// Replaces `&`, `<`, `>` and `"` with their character references in a single
/// pass; every other character is copied unchanged.
fn escape_attr(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
435
/// Unit tests for the markdown pipeline: frontmatter handling, heading
/// anchors, mermaid blocks, the code-block widget and the search text.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_markdown(
            "---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_markdown("# 中文 标题\n\ntext", MarkdownOptions::default()).unwrap();

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_markdown(
            "```mermaid\nflowchart LR\nA-->B\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        // The diagram text is escaped, so the arrow's `>` arrives as an entity.
        assert!(doc.html.contains("A--&gt;B"));
        assert!(!doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    #[test]
    fn fenced_code_has_copy_button_and_is_highlighted_with_syntect() {
        let doc = parse_markdown(
            "```rust\nfn main() {\n println!(\"hi\");\n}\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("class=\"rp-code-copy\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("aria-label=\"Copy code\""));
        assert!(doc.html.contains("rp-code-line-numbers"));
        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\""));
        assert!(doc.html.contains("language-rust"));
        assert!(doc.html.contains("<span style="));
        assert!(doc.html.contains("println"));
    }

    #[test]
    fn code_line_numbers_can_be_disabled() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"hi\"); }\n```",
            MarkdownOptions {
                code_line_numbers: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(!doc.html.contains("rp-code-line-numbers"));
        assert!(!doc.html.contains("rp-code-lines"));
    }

    #[test]
    fn code_highlight_can_be_disabled_without_removing_copy_button() {
        let doc = parse_markdown(
            "```rust\nfn main() { println!(\"<hi>\"); }\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code rp-code-line-numbers\""));
        assert!(doc.html.contains("data-rp-copy-code"));
        assert!(doc.html.contains("class=\"rp-code-content language-rust\""));
        // Without highlighting the code is still HTML-escaped: the angle
        // brackets must arrive as entities, never as raw markup.
        assert!(doc.html.contains("println!(\"&lt;hi&gt;\")"));
        assert!(!doc.html.contains("<hi>"));
        assert!(!doc.html.contains("<span style="));
    }

    #[test]
    fn code_line_numbers_match_multiline_trailing_and_empty_blocks() {
        let multiline = render_code_block("one\ntwo\n\n", None, false, true);
        assert!(
            multiline.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>")
        );
        assert!(multiline.contains("<code class=\"rp-code-content\">one\ntwo\n</code>"));

        let empty = render_code_block("", None, false, true);
        assert!(empty.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
    }

    #[test]
    fn code_block_trims_trailing_whitespace_only_lines() {
        let html = render_code_block("one\n \n\t\n", None, false, true);

        assert!(html.contains("<span class=\"rp-code-lines\" aria-hidden=\"true\">1</span>"));
        assert!(html.contains("<code class=\"rp-code-content\">one\n</code>"));
    }

    #[test]
    fn code_content_does_not_include_line_numbers() {
        let doc = parse_markdown(
            "```\nalpha\nbeta\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(doc
            .html
            .contains("class=\"rp-code-lines\" aria-hidden=\"true\">1\n2</span>"));
        assert_eq!(code_content(&doc.html), "alpha\nbeta\n");
    }

    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_markdown(
            "Body\n\n```rust\nlet hidden = true;\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }

    /// Returns the inner text of the first `rp-code-content` element in `html`.
    fn code_content(html: &str) -> &str {
        let class_start = html.find("class=\"rp-code-content").unwrap();
        let content_start = class_start + html[class_start..].find('>').unwrap() + 1;
        let content_end = content_start + html[content_start..].find("</code>").unwrap();
        &html[content_start..content_end]
    }
}