1use std::collections::HashMap;
2
3use anyhow::{Context, Result};
4use once_cell::sync::Lazy;
5use pulldown_cmark::{
6 html, CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd,
7};
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use syntect::easy::HighlightLines;
11use syntect::highlighting::{Theme, ThemeSet};
12use syntect::html::{styled_line_to_highlighted_html, IncludeBackground};
13use syntect::parsing::SyntaxSet;
14use syntect::util::LinesWithEndings;
15
/// Rendering switches accepted by [`parse_markdown`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Emit fenced code blocks tagged `mermaid` as `<pre class="mermaid">` instead of
    /// treating them as ordinary code.
    pub mermaid: bool,
    /// Render code blocks through the syntect highlighter (the `rp-code` markup).
    pub code_highlight: bool,
    /// Give every rendered heading an `id` attribute and a leading `#` anchor link.
    pub heading_anchors: bool,
    /// Include the text of code blocks in `Document::search_text`.
    pub index_code: bool,
}
23
24impl Default for MarkdownOptions {
25 fn default() -> Self {
26 Self {
27 mermaid: true,
28 code_highlight: true,
29 heading_anchors: true,
30 index_code: false,
31 }
32 }
33}
34
/// Per-page metadata read from the YAML frontmatter block.
///
/// `#[serde(default)]` lets any field be omitted; omitted fields take the value provided
/// by the `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct Frontmatter {
    /// Explicit page title; when absent, `parse_markdown` falls back to the first heading.
    pub title: Option<String>,
    /// Layout name. Defaults to `"doc"`; an empty value is reset to `"doc"` when normalized.
    pub layout: String,
    /// Defaults to `true`. Presumably toggles the sidebar for the page; it is not read by
    /// the code visible here.
    pub sidebar: bool,
    /// Defaults to `true`. Presumably opts the page in or out of search; it is not read by
    /// the code visible here.
    pub search: bool,
    /// Access level: `"public"` (default) or `"masked"`. Any other value is reset to
    /// `"public"` when normalized.
    pub access: String,
}
44
45impl Default for Frontmatter {
46 fn default() -> Self {
47 Self {
48 title: None,
49 layout: "doc".to_string(),
50 sidebar: true,
51 search: true,
52 access: "public".to_string(),
53 }
54 }
55}
56
/// One heading found in the document, in source order.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Heading {
    /// Heading depth, 1 (`h1`) through 6 (`h6`).
    pub level: u8,
    /// Concatenated text and inline-code content of the heading, trimmed.
    pub text: String,
    /// Slug used as the heading's `id`, made unique within the document.
    pub anchor: String,
}
63
/// Result of [`parse_markdown`]: rendered output plus the metadata extracted on the way.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Frontmatter after normalization (defaults when the input has no frontmatter block).
    pub frontmatter: Frontmatter,
    /// Frontmatter title, else the first heading's text, else `"Untitled"`.
    pub title: String,
    /// Rendered HTML of the Markdown body.
    pub html: String,
    /// Every heading of the body, in source order.
    pub headings: Vec<Heading>,
    /// Whitespace-normalized plain text of the body, intended for search indexing.
    pub search_text: String,
}
72
73pub fn parse_markdown(input: &str, options: MarkdownOptions) -> Result<Document> {
74 let (frontmatter, markdown) = split_frontmatter(input)?;
75 let mut frontmatter = frontmatter;
76 normalize_frontmatter(&mut frontmatter);
77
78 let mut parser_options = Options::empty();
79 parser_options.insert(Options::ENABLE_TABLES);
80 parser_options.insert(Options::ENABLE_FOOTNOTES);
81 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
82 parser_options.insert(Options::ENABLE_TASKLISTS);
83 parser_options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
84
85 let events: Vec<Event<'_>> = Parser::new_ext(markdown, parser_options).collect();
86 let headings = collect_headings(&events);
87 let html = render_html(events, &headings, &options);
88 let search_text = collect_search_text(markdown, options.index_code);
89 let title = frontmatter
90 .title
91 .clone()
92 .or_else(|| headings.first().map(|heading| heading.text.clone()))
93 .unwrap_or_else(|| "Untitled".to_string());
94
95 Ok(Document {
96 frontmatter,
97 title,
98 html,
99 headings,
100 search_text,
101 })
102}
103
104fn split_frontmatter(input: &str) -> Result<(Frontmatter, &str)> {
105 let trimmed = input.strip_prefix('\u{feff}').unwrap_or(input);
106 if !trimmed.starts_with("---\n") && !trimmed.starts_with("---\r\n") {
107 return Ok((Frontmatter::default(), trimmed));
108 }
109
110 let body_start = if trimmed.starts_with("---\r\n") { 5 } else { 4 };
111 let rest = &trimmed[body_start..];
112 for marker in ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"] {
113 if let Some(index) = rest.find(marker) {
114 let yaml = &rest[..index];
115 let after_marker = &rest[index + marker.len()..];
116 let frontmatter = serde_yaml::from_str(yaml).context("failed to parse frontmatter")?;
117 return Ok((frontmatter, after_marker));
118 }
119 }
120
121 anyhow::bail!("frontmatter starts with --- but has no closing marker")
122}
123
124fn normalize_frontmatter(frontmatter: &mut Frontmatter) {
125 if frontmatter.layout.is_empty() {
126 frontmatter.layout = "doc".to_string();
127 }
128 if frontmatter.access != "public" && frontmatter.access != "masked" {
129 frontmatter.access = "public".to_string();
130 }
131}
132
133fn collect_headings(events: &[Event<'_>]) -> Vec<Heading> {
134 let mut headings = Vec::new();
135 let mut current: Option<(u8, String)> = None;
136 let mut used = HashMap::<String, usize>::new();
137
138 for event in events {
139 match event {
140 Event::Start(Tag::Heading { level, .. }) => {
141 current = Some((heading_level(*level), String::new()));
142 }
143 Event::Text(text) | Event::Code(text) => {
144 if let Some((_, current_text)) = &mut current {
145 current_text.push_str(text);
146 }
147 }
148 Event::End(TagEnd::Heading(_)) => {
149 if let Some((level, text)) = current.take() {
150 let anchor = unique_slug(&slugify(&text), &mut used);
151 headings.push(Heading {
152 level,
153 text: text.trim().to_string(),
154 anchor,
155 });
156 }
157 }
158 _ => {}
159 }
160 }
161
162 headings
163}
164
165fn render_html(events: Vec<Event<'_>>, headings: &[Heading], options: &MarkdownOptions) -> String {
166 let mut out_events = Vec::with_capacity(events.len() + headings.len());
167 let mut heading_index = 0usize;
168 let mut in_code_block = false;
169 let mut code_lang: Option<String> = None;
170 let mut code_text = String::new();
171
172 for event in events {
173 match event {
174 Event::Start(Tag::Heading { level, .. }) if options.heading_anchors => {
175 let anchor = headings
176 .get(heading_index)
177 .map(|heading| heading.anchor.as_str())
178 .unwrap_or_default();
179 heading_index += 1;
180 out_events.push(Event::Html(CowStr::from(format!(
181 "<{} id=\"{}\"><a class=\"heading-anchor\" href=\"#{}\" aria-label=\"Link to section\">#</a>",
182 heading_tag(level),
183 escape_attr(anchor),
184 escape_attr(anchor)
185 ))));
186 }
187 Event::End(TagEnd::Heading(level)) if options.heading_anchors => {
188 out_events.push(Event::Html(CowStr::from(format!(
189 "</{}>",
190 heading_tag(level)
191 ))));
192 }
193 Event::Start(Tag::CodeBlock(kind)) => {
194 let lang = match &kind {
195 CodeBlockKind::Fenced(value) => value
196 .split_whitespace()
197 .next()
198 .filter(|value| !value.is_empty())
199 .map(str::to_string),
200 CodeBlockKind::Indented => None,
201 };
202
203 if options.mermaid && lang.as_deref() == Some("mermaid") {
204 in_code_block = true;
205 code_lang = lang;
206 out_events.push(Event::Html(CowStr::from("<pre class=\"mermaid\">")));
207 } else if options.code_highlight {
208 in_code_block = true;
209 code_lang = lang;
210 code_text.clear();
211 } else {
212 out_events.push(Event::Start(Tag::CodeBlock(kind)));
213 }
214 }
215 Event::Text(text) if in_code_block && code_lang.as_deref() == Some("mermaid") => {
216 out_events.push(Event::Html(CowStr::from(escape_html(&text))));
217 }
218 Event::Text(text) | Event::Code(text) if in_code_block => {
219 code_text.push_str(&text);
220 }
221 Event::End(TagEnd::CodeBlock) if in_code_block => {
222 if code_lang.as_deref() == Some("mermaid") {
223 out_events.push(Event::Html(CowStr::from("</pre>")));
224 } else {
225 out_events.push(Event::Html(CowStr::from(render_code_block(
226 &code_text,
227 code_lang.as_deref(),
228 ))));
229 code_text.clear();
230 }
231 in_code_block = false;
232 code_lang = None;
233 }
234 _ => out_events.push(event),
235 }
236 }
237
238 let mut rendered = String::new();
239 html::push_html(&mut rendered, out_events.into_iter());
240 rendered
241}
242
243fn render_code_block(code: &str, lang: Option<&str>) -> String {
244 let normalized_lang = lang
245 .map(normalize_code_lang)
246 .filter(|lang| !lang.is_empty());
247 let highlighted = highlight_code(code, normalized_lang);
248 let lang_class = normalized_lang
249 .map(|lang| format!(" language-{}", escape_attr(lang)))
250 .unwrap_or_default();
251 let header = normalized_lang
252 .map(|lang| {
253 format!(
254 r#"<div class="rp-code-header"><span>{}</span></div>"#,
255 escape_html(lang)
256 )
257 })
258 .unwrap_or_default();
259
260 format!(
261 r#"<div class="rp-code">{header}<pre><code class="rp-code-content{lang_class}">{highlighted}</code></pre></div>"#
262 )
263}
264
/// Reduces a fence info token to a bare language name.
///
/// Surrounding whitespace and any leading `language-` prefixes are removed, and everything
/// from the first `,` or `{` onwards is dropped. The result may be empty.
fn normalize_code_lang(lang: &str) -> &str {
    let stripped = lang.trim().trim_start_matches("language-");
    let end = stripped
        .find(|c: char| matches!(c, ',' | '{'))
        .unwrap_or(stripped.len());
    stripped[..end].trim()
}
273
274fn highlight_code(code: &str, lang: Option<&str>) -> String {
275 let syntax = lang
276 .and_then(|lang| SYNTAX_SET.find_syntax_by_token(lang))
277 .unwrap_or_else(|| SYNTAX_SET.find_syntax_plain_text());
278 let mut highlighter = HighlightLines::new(syntax, highlight_theme());
279 let mut html = String::new();
280
281 for line in LinesWithEndings::from(code) {
282 match highlighter
283 .highlight_line(line, &SYNTAX_SET)
284 .and_then(|regions| styled_line_to_highlighted_html(®ions, IncludeBackground::No))
285 {
286 Ok(line_html) => html.push_str(&line_html),
287 Err(_) => html.push_str(&escape_html(line)),
288 }
289 }
290
291 html
292}
293
/// syntect's bundled syntax definitions (newline-aware variant), loaded on first use.
static SYNTAX_SET: Lazy<SyntaxSet> = Lazy::new(SyntaxSet::load_defaults_newlines);
/// syntect's bundled highlighting themes, loaded on first use.
static THEME_SET: Lazy<ThemeSet> = Lazy::new(ThemeSet::load_defaults);
296
297fn highlight_theme() -> &'static Theme {
298 THEME_SET
299 .themes
300 .get("base16-ocean.dark")
301 .or_else(|| THEME_SET.themes.values().next())
302 .expect("syntect ships with default themes")
303}
304
305fn collect_search_text(markdown: &str, index_code: bool) -> String {
306 let mut parser_options = Options::empty();
307 parser_options.insert(Options::ENABLE_TABLES);
308 parser_options.insert(Options::ENABLE_STRIKETHROUGH);
309 parser_options.insert(Options::ENABLE_TASKLISTS);
310
311 let mut text = String::new();
312 let mut in_code_block = false;
313
314 for event in Parser::new_ext(markdown, parser_options) {
315 match event {
316 Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
317 Event::End(TagEnd::CodeBlock) => in_code_block = false,
318 Event::Text(value) | Event::Code(value) => {
319 if index_code || !in_code_block {
320 if !text.is_empty() {
321 text.push(' ');
322 }
323 text.push_str(&value);
324 }
325 }
326 _ => {}
327 }
328 }
329
330 normalize_space(&text)
331}
332
333fn heading_level(level: HeadingLevel) -> u8 {
334 match level {
335 HeadingLevel::H1 => 1,
336 HeadingLevel::H2 => 2,
337 HeadingLevel::H3 => 3,
338 HeadingLevel::H4 => 4,
339 HeadingLevel::H5 => 5,
340 HeadingLevel::H6 => 6,
341 }
342}
343
344fn heading_tag(level: HeadingLevel) -> &'static str {
345 match level {
346 HeadingLevel::H1 => "h1",
347 HeadingLevel::H2 => "h2",
348 HeadingLevel::H3 => "h3",
349 HeadingLevel::H4 => "h4",
350 HeadingLevel::H5 => "h5",
351 HeadingLevel::H6 => "h6",
352 }
353}
354
355fn slugify(text: &str) -> String {
356 static PUNCT: Lazy<Regex> =
357 Lazy::new(|| Regex::new(r"[^\p{Alphabetic}\p{Number}_\-\s]+").unwrap());
358 static SPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_]+").unwrap());
359
360 let lower = text.trim().to_lowercase();
361 let without_punct = PUNCT.replace_all(&lower, "");
362 let slug = SPACE.replace_all(without_punct.trim(), "-");
363 if slug.is_empty() {
364 "section".to_string()
365 } else {
366 slug.to_string()
367 }
368}
369
/// Returns an anchor based on `slug` that has not been handed out before.
///
/// The first use of a slug returns it unchanged; later uses return `slug-2`, `slug-3`, …
/// `used` maps each base slug to the highest counter tried for it, and also records every
/// returned anchor, so a generated `slug-N` can never coincide with an anchor issued
/// earlier or later for a heading whose own slug is literally `slug-N`.
fn unique_slug(slug: &str, used: &mut HashMap<String, usize>) -> String {
    let mut count = used.get(slug).copied().unwrap_or(0);
    let anchor = loop {
        count += 1;
        if count == 1 {
            // No counter recorded for `slug` means it was never issued, as a base or a suffix.
            break slug.to_string();
        }
        let candidate = format!("{slug}-{count}");
        if !used.contains_key(&candidate) {
            break candidate;
        }
    };

    used.insert(slug.to_string(), count);
    if count > 1 {
        // Reserve the suffixed form so a heading that slugs to it is de-duplicated too.
        used.insert(anchor.clone(), 1);
    }
    anchor
}
379
/// Collapses every run of whitespace to one space and drops leading/trailing whitespace.
fn normalize_space(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
383
/// Escapes `&`, `<` and `>` so `input` can be placed in HTML text content.
///
/// Quotes are left alone; use `escape_attr` for attribute values.
fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
390
/// Escapes `&`, `<`, `>` and `"` so `input` can be placed in a double-quoted HTML attribute.
///
/// Done in a single pass and without relying on other helpers, so the ampersands of the
/// entities it emits are never escaped a second time.
fn escape_attr(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
394
#[cfg(test)]
mod tests {
    use super::*;

    /// The frontmatter title wins over the first heading, and headings still get anchors.
    #[test]
    fn parses_frontmatter_and_title() {
        let doc = parse_markdown(
            "---\ntitle: Page Title\naccess: masked\n---\n# Ignored\nBody",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert_eq!(doc.title, "Page Title");
        assert_eq!(doc.frontmatter.access, "masked");
        assert!(doc.html.contains("id=\"ignored\""));
    }

    /// Non-ASCII letters survive slugging; only the whitespace becomes a hyphen.
    #[test]
    fn chinese_heading_anchor_is_preserved() {
        let doc = parse_markdown("# 中文 标题\n\ntext", MarkdownOptions::default()).unwrap();

        assert_eq!(doc.headings[0].anchor, "中文-标题");
        assert!(doc.html.contains("id=\"中文-标题\""));
    }

    /// A `mermaid` fence is emitted as a `<pre class="mermaid">` carrying the diagram source.
    #[test]
    fn mermaid_code_block_becomes_mermaid_pre() {
        let doc = parse_markdown(
            "```mermaid\nflowchart LR\nA-->B\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("<pre class=\"mermaid\">"));
        // The diagram text goes through HTML escaping, so the arrow's `>` is expected as an
        // entity; the raw form is accepted as well so this test pins the diagram content,
        // not the escaping.
        assert!(doc.html.contains("A--&gt;B") || doc.html.contains("A-->B"));
    }

    /// Highlighted blocks use the `rp-code` wrapper, a language class and inline-styled spans.
    #[test]
    fn fenced_code_is_highlighted_with_syntect() {
        let doc = parse_markdown(
            "```rust\nfn main() {\n println!(\"hi\");\n}\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("language-rust"));
        assert!(doc.html.contains("<span style="));
        assert!(doc.html.contains("println"));
    }

    /// With highlighting off, the parser's own `<pre><code>` markup is used.
    #[test]
    fn code_highlight_can_be_disabled() {
        let doc = parse_markdown(
            "```rust\nfn main() {}\n```",
            MarkdownOptions {
                code_highlight: false,
                ..MarkdownOptions::default()
            },
        )
        .unwrap();

        assert!(!doc.html.contains("class=\"rp-code\""));
        assert!(doc.html.contains("<pre><code class=\"language-rust\">"));
    }

    /// `index_code` defaults to off, so code block content stays out of the search text.
    #[test]
    fn code_is_excluded_from_search_by_default() {
        let doc = parse_markdown(
            "Body\n\n```rust\nlet hidden = true;\n```",
            MarkdownOptions::default(),
        )
        .unwrap();

        assert!(doc.search_text.contains("Body"));
        assert!(!doc.search_text.contains("hidden"));
    }
}