use comrak::{markdown_to_html, ComrakOptions};
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(feature = "python")]
pub mod python;
#[cfg(feature = "wasm")]
pub mod wasm;
/// Result of parsing a Markdown document with [`Document::parse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
/// Rendered HTML of the document body.
pub html: String,
/// Key/value pairs read from the leading `---` front matter block; empty when none is found.
pub metadata: HashMap<String, String>,
/// Table of contents as a Markdown bullet list with one link per heading.
pub toc: String,
/// Headings collected from the document body, in document order.
pub headings: Vec<Heading>,
/// Estimated reading time, never less than 1 (presumably minutes — TODO confirm unit).
pub reading_time: usize,
}
/// A single heading collected from the Markdown body.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
/// Heading text without the leading `#` markers.
pub text: String,
/// Number of leading `#` characters on the heading line (1 to 6).
pub level: usize,
/// Anchor id derived from `text`, made unique within the document.
pub id: String,
}
impl Document {
    /// Parses `markdown` into a [`Document`].
    ///
    /// The front matter is split off first and the headings are collected from
    /// the remaining body. The body is then rendered by comrak with the
    /// extensions enabled below, and the heading anchors in the rendered HTML
    /// are rewritten to use the ids computed for `headings`.
    pub fn parse(markdown: &str) -> Self {
        let (metadata, body) = extract_frontmatter(markdown);
        let headings = extract_headings(&body);

        let mut render_options = ComrakOptions::default();
        {
            let ext = &mut render_options.extension;
            ext.strikethrough = true;
            ext.tagfilter = true;
            ext.table = true;
            ext.autolink = true;
            ext.tasklist = true;
            ext.footnotes = true;
            ext.description_lists = true;
            ext.front_matter_delimiter = Some(String::from("---"));
            // An empty prefix still makes comrak emit heading anchors.
            ext.header_ids = Some(String::new());
        }

        let prepared = add_heading_anchors(&body, &headings);
        let rendered = markdown_to_html(&prepared, &render_options);

        Self {
            html: rewrite_heading_ids(&rendered, &headings),
            toc: generate_toc(&headings),
            reading_time: calculate_reading_time(&body),
            metadata,
            headings,
        }
    }

    /// Convenience wrapper: parses `markdown` and returns only the rendered HTML.
    pub fn to_html(markdown: &str) -> String {
        let Document { html, .. } = Self::parse(markdown);
        html
    }
}
/// Splits a leading YAML front matter block off `markdown`.
///
/// Returns `(metadata, body)`. The front matter must start on the very first
/// line with `---` and end with a line consisting of `---`; the closing
/// delimiter may be followed either by the body or by the end of the input.
/// When no front matter is found the metadata is empty and the body is the
/// whole input.
///
/// The YAML must be a mapping with string keys; if it cannot be parsed that
/// way the metadata is empty (the front matter is still removed from the
/// body). String, number and boolean values are stored as plain text.
///
/// NOTE(review): every other value kind (null, sequence, mapping, ...) is
/// stored using its `Debug` representation — confirm whether callers rely on
/// that format before changing it.
fn extract_frontmatter(markdown: &str) -> (HashMap<String, String>, String) {
    // `(?s)` lets `.` span lines. The trailing group is optional so that a
    // document ending right after the closing `---` is still recognised.
    let re = Regex::new(r"(?s)^---\s*\n(.*?)\n---\s*(?:\n(.*))?$").unwrap();

    let Some(caps) = re.captures(markdown) else {
        return (HashMap::new(), markdown.to_string());
    };

    let yaml_str = caps.get(1).map_or("", |m| m.as_str());
    // No second group means nothing follows the front matter: empty body.
    let content = caps.get(2).map_or("", |m| m.as_str());

    // Best effort: malformed YAML yields empty metadata rather than an error.
    let raw: HashMap<String, serde_yaml::Value> =
        serde_yaml::from_str(yaml_str).unwrap_or_default();

    let metadata: HashMap<String, String> = raw
        .into_iter()
        .map(|(key, value)| {
            let text = match value {
                serde_yaml::Value::String(s) => s,
                serde_yaml::Value::Number(n) => n.to_string(),
                serde_yaml::Value::Bool(b) => b.to_string(),
                other => format!("{:?}", other),
            };
            (key, text)
        })
        .collect();

    (metadata, content.to_string())
}
/// Collects the ATX headings (`#` to `######`) of `markdown` in document order.
///
/// Lines inside fenced code blocks are ignored. A fence is closed only by a
/// line that uses the same marker character as the opening fence, is at least
/// as long, and carries no trailing text, so a `~~~` line inside a backtick
/// fence (or the reverse) is treated as code rather than as a fence.
///
/// Each heading gets an anchor id derived from its text and made unique within
/// the returned list.
fn extract_headings(markdown: &str) -> Vec<Heading> {
    let heading_re = Regex::new(r"^(#{1,6})\s+(.+)$").unwrap();
    let mut headings = Vec::new();
    let mut used_ids: HashMap<String, usize> = HashMap::new();
    // Marker character and run length of the fence that is currently open.
    let mut open_fence: Option<(char, usize)> = None;

    for line in markdown.lines() {
        if let Some((marker, run_len, rest)) = fence_marker_run(line.trim_start()) {
            match open_fence {
                Some((open_marker, open_len)) => {
                    if marker == open_marker && run_len >= open_len && rest.trim().is_empty() {
                        open_fence = None;
                    }
                    // Closing fence or fence-like code line: never a heading.
                    continue;
                }
                None => {
                    // The info string of a backtick fence may not contain backticks;
                    // such a line is inline code, not a fence opener.
                    if marker == '~' || !rest.contains('`') {
                        open_fence = Some((marker, run_len));
                        continue;
                    }
                }
            }
        }
        if open_fence.is_some() {
            continue;
        }

        if let Some(caps) = heading_re.captures(line) {
            let level = caps[1].len();
            let text = strip_closing_hashes(&caps[2]).to_string();
            let id = unique_anchor_id(generate_anchor_id(&text), &mut used_ids);
            headings.push(Heading { text, level, id });
        }
    }

    headings
}

/// Returns the fence marker, the length of the marker run and the rest of the
/// line when `line` starts with at least three backticks or three tildes.
fn fence_marker_run(line: &str) -> Option<(char, usize, &str)> {
    let marker = line.chars().next().filter(|&c| matches!(c, '`' | '~'))?;
    let run_len = line.chars().take_while(|&c| c == marker).count();
    // Both markers are single-byte ASCII, so the char count is also a byte offset.
    (run_len >= 3).then(|| (marker, run_len, &line[run_len..]))
}

/// Trims heading text and removes an optional closing run of `#` characters.
///
/// The run is removed only when it is preceded by whitespace or makes up the
/// whole text, so headings such as `C#` keep their trailing `#`.
fn strip_closing_hashes(raw: &str) -> &str {
    let text = raw.trim();
    let stem = text.trim_end_matches('#');
    if stem.len() == text.len() {
        return text;
    }
    if stem.is_empty() || stem.ends_with(char::is_whitespace) {
        stem.trim_end()
    } else {
        text
    }
}
fn generate_anchor_id(text: &str) -> String {
let re = Regex::new(r"<[^>]+>").unwrap();
let text = re.replace_all(text, "");
let mut id = String::new();
for c in text.chars() {
match c {
'ー' | '〜' | '~' => continue,
'あ' | 'ア' => id.push('a'),
'い' | 'イ' => id.push('i'),
'う' | 'ウ' => id.push('u'),
'え' | 'エ' => id.push('e'),
'お' | 'オ' => id.push('o'),
'か' | 'カ' => id.push_str("ka"),
'き' | 'キ' => id.push_str("ki"),
'く' | 'ク' => id.push_str("ku"),
'け' | 'ケ' => id.push_str("ke"),
'こ' | 'コ' => id.push_str("ko"),
'さ' | 'サ' => id.push_str("sa"),
'し' | 'シ' => id.push_str("shi"),
'す' | 'ス' => id.push_str("su"),
'せ' | 'セ' => id.push_str("se"),
'そ' | 'ソ' => id.push_str("so"),
'た' | 'タ' => id.push_str("ta"),
'ち' | 'チ' => id.push_str("chi"),
'つ' | 'ツ' => id.push_str("tsu"),
'て' | 'テ' => id.push_str("te"),
'と' | 'ト' => id.push_str("to"),
'な' | 'ナ' => id.push_str("na"),
'に' | 'ニ' => id.push_str("ni"),
'ぬ' | 'ヌ' => id.push_str("nu"),
'ね' | 'ネ' => id.push_str("ne"),
'の' | 'ノ' => id.push_str("no"),
'は' | 'ハ' => id.push_str("ha"),
'ひ' | 'ヒ' => id.push_str("hi"),
'ふ' | 'フ' => id.push_str("fu"),
'へ' | 'ヘ' => id.push_str("he"),
'ほ' | 'ホ' => id.push_str("ho"),
'ま' | 'マ' => id.push_str("ma"),
'み' | 'ミ' => id.push_str("mi"),
'む' | 'ム' => id.push_str("mu"),
'め' | 'メ' => id.push_str("me"),
'も' | 'モ' => id.push_str("mo"),
'や' | 'ヤ' => id.push_str("ya"),
'ゆ' | 'ユ' => id.push_str("yu"),
'よ' | 'ヨ' => id.push_str("yo"),
'ら' | 'ラ' => id.push_str("ra"),
'り' | 'リ' => id.push_str("ri"),
'る' | 'ル' => id.push_str("ru"),
'れ' | 'レ' => id.push_str("re"),
'ろ' | 'ロ' => id.push_str("ro"),
'わ' | 'ワ' => id.push_str("wa"),
'を' | 'ヲ' => id.push_str("wo"),
'ん' | 'ン' => id.push('n'),
'が' | 'ガ' => id.push_str("ga"),
'ぎ' | 'ギ' => id.push_str("gi"),
'ぐ' | 'グ' => id.push_str("gu"),
'げ' | 'ゲ' => id.push_str("ge"),
'ご' | 'ゴ' => id.push_str("go"),
'ざ' | 'ザ' => id.push_str("za"),
'じ' | 'ジ' => id.push_str("ji"),
'ず' | 'ズ' => id.push_str("zu"),
'ぜ' | 'ゼ' => id.push_str("ze"),
'ぞ' | 'ゾ' => id.push_str("zo"),
'だ' | 'ダ' => id.push_str("da"),
'ぢ' | 'ヂ' => id.push_str("di"),
'づ' | 'ヅ' => id.push_str("du"),
'で' | 'デ' => id.push_str("de"),
'ど' | 'ド' => id.push_str("do"),
'ば' | 'バ' => id.push_str("ba"),
'び' | 'ビ' => id.push_str("bi"),
'ぶ' | 'ブ' => id.push_str("bu"),
'べ' | 'ベ' => id.push_str("be"),
'ぼ' | 'ボ' => id.push_str("bo"),
'ぱ' | 'パ' => id.push_str("pa"),
'ぴ' | 'ピ' => id.push_str("pi"),
'ぷ' | 'プ' => id.push_str("pu"),
'ぺ' | 'ペ' => id.push_str("pe"),
'ぽ' | 'ポ' => id.push_str("po"),
' ' | ' ' => id.push('-'),
c if c.is_alphanumeric() && c.is_ascii() => id.push(c.to_ascii_lowercase()),
c if matches!(c, '\u{4E00}'..='\u{9FFF}') => {
match c {
'方' => id.push_str("hou"),
'法' => id.push_str("hou"),
_ => {} }
}
_ => {} }
}
let re = Regex::new(r"-+").unwrap();
let id = re.replace_all(&id, "-");
id.trim_matches('-').to_string()
}
/// Returns an anchor id based on `base_id` that has not been issued before.
///
/// An empty `base_id` is replaced by `"heading"`. The first use of a base id
/// returns it unchanged; later uses append `-2`, `-3`, ... Every id that is
/// returned is also reserved in `used_ids`, so a suffixed id can never clash
/// with a heading whose own base id looks the same (for example the sequence
/// `foo`, `foo`, `foo-2` yields `foo`, `foo-2`, `foo-2-2`).
///
/// `used_ids` maps each known id to the highest counter tried for it and must
/// be shared across all headings of one document.
fn unique_anchor_id(base_id: String, used_ids: &mut HashMap<String, usize>) -> String {
    let base_id = if base_id.is_empty() {
        "heading".to_string()
    } else {
        base_id
    };

    let mut count = used_ids.get(&base_id).copied().unwrap_or(0);
    let id = loop {
        count += 1;
        if count == 1 {
            // The base id is not in the map, so it has never been issued.
            break base_id.clone();
        }
        let candidate = format!("{base_id}-{count}");
        // Skip suffixes that were already handed out under another base id.
        if !used_ids.contains_key(&candidate) {
            break candidate;
        }
    };

    used_ids.insert(base_id, count);
    // Reserve the issued id itself so later headings cannot reuse it.
    used_ids.entry(id.clone()).or_insert(1);
    id
}
/// Pre-render hook for heading anchors.
///
/// Currently a pass-through: the content is returned unchanged and the
/// headings are not consulted.
fn add_heading_anchors(content: &str, _headings: &[Heading]) -> String {
    String::from(content)
}
fn rewrite_heading_ids(html: &str, headings: &[Heading]) -> String {
let re = Regex::new(
r##"(?s)<h([1-6])><a href="#[^"]*" aria-hidden="true" class="anchor" id="[^"]*"></a>(.*?)</h[1-6]>"##,
)
.unwrap();
let mut index = 0;
re.replace_all(html, |caps: ®ex::Captures| {
let level = caps.get(1).map_or("", |m| m.as_str());
let body = caps.get(2).map_or("", |m| m.as_str());
let id = headings
.get(index)
.map(|heading| heading.id.as_str())
.unwrap_or_default();
index += 1;
format!(
"<h{level}><a href=\"#{}\" aria-hidden=\"true\" class=\"anchor\" id=\"{}\"></a>{body}</h{level}>",
escape_html_attr(id),
escape_html_attr(id)
)
})
.to_string()
}
/// Renders `headings` as a nested Markdown bullet list of links.
///
/// Each entry has the form `- [text](#id)` followed by a newline. Entries are
/// indented by two spaces per nesting depth, which is the indentation a
/// sub-list needs to nest under a `- ` item. The depth of a heading is the
/// number of preceding headings it is nested under (each with a smaller
/// level), not its raw level, so a document that starts at `##` or skips
/// levels still produces a well-formed list.
fn generate_toc(headings: &[Heading]) -> String {
    let mut toc = String::new();
    // Levels of the headings the current entry is nested under.
    let mut ancestors: Vec<usize> = Vec::new();

    for heading in headings {
        while ancestors
            .last()
            .is_some_and(|&level| level >= heading.level)
        {
            ancestors.pop();
        }

        toc.push_str(&"  ".repeat(ancestors.len()));
        toc.push_str("- [");
        toc.push_str(&escape_markdown_link_text(&heading.text));
        toc.push_str("](#");
        toc.push_str(&heading.id);
        toc.push_str(")\n");

        ancestors.push(heading.level);
    }

    toc
}
/// Estimates the reading time of `markdown`, with a minimum of 1.
///
/// Backtick-fenced code blocks and the markup characters
/// `# * ` [ ] ( ) !` are ignored. Japanese characters (see
/// `is_japanese_char`) are counted individually at 400 per unit of time;
/// the remaining text is split into ASCII alphanumeric words counted at 200
/// per unit of time. Both contributions are added before rounding up, so a
/// short mixed-language text is not rounded up twice.
fn calculate_reading_time(markdown: &str) -> usize {
    let fenced_code = Regex::new(r"```[\s\S]*?```").unwrap();
    let text = fenced_code.replace_all(markdown, "");

    let mut japanese_chars: usize = 0;
    let mut latin_text = String::with_capacity(text.len());
    for c in text.chars() {
        // Markup characters are removed outright (not treated as separators).
        if matches!(c, '#' | '*' | '`' | '[' | ']' | '(' | ')' | '!') {
            continue;
        }
        if is_japanese_char(c) {
            japanese_chars += 1;
            // Keep a separator so the surrounding Latin words stay apart.
            latin_text.push(' ');
        } else {
            latin_text.push(c);
        }
    }

    let words = latin_text
        .split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|word| !word.is_empty())
        .count();

    // chars / 400 + words / 200 == (chars + 2 * words) / 400, rounded up once.
    (japanese_chars + 2 * words).div_ceil(400).max(1)
}
/// Backslash-escapes `[`, `]` and `\` so that `input` can be used as the text
/// part of a Markdown link. All other characters are copied unchanged.
fn escape_markdown_link_text(input: &str) -> String {
    input
        .chars()
        .fold(String::with_capacity(input.len()), |mut out, ch| {
            if ch == '[' || ch == ']' || ch == '\\' {
                out.push('\\');
            }
            out.push(ch);
            out
        })
}
/// Escapes `input` for use inside a double-quoted HTML attribute value.
///
/// `&`, `"`, `<` and `>` are replaced by their named character references;
/// every other character is copied unchanged.
fn escape_html_attr(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for c in input.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            _ => escaped.push(c),
        }
    }
    escaped
}
/// Returns `true` when `c` lies in one of the Unicode ranges
/// U+3040..=U+309F, U+30A0..=U+30FF, U+4E00..=U+9FFF or U+FF66..=U+FF9F.
fn is_japanese_char(c: char) -> bool {
    const RANGES: [(char, char); 4] = [
        ('\u{3040}', '\u{309F}'),
        ('\u{30A0}', '\u{30FF}'),
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{FF66}', '\u{FF9F}'),
    ];
    RANGES.iter().any(|&(low, high)| (low..=high).contains(&c))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading and a paragraph both reach the rendered HTML.
    #[test]
    fn test_parse_basic() {
        let doc = Document::parse("# Hello\n\nWorld");
        assert!(doc.html.contains("<h1"));
        assert!(doc.html.contains("World"));
    }

    /// String values in the front matter are exposed through `metadata`.
    #[test]
    fn test_parse_with_frontmatter() {
        let markdown = r#"---
title: Test Article
author: Taro
---
# Content
"#;
        let doc = Document::parse(markdown);
        assert_eq!(doc.metadata.get("title").unwrap(), "Test Article");
        assert_eq!(doc.metadata.get("author").unwrap(), "Taro");
    }

    /// Heading levels follow the number of leading `#` characters.
    #[test]
    fn test_heading_extraction() {
        let doc = Document::parse("# H1\n## H2\n### H3");
        assert_eq!(doc.headings.len(), 3);
        assert_eq!(doc.headings[0].level, 1);
        assert_eq!(doc.headings[1].level, 2);
        assert_eq!(doc.headings[2].level, 3);
    }

    /// `#` lines inside backtick or tilde fences are not headings.
    #[test]
    fn test_heading_extraction_ignores_fenced_code() {
        let markdown = r#"# Title
```markdown
# Not a heading
```
~~~markdown
## Also not a heading
~~~
## Section
"#;
        let doc = Document::parse(markdown);
        assert_eq!(doc.headings.len(), 2);
        assert_eq!(doc.headings[0].text, "Title");
        assert_eq!(doc.headings[1].text, "Section");
        assert!(!doc.toc.contains("Not a heading"));
    }

    /// Repeated heading text gets `-2`, `-3`, ... suffixes in ids, HTML and TOC.
    #[test]
    fn test_duplicate_heading_ids_are_unique() {
        let doc = Document::parse("# はじめに\n## はじめに\n### はじめに");
        assert_eq!(doc.headings[0].id, "hajimeni");
        assert_eq!(doc.headings[1].id, "hajimeni-2");
        assert_eq!(doc.headings[2].id, "hajimeni-3");
        assert!(doc.html.contains(r#"id="hajimeni-2""#));
        assert!(doc.toc.contains(" - [はじめに](#hajimeni-2)"));
    }

    /// Text that cannot be romanized falls back to the id `heading`.
    #[test]
    fn test_empty_anchor_falls_back_to_heading() {
        let doc = Document::parse("# 日本語\n## 日本語");
        assert_eq!(doc.headings[0].id, "heading");
        assert_eq!(doc.headings[1].id, "heading-2");
        assert!(doc.html.contains(r#"id="heading""#));
        assert!(doc.html.contains(r#"id="heading-2""#));
    }

    /// Brackets and backslashes in heading text are escaped in the TOC.
    #[test]
    fn test_toc_escapes_markdown_link_text() {
        let doc = Document::parse("# [概要]\\確認");
        assert!(doc.toc.contains(r"- [\[概要\]\\確認]"));
    }

    /// Kana are romanized, the long-vowel mark is dropped and spaces become `-`.
    #[test]
    fn test_japanese_anchor_id() {
        let id = generate_anchor_id("はじめに");
        assert_eq!(id, "hajimeni");
        let id = generate_anchor_id("インストール方法");
        assert_eq!(id, "insutoruhouhou");
        let id = generate_anchor_id("インストール 方法");
        assert_eq!(id, "insutoru-houhou");
    }

    /// The ids in the rendered HTML and in the TOC agree with each other.
    #[test]
    fn test_japanese_anchor_id_matches_html() {
        let doc = Document::parse("# はじめに\n\n## インストール方法");
        assert!(doc.html.contains(r##"href="#hajimeni""##));
        assert!(doc.html.contains(r#"id="hajimeni""#));
        assert!(doc.html.contains(r##"href="#insutoruhouhou""##));
        assert!(doc.html.contains(r#"id="insutoruhouhou""#));
        assert!(doc.toc.contains("- [はじめに](#hajimeni)"));
        assert!(doc.toc.contains(" - [インストール方法](#insutoruhouhou)"));
    }

    /// Sub-headings are indented below their parent in the TOC.
    #[test]
    fn test_toc_generation() {
        let doc = Document::parse("# First\n## Second\n### Third");
        assert!(doc.toc.contains("- [First](#first)"));
        assert!(doc.toc.contains(" - [Second](#second)"));
    }

    /// Japanese text is counted per character, 400 characters per unit.
    #[test]
    fn test_reading_time() {
        let text = "あ".repeat(400);
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 1);
        let text = "あ".repeat(800);
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 2);
        let text = "あ".repeat(401);
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 2);
    }

    /// English text is counted per word, 200 words per unit.
    #[test]
    fn test_english_reading_time_counts_words() {
        let text = (0..200).map(|_| "word").collect::<Vec<_>>().join(" ");
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 1);
        let text = (0..400).map(|_| "word").collect::<Vec<_>>().join(" ");
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 2);
        let text = (0..201).map(|_| "word").collect::<Vec<_>>().join(" ");
        let doc = Document::parse(&text);
        assert_eq!(doc.reading_time, 2);
    }

    /// The table extension is enabled.
    #[test]
    fn test_gfm_table() {
        let markdown = r#"
| Header1 | Header2 |
|---------|---------|
| Cell1 | Cell2 |
"#;
        let doc = Document::parse(markdown);
        assert!(doc.html.contains("<table"));
    }

    /// The strikethrough extension is enabled.
    #[test]
    fn test_gfm_strikethrough() {
        let doc = Document::parse("~~strikethrough~~");
        // Checking for the word alone would pass even without the extension,
        // because the word is part of the input; require the `<del>` element.
        assert!(doc.html.contains("<del>strikethrough</del>"));
    }

    /// The task list extension is enabled.
    #[test]
    fn test_gfm_tasklist() {
        let markdown = r#"
- [x] Done
- [ ] Todo
"#;
        let doc = Document::parse(markdown);
        assert!(doc.html.contains("checkbox") || doc.html.contains("checked"));
    }
}