use crate::error::{Result, WeChatError};
use crate::utils;
use comrak::{Arena, ComrakOptions, nodes::NodeValue};
use std::borrow::Cow;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
#[derive(Debug, Clone, PartialEq)]
pub struct ImageRef {
pub alt_text: String,
pub original_url: String,
pub position: (usize, usize),
pub is_local: bool,
}
impl ImageRef {
pub fn new(alt_text: String, url: String, position: (usize, usize)) -> Self {
let is_local = !url.starts_with("http://") && !url.starts_with("https://");
Self {
alt_text,
original_url: url,
position,
is_local,
}
}
pub fn resolve_path(&self, base_path: &Path) -> Result<PathBuf> {
if self.is_local {
utils::resolve_path(base_path, &self.original_url).map_err(|e| {
WeChatError::config_error(format!(
"Invalid image path '{}': {}",
self.original_url, e
))
})
} else {
if self.original_url.starts_with("http://") || self.original_url.starts_with("https://")
{
Ok(PathBuf::from(&self.original_url))
} else {
Err(WeChatError::config_error(format!(
"Invalid URL format: {}",
self.original_url
)))
}
}
}
}
#[derive(Debug, Clone)]
pub struct MarkdownContent {
pub title: Option<String>,
pub author: Option<String>,
pub description: Option<String>,
pub cover: Option<String>,
pub theme: Option<String>,
pub code: Option<String>,
pub content: String,
pub images: Vec<ImageRef>,
pub metadata: HashMap<String, String>,
pub original_text: String,
}
struct SummaryExtractor {
summary: String,
text_length: usize,
max_length: usize,
found_paragraph: bool,
}
impl SummaryExtractor {
fn new(max_length: usize) -> Self {
Self {
summary: String::new(),
text_length: 0,
max_length,
found_paragraph: false,
}
}
fn extract_from_node<'a>(&mut self, node: &'a comrak::nodes::AstNode<'a>) -> bool {
match &node.data.borrow().value {
NodeValue::Paragraph => {
if self.found_paragraph {
return true;
}
self.found_paragraph = true;
for child in node.children() {
if self.extract_from_node(child) {
break;
}
if self.text_length >= self.max_length {
break;
}
}
if !self.summary.is_empty() {
return true; }
self.found_paragraph = false;
}
NodeValue::Text(text) => {
if self.found_paragraph && self.text_length < self.max_length {
let remaining = self.max_length - self.text_length;
if text.len() <= remaining {
self.summary.push_str(text);
self.text_length += text.len();
} else {
let mut boundary = remaining;
while boundary > 0 && !text.is_char_boundary(boundary) {
boundary -= 1;
}
if boundary > 0 {
self.summary.push_str(&text[..boundary]);
self.summary.push_str("...");
} else {
self.summary.push_str("...");
}
self.text_length = self.max_length;
}
}
}
_ => {
for child in node.children() {
if self.extract_from_node(child) {
return true;
}
if self.text_length >= self.max_length {
break;
}
}
}
}
false
}
}
struct PlainTextExtractor {
text: String,
}
impl PlainTextExtractor {
fn new() -> Self {
Self {
text: String::new(),
}
}
fn extract_from_node<'a>(&mut self, node: &'a comrak::nodes::AstNode<'a>) {
match &node.data.borrow().value {
NodeValue::Text(content) => {
self.text.push_str(content);
self.text.push(' ');
}
_ => {
for child in node.children() {
self.extract_from_node(child);
}
}
}
}
}
impl MarkdownContent {
pub fn replace_image_urls(&mut self, url_mapping: &HashMap<String, String>) -> Result<()> {
let mut content: Cow<str> = Cow::Borrowed(&self.content);
let mut sorted_images = self.images.clone();
sorted_images.sort_by(|a, b| b.position.0.cmp(&a.position.0));
for image in &sorted_images {
if let Some(new_url) = url_mapping.get(&image.original_url) {
let old_markdown = format!("", image.alt_text, image.original_url);
let new_markdown = format!("", image.alt_text, new_url);
match content {
Cow::Borrowed(s) => {
if s.contains(&old_markdown) {
content = Cow::Owned(s.replace(&old_markdown, &new_markdown));
}
}
Cow::Owned(ref mut s) => {
*s = s.replace(&old_markdown, &new_markdown);
}
}
}
}
if let Cow::Owned(new_content) = content {
self.content = new_content;
}
Ok(())
}
pub fn get_summary(&self, max_length: usize) -> String {
let arena = Arena::new();
let options = ComrakOptions::default();
let root = comrak::parse_document(&arena, &self.content, &options);
let mut extractor = SummaryExtractor::new(max_length);
extractor.extract_from_node(root);
if extractor.summary.is_empty() {
let content_text = self.extract_plain_text();
if content_text.len() > max_length {
format!("{}...", &content_text[..max_length])
} else {
content_text
}
} else {
extractor.summary
}
}
pub fn extract_plain_text(&self) -> String {
let arena = Arena::new();
let options = ComrakOptions::default();
let root = comrak::parse_document(&arena, &self.content, &options);
let mut extractor = PlainTextExtractor::new();
extractor.extract_from_node(root);
extractor.text.trim().to_string()
}
}
struct TitleExtractor;
impl TitleExtractor {
fn find_h1_title<'a>(node: &'a comrak::nodes::AstNode<'a>) -> Option<String> {
match &node.data.borrow().value {
NodeValue::Heading(heading) if heading.level == 1 => {
let mut title = String::new();
Self::collect_heading_text(node, &mut title);
if !title.trim().is_empty() {
Some(title.trim().to_string())
} else {
None
}
}
_ => {
for child in node.children() {
if let Some(title) = Self::find_h1_title(child) {
return Some(title);
}
}
None
}
}
}
fn collect_heading_text<'a>(node: &'a comrak::nodes::AstNode<'a>, title: &mut String) {
match &node.data.borrow().value {
NodeValue::Text(text) => title.push_str(text),
_ => {
for child in node.children() {
Self::collect_heading_text(child, title);
}
}
}
}
}
struct ImageExtractor {
images: Vec<ImageRef>,
source_content: String,
}
impl ImageExtractor {
fn new(source_content: &str) -> Self {
Self {
images: Vec::new(),
source_content: source_content.to_string(),
}
}
fn extract_from_node<'a>(&mut self, node: &'a comrak::nodes::AstNode<'a>) {
match &node.data.borrow().value {
NodeValue::Image(link) => {
let mut alt_text = String::new();
Self::collect_alt_text(node, &mut alt_text);
let url = link.url.clone();
let position = if let Some(start) =
self.source_content.find(&format!(""))
{
let end = start + format!("").len();
(start, end)
} else {
(0, 0) };
let image_ref = ImageRef::new(alt_text, url, position);
self.images.push(image_ref);
}
_ => {
for child in node.children() {
self.extract_from_node(child);
}
}
}
}
fn collect_alt_text<'a>(node: &'a comrak::nodes::AstNode<'a>, alt: &mut String) {
match &node.data.borrow().value {
NodeValue::Text(text) => alt.push_str(text),
NodeValue::Code(code) => alt.push_str(&code.literal),
_ => {
for child in node.children() {
Self::collect_alt_text(child, alt);
}
}
}
}
}
#[derive(Debug)]
pub struct MarkdownParser {
options: ComrakOptions<'static>,
}
impl MarkdownParser {
pub fn new() -> Self {
let mut options = ComrakOptions::<'static>::default();
options.extension.strikethrough = true;
options.extension.table = true;
options.extension.footnotes = true;
options.extension.tasklist = true;
options.parse.smart = true;
Self { options }
}
pub fn parse(&self, markdown: &str) -> Result<MarkdownContent> {
let (metadata, content_without_frontmatter) = self.extract_frontmatter(markdown)?;
let title = self.extract_title(&content_without_frontmatter, &metadata);
let author = metadata.get("author").cloned();
let description = metadata.get("description").cloned();
let cover = metadata.get("cover").cloned();
let theme = metadata.get("theme").cloned();
let code = metadata.get("code").cloned();
let images = self.extract_images(&content_without_frontmatter)?;
Ok(MarkdownContent {
title,
author,
description,
cover,
theme,
code,
content: content_without_frontmatter,
images,
metadata,
original_text: markdown.to_string(),
})
}
pub async fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<MarkdownContent> {
let content = tokio::fs::read_to_string(&path).await.map_err(|e| {
WeChatError::file_error(
path.as_ref().display().to_string(),
format!("Failed to read file: {e}"),
)
})?;
self.parse(&content)
}
fn extract_frontmatter(&self, markdown: &str) -> Result<(HashMap<String, String>, String)> {
let mut metadata = HashMap::new();
let content = if let Some(stripped) = markdown.strip_prefix("---\n") {
if let Some(end_pos) = stripped.find("\n---\n") {
let frontmatter = &stripped[..end_pos];
let content = &stripped[end_pos + 5..];
for line in frontmatter.lines() {
if let Some((key, value)) = line.split_once(':') {
let key = key.trim().to_string();
let value = value.trim().trim_matches('"').to_string();
metadata.insert(key, value);
}
}
content.to_string()
} else {
markdown.to_string()
}
} else {
markdown.to_string()
};
Ok((metadata, content))
}
fn extract_title(&self, content: &str, metadata: &HashMap<String, String>) -> Option<String> {
if let Some(title) = metadata.get("title") {
return Some(title.clone());
}
let arena = Arena::new();
let root = comrak::parse_document(&arena, content, &self.options);
TitleExtractor::find_h1_title(root)
}
fn extract_images(&self, content: &str) -> Result<Vec<ImageRef>> {
let arena = Arena::new();
let root = comrak::parse_document(&arena, content, &self.options);
let mut extractor = ImageExtractor::new(content);
extractor.extract_from_node(root);
Ok(extractor.images)
}
}
impl Default for MarkdownParser {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::NamedTempFile;
#[test]
fn test_image_ref_creation() {
let img = ImageRef::new("Alt text".to_string(), "image.jpg".to_string(), (10, 20));
assert_eq!(img.alt_text, "Alt text");
assert_eq!(img.original_url, "image.jpg");
assert_eq!(img.position, (10, 20));
assert!(img.is_local);
let remote_img = ImageRef::new(
"Remote".to_string(),
"https://example.com/image.jpg".to_string(),
(0, 10),
);
assert!(!remote_img.is_local);
}
#[test]
fn test_frontmatter_extraction() {
let parser = MarkdownParser::new();
let markdown = r#"---
title: Test Article
author: John Doe
date: 2024-01-01
---
# Content
This is the content."#;
let (metadata, content) = parser.extract_frontmatter(markdown).unwrap();
assert_eq!(metadata.get("title"), Some(&"Test Article".to_string()));
assert_eq!(metadata.get("author"), Some(&"John Doe".to_string()));
assert_eq!(metadata.get("date"), Some(&"2024-01-01".to_string()));
assert!(content.contains("# Content"));
}
#[test]
fn test_title_extraction() {
let parser = MarkdownParser::new();
let mut metadata = HashMap::new();
metadata.insert("title".to_string(), "Metadata Title".to_string());
let title = parser.extract_title("# Heading Title", &metadata);
assert_eq!(title, Some("Metadata Title".to_string()));
let empty_metadata = HashMap::new();
let title = parser.extract_title("# Heading Title\n\nContent", &empty_metadata);
assert_eq!(title, Some("Heading Title".to_string()));
let title = parser.extract_title("Just content", &empty_metadata);
assert_eq!(title, None);
}
#[test]
fn test_image_extraction() {
let parser = MarkdownParser::new();
let markdown = r#"# Test
Here's a local image: 
And a remote image: 
More content here."#;
let images = parser.extract_images(markdown).unwrap();
assert_eq!(images.len(), 2);
let local_img = &images[0];
assert_eq!(local_img.alt_text, "Alt text");
assert_eq!(local_img.original_url, "./images/local.jpg");
assert!(local_img.is_local);
let remote_img = &images[1];
assert_eq!(remote_img.alt_text, "Remote");
assert_eq!(remote_img.original_url, "https://example.com/remote.png");
assert!(!remote_img.is_local);
}
#[tokio::test]
async fn test_markdown_parsing() {
let parser = MarkdownParser::new();
let markdown = r#"---
title: Test Article
author: Jane Doe
cover: images/cover.jpg
---
# Main Heading
This is a test article with an image: 
More content here."#;
let content = parser.parse(markdown).unwrap();
assert_eq!(content.title, Some("Test Article".to_string()));
assert_eq!(content.author, Some("Jane Doe".to_string()));
assert_eq!(content.description, None);
assert_eq!(content.cover, Some("images/cover.jpg".to_string()));
assert_eq!(content.theme, None);
assert_eq!(content.code, None);
assert_eq!(content.images.len(), 1);
assert_eq!(content.images[0].alt_text, "Test");
assert_eq!(content.images[0].original_url, "./test.jpg");
}
#[test]
fn test_url_replacement() {
let parser = MarkdownParser::new();
let markdown = " and ";
let mut content = parser.parse(markdown).unwrap();
let mut url_mapping = HashMap::new();
url_mapping.insert(
"./local.jpg".to_string(),
"https://wechat.com/123".to_string(),
);
content.replace_image_urls(&url_mapping).unwrap();
assert!(content.content.contains("https://wechat.com/123"));
assert!(content.content.contains("https://example.com/remote.png"));
}
#[test]
fn test_summary_extraction() {
let parser = MarkdownParser::new();
let markdown = r#"# Title
This is the first paragraph with some content.
This is the second paragraph.
"#;
let content = parser.parse(markdown).unwrap();
let summary = content.get_summary(100);
assert!(summary.contains("This is the first paragraph"));
assert!(!summary.contains("This is the second paragraph"));
}
#[tokio::test]
async fn test_file_parsing() {
let parser = MarkdownParser::new();
let temp_file = NamedTempFile::new().unwrap();
let markdown_content = r#"# Test File
Content from file with 
"#;
tokio::fs::write(temp_file.path(), markdown_content)
.await
.unwrap();
let content = parser.parse_file(temp_file.path()).await.unwrap();
assert_eq!(content.title, Some("Test File".to_string()));
assert_eq!(content.images.len(), 1);
assert_eq!(content.images[0].original_url, "./test.jpg");
}
#[test]
fn test_cover_extraction_from_frontmatter() {
let parser = MarkdownParser::new();
let markdown_with_cover = r#"---
title: Test Article
cover: images/cover.jpg
---
# Content"#;
let content = parser.parse(markdown_with_cover).unwrap();
assert_eq!(content.cover, Some("images/cover.jpg".to_string()));
let markdown_without_cover = r#"---
title: Test Article
---
# Content"#;
let content = parser.parse(markdown_without_cover).unwrap();
assert_eq!(content.cover, None);
}
#[test]
fn test_theme_extraction_from_frontmatter() {
let parser = MarkdownParser::new();
let markdown_with_theme = r#"---
title: Test Article
theme: lapis
---
# Content"#;
let content = parser.parse(markdown_with_theme).unwrap();
assert_eq!(content.theme, Some("lapis".to_string()));
let markdown_without_theme = r#"---
title: Test Article
---
# Content"#;
let content = parser.parse(markdown_without_theme).unwrap();
assert_eq!(content.theme, None);
}
#[test]
fn test_code_theme_extraction_from_frontmatter() {
let parser = MarkdownParser::new();
let markdown_with_code = r#"---
title: Test Article
code: solarized-light
---
# Content"#;
let content = parser.parse(markdown_with_code).unwrap();
assert_eq!(content.code, Some("solarized-light".to_string()));
let markdown_without_code = r#"---
title: Test Article
---
# Content"#;
let content = parser.parse(markdown_without_code).unwrap();
assert_eq!(content.code, None);
}
#[test]
fn test_description_extraction_from_frontmatter() {
let parser = MarkdownParser::new();
let markdown_with_description = r#"---
title: Test Article
author: Jane Doe
description: This is a custom description for the article
cover: images/cover.jpg
---
# Content"#;
let content = parser.parse(markdown_with_description).unwrap();
assert_eq!(
content.description,
Some("This is a custom description for the article".to_string())
);
let markdown_without_description = r#"---
title: Test Article
author: Jane Doe
cover: images/cover.jpg
---
# Content"#;
let content = parser.parse(markdown_without_description).unwrap();
assert_eq!(content.description, None);
}
#[test]
fn test_description_vs_auto_summary_priority() {
let parser = MarkdownParser::new();
let markdown_with_description = r#"---
title: Test Article
description: Custom description from frontmatter
---
# Test Article
This is the first paragraph of the article content. It should not be used as the summary when a description is provided in the frontmatter.
This is the second paragraph with more content."#;
let content = parser.parse(markdown_with_description).unwrap();
assert_eq!(
content.description,
Some("Custom description from frontmatter".to_string())
);
let auto_summary = content.get_summary(120);
assert!(auto_summary.contains("This is the first paragraph"));
let final_digest = content
.description
.clone()
.unwrap_or_else(|| content.get_summary(120));
assert_eq!(final_digest, "Custom description from frontmatter");
assert_ne!(final_digest, auto_summary);
}
#[test]
fn test_markdown_parsing_with_all_frontmatter() {
let parser = MarkdownParser::new();
let markdown = r#"---
title: Full Example
author: John Doe
description: Complete example with all frontmatter fields
cover: assets/cover-image.png
date: 2024-01-01
---
# Main Content
Article content with an image: 
"#;
let content = parser.parse(markdown).unwrap();
assert_eq!(content.title, Some("Full Example".to_string()));
assert_eq!(content.author, Some("John Doe".to_string()));
assert_eq!(
content.description,
Some("Complete example with all frontmatter fields".to_string())
);
assert_eq!(content.cover, Some("assets/cover-image.png".to_string()));
assert_eq!(content.theme, None);
assert_eq!(content.code, None);
assert_eq!(
content.metadata.get("date"),
Some(&"2024-01-01".to_string())
);
assert_eq!(content.images.len(), 1);
assert_eq!(content.images[0].original_url, "./example.jpg");
}
}