use crate::core::{
error::{ProcessingError, Result},
traits::Processor,
};
use pulldown_cmark::{
html, Event, HeadingLevel, Options as MarkdownOptions, Parser, Tag,
};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use serde_yml::from_str;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
/// Upper bound, in bytes, on the size of a document accepted for processing (10 MiB).
const MAX_CONTENT_SIZE: usize = 10 << 20;
/// Lower-case names of the HTML elements that survive sanitization.
const ALLOWED_HTML_TAGS: &[&str] = &[
    // Paragraphs, line breaks and headings.
    "p", "br", "h1", "h2", "h3", "h4", "h5", "h6",
    // Inline emphasis.
    "strong", "em", "del",
    // Lists.
    "ul", "ol", "li",
    // Code, quotations and rules.
    "code", "pre", "blockquote", "hr",
    // Tables.
    "table", "thead", "tbody", "tr", "th", "td",
    // Media, links and the table-of-contents wrapper.
    "img", "a", "nav",
];
/// Settings that control how a document is rendered.
///
/// Every field carries a serde default, so a partial (or empty) object
/// deserializes successfully and the missing fields take the values noted
/// below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessorConfig {
/// Whether the rendered HTML is passed through the tag allow-list
/// sanitizer. Defaults to `true`.
#[serde(default = "default_true")]
pub sanitize: bool,
/// Whether a table of contents is generated and placed before the
/// rendered document. Defaults to `false`.
#[serde(default)]
pub toc: bool,
/// Deepest heading level (1-6) that is listed in the table of contents.
/// Defaults to `3`.
#[serde(default = "default_toc_level")]
pub toc_max_level: u8,
/// Defaults to `true`.
/// NOTE(review): no code visible in this file reads this flag; confirm
/// where (or whether) it takes effect.
#[serde(default = "default_true")]
pub auto_links: bool,
/// Free-form extra settings keyed by name. Defaults to an empty map.
/// NOTE(review): not read by any code visible in this file.
#[serde(default)]
pub options: HashMap<String, JsonValue>,
}
/// Programmatic defaults; the values mirror the serde field defaults.
impl Default for ProcessorConfig {
    /// Sanitizing and auto-links enabled, no table of contents, a maximum
    /// table-of-contents depth of 3 and no extra options.
    fn default() -> Self {
        let options = HashMap::new();
        Self {
            options,
            auto_links: true,
            toc_max_level: 3,
            toc: false,
            sanitize: true,
        }
    }
}
/// Metadata collected from a document's front matter and headings.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ContentMetadata {
/// Document title: the front-matter `title` key or, failing that, the
/// text of a line starting with `# `.
pub title: Option<String>,
/// Value of the front-matter `description` key, when it is a non-empty string.
pub description: Option<String>,
/// Value of the front-matter `date` key, kept as the trimmed string it was written as.
pub date: Option<String>,
/// String entries of the front-matter `tags` list, trimmed, with empty entries removed.
pub tags: Vec<String>,
/// Every front-matter key other than `title`, `description`, `date` and `tags`.
pub custom: HashMap<String, JsonValue>,
}
/// One heading collected for the table of contents.
#[derive(Debug)]
struct TocEntry {
/// Text of the heading, used as both the link text and its `aria-label`.
text: String,
/// Heading level, 1 for `h1` through 6 for `h6`.
level: u8,
/// Anchor id the link points at (the part after `#` in the `href`).
id: String,
}
/// Markdown-to-HTML processor with optional table of contents and HTML
/// sanitizing.
#[derive(Debug, Clone)]
pub struct MarkdownProcessor {
// Parser extensions passed to every `Parser::new_ext` call.
options: MarkdownOptions,
// Configuration installed through `with_config`.
config: ProcessorConfig,
// Tag names kept by the sanitizer; behind an `Arc` so that cloning the
// processor shares the set instead of copying it.
allowed_tags: Arc<HashSet<String>>,
}
impl MarkdownProcessor {
pub fn new() -> Self {
let allowed_tags = ALLOWED_HTML_TAGS
.iter()
.map(|&tag| tag.to_string())
.collect();
Self {
options: MarkdownOptions::empty(),
config: ProcessorConfig::default(),
allowed_tags: Arc::new(allowed_tags),
}
}
/// Enables (`true`) or disables (`false`) the table extension and returns
/// the updated processor.
pub fn with_tables(mut self, enable: bool) -> Self {
    self.options.set(MarkdownOptions::ENABLE_TABLES, enable);
    self
}
/// Enables (`true`) or disables (`false`) the strikethrough extension and
/// returns the updated processor.
pub fn with_strikethrough(mut self, enable: bool) -> Self {
    self.options
        .set(MarkdownOptions::ENABLE_STRIKETHROUGH, enable);
    self
}
/// Enables (`true`) or disables (`false`) the footnote extension and
/// returns the updated processor.
pub fn with_footnotes(mut self, enable: bool) -> Self {
    self.options.set(MarkdownOptions::ENABLE_FOOTNOTES, enable);
    self
}
pub fn with_config(mut self, config: ProcessorConfig) -> Self {
self.config = config;
self
}
/// Extracts [`ContentMetadata`] from a document.
///
/// When the first line is exactly `---`, the lines up to the next `---`
/// line are parsed as YAML front matter (both `\n` and `\r\n` line endings
/// are accepted). Front matter that does not parse as a string-keyed map is
/// ignored rather than reported, so a document that merely starts with a
/// thematic break is still processed.
///
/// If the front matter provides no title, the first non-empty `# ` line of
/// the document body is used instead. Lines inside a properly closed front
/// matter block are not considered, so a YAML comment is never mistaken
/// for a heading.
///
/// # Errors
///
/// Propagates any error returned by `process_metadata`.
fn extract_metadata(
    &self,
    content: &str,
) -> Result<ContentMetadata> {
    let mut metadata = ContentMetadata::default();
    let mut lines = content.lines();
    // Set once the closing `---` was seen; only then does `lines` point at
    // the start of the document body.
    let mut front_matter_closed = false;

    if lines.next() == Some("---") {
        let mut frontmatter = String::with_capacity(1024);
        for line in lines.by_ref() {
            if line == "---" {
                front_matter_closed = true;
                break;
            }
            frontmatter.push_str(line);
            frontmatter.push('\n');
        }
        // Best effort: unparsable front matter is skipped on purpose.
        if let Ok(yaml) =
            from_str::<HashMap<String, JsonValue>>(&frontmatter)
        {
            Self::process_metadata(&mut metadata, yaml)?;
        }
    }

    if metadata.title.is_none() {
        // Without a closed front matter block the whole document is body.
        let body_lines = if front_matter_closed {
            lines
        } else {
            content.lines()
        };
        metadata.title = body_lines
            .filter_map(|line| line.strip_prefix("# "))
            .map(str::trim)
            .find(|title| !title.is_empty())
            .map(str::to_string);
    }
    Ok(metadata)
}
/// Copies parsed front-matter values into `metadata`.
///
/// `title`, `description` and `date` are taken only when the value is a
/// string that is non-empty after trimming; any other value resets the
/// field to `None`. `tags` keeps the non-empty string entries of a list and
/// is left untouched when the value is not a list. Every other key is
/// stored unchanged in `metadata.custom`.
fn process_metadata(
    metadata: &mut ContentMetadata,
    yaml: HashMap<String, JsonValue>,
) -> Result<()> {
    // Trimmed string content of a value, or `None` for non-strings and
    // strings that are blank.
    let clean_text = |value: &JsonValue| -> Option<String> {
        let trimmed = value.as_str()?.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    };

    for (key, value) in yaml {
        match key.as_str() {
            "title" => metadata.title = clean_text(&value),
            "description" => metadata.description = clean_text(&value),
            "date" => metadata.date = clean_text(&value),
            "tags" => {
                if let Some(items) = value.as_array() {
                    metadata.tags = items
                        .iter()
                        .filter_map(|item| clean_text(item))
                        .collect();
                }
            }
            _ => {
                let _ = metadata.custom.insert(key, value);
            }
        }
    }
    Ok(())
}
fn generate_toc(&self, content: &str) -> Result<String> {
let mut toc = String::from(
"<nav class=\"toc\" aria-label=\"Table of Contents\">\n<ul>\n",
);
let mut entries = Vec::new();
let parser = Parser::new_ext(content, self.options);
let mut current_text = String::new();
let mut current_level = None;
for event in parser {
match event {
Event::Start(Tag::Heading { level, .. }) => {
current_text.clear();
current_level = Some(level);
}
Event::Text(text) => {
current_text.push_str(&text);
}
Event::End(_) => {
if let Some(level) = current_level.take() {
let level_num = match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
};
if level_num <= self.config.toc_max_level {
let id =
self.generate_heading_id(¤t_text);
entries.push(TocEntry {
text: current_text.clone(),
level: level_num,
id,
});
}
}
}
_ => {}
}
}
self.build_toc_html(&mut toc, &entries)?;
toc.push_str("</ul>\n</nav>");
Ok(toc)
}
/// Derives an anchor id from heading text.
///
/// The text is lower-cased; ASCII letters and digits are kept, each space,
/// hyphen or underscore becomes a hyphen, and every other character is
/// dropped. Non-ASCII letters are therefore removed, and a heading made up
/// only of such characters yields an empty id.
fn generate_heading_id(&self, text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut slug = String::new();
    for ch in lowered.chars() {
        if ch.is_ascii_lowercase() || ch.is_ascii_digit() {
            slug.push(ch);
        } else if matches!(ch, ' ' | '-' | '_') {
            slug.push('-');
        }
    }
    slug
}
/// Appends one `<li>` link per entry to `toc`, opening and closing nested
/// `<ul>` elements as the heading level rises and falls.
///
/// Nesting starts at level 1; the list that holds level-1 entries is
/// expected to be opened and closed by the caller. Heading text is
/// HTML-escaped before it is written, because it ends up both inside the
/// `aria-label` attribute and as the link text.
///
/// NOTE(review): nested `<ul>` elements are emitted as direct children of
/// the enclosing `<ul>` rather than inside an `<li>`; kept as is so the
/// output structure does not change.
fn build_toc_html(
    &self,
    toc: &mut String,
    entries: &[TocEntry],
) -> Result<()> {
    // Escapes the characters that are significant in HTML text and in
    // quoted attribute values.
    fn escape_html(text: &str) -> String {
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                '\'' => escaped.push_str("&#39;"),
                _ => escaped.push(ch),
            }
        }
        escaped
    }

    let mut current_level = 1u8;
    for entry in entries {
        while entry.level > current_level {
            toc.push_str("<ul>\n");
            current_level += 1;
        }
        while entry.level < current_level {
            toc.push_str("</ul>\n");
            current_level -= 1;
        }
        let label = escape_html(&entry.text);
        toc.push_str(&format!(
            "<li><a href=\"#{}\" aria-label=\"{}\">{}</a></li>\n",
            entry.id, label, label
        ));
    }
    // Close whatever nested lists are still open.
    while current_level > 1 {
        toc.push_str("</ul>\n");
        current_level -= 1;
    }
    Ok(())
}
fn sanitize_html(&self, html: &str) -> Result<String> {
let mut output = String::with_capacity(html.len());
let mut in_tag = false;
let mut current_tag = String::new();
for c in html.chars() {
match c {
'<' => {
in_tag = true;
current_tag.clear();
}
'>' if in_tag => {
in_tag = false;
let tag_name = current_tag
.split_whitespace()
.next()
.unwrap_or("")
.trim_start_matches('/')
.to_lowercase();
if self.allowed_tags.contains(&tag_name) {
output.push('<');
output.push_str(¤t_tag);
output.push('>');
}
}
_ if in_tag => {
current_tag.push(c);
}
_ => {
if !in_tag {
output.push(c);
}
}
}
}
Ok(output)
}
/// Checks that `content` is acceptable for processing.
///
/// # Errors
///
/// Returns `ProcessingError::ContentProcessing` when the content
///
/// * is larger than `MAX_CONTENT_SIZE` bytes,
/// * is empty or consists only of whitespace, or
/// * contains, in any letter case, one of the substrings `javascript:`,
///   `data:`, `vbscript:`, `onclick`, `onerror`, `onload` or `eval(`.
///
/// The substring check is deliberately coarse: it also rejects harmless
/// text that merely contains one of the patterns.
fn validate(&self, content: &str) -> Result<()> {
    const SUSPICIOUS_PATTERNS: [&str; 7] = [
        "javascript:",
        "data:",
        "vbscript:",
        "onclick",
        "onerror",
        "onload",
        "eval(",
    ];

    if content.len() > MAX_CONTENT_SIZE {
        return Err(ProcessingError::ContentProcessing {
            details: format!(
                "Content exceeds maximum size of {} bytes",
                MAX_CONTENT_SIZE
            ),
            source: None,
        });
    }
    if content.trim().is_empty() {
        return Err(ProcessingError::ContentProcessing {
            details: "Content cannot be empty".to_string(),
            source: None,
        });
    }

    // Lower-case once; the content may be several megabytes long.
    let lowered = content.to_lowercase();
    if let Some(pattern) = SUSPICIOUS_PATTERNS
        .iter()
        .copied()
        .find(|&pattern| lowered.contains(pattern))
    {
        return Err(ProcessingError::ContentProcessing {
            details: format!(
                "Suspicious content pattern detected: {}",
                pattern
            ),
            source: None,
        });
    }
    Ok(())
}
}
impl Default for MarkdownProcessor {
fn default() -> Self {
Self::new()
}
}
impl Processor for MarkdownProcessor {
    type Input = String;
    type Output = String;
    type Context = JsonValue;

    /// Renders a Markdown document to HTML.
    ///
    /// Steps, in order:
    ///
    /// 1. `content` is validated and its metadata extracted.
    /// 2. The effective configuration is read from `context`; when no
    ///    context is given, or it does not deserialize into a
    ///    [`ProcessorConfig`], the processor's own configuration is used.
    /// 3. The document body (everything after a leading `---` ... `---`
    ///    front-matter block) is rendered; the front matter itself is not
    ///    part of the output.
    /// 4. With `toc` enabled, the table of contents is placed before the
    ///    rendered body.
    /// 5. With `sanitize` enabled, the combined HTML is sanitized.
    /// 6. If the front matter had custom keys, they are appended as a
    ///    `<script type="application/ld+json">` element. `<` is written as
    ///    `\u003c` inside it so the data cannot terminate the element.
    ///
    /// NOTE(review): the table-of-contents depth comes from the processor's
    /// own configuration, not from `context`. Nothing in this file adds
    /// `id` attributes to the rendered headings, so the table-of-contents
    /// anchors may have no targets; confirm.
    ///
    /// # Errors
    ///
    /// Returns an error when validation fails, when a processing step
    /// fails, or when the custom metadata cannot be serialized.
    fn process(
        &self,
        content: String,
        context: Option<&Self::Context>,
    ) -> Result<Self::Output> {
        // Returns `content` without a leading front-matter block; content
        // without a closed block is returned unchanged.
        fn strip_front_matter(content: &str) -> &str {
            let rest = match content
                .strip_prefix("---\n")
                .or_else(|| content.strip_prefix("---\r\n"))
            {
                Some(rest) => rest,
                None => return content,
            };
            let mut consumed = 0;
            for line in rest.split_inclusive('\n') {
                consumed += line.len();
                let text =
                    line.trim_end_matches(|ch: char| ch == '\r' || ch == '\n');
                if text == "---" {
                    return &rest[consumed..];
                }
            }
            content
        }

        self.validate(&content)?;
        let metadata = self.extract_metadata(&content)?;
        let config: ProcessorConfig = context
            .and_then(|ctx| serde_json::from_value(ctx.clone()).ok())
            .unwrap_or_else(|| self.config.clone());

        let body = strip_front_matter(&content);
        let parser = Parser::new_ext(body, self.options);
        let mut html_output = String::with_capacity(body.len() * 2);
        html::push_html(&mut html_output, parser);

        if config.toc {
            let toc = self.generate_toc(body)?;
            html_output = format!("{}\n{}", toc, html_output);
        }

        let processed = if config.sanitize {
            self.sanitize_html(&html_output)?
        } else {
            html_output
        };

        if metadata.custom.is_empty() {
            return Ok(processed);
        }

        let json_ld = serde_json::to_string(&metadata.custom)
            .map_err(|e| ProcessingError::ContentProcessing {
                details: "Failed to serialize metadata".to_string(),
                source: Some(Box::new(e)),
            })?
            // `<` only occurs inside JSON strings, where `\u003c` is an
            // equivalent spelling that cannot form `</script>`.
            .replace('<', "\\u003c");
        Ok(format!(
            "{}\n<script type=\"application/ld+json\">{}</script>",
            processed, json_ld
        ))
    }
}
/// Serde default for boolean settings that are enabled unless stated
/// otherwise.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for the deepest heading level listed in the table of
/// contents.
fn default_toc_level() -> u8 {
    const DEFAULT_LEVEL: u8 = 3;
    DEFAULT_LEVEL
}
// Unit tests for the Markdown processor: rendering, metadata extraction,
// table of contents, sanitizing and validation.
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
// A heading and bold text render to `<h1>` and `<strong>`.
#[test]
fn test_markdown_processor_basic() {
let processor = MarkdownProcessor::new();
let input = "# Test\n\nThis is a **test**.";
let result = processor.process(input.to_owned(), None).unwrap();
assert!(result.contains("<h1>"));
assert!(result.contains("<strong>"));
}
// With the table and strikethrough extensions enabled, both constructs
// are rendered.
#[test]
fn test_markdown_processor_with_options() {
let processor = MarkdownProcessor::new()
.with_tables(true)
.with_strikethrough(true);
let input =
"# Test\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\n~~strike~~";
let result = processor.process(input.to_owned(), None).unwrap();
assert!(result.contains("<table>"));
assert!(result.contains("<del>"));
}
// Known front-matter keys land in their dedicated fields; unknown keys
// are collected in `custom`.
#[test]
fn test_metadata_extraction() {
let processor = MarkdownProcessor::new();
let input = r#"---
title: Test Post
description: A test post
date: 2024-01-01
tags:
- test
- example
custom_field: value
---
# Content"#;
let metadata = processor.extract_metadata(input).unwrap();
assert_eq!(metadata.title, Some("Test Post".to_string()));
assert_eq!(
metadata.description,
Some("A test post".to_string())
);
assert_eq!(metadata.date, Some("2024-01-01".to_string()));
assert_eq!(metadata.tags, vec!["test", "example"]);
assert!(metadata.custom.contains_key("custom_field"));
}
// With `toc: true` in the context, the output contains the
// table-of-contents `<nav>` wrapper and a list.
#[test]
fn test_toc_generation() {
let processor = MarkdownProcessor::new();
let input = "# H1\n\n## H2\n\n### H3";
let context = json!({
"toc": true
});
let result = processor
.process(input.to_owned(), Some(&context))
.unwrap();
println!("Result: {}", result);
assert!(result.contains(r#"<nav class="toc""#));
assert!(result.contains("<ul>"));
}
// With sanitizing enabled, a raw `<script>` tag does not reach the output.
#[test]
fn test_sanitization() {
let processor = MarkdownProcessor::new();
let input = "# Test\n\n<script>alert('xss')</script>";
let context = json!({
"sanitize": true
});
let result = processor
.process(input.to_owned(), Some(&context))
.unwrap();
assert!(!result.contains("<script>"));
}
// Empty, oversized and suspicious content is rejected; ordinary Markdown
// is accepted.
#[test]
fn test_validation() {
let processor = MarkdownProcessor::new();
assert!(processor.validate("").is_err());
let large_content = "a".repeat(MAX_CONTENT_SIZE + 1);
assert!(processor.validate(&large_content).is_err());
assert!(processor.validate("javascript:alert(1)").is_err());
assert!(processor.validate("onclick='alert(1)'").is_err());
assert!(processor.validate("# Valid content").is_ok());
}
// Heading ids are lower-cased, spaces become hyphens and punctuation is
// dropped.
#[test]
fn test_heading_id_generation() {
let processor = MarkdownProcessor::new();
let id = processor.generate_heading_id("Hello World! 123");
assert_eq!(id, "hello-world-123");
}
// A front-matter key named `custom` is itself stored in the `custom` map.
#[test]
fn test_custom_metadata() {
let processor = MarkdownProcessor::new();
let input = r#"---
title: Test
custom:
key1: value1
key2: 42
---
# Content"#;
let metadata = processor.extract_metadata(input).unwrap();
assert!(metadata.custom.contains_key("custom"));
}
// Allowed tags (`p`, `img`) are kept; `script` and `iframe` tags are
// removed.
#[test]
fn test_sanitization_with_allowed_tags() {
let processor = MarkdownProcessor::new();
let input = r#"
<p>Valid paragraph</p>
<script>alert('bad')</script>
<img src="valid.jpg" alt="valid">
<iframe src="bad.html"></iframe>
"#;
let result = processor.sanitize_html(input).unwrap();
assert!(result.contains("<p>"));
assert!(result.contains("<img"));
assert!(!result.contains("<script>"));
assert!(!result.contains("<iframe>"));
}
}