use super::stack_management::check_wasm_size_limit;
#[cfg(not(target_arch = "wasm32"))]
use super::stack_management::{html_requires_large_stack, run_on_dedicated_stack};
use crate::core::config::OutputFormat as KreuzbergOutputFormat;
use crate::error::{KreuzbergError, Result};
use crate::types::HtmlMetadata;
use html_to_markdown_rs::{
ConversionOptions, MetadataConfig, OutputFormat as LibOutputFormat, convert as convert_html, convert_with_metadata,
};
/// Translates the crate-level output format into the format understood by
/// the `html_to_markdown_rs` library.
///
/// Only `Djot` maps to a dedicated library format. `Markdown`, `Plain`,
/// `Html` and `Structured` all resolve to the library's Markdown output.
/// NOTE(review): presumably the non-Markdown variants are post-processed
/// elsewhere — confirm against callers.
fn map_output_format(format: KreuzbergOutputFormat) -> LibOutputFormat {
    // Every variant is listed explicitly (no `_` arm) so that adding a new
    // variant to the crate enum forces a decision here at compile time.
    match format {
        KreuzbergOutputFormat::Djot => LibOutputFormat::Djot,
        KreuzbergOutputFormat::Markdown
        | KreuzbergOutputFormat::Plain
        | KreuzbergOutputFormat::Html
        | KreuzbergOutputFormat::Structured => LibOutputFormat::Markdown,
    }
}
/// Builds the effective [`ConversionOptions`] for a conversion run.
///
/// When `options` is `None`, a default configuration is created with
/// metadata extraction enabled, hOCR spatial tables disabled and
/// preprocessing disabled. When `options` is `Some`, the caller's values
/// are kept as-is.
///
/// In both cases the `output_format` field is overwritten with the library
/// format derived from the `output_format` argument, so any value the caller
/// set on that field is replaced.
pub fn resolve_conversion_options(
    options: Option<ConversionOptions>,
    output_format: KreuzbergOutputFormat,
) -> ConversionOptions {
    let mut resolved = match options {
        Some(provided) => provided,
        None => {
            let preprocessing = super::types::PreprocessingOptions {
                enabled: false,
                ..Default::default()
            };
            ConversionOptions {
                extract_metadata: true,
                hocr_spatial_tables: false,
                preprocessing,
                ..Default::default()
            }
        }
    };

    // The requested output format always wins over whatever the options carried.
    resolved.output_format = map_output_format(output_format);
    resolved
}
/// Runs the library conversion with fully resolved options.
///
/// # Errors
///
/// Any error reported by the library is wrapped in a
/// `KreuzbergError::parsing` carrying the library's message.
fn convert_html_with_options(html: &str, options: ConversionOptions) -> Result<String> {
    match convert_html(html, Some(options)) {
        Ok(converted) => Ok(converted),
        Err(err) => Err(KreuzbergError::parsing(format!(
            "Failed to convert HTML to Markdown: {}",
            err
        ))),
    }
}
/// Converts an HTML document to text in the requested output format.
///
/// `options` defaults to the configuration produced by
/// [`resolve_conversion_options`]; `output_format` defaults to Markdown.
///
/// On non-`wasm32` targets, inputs for which `html_requires_large_stack`
/// returns `true` are converted through `run_on_dedicated_stack`; all other
/// inputs are converted inline on the calling thread.
///
/// # Errors
///
/// Returns the error from `check_wasm_size_limit` if that check rejects the
/// input, or a parsing error if the underlying conversion fails.
pub fn convert_html_to_markdown(
    html: &str,
    options: Option<ConversionOptions>,
    output_format: Option<KreuzbergOutputFormat>,
) -> Result<String> {
    check_wasm_size_limit(html)?;

    let resolved = resolve_conversion_options(
        options,
        output_format.unwrap_or(KreuzbergOutputFormat::Markdown),
    );

    #[cfg(not(target_arch = "wasm32"))]
    {
        if html_requires_large_stack(html.len()) {
            // The closure is `move`, so it receives its own owned copy of the input.
            let owned_html = html.to_string();
            return run_on_dedicated_stack(move || convert_html_with_options(&owned_html, resolved));
        }
    }

    convert_html_with_options(html, resolved)
}
pub fn convert_html_to_markdown_with_metadata(
html: &str,
options: Option<ConversionOptions>,
output_format: Option<KreuzbergOutputFormat>,
) -> Result<(String, Option<HtmlMetadata>)> {
check_wasm_size_limit(html)?;
let format = output_format.unwrap_or(KreuzbergOutputFormat::Markdown);
let options = resolve_conversion_options(options, format);
let metadata_config = MetadataConfig::default();
#[cfg(not(target_arch = "wasm32"))]
if html_requires_large_stack(html.len()) {
let html = html.to_string();
return run_on_dedicated_stack(move || {
convert_with_metadata(&html, Some(options), metadata_config, None)
.map_err(|e| KreuzbergError::parsing(format!("HTML metadata extraction failed: {}", e)))
.map(|(content, extended_metadata)| {
let html_metadata = HtmlMetadata::from(extended_metadata);
(
content,
if html_metadata.is_empty() {
None
} else {
Some(html_metadata)
},
)
})
});
}
let (content, extended_metadata) = convert_with_metadata(html, Some(options), metadata_config, None)
.map_err(|e| KreuzbergError::parsing(format!("HTML metadata extraction failed: {}", e)))?;
let html_metadata = HtmlMetadata::from(extended_metadata);
Ok((
content,
if html_metadata.is_empty() {
None
} else {
Some(html_metadata)
},
))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading and a paragraph survive a default conversion.
    #[test]
    fn test_convert_simple_html() {
        let html = "<h1>Hello World</h1><p>This is a test.</p>";
        let result = convert_html_to_markdown(html, None, None).unwrap();
        assert!(result.contains("# Hello World"));
        assert!(result.contains("This is a test."));
    }

    /// Caller-supplied options (ATX headings) are honoured.
    #[test]
    fn test_html_config_heading_style() {
        let html = "<h1>Heading</h1>";
        let options = ConversionOptions {
            heading_style: super::super::types::HeadingStyle::Atx,
            ..Default::default()
        };
        let result = convert_html_to_markdown(html, Some(options), None).unwrap();
        assert!(result.contains("# Heading"));
    }

    /// List items are preserved in the output.
    #[test]
    fn test_html_with_list() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let result = convert_html_to_markdown(html, None, None).unwrap();
        assert!(result.contains("Item 1"));
        assert!(result.contains("Item 2"));
    }

    /// Table header and cell text are preserved in the output.
    #[test]
    fn test_html_with_table() {
        let html = "<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>";
        let result = convert_html_to_markdown(html, None, None).unwrap();
        assert!(result.contains("Header"));
        assert!(result.contains("Data"));
    }

    /// Enabling preprocessing with navigation removal keeps the body content.
    #[test]
    fn test_preprocessing_config() {
        let html = "<nav>Navigation</nav><p>Content</p>";
        let mut options = ConversionOptions::default();
        options.preprocessing.enabled = true;
        options.preprocessing.preset = super::super::types::PreprocessingPreset::Standard;
        options.preprocessing.remove_navigation = true;
        let result = convert_html_to_markdown(html, Some(options), None).unwrap();
        assert!(result.contains("Content"));
    }

    /// With default options the `<main>` article text is present in the output.
    #[test]
    fn test_preprocessing_keeps_main_content() {
        let html = r#"
<!DOCTYPE html>
<html>
<body>
<nav><p>Skip me</p></nav>
<main id="content">
<article>
<h1>Taylor Swift</h1>
<p>Taylor Alison Swift is an American singer-songwriter.</p>
</article>
</main>
</body>
</html>
"#;
        let markdown = convert_html_to_markdown(html, None, None).expect("conversion failed");
        assert!(markdown.contains("Taylor Alison Swift"), "{markdown}");
    }

    /// Title, description, author, canonical URL and base href are extracted.
    #[test]
    fn test_metadata_document_fields() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Amazing Article</title>
<meta name="description" content="This is a description of the article">
<meta name="author" content="Jane Doe">
<link rel="canonical" href="https://example.com/article/amazing">
<base href="https://example.com/">
</head>
<body>
<h1>Amazing Article</h1>
<p>Content here.</p>
</body>
</html>"#;
        let (_, metadata) = convert_html_to_markdown_with_metadata(html, None, None).unwrap();
        let metadata = metadata.expect("metadata should be present");
        assert_eq!(
            metadata.title,
            Some("Amazing Article".to_string()),
            "Title should be extracted from <title> tag"
        );
        assert_eq!(
            metadata.description,
            Some("This is a description of the article".to_string()),
            "Description should be extracted from meta description tag"
        );
        assert_eq!(
            metadata.author,
            Some("Jane Doe".to_string()),
            "Author should be extracted from meta author tag"
        );
        assert_eq!(
            metadata.canonical_url,
            Some("https://example.com/article/amazing".to_string()),
            "Canonical URL should be extracted from link[rel=canonical]"
        );
        assert_eq!(
            metadata.base_href,
            Some("https://example.com/".to_string()),
            "Base href should be extracted from <base> tag"
        );
    }

    /// Empty input yields either no metadata or metadata that reports empty.
    #[test]
    fn test_metadata_empty_html() {
        let html = "";
        let (_, metadata) = convert_html_to_markdown_with_metadata(html, None, None).unwrap();
        if let Some(meta) = metadata {
            assert!(meta.is_empty(), "Empty HTML should return None or empty metadata");
        }
    }

    /// A page without metadata tags produces no Open Graph or Twitter data.
    #[test]
    fn test_metadata_no_metadata() {
        let html = r#"<!DOCTYPE html>
<html>
<body>
<h1>Simple Page</h1>
<p>Just content, no metadata tags.</p>
</body>
</html>"#;
        let (_, metadata) = convert_html_to_markdown_with_metadata(html, None, None).unwrap();
        // The title is deliberately unconstrained: it may or may not be derived
        // from the <h1>, so there is nothing meaningful to assert about it.
        if let Some(meta) = metadata {
            assert!(meta.open_graph.is_empty(), "Open Graph should be empty with no OG tags");
            assert!(
                meta.twitter_card.is_empty(),
                "Twitter Card should be empty with no Twitter tags"
            );
        }
    }

    /// Malformed markup must not turn into a conversion error.
    #[test]
    fn test_metadata_malformed_html() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Malformed
<meta name="author content="No closing quote
</head>
<body>
<h1>Title
<p>Unclosed paragraph
<div>Unmatched closing tag</div></div>
</body>
</html>"#;
        let result = convert_html_to_markdown_with_metadata(html, None, None);
        // Only success is checked; the shape of the metadata recovered from
        // broken markup is left unconstrained.
        assert!(
            result.is_ok(),
            "Malformed HTML should be handled gracefully without error"
        );
    }

    /// Non-ASCII and special characters in metadata values are preserved.
    #[test]
    fn test_metadata_special_characters() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Café & Restaurant "Guide"</title>
<meta name="description" content="5 stars ★★★★★ < 50% off">
<meta name="author" content="José García-López">
<meta property="og:title" content="Quote "Special" & Characters">
</head>
<body>
<h1>Article Title © 2024</h1>
</body>
</html>"#;
        let (_, metadata) = convert_html_to_markdown_with_metadata(html, None, None).unwrap();
        let metadata = metadata.expect("metadata should be present");
        if let Some(title) = &metadata.title {
            assert!(!title.is_empty(), "Title should be extracted and decoded");
        }
        if let Some(author) = &metadata.author {
            // NOTE(review): the "Jose" alternative only matches an ASCII-folded
            // author; kept to avoid tightening the test — confirm it is wanted.
            assert!(
                author.contains("García") || author.contains("Jose"),
                "Special characters should be handled correctly"
            );
        }
        if let Some(desc) = &metadata.description {
            assert!(!desc.is_empty(), "Description should be extracted");
        }
    }

    /// Duplicate meta tags still yield a single populated value.
    #[test]
    fn test_metadata_duplicate_tags() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>First Title</title>
<meta name="description" content="First description">
<meta name="description" content="Second description (should override)">
<meta name="author" content="Author One">
<meta name="author" content="Author Two">
</head>
<body>
<p>Content</p>
</body>
</html>"#;
        let (_, metadata) = convert_html_to_markdown_with_metadata(html, None, None).unwrap();
        let metadata = metadata.expect("metadata should be present");
        if let Some(title) = &metadata.title {
            assert_eq!(
                title, "First Title",
                "Title should be the single value from first title tag"
            );
        }
        if let Some(description) = &metadata.description {
            assert!(
                !description.is_empty(),
                "Description should be populated even with duplicates"
            );
            assert!(
                description.contains("First") || description.contains("Second"),
                "Description should contain one of the duplicate values"
            );
        }
    }

    /// Invalid JSON-LD must not break conversion or tag-based metadata.
    #[test]
    fn test_malformed_json_ld_graceful_handling() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Malformed JSON-LD Test</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"author": "John Doe"
"datePublished": "2024-01-01"
}
</script>
</head>
<body>
<h1>Article Title</h1>
<p>This HTML contains invalid JSON-LD (missing comma after author field)</p>
</body>
</html>"#;
        let result = convert_html_to_markdown_with_metadata(html, None, None);
        assert!(
            result.is_ok(),
            "Malformed JSON-LD should not cause panic. Error: {:?}",
            result.err()
        );
        let (markdown, metadata) = result.unwrap();
        assert!(
            !markdown.is_empty(),
            "Markdown should be extracted despite invalid JSON-LD"
        );
        assert!(
            markdown.contains("Article Title") || markdown.contains("Article"),
            "Content should be properly converted to Markdown"
        );
        if let Some(meta) = metadata {
            assert_eq!(
                meta.title,
                Some("Malformed JSON-LD Test".to_string()),
                "Document metadata should be extracted from tags"
            );
        }
    }

    /// Script-like content inside `<title>` does not break extraction.
    #[test]
    fn test_metadata_xss_sanitization() {
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Safe Title <script>alert('xss')</script></title>
<meta name="description" content="Description with encoded content">
<meta name="author" content="Author Name">
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head>
<body>
<h1>Title Section</h1>
<p>Content here</p>
</body>
</html>"#;
        let result = convert_html_to_markdown_with_metadata(html, None, None);
        assert!(
            result.is_ok(),
            "HTML with script-like content should not cause error. Error: {:?}",
            result.err()
        );
        let (markdown, metadata) = result.unwrap();
        assert!(!markdown.is_empty(), "Markdown should be generated");
        if let Some(meta) = metadata {
            if let Some(title) = &meta.title {
                assert!(!title.is_empty(), "Title should be extracted");
                assert!(
                    title.contains("Safe") || title.contains("script"),
                    "Title should extract content from title tag: {}",
                    title
                );
            }
            if let Some(desc) = &meta.description {
                assert!(!desc.is_empty(), "Description should be extracted");
            }
            if let Some(author) = &meta.author {
                assert_eq!(author, "Author Name", "Author should be correctly extracted");
            }
            // Open Graph contents are intentionally left unconstrained here.
        }
    }

    /// Djot output keeps headings and paragraph text.
    #[test]
    fn test_convert_html_to_djot() {
        use crate::core::config::OutputFormat;
        let html = "<h1>Hello World</h1><p>This is a test.</p>";
        let result = convert_html_to_markdown(html, None, Some(OutputFormat::Djot)).unwrap();
        assert!(result.contains("# Hello World"));
        assert!(result.contains("This is a test."));
    }

    /// Djot output uses `*strong*` and `_emphasis_` markers.
    #[test]
    fn test_convert_html_to_djot_with_emphasis() {
        use crate::core::config::OutputFormat;
        let html = "<p>This is <strong>bold</strong> and <em>italic</em>.</p>";
        let result = convert_html_to_markdown(html, None, Some(OutputFormat::Djot)).unwrap();
        assert!(result.contains("*bold*"));
        assert!(result.contains("_italic_"));
    }

    /// Metadata extraction works together with Djot output.
    #[test]
    fn test_convert_html_with_metadata_djot() {
        use crate::core::config::OutputFormat;
        let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="Test description">
</head>
<body>
<h1>Content</h1>
<p>This is <strong>content</strong>.</p>
</body>
</html>"#;
        let (content, metadata) =
            convert_html_to_markdown_with_metadata(html, None, Some(OutputFormat::Djot)).unwrap();
        assert!(content.contains("# Content"));
        assert!(content.contains("*content*"));
        let meta = metadata.expect("metadata should be present");
        assert_eq!(meta.title, Some("Test Page".to_string()));
        assert_eq!(meta.description, Some("Test description".to_string()));
    }
}