use std::path::Path;
use anyhow::Result;
pub fn html_to_markdown(source: &str) -> Result<String> {
let _span = tracing::info_span!("html_to_markdown").entered();
let markdown = html2md::rewrite_html(source, false);
if markdown.trim().is_empty() {
tracing::warn!("HTML conversion produced empty output");
anyhow::bail!("HTML conversion produced no content");
}
tracing::info!(bytes = markdown.len(), "HTML converted to markdown");
Ok(markdown)
}
const MAX_CONVERT_FILE_SIZE: u64 = 100 * 1024 * 1024;
pub fn html_file_to_markdown(path: &Path) -> Result<String> {
let _span = tracing::info_span!("html_file_to_markdown", path = %path.display()).entered();
let meta = std::fs::metadata(path)
.map_err(|e| anyhow::anyhow!("Failed to stat {}: {}", path.display(), e))?;
if meta.len() > MAX_CONVERT_FILE_SIZE {
anyhow::bail!(
"File {} exceeds {} MB size limit",
path.display(),
MAX_CONVERT_FILE_SIZE / 1024 / 1024,
);
}
let source = std::fs::read_to_string(path)
.map_err(|e| anyhow::anyhow!("Failed to read {}: {}", path.display(), e))?;
html_to_markdown(&source)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_html_to_markdown_basic_paragraph() {
let html = "<p>Hello, world!</p>";
let result = html_to_markdown(html).expect("should convert simple paragraph");
assert!(
result.contains("Hello, world!"),
"converted markdown should contain the paragraph text"
);
}
#[test]
fn test_html_to_markdown_heading() {
let html = "<h1>My Heading</h1><p>Some text.</p>";
let result = html_to_markdown(html).expect("should convert heading and paragraph");
assert!(
result.contains("My Heading"),
"converted markdown should contain the heading text"
);
assert!(
result.contains("Some text."),
"converted markdown should contain the paragraph text"
);
}
#[test]
fn test_html_to_markdown_empty_returns_error() {
let result = html_to_markdown(" ");
assert!(result.is_err(), "empty HTML should return an error");
}
}