use crate::tools::tool::ToolError;
#[cfg(feature = "html-to-markdown")]
use html_to_markdown_rs::convert;
#[cfg(feature = "html-to-markdown")]
use readabilityrs::Readability;
#[cfg(not(feature = "html-to-markdown"))]
pub fn convert_html_to_markdown(html: &str, _url: &str) -> Result<String, ToolError> {
Ok(html.to_string())
}
#[cfg(feature = "html-to-markdown")]
pub fn convert_html_to_markdown(html: &str, url: &str) -> Result<String, ToolError> {
let readability = Readability::new(html, Some(url), None)
.map_err(|e| ToolError::ExecutionFailed(format!("readability parser: {:?}", e)))?;
let article = readability.parse().ok_or_else(|| {
ToolError::ExecutionFailed("failed to extract article content".to_string())
})?;
let clean_html = article.content.ok_or_else(|| {
ToolError::ExecutionFailed("no content extracted from article".to_string())
})?;
let markdown = convert(&clean_html, None)
.map_err(|e| ToolError::ExecutionFailed(format!("HTML to markdown: {}", e)))?;
Ok(markdown)
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(not(feature = "html-to-markdown"))]
#[test]
fn passthrough_returns_input_unchanged_when_feature_disabled() {
{
let html = "<html><body>raw</body></html>";
let out = convert_html_to_markdown(html, "https://example.com/").unwrap();
assert_eq!(out, html);
}
}
#[cfg(not(feature = "html-to-markdown"))]
#[test]
fn passthrough_ignores_url_when_feature_disabled() {
{
let html = "anything";
let _ = convert_html_to_markdown(html, "").unwrap();
let _ = convert_html_to_markdown(html, "https://example.com/page").unwrap();
}
}
#[cfg(feature = "html-to-markdown")]
#[test]
fn simple_article_extracted_and_converted_to_markdown() {
let html = r#"<!DOCTYPE html>
<html><head><title>Test</title></head><body>
<nav><a href="/">Home</a></nav>
<main>
<article>
<h1>Test Title</h1>
<p>First paragraph with enough text so that readability's scoring finds this as the main content block. We need to exceed the default character threshold.</p>
<p>Second paragraph. More body text here to make the article clearly the dominant content area versus the short nav and footer.</p>
<p>Third paragraph for good measure. The extraction algorithm scores candidates by paragraph count and text length; this block should win.</p>
</article>
</main>
<footer><p>Footer</p></footer>
</body></html>"#;
let out = convert_html_to_markdown(html, "https://example.com/article").unwrap();
assert!(
out.contains("Test Title"),
"expected title in output: {}",
out
);
assert!(
out.contains("First paragraph"),
"expected content in output: {}",
out
);
assert!(
out.contains("Second paragraph"),
"expected content in output: {}",
out
);
assert!(
!out.contains("<article>"),
"expected markdown, not raw HTML"
);
}
#[cfg(feature = "html-to-markdown")]
#[test]
fn returns_execution_error_on_empty_html() {
let result = convert_html_to_markdown("", "https://example.com/");
let err = result.unwrap_err();
let msg = err.to_string();
assert!(
msg.contains("Execution failed") || msg.contains("extract") || msg.contains("content"),
"{}",
msg
);
}
#[cfg(feature = "html-to-markdown")]
#[test]
fn returns_execution_error_on_plain_text_not_html() {
let result = convert_html_to_markdown("not html at all", "https://example.com/");
let err = result.unwrap_err();
let msg = err.to_string();
assert!(
msg.contains("Execution failed")
|| msg.contains("extract")
|| msg.contains("content")
|| msg.contains("parser"),
"{}",
msg
);
}
}