use super::super::{McpToolCall, McpToolResult};
use anyhow::{anyhow, Result};
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use reqwest;
use serde_json::{json, Value};
use std::path::Path;
use tokio::fs as tokio_fs;
use url::Url;
pub async fn execute_read_html(call: &McpToolCall) -> Result<McpToolResult> {
let sources_value = match call.parameters.get("sources") {
Some(value) => value,
_ => {
return Ok(McpToolResult::error(
call.tool_name.clone(),
call.tool_id.clone(),
"Missing 'sources' parameter".to_string(),
))
}
};
match sources_value {
Value::String(source) => {
convert_single_html_to_md(call, source).await
}
Value::Array(sources) => {
let mut source_strings = Vec::new();
for source in sources {
match source.as_str() {
Some(s) => source_strings.push(s.to_string()),
None => {
return Ok(McpToolResult::error(
call.tool_name.clone(),
call.tool_id.clone(),
"Invalid source in array - all sources must be strings".to_string(),
))
}
}
}
convert_multiple_html_to_md(call, &source_strings).await
}
_ => Ok(McpToolResult::error(
call.tool_name.clone(),
call.tool_id.clone(),
"'sources' parameter must be a string or array of strings".to_string(),
)),
}
}
/// Fetches a single HTML source and returns its Markdown rendering as a tool result.
///
/// The result payload always contains exactly one entry in `conversions`, carrying
/// the original `source`, its kind as reported by the fetcher, the Markdown text and
/// the Markdown length in bytes.
///
/// # Errors
///
/// Any fetch or conversion failure is propagated to the caller as an `Err`.
async fn convert_single_html_to_md(call: &McpToolCall, source: &str) -> Result<McpToolResult> {
    let (html, origin) = fetch_html_content(source).await?;
    let rendered = html_to_markdown(&html)?;

    let entry = json!({
        "source": source,
        "type": origin,
        "markdown": rendered,
        "size": rendered.len()
    });
    let payload = json!({
        "success": true,
        "conversions": [entry],
        "count": 1
    });

    Ok(McpToolResult {
        tool_name: "read_html".to_string(),
        tool_id: call.tool_id.clone(),
        result: payload,
    })
}
/// Converts several HTML sources to Markdown, one after another, collecting
/// per-source failures instead of aborting.
///
/// Each source that is fetched and converted successfully contributes one entry to
/// `conversions`; each failure contributes a human-readable message to `failed`.
/// `success` is `true` when at least one source was converted.
async fn convert_multiple_html_to_md(
    call: &McpToolCall,
    sources: &[String],
) -> Result<McpToolResult> {
    let mut converted = Vec::with_capacity(sources.len());
    let mut errors = Vec::new();

    for source in sources {
        let (html, origin) = match fetch_html_content(source).await {
            Ok(fetched) => fetched,
            Err(e) => {
                errors.push(format!("Failed to fetch {}: {}", source, e));
                continue;
            }
        };

        match html_to_markdown(&html) {
            Ok(rendered) => converted.push(json!({
                "source": source,
                "type": origin,
                "markdown": rendered,
                "size": rendered.len()
            })),
            Err(e) => errors.push(format!(
                "Failed to convert {} to markdown: {}",
                source, e
            )),
        }
    }

    let any_succeeded = !converted.is_empty();
    let total = converted.len();

    Ok(McpToolResult {
        tool_name: "read_html".to_string(),
        tool_id: call.tool_id.clone(),
        result: json!({
            "success": any_succeeded,
            "conversions": converted,
            "count": total,
            "failed": errors
        }),
    })
}
async fn fetch_html_content(source: &str) -> Result<(String, &'static str)> {
if let Ok(url) = Url::parse(source) {
if url.scheme() == "http" || url.scheme() == "https" {
let response = reqwest::get(source).await?;
if !response.status().is_success() {
return Err(anyhow!("HTTP error {}: {}", response.status(), source));
}
let html = response.text().await?;
Ok((html, "url"))
} else if url.scheme() == "file" {
let path = url
.to_file_path()
.map_err(|_| anyhow!("Invalid file URL: {}", source))?;
let html = tokio_fs::read_to_string(&path).await?;
Ok((html, "file"))
} else {
Err(anyhow!("Unsupported URL scheme: {}", url.scheme()))
}
} else {
let path = Path::new(source);
if !path.exists() {
return Err(anyhow!("File does not exist: {}", source));
}
if !path.is_file() {
return Err(anyhow!("Path is not a file: {}", source));
}
let html = tokio_fs::read_to_string(path).await?;
Ok((html, "file"))
}
}
/// Parses `html` into a DOM and renders that DOM as cleaned-up Markdown.
///
/// The document is walked from its root with a starting list depth of zero, and the
/// raw output is passed through `clean_markdown` before being returned.
///
/// # Errors
///
/// Propagates any error reported while walking the DOM.
fn html_to_markdown(html: &str) -> Result<String> {
    let sink = RcDom::default();
    let dom = parse_document(sink, Default::default()).one(html);

    let mut raw = String::new();
    walk_node(&dom.document, &mut raw, 0)?;

    Ok(clean_markdown(&raw))
}
/// Appends `prefix`, the rendered children of `node`, then `suffix` to `markdown`.
fn wrap_children(
    node: &Handle,
    markdown: &mut String,
    depth: usize,
    prefix: &str,
    suffix: &str,
) -> Result<()> {
    markdown.push_str(prefix);
    process_children(node, markdown, depth)?;
    markdown.push_str(suffix);
    Ok(())
}

/// Recursively renders the DOM subtree rooted at `handle` as Markdown, appending the
/// output to `markdown`.
///
/// `depth` is the current list nesting level: it is `0` outside of any list item and
/// grows by one for the children of every `<li>`.
///
/// Supported elements: headings, paragraphs, bold/italic, inline code, `<pre>`
/// blocks, links, lists, blockquotes, line breaks, horizontal rules and images.
/// `script`, `style`, `head`, `meta`, `link` and `title` are dropped together with
/// their content; every other element is transparent (only its children are
/// rendered).
///
/// Known limitations kept from the existing behavior:
/// - every text node is trimmed on its own, so whitespace that separates text from
///   a neighbouring inline element is lost;
/// - items of `<ol>` are rendered with `-` markers, exactly like `<ul>` items.
fn walk_node(handle: &Handle, markdown: &mut String, depth: usize) -> Result<()> {
    match &handle.data {
        NodeData::Element { name, attrs, .. } => {
            let tag = &*name.local;
            let attrs = attrs.borrow();
            // Returns the value of the attribute with the given local name, if present.
            let attr_value = |wanted: &str| -> Option<String> {
                attrs
                    .iter()
                    .find(|attr| &*attr.name.local == wanted)
                    .map(|attr| attr.value.to_string())
            };

            match tag {
                "h1" => wrap_children(handle, markdown, depth, "\n# ", "\n\n")?,
                "h2" => wrap_children(handle, markdown, depth, "\n## ", "\n\n")?,
                "h3" => wrap_children(handle, markdown, depth, "\n### ", "\n\n")?,
                "h4" => wrap_children(handle, markdown, depth, "\n#### ", "\n\n")?,
                "h5" => wrap_children(handle, markdown, depth, "\n##### ", "\n\n")?,
                "h6" => wrap_children(handle, markdown, depth, "\n###### ", "\n\n")?,
                "p" => wrap_children(handle, markdown, depth, "\n", "\n\n")?,
                "strong" | "b" => wrap_children(handle, markdown, depth, "**", "**")?,
                "em" | "i" => wrap_children(handle, markdown, depth, "*", "*")?,
                "code" => wrap_children(handle, markdown, depth, "`", "`")?,
                "pre" => {
                    markdown.push_str("\n```\n");
                    for child in handle.children.borrow().iter() {
                        match &child.data {
                            // `<pre><code>` is the usual code-block markup: render the
                            // code's content directly so the fence does not end up
                            // containing literal inline-code backticks.
                            NodeData::Element { name, .. } if &*name.local == "code" => {
                                process_children(child, markdown, depth)?;
                            }
                            _ => walk_node(child, markdown, depth)?,
                        }
                    }
                    markdown.push_str("\n```\n\n");
                }
                "a" => match attr_value("href") {
                    Some(href) => {
                        let link_tail = format!("]({})", href);
                        wrap_children(handle, markdown, depth, "[", &link_tail)?;
                    }
                    // An anchor without `href` is rendered as plain content.
                    None => process_children(handle, markdown, depth)?,
                },
                "ul" | "ol" => wrap_children(handle, markdown, depth, "\n", "\n")?,
                "li" => {
                    // Two spaces per nesting level line a nested item up with the
                    // content of its parent `- ` item; top-level items (depth 0) get
                    // no indentation.
                    markdown.push_str(&"  ".repeat(depth));
                    markdown.push_str("- ");
                    process_children(handle, markdown, depth + 1)?;
                    markdown.push('\n');
                }
                "blockquote" => wrap_children(handle, markdown, depth, "\n> ", "\n\n")?,
                // Two trailing spaces are what makes a Markdown hard line break.
                "br" => markdown.push_str("  \n"),
                "hr" => markdown.push_str("\n---\n\n"),
                "img" => {
                    // Images without a `src` produce no output; a missing `alt`
                    // becomes an empty alt text.
                    if let Some(src) = attr_value("src") {
                        let alt = attr_value("alt").unwrap_or_default();
                        markdown.push_str("![");
                        markdown.push_str(&alt);
                        markdown.push_str("](");
                        markdown.push_str(&src);
                        markdown.push(')');
                    }
                }
                // Non-content elements: skip them and everything inside.
                "script" | "style" | "head" | "meta" | "link" | "title" => {}
                _ => process_children(handle, markdown, depth)?,
            }
        }
        NodeData::Text { contents } => {
            let text = contents.borrow();
            // Whitespace-only nodes trim to "" and therefore append nothing.
            markdown.push_str(text.trim());
        }
        // The document root and every other node kind just forward to their children.
        _ => process_children(handle, markdown, depth)?,
    }
    Ok(())
}
/// Renders every direct child of `node` in document order via `walk_node`,
/// stopping at (and returning) the first error.
fn process_children(node: &Handle, markdown: &mut String, depth: usize) -> Result<()> {
    node.children
        .borrow()
        .iter()
        .try_for_each(|child| walk_node(child, markdown, depth))
}
/// Normalizes blank lines in generated Markdown.
///
/// - Blank lines (empty or whitespace-only) at the start and end are removed.
/// - Inside the text, at most two consecutive blank lines are kept; further blank
///   lines in the same run are dropped.
/// - Lines are re-joined with `\n`, so `\r\n` line endings become `\n` and the
///   result has no trailing newline.
///
/// Returns an empty string when the input contains no non-blank line.
fn clean_markdown(markdown: &str) -> String {
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    let lines: Vec<&str> = markdown.lines().collect();

    // Locate the first and last non-blank lines and slice between them, instead of
    // repeatedly removing from the front of the vector (which is quadratic).
    let bounds = (
        lines.iter().position(|line| !is_blank(line)),
        lines.iter().rposition(|line| !is_blank(line)),
    );
    let (first, last) = match bounds {
        (Some(first), Some(last)) => (first, last),
        _ => return String::new(),
    };

    let mut kept: Vec<&str> = Vec::with_capacity(last - first + 1);
    let mut blank_run = 0usize;
    for &line in &lines[first..=last] {
        if is_blank(line) {
            blank_run += 1;
            if blank_run > 2 {
                continue;
            }
        } else {
            blank_run = 0;
        }
        kept.push(line);
    }
    kept.join("\n")
}