use anyhow::{Context, Result};
use serde_json::Value;
use super::Tool;
/// Upper bound, in bytes, on the text returned by a fetch; longer output is truncated.
const MAX_CONTENT: usize = 50_000;
/// Tool that fetches a URL and returns the response body as text.
pub struct WebFetchTool;
impl Tool for WebFetchTool {
    /// Identifier under which this tool is registered.
    fn name(&self) -> &str {
        "webfetch"
    }

    /// Human-readable summary of what the tool does.
    fn description(&self) -> &str {
        "Fetch content from a URL and return it as text. Automatically strips HTML tags for web pages."
    }

    /// JSON schema of the accepted input: an object with a required string `url`.
    fn input_schema(&self) -> Value {
        serde_json::json!({
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch content from"
                }
            },
            "required": ["url"]
        })
    }

    /// Fetches `input["url"]` with a blocking GET and returns the body as text.
    ///
    /// HTML responses are reduced to plain text via `strip_html`. Output longer than
    /// `MAX_CONTENT` bytes is cut (on a character boundary) and suffixed with a
    /// truncation notice.
    ///
    /// # Errors
    ///
    /// Fails when `url` is missing or not a string, when the request cannot be sent,
    /// when the server answers with a non-success status, or when the body cannot be
    /// read as text.
    fn execute(&self, input: Value) -> Result<String> {
        let url = input["url"]
            .as_str()
            .context("Missing required parameter 'url'")?;
        tracing::debug!("webfetch: {}", url);

        let response =
            reqwest::blocking::get(url).with_context(|| format!("failed to fetch: {}", url))?;
        let status = response.status();
        if !status.is_success() {
            anyhow::bail!("HTTP {}: {}", status.as_u16(), url);
        }

        // Media types are case-insensitive, so normalise before matching. The header has
        // to be copied out because `text()` below consumes the response.
        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .map(str::to_ascii_lowercase)
            .unwrap_or_default();

        let body = response
            .text()
            .with_context(|| format!("failed to read response from: {}", url))?;
        let text = if content_type.contains("text/html") {
            strip_html(&body)
        } else {
            body
        };

        if text.len() <= MAX_CONTENT {
            return Ok(text);
        }

        // Slicing at a fixed byte offset panics when it lands inside a multi-byte UTF-8
        // sequence; back up to the nearest boundary (offset 0 always is one).
        let mut cut = MAX_CONTENT;
        while !text.is_char_boundary(cut) {
            cut -= 1;
        }
        Ok(format!(
            "{}\n... (truncated at {} chars)",
            &text[..cut],
            MAX_CONTENT
        ))
    }
}
/// Converts an HTML document into plain text.
///
/// * Tags and comments are removed; the contents of `<script>` and `<style>` elements
///   are dropped entirely.
/// * Opening tags that start a new block (see `skip_markup`) produce a line break.
/// * Character references (`&amp;`, `&#39;`, `&#x27;`, ...) are decoded; unknown ones
///   become a single space.
/// * Runs of whitespace in text collapse to one space and the result is trimmed.
///
/// All scanning is done on a `Vec<char>` so every index is a character index; nothing
/// here slices the original `&str` by position, which keeps non-ASCII input safe.
fn strip_html(html: &str) -> String {
    let chars: Vec<char> = html.chars().collect();
    let len = chars.len();
    let mut result = String::with_capacity(html.len() / 3);
    let mut last_was_space = false;
    let mut i = 0;

    while i < len {
        let c = chars[i];

        if c == '<' && is_markup_start(chars.get(i + 1).copied()) {
            i = skip_markup(&chars, i, &mut result);
            continue;
        }

        if c == '&' {
            if let Some((decoded, next)) = parse_entity(&chars, i) {
                result.push(decoded);
                last_was_space = decoded == ' ';
                i = next;
                continue;
            }
            // Not a well-formed reference: fall through and keep the '&' as text.
        }

        if c.is_whitespace() {
            if !last_was_space && !result.is_empty() {
                result.push(' ');
                last_was_space = true;
            }
        } else {
            result.push(c);
            last_was_space = false;
        }
        i += 1;
    }

    // '\n' is only ever pushed when the buffer does not already end in one, so there are
    // never consecutive blank lines to squeeze; trimming is all that is left to do.
    result.trim().to_string()
}

/// Returns whether the character following a `<` makes it the start of markup.
///
/// A `<` followed by anything else (as in `a < b`) is ordinary text.
fn is_markup_start(next: Option<char>) -> bool {
    matches!(next, Some(c) if c.is_ascii_alphabetic() || c == '/' || c == '!' || c == '?')
}

/// Skips the markup starting at `chars[start]` (which is `<`) and returns the index of
/// the first character after it.
///
/// Appends a line break to `out` for opening tags whose name starts with one of the
/// block prefixes, unless `out` already ends with one. The prefix match is deliberately
/// loose (`h` covers `h1`..`h6`, `hr`, `header`, ...).
fn skip_markup(chars: &[char], start: usize, out: &mut String) -> usize {
    const BLOCK_PREFIXES: [&str; 6] = ["br", "p", "div", "h", "li", "tr"];

    // Comments may contain '>' so they must be skipped up to their real terminator.
    if chars[start..].starts_with(&['<', '!', '-', '-']) {
        return find_ignore_ascii_case(chars, start + 4, "-->").map_or(chars.len(), |p| p + 3);
    }

    let is_closing = chars.get(start + 1) == Some(&'/');
    let name_start = start + 1 + usize::from(is_closing);
    let name: String = chars[name_start..]
        .iter()
        .take_while(|c| c.is_ascii_alphanumeric())
        .map(|c| c.to_ascii_lowercase())
        .collect();
    let tag_end = skip_past(chars, start, '>');

    if is_closing {
        return tag_end;
    }

    if name == "script" || name == "style" {
        // Raw-text element: its body is not markup, so jump straight to the closing tag
        // instead of letting a stray '<' or '>' in the code confuse the scanner.
        let closer = format!("</{}", name);
        return match find_ignore_ascii_case(chars, tag_end, &closer) {
            Some(pos) => skip_past(chars, pos, '>'),
            None => chars.len(),
        };
    }

    if BLOCK_PREFIXES.iter().any(|p| name.starts_with(*p)) && !out.ends_with('\n') {
        out.push('\n');
    }
    tag_end
}

/// Returns the index just after the first `target` at or after `from`, or `chars.len()`
/// when `target` does not occur.
fn skip_past(chars: &[char], from: usize, target: char) -> usize {
    chars[from..]
        .iter()
        .position(|&c| c == target)
        .map_or(chars.len(), |p| from + p + 1)
}

/// Finds the first occurrence of the non-empty `needle` in `haystack` at or after
/// `from`, ignoring ASCII case, and returns its starting index.
fn find_ignore_ascii_case(haystack: &[char], from: usize, needle: &str) -> Option<usize> {
    let needle: Vec<char> = needle.chars().collect();
    if needle.is_empty() || from > haystack.len() {
        return None;
    }
    haystack[from..]
        .windows(needle.len())
        .position(|w| w.iter().zip(&needle).all(|(a, b)| a.eq_ignore_ascii_case(b)))
        .map(|pos| pos + from)
}

/// Parses a character reference starting at `chars[amp]` (which is `&`).
///
/// Returns the decoded character and the index just past the terminating `;`, or `None`
/// when the text is not a plausible reference (no `;` within a short distance, or
/// characters other than ASCII alphanumerics and `#` in the name). The length bound
/// prevents a bare `&` from swallowing text up to some unrelated `;` later on.
fn parse_entity(chars: &[char], amp: usize) -> Option<(char, usize)> {
    // The longest named reference in HTML is 31 characters.
    const MAX_ENTITY_NAME: usize = 32;

    let body = &chars[amp + 1..];
    let name_len = body
        .iter()
        .take(MAX_ENTITY_NAME + 1)
        .position(|&c| c == ';')?;
    let name: String = body[..name_len].iter().collect();
    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '#') {
        return None;
    }
    Some((decode_entity(&name), amp + name_len + 2))
}

/// Decodes the name of a character reference (the text between `&` and `;`).
///
/// Handles the common named references plus decimal (`#39`) and hexadecimal (`#x27`)
/// forms. Unknown names, invalid code points, and anything that decodes to whitespace
/// or a control character yield a plain space.
fn decode_entity(name: &str) -> char {
    let decoded = match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        _ => name.strip_prefix('#').and_then(|num| {
            let code = match num.strip_prefix('x').or_else(|| num.strip_prefix('X')) {
                Some(hex) => u32::from_str_radix(hex, 16),
                None => num.parse::<u32>(),
            };
            code.ok().and_then(char::from_u32)
        }),
    };
    match decoded {
        Some(c) if !c.is_whitespace() && !c.is_control() => c,
        _ => ' ',
    }
}