use async_trait::async_trait;
use serde_json::json;
use crate::tools::error::ToolError;
use crate::tools::trait_::Tool;
/// Tool that fetches a web page over HTTP(S) and returns its text with HTML tags removed.
#[derive(Debug, Clone)]
pub struct WebFetchTool {
/// Request timeout in seconds, applied to the HTTP client built for each fetch.
timeout_secs: u64,
/// Largest response body, in bytes, that a fetch will accept.
max_size: usize,
}
impl WebFetchTool {
    /// Creates a tool with the default limits: a 15-second timeout and a
    /// 500 KiB (`500 * 1024` bytes) response size cap.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            max_size: 500 * 1024,
            timeout_secs: 15,
        }
    }

    /// Returns the tool with its request timeout replaced by `secs` seconds.
    ///
    /// All other settings are carried over unchanged.
    #[must_use]
    pub const fn with_timeout(self, secs: u64) -> Self {
        Self {
            timeout_secs: secs,
            ..self
        }
    }

    /// Returns the tool with its maximum response size replaced by `bytes`.
    ///
    /// All other settings are carried over unchanged.
    #[must_use]
    pub const fn with_max_size(self, bytes: usize) -> Self {
        Self {
            max_size: bytes,
            ..self
        }
    }
}
impl Default for WebFetchTool {
/// Returns the same configuration as [`WebFetchTool::new`].
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Tool for WebFetchTool {
    /// Returns the identifier of this tool: `web_fetch`.
    #[allow(
        clippy::unnecessary_literal_bound,
        reason = "trait signature requires &self lifetime"
    )]
    fn name(&self) -> &str {
        "web_fetch"
    }

    /// Returns the usage guidance for this tool: what it does, when to use it
    /// and when not to.
    #[allow(
        clippy::unnecessary_literal_bound,
        reason = "trait signature requires &self lifetime"
    )]
    fn description(&self) -> &str {
        "Fetch the full text content of a webpage. \
         Use this tool to read the complete content of articles, documentation, \
         or other web pages when search snippets are not sufficient.\n\n\
         When to use:\n\
         - When search results reference an article you need to read fully\n\
         - When you need specific details from a webpage\n\
         - When the search snippet is too brief to answer the question\n\n\
         When NOT to use:\n\
         - For simple facts that search snippets already cover\n\
         - For sites that require authentication\n\
         - For dynamic/JavaScript-heavy sites that may not render well"
    }

    /// Returns the JSON Schema for the tool input: an object with a single
    /// required string property, `url`.
    fn schema(&self) -> serde_json::Value {
        json!({
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL of the webpage to fetch"
                }
            },
            "required": ["url"]
        })
    }

    /// Fetches `input["url"]` with a GET request and returns the response
    /// body with HTML tags stripped and whitespace collapsed.
    ///
    /// # Errors
    ///
    /// Returns an invalid-input error when `url` is missing, is not a string,
    /// is blank, or does not use the `http`/`https` scheme.
    ///
    /// Returns an execution error when the HTTP client cannot be built, the
    /// request fails, the status is not a success, the body cannot be read,
    /// the body exceeds the configured maximum size, or no text remains
    /// after stripping tags.
    async fn invoke(&self, input: serde_json::Value) -> Result<String, ToolError> {
        // Trim once up front so the same string is validated and fetched;
        // surrounding whitespace is never a meaningful part of a URL.
        let url = input["url"]
            .as_str()
            .ok_or_else(|| ToolError::invalid_input("Missing 'url' field".to_string()))?
            .trim();
        if url.is_empty() {
            return Err(ToolError::invalid_input("URL cannot be empty".to_string()));
        }

        // URI schemes are case-insensitive, so `HTTP://` is as valid as `http://`.
        // `get(..len)` yields `None` (instead of panicking) when the URL is
        // shorter than the prefix or the cut is not on a char boundary.
        let has_http_scheme = ["http://", "https://"].iter().any(|prefix| {
            url.get(..prefix.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
        });
        if !has_http_scheme {
            return Err(ToolError::invalid_input(format!(
                "URL must start with http:// or https://, got: {url}"
            )));
        }

        // NOTE(review): the URL comes straight from tool input and no host
        // filtering happens in this function, so loopback/private addresses
        // are reachable unless that is restricted elsewhere — confirm.
        let client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(self.timeout_secs))
            .build()
            .map_err(|e| {
                ToolError::execution_failed(format!("Failed to create HTTP client: {e}"))
            })?;

        let response = client
            .get(url)
            .header("User-Agent", "Mozilla/5.0 (compatible; JunctureBot/1.0)")
            .send()
            .await
            .map_err(|e| ToolError::execution_failed(format!("Failed to fetch URL: {e}")))?;

        if !response.status().is_success() {
            return Err(ToolError::execution_failed(format!(
                "HTTP error: {} for URL: {url}",
                response.status()
            )));
        }

        // Reject oversized responses before downloading them when the body
        // length is known in advance. A length that does not even fit in
        // `usize` is necessarily over the limit.
        if let Some(declared) = response.content_length() {
            let over_limit = usize::try_from(declared).map_or(true, |len| len > self.max_size);
            if over_limit {
                return Err(ToolError::execution_failed(format!(
                    "Response too large: {declared} bytes (max {} bytes)",
                    self.max_size
                )));
            }
        }

        let body = response.text().await.map_err(|e| {
            ToolError::execution_failed(format!("Failed to read response body: {e}"))
        })?;

        // Still needed: the length may have been unknown up front, and the
        // decoded text can differ in size from what was declared.
        if body.len() > self.max_size {
            return Err(ToolError::execution_failed(format!(
                "Response too large: {} bytes (max {} bytes)",
                body.len(),
                self.max_size
            )));
        }

        let text = strip_html_tags(&body);
        if text.trim().is_empty() {
            return Err(ToolError::execution_failed(
                "No text content found on the page".to_string(),
            ));
        }
        Ok(text)
    }
}
/// Removes HTML tags from `html` and normalizes whitespace.
///
/// Everything from a `<` up to the next `>` is dropped and acts as a word
/// separator. Runs of whitespace (and removed tags) collapse to a single
/// space, and the result has no leading or trailing whitespace.
///
/// A `>` that appears outside of a tag is ordinary text and is kept.
///
/// This is a lightweight scanner, not an HTML parser: the text inside
/// `<script>`/`<style>` elements is kept, character entities such as
/// `&amp;` are not decoded, and an unterminated `<` swallows the rest of
/// the input up to the next `>`.
fn strip_html_tags(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;
    // Set when a separator (whitespace or a removed tag) has been seen since
    // the last emitted character; it is flushed as one space only right
    // before the next visible character, so nothing needs trimming at the end.
    let mut pending_space = false;

    for c in html.chars() {
        match c {
            '<' => in_tag = true,
            '>' if in_tag => {
                in_tag = false;
                pending_space = true;
            }
            _ if in_tag => {}
            _ if c.is_whitespace() => pending_space = true,
            _ => {
                if pending_space && !text.is_empty() {
                    text.push(' ');
                }
                pending_space = false;
                text.push(c);
            }
        }
    }

    text
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The tool reports the name `web_fetch`.
    #[test]
    fn test_web_fetch_tool_name() {
        assert_eq!(WebFetchTool::new().name(), "web_fetch");
    }

    /// The description mentions fetching.
    #[test]
    fn test_web_fetch_tool_description() {
        assert!(WebFetchTool::new().description().contains("Fetch"));
    }

    /// The schema is an object schema that declares a `url` property.
    #[test]
    fn test_web_fetch_tool_schema() {
        let schema = WebFetchTool::new().schema();
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["url"].is_object());
    }

    /// `Default` produces a usable tool with the expected name.
    #[test]
    fn test_web_fetch_tool_default() {
        assert_eq!(WebFetchTool::default().name(), "web_fetch");
    }

    /// Builder methods override the timeout and the size limit.
    #[test]
    fn test_web_fetch_tool_builder() {
        let configured = WebFetchTool::new()
            .with_timeout(30)
            .with_max_size(1024 * 1024);
        assert_eq!(configured.timeout_secs, 30);
        assert_eq!(configured.max_size, 1024 * 1024);
    }

    /// Input without a `url` field is rejected.
    #[tokio::test]
    async fn test_web_fetch_missing_url() {
        WebFetchTool::new().invoke(json!({})).await.unwrap_err();
    }

    /// A whitespace-only URL is rejected.
    #[tokio::test]
    async fn test_web_fetch_empty_url() {
        WebFetchTool::new()
            .invoke(json!({"url": " "}))
            .await
            .unwrap_err();
    }

    /// A URL with a scheme other than http/https is rejected.
    #[tokio::test]
    async fn test_web_fetch_invalid_scheme() {
        WebFetchTool::new()
            .invoke(json!({"url": "ftp://example.com"}))
            .await
            .unwrap_err();
    }

    /// Tags are removed and adjacent elements are separated by one space.
    #[test]
    fn test_strip_html_tags() {
        assert_eq!(
            strip_html_tags("<html><body><h1>Hello</h1><p>World</p></body></html>"),
            "Hello World"
        );
    }

    /// Runs of whitespace, including newlines, collapse to a single space.
    #[test]
    fn test_strip_html_tags_whitespace() {
        assert_eq!(strip_html_tags("<p>Hello \n World</p>"), "Hello World");
    }

    /// Nested inline tags do not introduce extra spaces.
    #[test]
    fn test_strip_html_tags_nested() {
        assert_eq!(
            strip_html_tags("<div><span><b>Bold</b> text</span></div>"),
            "Bold text"
        );
    }
}