use async_trait::async_trait;
use crate::tool::{Tool, ToolDefinition, ToolError, ToolRegistry};
/// Result of fetching a page: its optional title, extracted text and URL.
#[derive(Debug, Clone)]
pub struct PageContent {
/// Page title, when one could be extracted; `None` otherwise.
pub title: Option<String>,
/// Plain text extracted from the page body.
pub text: String,
/// URL associated with the page.
pub url: String,
}
/// Tag prefixes that force a line break in the extracted text.
///
/// Matching is by prefix and ignores ASCII case, so `<p` also matches
/// `<P class="x">` (and, as a side effect, longer names such as `<pre`).
const BLOCK_TAG_PREFIXES: &[&str] = &[
    "<br", "<p", "</p>", "<div", "</div>", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "</h1>",
    "</h2>", "</h3>", "</h4>", "</h5>", "</h6>", "<li", "</li>", "<tr", "</tr>",
];

/// Longest entity name (the text between `&` and `;`) that is considered.
/// Bounding the look-ahead keeps decoding linear on text with many stray `&`.
const MAX_ENTITY_NAME_LEN: usize = 10;

/// Converts an HTML document into readable plain text.
///
/// This is a lightweight scanner, not a full HTML parser. It performs, in order:
///
/// 1. removal of `<script>…</script>` and `<style>…</style>` elements
///    (ASCII case-insensitive) and of `<!-- … -->` comments;
/// 2. insertion of a line break before block-level tags, then removal of all
///    remaining tags;
/// 3. decoding of the common named entities (`amp`, `lt`, `gt`, `quot`, `apos`,
///    `nbsp`) and of decimal/hexadecimal numeric character references;
/// 4. whitespace normalisation: runs of whitespace inside a line collapse to a
///    single space and runs of blank lines collapse to a single blank line.
///
/// An element or comment that is opened but never closed is left in place, so
/// its contents go through the ordinary tag stripping instead of being dropped.
pub fn extract_text(html: &str) -> String {
    let without_scripts = remove_regions(html, "<script", "</script>");
    let without_styles = remove_regions(&without_scripts, "<style", "</style>");
    let without_comments = remove_regions(&without_styles, "<!--", "-->");
    let visible = strip_tags(&without_comments);
    // Entities are decoded only after tags are gone, so a decoded `<` is never
    // mistaken for markup.
    let decoded = decode_entities(&visible);
    normalize_whitespace(&decoded)
}

/// Removes every region that starts with `open` and ends with `close`.
///
/// `open` and `close` must be lowercase; they are matched ignoring ASCII case.
/// Removal stops at the first `open` that has no matching `close`.
fn remove_regions(input: &str, open: &str, close: &str) -> String {
    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lowered` are valid in `input`. Full Unicode lowercasing can change the
    // length of a character and would make those offsets wrong.
    let lowered = input.to_ascii_lowercase();
    let mut kept = String::with_capacity(input.len());
    let mut cursor = 0;
    while let Some(offset) = lowered[cursor..].find(open) {
        let start = cursor + offset;
        let end = match lowered[start..].find(close) {
            Some(rel) => start + rel + close.len(),
            None => break,
        };
        kept.push_str(&input[cursor..start]);
        cursor = end;
    }
    kept.push_str(&input[cursor..]);
    kept
}

/// Returns true when `rest` (which starts at a `<`) begins with a block-level tag prefix.
fn is_block_tag(rest: &[u8]) -> bool {
    BLOCK_TAG_PREFIXES.iter().any(|prefix| {
        rest.len() >= prefix.len() && rest[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    })
}

/// Drops everything between `<` and `>`, emitting a newline in front of block-level tags.
fn strip_tags(markup: &str) -> String {
    let mut text = String::with_capacity(markup.len());
    let mut in_tag = false;
    for (idx, ch) in markup.char_indices() {
        match ch {
            '<' => {
                if !in_tag && is_block_tag(&markup.as_bytes()[idx..]) {
                    text.push('\n');
                }
                in_tag = true;
            }
            '>' if in_tag => in_tag = false,
            // A `>` outside a tag is ordinary text (e.g. "1 > 0") and is kept.
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }
    text
}

/// Decodes a single entity name (the text between `&` and `;`).
fn decode_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        _ => {
            let reference = name.strip_prefix('#')?;
            let (digits, radix) = match reference.strip_prefix(|c| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };
            // `from_str_radix` would accept a leading sign, which is not a
            // valid character reference.
            if digits.starts_with('+') {
                return None;
            }
            let code = u32::from_str_radix(digits, radix).ok()?;
            char::from_u32(code).filter(|ch| *ch != '\0')
        }
    }
}

/// Decodes character references in a single left-to-right pass.
///
/// Output is never re-scanned, so `&amp;lt;` becomes the literal text `&lt;`
/// rather than `<`. Unknown or malformed references are copied through as-is.
fn decode_entities(text: &str) -> String {
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        let reference = after
            .bytes()
            .take(MAX_ENTITY_NAME_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| decode_entity(&after[..semi]).map(|ch| (ch, semi)));
        match reference {
            Some((ch, semi)) => {
                decoded.push(ch);
                rest = &after[semi + 1..];
            }
            None => {
                decoded.push('&');
                rest = after;
            }
        }
    }
    decoded.push_str(rest);
    decoded
}

/// Collapses intra-line whitespace to single spaces and blank-line runs to one blank line.
fn normalize_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut blank_pending = false;
    for line in text.lines() {
        let mut words = line.split_whitespace().peekable();
        if words.peek().is_none() {
            blank_pending = true;
            continue;
        }
        if !out.is_empty() {
            out.push('\n');
            if blank_pending {
                out.push('\n');
            }
        }
        for (i, word) in words.enumerate() {
            if i > 0 {
                out.push(' ');
            }
            out.push_str(word);
        }
        blank_pending = false;
    }
    out
}
/// Extracts the trimmed contents of the first `<title>` element.
///
/// The tag name is matched ignoring ASCII case and may carry attributes.
/// Returns `None` when there is no opening tag, no closing `</title>`, or the
/// title is empty after trimming. Entities in the title are not decoded.
pub fn extract_title(html: &str) -> Option<String> {
    // ASCII-only lowercasing keeps every byte offset identical to `html`.
    // Full Unicode lowercasing can change a character's encoded length, which
    // would make the offsets below point at the wrong bytes of `html`.
    let lower = html.to_ascii_lowercase();
    let tag_start = lower.find("<title")?;
    let content_start = tag_start + lower[tag_start..].find('>')? + 1;
    let content_len = lower[content_start..].find("</title>")?;
    let title = html[content_start..content_start + content_len].trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_string())
    }
}
/// Tool that downloads a URL over HTTP and turns the response into plain text.
pub struct BrowserTool {
/// HTTP client used for every request.
client: reqwest::Client,
/// Upper bound, in bytes, on how much of a response body is kept before
/// title and text extraction.
max_content_length: usize,
}
impl BrowserTool {
    /// Creates a tool with a 30 second timeout, at most 10 redirects, a bot
    /// user agent and a 100,000 byte body limit.
    pub fn new() -> Self {
        let client = reqwest::Client::builder()
            .user_agent("Mozilla/5.0 (compatible; AgenticBot/1.0)")
            .timeout(std::time::Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            // Best effort: if the configured client cannot be built, fall back
            // to a default client (which drops the settings above).
            .unwrap_or_else(|_| reqwest::Client::new());
        Self {
            client,
            max_content_length: 100_000,
        }
    }

    /// Overrides the maximum number of body bytes kept per page.
    pub fn with_max_content_length(mut self, max: usize) -> Self {
        self.max_content_length = max;
        self
    }

    /// Fetches `url` and returns its title, extracted text and final URL.
    ///
    /// # Errors
    ///
    /// * [`BrowserError::Request`] when sending the request or reading the body fails.
    /// * [`BrowserError::HttpStatus`] when the response status is not a success.
    /// * [`BrowserError::UnsupportedContentType`] when the `content-type` header
    ///   is missing or is neither `text/html` nor `text/plain`.
    pub async fn fetch(&self, url: &str) -> Result<PageContent, BrowserError> {
        let response = self
            .client
            .get(url)
            .send()
            .await
            .map_err(|e| BrowserError::Request(e.to_string()))?;

        // Captured before the body is consumed; reflects any redirects followed.
        let final_url = response.url().to_string();
        let status = response.status();
        if !status.is_success() {
            return Err(BrowserError::HttpStatus {
                status: status.as_u16(),
                url: final_url,
            });
        }

        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("")
            .to_string();
        // Media types are case-insensitive, so compare a lowercased copy while
        // reporting the header exactly as received.
        let media_type = content_type.to_ascii_lowercase();
        if !media_type.contains("text/html") && !media_type.contains("text/plain") {
            return Err(BrowserError::UnsupportedContentType(content_type));
        }

        let mut body = response
            .text()
            .await
            .map_err(|e| BrowserError::Request(e.to_string()))?;
        if body.len() > self.max_content_length {
            // Slicing a `String` in the middle of a multi-byte character
            // panics, so back up to the nearest char boundary at or below the
            // limit. Offset 0 is always a boundary, so the loop terminates.
            let mut cut = self.max_content_length;
            while !body.is_char_boundary(cut) {
                cut -= 1;
            }
            body.truncate(cut);
        }

        let title = extract_title(&body);
        let text = extract_text(&body);
        Ok(PageContent {
            title,
            text,
            url: final_url,
        })
    }
}
impl Default for BrowserTool {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Tool for BrowserTool {
    /// Describes the `read_url` tool and its single required `url` string input.
    fn definition(&self) -> ToolDefinition {
        ToolDefinition {
            name: "read_url".into(),
            // NOTE(review): this text promises that navigation is stripped and
            // only the main article is returned. Confirm the extractor really
            // does that, or soften the wording.
            description: "Fetch a web page and extract its readable text content. Strips \
                          navigation, scripts, and other non-content elements to return \
                          just the main article text."
                .into(),
            input_schema: serde_json::json!({
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL to fetch and read"
                    }
                },
                "required": ["url"]
            }),
        }
    }

    /// Fetches the page named by `input["url"]` and renders it as
    /// `Title: …` (when present), `URL: …`, a blank line, then the page text.
    ///
    /// Output longer than 50,000 bytes is cut and suffixed with
    /// `[Content truncated]`.
    ///
    /// # Errors
    ///
    /// * [`ToolError::InvalidInput`] when `url` is missing or not a string.
    /// * [`ToolError::ExecutionFailed`] when the fetch fails.
    async fn execute(&self, input: serde_json::Value) -> Result<String, ToolError> {
        const MAX_OUTPUT_BYTES: usize = 50_000;

        let url = input
            .get("url")
            .and_then(|v| v.as_str())
            .ok_or_else(|| ToolError::InvalidInput("missing 'url'".into()))?;
        let content = self
            .fetch(url)
            .await
            .map_err(|e| ToolError::ExecutionFailed(e.to_string()))?;

        let mut output = match &content.title {
            Some(title) => format!("Title: {}\nURL: {}\n\n", title, content.url),
            None => format!("URL: {}\n\n", content.url),
        };
        output.push_str(&content.text);

        if output.len() > MAX_OUTPUT_BYTES {
            // `String::truncate` panics when the new length falls inside a
            // multi-byte character, so back up to the nearest char boundary.
            let mut cut = MAX_OUTPUT_BYTES;
            while !output.is_char_boundary(cut) {
                cut -= 1;
            }
            output.truncate(cut);
            output.push_str("\n\n[Content truncated]");
        }
        Ok(output)
    }
}
/// Adds a [`BrowserTool`] built with [`BrowserTool::new`] to `registry`.
pub fn register_tool(registry: &mut ToolRegistry) {
    let browser = BrowserTool::new();
    registry.register(Box::new(browser));
}
/// Failures that can occur while fetching a page.
#[derive(Debug, thiserror::Error)]
pub enum BrowserError {
/// A request-level failure; carries the underlying error rendered as text.
#[error("request failed: {0}")]
Request(String),
/// An HTTP status code reported as an error, together with the URL it was received for.
#[error("HTTP {status} for {url}")]
HttpStatus { status: u16, url: String },
/// A content type that is not supported; carries the offending value.
#[error("unsupported content type: {0}")]
UnsupportedContentType(String),
}
/// Unit tests for the HTML extraction helpers, the tool definition and error formatting.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extract_text_basic_html() {
        let html = "<html><body><h1>Hello</h1><p>This is a test.</p></body></html>";
        let text = extract_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("This is a test."));
    }

    #[test]
    fn extract_text_removes_scripts() {
        let html = r#"<p>Before</p><script>alert('xss');</script><p>After</p>"#;
        let text = extract_text(html);
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
        assert!(!text.contains("alert"));
    }

    #[test]
    fn extract_text_removes_styles() {
        let html = r#"<p>Content</p><style>.foo { color: red; }</style><p>More</p>"#;
        let text = extract_text(html);
        assert!(text.contains("Content"));
        assert!(!text.contains("color"));
    }

    #[test]
    fn extract_text_decodes_entities() {
        // The input must carry the entity-encoded forms; the assertion checks
        // the decoded characters.
        let html = "<p>A &amp; B &lt; C &gt; D &quot;E&quot;</p>";
        let text = extract_text(html);
        assert!(text.contains("A & B < C > D \"E\""));
    }

    #[test]
    fn extract_text_collapses_whitespace() {
        let html = "<p>  Hello   world  </p>";
        let text = extract_text(html);
        assert_eq!(text, "Hello world");
    }

    #[test]
    fn extract_text_removes_comments() {
        let html = "<p>Visible</p><!-- hidden comment --><p>Also visible</p>";
        let text = extract_text(html);
        assert!(text.contains("Visible"));
        assert!(text.contains("Also visible"));
        assert!(!text.contains("hidden comment"));
    }

    #[test]
    fn extract_title_basic() {
        let html = "<html><head><title>My Page</title></head><body></body></html>";
        assert_eq!(extract_title(html), Some("My Page".into()));
    }

    #[test]
    fn extract_title_missing() {
        let html = "<html><body>No title here</body></html>";
        assert_eq!(extract_title(html), None);
    }

    #[test]
    fn extract_title_empty() {
        let html = "<title></title>";
        assert_eq!(extract_title(html), None);
    }

    #[test]
    fn tool_definition_valid() {
        let tool = BrowserTool::new();
        let def = tool.definition();
        assert_eq!(def.name, "read_url");
        assert!(def.input_schema["required"]
            .as_array()
            .unwrap()
            .contains(&serde_json::json!("url")));
    }

    #[test]
    fn browser_error_display() {
        let err = BrowserError::HttpStatus {
            status: 404,
            url: "https://example.com".into(),
        };
        let msg = err.to_string();
        assert!(msg.contains("404"));
        assert!(msg.contains("example.com"));
    }

    #[test]
    fn extract_text_complex_page() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>Test Article</title>
                <style>body { font-family: sans-serif; }</style>
                <script>console.log('analytics');</script>
            </head>
            <body>
                <nav>
                    <a href="/">Home</a>
                    <a href="/about">About</a>
                </nav>
                <article>
                    <h1>Breaking News</h1>
                    <p>This is the first paragraph of the article.</p>
                    <p>This is the second paragraph with <strong>bold</strong> text.</p>
                </article>
                <footer>Copyright 2025</footer>
                <script>trackPageView();</script>
            </body>
            </html>
        "#;
        let text = extract_text(html);
        assert!(text.contains("Breaking News"));
        assert!(text.contains("first paragraph"));
        assert!(text.contains("bold"));
        assert!(!text.contains("console.log"));
        assert!(!text.contains("trackPageView"));
        assert!(!text.contains("font-family"));
    }
}