Skip to main content

codetether_agent/tool/
webfetch.rs

1//! Web Fetch Tool - Fetches content from URLs and converts HTML to markdown/text/html.
2
3use super::{Tool, ToolResult};
4use anyhow::{Context, Result};
5use async_trait::async_trait;
6use serde::Deserialize;
7use serde_json::{Value, json};
8use std::time::Duration;
9
/// Size ceiling of 10 MiB, expressed in bytes; judging by its name it is meant
/// to cap the amount of fetched content.
///
/// The `dead_code` allowance keeps the build warning-free in configurations
/// where no code path reads this limit.
#[allow(dead_code)]
const MAX_CONTENT_LENGTH: usize = 10 * 1024 * 1024;

/// Overall per-request timeout (30 seconds) handed to the HTTP client builder.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);
13
14pub struct WebFetchTool {
15    client: reqwest::Client,
16}
17
18impl Default for WebFetchTool {
19    fn default() -> Self {
20        Self::new()
21    }
22}
23
24impl WebFetchTool {
25    pub fn new() -> Self {
26        let client = reqwest::Client::builder()
27            .timeout(REQUEST_TIMEOUT)
28            .user_agent("CodeTether-Agent/1.0")
29            .redirect(reqwest::redirect::Policy::limited(5))
30            .build()
31            .expect("Failed to build HTTP client");
32        Self { client }
33    }
34
35    fn html_to_markdown(&self, html: &str) -> String {
36        let mut result = html.to_string();
37        let patterns = [
38            (r"<h1[^>]*>(.*?)</h1>", "# $1\n"),
39            (r"<h2[^>]*>(.*?)</h2>", "## $1\n"),
40            (r"<h3[^>]*>(.*?)</h3>", "### $1\n"),
41            (r"<p[^>]*>(.*?)</p>", "$1\n\n"),
42            (r"<(strong|b)[^>]*>(.*?)</\1>", "**$2**"),
43            (r"<(em|i)[^>]*>(.*?)</\1>", "*$2*"),
44            (r"<code[^>]*>(.*?)</code>", "`$1`"),
45            (r"<li[^>]*>(.*?)</li>", "- $1\n"),
46        ];
47        for (pat, rep) in patterns {
48            if let Ok(re) = regex::Regex::new(pat) {
49                result = re.replace_all(&result, rep).to_string();
50            }
51        }
52        result = regex::Regex::new(r"<[^>]+>")
53            .unwrap()
54            .replace_all(&result, "")
55            .to_string();
56        result
57            .replace("&nbsp;", " ")
58            .replace("&amp;", "&")
59            .replace("&lt;", "<")
60            .replace("&gt;", ">")
61    }
62
63    fn html_to_text(&self, html: &str) -> String {
64        let md = self.html_to_markdown(html);
65        md.replace("**", "")
66            .replace("*", "")
67            .replace("`", "")
68            .replace("# ", "")
69    }
70
71    fn is_html(&self, ct: &str, body: &str) -> bool {
72        ct.contains("text/html") || body.trim().starts_with('<')
73    }
74}
75
76#[derive(Deserialize)]
77struct Params {
78    url: String,
79    #[serde(default = "default_fmt")]
80    format: String,
81}
82fn default_fmt() -> String {
83    "markdown".into()
84}
85
86#[async_trait]
87impl Tool for WebFetchTool {
88    fn id(&self) -> &str {
89        "webfetch"
90    }
91    fn name(&self) -> &str {
92        "Web Fetch"
93    }
94    fn description(&self) -> &str {
95        "Fetch content from URL, convert HTML to markdown/text/html."
96    }
97    fn parameters(&self) -> Value {
98        json!({"type":"object","properties":{"url":{"type":"string"},"format":{"type":"string","enum":["markdown","text","html"],"default":"markdown"}},"required":["url"]})
99    }
100
101    async fn execute(&self, params: Value) -> Result<ToolResult> {
102        let p: Params = serde_json::from_value(params).context("Invalid params")?;
103        let url = p.url.parse::<reqwest::Url>().context("Invalid URL")?;
104        if url.scheme() != "http" && url.scheme() != "https" {
105            return Ok(ToolResult::error("Only HTTP/HTTPS supported"));
106        }
107        let resp = self
108            .client
109            .get(url)
110            .send()
111            .await
112            .map_err(|e| anyhow::anyhow!("{}", e))?;
113        if !resp.status().is_success() {
114            return Ok(ToolResult::error(format!("HTTP {}", resp.status())));
115        }
116        let ct = resp
117            .headers()
118            .get("content-type")
119            .and_then(|v| v.to_str().ok())
120            .unwrap_or("")
121            .to_lowercase();
122        let body = resp.text().await.context("Failed to read body")?;
123        let content = match p.format.as_str() {
124            "html" => body,
125            "text" => {
126                if self.is_html(&ct, &body) {
127                    self.html_to_text(&body)
128                } else {
129                    body
130                }
131            }
132            _ => {
133                if self.is_html(&ct, &body) {
134                    self.html_to_markdown(&body)
135                } else {
136                    body
137                }
138            }
139        };
140        Ok(ToolResult::success(content).with_metadata("url", json!(p.url)))
141    }
142}