use async_trait::async_trait;
use std::net::{Ipv4Addr, Ipv6Addr};
use url::{Host, Url};
use smooth_operator_core::tool::ToolSchema;
use smooth_operator_core::Tool;
/// Maximum number of characters of extracted text that `execute` returns.
/// Longer output is cut by `cap_len`, which appends a truncation marker.
const MAX_TEXT_LEN: usize = 8_000;
/// Tool that fetches a web page over HTTP(S) and returns its text content.
///
/// URLs are screened by `assert_url_is_public` before any request is sent.
pub struct FetchUrlTool {
    /// HTTP client used for every request issued by this tool.
    client: reqwest::Client,
}
impl FetchUrlTool {
    /// Builds a tool backed by the client from `safe_http_client`, whose redirect
    /// policy re-runs the public-URL check on every redirect hop.
    #[must_use]
    pub fn new() -> Self {
        Self::with_client(safe_http_client())
    }

    /// Builds a tool around a caller-supplied client.
    ///
    /// The client is stored as-is; no redirect policy is installed here, so any
    /// redirect-time screening depends on how the supplied client was configured.
    #[must_use]
    pub fn with_client(client: reqwest::Client) -> Self {
        Self { client }
    }
}
impl Default for FetchUrlTool {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Tool for FetchUrlTool {
    /// Describes the tool: name `fetch_url`, a usage description, and a JSON schema
    /// with a single required string parameter `url`.
    fn schema(&self) -> ToolSchema {
        let description = "Fetch a PUBLIC web page over HTTP(S) and return its readable text \
            content (HTML stripped to plain text, length-capped). Use this to read \
            a public docs page, help article, or webpage the user references. \
            Internal/private/loopback/metadata addresses are rejected for security."
            .to_string();
        let parameters = serde_json::json!({
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The absolute http(s) URL to fetch (e.g. \
                        'https://example.com/docs/page')."
                }
            },
            "required": ["url"]
        });
        ToolSchema {
            name: "fetch_url".to_string(),
            description,
            parameters,
        }
    }

    /// Fetches the page named by the `url` argument and returns its text.
    ///
    /// Steps: validate the URL with `assert_url_is_public`, issue a GET with a fixed
    /// `User-Agent`, reject non-2xx statuses, read the whole body as text, convert
    /// HTML to plain text (or just collapse whitespace for non-HTML bodies), and cap
    /// the result at `MAX_TEXT_LEN` characters.
    ///
    /// # Errors
    ///
    /// Fails when `url` is missing or not a string, when the URL is rejected by the
    /// guard, when the request or body read fails, or when the status is not 2xx.
    async fn execute(&self, arguments: serde_json::Value) -> anyhow::Result<String> {
        let raw_url = match arguments.get("url").and_then(serde_json::Value::as_str) {
            Some(value) => value,
            None => {
                return Err(anyhow::anyhow!(
                    "fetch_url requires a string 'url' argument"
                ))
            }
        };
        let url = assert_url_is_public(raw_url)?;

        // `url` is cloned because the error messages below still need it.
        let request = self
            .client
            .get(url.clone())
            .header(reqwest::header::USER_AGENT, "smooth-operator/fetch_url");
        let resp = match request.send().await {
            Ok(response) => response,
            Err(e) => {
                return Err(anyhow::anyhow!(
                    "fetch_url request failed for {url}: {e}"
                ))
            }
        };

        let status = resp.status();
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "fetch_url got HTTP {} from {url}",
                status.as_u16()
            ));
        }

        // Inspect the header before `text()` consumes the response.
        let header_says_html = resp
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .map(|value| value.to_ascii_lowercase().contains("html"))
            .unwrap_or(false);

        let body = match resp.text().await {
            Ok(text) => text,
            Err(e) => {
                return Err(anyhow::anyhow!(
                    "fetch_url failed reading body from {url}: {e}"
                ))
            }
        };

        // Fall back to sniffing the body when the header does not mention HTML.
        let text = if header_says_html || looks_like_html(&body) {
            html_to_text(&body)
        } else {
            collapse_whitespace(&body)
        };
        Ok(cap_len(&text, MAX_TEXT_LEN))
    }

    /// Reports this tool as read-only.
    fn is_read_only(&self) -> bool {
        true
    }
}
/// Parses `raw_url` and rejects it unless it is an http(s) URL whose host is not
/// obviously internal.
///
/// Checks performed:
/// * the scheme must be `http` or `https`;
/// * a domain host must not be `localhost` or end in `.localhost`, compared
///   case-insensitively and ignoring any trailing root dot (`localhost.`);
/// * an IP-literal host must pass `assert_ipv4_public` / `assert_ipv6_public`.
///
/// NOTE: domain names are not resolved here, so a public-looking name that resolves
/// to an internal address is not caught by this function.
///
/// # Errors
///
/// Returns an error describing the rejection when the URL does not parse, uses
/// another scheme, has no host, or names a blocked host.
pub fn assert_url_is_public(raw_url: &str) -> anyhow::Result<Url> {
    let url = Url::parse(raw_url)
        .map_err(|e| anyhow::anyhow!("fetch_url: invalid URL {raw_url:?}: {e}"))?;

    let scheme = url.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(anyhow::anyhow!(
            "fetch_url: refusing non-http(s) scheme {scheme:?} (only http/https allowed)"
        ));
    }

    let host = url
        .host()
        .ok_or_else(|| anyhow::anyhow!("fetch_url: URL {raw_url:?} has no host"))?;
    match host {
        Host::Domain(domain) => {
            // A fully-qualified name keeps its trailing dot ("localhost."), which
            // would otherwise slip past the comparison below.
            let normalized = domain.trim_end_matches('.').to_ascii_lowercase();
            if normalized == "localhost" || normalized.ends_with(".localhost") {
                return Err(anyhow::anyhow!(
                    "fetch_url: refusing to fetch localhost ({domain:?}) — SSRF guard"
                ));
            }
        }
        Host::Ipv4(ip) => assert_ipv4_public(ip)?,
        Host::Ipv6(ip) => assert_ipv6_public(ip)?,
    }
    Ok(url)
}
/// Builds the HTTP client used by `FetchUrlTool::new`.
///
/// * Redirects: at most 10 hops are followed, and every redirect target is
///   re-checked with `assert_url_is_public`, so a public URL cannot bounce the
///   request to an internal address.
/// * Timeouts: connecting and the request as a whole are both bounded, so a
///   stalled or deliberately slow server cannot hang the tool indefinitely.
///
/// # Panics
///
/// Panics if the underlying client builder fails.
pub fn safe_http_client() -> reqwest::Client {
    /// Upper bound for establishing the connection.
    const CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
    /// Upper bound for the whole request, including reading the body.
    const REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);

    reqwest::Client::builder()
        .connect_timeout(CONNECT_TIMEOUT)
        .timeout(REQUEST_TIMEOUT)
        .redirect(reqwest::redirect::Policy::custom(|attempt| {
            if attempt.previous().len() >= 10 {
                return attempt.error(Box::<dyn std::error::Error + Send + Sync>::from(
                    "fetch_url: too many redirects",
                ));
            }
            match assert_url_is_public(attempt.url().as_str()) {
                Ok(_) => attempt.follow(),
                Err(e) => attempt.error(Box::<dyn std::error::Error + Send + Sync>::from(format!(
                    "fetch_url: redirect blocked by SSRF guard: {e}"
                ))),
            }
        }))
        .build()
        .expect("fetch_url HTTP client build is infallible")
}
/// Rejects IPv4 addresses that are not ordinary public unicast addresses.
///
/// Blocked: loopback, RFC 1918 private, link-local (which covers the
/// 169.254.169.254 metadata address), unspecified, broadcast, documentation,
/// multicast, shared CGNAT space (100.64.0.0/10), "this network" (0.0.0.0/8),
/// IETF protocol assignments (192.0.0.0/24), benchmarking (198.18.0.0/15) and
/// reserved space (240.0.0.0/4).
///
/// # Errors
///
/// Returns an error naming the address when it falls in any blocked range.
fn assert_ipv4_public(ip: Ipv4Addr) -> anyhow::Result<()> {
    let [a, b, c, _] = ip.octets();

    let is_this_network = a == 0; // 0.0.0.0/8
    let is_ietf_protocol = a == 192 && b == 0 && c == 0; // 192.0.0.0/24
    let is_benchmarking = a == 198 && (b & 0xfe) == 18; // 198.18.0.0/15
    let is_reserved = a >= 240; // 240.0.0.0/4

    let blocked = ip.is_loopback()
        || ip.is_private()
        || ip.is_link_local()
        || ip.is_unspecified()
        || ip.is_broadcast()
        || ip.is_documentation()
        || ip.is_multicast()
        || is_shared_cgnat(ip)
        || is_this_network
        || is_ietf_protocol
        || is_benchmarking
        || is_reserved;
    if blocked {
        return Err(anyhow::anyhow!(
            "fetch_url: refusing to fetch non-public IPv4 {ip} — SSRF guard"
        ));
    }
    Ok(())
}
/// Returns `true` when `ip` lies in 100.64.0.0/10: first octet 100 and second
/// octet between 64 and 127 inclusive.
fn is_shared_cgnat(ip: Ipv4Addr) -> bool {
    matches!(ip.octets(), [100, 64..=127, _, _])
}
fn assert_ipv6_public(ip: Ipv6Addr) -> anyhow::Result<()> {
if let Some(v4) = ip.to_ipv4() {
return assert_ipv4_public(v4);
}
let is_unique_local = (ip.segments()[0] & 0xfe00) == 0xfc00; let is_link_local = (ip.segments()[0] & 0xffc0) == 0xfe80; let blocked = ip.is_loopback() || ip.is_unspecified() || is_unique_local || is_link_local;
if blocked {
return Err(anyhow::anyhow!(
"fetch_url: refusing to fetch non-public IPv6 {ip} — SSRF guard"
));
}
Ok(())
}
/// Sniffs the start of `body` for HTML markers.
///
/// Only the first 512 bytes are inspected (shortened to the nearest preceding
/// char boundary so slicing cannot panic), compared case-insensitively against
/// `<html`, `<!doctype html` and `<body`.
fn looks_like_html(body: &str) -> bool {
    const SNIFF_BYTES: usize = 512;
    const MARKERS: [&str; 3] = ["<html", "<!doctype html", "<body"];

    // Index 0 is always a char boundary, so the search cannot come up empty.
    let cut = (0..=body.len().min(SNIFF_BYTES))
        .rev()
        .find(|&idx| body.is_char_boundary(idx))
        .unwrap_or(0);
    let prefix = body[..cut].to_ascii_lowercase();
    MARKERS.iter().any(|marker| prefix.contains(*marker))
}
/// Reduces an HTML document to plain text.
///
/// * `<script>` and `<style>` blocks are dropped entirely (via `skip_block`) and
///   replaced by a space.
/// * `<!-- ... -->` comments are dropped without inserting a space.
/// * Any other tag is replaced by a single space.
/// * Text between tags is kept verbatim.
///
/// Scanning stops early when a comment or tag is never closed; whatever was
/// collected up to that point is kept. The collected text then has its entities
/// decoded and its whitespace collapsed.
pub fn html_to_text(html: &str) -> String {
    // ASCII lower-casing keeps byte offsets identical to `html`, so positions found
    // in one string are valid in the other.
    let lowered = html.to_ascii_lowercase();
    let mut text = String::with_capacity(html.len());
    let mut pos = 0;

    while pos < html.len() {
        // Plain text: copy everything up to the next '<' (or the end of input).
        if !html[pos..].starts_with('<') {
            let run_end = html[pos..]
                .find('<')
                .map_or(html.len(), |rel| pos + rel);
            text.push_str(&html[pos..run_end]);
            pos = run_end;
            continue;
        }

        // Script and style blocks are skipped together with their contents.
        let block_end = ["script", "style"]
            .iter()
            .find_map(|&tag| skip_block(&lowered, pos, tag));
        if let Some(end) = block_end {
            text.push(' ');
            pos = end;
            continue;
        }

        // Comments vanish silently; every other tag leaves a space behind.
        let (terminator, leaves_gap) = if lowered[pos..].starts_with("<!--") {
            ("-->", false)
        } else {
            (">", true)
        };
        match html[pos..].find(terminator) {
            Some(rel) => {
                pos += rel + terminator.len();
                if leaves_gap {
                    text.push(' ');
                }
            }
            // Unterminated markup: stop scanning.
            None => break,
        }
    }

    collapse_whitespace(&decode_entities(&text))
}
/// If `lower[start..]` opens a `<tag ...>` element, returns the byte offset just
/// past its `</tag>` closer; returns `None` when the text at `start` is not that
/// tag.
///
/// `lower` must already be lower-cased and `tag` must be lower-case. The tag name
/// has to end right after the prefix (end of input, `>`, `/` or whitespace), so
/// `<scripted>` or `<style-guide>` are not mistaken for `<script>` / `<style>`.
/// When the closer is missing, the rest of the input is treated as part of the
/// block and `lower.len()` is returned.
fn skip_block(lower: &str, start: usize, tag: &str) -> Option<usize> {
    let rest = &lower[start..];
    let after_name = rest.strip_prefix('<')?.strip_prefix(tag)?;

    // Reject longer element names that merely begin with `tag`.
    let name_ends_here = match after_name.bytes().next() {
        None => true,
        Some(byte) => byte == b'>' || byte == b'/' || byte.is_ascii_whitespace(),
    };
    if !name_ends_here {
        return None;
    }

    let close = format!("</{tag}>");
    let end = rest
        .find(close.as_str())
        .map_or(lower.len(), |rel| start + rel + close.len());
    Some(end)
}
/// Decodes the handful of HTML entities this tool understands: `&amp;`, `&lt;`,
/// `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&nbsp;` (decoded to a plain space).
///
/// The input is scanned once from left to right and each entity is decoded
/// exactly once, so already-escaped text such as `&amp;lt;` becomes `&lt;` rather
/// than `<`. Unknown entities and bare ampersands are copied through unchanged.
/// Matching is case-sensitive.
fn decode_entities(s: &str) -> String {
    // Entity names without the leading '&' and trailing ';', with replacements.
    const ENTITIES: [(&str, &str); 7] = [
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("#39", "'"),
        ("apos", "'"),
        ("nbsp", " "),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp + 1..];

        // Look for "<name>;" directly after the ampersand.
        let hit = ENTITIES.iter().find_map(|&(name, replacement)| {
            tail.strip_prefix(name)
                .and_then(|after| after.strip_prefix(';'))
                .map(|after| (replacement, after))
        });
        match hit {
            Some((replacement, after)) => {
                out.push_str(replacement);
                rest = after;
            }
            None => {
                // Not a known entity: keep the ampersand and move on.
                out.push('&');
                rest = tail;
            }
        }
    }
    out.push_str(rest);
    out
}
/// Trims `s` and squeezes every run of whitespace into a single ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Limits `s` to at most `max` characters (not bytes).
///
/// Text within the limit is returned unchanged; longer text is cut after `max`
/// characters and suffixed with `… [truncated]`.
fn cap_len(s: &str, max: usize) -> String {
    // The character at index `max`, if any, is the first one over the limit; its
    // byte offset is where the kept prefix ends.
    match s.char_indices().nth(max) {
        None => s.to_string(),
        Some((cut, _)) => {
            let truncated = &s[..cut];
            format!("{truncated}… [truncated]")
        }
    }
}
/// Unit tests for the URL guard, the HTML-to-text conversion and the tool schema.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rejects_localhost() {
        assert!(assert_url_is_public("http://localhost:8080/secret").is_err());
        assert!(assert_url_is_public("http://app.localhost/").is_err());
    }

    #[test]
    fn rejects_loopback_ip() {
        assert!(assert_url_is_public("http://127.0.0.1/").is_err());
        assert!(assert_url_is_public("http://127.0.0.53:53/").is_err());
        assert!(assert_url_is_public("http://[::1]/").is_err());
    }

    #[test]
    fn rejects_metadata_and_link_local() {
        assert!(assert_url_is_public("http://169.254.169.254/latest/meta-data/").is_err());
        assert!(assert_url_is_public("http://169.254.0.1/").is_err());
    }

    #[test]
    fn rejects_private_ranges() {
        assert!(assert_url_is_public("http://10.0.0.5/").is_err());
        assert!(assert_url_is_public("http://172.16.3.4/").is_err());
        assert!(assert_url_is_public("http://192.168.1.1/").is_err());
        assert!(assert_url_is_public("http://0.0.0.0/").is_err());
        assert!(assert_url_is_public("http://100.64.0.1/").is_err());
    }

    #[test]
    fn rejects_ipv4_mapped_ipv6_loopback() {
        assert!(assert_url_is_public("http://[::ffff:127.0.0.1]/").is_err());
    }

    #[test]
    fn rejects_non_http_scheme() {
        assert!(assert_url_is_public("file:///etc/passwd").is_err());
        assert!(assert_url_is_public("ftp://example.com/x").is_err());
        assert!(assert_url_is_public("gopher://evil/").is_err());
    }

    #[test]
    fn allows_public_hosts() {
        assert!(assert_url_is_public("https://example.com/docs").is_ok());
        assert!(assert_url_is_public("http://93.184.216.34/").is_ok());
        assert!(assert_url_is_public("https://api.smoo.ai/v1").is_ok());
    }

    // The guard must fire before any network traffic is attempted.
    #[tokio::test]
    async fn execute_rejects_internal_url_without_fetching() {
        let tool = FetchUrlTool::new();
        let err = tool
            .execute(serde_json::json!({ "url": "http://169.254.169.254/latest/meta-data/" }))
            .await
            .expect_err("internal URL must be rejected");
        assert!(
            err.to_string().contains("SSRF guard"),
            "expected SSRF guard rejection, got: {err}"
        );
    }

    #[tokio::test]
    async fn execute_requires_url_argument() {
        let tool = FetchUrlTool::new();
        let err = tool
            .execute(serde_json::json!({}))
            .await
            .expect_err("missing url should error");
        assert!(err.to_string().contains("url"));
    }

    #[test]
    fn html_to_text_strips_tags_scripts_styles_and_entities() {
        // The heading carries an escaped ampersand so that entity decoding is
        // actually exercised by the first assertion below.
        let html = r#"
            <!doctype html>
            <html>
              <head>
                <style>.x { color: red; }</style>
                <script>var leak = "should not appear";</script>
                <title>Doc</title>
              </head>
              <body>
                <h1>Hello &amp; welcome</h1>
                <p>The return window is 17 days.</p>
                <!-- a comment that should vanish -->
              </body>
            </html>
        "#;
        let text = html_to_text(html);
        assert!(text.contains("Hello & welcome"), "got: {text}");
        assert!(
            text.contains("The return window is 17 days."),
            "got: {text}"
        );
        assert!(!text.contains("should not appear"), "script leaked: {text}");
        assert!(!text.contains("color: red"), "style leaked: {text}");
        assert!(!text.contains("a comment"), "comment leaked: {text}");
        assert!(
            !text.contains('<') && !text.contains('>'),
            "tags leaked: {text}"
        );
    }

    #[test]
    fn cap_len_truncates_long_text() {
        let long = "a".repeat(MAX_TEXT_LEN + 100);
        let capped = cap_len(&long, MAX_TEXT_LEN);
        assert!(capped.ends_with("… [truncated]"));
        assert!(capped.chars().count() <= MAX_TEXT_LEN + "… [truncated]".chars().count());
    }

    #[test]
    fn looks_like_html_no_panic_on_multibyte_boundary() {
        // 200 three-byte characters put byte 512 in the middle of a character.
        let mut body = "€".repeat(200);
        body.push_str("<html>");
        let _ = looks_like_html(&body);
        assert!(looks_like_html("<!doctype html><html><body>hi"));
        assert!(!looks_like_html("just some plain text, no markup here"));
    }

    #[test]
    fn schema_is_read_only_with_url_param() {
        let tool = FetchUrlTool::new();
        let schema = tool.schema();
        assert_eq!(schema.name, "fetch_url");
        assert_eq!(schema.parameters["required"][0], "url");
        assert!(tool.is_read_only());
    }
}