use super::super::error::{Result, ToolError};
use super::super::r#trait::{Tool, ToolCapability, ToolExecutionContext, ToolResult};
use super::{clean, export, extract, fetch, sitemap, ssrf, to_markdown};
use async_trait::async_trait;
use serde_json::{Value, json};
use uuid::Uuid;
/// Byte budget for text returned inline in a tool result: both the scraped
/// markdown body and the plain sitemap URL listing are cut down to this size.
const MAX_INLINE_BYTES: usize = 100_000;
/// Upper bound on the number of pages a sitemap export will scrape, applied
/// on top of the caller-supplied `max_pages`.
const SITEMAP_EXPORT_HARD_CAP: usize = 100;
/// How much of a fetched page is kept before it is converted to markdown.
///
/// `Debug`/`PartialEq`/`Eq` are derived so the mode can be logged and compared
/// directly (e.g. in assertions) instead of only being matched on.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Mode {
    /// Keep only the extracted main content of the page.
    Readable,
    /// Keep the whole fetched document.
    Raw,
}
/// Tool that fetches a URL and returns its content as markdown, optionally
/// saving the result to disk or walking a site's sitemap.
///
/// The default value has no browser attached; with the `browser` feature a
/// manager can be supplied through `with_browser`.
#[derive(Default)]
pub struct WebScrapeTool {
/// Shared browser manager used by `fetch_html` to re-fetch a page when the
/// statically fetched HTML looks like a JS shell. Only present when the
/// `browser` feature is enabled.
#[cfg(feature = "browser")]
browser: Option<std::sync::Arc<crate::brain::tools::browser::BrowserManager>>,
}
impl WebScrapeTool {
/// Builder-style setter: consumes the tool, stores `manager` as its browser
/// and hands the tool back.
///
/// Only compiled when the `browser` feature is enabled.
#[cfg(feature = "browser")]
pub fn with_browser(
    self,
    manager: std::sync::Arc<crate::brain::tools::browser::BrowserManager>,
) -> Self {
    let mut tool = self;
    tool.browser = Some(manager);
    tool
}
/// Fetches the HTML for `url`, preferring a browser-rendered version when
/// the static response looks like a JS shell.
///
/// The page is always fetched statically first. With the `browser` feature
/// enabled and a browser attached, a page flagged by `fetch::is_js_shell`
/// is fetched again through `fetch::fetch_rendered`; if that attempt fails
/// the failure is logged at debug level and the static HTML is returned.
///
/// # Errors
/// Propagates the error string from `fetch::fetch_static`.
async fn fetch_html(
    &self,
    url: &str,
    session_id: Uuid,
    timeout: u64,
) -> std::result::Result<String, String> {
    // Without the browser feature nothing below consumes the session id.
    #[cfg(not(feature = "browser"))]
    let _ = session_id;

    let static_html = fetch::fetch_static(url, timeout).await?;

    #[cfg(feature = "browser")]
    if fetch::is_js_shell(&static_html)
        && let Some(manager) = self.browser.as_ref()
    {
        let attempt = fetch::fetch_rendered(manager, session_id, url)
            .await
            .inspect_err(|e| {
                tracing::debug!("web_scrape: render escalation failed for {url}: {e}");
            });
        if let Ok(rendered) = attempt {
            return Ok(rendered);
        }
    }

    Ok(static_html)
}
/// Scrapes one page and returns it as markdown.
///
/// Steps, in order: validate `url` with `ssrf::validate_url`, fetch the
/// HTML, reduce it to the main content (`Mode::Readable`) or keep it whole
/// (`Mode::Raw`), strip noise, absolutize URLs against the validated base,
/// convert to markdown and collapse blank lines.
///
/// # Errors
/// Returns the error string from URL validation or from fetching.
async fn scrape_markdown(
    &self,
    url: &str,
    mode: Mode,
    session_id: Uuid,
    timeout: u64,
) -> std::result::Result<String, String> {
    // Validate before any network traffic; the result doubles as the base
    // for resolving relative links later on.
    let base = ssrf::validate_url(url)?;
    let html = self.fetch_html(url, session_id, timeout).await?;

    let content = match mode {
        Mode::Raw => html,
        Mode::Readable => extract::extract_main_content(&html),
    };

    let cleaned = clean::strip_noise(&content);
    let markdown = to_markdown::to_markdown(&to_markdown::absolutize_urls(&cleaned, &base));
    Ok(clean::collapse_blank_lines(&markdown))
}
/// Handles `mode = "sitemap"`: lists a site's page URLs, or scrapes them to
/// disk when `export` is set.
///
/// * `url` - a site root or a direct sitemap URL. It is treated as a sitemap
///   when it contains `sitemap` and ends in `.xml` (both case-insensitive);
///   otherwise the sitemap is discovered via `sitemap::discover_sitemap_url`.
/// * `export` - when `false`, the URLs are returned inline; when `true`, up
///   to `max_pages` pages (never more than `SITEMAP_EXPORT_HARD_CAP`) are
///   scraped in readable mode and written under the resolved export directory.
/// * `session_id`, `timeout`, `context` - forwarded to the fetch/export helpers.
///
/// Validation, discovery and empty-sitemap problems are reported as
/// `ToolResult::error` values inside `Ok`; this function itself never
/// returns `Err`. Per-page failures during export are counted, logged at
/// debug level and skipped.
async fn run_sitemap(
    &self,
    url: &str,
    export: bool,
    max_pages: usize,
    session_id: Uuid,
    timeout: u64,
    context: &ToolExecutionContext,
) -> Result<ToolResult> {
    if let Err(e) = ssrf::validate_url(url) {
        return Ok(ToolResult::error(format!("web_scrape: {e}")));
    }

    // Run both checks on one lowercased copy so `SITEMAP.XML` is recognised
    // just like `sitemap.xml`.
    let lowered = url.to_ascii_lowercase();
    let sitemap_url = if lowered.contains("sitemap") && lowered.ends_with(".xml") {
        url.to_string()
    } else {
        let Some(found) = sitemap::discover_sitemap_url(url, timeout).await else {
            return Ok(ToolResult::error(format!(
                "web_scrape: no sitemap found for {url} (tried /sitemap.xml, robots.txt, common alternates)"
            )));
        };
        found
    };

    let pages = sitemap::collect_sitemap_urls(&sitemap_url, timeout).await;
    if pages.is_empty() {
        return Ok(ToolResult::error(format!(
            "web_scrape: sitemap {sitemap_url} yielded no page URLs"
        )));
    }

    if !export {
        // Build the listing from whole URLs only, so the inline byte budget
        // never cuts a URL in half, and tell the caller when URLs were left out.
        let mut listing = String::new();
        let mut shown = 0usize;
        for page in &pages {
            let separator = usize::from(shown > 0);
            if listing.len() + separator + page.len() > MAX_INLINE_BYTES {
                break;
            }
            if shown > 0 {
                listing.push('\n');
            }
            listing.push_str(page);
            shown += 1;
        }

        let mut text = format!(
            "Found {} page URLs in {sitemap_url}:\n\n{listing}",
            pages.len()
        );
        if shown < pages.len() {
            text.push_str(&format!(
                "\n\n[listing truncated: showing {shown} of {} URLs]",
                pages.len()
            ));
        }
        return Ok(ToolResult::success(text)
            .with_metadata("sitemap_url".into(), sitemap_url)
            .with_metadata("url_count".into(), pages.len().to_string()));
    }

    let dir = export::resolve_export_dir(session_id, context.service_context.as_ref()).await;
    let limit = max_pages.min(SITEMAP_EXPORT_HARD_CAP).min(pages.len());
    let mut written = 0usize;
    let mut failed = 0usize;
    // Each page goes through `scrape_markdown`, which validates the URL
    // again before fetching it.
    for page in pages.iter().take(limit) {
        match self
            .scrape_markdown(page, Mode::Readable, session_id, timeout)
            .await
        {
            Ok(md) => match export::write_markdown(&dir, page, &md).await {
                Ok(_) => written += 1,
                Err(e) => {
                    failed += 1;
                    tracing::debug!("web_scrape: export write failed for {page}: {e}");
                }
            },
            Err(e) => {
                failed += 1;
                tracing::debug!("web_scrape: scrape failed for {page}: {e}");
            }
        }
    }

    let mut summary = format!(
        "Exported {written} of {} sitemap pages to {}",
        pages.len(),
        dir.display()
    );
    if limit < pages.len() {
        if limit < SITEMAP_EXPORT_HARD_CAP {
            summary.push_str(&format!(
                "\n(capped at {limit} pages; raise max_pages up to {SITEMAP_EXPORT_HARD_CAP} for more)"
            ));
        } else {
            // Raising max_pages cannot help once the hard cap is the limit.
            summary.push_str(&format!(
                "\n(capped at the hard limit of {SITEMAP_EXPORT_HARD_CAP} pages)"
            ));
        }
    }
    if failed > 0 {
        summary.push_str(&format!("\n{failed} page(s) failed and were skipped."));
    }

    Ok(ToolResult::success(summary)
        .with_metadata("export_dir".into(), dir.display().to_string())
        .with_metadata("pages_written".into(), written.to_string())
        .with_metadata("pages_total".into(), pages.len().to_string()))
}
}
#[async_trait]
impl Tool for WebScrapeTool {
/// Returns the tool's identifier, `"web_scrape"`.
fn name(&self) -> &str {
"web_scrape"
}
/// Returns the human-readable description of the tool: what it produces,
/// the three modes (`readable`, `raw`, `sitemap`) and the effect of
/// `export=true`.
///
/// The text is a single string literal; the trailing `\` on each line joins
/// the lines and skips the next line's leading whitespace.
fn description(&self) -> &str {
"Fetch a URL and return clean markdown at zero AI / zero API cost. Isolates \
the main content, strips scripts/nav/footers, and keeps images as \
 references so you can vision only the ones a task needs. \
Modes: 'readable' (default, main content only), 'raw' (whole page), \
'sitemap' (list a site's page URLs, or with export=true scrape them all \
to disk). Set export=true to also save markdown under the session's \
project or profile directory. Prefer this over http_request for reading \
page content."
}
/// Returns the JSON Schema describing the tool's input object.
///
/// Properties: `url` (string, the only required field), `mode` (one of
/// `readable` / `raw` / `sitemap`, default `readable`), `export` (boolean,
/// default `false`) and `max_pages` (integer, default 20). The defaults
/// declared here mirror the fallbacks applied in `execute`.
fn input_schema(&self) -> Value {
json!({
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The page URL to scrape, or a site root / sitemap URL in sitemap mode."
},
"mode": {
"type": "string",
"enum": ["readable", "raw", "sitemap"],
"description": "readable = main content as markdown (default); raw = whole page; sitemap = enumerate a site's page URLs.",
"default": "readable"
},
"export": {
"type": "boolean",
"description": "Also write the markdown to disk under the session's project (projects/<slug>/files/scrapes/) or active profile (~/.opencrabs[/profiles/<name>]/scrapes/). In sitemap mode, scrapes and saves every page.",
"default": false
},
"max_pages": {
"type": "integer",
"description": "sitemap + export only: cap on pages scraped (default 20, hard max 100).",
"default": 20
}
},
"required": ["url"]
})
}
/// Reports `ToolCapability::Network` as this tool's only capability.
///
/// NOTE(review): with `export=true` the tool also writes files to disk; if
/// `ToolCapability` has a filesystem-write variant, confirm whether it
/// should be declared here as well.
fn capabilities(&self) -> Vec<ToolCapability> {
vec![ToolCapability::Network]
}
/// Runs the tool against the JSON `input`.
///
/// Reads `url` (required), `mode` (default `"readable"`), `export` (default
/// `false`) and `max_pages` (default 20). `mode == "sitemap"` is delegated to
/// `run_sitemap`; `"raw"` scrapes the whole page; any other value is treated
/// as readable. The request timeout is `context.timeout_secs` clamped to
/// 5..=120.
///
/// The markdown is returned inline, cut to `MAX_INLINE_BYTES` with a trailing
/// note when truncated. With `export` set it is also written to the resolved
/// export directory and the saved path (or the write error) is prepended.
///
/// # Errors
/// Returns `ToolError::InvalidInput` when `url` is missing, not a string, or
/// blank. Scrape failures are reported as `ToolResult::error` inside `Ok`.
async fn execute(&self, input: Value, context: &ToolExecutionContext) -> Result<ToolResult> {
    let url = input
        .get("url")
        .and_then(|v| v.as_str())
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .ok_or_else(|| ToolError::InvalidInput("web_scrape requires a 'url'".into()))?;
    let mode_str = input
        .get("mode")
        .and_then(|v| v.as_str())
        .unwrap_or("readable");
    let export = input
        .get("export")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    // Saturate rather than truncate with `as`: on 32-bit targets a huge
    // `max_pages` would otherwise wrap to a small number. The hard cap in
    // `run_sitemap` bounds the value anyway.
    let max_pages = input
        .get("max_pages")
        .and_then(|v| v.as_u64())
        .map_or(20, |n| usize::try_from(n).unwrap_or(usize::MAX));
    let timeout = context.timeout_secs.clamp(5, 120);
    let session_id = context.session_id;

    if mode_str == "sitemap" {
        return self
            .run_sitemap(url, export, max_pages, session_id, timeout, context)
            .await;
    }

    // Unknown mode strings deliberately fall back to readable.
    let mode = match mode_str {
        "raw" => Mode::Raw,
        _ => Mode::Readable,
    };
    let markdown = match self.scrape_markdown(url, mode, session_id, timeout).await {
        Ok(md) => md,
        Err(e) => {
            return Ok(ToolResult::error(format!(
                "web_scrape failed for {url}: {e}"
            )));
        }
    };

    // `saved` records whether the export actually reached disk, so the
    // truncation note below never promises a file that was not written.
    let (header, saved) = if export {
        let dir =
            export::resolve_export_dir(session_id, context.service_context.as_ref()).await;
        match export::write_markdown(&dir, url, &markdown).await {
            Ok(path) => (format!("Saved to {}\n\n", path.display()), true),
            Err(e) => {
                tracing::warn!("web_scrape: export write failed for {url}: {e}");
                (format!("(export failed: {e})\n\n"), false)
            }
        }
    } else {
        (String::new(), false)
    };

    let body = truncate_utf8(&markdown, MAX_INLINE_BYTES);
    let truncated = markdown.len() > body.len();
    let mut output = format!("{header}{body}");
    if truncated {
        output.push_str(&format!(
            "\n\n[truncated to {} of {} bytes{}]",
            body.len(),
            markdown.len(),
            if saved {
                "; full document written to disk"
            } else {
                ""
            }
        ));
    }

    Ok(ToolResult::success(output)
        .with_metadata("url".into(), url.to_string())
        .with_metadata("bytes".into(), markdown.len().to_string()))
}
}
/// Shortens `s` for inline output by delegating to
/// `crate::utils::string::truncate_str`.
///
/// The returned slice borrows from `s`. Callers in this file compare its
/// length with the original to detect truncation.
///
/// NOTE(review): presumably the helper returns a prefix of at most
/// `max_bytes` bytes cut on a UTF-8 character boundary — confirm against
/// `truncate_str`, whose definition is not visible here.
fn truncate_utf8(s: &str, max_bytes: usize) -> &str {
crate::utils::string::truncate_str(s, max_bytes)
}