use std::fmt::Write as FmtWrite;
use std::time::Instant;
use rust_mcp_sdk::macros::{JsonSchema, mcp_tool};
use rust_mcp_sdk::schema::{CallToolResult, TextContent, schema_utils::CallToolError};
use serde::{Deserialize, Serialize};
use nab::content::budget::truncate_to_budget;
use nab::content::diff::{ContentSnapshot, compute_diff};
use nab::content::diff_format::format_diff_markdown;
use nab::content::focus::extract_focused;
use nab::content::response_classifier::{
ResponseAnalysis, ResponseClass, classify_response, classify_thin_content,
};
use nab::content::snapshot_store::SnapshotStore;
use nab::{AcceleratedClient, SafeFetchConfig};
use crate::helpers::{
convert_body_async, fetch_safe_response, fetch_with_cookies, fetch_with_session_response,
resolve_cookie_header, write_body_info, write_response_summary,
};
use crate::structured::{FetchStructuredParams, build_fetch_structured_v2, truncate_markdown};
use crate::tools::client::{get_client, resolve_session_client};
// Diagnostics produced by the response classifier and forwarded into the
// structured tool result so clients can surface error pages, bot walls,
// or thin/shell pages. `Default` yields the "no signal" state.
#[derive(Debug, Clone, Copy, Default)]
struct FetchDiagnosticMetadata {
    // Stable code of the primary classification signal, if any fired.
    response_class: Option<&'static str>,
    // Confidence score reported by the primary signal.
    response_confidence: Option<f32>,
    // Human-readable reason attached to the primary signal.
    response_reason: Option<&'static str>,
    // True when the thin-content heuristic or a ThinContent signal fired.
    thin_content_detected: bool,
}
// Input parameters for the `fetch` MCP tool. The `#[mcp_tool]` description
// below is part of the tool's published schema — do not edit casually.
// NOTE: field comments use `//` (not `///`) so the JsonSchema derive output
// is guaranteed unchanged.
#[mcp_tool(
    name = "fetch",
    description = "Fetch a URL and convert to clean markdown for LLM consumption.
Content conversion (automatic by Content-Type):
- HTML → clean markdown (boilerplate removed, links preserved)
- PDF → markdown with headings and table detection (requires pdf feature)
- JSON/plain text → passthrough
- SPA data auto-extracted (__NEXT_DATA__, __NUXT__, __APOLLO_STATE__, etc.)
Network features:
- HTTP/2 multiplexing, HTTP/3 (QUIC) with 0-RTT
- TLS 1.3, Brotli/Zstd/Gzip decompression
- Realistic browser fingerprints (Chrome/Firefox/Safari)
- Browser cookie injection (Brave/Chrome/Firefox/Safari)
Diff mode (diff: true):
- Compares current content against the previous snapshot for this URL
- Returns only the changed sections (token-efficient for monitoring tasks)
- First fetch caches the page; subsequent fetches return semantic diffs
- Unchanged content returns a 5-token confirmation instead of full body
Focus mode (focus: query):
- Keeps only sections relevant to the query (BM25 scoring)
- Replaces dropped sections with '[N sections omitted]' markers
- Diff markers are always preserved regardless of relevance
Token budget (max_tokens: N):
- Structure-aware truncation preserving headings, code, and tables
- Priority: title > code/tables > headings (30% cap) > body > blockquotes
Returns: Markdown-converted body with timing info (or diff when diff: true).",
    read_only_hint = true,
    open_world_hint = true
)]
#[derive(Debug, Deserialize, Serialize, JsonSchema)]
#[allow(clippy::struct_excessive_bools)]
pub struct FetchTool {
    // Target URL to fetch.
    url: String,
    // Include response headers in the text output.
    #[serde(default)]
    headers: bool,
    // Append the (truncated) converted body to the text output.
    #[serde(default)]
    body: bool,
    // Explicit Cookie header value; merged/resolved via `resolve_cookie_header`.
    #[serde(default)]
    cookies: Option<String>,
    // Diff mode: compare against the previous snapshot for this URL.
    #[serde(default)]
    diff: bool,
    // Focus query: keep only sections relevant to this query (BM25).
    #[serde(default)]
    focus: Option<String>,
    // Token budget for structure-aware truncation.
    #[serde(default)]
    max_tokens: Option<u64>,
    // Named session: use a dedicated session client with persisted state.
    #[serde(default)]
    session: Option<String>,
    // Route the request through a Tor proxy (best-effort; falls back to direct).
    #[serde(default)]
    tor: bool,
}
impl FetchTool {
    /// Execute the fetch pipeline: pick a client (Tor → session → pooled),
    /// download the URL, convert the body to markdown, then hand off to
    /// [`Self::finish_fetch`] for diff/focus/token-budget post-processing.
    ///
    /// # Errors
    /// Propagates `CallToolError` from session resolution, the network
    /// fetch helpers, and body conversion.
    #[allow(clippy::too_many_lines)]
    pub async fn run(&self) -> Result<CallToolResult, CallToolError> {
        // Log only the host, never the full URL (query strings may carry secrets).
        let url_host = url::Url::parse(&self.url)
            .ok()
            .and_then(|u| u.host_str().map(str::to_owned))
            .unwrap_or_else(|| "<invalid>".to_owned());
        tracing::info!(
            url_host = %url_host,
            has_focus = self.focus.is_some(),
            has_budget = self.max_tokens.is_some(),
            has_session = self.session.is_some(),
            diff = self.diff,
            tor = self.tor,
            "fetch start"
        );
        let start = Instant::now();
        // Tor is best-effort: fall back to a direct connection when the proxy
        // is unavailable instead of failing the whole fetch.
        let tor_client: Option<AcceleratedClient> = if self.tor {
            match AcceleratedClient::with_tor_proxy() {
                Ok(c) => Some(c),
                Err(e) => {
                    tracing::warn!(
                        error = %e,
                        "Tor proxy unavailable; falling back to direct connection"
                    );
                    None
                }
            }
        } else {
            None
        };
        let client: &AcceleratedClient = match tor_client.as_ref() {
            Some(c) => c,
            None => get_client().await,
        };
        let profile = client.profile().await;
        let mut output = format!("🌐 Fetching: {}\n", self.url);
        let _ = writeln!(
            output,
            "🎭 Profile: {}",
            profile.user_agent.split('/').next().unwrap_or("Unknown")
        );
        let cookie_header = resolve_cookie_header(&self.url, self.cookies.as_deref());
        // Named sessions use a dedicated client and bypass the site router.
        if let Some(ref session_name) = self.session {
            let session_client =
                resolve_session_client(session_name, Some(&cookie_header), &self.url).await?;
            // Fix: `session_name` is already bound by the enclosing `if let`;
            // the previous nested re-match of `self.session` was redundant.
            let _ = writeln!(output, "🔑 Session: {session_name}");
            let (status, content_type, response_headers, body_bytes, elapsed) =
                fetch_with_session_response(&session_client, &self.url, start).await?;
            let raw_text = String::from_utf8_lossy(&body_bytes).into_owned();
            write_response_summary(
                &mut output,
                status,
                elapsed,
                self.headers,
                &response_headers,
            );
            write_body_info(&mut output, body_bytes.len());
            let conversion = convert_body_async(&body_bytes, &content_type, &self.url).await?;
            let diagnostics = trace_fetch_classification(
                status.as_u16(),
                &content_type,
                &raw_text,
                body_bytes.len(),
                &conversion.markdown,
                conversion.quality.as_ref(),
            );
            if let Some(pages) = conversion.page_count {
                let _ = writeln!(
                    output,
                    "📑 Pages: {} | Conversion: {:.1}ms",
                    pages, conversion.elapsed_ms
                );
            }
            let markdown = conversion.markdown;
            let status_u16 = status.as_u16();
            let elapsed_ms = elapsed.as_secs_f64() * 1000.0;
            return Ok(self.finish_fetch(
                output,
                markdown,
                status_u16,
                &content_type,
                elapsed_ms,
                diagnostics,
            ));
        }
        // Site-specific providers can short-circuit the generic fetch path
        // with pre-extracted content (reported with a synthetic 200/text-html).
        let site_router = nab::site::SiteRouter::new();
        let cookie_opt = if cookie_header.is_empty() {
            None
        } else {
            Some(cookie_header.as_str())
        };
        let (markdown, status_u16, content_type, elapsed_ms, diagnostics) = if let Some(
            site_content,
        ) =
            site_router.try_extract(&self.url, client, cookie_opt).await
        {
            let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
            output.push_str("\n📄 Content (from specialized provider):\n\n");
            (
                site_content.markdown,
                200u16,
                "text/html".to_owned(),
                elapsed_ms,
                FetchDiagnosticMetadata::default(),
            )
        } else {
            let config = SafeFetchConfig::default();
            let (status, content_type, response_headers, body_bytes, elapsed) =
                if cookie_header.is_empty() {
                    fetch_safe_response(client, &self.url, &config, start).await?
                } else {
                    fetch_with_cookies(client, &self.url, &cookie_header, &profile, start).await?
                };
            let raw_text = String::from_utf8_lossy(&body_bytes).into_owned();
            write_response_summary(
                &mut output,
                status,
                elapsed,
                self.headers,
                &response_headers,
            );
            write_body_info(&mut output, body_bytes.len());
            let conversion = convert_body_async(&body_bytes, &content_type, &self.url).await?;
            if let Some(pages) = conversion.page_count {
                let _ = writeln!(
                    output,
                    "📑 Pages: {} | Conversion: {:.1}ms",
                    pages, conversion.elapsed_ms
                );
            }
            // A large HTML body that converts to very little markdown (or to
            // low-confidence markdown) may be a Next.js shell page; try to
            // recover the real content from the framework's data chunks.
            let quality_is_low = conversion
                .quality
                .as_ref()
                .is_some_and(|q| q.confidence < 0.5);
            let final_markdown = if content_type.contains("html")
                && (conversion.markdown.len() < 800 || quality_is_low)
                && body_bytes.len() > 5_000
            {
                let raw_html = String::from_utf8_lossy(&body_bytes);
                if let Some(recovered) =
                    crate::helpers::recover_nextjs_chunks(client, &raw_html, &self.url).await
                {
                    let _ = writeln!(
                        output,
                        " Recovered {} chars from Next.js content chunk",
                        recovered.len()
                    );
                    recovered
                } else {
                    conversion.markdown
                }
            } else {
                conversion.markdown
            };
            let diagnostics = trace_fetch_classification(
                status.as_u16(),
                &content_type,
                &raw_text,
                body_bytes.len(),
                &final_markdown,
                conversion.quality.as_ref(),
            );
            (
                final_markdown,
                status.as_u16(),
                content_type,
                elapsed.as_secs_f64() * 1000.0,
                diagnostics,
            )
        };
        Ok(self.finish_fetch(
            output,
            markdown,
            status_u16,
            &content_type,
            elapsed_ms,
            diagnostics,
        ))
    }

    /// Shared tail of the fetch pipeline: apply diff mode, focus filtering,
    /// and token-budget truncation, then assemble the final tool result with
    /// structured metadata attached.
    fn finish_fetch(
        &self,
        mut output: String,
        markdown: String,
        status_u16: u16,
        content_type: &str,
        elapsed_ms: f64,
        diagnostics: FetchDiagnosticMetadata,
    ) -> CallToolResult {
        // Diff mode replaces the body with a semantic diff against the last
        // snapshot; otherwise the truncated body is appended only on request.
        let has_diff = if self.diff {
            let (diff_output, had_diff) = apply_diff(&self.url, &markdown);
            output.push('\n');
            output.push_str(&diff_output);
            had_diff
        } else {
            if self.body {
                let truncated = truncate_markdown(&markdown, 4000);
                let _ = write!(output, "\n{truncated}");
            }
            false
        };
        // Focus mode keeps only sections relevant to the query (BM25 scoring).
        let (processed_markdown, omitted_sections, total_sections) =
            if let Some(ref query) = self.focus {
                let focus_result = extract_focused(&markdown, query);
                (
                    focus_result.markdown,
                    focus_result.omitted_sections,
                    focus_result.total_sections,
                )
            } else {
                (markdown, 0, 0)
            };
        // A u64 budget that exceeds usize::MAX (32-bit targets) is clamped.
        let max_tok = self
            .max_tokens
            .map(|t| usize::try_from(t).unwrap_or(usize::MAX));
        let budget_result = truncate_to_budget(&processed_markdown, max_tok);
        let structured = build_fetch_structured_v2(&FetchStructuredParams {
            url: &self.url,
            status: status_u16,
            content_type,
            markdown: &budget_result.markdown,
            timing_ms: elapsed_ms,
            has_diff,
            omitted_sections,
            total_sections,
            truncated: budget_result.truncated,
            full_tokens: budget_result.total_tokens,
            response_class: diagnostics.response_class,
            response_confidence: diagnostics.response_confidence,
            response_reason: diagnostics.response_reason,
            thin_content_detected: diagnostics.thin_content_detected,
        });
        let mut result = CallToolResult::text_content(vec![TextContent::from(output)]);
        result.structured_content = Some(structured);
        result
    }
}
/// Diff `markdown` against the default snapshot store for `url`.
/// Thin convenience wrapper over [`apply_diff_with_store`].
fn apply_diff(url: &str, markdown: &str) -> (String, bool) {
    let store = SnapshotStore::new();
    apply_diff_with_store(&store, url, markdown)
}
/// Classify the fetched response (error pages, bot walls, thin content, …),
/// emit diagnostic tracing, and return the metadata that goes into the
/// structured tool result.
///
/// `markdown` is only meaningful for HTML responses, so markdown-derived
/// signals are fed to the classifier only when the content type contains
/// "html".
fn trace_fetch_classification(
    status: u16,
    content_type: &str,
    raw_text: &str,
    body_len: usize,
    markdown: &str,
    quality: Option<&nab::content::quality::QualityScore>,
) -> FetchDiagnosticMetadata {
    // Hoisted: previously evaluated three times for the three fields below.
    let is_html = content_type.contains("html");
    let classification = classify_response(ResponseAnalysis {
        status,
        body: raw_text,
        content_type: Some(content_type),
        html_bytes: is_html.then_some(body_len),
        markdown: is_html.then_some(markdown),
        markdown_chars: is_html.then_some(markdown.len()),
        quality,
    });
    // Bind the primary signal once instead of re-querying it when building
    // the returned metadata.
    let primary = classification.primary();
    if let Some(signal) = primary {
        tracing::warn!(
            status,
            class = ?signal.class,
            confidence = ?signal.confidence,
            reason = signal.reason,
            "fetch response classified"
        );
    }
    let thin_content_detected =
        classify_thin_content(Some(content_type), body_len, markdown.len(), quality).is_some();
    if thin_content_detected {
        tracing::warn!(
            status,
            markdown_len = markdown.len(),
            body_len,
            "fetch response classified as thin content"
        );
    }
    FetchDiagnosticMetadata {
        response_class: primary.map(|signal| signal.class.code()),
        response_confidence: primary.map(|signal| signal.confidence),
        response_reason: primary.map(|signal| signal.reason),
        // Thin content may surface via the dedicated heuristic OR as a
        // classifier signal; report it if either fires.
        thin_content_detected: thin_content_detected
            || classification.has_class(ResponseClass::ThinContent),
    }
}
/// Compare `markdown` against the previous snapshot for `url` in `store`.
///
/// Returns the human-readable result text plus a flag indicating whether
/// actual changes were found (`false` for the first fetch and for unchanged
/// content). The new snapshot is always persisted — even when nothing
/// changed — so the stored copy tracks the latest fetch; save failures are
/// deliberately ignored because diffing is best-effort.
///
/// Fix: `has_diff` is now derived structurally from the match branch instead
/// of the previous fragile prefix-matching on the user-facing output text.
pub(crate) fn apply_diff_with_store(
    store: &SnapshotStore,
    url: &str,
    markdown: &str,
) -> (String, bool) {
    let new_snap = ContentSnapshot::new(url, markdown, std::time::SystemTime::now());
    // Load the previous snapshot before overwriting it with the new one.
    let previous = store.load_latest_snapshot(url);
    let _ = store.save_snapshot(url, &new_snap);
    match previous {
        Some(old_snap) if old_snap.content_unchanged(&new_snap) => {
            // Token-efficient confirmation instead of re-sending the body.
            ("No changes since last fetch".to_owned(), false)
        }
        Some(old_snap) => {
            let diff = compute_diff(&old_snap, &new_snap);
            (
                format!(
                    "Changed since last fetch:\n\n{}",
                    format_diff_markdown(&diff)
                ),
                true,
            )
        }
        None => (
            format!(
                "First fetch (cached for future diff):\n\n{}",
                truncate_markdown(markdown, 4000)
            ),
            false,
        ),
    }
}