use std::fs::File;
use std::io::Write;
use std::path::PathBuf;
use std::time::{Instant, SystemTime};
use anyhow::Result;
use serde_json::json;
use nab::content::diff::ContentSnapshot;
use nab::content::diff_format::format_diff_terminal;
use nab::content::ocr::fetch_integration::FetchOcrEnricher;
use nab::content::response_classifier::{
ResponseAnalysis, ResponseClass, ThinContentDiagnostic, classify_response,
classify_thin_content,
};
use nab::content::snapshot_store::SnapshotStore;
use nab::{AcceleratedClient, OnePasswordAuth, SafeFetchConfig};
use super::output::{output_body, write_stdout, write_stdout_line};
use crate::OutputFormat;
/// All parameters for a `nab fetch` invocation.
///
/// Constructed from CLI arguments in `main.rs` and threaded through the
/// fetch pipeline, replacing the 22-positional-parameter function signature.
#[allow(clippy::struct_excessive_bools)] // 1:1 map of CLI boolean flags
pub struct FetchConfig {
    /// Target URL to fetch.
    pub url: String,
    /// When `true`, response headers are collected and printed.
    pub show_headers: bool,
    /// When `true`, the response body is printed even in compact mode.
    pub show_body: bool,
    /// Output format (`Full`, `Compact`, or `Json`).
    pub format: OutputFormat,
    /// Optional file path to write the body to instead of / alongside stdout.
    pub output_file: Option<PathBuf>,
    /// Cookie source: `"auto"` for browser auto-detection or a named browser
    /// profile — resolved via `super::resolve_cookie_header`.
    pub cookies: String,
    /// When `true`, look up a matching credential via the 1Password CLI
    /// (informational only; see `cmd_fetch`).
    pub use_1password: bool,
    /// When `true`, skip HTML→markdown conversion and emit the raw body.
    pub raw_html: bool,
    /// Link-extraction flag, forwarded to `output_body`.
    pub links: bool,
    /// Maximum body size to print, forwarded to `output_body`.
    pub max_body: usize,
    /// Extra request headers as `"Name: value"` strings.
    pub custom_headers: Vec<String>,
    /// When `true`, derive and send a `Referer` header from the URL.
    pub auto_referer: bool,
    /// Optional URL fetched first to warm up the session (errors ignored).
    pub warmup_url: Option<String>,
    /// HTTP method name; unknown methods fall back to GET.
    pub method: String,
    /// Optional request body (Content-Type defaults to `application/json`
    /// unless a custom header overrides it).
    pub data: Option<String>,
    /// When `true`, print `Set-Cookie` headers received in the response.
    pub capture_cookies: bool,
    /// When `true`, build the client with redirects disabled.
    pub no_redirect: bool,
    /// Optional batch-file path; when set, `cmd_fetch` delegates to
    /// `fetch_batch::cmd_fetch_batch`.
    pub batch_file: Option<String>,
    /// Batch-mode parallelism (not read in this module).
    pub parallel: usize,
    /// Explicit proxy URL; takes precedence over `tor` and env vars.
    pub proxy: Option<String>,
    /// When `true`, route all requests through the Tor SOCKS5 proxy at
    /// `localhost:9050`. DNS resolution is also proxied (`socks5h://`) to
    /// prevent DNS leaks. If Tor is not running the request falls back to a
    /// direct connection with a warning printed to stderr.
    pub tor: bool,
    /// When `true`, print a diff against the previous snapshot of this URL.
    pub show_diff: bool,
    /// HTML→markdown conversion options forwarded to `ContentRouter`.
    pub html_options: nab::content::html::HtmlConversionOptions,
    /// When `true`, skip saving the fetch result to hebb's kv store.
    pub no_save: bool,
    /// When `true`, skip OCR-enriching images in the fetched HTML.
    pub no_ocr: bool,
    /// When `true`, do not auto-transcribe media URLs; fall back to normal HTML fetch.
    pub no_transcribe: bool,
    /// Optional BCP-47 language hint for transcription (e.g. `"fi"`, `"en-US"`).
    pub language: Option<String>,
}
/// Fetch `cfg.url` and print the result in the requested output format.
///
/// Pipeline, in order: batch delegation, client construction, cookie
/// resolution, media auto-transcription, site-specific extraction, optional
/// 1Password lookup and warm-up request, the HTTP request itself (safe-GET
/// fast path or manually-built request), markdown conversion with Next.js
/// chunk recovery, OCR enrichment, auto-save to hebb, diagnostics, optional
/// diff, and final printing via `print_output`.
///
/// # Errors
///
/// Propagates client construction, request, conversion, and output errors.
#[allow(clippy::too_many_lines)] // Orchestration function; splitting would hurt readability
pub async fn cmd_fetch(cfg: &FetchConfig) -> Result<()> {
    // Handle batch mode
    if cfg.batch_file.is_some() {
        return super::fetch_batch::cmd_fetch_batch(cfg).await;
    }
    let client = build_client(cfg.no_redirect, cfg.proxy.as_deref(), cfg.tor)?;
    let profile = client.profile().await;
    let domain = super::extract_domain(&cfg.url);
    let cookie_header = super::resolve_cookie_header(&cfg.cookies, &domain);
    if !cookie_header.is_empty() && matches!(cfg.format, OutputFormat::Full) {
        write_stdout_line(&format!("šŖ Loading cookies for {domain}"))?;
    }
    // āā Media URL auto-transcription āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    if !cfg.no_transcribe && !cfg.raw_html && nab::content::media::is_media_url(&cfg.url) {
        tracing::info!(url = %cfg.url, "detected media URL ā transcribing");
        match nab::content::media::fetch_media_as_markdown(&cfg.url, cfg.language.as_deref(), false)
            .await
        {
            Ok(result) => {
                // Transcription succeeded: optionally save, print, and return early.
                if !cfg.no_save {
                    save_to_hebb(&cfg.url, &result.markdown, "").await;
                }
                output_body(
                    &result.markdown,
                    cfg.output_file.as_deref(),
                    cfg.links,
                    cfg.max_body,
                )?;
                return Ok(());
            }
            Err(e) => {
                // Fall through to the normal HTML fetch path below.
                tracing::warn!("media transcription failed ({e:#}), falling back to normal fetch");
            }
        }
    }
    // Site-specific extractors get first crack at the URL; a hit short-circuits
    // the generic fetch pipeline entirely.
    let site_router = nab::site::SiteRouter::new();
    let cookie_opt = non_empty(&cookie_header);
    if let Some(site_content) = site_router.try_extract(&cfg.url, &client, cookie_opt).await {
        output_body(
            &site_content.markdown,
            cfg.output_file.as_deref(),
            cfg.links,
            cfg.max_body,
        )?;
        return Ok(());
    }
    let markdown = !cfg.raw_html;
    // 1Password lookup is informational only here — the credential is printed
    // but not attached to the request in this function.
    if cfg.use_1password && OnePasswordAuth::is_available() {
        let auth = OnePasswordAuth::new(None);
        if let Ok(Some(cred)) = auth.get_credential_for_url(&cfg.url)
            && matches!(cfg.format, OutputFormat::Full)
        {
            write_stdout_line(&format!("š Found 1Password: {}", cred.title))?;
        }
    }
    // Optional warm-up request; its result is deliberately ignored.
    if let Some(warmup) = &cfg.warmup_url {
        if matches!(cfg.format, OutputFormat::Full) {
            write_stdout_line(&format!("š„ Warming up session: {warmup}"))?;
        }
        let mut warmup_req = client.inner().get(warmup.as_str());
        warmup_req = warmup_req.headers(profile.to_headers());
        if !cookie_header.is_empty() {
            warmup_req = warmup_req.header("Cookie", &cookie_header);
        }
        let _ = warmup_req.send().await;
    }
    let start = Instant::now();
    // A plain GET with no customization can use the hardened fetch_safe path.
    let is_simple_get = cfg.method.eq_ignore_ascii_case("GET")
        && cookie_header.is_empty()
        && cfg.custom_headers.is_empty()
        && cfg.data.is_none()
        && !cfg.auto_referer
        && !cfg.no_redirect;
    let (status, version, set_cookies, content_type, response_headers, body_bytes) =
        if is_simple_get {
            execute_safe_get(&client, &cfg.url, cfg.show_headers).await?
        } else {
            execute_manual_request(&client, cfg, &profile, &cookie_header).await?
        };
    let elapsed = start.elapsed();
    let raw_text = String::from_utf8_lossy(&body_bytes).to_string();
    if cfg.capture_cookies && !set_cookies.is_empty() {
        write_stdout_line("šŖ Set-Cookie:")?;
        for cookie in &set_cookies {
            // Print only the name=value pair, dropping attributes after ';'.
            if let Some(name_value) = cookie.split(';').next() {
                write_stdout_line(&format!(" {name_value}"))?;
            }
        }
    }
    let body_len = body_bytes.len();
    let (body_text, quality) = if markdown && !cfg.links {
        let converted = convert_body_to_markdown(
            &body_bytes,
            &content_type,
            &cfg.url,
            cfg.format,
            cfg.html_options,
        )
        .await?;
        // Attempt Next.js content chunk recovery when extraction yields thin content.
        // The readability extractor may capture 300-600 chars of nav/header/footer
        // even when the article body is empty, so we use a higher threshold (800)
        // combined with a low quality confidence score (<0.5) to detect this case.
        let quality_is_low = converted
            .quality
            .as_ref()
            .is_some_and(|q| q.confidence < 0.5);
        let is_nextjs_meta = content_type.contains("html")
            && body_len > 5_000
            && nab::content::spa_extract::is_nextjs_metadata_only(&raw_text);
        let (final_markdown, final_quality) = if is_nextjs_meta
            && (converted.markdown.len() < 800 || quality_is_low)
        {
            match recover_nextjs_content_chunks(&client, &raw_text, &cfg.url, cfg.format).await {
                Some(recovered) => (recovered, converted.quality),
                None => (converted.markdown, converted.quality),
            }
        } else {
            (converted.markdown, converted.quality)
        };
        (final_markdown, final_quality)
    } else {
        // Raw mode (or link extraction): pass the body through unconverted.
        (raw_text.clone(), None)
    };
    // āā Apple Vision OCR enrichment āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    let body_text = if !cfg.no_ocr && !cfg.raw_html && content_type.contains("html") {
        enrich_with_ocr(&body_text, &raw_text, &cfg.url, &client).await
    } else {
        body_text
    };
    // āā Auto-save to hebb kv:urls āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    if !cfg.no_save {
        save_to_hebb(&cfg.url, &body_text, &raw_text).await;
    }
    // Emit actionable warnings (bot challenge, rate limit, auth, thin content).
    for warning in build_fetch_diagnostics(
        status.as_u16(),
        &raw_text,
        Some(&content_type),
        body_len,
        &body_text,
        quality.as_ref(),
        cfg.html_options.allow_jina_fallback,
    ) {
        eprintln!("ā ļø {warning}");
    }
    if cfg.show_diff {
        emit_diff(&cfg.url, &body_text, cfg.format)?;
    }
    print_output(
        cfg,
        &FetchResponse {
            profile: &profile,
            cookie_header: &cookie_header,
            status,
            version: &version,
            elapsed,
            response_headers: &response_headers,
            body_len,
            body_text: &body_text,
            raw_text: &raw_text,
            content_type: &content_type,
            quality: quality.as_ref(),
        },
    )?;
    Ok(())
}
/// Execute a safe GET request via `fetch_safe`.
///
/// Returns `(status, version, set_cookies, content_type, headers, body)`.
/// Response headers are only cloned out when `show_headers` is set.
async fn execute_safe_get(
    client: &AcceleratedClient,
    url: &str,
    show_headers: bool,
) -> Result<(
    reqwest::StatusCode,
    String,
    Vec<String>,
    String,
    Vec<(String, String)>,
    bytes::Bytes,
)> {
    let fetch_cfg = SafeFetchConfig::default();
    let resp = client.fetch_safe(url, &fetch_cfg).await?;
    // Collect every Set-Cookie value, matching the header name case-insensitively.
    let mut cookies = Vec::new();
    for (name, value) in &resp.headers {
        if name.eq_ignore_ascii_case("set-cookie") {
            cookies.push(value.clone());
        }
    }
    // Skip the header clone entirely unless the caller wants to display them.
    let headers = if show_headers {
        resp.headers.clone()
    } else {
        Vec::new()
    };
    // NOTE(review): the version string is hardcoded; fetch_safe does not appear
    // to expose the negotiated HTTP version — confirm if exactness matters.
    Ok((
        resp.status,
        String::from("HTTP/2"),
        cookies,
        resp.content_type.clone(),
        headers,
        resp.body,
    ))
}
/// Execute a manually-built request (non-GET, cookies, custom headers, etc.).
///
/// Builds the request in layers: method, optional body (with a JSON
/// Content-Type default), browser-profile headers, cookie, optional Referer,
/// and finally user-supplied headers — so custom headers can override any
/// earlier value. Returns
/// `(status, version, set_cookies, content_type, headers, body)`.
///
/// # Errors
///
/// Propagates send and body-read failures from `reqwest`.
async fn execute_manual_request(
    client: &AcceleratedClient,
    cfg: &FetchConfig,
    profile: &nab::fingerprint::BrowserProfile,
    cookie_header: &str,
) -> Result<(
    reqwest::StatusCode,
    String,
    Vec<String>,
    String,
    Vec<(String, String)>,
    bytes::Bytes,
)> {
    let url = &cfg.url;
    // Unknown method names fall back to GET.
    let mut request = match cfg.method.to_uppercase().as_str() {
        "POST" => client.inner().post(url),
        "PUT" => client.inner().put(url),
        "PATCH" => client.inner().patch(url),
        "DELETE" => client.inner().delete(url),
        "HEAD" => client.inner().head(url),
        _ => client.inner().get(url),
    };
    if let Some(body_data) = &cfg.data {
        request = request.body(body_data.clone());
        // Default Content-Type to JSON unless the caller supplied one.
        if !cfg
            .custom_headers
            .iter()
            .any(|h| h.to_lowercase().starts_with("content-type"))
        {
            request = request.header("Content-Type", "application/json");
        }
    }
    request = request.headers(profile.to_headers());
    if !cookie_header.is_empty() {
        request = request.header("Cookie", cookie_header);
    }
    if cfg.auto_referer
        && let Some(referer) = super::build_referer(url)
    {
        request = request.header("Referer", referer);
    }
    // Parse "Name: value" strings; malformed entries are silently dropped.
    for header_str in &cfg.custom_headers {
        let parts: Vec<&str> = header_str.splitn(2, ':').collect();
        if parts.len() == 2 {
            request = request.header(parts[0].trim(), parts[1].trim());
        }
    }
    let response = request.send().await?;
    let status = response.status();
    let version_str = format!("{:?}", response.version());
    let set_cookies: Vec<String> = response
        .headers()
        .get_all("set-cookie")
        .iter()
        .filter_map(|v| v.to_str().ok().map(String::from))
        .collect();
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("text/html")
        .to_string();
    // Only materialize header pairs when they will be displayed.
    let resp_headers: Vec<(String, String)> = if cfg.show_headers {
        response
            .headers()
            .iter()
            .map(|(name, value)| {
                (
                    name.to_string(),
                    value.to_str().unwrap_or("<binary>").to_string(),
                )
            })
            .collect()
    } else {
        Vec::new()
    };
    let bytes = response.bytes().await?;
    Ok((
        status,
        version_str,
        set_cookies,
        content_type,
        resp_headers,
        bytes,
    ))
}
/// Attempt to recover article content from Next.js webpack content chunks.
///
/// When a Next.js page has `__NEXT_DATA__` with only metadata (no article
/// body), the content is often compiled into a lazy-loaded webpack chunk
/// (common for MDX blogs). Delegates the actual discovery/fetch/extract work
/// to `nab::util::recover_nextjs_chunks`, adding progress messages on stderr
/// when the output format is `Full`.
///
/// Returns `Some(markdown)` on success, `None` if recovery fails at any step.
async fn recover_nextjs_content_chunks(
    client: &nab::AcceleratedClient,
    html: &str,
    page_url: &str,
    format: crate::OutputFormat,
) -> Option<String> {
    let verbose = matches!(format, crate::OutputFormat::Full);
    if verbose {
        eprintln!("  Attempting Next.js content chunk recovery...");
    }
    let result = nab::util::recover_nextjs_chunks(client, html, page_url).await;
    if verbose
        && let Some(text) = result.as_ref()
    {
        eprintln!("  Recovered {} chars from content chunk", text.len());
    }
    result
}
/// Conversion result bundled with quality metadata for the output layer.
struct ConvertedBody {
    // Markdown produced by the content router.
    markdown: String,
    // Extraction quality score; `None` when the router reports none
    // (e.g. non-HTML content).
    quality: Option<nab::content::quality::QualityScore>,
}
/// Convert body bytes to markdown via `ContentRouter`.
///
/// Conversion is CPU-bound, so it runs on a blocking thread via
/// `spawn_blocking`, wrapped in a 60-second timeout. In `Full` format the
/// page count and conversion time are printed when available.
///
/// # Errors
///
/// Fails on timeout, on a panicked/cancelled blocking task, or when the
/// router itself reports a conversion error.
async fn convert_body_to_markdown(
    body_bytes: &bytes::Bytes,
    content_type: &str,
    url: &str,
    format: OutputFormat,
    html_options: nab::content::html::HtmlConversionOptions,
) -> Result<ConvertedBody> {
    let router = nab::content::ContentRouter::with_html_options(html_options);
    // Clone inputs into owned values so the closure can be 'static.
    let bytes = body_bytes.to_vec();
    let ct = content_type.to_string();
    let fetch_url = url.to_string();
    let result = tokio::time::timeout(
        std::time::Duration::from_secs(60),
        tokio::task::spawn_blocking(move || router.convert_with_url(&bytes, &ct, Some(&fetch_url))),
    )
    .await
    // Three `?`s unwrap, in order: the timeout, the JoinError from
    // spawn_blocking, and the router's own conversion Result.
    .map_err(|_| anyhow::anyhow!("Content conversion timed out after 60s"))???;
    if matches!(format, OutputFormat::Full)
        && let Some(pages) = result.page_count
    {
        write_stdout_line(&format!(" Pages: {pages}"))?;
        write_stdout_line(&format!(" Conversion: {:.1}ms", result.elapsed_ms))?;
    }
    Ok(ConvertedBody {
        quality: result.quality,
        markdown: result.markdown,
    })
}
/// Response data collected after the HTTP request completes.
///
/// Borrowed view over the values `cmd_fetch` gathered; consumed read-only
/// by `print_output`.
struct FetchResponse<'a> {
    // Browser fingerprint used for the request (user-agent is printed).
    profile: &'a nab::fingerprint::BrowserProfile,
    // Resolved Cookie header; empty string when no cookies were sent.
    cookie_header: &'a str,
    status: reqwest::StatusCode,
    // HTTP version as a display string (e.g. from `format!("{:?}", ...)`).
    version: &'a str,
    // Wall-clock duration of the request.
    elapsed: std::time::Duration,
    // Header pairs; empty unless `show_headers` was requested.
    response_headers: &'a [(String, String)],
    // Raw body size in bytes (pre-conversion).
    body_len: usize,
    // Converted (or raw, in raw mode) body used for display.
    body_text: &'a str,
    // Lossy-UTF-8 original body, used for title extraction.
    raw_text: &'a str,
    content_type: &'a str,
    /// Extraction quality score ā present for HTML, absent for raw/binary content.
    quality: Option<&'a nab::content::quality::QualityScore>,
}
/// Print the response according to the requested output format.
///
/// - `Compact`: one status/size/time line, then the body when applicable.
/// - `Json`: a single-line JSON object on stdout (plus the body written to
///   `--output` when given).
/// - `Full`: human-readable request/response summary, then the body.
///
/// # Errors
///
/// Propagates stdout-write, JSON-serialization, and file-write failures.
fn print_output(cfg: &FetchConfig, resp: &FetchResponse<'_>) -> Result<()> {
    let markdown = !cfg.raw_html;
    let out_path = cfg.output_file.as_deref();
    match cfg.format {
        OutputFormat::Compact => {
            write_stdout_line(&format!(
                "{} {}B {:.0}ms",
                resp.status.as_u16(),
                resp.body_len,
                resp.elapsed.as_secs_f64() * 1000.0
            ))?;
            if cfg.show_body || out_path.is_some() || markdown || cfg.links {
                output_body(resp.body_text, out_path, cfg.links, cfg.max_body)?;
            }
        }
        OutputFormat::Json => {
            let metadata = serde_json::json!({
                "title": extract_title(resp.raw_text),
                "content_length": resp.body_len,
                "content_type": resp.content_type,
            });
            let mut output = serde_json::json!({
                "url": cfg.url,
                "status": resp.status.as_u16(),
                "content_type": resp.content_type,
                "markdown": resp.body_text,
                "metadata": metadata,
                // Rounded to one decimal place.
                "elapsed_ms": (resp.elapsed.as_secs_f64() * 1000.0 * 10.0).round() / 10.0,
            });
            // Quality fields are only present for converted HTML.
            if let Some(q) = resp.quality {
                output["confidence"] = serde_json::json!(q.confidence);
                output["quality"] = serde_json::to_value(q)?;
            }
            write_stdout_line(&serde_json::to_string(&output)?)?;
            // JSON mode writes the body to --output directly (output_body is
            // not called in this branch).
            if let Some(path) = out_path {
                let mut file = File::create(path)?;
                file.write_all(resp.body_text.as_bytes())?;
            }
        }
        OutputFormat::Full => {
            write_stdout_line(&format!("š Fetching: {}", cfg.url))?;
            write_stdout_line(&format!("š User-Agent: {}", resp.profile.user_agent))?;
            if !resp.cookie_header.is_empty() {
                write_stdout_line(&format!(
                    "šŖ Loaded {} cookies from {}",
                    // Approximate cookie count by counting '=' separators.
                    resp.cookie_header.matches('=').count(),
                    if cfg.cookies == "auto" {
                        "browser (auto-detected)"
                    } else {
                        &cfg.cookies
                    }
                ))?;
            }
            write_stdout_line("\nš Response:")?;
            write_stdout_line(&format!(" Status: {}", resp.status))?;
            write_stdout_line(&format!(" Version: {}", resp.version))?;
            write_stdout_line(&format!(
                " Time: {:.2}ms",
                resp.elapsed.as_secs_f64() * 1000.0
            ))?;
            if cfg.show_headers {
                write_stdout_line("\nš Headers:")?;
                for (name, value) in resp.response_headers {
                    write_stdout_line(&format!(" {name}: {value}"))?;
                }
            }
            write_stdout_line(&format!("\nš Body: {} bytes", resp.body_len))?;
            if cfg.show_body || out_path.is_some() || markdown || cfg.links {
                output_body(resp.body_text, out_path, cfg.links, cfg.max_body)?;
            }
        }
    }
    Ok(())
}
/// Load the previous snapshot, compute diff, print it, then save new snapshot.
///
/// In `Json` mode the diff goes to stderr so stdout stays a clean JSON
/// payload. Snapshot persistence is best-effort: save failures are ignored.
fn emit_diff(url: &str, current_text: &str, format: OutputFormat) -> Result<()> {
    let store = SnapshotStore::default();
    let current = ContentSnapshot::new(url, current_text, SystemTime::now());
    match store.load_latest_snapshot(url) {
        Some(previous) => {
            let diff = nab::content::diff::compute_diff(&previous, &current);
            let rendered = format_diff_terminal(&diff);
            if matches!(format, OutputFormat::Json) {
                eprint!("{rendered}");
            } else {
                write_stdout(&rendered)?;
            }
        }
        None => {
            // First run for this URL: just record the baseline.
            if matches!(format, OutputFormat::Full) {
                write_stdout_line("(no previous snapshot ā storing baseline for future --diff runs)")?;
            }
        }
    }
    let _ = store.save_snapshot(url, &current);
    Ok(())
}
/// Extract the text content of the first `<title>` element from HTML.
///
/// Returns `None` when the selector fails to parse or no `<title>` exists;
/// surrounding whitespace is trimmed from the result.
fn extract_title(html: &str) -> Option<String> {
    let selector = scraper::Selector::parse("title").ok()?;
    let document = scraper::Html::parse_document(html);
    let title_el = document.select(&selector).next()?;
    let text: String = title_el.text().collect();
    Some(text.trim().to_string())
}
/// Build user-facing warning strings for a completed fetch.
///
/// Classifies the response via `classify_response` and maps the primary
/// class (bot challenge, rate limit, auth, forbidden, obfuscated, …) to an
/// actionable message. Thin-content detection runs separately afterwards so
/// its warning can appear alongside a primary classification.
///
/// Returns an empty `Vec` when nothing looks wrong.
fn build_fetch_diagnostics(
    status: u16,
    raw_text: &str,
    content_type: Option<&str>,
    html_len: usize,
    body_text: &str,
    quality: Option<&nab::content::quality::QualityScore>,
    allow_jina_fallback: bool,
) -> Vec<String> {
    let mut warnings = Vec::new();
    let markdown_len = body_text.len();
    let classification = classify_response(ResponseAnalysis {
        status,
        body: raw_text,
        content_type,
        // HTML-specific fields are only populated for HTML content types.
        html_bytes: content_type
            .is_some_and(|value| value.contains("html"))
            .then_some(html_len),
        markdown: content_type
            .is_some_and(|value| value.contains("html"))
            .then_some(body_text),
        markdown_chars: content_type
            .is_some_and(|value| value.contains("html"))
            .then_some(markdown_len),
        quality,
    });
    if let Some(primary) = classification.primary() {
        let warning = match primary.class {
            ResponseClass::BotChallenge => format!(
                "Bot or browser challenge detected (HTTP {status}). Browser challenge likely requires cookies or JavaScript.\nTry:\n1. Visit the URL in a browser first\n2. Let nab reuse your default browser cookies automatically unless you intentionally disabled them\n3. Use --cookies brave|chrome|firefox|safari only to override the default browser profile"
            ),
            ResponseClass::RateLimited => format!(
                "Rate limiting detected (HTTP {status}).\nRetry later, or use an authenticated browser/session path if the site rate-limits anonymous traffic."
            ),
            ResponseClass::Unauthorized => format!(
                "Authenticated access appears to be required (HTTP {status}).\nSign in in your browser first, then retry with the default browser cookies or a named session. If you explicitly disabled cookies, re-enable them."
            ),
            ResponseClass::LoginRequired => "The response looks like a login page.\nSign in in your browser first, then retry. nab already uses your default browser cookies automatically unless you disabled them."
                .to_string(),
            ResponseClass::Forbidden => {
                // HTTP 999 is a nonstandard block status some sites return.
                if status == 999 {
                    "Nonstandard block status HTTP 999 detected.\nSome sites use this as an anti-automation or access-control response. Retry with the default browser cookies, or override the browser profile with --cookies brave if the authenticated session lives outside your default browser."
                        .to_string()
                } else {
                    format!(
                        "Forbidden response (HTTP {status}).\nAccess may require authentication, an allowed browser session, or different permissions."
                    )
                }
            }
            ResponseClass::ObfuscatedContent => {
                obfuscated_content_message(status, markdown_len, html_len)
            }
            // Thin content is handled by the dedicated check below.
            ResponseClass::ThinContent => String::new(),
        };
        if !warning.is_empty() {
            warnings.push(warning);
        }
    }
    if let Some(thin) = classify_thin_content(content_type, html_len, markdown_len, quality) {
        warnings.push(thin_content_message(thin));
        if !allow_jina_fallback {
            warnings.push("Remote reader fallback is disabled by --no-fallback.".to_string());
        }
    }
    warnings
}
/// Build the warning shown when extracted content looks like an opaque,
/// encoded blob rather than readable text (common on protected/paywalled
/// pages). The sizes reported are the extracted char count and raw HTML size.
fn obfuscated_content_message(status: u16, markdown_len: usize, html_len: usize) -> String {
    let mut message = format!(
        "The extracted page content looks encoded or obfuscated rather than readable text (HTTP {status})."
    );
    message.push_str(
        "\nThis often happens on protected or paywalled pages that return an opaque payload instead of article content.",
    );
    message.push_str(&format!(
        "\nObserved output: {markdown_len} chars extracted from {html_len} bytes of HTML."
    ));
    message.push_str(
        "\nTry:\n1. Sign in in your browser first and retry with the default browser cookies\n2. Use a named login session if the site requires an authenticated flow\n3. If the page still returns an opaque blob, the site is likely withholding readable content from non-browser automation",
    );
    message
}
/// Build the warning for suspiciously thin extracted content.
///
/// Prefers the library's `detect_thin_content` message when it produces one;
/// otherwise falls back to a generic hint that reports the observed sizes and
/// lists recovery options.
fn thin_content_message(diagnostic: ThinContentDiagnostic) -> String {
    if let Some(message) =
        nab::content::html::detect_thin_content(diagnostic.html_bytes, diagnostic.markdown_chars)
    {
        return message;
    }
    // The trailing `\` continuations strip the newline and following
    // indentation from the literal, so the rendered text is unaffected.
    format!(
        "Output is suspiciously thin ({} chars from {} bytes of HTML). \
        Extraction confidence is low, so the main content may be missing.\n \
        1. nab spa <url> (extract embedded SPA data)\n \
        2. nab fetch <url> (uses default browser cookies automatically)\n \
        3. nab fetch --cookies brave <url> (override the browser profile if needed)",
        diagnostic.markdown_chars, diagnostic.html_bytes
    )
}
// āāā OCR enrichment āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
/// Run OCR on images in `html` and annotate `markdown` with recognized text.
///
/// Returns the original markdown unchanged when the OCR engine is unavailable
/// or when no thin-alt images are found. Per-image errors are silently skipped.
async fn enrich_with_ocr(
    markdown: &str,
    html: &str,
    url: &str,
    client: &AcceleratedClient,
) -> String {
    // Bail out early if the enricher can't be built or the engine is absent.
    let Ok(enricher) = FetchOcrEnricher::new() else {
        return markdown.to_string();
    };
    if !enricher.is_available() {
        return markdown.to_string();
    }
    let http = client.inner().clone();
    let recognized = enricher.enrich_images(html, url, &http).await;
    if recognized.is_empty() {
        markdown.to_string()
    } else {
        enricher.annotate_markdown(markdown, &recognized)
    }
}
// āāā hebb kv save āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
/// Save the fetch result to hebb's `kv:urls` namespace for future retrieval.
///
/// Spawns `hebb-mcp` as a one-shot child process and sends a single
/// `tools/call kv_set` request over its stdio. Silently no-ops when
/// `hebb-mcp` is not installed. All errors are logged as `debug`.
async fn save_to_hebb(url: &str, markdown: &str, html: &str) {
    if !hebb_is_available() {
        return;
    }
    // Title comes from the raw HTML (may be empty, e.g. for media fetches).
    let title = extract_title(html).unwrap_or_default();
    let key = url_key(url);
    match hebb_kv_set_oneshot("urls", &key, url, &title, markdown).await {
        Ok(()) => {}
        Err(e) => tracing::debug!("hebb kv_set skipped: {e}"),
    }
}
/// Return `true` when `hebb-mcp` is locatable on this system.
///
/// Checks `$PATH` first, then the managed install location under the
/// platform's local data directory.
fn hebb_is_available() -> bool {
    which::which("hebb-mcp").is_ok()
        || dirs::data_local_dir()
            .map(|dir| dir.join("hebb/bin/hebb-mcp").exists())
            .unwrap_or(false)
}
/// Spawn `hebb-mcp` for a single `kv_set` call then let the process exit.
///
/// Uses the same MCP JSON-RPC stdio protocol as `HebbClient` but without
/// maintaining a long-lived subprocess. The exchange is: `initialize`
/// request/response, `notifications/initialized`, then one `tools/call
/// kv_set` whose response is awaited with a 5-second best-effort timeout.
///
/// # Errors
///
/// Fails when the binary cannot be located or spawned, or when any stdio
/// write/read or JSON (de)serialization step fails.
async fn hebb_kv_set_oneshot(
    namespace: &str,
    key: &str,
    url: &str,
    title: &str,
    markdown: &str,
) -> Result<()> {
    use std::process::Stdio;
    use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
    // Locate the binary: $PATH first, then the managed install directory.
    let binary = if let Ok(p) = which::which("hebb-mcp") {
        p
    } else if let Some(managed) = dirs::data_local_dir().map(|d| d.join("hebb/bin/hebb-mcp")) {
        if managed.exists() {
            managed
        } else {
            return Err(anyhow::anyhow!("hebb-mcp not found"));
        }
    } else {
        return Err(anyhow::anyhow!("hebb-mcp not found"));
    };
    let mut child = tokio::process::Command::new(&binary)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .map_err(|e| anyhow::anyhow!("spawn hebb-mcp: {e}"))?;
    let mut stdin = child.stdin.take().expect("piped");
    let stdout = child.stdout.take().expect("piped");
    let mut reader = BufReader::new(stdout);
    // āā initialize āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    let init_req = serde_json::to_string(&json!({
        "jsonrpc": "2.0", "id": 0, "method": "initialize",
        "params": {
            "protocolVersion": "2025-11-25",
            "capabilities": { "sampling": {} },
            "clientInfo": { "name": "nab-cli", "version": env!("CARGO_PKG_VERSION") }
        }
    }))?;
    stdin.write_all(format!("{init_req}\n").as_bytes()).await?;
    stdin.flush().await?;
    // Wait for initialize response.
    let mut line = String::new();
    reader.read_line(&mut line).await?;
    // Parsed only to validate it is well-formed JSON; content is ignored.
    let _init_resp: serde_json::Value = serde_json::from_str(line.trim())?;
    // initialized notification
    let notif = serde_json::to_string(&json!({
        "jsonrpc": "2.0", "method": "notifications/initialized", "params": {}
    }))?;
    stdin.write_all(format!("{notif}\n").as_bytes()).await?;
    stdin.flush().await?;
    // āā kv_set āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    let call_req = serde_json::to_string(&json!({
        "jsonrpc": "2.0", "id": 1, "method": "tools/call",
        "params": {
            "name": "kv_set",
            "arguments": {
                "namespace": namespace,
                "key": key,
                "value": { "url": url, "title": title },
                "content_text": markdown,
            }
        }
    }))?;
    stdin.write_all(format!("{call_req}\n").as_bytes()).await?;
    stdin.flush().await?;
    // Closing stdin signals end-of-input to the child.
    drop(stdin);
    // Read kv_set response (best-effort; don't block indefinitely).
    let _ = tokio::time::timeout(std::time::Duration::from_secs(5), async {
        let mut resp_line = String::new();
        let _ = reader.read_line(&mut resp_line).await;
    })
    .await;
    let _ = child.kill().await;
    Ok(())
}
/// Derive a short, stable key from a URL using the first 16 hex chars of
/// its SHA-256 hash.
fn url_key(url: &str) -> String {
use sha2::{Digest, Sha256};
let digest = Sha256::digest(url.as_bytes());
hex::encode(&digest[..8]) // 8 bytes = 16 hex chars
}
/// Local alias for the canonical Tor proxy URL defined in the library crate.
///
/// The `socks5h` scheme routes DNS through the proxy, preventing leaks to the
/// local resolver that would reveal the destination to the ISP. Unit tests
/// below pin both the scheme and the `127.0.0.1:9050` target.
const TOR_PROXY_URL: &str = nab::TOR_PROXY_URL;
/// Build HTTP client with optional proxy and redirect settings.
///
/// Proxy resolution order: explicit `proxy` argument, then `tor` (the
/// canonical Tor SOCKS5 proxy at `127.0.0.1:9050`), then the conventional
/// proxy environment variables. Explicit `proxy` and `tor` are mutually
/// exclusive, with `proxy` winning. If Tor was requested but is unavailable,
/// a warning is emitted and the request proceeds without a proxy.
pub(super) fn build_client(
    no_redirect: bool,
    proxy: Option<&str>,
    tor: bool,
) -> Result<AcceleratedClient> {
    // Uppercase variants are checked before lowercase, HTTPS before HTTP.
    const PROXY_ENV_VARS: [&str; 6] = [
        "HTTPS_PROXY",
        "HTTP_PROXY",
        "ALL_PROXY",
        "https_proxy",
        "http_proxy",
        "all_proxy",
    ];
    let resolved = proxy
        .map(String::from)
        .or_else(|| tor.then(|| TOR_PROXY_URL.to_owned()))
        .or_else(|| PROXY_ENV_VARS.iter().find_map(|var| std::env::var(var).ok()));
    if let Some(proxy_url) = resolved.as_deref() {
        match build_client_with_proxy(proxy_url, no_redirect) {
            Ok(client) => return Ok(client),
            Err(e) if tor && proxy.is_none() => {
                // Tor was requested but the daemon is not running; warn and fall
                // back to a direct connection so the caller still gets a result.
                eprintln!("ā ļø Tor proxy unavailable ({e:#}); falling back to direct connection");
            }
            Err(e) => return Err(e),
        }
    }
    if no_redirect {
        AcceleratedClient::new_no_redirect()
    } else {
        AcceleratedClient::new()
    }
}
/// Build a `reqwest` client that routes all traffic through the given proxy URL.
///
/// Fails when the proxy URL does not parse; the redirect policy is disabled
/// when `no_redirect` is set.
fn build_client_with_proxy(proxy_url: &str, no_redirect: bool) -> Result<AcceleratedClient> {
    let proxy = reqwest::Proxy::all(proxy_url)
        .map_err(|e| anyhow::anyhow!("Invalid proxy URL '{proxy_url}': {e}"))?;
    let builder = if no_redirect {
        reqwest::Client::builder()
            .proxy(proxy)
            .redirect(reqwest::redirect::Policy::none())
    } else {
        reqwest::Client::builder().proxy(proxy)
    };
    AcceleratedClient::from_client(builder.build()?)
}
// Re-export from mod.rs for internal use.
pub(super) use super::non_empty;
#[cfg(test)]
mod tests {
    use super::{TOR_PROXY_URL, build_client, build_fetch_diagnostics};
    // A 429 whose body looks like a vendor challenge page must be classified
    // as a bot challenge, not plain rate limiting.
    #[test]
    fn build_fetch_diagnostics_for_bot_challenge_mentions_cookies() {
        let warning = build_fetch_diagnostics(
            429,
            "<html><body>Vercel Security Checkpoint</body></html>",
            Some("text/html"),
            0,
            "",
            None,
            true,
        )
        .into_iter()
        .next()
        .expect("expected challenge warning");
        assert!(
            warning.contains("challenge"),
            "warning should mention challenge, got: {warning}"
        );
        assert!(
            warning.contains("--cookies"),
            "warning should suggest --cookies workaround, got: {warning}"
        );
    }
    // A plain 429 rate-limit body should yield the generic rate-limit message.
    #[test]
    fn build_fetch_diagnostics_for_rate_limit_is_not_bot_specific() {
        let warning = build_fetch_diagnostics(
            429,
            "Rate limit exceeded. Please slow down.",
            Some("text/html"),
            0,
            "",
            None,
            true,
        )
        .into_iter()
        .next()
        .expect("expected rate-limit warning");
        assert!(
            warning.contains("Rate limiting"),
            "warning should mention rate limiting, got: {warning}"
        );
    }
    // A bare 401 must be reported as requiring authentication, and must not
    // be conflated with login-page detection.
    #[test]
    fn build_fetch_diagnostics_for_http_401_mentions_authenticated_access() {
        let warning =
            build_fetch_diagnostics(401, "Unauthorized", Some("text/html"), 0, "", None, true)
                .into_iter()
                .next()
                .expect("expected unauthorized warning");
        assert!(
            warning.contains("Authenticated access appears to be required"),
            "warning should mention authenticated access, got: {warning}"
        );
        assert!(
            !warning.contains("login page"),
            "warning should not pretend a bare 401 is a login page, got: {warning}"
        );
    }
    // Thin extraction (100 chars from 20 KB of HTML) with the Jina fallback
    // disabled must produce both the thin-content and the no-fallback hints.
    #[test]
    fn build_fetch_diagnostics_for_thin_content_includes_no_fallback_hint() {
        let thin_markdown = "x".repeat(100);
        let warnings = build_fetch_diagnostics(
            200,
            "<html></html>",
            Some("text/html"),
            20_000,
            &thin_markdown,
            None,
            false,
        );
        assert!(
            warnings
                .iter()
                .any(|warning| warning.contains("suspiciously thin")),
            "expected thin-content warning, got: {warnings:?}"
        );
        assert!(
            warnings
                .iter()
                .any(|warning| warning.contains("--no-fallback")),
            "expected no-fallback hint, got: {warnings:?}"
        );
    }
    // Base64-looking output extracted from large HTML should trigger the
    // obfuscated-content diagnostic, which mentions paywalled pages.
    #[test]
    fn build_fetch_diagnostics_for_obfuscated_content_mentions_paywall_behavior() {
        let blob = format!("Title: Protected article\n\n{}", "AbC123+/".repeat(700));
        let warning = build_fetch_diagnostics(
            200,
            "<html><body><script>protected payload</script></body></html>",
            Some("text/html"),
            40_000,
            &blob,
            None,
            true,
        )
        .into_iter()
        .next()
        .expect("expected obfuscated-content warning");
        assert!(
            warning.contains("encoded or obfuscated"),
            "warning should explain the blob-like output, got: {warning}"
        );
        assert!(
            warning.contains("paywalled"),
            "warning should mention protected/paywalled pages, got: {warning}"
        );
    }
    // āā Tor / proxy routing tests āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
    #[test]
    fn tor_proxy_url_uses_socks5h_scheme() {
        // GIVEN: the canonical Tor proxy constant
        // WHEN: we inspect its scheme prefix
        // THEN: it uses `socks5h` (DNS via proxy) not plain `socks5`
        assert!(
            TOR_PROXY_URL.starts_with("socks5h://"),
            "Tor proxy must use socks5h:// for DNS-via-proxy; got: {TOR_PROXY_URL}"
        );
    }
    #[test]
    fn tor_proxy_url_targets_localhost_9050() {
        // GIVEN: the canonical Tor proxy constant
        // WHEN: we inspect the host and port
        // THEN: it targets the standard Tor daemon address
        assert!(
            TOR_PROXY_URL.contains("127.0.0.1:9050"),
            "Tor proxy must target 127.0.0.1:9050; got: {TOR_PROXY_URL}"
        );
    }
    #[test]
    fn build_client_without_tor_succeeds() {
        // GIVEN: no proxy, no Tor flag
        // WHEN: we build a client
        // THEN: it succeeds and uses the default configuration
        let result = build_client(false, None, false);
        assert!(
            result.is_ok(),
            "build_client(no_redirect=false, proxy=None, tor=false) must succeed"
        );
    }
    #[test]
    fn build_client_with_explicit_proxy_takes_precedence_over_tor() {
        // GIVEN: an explicit HTTP proxy and tor=true
        // WHEN: we build the client
        // THEN: the explicit proxy is used (no error from bad SOCKS5h address)
        // The proxy parse step validates URL syntax only; a real TCP
        // connection is not made, so this succeeds even without a proxy daemon.
        let result = build_client(false, Some("http://127.0.0.1:8080"), true);
        assert!(
            result.is_ok(),
            "explicit proxy must override tor flag without error"
        );
    }
    #[test]
    fn build_client_with_invalid_proxy_url_returns_error() {
        // GIVEN: a syntactically invalid proxy URL
        // WHEN: we build the client
        // THEN: an error is returned (not a panic)
        let result = build_client(false, Some("not-a-valid-url:::"), false);
        assert!(
            result.is_err(),
            "invalid proxy URL must return an error, not panic"
        );
    }
}