use crate::error::CliError;
use crate::extraction;
use crate::types::{Config, Endpoint, SafeSearch, SearchResult, TimeFilter};
use rand::Rng;
use reqwest::{Client, Response, StatusCode};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
// Default endpoint URLs, used when the corresponding override variable below is not readable.
const URL_ENDPOINT_HTML_DEFAULT: &str = "https://html.duckduckgo.com/html/";
const URL_ENDPOINT_LITE_DEFAULT: &str = "https://lite.duckduckgo.com/lite/";
// Names of the environment variables that replace the default endpoint base URLs.
const ENV_BASE_URL_HTML: &str = "DUCKDUCKGO_SEARCH_CLI_BASE_URL_HTML";
const ENV_BASE_URL_LITE: &str = "DUCKDUCKGO_SEARCH_CLI_BASE_URL_LITE";
// Inclusive range (milliseconds) of the random pause taken before each additional page request.
const PAGINATION_DELAY_MIN_MS: u64 = 800;
const PAGINATION_DELAY_MAX_MS: u64 = 1500;
// Response bodies shorter than this many bytes are treated as a possible silent block.
const SILENT_BLOCK_THRESHOLD: usize = 5_000;
// Exponential backoff: base delay (doubled per attempt) and the maximum random jitter added on top.
const BACKOFF_BASE_MS: u64 = 1000;
const BACKOFF_JITTER_MAX_MS: u64 = 500;
/// Returns the backoff delay in milliseconds for a zero-based `attempt`.
///
/// The delay is `BACKOFF_BASE_MS * 2^attempt`, with the exponent capped at 10,
/// plus a random jitter in `0..=BACKOFF_JITTER_MAX_MS`. All arithmetic saturates.
fn calculate_backoff_ms(attempt: u32) -> u64 {
    // Cap the exponent so the shift can never exceed the width of `u64`.
    let exponent = attempt.min(10);
    let multiplier = 1u64 << exponent;
    let jitter_ms = rand::thread_rng().gen_range(0..=BACKOFF_JITTER_MAX_MS);
    BACKOFF_BASE_MS
        .saturating_mul(multiplier)
        .saturating_add(jitter_ms)
}
/// Reads the `retry-after` header of `response` as a delay in milliseconds.
///
/// Only the integer-seconds form is understood; the value is capped at 120 seconds.
/// Returns `None` when the header is missing, is not valid text, or does not parse
/// as an unsigned integer.
fn parse_retry_after(response: &Response) -> Option<u64> {
    let header = response.headers().get("retry-after")?;
    let text = header.to_str().ok()?;
    text.parse::<u64>()
        .ok()
        .map(|seconds| seconds.min(120) * 1000)
}
/// Base URL of the HTML endpoint: the value of the `ENV_BASE_URL_HTML` environment
/// variable when it can be read, otherwise `URL_ENDPOINT_HTML_DEFAULT`.
pub fn html_base_url() -> String {
    match std::env::var(ENV_BASE_URL_HTML) {
        Ok(overridden) => overridden,
        Err(_) => URL_ENDPOINT_HTML_DEFAULT.to_string(),
    }
}
/// Base URL of the Lite endpoint: the value of the `ENV_BASE_URL_LITE` environment
/// variable when it can be read, otherwise `URL_ENDPOINT_LITE_DEFAULT`.
pub fn lite_base_url() -> String {
    match std::env::var(ENV_BASE_URL_LITE) {
        Ok(overridden) => overridden,
        Err(_) => URL_ENDPOINT_LITE_DEFAULT.to_string(),
    }
}
/// Builds the GET URL for a search against the chosen `endpoint`.
///
/// The result has the shape `<base>?q=<query>&kl=<country>-<language>`, followed by
/// `&kp=<value>` when `safe_search.as_param()` yields a value and `&df=<value>` when a
/// `time_filter` is given.
///
/// Both the query and the `kl` region code are percent-encoded, so reserved characters
/// in caller-supplied `language`/`country` cannot break or inject query parameters.
/// Ordinary codes (ASCII letters, digits, `-`) are emitted unchanged.
pub fn build_search_url(
    query: &str,
    language: &str,
    country: &str,
    endpoint: Endpoint,
    time_filter: Option<TimeFilter>,
    safe_search: SafeSearch,
) -> String {
    let base = match endpoint {
        Endpoint::Html => html_base_url(),
        Endpoint::Lite => lite_base_url(),
    };
    let query_encoded = urlencoding::encode(query);
    let kl = format_kl(language, country);
    // `kl` comes from user configuration; encode it like any other parameter value.
    let kl_encoded = urlencoding::encode(&kl);
    // The +32 leaves room for the fixed separators and the optional kp/df parameters.
    let mut url =
        String::with_capacity(base.len() + query_encoded.len() + kl_encoded.len() + 32);
    url.push_str(&base);
    url.push_str("?q=");
    url.push_str(&query_encoded);
    url.push_str("&kl=");
    url.push_str(&kl_encoded);
    if let Some(kp) = safe_search.as_param() {
        url.push_str("&kp=");
        url.push_str(kp);
    }
    if let Some(df) = time_filter {
        url.push_str("&df=");
        url.push_str(df.as_param());
    }
    url
}
pub fn build_url(query: &str, language: &str, country: &str) -> String {
build_search_url(
query,
language,
country,
Endpoint::Html,
None,
SafeSearch::Moderate,
)
}
/// Formats the `kl` region code as `<country>-<language>` with ASCII letters lowercased.
///
/// Non-ASCII characters are passed through unchanged.
#[inline]
pub fn format_kl(language: &str, country: &str) -> String {
    let mut region = String::with_capacity(country.len() + language.len() + 1);
    region.push_str(country);
    region.push('-');
    region.push_str(language);
    // Lowercase in place; only ASCII `A-Z` bytes are affected.
    region.make_ascii_lowercase();
    region
}
/// Reason a request (or a search built on it) ultimately failed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RetryFailReason {
/// HTTP 429 was still returned on the last allowed attempt.
RateLimited,
/// HTTP 403 or HTTP 202 on the last allowed attempt, or a first page whose body was
/// smaller than the silent-block threshold.
Blocked,
/// Any other non-success status; carries the numeric status code.
HttpError(u16),
/// The request timed out and no further timeout retry was allowed.
Timeout,
/// Any other transport failure, carrying its text; also used for cancellation and
/// for failures while reading a response body.
Network(String),
}
impl RetryFailReason {
pub fn as_error_code(&self) -> &'static str {
match self {
RetryFailReason::RateLimited => crate::error::codes::RATE_LIMITED,
RetryFailReason::Blocked => crate::error::codes::BLOCKED,
RetryFailReason::HttpError(_) => crate::error::codes::HTTP_ERROR,
RetryFailReason::Timeout => crate::error::codes::TIMEOUT,
RetryFailReason::Network(_) => crate::error::codes::NETWORK_ERROR,
}
}
pub fn message(&self) -> String {
match self {
RetryFailReason::RateLimited => "persistent rate limit (HTTP 429)".to_string(),
RetryFailReason::Blocked => "blocked by DuckDuckGo (HTTP 403)".to_string(),
RetryFailReason::HttpError(status) => format!("HTTP {status} unrecoverable"),
RetryFailReason::Timeout => "persistent timeout".to_string(),
RetryFailReason::Network(msg) => format!("network error: {msg}"),
}
}
}
/// Successful outcome of `execute_with_retry`.
#[derive(Debug)]
pub struct RetryResult {
/// The response that came back with a success status.
pub response: Response,
/// Number of attempts made, including the successful one (1-based).
pub attempts: u32,
}
/// Sends a GET request to `url`, retrying up to `retries` additional times.
///
/// Retry policy per outcome:
/// - HTTP 202: treated as a soft block; sets `flag_rate_limit`, then exponential backoff.
/// - Other 2xx: returned as success together with the number of attempts used.
/// - HTTP 429: sets `flag_rate_limit`; waits for the `retry-after` delay when it parses,
///   otherwise exponential backoff.
/// - HTTP 403: retried immediately, without a delay.
/// - Any other status: fails at once with [`RetryFailReason::HttpError`].
/// - Timeout: retried at most once over the whole call.
/// - Other transport errors: retried after a fixed 400 ms pause.
///
/// When `flag_rate_limit` is already set, the first attempt is preceded by a random
/// 500–1199 ms pause.
///
/// Every pause is raced against `cancellation`, so a cancelled token ends the call
/// promptly instead of waiting out a backoff that can last up to two minutes.
///
/// # Errors
///
/// Returns the [`RetryFailReason`] of the last failed attempt, or
/// `RetryFailReason::Network` with a "cancelled" message when `cancellation` fires.
#[tracing::instrument(skip_all, fields(%url, max_attempts = retries.saturating_add(1)))]
pub async fn execute_with_retry(
    client: &Client,
    url: &str,
    retries: u32,
    flag_rate_limit: &Arc<AtomicBool>,
    cancellation: &CancellationToken,
) -> std::result::Result<RetryResult, RetryFailReason> {
    let total_attempts = retries.saturating_add(1);
    let mut last_reason = RetryFailReason::Network("no attempts executed".to_string());
    let mut timeout_already_retried = false;
    for attempt in 0..total_attempts {
        // Also catches a cancellation that interrupted the previous backoff pause.
        if cancellation.is_cancelled() {
            return Err(RetryFailReason::Network("cancelled".to_string()));
        }
        if flag_rate_limit.load(Ordering::Relaxed) && attempt == 0 {
            let extra_ms = rand::thread_rng().gen_range(500..1200);
            tracing::debug!(
                extra_ms,
                "global rate-limit flag active — waiting before retry attempt"
            );
            // If cancelled here, the biased select below reports it immediately.
            sleep_unless_cancelled(Duration::from_millis(extra_ms), cancellation).await;
        }
        tracing::debug!(attempt = attempt + 1, total = total_attempts, url = %url, "executing GET request");
        let envio = tokio::select! {
            biased;
            _ = cancellation.cancelled() => {
                return Err(RetryFailReason::Network("cancelled during request".to_string()));
            }
            res = client.get(url).send() => res,
        };
        match envio {
            Ok(response) => {
                let status = response.status();
                // 202 is a 2xx status, so it must be checked before `is_success()`.
                if status == StatusCode::ACCEPTED {
                    flag_rate_limit.store(true, Ordering::Relaxed);
                    last_reason = RetryFailReason::Blocked;
                    if attempt + 1 < total_attempts {
                        let total = calculate_backoff_ms(attempt);
                        tracing::warn!(
                            attempt = attempt + 1,
                            backoff_ms = total,
                            "HTTP 202 anomaly — DDG soft block, applying backoff"
                        );
                        sleep_unless_cancelled(Duration::from_millis(total), cancellation).await;
                        continue;
                    }
                    return Err(RetryFailReason::Blocked);
                }
                if status.is_success() {
                    return Ok(RetryResult {
                        response,
                        attempts: attempt + 1,
                    });
                }
                if status == StatusCode::TOO_MANY_REQUESTS {
                    flag_rate_limit.store(true, Ordering::Relaxed);
                    last_reason = RetryFailReason::RateLimited;
                    if attempt + 1 < total_attempts {
                        // Prefer the server-provided delay over our own backoff.
                        let delay_ms = parse_retry_after(&response)
                            .unwrap_or_else(|| calculate_backoff_ms(attempt));
                        tracing::warn!(
                            attempt = attempt + 1,
                            backoff_ms = delay_ms,
                            "HTTP 429 — applying backoff"
                        );
                        sleep_unless_cancelled(Duration::from_millis(delay_ms), cancellation)
                            .await;
                        continue;
                    }
                    return Err(RetryFailReason::RateLimited);
                }
                if status == StatusCode::FORBIDDEN {
                    last_reason = RetryFailReason::Blocked;
                    if attempt + 1 < total_attempts {
                        tracing::warn!(
                            attempt = attempt + 1,
                            "HTTP 403 — immediate retry (UA rotation applied on next client)"
                        );
                        continue;
                    }
                    return Err(RetryFailReason::Blocked);
                }
                // Every other status is not retried.
                return Err(RetryFailReason::HttpError(status.as_u16()));
            }
            Err(err) => {
                if err.is_timeout() {
                    last_reason = RetryFailReason::Timeout;
                    if !timeout_already_retried && attempt + 1 < total_attempts {
                        timeout_already_retried = true;
                        tracing::warn!("timeout — 1 retry allowed");
                        continue;
                    }
                    return Err(RetryFailReason::Timeout);
                }
                last_reason = RetryFailReason::Network(err.to_string());
                if attempt + 1 < total_attempts {
                    let backoff = Duration::from_millis(400);
                    sleep_unless_cancelled(backoff, cancellation).await;
                    continue;
                }
                return Err(last_reason);
            }
        }
    }
    Err(last_reason)
}

/// Sleeps for `delay`, returning early as soon as `cancellation` fires.
async fn sleep_unless_cancelled(delay: Duration, cancellation: &CancellationToken) {
    tokio::select! {
        biased;
        _ = cancellation.cancelled() => {}
        _ = tokio::time::sleep(delay) => {}
    }
}
/// Performs a single GET against the HTML endpoint and returns the response body.
///
/// No retry is attempted here.
///
/// # Errors
///
/// Returns `CliError::HttpError` when the request cannot be sent, the status is not a
/// success, the body cannot be read, or the body is shorter than
/// `SILENT_BLOCK_THRESHOLD` bytes (treated as a possible silent block).
pub async fn execute_search(
    client: &Client,
    query: &str,
    idioma: &str,
    pais: &str,
) -> Result<String, CliError> {
    let url = build_url(query, idioma, pais);
    tracing::debug!(url = %url, "Sending GET to the DuckDuckGo HTML endpoint");

    let response = match client.get(&url).send().await {
        Ok(resp) => resp,
        Err(e) => {
            return Err(CliError::HttpError {
                message: format!("failed to send GET to {url}: {e}"),
                cause: Some(e.into()),
            });
        }
    };

    let status = response.status();
    tracing::debug!(status = %status, "HTTP response received");
    if !status.is_success() {
        let message = format!(
            "DuckDuckGo returned HTTP {} for {:?}",
            status.as_u16(),
            query
        );
        return Err(CliError::HttpError {
            message,
            cause: None,
        });
    }

    let html = match response.text().await {
        Ok(body) => body,
        Err(e) => {
            return Err(CliError::HttpError {
                message: format!("failed to read UTF-8 response body: {e}"),
                cause: Some(e.into()),
            });
        }
    };

    let body_len = html.len();
    if body_len >= SILENT_BLOCK_THRESHOLD {
        tracing::debug!(bytes = body_len, "HTML received successfully");
        return Ok(html);
    }

    // A very small body is reported as an error rather than parsed.
    tracing::warn!(
        bytes = body_len,
        limiar = SILENT_BLOCK_THRESHOLD,
        "suspiciously small response — possible silent block"
    );
    Err(CliError::HttpError {
        message: format!(
            "suspiciously small response ({} bytes < {} threshold) — possible silent block",
            body_len, SILENT_BLOCK_THRESHOLD
        ),
        cause: None,
    })
}
/// Combined outcome of `search_with_pagination`.
#[derive(Debug)]
pub struct AggregatedSearchResult {
/// Results gathered across all fetched pages, truncated to the configured maximum when one is set.
pub results: Vec<SearchResult>,
/// Number of the last page that contributed results; starts at 1 for the first page.
pub pages_fetched: u32,
/// `true` when the HTML endpoint returned nothing and the Lite endpoint supplied the results.
pub used_fallback_lite: bool,
/// Attempts spent on the first-page request plus the Lite fallback request, if one succeeded.
/// Pagination requests are not counted.
pub attempts: u32,
/// Endpoint that produced the returned results.
pub effective_endpoint: Endpoint,
}
pub fn extract_pagination_tokens(html: &str) -> Option<(String, String, String)> {
use scraper::Html;
let doc = Html::parse_document(html);
let vqd = doc
.select(sel_vqd())
.next()
.and_then(|el| el.value().attr("value"))
.map(|v| v.to_string())?;
let s = doc
.select(sel_s_input())
.next()
.and_then(|el| el.value().attr("value"))
.map(|v| v.to_string())?;
let dc = doc
.select(sel_dc())
.next()
.and_then(|el| el.value().attr("value"))
.map(|v| v.to_string())?;
Some((vqd, s, dc))
}
/// Selector for `<input name='vqd'>` elements, parsed once on first use and then cached.
fn sel_vqd() -> &'static scraper::Selector {
    static SELECTOR: std::sync::OnceLock<scraper::Selector> = std::sync::OnceLock::new();
    SELECTOR.get_or_init(|| {
        let parsed = scraper::Selector::parse("input[name='vqd']");
        parsed.unwrap()
    })
}
/// Selector for `<input name='s'>` elements, parsed once on first use and then cached.
fn sel_s_input() -> &'static scraper::Selector {
    static SELECTOR: std::sync::OnceLock<scraper::Selector> = std::sync::OnceLock::new();
    SELECTOR.get_or_init(|| {
        let parsed = scraper::Selector::parse("input[name='s']");
        parsed.unwrap()
    })
}
/// Selector for `<input name='dc'>` elements, parsed once on first use and then cached.
fn sel_dc() -> &'static scraper::Selector {
    static SELECTOR: std::sync::OnceLock<scraper::Selector> = std::sync::OnceLock::new();
    SELECTOR.get_or_init(|| {
        let parsed = scraper::Selector::parse("input[name='dc']");
        parsed.unwrap()
    })
}
/// Runs a search for `query` and aggregates results over up to `cfg.pages` pages.
///
/// Steps:
/// 1. Fetch the first page from `cfg.endpoint` through `execute_with_retry`.
/// 2. If the HTML endpoint yields zero results, try the Lite endpoint once as a fallback.
/// 3. If results still come from the HTML endpoint and more than one page is requested,
///    follow pagination with form POSTs carrying the `vqd`/`s`/`dc` tokens taken from
///    the previous page.
/// 4. Truncate to `cfg.num_results` when it is set.
///
/// Any failure during pagination only stops the loop; the results collected so far are
/// still returned.
///
/// # Errors
///
/// - The `RetryFailReason` of the first-page request when it fails.
/// - `RetryFailReason::Network` when the first-page body, or the body of a successful
///   Lite fallback response, cannot be read.
/// - `RetryFailReason::Blocked` when the first-page body is shorter than
///   `SILENT_BLOCK_THRESHOLD` bytes.
pub async fn search_with_pagination(
client: &Client,
cfg: &Config,
query: &str,
flag_rate_limit: &Arc<AtomicBool>,
cancellation: &CancellationToken,
) -> std::result::Result<AggregatedSearchResult, RetryFailReason> {
let initial_endpoint = cfg.endpoint;
let initial_url = build_search_url(
query,
&cfg.language,
&cfg.country,
initial_endpoint,
cfg.time_filter,
cfg.safe_search,
);
// First page: a failure here aborts the whole search.
let first_result = execute_with_retry(
client,
&initial_url,
cfg.retries,
flag_rate_limit,
cancellation,
)
.await?;
let mut accumulated_attempts = first_result.attempts;
let first_html = first_result
.response
.text()
.await
.map_err(|e| RetryFailReason::Network(e.to_string()))?;
// A very small body is treated as a block rather than parsed.
if first_html.len() < SILENT_BLOCK_THRESHOLD {
tracing::warn!(
bytes = first_html.len(),
limiar = SILENT_BLOCK_THRESHOLD,
"first page response suspiciously small — possible silent block"
);
return Err(RetryFailReason::Blocked);
}
// Each endpoint has its own extraction routine.
let mut accumulated_results = match initial_endpoint {
Endpoint::Html => {
extraction::extract_results_with_strategies_cfg(&first_html, &cfg.selectors)
}
Endpoint::Lite => extraction::extract_results_lite_with_cfg(&first_html, &cfg.selectors),
};
let mut used_fallback_lite = false;
let mut effective_endpoint = initial_endpoint;
let mut pages_fetched: u32 = 1;
// Lite fallback: only when the HTML endpoint parsed to zero results.
if accumulated_results.is_empty() && initial_endpoint == Endpoint::Html {
tracing::warn!("HTML returned zero results — trying Lite fallback");
let url_lite = build_search_url(
query,
&cfg.language,
&cfg.country,
Endpoint::Lite,
cfg.time_filter,
cfg.safe_search,
);
match execute_with_retry(
client,
&url_lite,
cfg.retries,
flag_rate_limit,
cancellation,
)
.await
{
Ok(r_lite) => {
accumulated_attempts = accumulated_attempts.saturating_add(r_lite.attempts);
// NOTE(review): unlike a failed fallback request, a body-read error here aborts
// the whole search via `?` — confirm this asymmetry is intended.
let html_lite = r_lite
.response
.text()
.await
.map_err(|e| RetryFailReason::Network(e.to_string()))?;
let lite_results =
extraction::extract_results_lite_with_cfg(&html_lite, &cfg.selectors);
// Switch to Lite only when it actually produced results.
if !lite_results.is_empty() {
accumulated_results = lite_results;
used_fallback_lite = true;
effective_endpoint = Endpoint::Lite;
}
}
Err(err) => {
// A failed fallback request is not fatal: the (empty) HTML result is kept.
tracing::warn!(?err, "Lite fallback also failed — keeping empty");
}
}
}
// Pagination is only attempted for the HTML endpoint and only when page 1 had results.
if effective_endpoint == Endpoint::Html && cfg.pages > 1 && !accumulated_results.is_empty() {
if let Some((mut vqd, mut s, mut dc)) = extract_pagination_tokens(&first_html) {
// Form fields are positional: index 1 = "s", index 5 = "dc", index 7 = "vqd".
// Those three entries are refreshed in place before every request below.
let mut form_data: Vec<(&str, String)> = vec![
("q", query.to_string()), ("s", s.clone()), ("nextParams", String::new()), ("v", "l".to_string()), ("o", "json".to_string()), ("dc", dc.clone()), ("api", "d.js".to_string()), ("vqd", vqd.clone()), ("kl", format_kl(&cfg.language, &cfg.country)), ];
for page_idx in 2..=cfg.pages {
if cancellation.is_cancelled() {
tracing::debug!("cancellation detected during pagination");
break;
}
// Random pause between page requests; cancellation interrupts the wait.
let delay_ms =
rand::thread_rng().gen_range(PAGINATION_DELAY_MIN_MS..=PAGINATION_DELAY_MAX_MS);
tokio::select! {
biased;
_ = cancellation.cancelled() => { break; }
_ = tokio::time::sleep(Duration::from_millis(delay_ms)) => {}
}
// Copy the latest tokens into the form, reusing the existing String buffers.
form_data[1].1.clone_from(&s);
form_data[5].1.clone_from(&dc);
form_data[7].1.clone_from(&vqd);
let base = html_base_url();
// Pagination requests are sent once, without the retry policy.
let response = match tokio::select! {
biased;
_ = cancellation.cancelled() => {
break;
}
r = client
.post(&base)
.header(reqwest::header::REFERER, "https://html.duckduckgo.com/")
.headers(cfg.browser_profile.pagination_headers())
.form(&form_data)
.send() => r,
} {
Ok(r) => r,
Err(err) => {
tracing::warn!(
?err,
pagina = page_idx,
"network error during pagination — stopping"
);
break;
}
};
if !response.status().is_success() {
tracing::warn!(
status = response.status().as_u16(),
pagina = page_idx,
"pagination returned non-success status — stopping"
);
break;
}
let page_html = match response.text().await {
Ok(t) => t,
Err(e) => {
tracing::warn!(?e, "error reading page body — stopping");
break;
}
};
if page_html.len() < SILENT_BLOCK_THRESHOLD {
tracing::warn!(
bytes = page_html.len(),
limiar = SILENT_BLOCK_THRESHOLD,
pagina = page_idx,
"pagination page suspiciously small — possible silent block"
);
break;
}
let new_results =
extraction::extract_results_with_strategies_cfg(&page_html, &cfg.selectors);
if new_results.is_empty() {
tracing::debug!(pagina = page_idx, "page returned zero results — stopping");
break;
}
// Shift the new results' positions by the number of results already collected.
let offset = u32::try_from(accumulated_results.len()).unwrap_or(u32::MAX);
for mut r in new_results {
r.position = offset.saturating_add(r.position);
accumulated_results.push(r);
}
pages_fetched = page_idx;
// Tokens for the next request come from the page just fetched.
match extract_pagination_tokens(&page_html) {
Some((next_vqd, next_s, next_dc)) => {
vqd = next_vqd;
s = next_s;
dc = next_dc;
}
None => {
tracing::warn!(pagina = page_idx, "pagination tokens missing — stopping");
break;
}
}
}
} else {
tracing::warn!("vqd/s/dc tokens missing on first page — pagination not possible");
}
}
// Apply the optional cap on the total number of results.
if let Some(n) = cfg.num_results {
let n_usize = n as usize;
if accumulated_results.len() > n_usize {
accumulated_results.truncate(n_usize);
}
}
Ok(AggregatedSearchResult {
results: accumulated_results,
pages_fetched,
used_fallback_lite,
attempts: accumulated_attempts,
effective_endpoint,
})
}
// Unit tests for URL building, `kl` formatting, pagination-token extraction and
// error-code mapping. None of them performs network I/O.
#[cfg(test)]
mod tests {
use super::*;
// `kl` is "<country>-<language>", lowercased.
#[test]
fn format_kl_concatenates_correctly() {
assert_eq!(format_kl("pt", "br"), "br-pt");
assert_eq!(format_kl("PT", "BR"), "br-pt");
assert_eq!(format_kl("en", "us"), "us-en");
}
// NOTE(review): the `starts_with` assertion assumes the base-URL override environment
// variable is unset while tests run.
#[test]
fn build_url_escapes_spaces_and_accents() {
let url = build_url("endividamento brasileiro", "pt", "br");
assert!(url.starts_with("https://html.duckduckgo.com/html/?q="));
assert!(url.contains("endividamento%20brasileiro"));
assert!(url.contains("&kl=br-pt"));
}
// `+` must be percent-encoded as %2B.
#[test]
fn build_url_escapes_special_characters() {
let url = build_url("C++ tutorial", "en", "us");
assert!(url.contains("C%2B%2B"));
assert!(url.contains("&kl=us-en"));
}
// Non-ASCII characters are encoded as UTF-8 percent-escapes.
#[test]
fn build_url_with_portuguese_accents() {
let url = build_url("música eletrônica", "pt", "br");
assert!(url.contains("m%C3%BAsica"));
assert!(url.contains("eletr%C3%B4nica"));
}
// Strict safe search and a time filter each add their own query parameter.
#[test]
fn build_search_url_adds_optional_params() {
let url = build_search_url(
"rust",
"en",
"us",
Endpoint::Html,
Some(TimeFilter::Week),
SafeSearch::Strict,
);
assert!(url.contains("&kp=1"));
assert!(url.contains("&df=w"));
}
// Moderate safe search and no time filter add no optional parameters.
#[test]
fn build_search_url_omits_kp_when_moderate() {
let url = build_search_url(
"rust",
"en",
"us",
Endpoint::Html,
None,
SafeSearch::Moderate,
);
assert!(!url.contains("&kp="));
assert!(!url.contains("&df="));
}
// NOTE(review): assumes the Lite base-URL override environment variable is unset.
#[test]
fn build_search_url_lite_endpoint_uses_correct_url() {
let url = build_search_url(
"rust",
"en",
"us",
Endpoint::Lite,
None,
SafeSearch::Moderate,
);
assert!(url.starts_with("https://lite.duckduckgo.com/lite/?"));
}
// All three tokens are read from the `value` attributes of the named inputs.
#[test]
fn extract_pagination_tokens_extracts_when_present() {
let html = r#"
<form>
<input name="q" value="rust">
<input name="vqd" value="4-12345678-abc">
<input name="s" value="50">
<input name="dc" value="51">
</form>
"#;
let (vqd, s, dc) = extract_pagination_tokens(html).expect("all present");
assert_eq!(vqd, "4-12345678-abc");
assert_eq!(s, "50");
assert_eq!(dc, "51");
}
// A page without the inputs yields `None`.
#[test]
fn extract_pagination_tokens_returns_none_when_absent() {
let html = r#"<html><body>Sem inputs</body></html>"#;
assert!(extract_pagination_tokens(html).is_none());
}
// Each checked reason maps to its matching constant in `crate::error::codes`.
#[test]
fn retry_fail_reason_returns_correct_error_code() {
assert_eq!(
RetryFailReason::RateLimited.as_error_code(),
crate::error::codes::RATE_LIMITED
);
assert_eq!(
RetryFailReason::Blocked.as_error_code(),
crate::error::codes::BLOCKED
);
assert_eq!(
RetryFailReason::Timeout.as_error_code(),
crate::error::codes::TIMEOUT
);
}
}