Skip to main content

websearch/
lib.rs

//! Zero-infrastructure web search via DuckDuckGo Lite scraping.
//!
//! No API key, no backend. Results reuse the same reference-style URL
//! preservation as the fetch path: each hit's title carries an inline `[N]`
//! marker and the full URLs are collected into a reference block, keeping the
//! context window tight while staying citable.
7
// Shared primitives from webfetch-core; re-exported so internal modules can
// keep using `crate::compress` / `crate::refs` / `crate::tls`.
10pub use webfetch_core::{compress, refs, tls};
11
12pub mod extract;
13pub mod types;
14
15use std::time::Duration;
16
17use reqwest::Client;
18
19use crate::compress::estimate_tokens;
20use types::{Reference, SearchOptions, SearchOutput, SearchResult};
21
/// Browser-style `User-Agent` header value attached to every request issued
/// by `attempt`.
22const USER_AGENT: &str =
23    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";
/// Upper bound on how many times `fetch_ddg_lite` tries the request before
/// returning the last error.
24const MAX_ATTEMPTS: u32 = 3;
25
26/// Fetch the raw DuckDuckGo Lite results page for a query, retrying transient
27/// failures (connection/timeout, 5xx, 429) with exponential backoff.
28pub async fn fetch_ddg_lite(query: &str, options: &SearchOptions) -> anyhow::Result<String> {
29    let builder = Client::builder()
30        .timeout(Duration::from_secs(options.timeout_secs))
31        .gzip(true);
32    // Trust the OS store (+ SSL_CERT_FILE / --ca-cert) so the request succeeds
33    // behind a TLS-intercepting proxy, not just with the bundled webpki roots.
34    let client = options.tls.apply(builder)?.build()?;
35
36    let mut url = format!(
37        "https://lite.duckduckgo.com/lite/?q={}",
38        urlencoding::encode(query)
39    );
40    // DDG safe-search toggle: kp=1 strict, kp=-1 off.
41    if let Some(safe) = options.safe_search {
42        url.push_str(if safe { "&kp=1" } else { "&kp=-1" });
43    }
44
45    let mut delay = Duration::from_millis(200);
46    for attempt_no in 1..=MAX_ATTEMPTS {
47        match attempt(&client, &url).await {
48            Ok(body) => return Ok(body),
49            Err((err, transient)) => {
50                if attempt_no == MAX_ATTEMPTS || !transient {
51                    return Err(err);
52                }
53                tokio::time::sleep(delay).await;
54                delay *= 2;
55            }
56        }
57    }
58    unreachable!("loop returns on the final attempt")
59}
60
61/// One request attempt; the bool reports whether a failure is worth retrying.
62async fn attempt(client: &Client, url: &str) -> Result<String, (anyhow::Error, bool)> {
63    let resp = match client
64        .get(url)
65        .header("User-Agent", USER_AGENT)
66        .send()
67        .await
68    {
69        Ok(r) => r,
70        Err(e) => {
71            let transient = e.is_timeout() || e.is_connect() || e.is_request();
72            return Err((e.into(), transient));
73        }
74    };
75    let status = resp.status();
76    let resp = match resp.error_for_status() {
77        Ok(r) => r,
78        Err(e) => {
79            let transient = status.is_server_error() || status.as_u16() == 429;
80            return Err((e.into(), transient));
81        }
82    };
83    match resp.text().await {
84        Ok(body) => Ok(body),
85        Err(e) => {
86            let transient = e.is_timeout();
87            Err((e.into(), transient))
88        }
89    }
90}
91
92/// Build the reference block (index → URL) from parsed results.
93pub fn build_refs(results: &[SearchResult]) -> Vec<Reference> {
94    results
95        .iter()
96        .map(|r| Reference {
97            index: r.ref_index,
98            url: r.url.clone(),
99        })
100        .collect()
101}
102
103/// Render the inline body: each result as `title [N]` followed by its snippet.
104/// URLs are intentionally absent here — they live in the reference block.
105pub fn format_results(results: &[SearchResult]) -> String {
106    results
107        .iter()
108        .map(|r| {
109            if r.snippet.is_empty() {
110                format!("{} [{}]", r.title, r.ref_index)
111            } else {
112                format!("{} [{}]\n{}", r.title, r.ref_index, r.snippet)
113            }
114        })
115        .collect::<Vec<_>>()
116        .join("\n\n")
117}
118
119/// Render the reference block appended to text output.
120/// Thin wrapper over [`crate::refs::render_block`].
121pub fn render_references(refs: &[Reference]) -> String {
122    crate::refs::render_block(refs)
123}
124
125/// Parse an already-fetched results page into a [`SearchOutput`] (no network).
126pub fn build_output(query: &str, html: &str, max_results: usize) -> SearchOutput {
127    let results = extract::parse_ddg_lite(html, max_results);
128    let references = build_refs(&results);
129
130    let body = format_results(&results);
131    let refs_block = render_references(&references);
132    let full = if refs_block.is_empty() {
133        body
134    } else {
135        format!("{body}\n\n{refs_block}")
136    };
137
138    SearchOutput {
139        query: query.to_string(),
140        token_estimate: estimate_tokens(&full),
141        result_count: results.len(),
142        references,
143        results,
144    }
145}
146
147/// Fetch and parse a query end to end.
148pub async fn run_search(options: SearchOptions) -> anyhow::Result<SearchOutput> {
149    let html = fetch_ddg_lite(&options.query, &options).await?;
150    let max = options.max_results.unwrap_or(5);
151    Ok(build_output(&options.query, &html, max))
152}