Skip to main content

websearch/
lib.rs

1//! Zero-infrastructure web search via DuckDuckGo Lite scraping.
2//!
3//! No API key, no backend. Results reuse the same reference-style URL
4//! preservation as the fetch path: each hit's title carries an inline `[N]`
5//! marker and the full URLs are collected into a reference block, keeping the
6//! context window tight while staying citable.
7
8// Shared primitives from webfetch-core; re-exported so internal modules can
9// keep using `crate::compress` / `crate::refs`.
10pub use webfetch_core::{compress, refs};
11
12pub mod extract;
13pub mod types;
14
15use std::time::Duration;
16
17use reqwest::Client;
18
19use crate::compress::estimate_tokens;
20use types::{Reference, SearchOptions, SearchOutput, SearchResult};
21
/// Browser-like `User-Agent` value sent with every request to DuckDuckGo Lite.
const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (X11; Linux x86_64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/120.0 Safari/537.36",
);

/// Upper bound on request attempts per query (the first try plus retries).
const MAX_ATTEMPTS: u32 = 3;
25
26/// Fetch the raw DuckDuckGo Lite results page for a query, retrying transient
27/// failures (connection/timeout, 5xx, 429) with exponential backoff.
28pub async fn fetch_ddg_lite(query: &str, options: &SearchOptions) -> anyhow::Result<String> {
29    let client = Client::builder()
30        .timeout(Duration::from_secs(options.timeout_secs))
31        .gzip(true)
32        .build()?;
33
34    let mut url = format!(
35        "https://lite.duckduckgo.com/lite/?q={}",
36        urlencoding::encode(query)
37    );
38    // DDG safe-search toggle: kp=1 strict, kp=-1 off.
39    if let Some(safe) = options.safe_search {
40        url.push_str(if safe { "&kp=1" } else { "&kp=-1" });
41    }
42
43    let mut delay = Duration::from_millis(200);
44    for attempt_no in 1..=MAX_ATTEMPTS {
45        match attempt(&client, &url).await {
46            Ok(body) => return Ok(body),
47            Err((err, transient)) => {
48                if attempt_no == MAX_ATTEMPTS || !transient {
49                    return Err(err);
50                }
51                tokio::time::sleep(delay).await;
52                delay *= 2;
53            }
54        }
55    }
56    unreachable!("loop returns on the final attempt")
57}
58
59/// One request attempt; the bool reports whether a failure is worth retrying.
60async fn attempt(client: &Client, url: &str) -> Result<String, (anyhow::Error, bool)> {
61    let resp = match client
62        .get(url)
63        .header("User-Agent", USER_AGENT)
64        .send()
65        .await
66    {
67        Ok(r) => r,
68        Err(e) => {
69            let transient = e.is_timeout() || e.is_connect() || e.is_request();
70            return Err((e.into(), transient));
71        }
72    };
73    let status = resp.status();
74    let resp = match resp.error_for_status() {
75        Ok(r) => r,
76        Err(e) => {
77            let transient = status.is_server_error() || status.as_u16() == 429;
78            return Err((e.into(), transient));
79        }
80    };
81    match resp.text().await {
82        Ok(body) => Ok(body),
83        Err(e) => {
84            let transient = e.is_timeout();
85            Err((e.into(), transient))
86        }
87    }
88}
89
90/// Build the reference block (index → URL) from parsed results.
91pub fn build_refs(results: &[SearchResult]) -> Vec<Reference> {
92    results
93        .iter()
94        .map(|r| Reference {
95            index: r.ref_index,
96            url: r.url.clone(),
97        })
98        .collect()
99}
100
101/// Render the inline body: each result as `title [N]` followed by its snippet.
102/// URLs are intentionally absent here — they live in the reference block.
103pub fn format_results(results: &[SearchResult]) -> String {
104    results
105        .iter()
106        .map(|r| {
107            if r.snippet.is_empty() {
108                format!("{} [{}]", r.title, r.ref_index)
109            } else {
110                format!("{} [{}]\n{}", r.title, r.ref_index, r.snippet)
111            }
112        })
113        .collect::<Vec<_>>()
114        .join("\n\n")
115}
116
117/// Render the reference block appended to text output.
118/// Thin wrapper over [`crate::refs::render_block`].
119pub fn render_references(refs: &[Reference]) -> String {
120    crate::refs::render_block(refs)
121}
122
123/// Parse an already-fetched results page into a [`SearchOutput`] (no network).
124pub fn build_output(query: &str, html: &str, max_results: usize) -> SearchOutput {
125    let results = extract::parse_ddg_lite(html, max_results);
126    let references = build_refs(&results);
127
128    let body = format_results(&results);
129    let refs_block = render_references(&references);
130    let full = if refs_block.is_empty() {
131        body
132    } else {
133        format!("{body}\n\n{refs_block}")
134    };
135
136    SearchOutput {
137        query: query.to_string(),
138        token_estimate: estimate_tokens(&full),
139        result_count: results.len(),
140        references,
141        results,
142    }
143}
144
145/// Fetch and parse a query end to end.
146pub async fn run_search(options: SearchOptions) -> anyhow::Result<SearchOutput> {
147    let html = fetch_ddg_lite(&options.query, &options).await?;
148    let max = options.max_results.unwrap_or(5);
149    Ok(build_output(&options.query, &html, max))
150}