duckduckgo-search-cli 0.7.2

CLI in Rust to search DuckDuckGo via pure HTTP, with structured output for LLM consumption.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
// SPDX-License-Identifier: MIT OR Apache-2.0
// Workload: declarative (in-memory merge, no I/O).
//! Result aggregation across sub-queries for the deep-research pipeline.
//!
//! Two strategies are supported:
//!
//! - [`AggregationStrategy::Rrf`] — Reciprocal Rank Fusion (Cormack et al.,
//!   2009). For each sub-query, the score of a result at rank `r` is
//!   `1 / (K + r)`. Scores are summed across all sub-queries that mention the
//!   same canonical URL. The default K is 60 (matches the literature and the
//!   `GraphRAG` memory subsystem's hybrid search).
//! - [`AggregationStrategy::DedupeByUrl`] — canonical-URL deduplication that
//!   keeps the FIRST occurrence (lowest source-index wins) and discards the
//!   rest. No scoring is performed; the returned list preserves the input
//!   order, which is convenient for stable, predictable JSON output.
//!
//! # URL canonicalization (gap 2.3 of the v0.7.0 audit)
//!
//! The canonical form is computed by [`canonicalize_url`]:
//!
//! 1. Lowercase the scheme and host.
//! 2. Strip the fragment (everything after `#`).
//! 3. Drop tracking query parameters (`utm_*`, `fbclid`, `gclid`, `ref`,
//!    `mc_cid`, `mc_eid`).
//! 4. Sort the remaining query parameters alphabetically (so that
//!    `?a=1&b=2` and `?b=2&a=1` compare equal).
//! 5. Normalise trailing slashes on the path (collapse `///+` to `/` and
//!    strip the trailing slash when the path is longer than `/`).
//!
//! The canonical form is then hashed with `blake3` (first 16 hex chars) to
//! serve as the dedup key — URLs with identical canonical form collide.

use crate::types::SearchOutput;
use serde::{Deserialize, Serialize};
use url::Url;

/// Aggregation strategy variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggregationStrategy {
    /// Reciprocal Rank Fusion with the given K constant.
    Rrf(u32),
    /// Canonical-URL deduplication, keep first occurrence.
    DedupeByUrl,
}

/// Aggregated evidence item, scored and sorted by descending score.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct AggregatedItem {
    /// Source URL (as returned by the upstream search).
    pub url: String,
    /// Title (first non-empty across duplicates is kept).
    pub title: String,
    /// Display URL (optional, kept from the first occurrence).
    #[serde(rename = "url_exibicao", skip_serializing_if = "Option::is_none")]
    pub display_url: Option<String>,
    /// Optional snippet (first non-empty).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub snippet: Option<String>,
    /// Score in `[0.0, 1.0]` (1.0 = best, 0.0 = irrelevant). Higher is better.
    pub score: f64,
    /// Original position in the source sub-query (1-indexed).
    #[serde(rename = "posicao")]
    pub position: u32,
    /// Texts of the sub-queries that produced this result.
    #[serde(rename = "fontes")]
    pub sources: Vec<String>,
}

/// Computes the canonical form of a URL for deduplication.
///
/// Returns the input unchanged if it fails to parse — that way we never lose
/// data, and the hash of the original string still serves as a unique key.
///
/// # Examples
///
/// ```
/// use duckduckgo_search_cli::aggregation::canonicalize_url;
///
/// // Tracking parameters are stripped.
/// assert_eq!(
///     canonicalize_url("https://Example.com/a?utm_source=x&id=1"),
///     canonicalize_url("https://example.com/a?id=1"),
/// );
///
/// // Query parameters are sorted alphabetically.
/// assert_eq!(
///     canonicalize_url("https://example.com/p?b=2&a=1"),
///     "https://example.com/p?a=1&b=2",
/// );
///
/// // Fragment is removed.
/// assert_eq!(
///     canonicalize_url("https://example.com/p#section"),
///     "https://example.com/p",
/// );
///
/// // Unparseable input is returned unchanged.
/// assert_eq!(
///     canonicalize_url("not a url"),
///     "not a url",
/// );
/// ```
pub fn canonicalize_url(raw: &str) -> String {
    let parsed = match Url::parse(raw) {
        Ok(u) => u,
        Err(_) => return raw.to_string(),
    };

    let mut url = parsed.clone();
    let lower_host = url.host_str().map(|h| h.to_ascii_lowercase());
    if let Some(h) = lower_host {
        let _ = url.set_host(Some(&h));
    }
    let _ = url.set_scheme(url.scheme().to_ascii_lowercase().as_str());
    url.set_fragment(None);

    // Drop tracking params.
    let tracking: &[&str] = &[
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "utm_term",
        "utm_content",
        "fbclid",
        "gclid",
        "ref",
        "mc_cid",
        "mc_eid",
    ];
    let pairs: Vec<(String, String)> = url
        .query_pairs()
        .filter(|(k, _)| !tracking.contains(&k.as_ref()))
        .map(|(k, v)| (k.into_owned(), v.into_owned()))
        .collect();
    if !pairs.is_empty() {
        let mut sorted = pairs;
        sorted.sort_by(|a, b| a.0.cmp(&b.0));
        let mut ser = url::form_urlencoded::Serializer::new(String::new());
        for (k, v) in &sorted {
            ser.append_pair(k, v);
        }
        let query = ser.finish();
        url.set_query(Some(&query));
    } else {
        url.set_query(None);
    }

    let mut out = url.to_string();
    // Collapse repeated slashes in the path (after the scheme://host).
    if let Some(idx) = out.find("://") {
        let (scheme, rest) = out.split_at(idx + 3);
        // The character after "scheme://" is the first char of the host.
        // Find the first '/' that follows the host (it separates host from path).
        let host_end = rest.find('/').unwrap_or(rest.len());
        let host = &rest[..host_end];
        let path_and_query = &rest[host_end..];
        // Split into path and query so we can normalize the path independently.
        let (raw_path, raw_query) = match path_and_query.find('?') {
            Some(q) => (&path_and_query[..q], &path_and_query[q..]),
            None => (path_and_query, ""),
        };
        // Collapse repeated slashes in the path.
        let collapsed_path: String =
            raw_path
                .chars()
                .fold(String::with_capacity(raw_path.len()), |mut acc, c| {
                    if c == '/' && acc.ends_with('/') {
                        acc
                    } else {
                        acc.push(c);
                        acc
                    }
                });
        // Trim trailing slashes from the path (but keep at least one "/").
        let trimmed_path = collapsed_path.trim_end_matches('/');
        let trimmed_path = if trimmed_path.is_empty() {
            "/"
        } else {
            trimmed_path
        };
        out = format!("{scheme}{host}{trimmed_path}{raw_query}");
    }
    out
}

/// Returns a short, deterministic hash of the canonical URL.
pub fn canonical_hash(raw: &str) -> String {
    let canonical = canonicalize_url(raw);
    let hash = blake3::hash(canonical.as_bytes());
    let hex = hash.to_hex();
    hex.to_string()[..16].to_string()
}

/// Merges a list of per-sub-query `SearchOutput` into a single ranked list.
///
/// The order of `outputs` is the order in which the sub-queries were
/// dispatched; ranks inside each output are 1-indexed.
pub fn aggregate(outputs: &[SearchOutput], strategy: AggregationStrategy) -> Vec<AggregatedItem> {
    match strategy {
        AggregationStrategy::Rrf(k) => rrf_aggregate(outputs, k),
        AggregationStrategy::DedupeByUrl => dedupe_by_url(outputs),
    }
}

fn rrf_aggregate(outputs: &[SearchOutput], k: u32) -> Vec<AggregatedItem> {
    use std::collections::HashMap;

    #[derive(Default)]
    struct Entry {
        url: String,
        title: String,
        display_url: Option<String>,
        snippet: Option<String>,
        score: f64,
        position: u32,
        sources: Vec<String>,
    }

    let mut map: HashMap<String, Entry> = HashMap::new();
    for output in outputs {
        for (idx, r) in output.results.iter().enumerate() {
            let key = canonical_hash(&r.url);
            let rank = (idx as u32) + 1;
            let score = 1.0 / ((k as f64) + (rank as f64));
            let entry = map.entry(key).or_insert_with(|| Entry {
                url: r.url.clone(),
                title: String::new(),
                display_url: r.display_url.clone(),
                snippet: r.snippet.clone(),
                score: 0.0,
                position: rank,
                sources: Vec::new(),
            });
            entry.score += score;
            if entry.title.is_empty() {
                entry.title = r.title.clone();
            }
            if entry.display_url.is_none() {
                entry.display_url = r.display_url.clone();
            }
            if entry.snippet.is_none() {
                entry.snippet = r.snippet.clone();
            }
            if rank < entry.position {
                entry.position = rank;
            }
            if !output.query.is_empty() && !entry.sources.contains(&output.query) {
                entry.sources.push(output.query.clone());
            }
        }
    }

    let mut out: Vec<AggregatedItem> = map
        .into_values()
        .map(|e| AggregatedItem {
            url: e.url,
            title: e.title,
            display_url: e.display_url,
            snippet: e.snippet,
            // Normalize to [0, 1] by dividing by the theoretical maximum (one
            // occurrence at rank 1 across all sub-queries). For practical
            // outputs, scores usually fall in (0, 0.05].
            score: e.score,
            position: e.position,
            sources: e.sources,
        })
        .collect();
    out.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.position.cmp(&b.position))
    });
    out
}

fn dedupe_by_url(outputs: &[SearchOutput]) -> Vec<AggregatedItem> {
    use std::collections::HashMap;

    let mut map: HashMap<String, AggregatedItem> = HashMap::new();
    for output in outputs {
        for (idx, r) in output.results.iter().enumerate() {
            let key = canonical_hash(&r.url);
            map.entry(key).or_insert_with(|| AggregatedItem {
                url: r.url.clone(),
                title: r.title.clone(),
                display_url: r.display_url.clone(),
                snippet: r.snippet.clone(),
                score: 1.0,
                position: (idx as u32) + 1,
                sources: if output.query.is_empty() {
                    Vec::new()
                } else {
                    vec![output.query.clone()]
                },
            });
        }
    }
    let mut out: Vec<AggregatedItem> = map.into_values().collect();
    out.sort_by_key(|i| i.position);
    out
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::SearchMetadata;

    fn out_with(query: &str, urls: &[&str]) -> SearchOutput {
        SearchOutput {
            query: query.to_string(),
            engine: "duckduckgo".to_string(),
            endpoint: "html".to_string(),
            timestamp: "2026-06-07T00:00:00Z".to_string(),
            region: "br-pt".to_string(),
            result_count: urls.len() as u32,
            results: urls
                .iter()
                .enumerate()
                .map(|(i, u)| crate::types::SearchResult {
                    position: (i as u32) + 1,
                    title: format!("title-{}", i),
                    url: u.to_string(),
                    display_url: None,
                    snippet: Some(format!("snippet-{}", i)),
                    original_title: None,
                    content: None,
                    content_size: None,
                    content_extraction_method: None,
                })
                .collect(),
            pages_fetched: 1,
            error: None,
            message: None,
            metadata: SearchMetadata {
                execution_time_ms: 0,
                selectors_hash: String::new(),
                retries: 0,
                used_fallback_endpoint: false,
                concurrent_fetches: 0,
                fetch_successes: 0,
                fetch_failures: 0,
                used_chrome: false,
                user_agent: String::new(),
                used_proxy: false,
                identity_used: None,
                cascade_level: None,
            },
        }
    }

    #[test]
    fn canonicalize_strips_utm_and_lowercases_host() {
        let a = "HTTPS://Example.com/path/?utm_source=x&b=2&a=1#frag";
        let b = "https://example.com/path?a=1&b=2";
        assert_eq!(canonicalize_url(a), canonicalize_url(b));
    }

    #[test]
    fn canonicalize_collapses_repeated_slashes() {
        let a = "https://example.com/foo//bar///baz/";
        let b = "https://example.com/foo/bar/baz";
        assert_eq!(canonicalize_url(a), canonicalize_url(b));
    }

    #[test]
    fn canonicalize_preserves_root() {
        assert_eq!(
            canonicalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn canonical_hash_is_stable() {
        let a = canonical_hash("https://Example.com/a?utm_source=x&b=1&a=1");
        let b = canonical_hash("https://example.com/a?a=1&b=1");
        assert_eq!(a, b);
    }

    #[test]
    fn rrf_combines_duplicate_urls_with_combined_score() {
        let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
        let b = out_with("beta", &["https://example.com/a", "https://example.com/c"]);
        let merged = aggregate(&[a, b], AggregationStrategy::Rrf(60));
        // Example.com/a appears in both => highest score.
        assert_eq!(merged[0].url, "https://example.com/a");
        assert!(merged[0].score > merged[1].score);
        // Sources trace back to both sub-queries.
        assert_eq!(merged[0].sources.len(), 2);
    }

    #[test]
    fn rrf_is_deterministic_across_calls() {
        let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
        let b = out_with("beta", &["https://example.com/b", "https://example.com/c"]);
        let m1 = aggregate(&[a.clone(), b.clone()], AggregationStrategy::Rrf(60));
        let m2 = aggregate(&[a, b], AggregationStrategy::Rrf(60));
        assert_eq!(m1, m2);
    }

    #[test]
    fn dedupe_keeps_first_occurrence() {
        let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
        let b = out_with("beta", &["https://example.com/a", "https://example.com/c"]);
        let merged = aggregate(&[a, b], AggregationStrategy::DedupeByUrl);
        assert_eq!(merged.len(), 3);
        assert_eq!(merged[0].url, "https://example.com/a");
        assert_eq!(merged[0].position, 1);
    }

    #[test]
    fn canonicalize_handles_invalid_url_gracefully() {
        let out = canonicalize_url("not a url");
        assert_eq!(out, "not a url");
    }

    // ---------------------------------------------------------------
    // Property-based tests (proptest)
    // ---------------------------------------------------------------
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// `canonicalize_url(canonicalize_url(x)) == canonicalize_url(x)` —
            /// the operation must be idempotent for any well-formed URL.
            #[test]
            fn canonicalize_is_idempotent(
                scheme in "(https|http)",
                host in "[a-z]{3,12}",
                path in "/[a-z0-9/_-]{0,30}",
                qkey in "[a-z]{1,5}",
                qval in "[a-z0-9]{1,5}",
            ) {
                let url = format!("{}://{}{}?{}={}", scheme, host, path, qkey, qval);
                let once = canonicalize_url(&url);
                let twice = canonicalize_url(&once);
                prop_assert_eq!(once, twice);
            }

            /// The canonical form never contains a fragment (`#`).
            #[test]
            fn canonicalize_strips_fragment(
                host in "[a-z]{3,8}",
                fragment in "[a-zA-Z0-9]{1,12}",
            ) {
                let url = format!("https://{}/p#{}", host, fragment);
                let canon = canonicalize_url(&url);
                prop_assert!(!canon.contains('#'), "fragment leaked: {}", canon);
            }

            /// `utm_*`, `fbclid`, and `gclid` must always be stripped.
            #[test]
            fn canonicalize_strips_tracking_params(
                host in "[a-z]{3,8}",
                path in "/[a-z0-9]{1,10}",
            ) {
                let url = format!(
                    "https://{}{}?utm_source=x&fbclid=y&gclid=z&keep=1",
                    host, path
                );
                let canon = canonicalize_url(&url);
                prop_assert!(!canon.contains("utm_"), "utm leaked: {}", canon);
                prop_assert!(!canon.contains("fbclid"), "fbclid leaked: {}", canon);
                prop_assert!(!canon.contains("gclid"), "gclid leaked: {}", canon);
                prop_assert!(canon.contains("keep=1"), "non-tracking lost: {}", canon);
            }

            /// The host is always lowercased in the canonical form.
            #[test]
            fn canonicalize_lowercases_host(
                host_part in "[A-Z]{3,8}",
                path in "/[a-z0-9]{0,8}",
            ) {
                let url = format!("https://{}/{}", host_part, path);
                let canon = canonicalize_url(&url);
                let lower = host_part.to_ascii_lowercase();
                prop_assert!(canon.contains(&lower), "host not lowered: {}", canon);
            }
        }
    }
}