use crate::types::SearchOutput;
use serde::{Deserialize, Serialize};
use url::Url;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggregationStrategy {
Rrf(u32),
DedupeByUrl,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct AggregatedItem {
pub url: String,
pub title: String,
#[serde(rename = "url_exibicao", skip_serializing_if = "Option::is_none")]
pub display_url: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub snippet: Option<String>,
pub score: f64,
#[serde(rename = "posicao")]
pub position: u32,
#[serde(rename = "fontes")]
pub sources: Vec<String>,
}
pub fn canonicalize_url(raw: &str) -> String {
let parsed = match Url::parse(raw) {
Ok(u) => u,
Err(_) => return raw.to_string(),
};
let mut url = parsed.clone();
let lower_host = url.host_str().map(|h| h.to_ascii_lowercase());
if let Some(h) = lower_host {
let _ = url.set_host(Some(&h));
}
let _ = url.set_scheme(url.scheme().to_ascii_lowercase().as_str());
url.set_fragment(None);
let tracking: &[&str] = &[
"utm_source",
"utm_medium",
"utm_campaign",
"utm_term",
"utm_content",
"fbclid",
"gclid",
"ref",
"mc_cid",
"mc_eid",
];
let pairs: Vec<(String, String)> = url
.query_pairs()
.filter(|(k, _)| !tracking.contains(&k.as_ref()))
.map(|(k, v)| (k.into_owned(), v.into_owned()))
.collect();
if !pairs.is_empty() {
let mut sorted = pairs;
sorted.sort_by(|a, b| a.0.cmp(&b.0));
let mut ser = url::form_urlencoded::Serializer::new(String::new());
for (k, v) in &sorted {
ser.append_pair(k, v);
}
let query = ser.finish();
url.set_query(Some(&query));
} else {
url.set_query(None);
}
let mut out = url.to_string();
if let Some(idx) = out.find("://") {
let (scheme, rest) = out.split_at(idx + 3);
let host_end = rest.find('/').unwrap_or(rest.len());
let host = &rest[..host_end];
let path_and_query = &rest[host_end..];
let (raw_path, raw_query) = match path_and_query.find('?') {
Some(q) => (&path_and_query[..q], &path_and_query[q..]),
None => (path_and_query, ""),
};
let collapsed_path: String =
raw_path
.chars()
.fold(String::with_capacity(raw_path.len()), |mut acc, c| {
if c == '/' && acc.ends_with('/') {
acc
} else {
acc.push(c);
acc
}
});
let trimmed_path = collapsed_path.trim_end_matches('/');
let trimmed_path = if trimmed_path.is_empty() {
"/"
} else {
trimmed_path
};
out = format!("{scheme}{host}{trimmed_path}{raw_query}");
}
out
}
pub fn canonical_hash(raw: &str) -> String {
let canonical = canonicalize_url(raw);
let hash = blake3::hash(canonical.as_bytes());
let hex = hash.to_hex();
hex.to_string()[..16].to_string()
}
pub fn aggregate(outputs: &[SearchOutput], strategy: AggregationStrategy) -> Vec<AggregatedItem> {
match strategy {
AggregationStrategy::Rrf(k) => rrf_aggregate(outputs, k),
AggregationStrategy::DedupeByUrl => dedupe_by_url(outputs),
}
}
fn rrf_aggregate(outputs: &[SearchOutput], k: u32) -> Vec<AggregatedItem> {
use std::collections::HashMap;
#[derive(Default)]
struct Entry {
url: String,
title: String,
display_url: Option<String>,
snippet: Option<String>,
score: f64,
position: u32,
sources: Vec<String>,
}
let mut map: HashMap<String, Entry> = HashMap::new();
for output in outputs {
for (idx, r) in output.results.iter().enumerate() {
let key = canonical_hash(&r.url);
let rank = (idx as u32) + 1;
let score = 1.0 / ((k as f64) + (rank as f64));
let entry = map.entry(key).or_insert_with(|| Entry {
url: r.url.clone(),
title: String::new(),
display_url: r.display_url.clone(),
snippet: r.snippet.clone(),
score: 0.0,
position: rank,
sources: Vec::new(),
});
entry.score += score;
if entry.title.is_empty() {
entry.title = r.title.clone();
}
if entry.display_url.is_none() {
entry.display_url = r.display_url.clone();
}
if entry.snippet.is_none() {
entry.snippet = r.snippet.clone();
}
if rank < entry.position {
entry.position = rank;
}
if !output.query.is_empty() && !entry.sources.contains(&output.query) {
entry.sources.push(output.query.clone());
}
}
}
let mut out: Vec<AggregatedItem> = map
.into_values()
.map(|e| AggregatedItem {
url: e.url,
title: e.title,
display_url: e.display_url,
snippet: e.snippet,
score: e.score,
position: e.position,
sources: e.sources,
})
.collect();
out.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.position.cmp(&b.position))
});
out
}
fn dedupe_by_url(outputs: &[SearchOutput]) -> Vec<AggregatedItem> {
use std::collections::HashMap;
let mut map: HashMap<String, AggregatedItem> = HashMap::new();
for output in outputs {
for (idx, r) in output.results.iter().enumerate() {
let key = canonical_hash(&r.url);
map.entry(key).or_insert_with(|| AggregatedItem {
url: r.url.clone(),
title: r.title.clone(),
display_url: r.display_url.clone(),
snippet: r.snippet.clone(),
score: 1.0,
position: (idx as u32) + 1,
sources: if output.query.is_empty() {
Vec::new()
} else {
vec![output.query.clone()]
},
});
}
}
let mut out: Vec<AggregatedItem> = map.into_values().collect();
out.sort_by_key(|i| i.position);
out
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::SearchMetadata;
fn out_with(query: &str, urls: &[&str]) -> SearchOutput {
SearchOutput {
query: query.to_string(),
engine: "duckduckgo".to_string(),
endpoint: "html".to_string(),
timestamp: "2026-06-07T00:00:00Z".to_string(),
region: "br-pt".to_string(),
result_count: urls.len() as u32,
results: urls
.iter()
.enumerate()
.map(|(i, u)| crate::types::SearchResult {
position: (i as u32) + 1,
title: format!("title-{}", i),
url: u.to_string(),
display_url: None,
snippet: Some(format!("snippet-{}", i)),
original_title: None,
content: None,
content_size: None,
content_extraction_method: None,
})
.collect(),
pages_fetched: 1,
error: None,
message: None,
metadata: SearchMetadata {
execution_time_ms: 0,
selectors_hash: String::new(),
retries: 0,
used_fallback_endpoint: false,
concurrent_fetches: 0,
fetch_successes: 0,
fetch_failures: 0,
used_chrome: false,
user_agent: String::new(),
used_proxy: false,
identity_used: None,
cascade_level: None,
},
}
}
#[test]
fn canonicalize_strips_utm_and_lowercases_host() {
let a = "HTTPS://Example.com/path/?utm_source=x&b=2&a=1#frag";
let b = "https://example.com/path?a=1&b=2";
assert_eq!(canonicalize_url(a), canonicalize_url(b));
}
#[test]
fn canonicalize_collapses_repeated_slashes() {
let a = "https://example.com/foo//bar///baz/";
let b = "https://example.com/foo/bar/baz";
assert_eq!(canonicalize_url(a), canonicalize_url(b));
}
#[test]
fn canonicalize_preserves_root() {
assert_eq!(
canonicalize_url("https://example.com"),
"https://example.com/"
);
}
#[test]
fn canonical_hash_is_stable() {
let a = canonical_hash("https://Example.com/a?utm_source=x&b=1&a=1");
let b = canonical_hash("https://example.com/a?a=1&b=1");
assert_eq!(a, b);
}
#[test]
fn rrf_combines_duplicate_urls_with_combined_score() {
let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
let b = out_with("beta", &["https://example.com/a", "https://example.com/c"]);
let merged = aggregate(&[a, b], AggregationStrategy::Rrf(60));
assert_eq!(merged[0].url, "https://example.com/a");
assert!(merged[0].score > merged[1].score);
assert_eq!(merged[0].sources.len(), 2);
}
#[test]
fn rrf_is_deterministic_across_calls() {
let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
let b = out_with("beta", &["https://example.com/b", "https://example.com/c"]);
let m1 = aggregate(&[a.clone(), b.clone()], AggregationStrategy::Rrf(60));
let m2 = aggregate(&[a, b], AggregationStrategy::Rrf(60));
assert_eq!(m1, m2);
}
#[test]
fn dedupe_keeps_first_occurrence() {
let a = out_with("alpha", &["https://example.com/a", "https://example.com/b"]);
let b = out_with("beta", &["https://example.com/a", "https://example.com/c"]);
let merged = aggregate(&[a, b], AggregationStrategy::DedupeByUrl);
assert_eq!(merged.len(), 3);
assert_eq!(merged[0].url, "https://example.com/a");
assert_eq!(merged[0].position, 1);
}
#[test]
fn canonicalize_handles_invalid_url_gracefully() {
let out = canonicalize_url("not a url");
assert_eq!(out, "not a url");
}
mod proptests {
use super::*;
use proptest::prelude::*;
proptest! {
#[test]
fn canonicalize_is_idempotent(
scheme in "(https|http)",
host in "[a-z]{3,12}",
path in "/[a-z0-9/_-]{0,30}",
qkey in "[a-z]{1,5}",
qval in "[a-z0-9]{1,5}",
) {
let url = format!("{}://{}{}?{}={}", scheme, host, path, qkey, qval);
let once = canonicalize_url(&url);
let twice = canonicalize_url(&once);
prop_assert_eq!(once, twice);
}
#[test]
fn canonicalize_strips_fragment(
host in "[a-z]{3,8}",
fragment in "[a-zA-Z0-9]{1,12}",
) {
let url = format!("https://{}/p#{}", host, fragment);
let canon = canonicalize_url(&url);
prop_assert!(!canon.contains('#'), "fragment leaked: {}", canon);
}
#[test]
fn canonicalize_strips_tracking_params(
host in "[a-z]{3,8}",
path in "/[a-z0-9]{1,10}",
) {
let url = format!(
"https://{}{}?utm_source=x&fbclid=y&gclid=z&keep=1",
host, path
);
let canon = canonicalize_url(&url);
prop_assert!(!canon.contains("utm_"), "utm leaked: {}", canon);
prop_assert!(!canon.contains("fbclid"), "fbclid leaked: {}", canon);
prop_assert!(!canon.contains("gclid"), "gclid leaked: {}", canon);
prop_assert!(canon.contains("keep=1"), "non-tracking lost: {}", canon);
}
#[test]
fn canonicalize_lowercases_host(
host_part in "[A-Z]{3,8}",
path in "/[a-z0-9]{0,8}",
) {
let url = format!("https://{}/{}", host_part, path);
let canon = canonicalize_url(&url);
let lower = host_part.to_ascii_lowercase();
prop_assert!(canon.contains(&lower), "host not lowered: {}", canon);
}
}
}
}