#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_precision_loss)]
use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec;
use alloc::vec::Vec;
use core::cmp::Ordering;
/// Smoothing constant `k` for reciprocal rank fusion: each ranked hit contributes
/// `1 / (k + rank)`. It is written into the evidence trail by [`build_request_evidence`]
/// and passed to [`reciprocal_rank_fusion`] by the tests in this file.
pub const WEB_SEARCH_RRF_K: u32 = 60;
/// NOTE(review): not referenced in the visible part of this file — presumably the maximum
/// number of providers queried concurrently within one category; confirm against callers.
pub const WEB_SEARCH_CONCURRENCY_PER_CATEGORY: u32 = 5;
/// NOTE(review): not referenced in the visible part of this file — presumably the maximum
/// number of results requested from each provider; confirm against callers.
pub const WEB_SEARCH_PROVIDER_LIMIT: u32 = 10;
/// Broad grouping under which every entry of the provider registry is filed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderCategory {
    /// General-purpose web search engines.
    Search,
    /// Encyclopedias, dictionaries, archives and bibliographic catalogues.
    Knowledge,
    /// Scholarly article indexes.
    Papers,
    /// Source-code hosting platforms.
    Code,
}

impl ProviderCategory {
    /// Returns the stable, lowercase ASCII name of the category.
    ///
    /// The match is exhaustive on purpose: adding a variant without giving it a slug is a
    /// compile error rather than a silent fallback.
    #[must_use]
    pub const fn slug(self) -> &'static str {
        match self {
            ProviderCategory::Code => "code",
            ProviderCategory::Papers => "papers",
            ProviderCategory::Knowledge => "knowledge",
            ProviderCategory::Search => "search",
        }
    }
}
/// Static description of one web-search provider, as listed in
/// [`WEB_SEARCH_PROVIDER_REGISTRY`].
#[derive(Debug, Clone, Copy)]
pub struct ProviderSpec {
/// Machine-readable identifier (lowercase, hyphenated in the registry), e.g. `"duckduckgo"`.
pub id: &'static str,
/// Human-readable display name.
pub label: &'static str,
/// Category the provider is filed under.
pub category: ProviderCategory,
/// Whether the provider can be called directly from a browser. The tests in this file
/// describe providers with `false` here as "proxy/diagnostics-only".
pub cors_readable: bool,
/// Marks the default provider of its category; the registry sets this on exactly one
/// entry per category. NOTE(review): how callers consume the flag is not visible here.
pub default_for_category: bool,
}
/// Every provider known to the web-search feature, grouped by category.
///
/// Each category has exactly one entry with `default_for_category: true`
/// (`duckduckgo`, `wikipedia`, `arxiv`, `github`). The tests in this file require that
/// every id in [`WEB_SEARCH_PROVIDERS`] appears here and is marked `cors_readable`.
pub const WEB_SEARCH_PROVIDER_REGISTRY: &[ProviderSpec] = &[
// --- Search: general-purpose web search engines ---
ProviderSpec {
id: "duckduckgo",
label: "DuckDuckGo Instant Answer",
category: ProviderCategory::Search,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "google",
label: "Google Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "bing",
label: "Bing Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "brave",
label: "Brave Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "yahoo",
label: "Yahoo Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "yandex",
label: "Yandex Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "ecosia",
label: "Ecosia",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "mojeek",
label: "Mojeek",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "startpage",
label: "Startpage",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
// --- Knowledge: encyclopedias, dictionaries, archives, bibliographic catalogues ---
ProviderSpec {
id: "wikipedia",
label: "Wikipedia REST",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "wikidata",
label: "Wikidata entities",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "wiktionary",
label: "Wiktionary opensearch",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
// Dictionary page providers: the tests pin these as non-CORS and never default.
ProviderSpec {
id: "cambridge-dictionary",
label: "Cambridge Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "merriam-webster",
label: "Merriam-Webster Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "dictionary-com",
label: "Dictionary.com",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "collins-dictionary",
label: "Collins English Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "internet-archive",
label: "Internet Archive (archive.org)",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "dbpedia",
label: "DBpedia Lookup",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "openlibrary",
label: "Open Library",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
// NOTE(review): the next three index scholarly works yet are filed under Knowledge
// rather than Papers — confirm this categorisation is intentional.
ProviderSpec {
id: "openalex",
label: "OpenAlex works",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "crossref",
label: "Crossref works",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "semantic-scholar",
label: "Semantic Scholar",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
// --- Papers: scholarly article indexes ---
ProviderSpec {
id: "arxiv",
label: "arXiv atom export",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "europepmc",
label: "Europe PMC",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "doaj",
label: "DOAJ articles",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: false,
},
// --- Code: source-code hosting platforms ---
ProviderSpec {
id: "github",
label: "GitHub repositories",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "gitlab",
label: "GitLab projects",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "codeberg",
label: "Codeberg",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "gitee",
label: "Gitee",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "bitbucket",
label: "Bitbucket Cloud",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "gitflic",
label: "GitFlic",
category: ProviderCategory::Code,
cors_readable: false,
default_for_category: false,
},
];
/// Provider ids of the default search plan, in priority order.
///
/// The order is significant: the tests in this file pin it exactly, and
/// [`build_request_evidence`] emits one evidence line per id in this order. Every id
/// listed here is expected to exist in [`WEB_SEARCH_PROVIDER_REGISTRY`] with
/// `cors_readable: true` (also enforced by the tests).
pub const WEB_SEARCH_PROVIDERS: &[&str] = &[
"duckduckgo",
"internet-archive",
"wikipedia",
"wikidata",
"wiktionary",
];
#[must_use]
pub fn default_search_plan_ids() -> Vec<String> {
WEB_SEARCH_PROVIDERS
.iter()
.map(|id| (*id).to_string())
.collect()
}
/// Builds the evidence trail describing one web-search request.
///
/// The returned lines are, in order:
/// 1. `web_search:request:<query>`
/// 2. `web_search:language:<language>` — omitted when `language` is empty
/// 3. one `web_search:provider:<id>` line per entry of [`WEB_SEARCH_PROVIDERS`]
/// 4. `web_search:combined:rrf:k=<WEB_SEARCH_RRF_K>`
///
/// `query` and `language` are interpolated verbatim.
#[must_use]
pub fn build_request_evidence(query: &str, language: &str) -> Vec<String> {
    let mut evidence = vec![format!("web_search:request:{query}")];
    if !language.is_empty() {
        evidence.push(format!("web_search:language:{language}"));
    }
    evidence.extend(
        WEB_SEARCH_PROVIDERS
            .iter()
            .map(|provider| format!("web_search:provider:{provider}")),
    );
    evidence.push(format!("web_search:combined:rrf:k={WEB_SEARCH_RRF_K}"));
    evidence
}
/// One search hit as reported by a single provider; the input unit of
/// [`reciprocal_rank_fusion`] and the output unit of [`parse_rrf_input`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProviderRanking {
/// Id of the provider that returned this hit.
pub provider_id: String,
/// Position of the hit in that provider's result list. The tests in this file use `1`
/// for the top hit; the fusion score contribution is `1 / (k + rank)`.
pub rank: u32,
/// Result URL; fusion uses exact string equality on this field to merge hits.
pub url: String,
/// Result title; may be empty.
pub title: String,
/// Short text excerpt; may be empty.
pub excerpt: String,
}
/// One URL after reciprocal rank fusion, as produced by [`reciprocal_rank_fusion`].
#[derive(Debug, Clone, PartialEq)]
pub struct FusedEntry {
/// The URL shared by every hit merged into this entry.
pub url: String,
/// Display title chosen during fusion.
pub title: String,
/// First non-empty excerpt seen among the merged hits; may be empty.
pub excerpt: String,
/// Sum of `1 / (k + rank)` over all merged hits.
pub score: f64,
/// `(provider_id, rank)` of every merged hit, in input order.
pub providers: Vec<(String, u32)>,
}
/// Merges per-provider rankings into a single list using reciprocal rank fusion (RRF).
///
/// Every entry contributes `1 / (k + rank)` to the score of its URL; entries that share
/// an identical `url` string are merged into one [`FusedEntry`].
///
/// * Entries with an empty `url` are ignored, as are entries for which `k + rank` is zero.
/// * The fused title and excerpt are the first non-empty values seen for the URL, in input
///   order. If no entry supplies a title, the URL itself is used as the title.
/// * `providers` records `(provider_id, rank)` of every merged entry in input order.
///
/// The result is sorted by descending score; equal scores are ordered by descending
/// number of contributing providers, and remaining ties keep first-seen order because the
/// sort is stable.
///
/// Lookup of an existing URL is a linear scan, so the cost is quadratic in the number of
/// distinct URLs.
#[must_use]
pub fn reciprocal_rank_fusion(entries: &[ProviderRanking], k: u32) -> Vec<FusedEntry> {
    let mut fused: Vec<FusedEntry> = Vec::new();
    for entry in entries {
        if entry.url.is_empty() {
            continue;
        }
        // `k + rank` can only be zero when both are zero; skip instead of dividing by zero.
        let denom = f64::from(k) + f64::from(entry.rank);
        if denom <= 0.0 {
            continue;
        }
        let score = 1.0 / denom;
        if let Some(existing) = fused.iter_mut().find(|item| item.url == entry.url) {
            existing.score += score;
            existing
                .providers
                .push((entry.provider_id.clone(), entry.rank));
            if existing.title.is_empty() && !entry.title.is_empty() {
                existing.title.clone_from(&entry.title);
            }
            if existing.excerpt.is_empty() && !entry.excerpt.is_empty() {
                existing.excerpt.clone_from(&entry.excerpt);
            }
        } else {
            fused.push(FusedEntry {
                url: entry.url.clone(),
                // A missing title stays empty for now so that a later provider can still
                // supply a real one; the URL fallback is applied once all entries are seen.
                title: entry.title.clone(),
                excerpt: entry.excerpt.clone(),
                score,
                providers: vec![(entry.provider_id.clone(), entry.rank)],
            });
        }
    }
    // No provider offered a title for these URLs: fall back to showing the URL itself.
    for item in &mut fused {
        if item.title.is_empty() {
            item.title.clone_from(&item.url);
        }
    }
    // Highest score first; ties (or incomparable scores) go to the entry backed by more
    // providers.
    fused.sort_by(|left, right| match right.score.partial_cmp(&left.score) {
        Some(Ordering::Equal) | None => right.providers.len().cmp(&left.providers.len()),
        Some(order) => order,
    });
    fused
}
/// Parses tab-separated ranking lines into [`ProviderRanking`] values.
///
/// Each line has the shape `provider_id<TAB>rank<TAB>url[<TAB>title[<TAB>excerpt]]`.
/// Lines are separated by `\n`; a single trailing `\r` is removed from every line so that
/// CRLF input does not leak a carriage return into the last field (which would, for
/// example, stop two otherwise identical URLs from being merged during fusion).
///
/// Lines are silently skipped when they are empty, have fewer than three fields, carry a
/// `rank` that does not parse as `u32`, or have an empty `url`. Missing `title` and
/// `excerpt` fields default to the empty string; fields after the fifth are ignored.
#[must_use]
pub fn parse_rrf_input(input: &str) -> Vec<ProviderRanking> {
    input
        .split('\n')
        .map(|line| line.strip_suffix('\r').unwrap_or(line))
        .filter_map(parse_rrf_line)
        .collect()
}

/// Parses one tab-separated line, returning `None` when the line must be skipped.
fn parse_rrf_line(line: &str) -> Option<ProviderRanking> {
    let mut fields = line.split('\t');
    let provider_id = fields.next()?;
    let rank_text = fields.next()?;
    let url = fields.next()?;
    let title = fields.next().unwrap_or("");
    let excerpt = fields.next().unwrap_or("");
    let rank = rank_text.parse::<u32>().ok()?;
    if url.is_empty() {
        return None;
    }
    Some(ProviderRanking {
        provider_id: provider_id.to_string(),
        rank,
        url: url.to_string(),
        title: title.to_string(),
        excerpt: excerpt.to_string(),
    })
}
/// Renders fused entries as tab-separated text, one entry per line.
///
/// Each line is `url<TAB>title<TAB>excerpt<TAB>score<TAB>provenance`, where `score` is
/// rendered by `format_score` and `provenance` joins every `(provider_id, rank)` pair as
/// `provider_id#rank` with `+` between pairs. Lines are joined with `\n` and there is no
/// trailing newline; an empty slice yields an empty string. Field contents are written
/// verbatim, without escaping.
#[must_use]
pub fn serialize_rrf_output(fused: &[FusedEntry]) -> String {
    let rows: Vec<String> = fused
        .iter()
        .map(|entry| {
            let provenance: Vec<String> = entry
                .providers
                .iter()
                .map(|(provider_id, rank)| format!("{provider_id}#{rank}"))
                .collect();
            format!(
                "{}\t{}\t{}\t{}\t{}",
                entry.url,
                entry.title,
                entry.excerpt,
                format_score(entry.score),
                provenance.join("+"),
            )
        })
        .collect();
    rows.join("\n")
}
/// Formats `score` as a decimal with exactly six fractional digits.
///
/// The value is scaled to millionths and rounded half away from zero using integer
/// arithmetic, so the rendering does not depend on the float formatter's rounding mode.
/// Negative values whose integer part is zero keep their sign (e.g. `-0.250000`).
fn format_score(score: f64) -> String {
    let micros_f = score * 1_000_000.0;
    // Bias by half a unit in the direction of the sign; the cast then truncates toward zero.
    let half = if micros_f >= 0.0 { 0.5 } else { -0.5 };
    let micros = (micros_f + half) as i64;
    let whole = micros / 1_000_000;
    let fraction = (micros % 1_000_000).abs();
    // `whole` prints its own minus sign unless it is zero, in which case it must be added.
    let sign = if micros < 0 && whole == 0 { "-" } else { "" };
    format!("{sign}{whole}.{fraction:06}")
}
// Unit tests for the provider registry, the default search plan, the request-evidence
// builder, and the reciprocal-rank-fusion parse / fuse / serialize pipeline.
#[cfg(test)]
mod tests {
use super::*;
// The evidence trail and the expected scores below assume k = 60.
#[test]
fn rrf_k_is_sixty() {
assert_eq!(WEB_SEARCH_RRF_K, 60);
}
// DuckDuckGo leads the default plan, and the other default providers are all present.
#[test]
fn default_plan_lists_duckduckgo_first() {
let plan = default_search_plan_ids();
assert_eq!(plan.first().map(String::as_str), Some("duckduckgo"));
assert!(plan.contains(&"wikipedia".to_string()));
assert!(plan.contains(&"wikidata".to_string()));
assert!(plan.contains(&"wiktionary".to_string()));
assert!(plan.contains(&"internet-archive".to_string()));
}
// Pins the exact priority order of the default plan.
#[test]
fn default_plan_preserves_issue_180_priority_order() {
let plan = default_search_plan_ids();
assert_eq!(
plan,
vec![
"duckduckgo".to_string(),
"internet-archive".to_string(),
"wikipedia".to_string(),
"wikidata".to_string(),
"wiktionary".to_string(),
]
);
}
// Each category must keep at least a minimum number of registered providers.
#[test]
fn registry_includes_all_four_categories() {
let mut search = 0;
let mut knowledge = 0;
let mut papers = 0;
let mut code = 0;
for spec in WEB_SEARCH_PROVIDER_REGISTRY {
match spec.category {
ProviderCategory::Search => search += 1,
ProviderCategory::Knowledge => knowledge += 1,
ProviderCategory::Papers => papers += 1,
ProviderCategory::Code => code += 1,
}
}
assert!(search >= 7, "expected ≥7 search providers, found {search}");
assert!(
knowledge >= 6,
"expected ≥6 knowledge providers, found {knowledge}"
);
assert!(papers >= 3, "expected ≥3 papers providers, found {papers}");
assert!(code >= 5, "expected ≥5 code providers, found {code}");
}
// The request line comes first and the combined-ranking line comes last.
#[test]
fn build_request_evidence_includes_combined_ranking_line() {
let lines = build_request_evidence("formal-ai", "en");
assert_eq!(
lines.first().map(String::as_str),
Some("web_search:request:formal-ai")
);
assert!(lines.contains(&"web_search:language:en".to_string()));
assert!(lines.contains(&"web_search:provider:duckduckgo".to_string()));
assert_eq!(
lines.last().map(String::as_str),
Some("web_search:combined:rrf:k=60")
);
}
// Two providers returning the same URL collapse into one fused entry that outranks a
// URL returned by a single provider.
#[test]
fn reciprocal_rank_fusion_combines_shared_urls() {
let entries = [
ProviderRanking {
provider_id: "duckduckgo".to_string(),
rank: 1,
url: "https://example.com".to_string(),
title: "Example".to_string(),
excerpt: "DDG".to_string(),
},
ProviderRanking {
provider_id: "wikipedia".to_string(),
rank: 2,
url: "https://example.com".to_string(),
title: "Example".to_string(),
excerpt: "Wiki".to_string(),
},
ProviderRanking {
provider_id: "wikidata".to_string(),
rank: 1,
url: "https://other.example".to_string(),
title: "Other".to_string(),
excerpt: String::new(),
},
];
let fused = reciprocal_rank_fusion(&entries, WEB_SEARCH_RRF_K);
assert_eq!(fused.len(), 2);
assert_eq!(fused[0].url, "https://example.com");
assert_eq!(fused[0].providers.len(), 2);
}
// End-to-end: TSV input -> parse -> fuse -> serialize.
#[test]
fn rrf_input_round_trips_through_parser_and_serializer() {
let input = "duckduckgo\t1\thttps://example.com\tExample\tDDG\n\
wikipedia\t2\thttps://example.com\tExample\tWiki";
let entries = parse_rrf_input(input);
assert_eq!(entries.len(), 2);
let fused = reciprocal_rank_fusion(&entries, WEB_SEARCH_RRF_K);
let output = serialize_rrf_output(&fused);
assert!(output.contains("duckduckgo#1+wikipedia#2"));
assert!(output.starts_with("https://example.com"));
}
// Scores are rounded to six fractional digits and zero-padded.
#[test]
fn format_score_pads_fraction_to_six_digits() {
assert_eq!(format_score(0.000_032_786_885_245_901_64), "0.000033");
assert_eq!(format_score(1.5), "1.500000");
}
// Every provider id named explicitly by issue #133 must stay in the registry.
#[test]
fn registry_pins_issue_133_explicit_providers() {
let ids: Vec<&str> = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.map(|spec| spec.id)
.collect();
for required in [
"duckduckgo",
"google",
"bing",
"brave",
"yahoo",
"yandex",
"ecosia",
"mojeek",
"startpage",
"wikipedia",
"wikidata",
"wiktionary",
"internet-archive",
"arxiv",
"europepmc",
"doaj",
"github",
"gitlab",
"codeberg",
"gitee",
"bitbucket",
"gitflic",
] {
assert!(
ids.contains(&required),
"registry must list `{required}` (issue #133)"
);
}
}
// Every default-plan provider must be registered and marked CORS-readable.
#[test]
fn cors_readable_defaults_are_consistent_with_default_plan() {
let plan = default_search_plan_ids();
for id in &plan {
let spec = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.find(|spec| spec.id == id.as_str())
.unwrap_or_else(|| panic!("plan provider `{id}` missing from registry"));
assert!(
spec.cors_readable,
"default-plan provider `{id}` must be CORS-readable"
);
}
}
// Provider evidence lines follow the default plan's priority order.
#[test]
fn build_request_evidence_lists_providers_in_priority_order() {
let lines = build_request_evidence("query", "en");
let provider_lines: Vec<&str> = lines
.iter()
.filter(|line| line.starts_with("web_search:provider:"))
.map(String::as_str)
.collect();
assert_eq!(
provider_lines,
vec![
"web_search:provider:duckduckgo",
"web_search:provider:internet-archive",
"web_search:provider:wikipedia",
"web_search:provider:wikidata",
"web_search:provider:wiktionary",
]
);
}
// An empty language must not produce a dangling language line.
#[test]
fn build_request_evidence_skips_empty_language_line() {
let lines = build_request_evidence("query", "");
assert!(!lines
.iter()
.any(|line| line == "web_search:language:" || line == "web_search:language: "));
}
// The Internet Archive entry is a CORS-readable Knowledge provider.
#[test]
fn internet_archive_is_cors_readable_in_registry() {
let spec = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.find(|spec| spec.id == "internet-archive")
.expect("internet-archive must be in registry");
assert!(
spec.cors_readable,
"internet-archive must stay CORS-readable so the demo browser can call it directly"
);
assert!(matches!(spec.category, ProviderCategory::Knowledge));
}
// Dictionary page providers are registered as Knowledge, but are neither CORS-readable,
// nor category defaults, nor part of the default plan.
#[test]
fn dictionary_sources_are_non_cors_knowledge_providers() {
let plan = default_search_plan_ids();
for id in [
"cambridge-dictionary",
"merriam-webster",
"dictionary-com",
"collins-dictionary",
] {
let spec = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.find(|spec| spec.id == id)
.unwrap_or_else(|| panic!("dictionary provider `{id}` missing from registry"));
assert!(matches!(spec.category, ProviderCategory::Knowledge));
assert!(
!spec.cors_readable,
"dictionary page provider `{id}` must stay proxy/diagnostics-only"
);
assert!(
!spec.default_for_category,
"dictionary page provider `{id}` must not replace the live CORS default"
);
assert!(
!plan.contains(&id.to_string()),
"dictionary page provider `{id}` must not enter the default CORS plan"
);
}
}
// A single hit at rank 1 scores exactly 1 / (k + 1).
#[test]
fn rrf_score_matches_cormack_clarke_buettcher_formula() {
let entries = [ProviderRanking {
provider_id: "duckduckgo".to_string(),
rank: 1,
url: "https://example.com".to_string(),
title: "Example".to_string(),
excerpt: String::new(),
}];
let fused = reciprocal_rank_fusion(&entries, WEB_SEARCH_RRF_K);
assert_eq!(fused.len(), 1);
let expected = 1.0_f64 / (f64::from(WEB_SEARCH_RRF_K) + 1.0);
assert!(
(fused[0].score - expected).abs() < 1e-9,
"expected score {expected}, got {}",
fused[0].score
);
}
// Every default-plan provider has a non-empty display label in the registry.
#[test]
fn default_plan_providers_carry_human_labels() {
for id in &*default_search_plan_ids() {
let spec = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.find(|spec| spec.id == id.as_str())
.unwrap_or_else(|| panic!("plan id `{id}` missing from registry"));
assert!(
!spec.label.is_empty(),
"plan provider `{id}` must have a non-empty label"
);
}
}
// The default plan may only reference ids that exist in the registry.
#[test]
fn default_plan_is_a_subset_of_registry_ids() {
let registry_ids: Vec<&str> = WEB_SEARCH_PROVIDER_REGISTRY
.iter()
.map(|spec| spec.id)
.collect();
for id in &*default_search_plan_ids() {
assert!(
registry_ids.contains(&id.as_str()),
"default-plan id `{id}` not present in registry"
);
}
}
}