#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_precision_loss)]
use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec;
use alloc::vec::Vec;
use core::cmp::Ordering;
/// Default reciprocal-rank-fusion constant `k`; `build_request_evidence`
/// reports it in its `web_search:combined:rrf:k=` line.
pub const WEB_SEARCH_RRF_K: u32 = 60;
/// NOTE(review): presumably the maximum number of concurrent provider requests
/// per category; it is not referenced in the visible code — confirm with callers.
pub const WEB_SEARCH_CONCURRENCY_PER_CATEGORY: u32 = 5;
/// NOTE(review): presumably the maximum number of results requested from each
/// provider; it is not referenced in the visible code — confirm with callers.
pub const WEB_SEARCH_PROVIDER_LIMIT: u32 = 10;
/// Coarse grouping assigned to every provider in the registry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderCategory {
    /// General-purpose web search engines.
    Search,
    /// Encyclopedic, dictionary and other reference sources.
    Knowledge,
    /// Scholarly article indexes.
    Papers,
    /// Source-code hosting platforms.
    Code,
}

impl ProviderCategory {
    /// Returns the lowercase ASCII identifier of the category.
    ///
    /// The match is exhaustive on purpose: adding a variant without choosing
    /// its slug is a compile error.
    #[must_use]
    pub const fn slug(self) -> &'static str {
        let name: &'static str = match self {
            ProviderCategory::Search => "search",
            ProviderCategory::Knowledge => "knowledge",
            ProviderCategory::Papers => "papers",
            ProviderCategory::Code => "code",
        };
        name
    }
}
/// Static description of a single provider entry in the registry.
///
/// Every field is `'static` or `Copy`, so a spec can live in a `const` table
/// and be copied freely.
#[derive(Debug, Clone, Copy)]
pub struct ProviderSpec {
/// Machine-readable identifier (lowercase, hyphen-separated in the visible registry).
pub id: &'static str,
/// Human-readable provider name.
pub label: &'static str,
/// Category the provider is grouped under.
pub category: ProviderCategory,
/// NOTE(review): presumably whether the provider's responses can be read
/// directly under browser CORS rules — confirm with the fetching code.
pub cors_readable: bool,
/// Marks the provider chosen as the default within its category.
pub default_for_category: bool,
}
/// Table of every known provider, grouped by category in declaration order.
///
/// Exactly one entry per category has `default_for_category: true`
/// (`duckduckgo`, `wikipedia`, `arxiv`, `github`).
///
/// NOTE(review): `openalex`, `crossref` and `semantic-scholar` carry scholarly
/// labels but are filed under `Knowledge` rather than `Papers` — confirm that
/// this grouping is intended.
pub const WEB_SEARCH_PROVIDER_REGISTRY: &[ProviderSpec] = &[
// --- Search ---
ProviderSpec {
id: "duckduckgo",
label: "DuckDuckGo Instant Answer",
category: ProviderCategory::Search,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "google",
label: "Google Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "bing",
label: "Bing Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "brave",
label: "Brave Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "yahoo",
label: "Yahoo Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "yandex",
label: "Yandex Search",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "ecosia",
label: "Ecosia",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "mojeek",
label: "Mojeek",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "startpage",
label: "Startpage",
category: ProviderCategory::Search,
cors_readable: false,
default_for_category: false,
},
// --- Knowledge ---
ProviderSpec {
id: "wikipedia",
label: "Wikipedia REST",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "wikidata",
label: "Wikidata entities",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "wiktionary",
label: "Wiktionary opensearch",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "wikinews",
label: "Wikinews opensearch",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "cambridge-dictionary",
label: "Cambridge Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "merriam-webster",
label: "Merriam-Webster Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "dictionary-com",
label: "Dictionary.com",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "collins-dictionary",
label: "Collins English Dictionary",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "internet-archive",
label: "Internet Archive (archive.org)",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "dbpedia",
label: "DBpedia Lookup",
category: ProviderCategory::Knowledge,
cors_readable: false,
default_for_category: false,
},
ProviderSpec {
id: "openlibrary",
label: "Open Library",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "openalex",
label: "OpenAlex works",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "crossref",
label: "Crossref works",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "semantic-scholar",
label: "Semantic Scholar",
category: ProviderCategory::Knowledge,
cors_readable: true,
default_for_category: false,
},
// --- Papers ---
ProviderSpec {
id: "arxiv",
label: "arXiv atom export",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "europepmc",
label: "Europe PMC",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "doaj",
label: "DOAJ articles",
category: ProviderCategory::Papers,
cors_readable: true,
default_for_category: false,
},
// --- Code ---
ProviderSpec {
id: "github",
label: "GitHub repositories",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: true,
},
ProviderSpec {
id: "gitlab",
label: "GitLab projects",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "codeberg",
label: "Codeberg",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "gitee",
label: "Gitee",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "bitbucket",
label: "Bitbucket Cloud",
category: ProviderCategory::Code,
cors_readable: true,
default_for_category: false,
},
ProviderSpec {
id: "gitflic",
label: "GitFlic",
category: ProviderCategory::Code,
cors_readable: false,
default_for_category: false,
},
];
/// Provider ids that make up the default search plan, in plan order.
///
/// `default_search_plan_ids` returns these ids as owned strings and
/// `build_request_evidence` emits one `web_search:provider:` line per id.
/// Every id listed here appears in `WEB_SEARCH_PROVIDER_REGISTRY` with
/// `cors_readable: true`.
pub const WEB_SEARCH_PROVIDERS: &[&str] = &[
"duckduckgo",
"internet-archive",
"wikipedia",
"wikidata",
"wiktionary",
"wikinews",
];
/// Returns owned copies of the ids in `WEB_SEARCH_PROVIDERS`, preserving
/// their order.
#[must_use]
pub fn default_search_plan_ids() -> Vec<String> {
    WEB_SEARCH_PROVIDERS
        .iter()
        .copied()
        .map(String::from)
        .collect()
}
/// Builds the evidence lines that describe a web-search request.
///
/// The lines are, in order:
/// 1. `web_search:request:<query>`
/// 2. `web_search:language:<language>` (omitted when `language` is empty)
/// 3. one `web_search:provider:<id>` line per entry of `WEB_SEARCH_PROVIDERS`
/// 4. `web_search:combined:rrf:k=<WEB_SEARCH_RRF_K>`
///
/// `query` and `language` are inserted verbatim, without escaping.
#[must_use]
pub fn build_request_evidence(query: &str, language: &str) -> Vec<String> {
    let request_line = format!("web_search:request:{query}");
    // `None` contributes nothing to the chain below, so an empty language
    // simply drops its line.
    let language_line = if language.is_empty() {
        None
    } else {
        Some(format!("web_search:language:{language}"))
    };
    let provider_lines = WEB_SEARCH_PROVIDERS
        .iter()
        .map(|provider| format!("web_search:provider:{provider}"));
    let fusion_line = format!("web_search:combined:rrf:k={WEB_SEARCH_RRF_K}");

    core::iter::once(request_line)
        .chain(language_line)
        .chain(provider_lines)
        .chain(core::iter::once(fusion_line))
        .collect()
}
/// One result as ranked by a single provider; the input unit of
/// `reciprocal_rank_fusion` and the output unit of `parse_rrf_input`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProviderRanking {
/// Identifier of the provider that returned the result.
pub provider_id: String,
/// Rank of the result within the provider's list; fusion scores it as
/// `1 / (k + rank)`. NOTE(review): whether ranks start at 0 or 1 is not
/// established by the visible code.
pub rank: u32,
/// Result URL; also the key under which results are merged during fusion.
pub url: String,
/// Result title; may be empty.
pub title: String,
/// Result excerpt; may be empty.
pub excerpt: String,
}
/// One merged result produced by `reciprocal_rank_fusion`.
///
/// Only `PartialEq` is derived (not `Eq`) because `score` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct FusedEntry {
/// URL shared by every ranking merged into this entry.
pub url: String,
/// Display title; may be the URL itself when fusion had no provider title to use.
pub title: String,
/// Excerpt text; may be empty.
pub excerpt: String,
/// Sum of the `1 / (k + rank)` contributions of all merged rankings.
pub score: f64,
/// `(provider_id, rank)` of every merged ranking, in input order.
pub providers: Vec<(String, u32)>,
}
/// Merges per-provider rankings into a single list using reciprocal rank
/// fusion (RRF).
///
/// Each ranking adds `1 / (k + rank)` to the score of the entry that shares
/// its URL. Rankings with an empty URL, or with `k + rank == 0`, are ignored.
///
/// For every URL the first non-empty title and the first non-empty excerpt
/// seen in `entries` are kept. The URL is used as the title only when no
/// ranking for that URL carries a title.
///
/// The result is sorted by descending score; ties are broken by the number
/// of contributing rankings (more first), and remaining ties keep first-seen
/// order because the sort is stable.
#[must_use]
pub fn reciprocal_rank_fusion(entries: &[ProviderRanking], k: u32) -> Vec<FusedEntry> {
    let mut fused: Vec<FusedEntry> = Vec::new();
    for entry in entries {
        if entry.url.is_empty() {
            continue;
        }
        let denom = f64::from(k) + f64::from(entry.rank);
        if denom <= 0.0 {
            // Only reachable for k == 0 and rank == 0; avoids dividing by zero.
            continue;
        }
        let score = 1.0 / denom;
        if let Some(existing) = fused.iter_mut().find(|item| item.url == entry.url) {
            existing.score += score;
            existing
                .providers
                .push((entry.provider_id.clone(), entry.rank));
            if existing.title.is_empty() && !entry.title.is_empty() {
                existing.title.clone_from(&entry.title);
            }
            if existing.excerpt.is_empty() && !entry.excerpt.is_empty() {
                existing.excerpt.clone_from(&entry.excerpt);
            }
        } else {
            // The title is left empty for now. Substituting the URL here would
            // make the backfill above unreachable and hide real titles that
            // arrive with later rankings.
            fused.push(FusedEntry {
                url: entry.url.clone(),
                title: entry.title.clone(),
                excerpt: entry.excerpt.clone(),
                score,
                providers: vec![(entry.provider_id.clone(), entry.rank)],
            });
        }
    }
    // Apply the URL fallback only after every ranking had a chance to supply
    // a title.
    for item in &mut fused {
        if item.title.is_empty() {
            item.title.clone_from(&item.url);
        }
    }
    fused.sort_by(|left, right| {
        right
            .score
            .partial_cmp(&left.score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| right.providers.len().cmp(&left.providers.len()))
    });
    fused
}
/// Parses tab-separated ranking lines into `ProviderRanking` values.
///
/// Each line has the form `provider_id<TAB>rank<TAB>url[<TAB>title[<TAB>excerpt]]`.
/// Fields after the fifth are ignored; a missing title or excerpt becomes an
/// empty string.
///
/// Lines are split with `str::lines`, so both `\n` and `\r\n` endings are
/// accepted and a trailing `\r` never leaks into the last field (it would
/// otherwise prevent URLs from matching during fusion).
///
/// A line is skipped silently when it is empty, has fewer than three fields,
/// has a rank that does not parse as `u32`, or has an empty URL.
#[must_use]
pub fn parse_rrf_input(input: &str) -> Vec<ProviderRanking> {
    input
        .lines()
        .filter_map(|line| {
            let mut fields = line.split('\t');
            let provider_id = fields.next()?;
            let rank = fields.next()?.parse::<u32>().ok()?;
            let url = fields.next().filter(|url| !url.is_empty())?;
            let title = fields.next().unwrap_or("");
            let excerpt = fields.next().unwrap_or("");
            Some(ProviderRanking {
                provider_id: provider_id.to_string(),
                rank,
                url: url.to_string(),
                title: title.to_string(),
                excerpt: excerpt.to_string(),
            })
        })
        .collect()
}
/// Renders fused entries as newline-separated, tab-delimited rows.
///
/// Each row is `url<TAB>title<TAB>excerpt<TAB>score<TAB>providers`, where
/// `score` comes from `format_score` and `providers` is a `+`-joined list of
/// `provider_id#rank` items (empty when the entry has no providers). Rows are
/// separated by `\n` with no trailing newline; an empty slice yields an empty
/// string.
///
/// Field text is written verbatim: tabs or newlines inside a field are not
/// escaped.
#[must_use]
pub fn serialize_rrf_output(fused: &[FusedEntry]) -> String {
    let rows: Vec<String> = fused
        .iter()
        .map(|item| {
            let sources: Vec<String> = item
                .providers
                .iter()
                .map(|(provider_id, rank)| format!("{provider_id}#{rank}"))
                .collect();
            format!(
                "{}\t{}\t{}\t{}\t{}",
                item.url,
                item.title,
                item.excerpt,
                format_score(item.score),
                sources.join("+"),
            )
        })
        .collect();
    rows.join("\n")
}
/// Formats `score` as a decimal string with exactly six fractional digits.
///
/// The value is scaled to millionths, rounded half away from zero and then
/// printed with integer arithmetic only. Because float-to-integer `as` casts
/// saturate and map NaN to zero, NaN renders as `0.000000` and infinities
/// render as the saturated `i64` bound.
fn format_score(score: f64) -> String {
    let micros_f = score * 1_000_000.0;
    // Push the value half a unit away from zero so the truncating cast
    // rounds to the nearest integer. NaN fails the comparison and takes the
    // negative bias, which is harmless because the cast turns NaN into 0.
    let bias = if micros_f >= 0.0 { 0.5 } else { -0.5 };
    let micros = (micros_f + bias) as i64;

    let whole = micros / 1_000_000;
    let fraction = (micros % 1_000_000).abs();
    // A negative value with a zero whole part would lose its sign, since
    // `whole` prints as "0"; add it back explicitly.
    let sign = if micros < 0 && whole == 0 { "-" } else { "" };

    format!("{sign}{whole}.{fraction:06}")
}