use anyhow::{Result, anyhow};
use crate::config::WebConfig;
/// A single web search hit, as built by `parse_results`.
#[derive(Debug, Clone)]
pub(super) struct WebResult {
/// Title reported by the provider; `parse_results` stores an empty string when it is missing.
pub title: String,
/// URL of the result; `parse_results` drops entries that have no string `url`.
pub url: String,
/// Snippet or page text for the result, trimmed of surrounding whitespace by `parse_results`.
pub text: String,
}
/// Reports whether web search can be used with the given configuration.
///
/// Search is available only when `cfg.enabled` is set and the selected provider
/// has the one setting it needs: a non-blank `api_key` for `tavily`, a non-blank
/// `endpoint` for `searxng`. Any other provider name counts as unavailable.
pub(super) fn available(cfg: &WebConfig) -> bool {
    let provider_ready = match cfg.provider.as_str() {
        "tavily" => !cfg.api_key.trim().is_empty(),
        "searxng" => !cfg.endpoint.trim().is_empty(),
        _ => false,
    };
    cfg.enabled && provider_ready
}
/// Runs `query` against the provider named by `cfg.provider`.
///
/// # Errors
/// Fails when the provider name is neither `tavily` nor `searxng`, or when the
/// selected provider's request fails.
pub(super) async fn search(cfg: WebConfig, query: String) -> Result<Vec<WebResult>> {
    let provider = cfg.provider.as_str();
    if provider == "tavily" {
        return tavily(&cfg, &query).await;
    }
    if provider == "searxng" {
        return searxng(&cfg, &query).await;
    }
    Err(anyhow!(
        "unknown web provider `{other}` (tavily | searxng)",
        other = provider
    ))
}
/// Sends `query` to the Tavily search endpoint and converts the reply into results.
///
/// The request body carries the API key, the query, `max_results` (at least 1),
/// a `basic` search depth, and `include_raw_content` taken from `cfg.fetch`.
/// Results are parsed preferring `raw_content` over the short snippet.
///
/// # Errors
/// Fails if the request cannot be sent, the status is not a success, or the
/// body is not valid JSON.
async fn tavily(cfg: &WebConfig, query: &str) -> Result<Vec<WebResult>> {
    let payload = serde_json::json!({
        "api_key": cfg.api_key,
        "query": query,
        "max_results": cfg.max_results.max(1),
        "search_depth": "basic",
        "include_raw_content": cfg.fetch,
    });

    let client = reqwest::Client::new();
    let request = client.post("https://api.tavily.com/search").json(&payload);
    let response = request
        .send()
        .await
        .map_err(|e| anyhow!("tavily request: {e}"))?;

    let status = response.status();
    if !status.is_success() {
        return Err(anyhow!("tavily HTTP {}", status));
    }

    let parsed = response
        .json::<serde_json::Value>()
        .await
        .map_err(|e| anyhow!("tavily response: {e}"))?;
    Ok(parse_results(&parsed, true))
}
/// Queries a SearXNG instance (`{endpoint}/search?format=json`) and converts the reply
/// into at most `cfg.max_results` (minimum 1) results.
///
/// When `cfg.fetch` is set, each result page is downloaded in turn and its plain
/// text replaces the snippet if it is longer. Page fetching is best effort: a
/// page that cannot be downloaded, answers with a non-success status, or cannot
/// be read as text simply keeps its original snippet.
///
/// # Errors
/// Fails if the search request cannot be sent, the status is not a success, or
/// the body is not valid JSON. Failures while fetching individual pages are
/// never reported as errors.
///
/// NOTE(review): no timeout is configured on these requests; a page that hangs
/// would stall the whole search. Consider adding one.
async fn searxng(cfg: &WebConfig, query: &str) -> Result<Vec<WebResult>> {
    // `available` validates the endpoint after trimming whitespace, so trim it
    // here as well before stripping trailing slashes.
    let base = cfg.endpoint.trim().trim_end_matches('/');
    let client = reqwest::Client::new();
    let resp = client
        .get(format!("{base}/search"))
        .query(&[("q", query), ("format", "json")])
        .send()
        .await
        .map_err(|e| anyhow!("searxng request: {e}"))?;
    if !resp.status().is_success() {
        return Err(anyhow!("searxng HTTP {}", resp.status()));
    }
    let json: serde_json::Value = resp
        .json()
        .await
        .map_err(|e| anyhow!("searxng response: {e}"))?;

    let mut results = parse_results(&json, false);
    results.truncate(cfg.max_results.max(1));

    if cfg.fetch {
        for r in &mut results {
            let Ok(page) = client.get(&r.url).send().await else {
                continue;
            };
            // An error page (404, 500, ...) is usually longer than the snippet
            // and must not be allowed to replace it.
            if !page.status().is_success() {
                continue;
            }
            let Ok(html) = page.text().await else {
                continue;
            };
            let text = html_to_text(&html);
            if text.len() > r.text.len() {
                r.text = text;
            }
        }
    }
    Ok(results)
}
/// Extracts search hits from a provider response of the form `{"results": [...]}`.
///
/// Returns an empty list when `results` is missing or is not an array. Entries
/// without a string `url` are skipped; a missing `title` or `content` becomes an
/// empty string. With `prefer_raw`, a string `raw_content` takes precedence over
/// `content`. The chosen text is trimmed.
fn parse_results(json: &serde_json::Value, prefer_raw: bool) -> Vec<WebResult> {
    /// Reads `key` from `entry` as a string slice, if present and a string.
    fn str_field<'a>(entry: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        entry.get(key)?.as_str()
    }

    let Some(serde_json::Value::Array(entries)) = json.get("results") else {
        return Vec::new();
    };

    let mut hits = Vec::new();
    for entry in entries {
        let Some(url) = str_field(entry, "url") else {
            continue;
        };
        let snippet = str_field(entry, "content").unwrap_or("");
        let chosen = match str_field(entry, "raw_content") {
            Some(raw) if prefer_raw => raw,
            _ => snippet,
        };
        hits.push(WebResult {
            title: str_field(entry, "title").unwrap_or("").to_string(),
            url: url.to_string(),
            text: chosen.trim().to_string(),
        });
    }
    hits
}
/// Reduces an HTML document to plain text.
///
/// Steps, in order: remove `<script>`/`<style>` elements together with their
/// contents, replace every remaining tag with a space, decode a small set of
/// common character references, then collapse whitespace runs to a single
/// space and trim the ends.
///
/// Typographic quotes, whether written as entities or as literal characters,
/// are normalised to ASCII `'` and `"`. Character references outside the table
/// below are left untouched.
pub(super) fn html_to_text(html: &str) -> String {
    // Written with explicit entity names and `\u{...}` escapes so the table
    // survives tools that decode HTML entities or re-encode the file.
    //
    // `&amp;` is deliberately LAST: decoding it earlier would turn escaped
    // text such as `&amp;lt;` into `&lt;` and then into `<` (double decoding).
    const ENTITIES: &[(&str, &str)] = &[
        ("&nbsp;", " "),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&rsquo;", "'"),
        ("&ldquo;", "\""),
        ("&rdquo;", "\""),
        ("\u{2019}", "'"),
        ("\u{201C}", "\""),
        ("\u{201D}", "\""),
        ("&amp;", "&"),
    ];

    let stripped = SCRIPT_STYLE.replace_all(html, " ");
    let no_tags = TAGS.replace_all(&stripped, " ");

    let mut decoded = no_tags.into_owned();
    for &(entity, plain) in ENTITIES {
        // Skip the allocation when the entity does not occur at all.
        if decoded.contains(entity) {
            decoded = decoded.replace(entity, plain);
        }
    }

    WS.replace_all(&decoded, " ").trim().to_string()
}
use std::sync::LazyLock;
/// Matches a whole `<script>` or `<style>` element, tags and contents included.
///
/// `(?is)` makes the match case-insensitive and lets `.` cross newlines; `.*?` is
/// lazy so the match ends at the first closing tag. The opening and closing
/// alternations are independent, so either closing tag ends either element.
static SCRIPT_STYLE: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?is)<(script|style)[^>]*>.*?</(script|style)>").unwrap());
/// Matches any `<...>` tag: a `<`, one or more non-`>` characters, then `>`.
static TAGS: LazyLock<regex::Regex> = LazyLock::new(|| regex::Regex::new(r"(?s)<[^>]+>").unwrap());
/// Matches a run of one or more whitespace characters; `html_to_text` replaces each run with one space.
static WS: LazyLock<regex::Regex> = LazyLock::new(|| regex::Regex::new(r"\s+").unwrap());
#[cfg(test)]
mod tests {
    use super::*;

    /// `available` stays false until search is enabled and the chosen provider
    /// has its required, non-blank setting.
    #[test]
    fn availability_gates() {
        let mut c = WebConfig::default();
        assert!(!available(&c));

        c.enabled = true;
        c.provider = "tavily".into();
        assert!(!available(&c));
        c.api_key = "   ".into();
        assert!(!available(&c));
        c.api_key = "k".into();
        assert!(available(&c));

        c.provider = "searxng".into();
        c.api_key.clear();
        assert!(!available(&c));
        c.endpoint = "https://searx.example".into();
        assert!(available(&c));

        c.provider = "bing".into();
        assert!(!available(&c));
    }

    /// Script/style bodies and tags are removed, `&amp;` is decoded, and
    /// whitespace is collapsed.
    #[test]
    fn html_strip() {
        let h = "<html><head><style>x{}</style></head><body>Hello <b>World</b> &amp; <script>alert(1)</script>more</body></html>";
        let t = html_to_text(h);
        assert_eq!(t, "Hello World & more");
        assert!(!t.contains("alert"));
        assert!(!t.contains("x{}"));
    }

    /// With `prefer_raw`, `raw_content` wins when present; otherwise the snippet is used.
    #[test]
    fn parse_prefers_raw_content() {
        let j = serde_json::json!({
            "results": [
                {"title":"T","url":"http://a","content":"snippet","raw_content":"full text"},
                {"title":"U","url":"http://b","content":"only snippet"}
            ]
        });
        let r = parse_results(&j, true);
        assert_eq!(r.len(), 2);
        assert_eq!(r[0].text, "full text");
        assert_eq!(r[1].text, "only snippet");
        assert_eq!(parse_results(&j, false)[0].text, "snippet");
    }

    /// Entries without a `url` are dropped, missing fields default to empty
    /// strings, and a response without `results` yields nothing.
    #[test]
    fn parse_skips_malformed_entries() {
        let j = serde_json::json!({
            "results": [
                {"title": "no url"},
                {"url": "http://c"}
            ]
        });
        let r = parse_results(&j, true);
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].url, "http://c");
        assert_eq!(r[0].title, "");
        assert_eq!(r[0].text, "");
        assert!(parse_results(&serde_json::json!({}), true).is_empty());
    }
}