use crate::{client::WebClient, error::WebToolError};
use chrono::{DateTime, Utc};
use riglr_macros::tool;
use schemars::JsonSchema;
use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tracing::{debug, info, warn};
const EXA_API_KEY: &str = "EXA_API_KEY";
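/// Configuration for the Exa-backed web search tools: API key, base URL,
/// and default limits for result count, timeouts, and extracted content.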
#[derive(Debug, Clone)]
pub struct WebSearchConfig {
pub exa_api_key: String,
pub exa_base_url: String,
pub max_results: u32,
pub timeout_seconds: u64,
pub include_content: bool,
pub content_limit: usize,
}
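/// A single enriched search result: core fields from the Exa response plus
/// derived data (domain info, page metadata, content type, reading time).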
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchResult {
pub id: String,
pub title: String,
pub url: String,
pub description: Option<String>,
pub content: Option<String>,
pub summary: Option<String>,
pub published_date: Option<DateTime<Utc>>,
pub domain: DomainInfo,
pub metadata: PageMetadata,
pub relevance_score: f64,
pub content_type: ContentType,
pub language: Option<String>,
pub reading_time_minutes: Option<u32>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct DomainInfo {
pub name: String,
pub reputation_score: Option<u32>,
pub category: Option<String>,
pub is_trusted: bool,
pub authority_score: Option<u32>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct PageMetadata {
pub author: Option<String>,
pub tags: Vec<String>,
pub social_meta: SocialMetadata,
pub seo_meta: SeoMetadata,
pub canonical_url: Option<String>,
pub last_modified: Option<DateTime<Utc>>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SocialMetadata {
pub og_title: Option<String>,
pub og_description: Option<String>,
pub og_image: Option<String>,
pub twitter_card: Option<String>,
pub twitter_site: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SeoMetadata {
pub meta_description: Option<String>,
pub meta_keywords: Vec<String>,
pub robots: Option<String>,
pub schema_types: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentType {
pub primary: String,
pub format: String,
pub is_paywalled: Option<bool>,
pub quality_score: Option<u32>,
    pub length_category: String,
}
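/// The full output of a web search: the query, enriched results, aggregate
/// metadata, and computed insights.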
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct WebSearchResult {
pub query: String,
pub search_type: String,
pub results: Vec<SearchResult>,
pub metadata: WebSearchMetadata,
pub insights: SearchInsights,
pub searched_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct WebSearchMetadata {
pub total_results: u32,
pub returned_results: u32,
pub execution_time_ms: u32,
pub filtered: bool,
pub related_queries: Vec<String>,
pub top_domains: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchInsights {
pub common_topics: Vec<String>,
    pub date_distribution: HashMap<String, u32>,
    pub content_types: HashMap<String, u32>,
pub avg_quality_score: Option<f64>,
pub languages: HashMap<String, u32>,
pub sentiment: Option<SearchSentiment>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchSentiment {
pub overall_sentiment: f64,
pub distribution: SentimentDistribution,
    pub most_positive: Option<String>,
    pub most_negative: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SentimentDistribution {
pub positive_pct: f64,
pub neutral_pct: f64,
pub negative_pct: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentSummary {
pub url: String,
pub title: String,
pub executive_summary: String,
pub key_points: Vec<String>,
pub entities: Vec<ContentEntity>,
pub topics: Vec<String>,
pub confidence: f64,
pub generated_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentEntity {
pub name: String,
pub entity_type: String,
pub confidence: f64,
pub context: String,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SimilarPagesResult {
pub source_url: String,
pub similar_pages: Vec<SearchResult>,
pub similarity_metadata: SimilarityMetadata,
pub searched_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SimilarityMetadata {
pub avg_similarity: f64,
pub method: String,
pub common_themes: Vec<String>,
pub content_overlap: f64,
}
impl Default for WebSearchConfig {
fn default() -> Self {
Self {
exa_api_key: String::default(),
exa_base_url: "https://api.exa.ai".to_string(),
max_results: 20,
timeout_seconds: 30,
include_content: true,
content_limit: 5000,
}
}
}
impl WebSearchConfig {
#[allow(dead_code)]
fn from_context(context: &riglr_core::provider::ApplicationContext) -> Self {
Self {
exa_api_key: context
.config
.providers
.exa_api_key
.clone()
.unwrap_or_default(),
exa_base_url: "https://api.exa.ai".to_string(),
max_results: 20,
timeout_seconds: 30,
include_content: true,
content_limit: 5000,
}
}
}
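/// Core web search implementation against the Exa `/search` endpoint.
///
/// Requires `providers.exa_api_key` in the application config; optional
/// filters narrow by domain, publish-date window, and content category.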
pub async fn search_web_with_context(
query: String,
max_results: Option<u32>,
include_content: Option<bool>,
domain_filter: Option<Vec<String>>,
    date_filter: Option<String>,
    content_type_filter: Option<String>,
    app_context: &riglr_core::provider::ApplicationContext,
) -> crate::error::Result<WebSearchResult> {
    let started = std::time::Instant::now();
debug!(
"Performing web search for query: '{}' with {} max results",
query,
max_results.unwrap_or(20)
);
let exa_api_key = app_context
.config
.providers
.exa_api_key
.clone()
.ok_or_else(|| {
WebToolError::Config(
"EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
)
})?;
let config = WebSearchConfig::default();
let client = WebClient::default().with_exa_key(exa_api_key.clone());
let mut params = HashMap::default();
params.insert("query".to_string(), query.clone());
params.insert(
"num_results".to_string(),
max_results.unwrap_or(20).to_string(),
);
params.insert(
"include_content".to_string(),
include_content.unwrap_or(true).to_string(),
);
params.insert("search_type".to_string(), "semantic".to_string());
if let Some(ref domains) = domain_filter {
params.insert("include_domains".to_string(), domains.join(","));
}
if let Some(ref date) = date_filter {
params.insert("start_published_date".to_string(), format_date_filter(date));
}
if let Some(content_type) = content_type_filter {
params.insert("category".to_string(), content_type);
}
let url = format!("{}/search", config.exa_base_url);
let mut headers = HashMap::default();
headers.insert("x-api-key".to_string(), exa_api_key.clone());
headers.insert("accept".to_string(), "application/json".to_string());
let response = client
        .get_with_params_and_headers(&url, &params, headers)
.await
.map_err(|e| {
if e.to_string().contains("timeout") || e.to_string().contains("connection") {
WebToolError::Network(format!("Web search request failed: {}", e))
} else {
WebToolError::Config(format!("Web search request failed: {}", e))
}
})?;
let results = parse_exa_search_response(&response, &query)
.await
.map_err(|e| WebToolError::Config(format!("Failed to parse search response: {}", e)))?;
let insights = analyze_search_results(&results)
.await
.map_err(|e| WebToolError::Config(format!("Failed to analyze results: {}", e)))?;
let search_result = WebSearchResult {
query: query.clone(),
search_type: "semantic".to_string(),
results: results.clone(),
metadata: WebSearchMetadata {
total_results: results.len() as u32,
returned_results: results.len() as u32,
            execution_time_ms: started.elapsed().as_millis() as u32,
            filtered: domain_filter.is_some() || date_filter.is_some(),
related_queries: generate_related_queries(&query).await.map_err(|e| {
WebToolError::Config(format!("Failed to generate related queries: {}", e))
})?,
top_domains: extract_top_domains(&results),
},
insights,
searched_at: Utc::now(),
};
info!(
"Web search completed: {} results for '{}'",
results.len(),
query
);
Ok(search_result)
}
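/// `#[tool]` wrapper around [`search_web_with_context`]; see that function
/// for parameter semantics and error behavior.
///
/// A minimal usage sketch (assumes an `ApplicationContext` whose config
/// provides `providers.exa_api_key`; `app_context` is illustrative and the
/// block is not compiled as a doctest):
///
/// ```ignore
/// let result = search_web(
///     &app_context,
///     "rust async runtimes".to_string(),
///     Some(10),    // max_results
///     Some(false), // include_content
///     None,        // domain_filter
///     None,        // date_filter ("24h", "week", "month", "year")
///     None,        // content_type_filter
/// )
/// .await?;
/// println!("{} results", result.results.len());
/// ```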
#[tool]
pub async fn search_web(
context: &riglr_core::provider::ApplicationContext,
query: String,
max_results: Option<u32>,
include_content: Option<bool>,
domain_filter: Option<Vec<String>>,
    date_filter: Option<String>,
    content_type_filter: Option<String>,
) -> crate::error::Result<WebSearchResult> {
search_web_with_context(
query,
max_results,
include_content,
domain_filter,
date_filter,
content_type_filter,
context,
)
.await
}
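/// Finds pages semantically similar to `source_url` via the Exa
/// `/find_similar` endpoint, optionally thresholded by similarity score.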
#[tool]
pub async fn find_similar_pages(
context: &riglr_core::provider::ApplicationContext,
source_url: String,
max_results: Option<u32>,
include_content: Option<bool>,
similarity_threshold: Option<f64>,
) -> crate::error::Result<SimilarPagesResult> {
debug!("Finding pages similar to: {}", source_url);
let exa_api_key = context
.config
.providers
.exa_api_key
.clone()
.ok_or_else(|| {
WebToolError::Config(
"EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
)
})?;
let client = WebClient::default().with_exa_key(exa_api_key.clone());
let mut params = HashMap::default();
params.insert("url".to_string(), source_url.clone());
params.insert(
"num_results".to_string(),
max_results.unwrap_or(10).to_string(),
);
params.insert(
"include_content".to_string(),
include_content.unwrap_or(true).to_string(),
);
if let Some(threshold) = similarity_threshold {
params.insert("similarity_threshold".to_string(), threshold.to_string());
}
let config = WebSearchConfig::default();
let url = format!("{}/find_similar", config.exa_base_url);
let mut headers = HashMap::default();
headers.insert("x-api-key".to_string(), exa_api_key.clone());
headers.insert("accept".to_string(), "application/json".to_string());
let response = client
        .get_with_params_and_headers(&url, &params, headers)
.await
.map_err(|e| {
if e.to_string().contains("timeout") || e.to_string().contains("connection") {
WebToolError::Network(format!("Web search request failed: {}", e))
} else {
WebToolError::Config(format!("Web search request failed: {}", e))
}
})?;
let similar_pages = parse_similar_pages_response(&response)
.await
.map_err(|e| WebToolError::Config(format!("Failed to parse similar pages: {}", e)))?;
let similarity_metadata = analyze_similarity(&similar_pages)
.await
.map_err(|e| WebToolError::Config(format!("Failed to analyze similarity: {}", e)))?;
let result = SimilarPagesResult {
source_url: source_url.clone(),
similar_pages: similar_pages.clone(),
similarity_metadata,
searched_at: Utc::now(),
};
info!(
"Found {} similar pages to {}",
similar_pages.len(),
source_url
);
Ok(result)
}
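/// Fetches each URL and builds an extractive summary (ranked sentences, key
/// points, naive proper-noun entities). URLs that fail to fetch are skipped
/// with a warning instead of failing the batch; `_include_quotes` is
/// accepted but currently unused.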
#[tool]
pub async fn summarize_web_content(
context: &riglr_core::provider::ApplicationContext,
urls: Vec<String>,
    summary_length: Option<String>,
    focus_topics: Option<Vec<String>>,
_include_quotes: Option<bool>,
) -> crate::error::Result<Vec<ContentSummary>> {
debug!("Summarizing content from {} URLs", urls.len());
    // Prefer the configured key, matching the other tools in this module;
    // fall back to the EXA_API_KEY environment variable.
    let exa_api_key = context
        .config
        .providers
        .exa_api_key
        .clone()
        .unwrap_or_else(|| std::env::var(EXA_API_KEY).unwrap_or_default());
let client = WebClient::default().with_exa_key(exa_api_key);
    let total_urls = urls.len();
    let mut summaries = Vec::new();
for url in urls {
match extract_and_summarize_page(&client, &url, &summary_length, &focus_topics).await {
Ok(summary) => {
summaries.push(summary);
}
Err(e) => {
warn!("Failed to summarize {}: {}", url, e);
}
}
}
info!(
"Successfully summarized {} out of {} pages",
summaries.len(),
        total_urls
);
Ok(summaries)
}
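/// Searches recent news for `topic` within `time_window` ("24h"/"day",
/// "week", "month", "year"; defaults to "week"), sorted newest first.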
#[tool]
pub async fn search_recent_news(
context: &riglr_core::provider::ApplicationContext,
topic: String,
    time_window: Option<String>,
    source_types: Option<Vec<String>>,
    max_results: Option<u32>,
include_analysis: Option<bool>,
) -> crate::error::Result<WebSearchResult> {
    let started = std::time::Instant::now();
debug!(
"Searching recent news for topic: '{}' within {}",
topic,
time_window.as_deref().unwrap_or("week")
);
let exa_api_key = context
.config
.providers
.exa_api_key
.clone()
.ok_or_else(|| {
WebToolError::Config(
"EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
)
})?;
let client = WebClient::default().with_exa_key(exa_api_key.clone());
let mut params = HashMap::default();
params.insert("query".to_string(), topic.clone());
params.insert("search_type".to_string(), "news".to_string());
params.insert(
"num_results".to_string(),
max_results.unwrap_or(30).to_string(),
);
params.insert("include_content".to_string(), "true".to_string());
let time_window = time_window.unwrap_or_else(|| "week".to_string());
params.insert(
"start_published_date".to_string(),
format_date_filter(&time_window),
);
if let Some(sources) = source_types {
if sources.contains(&"news".to_string()) {
params.insert("category".to_string(), "news".to_string());
}
}
let config = WebSearchConfig::default();
let url = format!("{}/search", config.exa_base_url);
let mut headers = HashMap::default();
headers.insert("x-api-key".to_string(), exa_api_key.clone());
headers.insert("accept".to_string(), "application/json".to_string());
let response = client
        .get_with_params_and_headers(&url, &params, headers)
.await
.map_err(|e| {
if e.to_string().contains("timeout") || e.to_string().contains("connection") {
WebToolError::Network(format!("Web search request failed: {}", e))
} else {
WebToolError::Config(format!("Web search request failed: {}", e))
}
})?;
let mut results = parse_exa_search_response(&response, &topic)
.await
.map_err(|e| WebToolError::Config(format!("Failed to parse news response: {}", e)))?;
    // Newest first; undated results sort last (None < Some(_) for Option).
    // The previous comparator substituted Utc::now() per comparison, which
    // is not a consistent ordering.
    results.sort_by(|a, b| b.published_date.cmp(&a.published_date));
let insights = if include_analysis.unwrap_or(true) {
analyze_news_results(&results)
.await
.map_err(|e| WebToolError::Config(format!("Failed to analyze news: {}", e)))?
} else {
SearchInsights {
common_topics: vec![],
date_distribution: HashMap::default(),
content_types: HashMap::default(),
avg_quality_score: None,
languages: HashMap::default(),
sentiment: None,
}
};
let search_result = WebSearchResult {
query: topic.clone(),
search_type: "news".to_string(),
results: results.clone(),
metadata: WebSearchMetadata {
total_results: results.len() as u32,
returned_results: results.len() as u32,
            execution_time_ms: started.elapsed().as_millis() as u32,
filtered: true,
related_queries: generate_related_queries(&topic).await.map_err(|e| {
WebToolError::Config(format!("Failed to generate related queries: {}", e))
})?,
top_domains: extract_top_domains(&results),
},
insights,
searched_at: Utc::now(),
};
info!(
"Recent news search completed: {} results for '{}'",
search_result.results.len(),
topic
);
Ok(search_result)
}
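/// Parses an Exa search response body into [`SearchResult`]s. Entries
/// without a URL are skipped; a missing relevance score defaults to 0.8,
/// and reading time is estimated at 200 words per minute.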
async fn parse_exa_search_response(
response: &str,
query: &str,
) -> crate::error::Result<Vec<SearchResult>> {
let json: serde_json::Value = serde_json::from_str(response)
.map_err(|e| WebToolError::Parsing(format!("Invalid Exa JSON: {}", e)))?;
let mut out = Vec::new();
let results = json
.get("results")
.and_then(|v| v.as_array())
.cloned()
.unwrap_or_default();
for r in results {
let title = r
.get("title")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let url = r
.get("url")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
if url.is_empty() {
continue;
}
let id = r
.get("id")
.and_then(|v| v.as_str())
.unwrap_or(url.as_str())
.to_string();
let description = r
.get("description")
.or_else(|| r.get("snippet"))
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let content = r
.get("text")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let published_date = r
.get("publishedDate")
.or_else(|| r.get("published_date"))
.and_then(|v| v.as_str())
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
.map(|dt| dt.with_timezone(&Utc));
let domain_name = url::Url::parse(&url)
.ok()
.and_then(|u| u.host_str().map(|h| h.to_string()))
.unwrap_or_default();
let score = r.get("score").and_then(|v| v.as_f64()).unwrap_or(0.8);
let language = r
.get("language")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let author = r
.get("author")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let words = content
.as_ref()
.map(|c| c.split_whitespace().count() as u32)
.unwrap_or(0);
let reading_time = if words > 0 {
Some((words as f64 / 200.0).ceil() as u32)
} else {
None
};
let length_category = match words {
0..=200 => "Short",
201..=800 => "Medium",
801..=2000 => "Long",
_ => "Very Long",
}
.to_string();
let content_type = ContentType {
primary: "Article".to_string(),
format: if url.to_lowercase().ends_with(".pdf") {
"PDF".to_string()
} else {
"HTML".to_string()
},
is_paywalled: None,
quality_score: Some(((score * 100.0) as u32).min(100)),
length_category,
};
let metadata = PageMetadata {
author,
tags: vec![query.to_lowercase()],
social_meta: SocialMetadata {
og_title: None,
og_description: None,
og_image: None,
twitter_card: None,
twitter_site: None,
},
seo_meta: SeoMetadata {
meta_description: description.clone(),
meta_keywords: vec![],
robots: None,
schema_types: vec![],
},
canonical_url: None,
last_modified: None,
};
let domain = DomainInfo {
name: domain_name,
reputation_score: None,
category: None,
is_trusted: true,
authority_score: None,
};
out.push(SearchResult {
id,
title,
url,
description,
content,
summary: None,
published_date,
domain,
metadata,
relevance_score: score,
content_type,
language,
reading_time_minutes: reading_time,
});
}
Ok(out)
}
async fn parse_similar_pages_response(response: &str) -> crate::error::Result<Vec<SearchResult>> {
parse_exa_search_response(response, "").await
}
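/// Fetches a page and produces an extractive [`ContentSummary`]: the main
/// content block is located via common container selectors, sentences are
/// ranked by normalized term frequency (with position, focus-topic, and
/// heading bonuses), and a diverse top-N becomes the executive summary.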
async fn extract_and_summarize_page(
client: &WebClient,
url: &str,
summary_length: &Option<String>,
focus_topics: &Option<Vec<String>>,
) -> crate::error::Result<ContentSummary> {
let html = client
.get(url)
.await
.map_err(|e| WebToolError::Network(format!("Failed to fetch {}: {}", url, e)))?;
let (title, clean_text, sentences, headings) = extract_main_content(&html, url);
let n = match summary_length.as_deref() {
Some("comprehensive") => 8,
Some("detailed") => 5,
_ => 3,
} as usize;
let topic_set: std::collections::HashSet<String> = focus_topics
.clone()
.unwrap_or_default()
.into_iter()
.map(|t| t.to_lowercase())
.collect();
let ranked = rank_sentences(&sentences, &clean_text, &topic_set, &headings);
let selected = select_diverse(&ranked, n, 0.6);
let executive_summary = selected.join(" ");
let mut key_points = selected.iter().take(5).cloned().collect::<Vec<_>>();
if key_points.is_empty() && !headings.is_empty() {
key_points = headings.iter().take(5).cloned().collect();
}
let topics = if !topic_set.is_empty() {
topic_set.iter().cloned().collect()
} else {
extract_topics_from_text(&clean_text)
};
let entity_re = regex::Regex::new(r"(?m)(?:^|\s)([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})").unwrap();
let mut entities: Vec<ContentEntity> = entity_re
.captures_iter(&clean_text)
.map(|cap| ContentEntity {
name: cap[1].trim().to_string(),
entity_type: "ProperNoun".to_string(),
confidence: 0.55,
context: "".to_string(),
})
.collect();
    // dedup_by only removes adjacent duplicates; use a seen-set so repeated
    // entities are dropped regardless of position while preserving order.
    let mut seen = std::collections::HashSet::new();
    entities.retain(|e| seen.insert(e.name.to_lowercase()));
    entities.truncate(8);
let mut confidence = (clean_text.len().min(8000) as f64 / 8000.0) * 0.6 + 0.3;
if !headings.is_empty() {
confidence += 0.05;
}
confidence = confidence.min(0.97);
Ok(ContentSummary {
url: url.to_string(),
title,
executive_summary,
key_points,
entities,
topics,
confidence,
generated_at: Utc::now(),
})
}
fn extract_main_content(
html: &str,
fallback_url: &str,
) -> (String, String, Vec<String>, Vec<String>) {
let document = Html::parse_document(html);
let sel_meta_title = Selector::parse("meta[property=\"og:title\"]").unwrap();
let title = document
.select(&sel_meta_title)
.filter_map(|el| el.value().attr("content"))
.map(|s| s.trim().to_string())
.find(|s| !s.is_empty())
.or_else(|| {
let sel_title = Selector::parse("title").unwrap();
document
.select(&sel_title)
.next()
.map(|e| e.text().collect::<String>().trim().to_string())
})
.unwrap_or_else(|| fallback_url.to_string());
let candidates = vec![
"article",
"main",
"div#content",
"div#main",
"div.post-content",
"div.article-content",
"section.article",
"div.entry-content",
"div#main-content",
];
let mut best_text = String::default();
let mut best_headings: Vec<String> = Vec::new();
for css in candidates {
if let Ok(sel) = Selector::parse(css) {
for node in document.select(&sel) {
let (text, headings) = extract_text_from_node(node);
if text.len() > best_text.len() {
best_text = text;
best_headings = headings;
}
}
}
}
if best_text.is_empty() {
if let Ok(sel) = Selector::parse("body") {
if let Some(body) = document.select(&sel).next() {
let (text, headings) = extract_text_from_node(body);
best_text = text;
best_headings = headings;
}
}
}
let sentences: Vec<String> = split_sentences(&best_text)
.into_iter()
.filter(|s| s.split_whitespace().count() >= 5)
.collect();
(title, best_text, sentences, best_headings)
}
fn extract_text_from_node(root: ElementRef) -> (String, Vec<String>) {
let sel_exclude = [
"script", "style", "noscript", "template", "header", "footer", "nav", "aside",
];
let sel_p = Selector::parse("p, li").unwrap();
let sel_h = Selector::parse("h1, h2, h3").unwrap();
let mut headings: Vec<String> = root
.select(&sel_h)
.map(|h| normalize_whitespace(&h.text().collect::<String>()))
.filter(|s| !s.is_empty())
.collect();
headings.dedup();
let mut blocks: Vec<String> = Vec::new();
for p in root.select(&sel_p) {
if has_excluded_ancestor(p, &sel_exclude) {
continue;
}
let txt = normalize_whitespace(&p.text().collect::<String>());
if txt.len() >= 40 {
blocks.push(txt);
}
}
let full = blocks.join("\n");
(full, headings)
}
fn has_excluded_ancestor(node: ElementRef, excluded: &[&str]) -> bool {
    // Walk every element ancestor and reject content nested inside page
    // chrome (script/style/nav/etc.).
    node.ancestors()
        .filter_map(ElementRef::wrap)
        .any(|ancestor| excluded.contains(&ancestor.value().name()))
}
fn normalize_whitespace(s: &str) -> String {
let s = html_escape::decode_html_entities(s);
let re = regex::Regex::new(r"\s+").unwrap();
re.replace_all(&s, " ").trim().to_string()
}
fn split_sentences(text: &str) -> Vec<String> {
let mut v = Vec::new();
let mut current = String::default();
for ch in text.chars() {
current.push(ch);
if matches!(ch, '.' | '!' | '?') {
            let s = normalize_whitespace(&current);
if !s.is_empty() {
v.push(s);
}
current.clear();
}
}
if !current.trim().is_empty() {
        v.push(normalize_whitespace(&current));
}
v
}
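/// Scores each sentence by the sum of normalized term frequencies of its
/// words, dampened by sentence length, with bonuses for early position,
/// focus-topic matches, and overlap with headings; returns sentences in
/// descending score order.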
fn rank_sentences(
sentences: &[String],
full_text: &str,
topics: &std::collections::HashSet<String>,
headings: &[String],
) -> Vec<(String, f64)> {
let mut tf: HashMap<String, f64> = HashMap::default();
for w in full_text.split(|c: char| !c.is_alphanumeric()) {
let w = w.to_lowercase();
if w.len() < 3 {
continue;
}
*tf.entry(w).or_insert(0.0) += 1.0;
}
let max_tf = tf.values().cloned().fold(1.0, f64::max);
for v in tf.values_mut() {
*v /= max_tf;
}
let heading_text = headings.join(" ").to_lowercase();
let mut scored: Vec<(String, f64)> = sentences
.iter()
.enumerate()
.map(|(i, s)| {
let words: Vec<String> = s
.split(|c: char| !c.is_alphanumeric())
.map(|w| w.to_lowercase())
.filter(|w| w.len() >= 3)
.collect();
let mut score = 0.0;
for w in &words {
score += *tf.get(w).unwrap_or(&0.0);
}
let len = s.split_whitespace().count() as f64;
if len > 0.0 {
score /= len.powf(0.3);
}
score += 0.15 * (1.0 / ((i + 1) as f64).sqrt());
if !topics.is_empty() {
let lower = s.to_lowercase();
for t in topics {
if lower.contains(t) {
score += 0.25;
}
}
}
for h in headings {
if s.to_lowercase().contains(&h.to_lowercase()) {
score += 0.2;
break;
}
}
if !heading_text.is_empty() {
let overlap = jaccard(&s.to_lowercase(), &heading_text);
score += 0.1 * overlap;
}
(s.clone(), score)
})
.collect();
scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
scored
}
fn jaccard(a: &str, b: &str) -> f64 {
let set_a: std::collections::HashSet<_> = a.split_whitespace().collect();
let set_b: std::collections::HashSet<_> = b.split_whitespace().collect();
let inter = set_a.intersection(&set_b).count() as f64;
let union = set_a.union(&set_b).count() as f64;
if union == 0.0 {
0.0
} else {
inter / union
}
}
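/// Greedily takes up to `k` top-ranked sentences, skipping any whose
/// Jaccard word overlap with an already-selected sentence is `max_sim`
/// or higher.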
fn select_diverse(scored: &[(String, f64)], k: usize, max_sim: f64) -> Vec<String> {
let mut out: Vec<String> = Vec::new();
for (s, _) in scored {
if out.len() >= k {
break;
}
if out
.iter()
.all(|t| jaccard(&s.to_lowercase(), &t.to_lowercase()) < max_sim)
{
out.push(s.clone());
}
}
out
}
async fn analyze_search_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
let mut content_types = HashMap::default();
let mut languages = HashMap::default();
let mut date_distribution = HashMap::default();
let mut topics = Vec::new();
for result in results {
*content_types
.entry(result.content_type.primary.clone())
.or_insert(0) += 1;
if let Some(lang) = &result.language {
*languages.entry(lang.clone()).or_insert(0) += 1;
}
if let Some(pub_date) = result.published_date {
let days_ago = (Utc::now() - pub_date).num_days();
let category = match days_ago {
0..=1 => "today",
2..=7 => "this_week",
8..=30 => "this_month",
_ => "older",
};
*date_distribution.entry(category.to_string()).or_insert(0) += 1;
}
topics.extend(result.metadata.tags.clone());
}
let quality_scores: Vec<u32> = results
.iter()
.filter_map(|r| r.content_type.quality_score)
.collect();
let avg_quality_score = if !quality_scores.is_empty() {
Some(quality_scores.iter().sum::<u32>() as f64 / quality_scores.len() as f64)
} else {
None
};
    // Tags repeat per result (each carries the query), so dedup before
    // reporting them as common topics.
    topics.sort();
    topics.dedup();
    Ok(SearchInsights {
        common_topics: topics,
        date_distribution,
        content_types,
        avg_quality_score,
        languages,
        sentiment: None,
    })
}
async fn analyze_news_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
analyze_search_results(results).await
}
async fn analyze_similarity(results: &[SearchResult]) -> crate::error::Result<SimilarityMetadata> {
    // Guard against an empty result set so the average is 0.0 rather than NaN.
    let avg_similarity = if results.is_empty() {
        0.0
    } else {
        results.iter().map(|r| r.relevance_score).sum::<f64>() / results.len() as f64
    };
    let common_themes = results
        .iter()
        .flat_map(|r| r.metadata.tags.clone())
        .filter(|t| !t.is_empty())
        .collect::<std::collections::HashSet<_>>()
        .into_iter()
        .collect();
Ok(SimilarityMetadata {
avg_similarity,
method: "semantic_embeddings".to_string(),
common_themes,
        content_overlap: 0.75,
    })
}
async fn generate_related_queries(query: &str) -> crate::error::Result<Vec<String>> {
let mut variants = vec![
format!("{} news", query),
format!("{} latest", query),
format!("{} guide", query),
format!("{} tutorial", query),
format!("{} best practices", query),
format!("{} examples", query),
format!("how to {}", query),
format!("{} vs alternatives", query),
format!("{} 2025 trends", query),
];
variants.sort();
variants.dedup();
Ok(variants)
}
fn extract_top_domains(results: &[SearchResult]) -> Vec<String> {
let mut domain_counts: HashMap<String, u32> = HashMap::default();
for result in results {
*domain_counts.entry(result.domain.name.clone()).or_insert(0) += 1;
}
let mut domains: Vec<(String, u32)> = domain_counts.into_iter().collect();
domains.sort_by(|a, b| b.1.cmp(&a.1));
domains
.into_iter()
.take(10)
.map(|(domain, _)| domain)
.collect()
}
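/// Maps a human-friendly time window ("24h"/"day", "week", "month",
/// "year") to a `YYYY-MM-DD` start date that many days in the past;
/// unrecognized values default to one week.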
fn format_date_filter(window: &str) -> String {
let days_ago = match window {
"24h" | "day" => 1,
"week" => 7,
"month" => 30,
"year" => 365,
_ => 7,
};
let date = Utc::now() - chrono::Duration::days(days_ago);
date.format("%Y-%m-%d").to_string()
}
fn extract_topics_from_text(text: &str) -> Vec<String> {
let stopwords = [
"the", "and", "for", "with", "that", "this", "from", "have", "your", "you", "are", "was",
"were", "has", "had", "not", "but", "all", "any", "can", "will", "just", "into", "about",
"over", "more", "than", "when", "what", "how", "why", "where", "then", "them", "they",
"their", "its", "it's", "as", "of", "in", "on", "to", "by", "at", "or", "an", "be",
];
let mut counts: HashMap<String, u32> = HashMap::default();
for w in text.split(|c: char| !c.is_alphanumeric()) {
let w = w.to_lowercase();
if w.len() < 4 {
continue;
}
if stopwords.contains(&w.as_str()) {
continue;
}
*counts.entry(w).or_insert(0) += 1;
}
let mut v: Vec<(String, u32)> = counts.into_iter().collect();
v.sort_by(|a, b| b.1.cmp(&a.1));
v.into_iter().take(5).map(|(k, _)| k).collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_web_search_config_default() {
let config = WebSearchConfig::default();
assert_eq!(config.exa_base_url, "https://api.exa.ai");
assert_eq!(config.max_results, 20);
}
#[test]
fn test_search_result_serialization() {
let result = SearchResult {
id: "1".to_string(),
title: "Test Page".to_string(),
url: "https://example.com".to_string(),
description: Some("Test description".to_string()),
content: Some("Test content".to_string()),
summary: None,
published_date: Some(Utc::now()),
domain: DomainInfo {
name: "example.com".to_string(),
reputation_score: Some(80),
category: Some("Test".to_string()),
is_trusted: true,
authority_score: Some(70),
},
metadata: PageMetadata {
author: None,
tags: vec!["test".to_string()],
social_meta: SocialMetadata {
og_title: None,
og_description: None,
og_image: None,
twitter_card: None,
twitter_site: None,
},
seo_meta: SeoMetadata {
meta_description: None,
meta_keywords: vec![],
robots: None,
schema_types: vec![],
},
canonical_url: None,
last_modified: None,
},
relevance_score: 0.8,
content_type: ContentType {
primary: "Article".to_string(),
format: "HTML".to_string(),
is_paywalled: Some(false),
quality_score: Some(75),
length_category: "Medium".to_string(),
},
language: Some("en".to_string()),
reading_time_minutes: Some(5),
};
let json = serde_json::to_string(&result).unwrap();
assert!(json.contains("Test Page"));
}
#[test]
fn test_format_date_filter() {
let result = format_date_filter("week");
assert!(!result.is_empty());
        assert_eq!(result.len(), 10); // "%Y-%m-%d" always renders 10 chars
    }
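    // A few extra behavioral checks for the pure text helpers. These encode
    // the current heuristics (word-set Jaccard, greedy diverse selection,
    // punctuation-based sentence splitting) and would need updating if the
    // thresholds or tokenization change.
    #[test]
    fn test_jaccard_word_overlap() {
        assert_eq!(jaccard("a b c", "a b c"), 1.0);
        assert_eq!(jaccard("a b", "c d"), 0.0);
        assert_eq!(jaccard("", ""), 0.0); // empty union is defined as 0.0
    }
    #[test]
    fn test_select_diverse_skips_near_duplicates() {
        let scored = vec![
            ("the quick brown fox".to_string(), 1.0),
            ("the quick brown fox".to_string(), 0.9),
            ("a completely different sentence".to_string(), 0.8),
        ];
        let picked = select_diverse(&scored, 2, 0.6);
        assert_eq!(picked.len(), 2);
        assert_ne!(picked[0], picked[1]);
    }
    #[test]
    fn test_split_sentences_basic() {
        let v = split_sentences("First sentence here. Second one! A third?");
        assert_eq!(v, vec!["First sentence here.", "Second one!", "A third?"]);
    }
    #[test]
    fn test_extract_topics_ranks_by_frequency() {
        let topics =
            extract_topics_from_text("tokenizer tokenizer tokenizer parser parser lexer");
        assert_eq!(topics[0], "tokenizer");
        assert!(topics.contains(&"parser".to_string()));
    }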
}