1use crate::{client::WebClient, error::WebToolError};
7use chrono::{DateTime, Utc};
8use riglr_macros::tool;
9use schemars::JsonSchema;
10use scraper::{ElementRef, Html, Selector};
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use tracing::{debug, info, warn};
14
/// Name of the environment variable consulted as a fallback source for the Exa API key.
const EXA_API_KEY: &str = "EXA_API_KEY";
16
/// Configuration for web search operations backed by the Exa API.
#[derive(Debug, Clone)]
pub struct WebSearchConfig {
    /// API key used to authenticate against Exa.
    pub exa_api_key: String,
    /// Base URL of the Exa API (defaults to "https://api.exa.ai").
    pub exa_base_url: String,
    /// Maximum number of results to request per search.
    pub max_results: u32,
    /// Request timeout, in seconds.
    pub timeout_seconds: u64,
    /// Whether full page content should be requested alongside metadata.
    pub include_content: bool,
    /// Maximum number of characters of content to retain per result.
    pub content_limit: usize,
}
33
/// A single search hit returned by a web search, enriched with derived metadata.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchResult {
    /// Unique identifier (falls back to the URL when the API supplies none).
    pub id: String,
    /// Page title.
    pub title: String,
    /// Canonical URL of the result.
    pub url: String,
    /// Short description or snippet, when available.
    pub description: Option<String>,
    /// Full page text, when content retrieval was requested.
    pub content: Option<String>,
    /// Generated summary, if one has been produced.
    pub summary: Option<String>,
    /// Publication timestamp parsed from the API response, when present.
    pub published_date: Option<DateTime<Utc>>,
    /// Information about the hosting domain.
    pub domain: DomainInfo,
    /// Page-level metadata (author, tags, social/SEO fields).
    pub metadata: PageMetadata,
    /// Relevance score reported by the search backend (0.0–1.0 range expected).
    pub relevance_score: f64,
    /// Classification of the content (format, quality, length).
    pub content_type: ContentType,
    /// Detected language code, when reported.
    pub language: Option<String>,
    /// Estimated reading time derived from word count (~200 wpm).
    pub reading_time_minutes: Option<u32>,
}
64
/// Information about the domain hosting a search result.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct DomainInfo {
    /// Host name (e.g. "example.com").
    pub name: String,
    /// Reputation score, when known.
    pub reputation_score: Option<u32>,
    /// Domain category, when known.
    pub category: Option<String>,
    /// Whether the domain is considered trusted.
    pub is_trusted: bool,
    /// Authority score, when known.
    pub authority_score: Option<u32>,
}
79
/// Page-level metadata attached to a search result.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct PageMetadata {
    /// Page author, when reported.
    pub author: Option<String>,
    /// Tags associated with the page (currently seeded from the query).
    pub tags: Vec<String>,
    /// Open Graph / Twitter card metadata.
    pub social_meta: SocialMetadata,
    /// SEO-related metadata.
    pub seo_meta: SeoMetadata,
    /// Canonical URL, when declared by the page.
    pub canonical_url: Option<String>,
    /// Last-modified timestamp, when known.
    pub last_modified: Option<DateTime<Utc>>,
}
96
/// Open Graph and Twitter card metadata extracted from a page.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SocialMetadata {
    /// `og:title` value.
    pub og_title: Option<String>,
    /// `og:description` value.
    pub og_description: Option<String>,
    /// `og:image` value.
    pub og_image: Option<String>,
    /// `twitter:card` value.
    pub twitter_card: Option<String>,
    /// `twitter:site` value.
    pub twitter_site: Option<String>,
}
111
/// SEO-related metadata extracted from a page.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SeoMetadata {
    /// `<meta name="description">` content.
    pub meta_description: Option<String>,
    /// Keywords from `<meta name="keywords">`.
    pub meta_keywords: Vec<String>,
    /// `<meta name="robots">` directive.
    pub robots: Option<String>,
    /// schema.org types detected on the page.
    pub schema_types: Vec<String>,
}
124
/// Classification of a result's content.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentType {
    /// Primary content kind (e.g. "Article").
    pub primary: String,
    /// Delivery format ("HTML" or "PDF", by URL extension).
    pub format: String,
    /// Whether the content sits behind a paywall, when determinable.
    pub is_paywalled: Option<bool>,
    /// Quality score 0–100, derived from the relevance score.
    pub quality_score: Option<u32>,
    /// Length bucket derived from word count ("Short"/"Medium"/"Long"/"Very Long").
    pub length_category: String,
}
139
/// Complete response for a web search request.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct WebSearchResult {
    /// The query that was executed.
    pub query: String,
    /// Search mode used ("semantic" or "news").
    pub search_type: String,
    /// Parsed search hits.
    pub results: Vec<SearchResult>,
    /// Execution metadata (counts, timing, related queries).
    pub metadata: WebSearchMetadata,
    /// Aggregate insights computed over the results.
    pub insights: SearchInsights,
    /// Timestamp of when the search completed.
    pub searched_at: DateTime<Utc>,
}
156
/// Execution metadata accompanying a search response.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct WebSearchMetadata {
    /// Total results reported for the query.
    pub total_results: u32,
    /// Number of results actually returned.
    pub returned_results: u32,
    /// Wall-clock execution time in milliseconds.
    pub execution_time_ms: u32,
    /// Whether any domain/date filtering was applied.
    pub filtered: bool,
    /// Suggested related queries.
    pub related_queries: Vec<String>,
    /// Most frequent domains among the results (up to 10).
    pub top_domains: Vec<String>,
}
173
/// Aggregate insights computed across a result set.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchInsights {
    /// Frequently occurring topics/tags across results.
    pub common_topics: Vec<String>,
    /// Result counts bucketed by recency ("today", "this_week", …).
    pub date_distribution: HashMap<String, u32>,
    /// Result counts per primary content type.
    pub content_types: HashMap<String, u32>,
    /// Mean quality score across results that have one.
    pub avg_quality_score: Option<f64>,
    /// Result counts per detected language.
    pub languages: HashMap<String, u32>,
    /// Sentiment analysis, when performed.
    pub sentiment: Option<SearchSentiment>,
}
190
/// Sentiment analysis summary over a result set.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchSentiment {
    /// Overall sentiment score.
    pub overall_sentiment: f64,
    /// Breakdown into positive/neutral/negative percentages.
    pub distribution: SentimentDistribution,
    /// URL or title of the most positive result, when identified.
    pub most_positive: Option<String>,
    /// URL or title of the most negative result, when identified.
    pub most_negative: Option<String>,
}
203
/// Percentage breakdown of sentiment across results.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SentimentDistribution {
    /// Percentage of positive results.
    pub positive_pct: f64,
    /// Percentage of neutral results.
    pub neutral_pct: f64,
    /// Percentage of negative results.
    pub negative_pct: f64,
}
214
/// Generated summary of a single web page.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentSummary {
    /// URL of the summarized page.
    pub url: String,
    /// Page title (falls back to the URL when none is found).
    pub title: String,
    /// Extractive summary built from top-ranked sentences.
    pub executive_summary: String,
    /// Key sentences/headings (up to five).
    pub key_points: Vec<String>,
    /// Heuristically detected named entities (up to eight).
    pub entities: Vec<ContentEntity>,
    /// Topic words (caller-provided focus topics or frequency-derived).
    pub topics: Vec<String>,
    /// Confidence in the summary, 0.0–0.97, scaled by content length.
    pub confidence: f64,
    /// Timestamp of summary generation.
    pub generated_at: DateTime<Utc>,
}
235
/// A named entity detected in page content.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ContentEntity {
    /// Entity text as it appeared in the content.
    pub name: String,
    /// Entity kind (currently always "ProperNoun" from the regex heuristic).
    pub entity_type: String,
    /// Detection confidence, 0.0–1.0.
    pub confidence: f64,
    /// Surrounding context, when captured.
    pub context: String,
}
248
/// Response for a find-similar-pages request.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SimilarPagesResult {
    /// The URL similarity was computed against.
    pub source_url: String,
    /// Pages found to be similar to the source.
    pub similar_pages: Vec<SearchResult>,
    /// Aggregate similarity statistics.
    pub similarity_metadata: SimilarityMetadata,
    /// Timestamp of when the search completed.
    pub searched_at: DateTime<Utc>,
}
261
/// Aggregate statistics for a set of similar pages.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SimilarityMetadata {
    /// Mean relevance score across the similar pages.
    pub avg_similarity: f64,
    /// Method used to compute similarity (e.g. "semantic_embeddings").
    pub method: String,
    /// Distinct tags shared across the result set.
    pub common_themes: Vec<String>,
    /// Estimated content overlap fraction.
    pub content_overlap: f64,
}
274
impl Default for WebSearchConfig {
    /// Returns a configuration targeting the public Exa API with sensible
    /// defaults. The API key is left empty and must be supplied by the
    /// caller before issuing requests.
    fn default() -> Self {
        Self {
            exa_api_key: String::default(),
            exa_base_url: "https://api.exa.ai".to_string(),
            max_results: 20,
            timeout_seconds: 30,
            include_content: true,
            content_limit: 5000,
        }
    }
}
287
288impl WebSearchConfig {
289 #[allow(dead_code)]
291 fn from_context(context: &riglr_core::provider::ApplicationContext) -> Self {
292 Self {
293 exa_api_key: context
294 .config
295 .providers
296 .exa_api_key
297 .clone()
298 .unwrap_or_default(),
299 exa_base_url: "https://api.exa.ai".to_string(),
300 max_results: 20,
301 timeout_seconds: 30,
302 include_content: true,
303 content_limit: 5000,
304 }
305 }
306}
307
308pub async fn search_web_with_context(
310 query: String,
311 max_results: Option<u32>,
312 include_content: Option<bool>,
313 domain_filter: Option<Vec<String>>,
314 date_filter: Option<String>, content_type_filter: Option<String>, app_context: &riglr_core::provider::ApplicationContext,
317) -> crate::error::Result<WebSearchResult> {
318 debug!(
319 "Performing web search for query: '{}' with {} max results",
320 query,
321 max_results.unwrap_or(20)
322 );
323
324 let exa_api_key = app_context
326 .config
327 .providers
328 .exa_api_key
329 .clone()
330 .ok_or_else(|| {
331 WebToolError::Config(
332 "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
333 )
334 })?;
335
336 let config = WebSearchConfig::default();
337 let client = WebClient::default().with_exa_key(exa_api_key.clone());
338
339 let mut params = HashMap::default();
341 params.insert("query".to_string(), query.clone());
342 params.insert(
343 "num_results".to_string(),
344 max_results.unwrap_or(20).to_string(),
345 );
346 params.insert(
347 "include_content".to_string(),
348 include_content.unwrap_or(true).to_string(),
349 );
350 params.insert("search_type".to_string(), "semantic".to_string());
351
352 if let Some(ref domains) = domain_filter {
353 params.insert("include_domains".to_string(), domains.join(","));
354 }
355
356 if let Some(ref date) = date_filter {
357 params.insert("start_published_date".to_string(), format_date_filter(date));
358 }
359
360 if let Some(content_type) = content_type_filter {
361 params.insert("category".to_string(), content_type);
362 }
363
364 let url = format!("{}/search", config.exa_base_url);
366 let mut headers = HashMap::default();
367 headers.insert("x-api-key".to_string(), exa_api_key.clone());
368 headers.insert("accept".to_string(), "application/json".to_string());
369 let response = client
370 .get_with_params_and_headers(&url, ¶ms, headers)
371 .await
372 .map_err(|e| {
373 if e.to_string().contains("timeout") || e.to_string().contains("connection") {
374 WebToolError::Network(format!("Web search request failed: {}", e))
375 } else {
376 WebToolError::Config(format!("Web search request failed: {}", e))
377 }
378 })?;
379
380 let results = parse_exa_search_response(&response, &query)
382 .await
383 .map_err(|e| WebToolError::Config(format!("Failed to parse search response: {}", e)))?;
384
385 let insights = analyze_search_results(&results)
387 .await
388 .map_err(|e| WebToolError::Config(format!("Failed to analyze results: {}", e)))?;
389
390 let search_result = WebSearchResult {
391 query: query.clone(),
392 search_type: "semantic".to_string(),
393 results: results.clone(),
394 metadata: WebSearchMetadata {
395 total_results: results.len() as u32,
396 returned_results: results.len() as u32,
397 execution_time_ms: 1500, filtered: domain_filter.is_some() || date_filter.is_some(),
399 related_queries: generate_related_queries(&query).await.map_err(|e| {
400 WebToolError::Config(format!("Failed to generate related queries: {}", e))
401 })?,
402 top_domains: extract_top_domains(&results),
403 },
404 insights,
405 searched_at: Utc::now(),
406 };
407
408 info!(
409 "Web search completed: {} results for '{}'",
410 results.len(),
411 query
412 );
413
414 Ok(search_result)
415}
416
/// Tool entry point: semantic web search via the Exa API. See
/// [`search_web_with_context`] for parameter semantics and error behavior.
#[tool]
pub async fn search_web(
    context: &riglr_core::provider::ApplicationContext,
    query: String,
    max_results: Option<u32>,
    include_content: Option<bool>,
    domain_filter: Option<Vec<String>>,
    date_filter: Option<String>,
    content_type_filter: Option<String>,
) -> crate::error::Result<WebSearchResult> {
    // Thin wrapper: all of the work happens in the context-taking function.
    search_web_with_context(
        query,
        max_results,
        include_content,
        domain_filter,
        date_filter,
        content_type_filter,
        context,
    )
    .await
}
442
/// Tool: finds pages semantically similar to `source_url` using Exa's
/// `find_similar` endpoint.
///
/// `max_results` defaults to 10 and `include_content` to true;
/// `similarity_threshold`, when given, is forwarded verbatim to the API.
///
/// # Errors
/// Returns `WebToolError::Config` when the Exa API key is missing or the
/// response cannot be parsed, and `WebToolError::Network` on transport
/// failures.
#[tool]
pub async fn find_similar_pages(
    context: &riglr_core::provider::ApplicationContext,
    source_url: String,
    max_results: Option<u32>,
    include_content: Option<bool>,
    similarity_threshold: Option<f64>,
) -> crate::error::Result<SimilarPagesResult> {
    debug!("Finding pages similar to: {}", source_url);

    // The key must come from application config; no env-var fallback here.
    let exa_api_key = context
        .config
        .providers
        .exa_api_key
        .clone()
        .ok_or_else(|| {
            WebToolError::Config(
                "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
            )
        })?;

    let client = WebClient::default().with_exa_key(exa_api_key.clone());

    let mut params = HashMap::default();
    params.insert("url".to_string(), source_url.clone());
    params.insert(
        "num_results".to_string(),
        max_results.unwrap_or(10).to_string(),
    );
    params.insert(
        "include_content".to_string(),
        include_content.unwrap_or(true).to_string(),
    );

    if let Some(threshold) = similarity_threshold {
        params.insert("similarity_threshold".to_string(), threshold.to_string());
    }

    let config = WebSearchConfig::default();
    let url = format!("{}/find_similar", config.exa_base_url);
    let mut headers = HashMap::default();
    headers.insert("x-api-key".to_string(), exa_api_key.clone());
    headers.insert("accept".to_string(), "application/json".to_string());
    // Heuristic error classification: timeout/connection failures are
    // reported as Network errors, everything else as Config.
    let response = client
        .get_with_params_and_headers(&url, &params, headers)
        .await
        .map_err(|e| {
            if e.to_string().contains("timeout") || e.to_string().contains("connection") {
                WebToolError::Network(format!("Web search request failed: {}", e))
            } else {
                WebToolError::Config(format!("Web search request failed: {}", e))
            }
        })?;

    let similar_pages = parse_similar_pages_response(&response)
        .await
        .map_err(|e| WebToolError::Config(format!("Failed to parse similar pages: {}", e)))?;

    let similarity_metadata = analyze_similarity(&similar_pages)
        .await
        .map_err(|e| WebToolError::Config(format!("Failed to analyze similarity: {}", e)))?;

    let result = SimilarPagesResult {
        source_url: source_url.clone(),
        similar_pages: similar_pages.clone(),
        similarity_metadata,
        searched_at: Utc::now(),
    };

    info!(
        "Found {} similar pages to {}",
        similar_pages.len(),
        source_url
    );

    Ok(result)
}
529
530#[tool]
535pub async fn summarize_web_content(
536 context: &riglr_core::provider::ApplicationContext,
537 urls: Vec<String>,
538 summary_length: Option<String>, focus_topics: Option<Vec<String>>,
540 _include_quotes: Option<bool>,
541) -> crate::error::Result<Vec<ContentSummary>> {
542 debug!("Summarizing content from {} URLs", urls.len());
543
544 let exa_api_key = context
546 .get_extension::<String>()
547 .and_then(|s| {
548 if s.starts_with("exa_") {
549 Some(s.as_ref().clone())
550 } else {
551 None
552 }
553 })
554 .unwrap_or_else(|| std::env::var(EXA_API_KEY).unwrap_or_else(|_| String::default()));
555
556 let client = WebClient::default().with_exa_key(exa_api_key);
557
558 let mut summaries = Vec::new();
559
560 for url in urls {
562 match extract_and_summarize_page(&client, &url, &summary_length, &focus_topics).await {
563 Ok(summary) => {
564 summaries.push(summary);
565 }
566 Err(e) => {
567 warn!("Failed to summarize {}: {}", url, e);
568 }
570 }
571 }
572
573 info!(
574 "Successfully summarized {} out of {} pages",
575 summaries.len(),
576 summaries.len()
577 );
578
579 Ok(summaries)
580}
581
582#[tool]
587pub async fn search_recent_news(
588 context: &riglr_core::provider::ApplicationContext,
589 topic: String,
590 time_window: Option<String>, source_types: Option<Vec<String>>, max_results: Option<u32>,
593 include_analysis: Option<bool>,
594) -> crate::error::Result<WebSearchResult> {
595 debug!(
596 "Searching recent news for topic: '{}' within {}",
597 topic,
598 time_window.as_deref().unwrap_or("week")
599 );
600
601 let exa_api_key = context
603 .config
604 .providers
605 .exa_api_key
606 .clone()
607 .ok_or_else(|| {
608 WebToolError::Config(
609 "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
610 )
611 })?;
612
613 let client = WebClient::default().with_exa_key(exa_api_key.clone());
614
615 let mut params = HashMap::default();
617 params.insert("query".to_string(), topic.clone());
618 params.insert("search_type".to_string(), "news".to_string());
619 params.insert(
620 "num_results".to_string(),
621 max_results.unwrap_or(30).to_string(),
622 );
623 params.insert("include_content".to_string(), "true".to_string());
624
625 let time_window = time_window.unwrap_or_else(|| "week".to_string());
627 params.insert(
628 "start_published_date".to_string(),
629 format_date_filter(&time_window),
630 );
631
632 if let Some(sources) = source_types {
634 if sources.contains(&"news".to_string()) {
635 params.insert("category".to_string(), "news".to_string());
636 }
637 }
638
639 let config = WebSearchConfig::default();
640 let url = format!("{}/search", config.exa_base_url);
641 let mut headers = HashMap::default();
642 headers.insert("x-api-key".to_string(), exa_api_key.clone());
643 headers.insert("accept".to_string(), "application/json".to_string());
644 let response = client
645 .get_with_params_and_headers(&url, ¶ms, headers)
646 .await
647 .map_err(|e| {
648 if e.to_string().contains("timeout") || e.to_string().contains("connection") {
649 WebToolError::Network(format!("Web search request failed: {}", e))
650 } else {
651 WebToolError::Config(format!("Web search request failed: {}", e))
652 }
653 })?;
654
655 let mut results = parse_exa_search_response(&response, &topic)
657 .await
658 .map_err(|e| WebToolError::Config(format!("Failed to parse news response: {}", e)))?;
659
660 results.sort_by(|a, b| {
662 b.published_date
663 .unwrap_or_else(Utc::now)
664 .cmp(&a.published_date.unwrap_or_else(Utc::now))
665 });
666
667 let insights = if include_analysis.unwrap_or(true) {
668 analyze_news_results(&results)
669 .await
670 .map_err(|e| WebToolError::Config(format!("Failed to analyze news: {}", e)))?
671 } else {
672 SearchInsights {
673 common_topics: vec![],
674 date_distribution: HashMap::default(),
675 content_types: HashMap::default(),
676 avg_quality_score: None,
677 languages: HashMap::default(),
678 sentiment: None,
679 }
680 };
681
682 let search_result = WebSearchResult {
683 query: topic.clone(),
684 search_type: "news".to_string(),
685 results: results.clone(),
686 metadata: WebSearchMetadata {
687 total_results: results.len() as u32,
688 returned_results: results.len() as u32,
689 execution_time_ms: 1200,
690 filtered: true,
691 related_queries: generate_related_queries(&topic).await.map_err(|e| {
692 WebToolError::Config(format!("Failed to generate related queries: {}", e))
693 })?,
694 top_domains: extract_top_domains(&results),
695 },
696 insights,
697 searched_at: Utc::now(),
698 };
699
700 info!(
701 "Recent news search completed: {} results for '{}'",
702 search_result.results.len(),
703 topic
704 );
705
706 Ok(search_result)
707}
708
/// Parses an Exa search (or find_similar) JSON response body into
/// [`SearchResult`]s.
///
/// Missing fields are tolerated: hits without a URL are skipped, the id
/// falls back to the URL, and several derived fields (quality score,
/// reading time, length category) are computed heuristically from the
/// content. `query` seeds the tags of each result.
///
/// # Errors
/// Returns `WebToolError::Parsing` when the body is not valid JSON.
async fn parse_exa_search_response(
    response: &str,
    query: &str,
) -> crate::error::Result<Vec<SearchResult>> {
    let json: serde_json::Value = serde_json::from_str(response)
        .map_err(|e| WebToolError::Parsing(format!("Invalid Exa JSON: {}", e)))?;

    let mut out = Vec::new();
    // Exa wraps hits in a top-level "results" array; treat its absence as
    // an empty result set rather than an error.
    let results = json
        .get("results")
        .and_then(|v| v.as_array())
        .cloned()
        .unwrap_or_default();
    for r in results {
        let title = r
            .get("title")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let url = r
            .get("url")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        // A result without a URL is unusable downstream — skip it.
        if url.is_empty() {
            continue;
        }
        let id = r
            .get("id")
            .and_then(|v| v.as_str())
            .unwrap_or(url.as_str())
            .to_string();
        // "description" preferred, "snippet" as API fallback.
        let description = r
            .get("description")
            .or_else(|| r.get("snippet"))
            .and_then(|v| v.as_str())
            .map(|s| s.to_string());
        let content = r
            .get("text")
            .and_then(|v| v.as_str())
            .map(|s| s.to_string());
        // Accept both camelCase and snake_case date keys; dates must be
        // RFC 3339 or they are dropped.
        let published_date = r
            .get("publishedDate")
            .or_else(|| r.get("published_date"))
            .and_then(|v| v.as_str())
            .and_then(|s| DateTime::parse_from_rfc3339(s).ok())
            .map(|dt| dt.with_timezone(&Utc));
        let domain_name = url::Url::parse(&url)
            .ok()
            .and_then(|u| u.host_str().map(|h| h.to_string()))
            .unwrap_or_default();
        // Default relevance when the API omits a score.
        let score = r.get("score").and_then(|v| v.as_f64()).unwrap_or(0.8);
        let language = r
            .get("language")
            .and_then(|v| v.as_str())
            .map(|s| s.to_string());
        let author = r
            .get("author")
            .and_then(|v| v.as_str())
            .map(|s| s.to_string());

        // Reading time: ~200 words per minute, rounded up; None when no
        // content was returned.
        let words = content
            .as_ref()
            .map(|c| c.split_whitespace().count() as u32)
            .unwrap_or(0);
        let reading_time = if words > 0 {
            Some((words as f64 / 200.0).ceil() as u32)
        } else {
            None
        };
        let length_category = match words {
            0..=200 => "Short",
            201..=800 => "Medium",
            801..=2000 => "Long",
            _ => "Very Long",
        }
        .to_string();

        let content_type = ContentType {
            primary: "Article".to_string(),
            format: if url.to_lowercase().ends_with(".pdf") {
                "PDF".to_string()
            } else {
                "HTML".to_string()
            },
            is_paywalled: None,
            // Scale the 0.0–1.0 relevance score to a 0–100 quality score.
            quality_score: Some(((score * 100.0) as u32).min(100)),
            length_category,
        };

        let metadata = PageMetadata {
            author,
            // Seed tags with the query so downstream topic aggregation has
            // at least one term per result.
            tags: vec![query.to_lowercase()],
            social_meta: SocialMetadata {
                og_title: None,
                og_description: None,
                og_image: None,
                twitter_card: None,
                twitter_site: None,
            },
            seo_meta: SeoMetadata {
                meta_description: description.clone(),
                meta_keywords: vec![],
                robots: None,
                schema_types: vec![],
            },
            canonical_url: None,
            last_modified: None,
        };

        let domain = DomainInfo {
            name: domain_name,
            reputation_score: None,
            category: None,
            // NOTE(review): all domains are currently marked trusted; no
            // reputation lookup is performed yet.
            is_trusted: true,
            authority_score: None,
        };

        out.push(SearchResult {
            id,
            title,
            url,
            description,
            content,
            summary: None,
            published_date,
            domain,
            metadata,
            relevance_score: score,
            content_type,
            language,
            reading_time_minutes: reading_time,
        });
    }
    Ok(out)
}
846
/// Parses a find_similar response; the payload shape matches the search
/// endpoint, so this delegates with an empty query (results get no tag).
async fn parse_similar_pages_response(response: &str) -> crate::error::Result<Vec<SearchResult>> {
    parse_exa_search_response(response, "").await
}
852
/// Fetches `url`, extracts its main content, and builds an extractive
/// [`ContentSummary`].
///
/// `summary_length` controls how many top sentences are selected
/// ("comprehensive" = 8, "detailed" = 5, otherwise 3); `focus_topics`
/// boosts sentences mentioning those topics.
///
/// # Errors
/// Returns `WebToolError::Network` when the page cannot be fetched.
async fn extract_and_summarize_page(
    client: &WebClient,
    url: &str,
    summary_length: &Option<String>,
    focus_topics: &Option<Vec<String>>,
) -> crate::error::Result<ContentSummary> {
    let html = client
        .get(url)
        .await
        .map_err(|e| WebToolError::Network(format!("Failed to fetch {}: {}", url, e)))?;
    let (title, clean_text, sentences, headings) = extract_main_content(&html, url);

    // Number of sentences to keep in the executive summary.
    let n = match summary_length.as_deref() {
        Some("comprehensive") => 8,
        Some("detailed") => 5,
        _ => 3,
    } as usize;

    // Lower-cased focus topics for case-insensitive matching during ranking.
    let topic_set: std::collections::HashSet<String> = focus_topics
        .clone()
        .unwrap_or_default()
        .into_iter()
        .map(|t| t.to_lowercase())
        .collect();

    // Rank all sentences, then greedily pick up to `n` that are mutually
    // dissimilar (Jaccard < 0.6) to avoid near-duplicate summary sentences.
    let ranked = rank_sentences(&sentences, &clean_text, &topic_set, &headings);
    let selected = select_diverse(&ranked, n, 0.6);
    let executive_summary = selected.join(" ");

    // Key points reuse the selected sentences; fall back to headings when
    // the page produced no usable sentences.
    let mut key_points = selected.iter().take(5).cloned().collect::<Vec<_>>();
    if key_points.is_empty() && !headings.is_empty() {
        key_points = headings.iter().take(5).cloned().collect();
    }

    let topics = if !topic_set.is_empty() {
        topic_set.iter().cloned().collect()
    } else {
        extract_topics_from_text(&clean_text)
    };

    // Crude entity detection: runs of 1-4 capitalized words. The pattern is
    // a literal, so the unwrap cannot fail.
    let entity_re = regex::Regex::new(r"(?m)(?:^|\s)([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})").unwrap();
    let mut entities: Vec<ContentEntity> = entity_re
        .captures_iter(&clean_text)
        .map(|cap| ContentEntity {
            name: cap[1].trim().to_string(),
            entity_type: "ProperNoun".to_string(),
            confidence: 0.55,
            context: "".to_string(),
        })
        .collect();
    // dedup only removes consecutive duplicates; repeated mentions far apart
    // in the text may survive.
    entities.dedup_by(|a, b| a.name.eq_ignore_ascii_case(&b.name));
    entities.truncate(8);

    // Confidence scales with content length (capped at 8000 chars), with a
    // small bonus for structured pages, clamped to 0.97.
    let mut confidence = (clean_text.len().min(8000) as f64 / 8000.0) * 0.6 + 0.3;
    if !headings.is_empty() {
        confidence += 0.05;
    }
    confidence = confidence.min(0.97);

    Ok(ContentSummary {
        url: url.to_string(),
        title,
        executive_summary,
        key_points,
        entities,
        topics,
        confidence,
        generated_at: Utc::now(),
    })
}
931
/// Extracts the title, main body text, candidate sentences, and headings
/// from an HTML document.
///
/// Returns `(title, text, sentences, headings)`. The title is taken from
/// `og:title`, then `<title>`, then `fallback_url`. The main content is the
/// largest text block among common article containers, falling back to the
/// whole `<body>`. Sentences shorter than five words are discarded.
fn extract_main_content(
    html: &str,
    fallback_url: &str,
) -> (String, String, Vec<String>, Vec<String>) {
    let document = Html::parse_document(html);

    // Selector patterns are literals, so the unwraps cannot fail.
    let sel_meta_title = Selector::parse("meta[property=\"og:title\"]").unwrap();
    let title = document
        .select(&sel_meta_title)
        .filter_map(|el| el.value().attr("content"))
        .map(|s| s.trim().to_string())
        .find(|s| !s.is_empty())
        .or_else(|| {
            let sel_title = Selector::parse("title").unwrap();
            document
                .select(&sel_title)
                .next()
                .map(|e| e.text().collect::<String>().trim().to_string())
        })
        .unwrap_or_else(|| fallback_url.to_string());

    // Common containers for the primary article body, roughly ordered from
    // most to least semantic.
    let candidates = vec![
        "article",
        "main",
        "div#content",
        "div#main",
        "div.post-content",
        "div.article-content",
        "section.article",
        "div.entry-content",
        "div#main-content",
    ];

    // Keep whichever candidate yields the longest text — a simple proxy for
    // "this is the real article body".
    let mut best_text = String::default();
    let mut best_headings: Vec<String> = Vec::new();
    for css in candidates {
        if let Ok(sel) = Selector::parse(css) {
            for node in document.select(&sel) {
                let (text, headings) = extract_text_from_node(node);
                if text.len() > best_text.len() {
                    best_text = text;
                    best_headings = headings;
                }
            }
        }
    }

    // No recognized container matched: fall back to the entire body.
    if best_text.is_empty() {
        if let Ok(sel) = Selector::parse("body") {
            if let Some(body) = document.select(&sel).next() {
                let (text, headings) = extract_text_from_node(body);
                best_text = text;
                best_headings = headings;
            }
        }
    }

    // Drop fragments too short to be meaningful sentences.
    let sentences: Vec<String> = split_sentences(&best_text)
        .into_iter()
        .filter(|s| s.split_whitespace().count() >= 5)
        .collect();

    (title, best_text, sentences, best_headings)
}
1002
/// Collects paragraph/list-item text and h1-h3 headings beneath `root`.
///
/// Returns `(text, headings)` where `text` joins all qualifying blocks with
/// newlines. Blocks inside boilerplate containers (nav, footer, scripts,
/// etc.) or shorter than 40 characters are skipped.
fn extract_text_from_node(root: ElementRef) -> (String, Vec<String>) {
    // Tag names whose descendants should never contribute body text.
    let sel_exclude = [
        "script", "style", "noscript", "template", "header", "footer", "nav", "aside",
    ];
    // Both selector patterns are literals, so the unwraps cannot fail.
    let sel_p = Selector::parse("p, li").unwrap();
    let sel_h = Selector::parse("h1, h2, h3").unwrap();

    let mut headings: Vec<String> = root
        .select(&sel_h)
        .map(|h| normalize_whitespace(&h.text().collect::<String>()))
        .filter(|s| !s.is_empty())
        .collect();
    // Only removes consecutive duplicates (e.g. repeated nav headings).
    headings.dedup();

    let mut blocks: Vec<String> = Vec::new();
    for p in root.select(&sel_p) {
        if has_excluded_ancestor(p, &sel_exclude) {
            continue;
        }
        let txt = normalize_whitespace(&p.text().collect::<String>());
        // 40-char minimum filters out menu items, buttons, and stub lines.
        if txt.len() >= 40 {
            blocks.push(txt);
        }
    }
    let full = blocks.join("\n");
    (full, headings)
}
1034
/// Returns true when any element ancestor of `node` has a tag name in
/// `excluded` (used to skip text inside nav/footer/script containers).
fn has_excluded_ancestor(mut node: ElementRef, excluded: &[&str]) -> bool {
    // Walk upward one element ancestor at a time; `find_map(ElementRef::wrap)`
    // yields the nearest ancestor that is an element node.
    while let Some(parent) = node.ancestors().find_map(ElementRef::wrap) {
        let name = parent.value().name();
        if excluded.contains(&name) {
            return true;
        }
        node = parent;
        // Stop once the tree root is reached.
        if node.parent().is_none() {
            break;
        }
    }
    false
}
1050
1051fn normalize_whitespace(s: &str) -> String {
1053 let s = html_escape::decode_html_entities(s);
1054 let re = regex::Regex::new(r"\s+").unwrap();
1055 re.replace_all(&s, " ").trim().to_string()
1056}
1057
1058fn split_sentences(text: &str) -> Vec<String> {
1060 let mut v = Vec::new();
1061 let mut current = String::default();
1062 for ch in text.chars() {
1063 current.push(ch);
1064 if matches!(ch, '.' | '!' | '?') {
1065 let s = normalize_whitespace(¤t);
1066 if !s.is_empty() {
1067 v.push(s);
1068 }
1069 current.clear();
1070 }
1071 }
1072 if !current.trim().is_empty() {
1073 v.push(normalize_whitespace(¤t));
1074 }
1075 v
1076}
1077
/// Scores each sentence for extractive summarization and returns them
/// sorted best-first as `(sentence, score)` pairs.
///
/// The score combines normalized term frequency (length-dampened), an
/// early-position bonus, a bonus per matched focus topic, a bonus when the
/// sentence contains a heading, and a small Jaccard-overlap bonus with the
/// combined heading text.
fn rank_sentences(
    sentences: &[String],
    full_text: &str,
    topics: &std::collections::HashSet<String>,
    headings: &[String],
) -> Vec<(String, f64)> {
    // Term frequencies over the whole document, lower-cased, words >= 3 chars.
    let mut tf: HashMap<String, f64> = HashMap::default();
    for w in full_text.split(|c: char| !c.is_alphanumeric()) {
        let w = w.to_lowercase();
        if w.len() < 3 {
            continue;
        }
        *tf.entry(w).or_insert(0.0) += 1.0;
    }
    // Normalize by the max frequency; the 1.0 seed avoids division by zero.
    let max_tf = tf.values().cloned().fold(1.0, f64::max);
    for v in tf.values_mut() {
        *v /= max_tf;
    }

    let heading_text = headings.join(" ").to_lowercase();

    let mut scored: Vec<(String, f64)> = sentences
        .iter()
        .enumerate()
        .map(|(i, s)| {
            let words: Vec<String> = s
                .split(|c: char| !c.is_alphanumeric())
                .map(|w| w.to_lowercase())
                .filter(|w| w.len() >= 3)
                .collect();
            // Base score: sum of normalized term frequencies.
            let mut score = 0.0;
            for w in &words {
                score += *tf.get(w).unwrap_or(&0.0);
            }
            // Dampen by length (mild exponent) so long sentences don't
            // dominate purely by word count.
            let len = s.split_whitespace().count() as f64;
            if len > 0.0 {
                score /= len.powf(0.3);
            }
            // Earlier sentences get a decaying positional bonus.
            score += 0.15 * (1.0 / ((i + 1) as f64).sqrt());
            // +0.25 per focus topic mentioned.
            if !topics.is_empty() {
                let lower = s.to_lowercase();
                for t in topics {
                    if lower.contains(t) {
                        score += 0.25;
                    }
                }
            }
            // One-time bonus when the sentence contains any heading verbatim.
            for h in headings {
                if s.to_lowercase().contains(&h.to_lowercase()) {
                    score += 0.2;
                    break;
                }
            }
            // Small bonus for token overlap with the combined heading text.
            if !heading_text.is_empty() {
                let overlap = jaccard(&s.to_lowercase(), &heading_text);
                score += 0.1 * overlap;
            }
            (s.clone(), score)
        })
        .collect();
    // Descending by score; NaN-safe via the Equal fallback.
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    scored
}
1148
/// Jaccard similarity of the whitespace-token sets of `a` and `b`
/// (|A ∩ B| / |A ∪ B|); 0.0 when both strings contain no tokens.
fn jaccard(a: &str, b: &str) -> f64 {
    use std::collections::HashSet;
    let tokens_a: HashSet<&str> = a.split_whitespace().collect();
    let tokens_b: HashSet<&str> = b.split_whitespace().collect();
    let union = tokens_a.union(&tokens_b).count();
    if union == 0 {
        return 0.0;
    }
    let shared = tokens_a.intersection(&tokens_b).count();
    shared as f64 / union as f64
}
1161
1162fn select_diverse(scored: &[(String, f64)], k: usize, max_sim: f64) -> Vec<String> {
1164 let mut out: Vec<String> = Vec::new();
1165 for (s, _) in scored {
1166 if out.len() >= k {
1167 break;
1168 }
1169 if out
1170 .iter()
1171 .all(|t| jaccard(&s.to_lowercase(), &t.to_lowercase()) < max_sim)
1172 {
1173 out.push(s.clone());
1174 }
1175 }
1176 out
1177}
1178
/// Computes aggregate insights (content-type, language and recency
/// distributions, mean quality score, pooled topics) over a result set.
/// Sentiment analysis is not performed here and is always `None`.
async fn analyze_search_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
    let mut content_types = HashMap::default();
    let mut languages = HashMap::default();
    let mut date_distribution = HashMap::default();
    let mut topics = Vec::new();

    for result in results {
        *content_types
            .entry(result.content_type.primary.clone())
            .or_insert(0) += 1;

        if let Some(lang) = &result.language {
            *languages.entry(lang.clone()).or_insert(0) += 1;
        }

        // Bucket publication dates by how long ago they were; undated
        // results are excluded from the distribution.
        if let Some(pub_date) = result.published_date {
            let days_ago = (Utc::now() - pub_date).num_days();
            let category = match days_ago {
                0..=1 => "today",
                2..=7 => "this_week",
                8..=30 => "this_month",
                _ => "older",
            };
            *date_distribution.entry(category.to_string()).or_insert(0) += 1;
        }

        // Pool all tags; duplicates are kept intentionally (frequency matters).
        topics.extend(result.metadata.tags.clone());
    }

    // Mean quality over results that report a score; None when none do.
    let quality_scores: Vec<u32> = results
        .iter()
        .filter_map(|r| r.content_type.quality_score)
        .collect();
    let avg_quality_score = if !quality_scores.is_empty() {
        Some(quality_scores.iter().sum::<u32>() as f64 / quality_scores.len() as f64)
    } else {
        None
    };

    Ok(SearchInsights {
        common_topics: topics,
        date_distribution,
        content_types,
        avg_quality_score,
        languages,
        sentiment: None,
    })
}
1233
/// News-specific analysis; currently identical to the generic result
/// analysis and kept as a separate entry point for future specialization.
async fn analyze_news_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
    analyze_search_results(results).await
}
1239
1240async fn analyze_similarity(results: &[SearchResult]) -> crate::error::Result<SimilarityMetadata> {
1242 let avg_similarity =
1243 results.iter().map(|r| r.relevance_score).sum::<f64>() / results.len() as f64;
1244
1245 let common_themes = results
1246 .iter()
1247 .flat_map(|r| r.metadata.tags.clone())
1248 .collect::<std::collections::HashSet<_>>()
1249 .into_iter()
1250 .collect();
1251
1252 Ok(SimilarityMetadata {
1253 avg_similarity,
1254 method: "semantic_embeddings".to_string(),
1255 common_themes,
1256 content_overlap: 0.75, })
1258}
1259
1260async fn generate_related_queries(query: &str) -> crate::error::Result<Vec<String>> {
1262 let mut variants = vec![
1264 format!("{} news", query),
1265 format!("{} latest", query),
1266 format!("{} guide", query),
1267 format!("{} tutorial", query),
1268 format!("{} best practices", query),
1269 format!("{} examples", query),
1270 format!("how to {}", query),
1271 format!("{} vs alternatives", query),
1272 format!("{} 2025 trends", query),
1273 ];
1274 variants.sort();
1275 variants.dedup();
1276 Ok(variants)
1277}
1278
1279fn extract_top_domains(results: &[SearchResult]) -> Vec<String> {
1281 let mut domain_counts: HashMap<String, u32> = HashMap::default();
1282
1283 for result in results {
1284 *domain_counts.entry(result.domain.name.clone()).or_insert(0) += 1;
1285 }
1286
1287 let mut domains: Vec<(String, u32)> = domain_counts.into_iter().collect();
1288 domains.sort_by(|a, b| b.1.cmp(&a.1));
1289
1290 domains
1291 .into_iter()
1292 .take(10)
1293 .map(|(domain, _)| domain)
1294 .collect()
1295}
1296
1297fn format_date_filter(window: &str) -> String {
1299 let days_ago = match window {
1300 "24h" | "day" => 1,
1301 "week" => 7,
1302 "month" => 30,
1303 "year" => 365,
1304 _ => 7,
1305 };
1306
1307 let date = Utc::now() - chrono::Duration::days(days_ago);
1308 date.format("%Y-%m-%d").to_string()
1309}
1310
/// Extracts up to five candidate topic words from `text`.
///
/// Words are lower-cased, must be at least four characters long, and must
/// not be common stopwords; the most frequent words win. Ties are broken
/// alphabetically so the output is deterministic — previously, equal-count
/// words were ordered by `HashMap` iteration order, which varies per run.
fn extract_topics_from_text(text: &str) -> Vec<String> {
    let stopwords = [
        "the", "and", "for", "with", "that", "this", "from", "have", "your", "you", "are", "was",
        "were", "has", "had", "not", "but", "all", "any", "can", "will", "just", "into", "about",
        "over", "more", "than", "when", "what", "how", "why", "where", "then", "them", "they",
        "their", "its", "it's", "as", "of", "in", "on", "to", "by", "at", "or", "an", "be",
    ];
    let mut counts: HashMap<String, u32> = HashMap::default();
    for w in text.split(|c: char| !c.is_alphanumeric()) {
        let w = w.to_lowercase();
        // The length check already drops most stopwords; the list catches
        // the remaining four-letter-plus ones ("that", "this", "their", …).
        if w.len() < 4 || stopwords.contains(&w.as_str()) {
            continue;
        }
        *counts.entry(w).or_insert(0) += 1;
    }
    let mut ranked: Vec<(String, u32)> = counts.into_iter().collect();
    // Descending by count, then ascending alphabetically on ties.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    ranked.into_iter().take(5).map(|(k, _)| k).collect()
}
1334
#[cfg(test)]
mod tests {
    use super::*;

    // Defaults should point at the public Exa endpoint with 20 results.
    #[test]
    fn test_web_search_config_default() {
        let config = WebSearchConfig::default();
        assert_eq!(config.exa_base_url, "https://api.exa.ai");
        assert_eq!(config.max_results, 20);
    }

    // A fully-populated SearchResult must serialize through serde_json.
    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            id: "1".to_string(),
            title: "Test Page".to_string(),
            url: "https://example.com".to_string(),
            description: Some("Test description".to_string()),
            content: Some("Test content".to_string()),
            summary: None,
            published_date: Some(Utc::now()),
            domain: DomainInfo {
                name: "example.com".to_string(),
                reputation_score: Some(80),
                category: Some("Test".to_string()),
                is_trusted: true,
                authority_score: Some(70),
            },
            metadata: PageMetadata {
                author: None,
                tags: vec!["test".to_string()],
                social_meta: SocialMetadata {
                    og_title: None,
                    og_description: None,
                    og_image: None,
                    twitter_card: None,
                    twitter_site: None,
                },
                seo_meta: SeoMetadata {
                    meta_description: None,
                    meta_keywords: vec![],
                    robots: None,
                    schema_types: vec![],
                },
                canonical_url: None,
                last_modified: None,
            },
            relevance_score: 0.8,
            content_type: ContentType {
                primary: "Article".to_string(),
                format: "HTML".to_string(),
                is_paywalled: Some(false),
                quality_score: Some(75),
                length_category: "Medium".to_string(),
            },
            language: Some("en".to_string()),
            reading_time_minutes: Some(5),
        };

        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("Test Page"));
    }

    // Date filters must render as a 10-character "YYYY-MM-DD" string.
    #[test]
    fn test_format_date_filter() {
        let result = format_date_filter("week");
        assert!(!result.is_empty());
        assert!(result.len() == 10);
    }
}