riglr_web_tools/
web_search.rs

1//! Web search integration using Exa API and web scraping
2//!
3//! This module provides web search capabilities, content extraction using HTML parsing,
4//! and extractive summarization (sentence ranking) for AI agents to gather web-based information.
5
6use crate::{client::WebClient, error::WebToolError};
7use chrono::{DateTime, Utc};
8use riglr_macros::tool;
9use schemars::JsonSchema;
10use scraper::{ElementRef, Html, Selector};
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use tracing::{debug, info, warn};
14
15const EXA_API_KEY: &str = "EXA_API_KEY";
16
17/// Configuration for web search services
18#[derive(Debug, Clone)]
19pub struct WebSearchConfig {
20    /// Exa API key for intelligent search
21    pub exa_api_key: String,
22    /// Exa API base URL (default: https://api.exa.ai)
23    pub exa_base_url: String,
24    /// Maximum results per search (default: 20)
25    pub max_results: u32,
26    /// Default search timeout in seconds (default: 30)
27    pub timeout_seconds: u64,
28    /// Whether to include page content by default
29    pub include_content: bool,
30    /// Content extraction length limit (characters)
31    pub content_limit: usize,
32}
33
34/// Comprehensive search result with content and metadata
35#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
36pub struct SearchResult {
37    /// Unique result identifier
38    pub id: String,
39    /// Page title
40    pub title: String,
41    /// Page URL
42    pub url: String,
43    /// Page description/snippet
44    pub description: Option<String>,
45    /// Extracted text content
46    pub content: Option<String>,
47    /// Content summary (if processed)
48    pub summary: Option<String>,
49    /// Publication date (if available)
50    pub published_date: Option<DateTime<Utc>>,
51    /// Domain information
52    pub domain: DomainInfo,
53    /// Page metadata
54    pub metadata: PageMetadata,
55    /// Search relevance score (0.0 - 1.0)
56    pub relevance_score: f64,
57    /// Content type and format info
58    pub content_type: ContentType,
59    /// Language detection result
60    pub language: Option<String>,
61    /// Estimated reading time (minutes)
62    pub reading_time_minutes: Option<u32>,
63}
64
65/// Domain information for a search result
66#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
67pub struct DomainInfo {
68    /// Domain name (e.g., "techcrunch.com")
69    pub name: String,
70    /// Domain reputation score (0-100)
71    pub reputation_score: Option<u32>,
72    /// Domain category (News, Blog, Academic, etc.)
73    pub category: Option<String>,
74    /// Whether domain is known to be trustworthy
75    pub is_trusted: bool,
76    /// Domain authority score (if available)
77    pub authority_score: Option<u32>,
78}
79
80/// Page metadata extracted from HTML
81#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
82pub struct PageMetadata {
83    /// Author name(s)
84    pub author: Option<String>,
85    /// Article/page tags
86    pub tags: Vec<String>,
87    /// Social media metadata (Open Graph)
88    pub social_meta: SocialMetadata,
89    /// SEO metadata
90    pub seo_meta: SeoMetadata,
91    /// Canonical URL (if different from actual URL)
92    pub canonical_url: Option<String>,
93    /// Last modified date
94    pub last_modified: Option<DateTime<Utc>>,
95}
96
97/// Social media metadata (Open Graph, Twitter Cards)
98#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
99pub struct SocialMetadata {
100    /// Open Graph title
101    pub og_title: Option<String>,
102    /// Open Graph description
103    pub og_description: Option<String>,
104    /// Open Graph image URL
105    pub og_image: Option<String>,
106    /// Twitter card type
107    pub twitter_card: Option<String>,
108    /// Twitter handle
109    pub twitter_site: Option<String>,
110}
111
112/// SEO-related metadata
113#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
114pub struct SeoMetadata {
115    /// Meta description
116    pub meta_description: Option<String>,
117    /// Meta keywords
118    pub meta_keywords: Vec<String>,
119    /// Page robots directive
120    pub robots: Option<String>,
121    /// Schema.org structured data types found
122    pub schema_types: Vec<String>,
123}
124
125/// Content type and format information
126#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
127pub struct ContentType {
128    /// Primary content type (Article, Blog, News, Academic, etc.)
129    pub primary: String,
130    /// Content format (HTML, PDF, etc.)
131    pub format: String,
132    /// Whether content is behind paywall
133    pub is_paywalled: Option<bool>,
134    /// Content quality score (0-100)
135    pub quality_score: Option<u32>,
136    /// Estimated content length category
137    pub length_category: String, // "Short", "Medium", "Long", "Very Long"
138}
139
140/// Complete search operation result
141#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
142pub struct WebSearchResult {
143    /// Search query used
144    pub query: String,
145    /// Search type performed
146    pub search_type: String,
147    /// Found results
148    pub results: Vec<SearchResult>,
149    /// Search metadata
150    pub metadata: WebSearchMetadata,
151    /// Aggregated insights from results
152    pub insights: SearchInsights,
153    /// Search timestamp
154    pub searched_at: DateTime<Utc>,
155}
156
157/// Metadata about the search operation
158#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
159pub struct WebSearchMetadata {
160    /// Total results found
161    pub total_results: u32,
162    /// Results returned in this response
163    pub returned_results: u32,
164    /// Search execution time (ms)
165    pub execution_time_ms: u32,
166    /// Whether results were filtered or limited
167    pub filtered: bool,
168    /// Suggested related queries
169    pub related_queries: Vec<String>,
170    /// Top domains in results
171    pub top_domains: Vec<String>,
172}
173
174/// Aggregated insights from search results
175#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
176pub struct SearchInsights {
177    /// Most common topics/themes found
178    pub common_topics: Vec<String>,
179    /// Publication date distribution
180    pub date_distribution: HashMap<String, u32>, // "last_week", "last_month", etc.
181    /// Content type distribution
182    pub content_types: HashMap<String, u32>,
183    /// Average content quality score
184    pub avg_quality_score: Option<f64>,
185    /// Language distribution
186    pub languages: HashMap<String, u32>,
187    /// Sentiment analysis (if performed)
188    pub sentiment: Option<SearchSentiment>,
189}
190
191/// Sentiment analysis of search results
192#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
193pub struct SearchSentiment {
194    /// Overall sentiment score (-1.0 to 1.0)
195    pub overall_sentiment: f64,
196    /// Sentiment distribution
197    pub distribution: SentimentDistribution,
198    /// Most positive result
199    pub most_positive: Option<String>, // URL
200    /// Most negative result
201    pub most_negative: Option<String>, // URL
202}
203
204/// Distribution of sentiment across results
205#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
206pub struct SentimentDistribution {
207    /// Percentage of positive results
208    pub positive_pct: f64,
209    /// Percentage of neutral results
210    pub neutral_pct: f64,
211    /// Percentage of negative results
212    pub negative_pct: f64,
213}
214
215/// Content summary with key points
216#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
217pub struct ContentSummary {
218    /// URL of the page
219    pub url: String,
220    /// Page title
221    pub title: String,
222    /// Executive summary (2-3 sentences)
223    pub executive_summary: String,
224    /// Key points extracted
225    pub key_points: Vec<String>,
226    /// Important entities mentioned
227    pub entities: Vec<ContentEntity>,
228    /// Main topics covered
229    pub topics: Vec<String>,
230    /// Summary quality confidence (0.0-1.0)
231    pub confidence: f64,
232    /// When the summary was generated
233    pub generated_at: DateTime<Utc>,
234}
235
236/// Entity found in content
237#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
238pub struct ContentEntity {
239    /// Entity name
240    pub name: String,
241    /// Entity type (Person, Organization, Location, etc.)
242    pub entity_type: String,
243    /// Confidence score (0.0-1.0)
244    pub confidence: f64,
245    /// Context in which entity appears
246    pub context: String,
247}
248
249/// Similar page search result
250#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
251pub struct SimilarPagesResult {
252    /// Source URL used for similarity search
253    pub source_url: String,
254    /// Similar pages found
255    pub similar_pages: Vec<SearchResult>,
256    /// Similarity scores and metadata
257    pub similarity_metadata: SimilarityMetadata,
258    /// Search timestamp
259    pub searched_at: DateTime<Utc>,
260}
261
262/// Metadata about similarity analysis
263#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
264pub struct SimilarityMetadata {
265    /// Average similarity score
266    pub avg_similarity: f64,
267    /// Similarity calculation method used
268    pub method: String,
269    /// Common themes between source and similar pages
270    pub common_themes: Vec<String>,
271    /// Content overlap analysis
272    pub content_overlap: f64,
273}
274
275impl Default for WebSearchConfig {
276    fn default() -> Self {
277        Self {
278            exa_api_key: String::default(),
279            exa_base_url: "https://api.exa.ai".to_string(),
280            max_results: 20,
281            timeout_seconds: 30,
282            include_content: true,
283            content_limit: 5000,
284        }
285    }
286}
287
288impl WebSearchConfig {
289    /// Create WebSearchConfig from ApplicationContext
290    #[allow(dead_code)]
291    fn from_context(context: &riglr_core::provider::ApplicationContext) -> Self {
292        Self {
293            exa_api_key: context
294                .config
295                .providers
296                .exa_api_key
297                .clone()
298                .unwrap_or_default(),
299            exa_base_url: "https://api.exa.ai".to_string(),
300            max_results: 20,
301            timeout_seconds: 30,
302            include_content: true,
303            content_limit: 5000,
304        }
305    }
306}
307
308/// Internal function to perform web search with ApplicationContext
309pub async fn search_web_with_context(
310    query: String,
311    max_results: Option<u32>,
312    include_content: Option<bool>,
313    domain_filter: Option<Vec<String>>,
314    date_filter: Option<String>,         // "day", "week", "month", "year"
315    content_type_filter: Option<String>, // "news", "academic", "blog"
316    app_context: &riglr_core::provider::ApplicationContext,
317) -> crate::error::Result<WebSearchResult> {
318    debug!(
319        "Performing web search for query: '{}' with {} max results",
320        query,
321        max_results.unwrap_or(20)
322    );
323
324    // Get EXA_API_KEY from ApplicationContext
325    let exa_api_key = app_context
326        .config
327        .providers
328        .exa_api_key
329        .clone()
330        .ok_or_else(|| {
331            WebToolError::Config(
332                "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
333            )
334        })?;
335
336    let config = WebSearchConfig::default();
337    let client = WebClient::default().with_exa_key(exa_api_key.clone());
338
339    // Build search parameters
340    let mut params = HashMap::default();
341    params.insert("query".to_string(), query.clone());
342    params.insert(
343        "num_results".to_string(),
344        max_results.unwrap_or(20).to_string(),
345    );
346    params.insert(
347        "include_content".to_string(),
348        include_content.unwrap_or(true).to_string(),
349    );
350    params.insert("search_type".to_string(), "semantic".to_string());
351
352    if let Some(ref domains) = domain_filter {
353        params.insert("include_domains".to_string(), domains.join(","));
354    }
355
356    if let Some(ref date) = date_filter {
357        params.insert("start_published_date".to_string(), format_date_filter(date));
358    }
359
360    if let Some(content_type) = content_type_filter {
361        params.insert("category".to_string(), content_type);
362    }
363
364    // Make API request to Exa with API key header
365    let url = format!("{}/search", config.exa_base_url);
366    let mut headers = HashMap::default();
367    headers.insert("x-api-key".to_string(), exa_api_key.clone());
368    headers.insert("accept".to_string(), "application/json".to_string());
369    let response = client
370        .get_with_params_and_headers(&url, &params, headers)
371        .await
372        .map_err(|e| {
373            if e.to_string().contains("timeout") || e.to_string().contains("connection") {
374                WebToolError::Network(format!("Web search request failed: {}", e))
375            } else {
376                WebToolError::Config(format!("Web search request failed: {}", e))
377            }
378        })?;
379
380    // Parse search results
381    let results = parse_exa_search_response(&response, &query)
382        .await
383        .map_err(|e| WebToolError::Config(format!("Failed to parse search response: {}", e)))?;
384
385    // Perform additional analysis
386    let insights = analyze_search_results(&results)
387        .await
388        .map_err(|e| WebToolError::Config(format!("Failed to analyze results: {}", e)))?;
389
390    let search_result = WebSearchResult {
391        query: query.clone(),
392        search_type: "semantic".to_string(),
393        results: results.clone(),
394        metadata: WebSearchMetadata {
395            total_results: results.len() as u32,
396            returned_results: results.len() as u32,
397            execution_time_ms: 1500, // Would measure actual time
398            filtered: domain_filter.is_some() || date_filter.is_some(),
399            related_queries: generate_related_queries(&query).await.map_err(|e| {
400                WebToolError::Config(format!("Failed to generate related queries: {}", e))
401            })?,
402            top_domains: extract_top_domains(&results),
403        },
404        insights,
405        searched_at: Utc::now(),
406    };
407
408    info!(
409        "Web search completed: {} results for '{}'",
410        results.len(),
411        query
412    );
413
414    Ok(search_result)
415}
416
417/// Perform web search with content extraction
418///
419/// This tool performs web search and returns results with extracted content and metadata.
420/// Uses traditional search APIs rather than semantic understanding.
421#[tool]
422pub async fn search_web(
423    context: &riglr_core::provider::ApplicationContext,
424    query: String,
425    max_results: Option<u32>,
426    include_content: Option<bool>,
427    domain_filter: Option<Vec<String>>,
428    date_filter: Option<String>,         // "day", "week", "month", "year"
429    content_type_filter: Option<String>, // "news", "academic", "blog"
430) -> crate::error::Result<WebSearchResult> {
431    search_web_with_context(
432        query,
433        max_results,
434        include_content,
435        domain_filter,
436        date_filter,
437        content_type_filter,
438        context,
439    )
440    .await
441}
442
443/// Search for pages similar to a given URL
444///
445/// This tool finds web pages that are similar in content and topic to a source URL,
446/// useful for finding related information or alternative perspectives.
447#[tool]
448pub async fn find_similar_pages(
449    context: &riglr_core::provider::ApplicationContext,
450    source_url: String,
451    max_results: Option<u32>,
452    include_content: Option<bool>,
453    similarity_threshold: Option<f64>,
454) -> crate::error::Result<SimilarPagesResult> {
455    debug!("Finding pages similar to: {}", source_url);
456
457    // Get EXA_API_KEY from ApplicationContext
458    let exa_api_key = context
459        .config
460        .providers
461        .exa_api_key
462        .clone()
463        .ok_or_else(|| {
464            WebToolError::Config(
465                "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
466            )
467        })?;
468
469    let client = WebClient::default().with_exa_key(exa_api_key.clone());
470
471    // Build similarity search parameters
472    let mut params = HashMap::default();
473    params.insert("url".to_string(), source_url.clone());
474    params.insert(
475        "num_results".to_string(),
476        max_results.unwrap_or(10).to_string(),
477    );
478    params.insert(
479        "include_content".to_string(),
480        include_content.unwrap_or(true).to_string(),
481    );
482
483    if let Some(threshold) = similarity_threshold {
484        params.insert("similarity_threshold".to_string(), threshold.to_string());
485    }
486
487    // Make API request with API key header
488    let config = WebSearchConfig::default();
489    let url = format!("{}/find_similar", config.exa_base_url);
490    let mut headers = HashMap::default();
491    headers.insert("x-api-key".to_string(), exa_api_key.clone());
492    headers.insert("accept".to_string(), "application/json".to_string());
493    let response = client
494        .get_with_params_and_headers(&url, &params, headers)
495        .await
496        .map_err(|e| {
497            if e.to_string().contains("timeout") || e.to_string().contains("connection") {
498                WebToolError::Network(format!("Web search request failed: {}", e))
499            } else {
500                WebToolError::Config(format!("Web search request failed: {}", e))
501            }
502        })?;
503
504    // Parse results
505    let similar_pages = parse_similar_pages_response(&response)
506        .await
507        .map_err(|e| WebToolError::Config(format!("Failed to parse similar pages: {}", e)))?;
508
509    // Analyze similarity patterns
510    let similarity_metadata = analyze_similarity(&similar_pages)
511        .await
512        .map_err(|e| WebToolError::Config(format!("Failed to analyze similarity: {}", e)))?;
513
514    let result = SimilarPagesResult {
515        source_url: source_url.clone(),
516        similar_pages: similar_pages.clone(),
517        similarity_metadata,
518        searched_at: Utc::now(),
519    };
520
521    info!(
522        "Found {} similar pages to {}",
523        similar_pages.len(),
524        source_url
525    );
526
527    Ok(result)
528}
529
530/// Summarize content from multiple web pages
531///
532/// This tool extracts and summarizes key information from multiple web pages,
533/// creating a comprehensive overview of a topic from multiple sources.
534#[tool]
535pub async fn summarize_web_content(
536    context: &riglr_core::provider::ApplicationContext,
537    urls: Vec<String>,
538    summary_length: Option<String>, // "brief", "detailed", "comprehensive"
539    focus_topics: Option<Vec<String>>,
540    _include_quotes: Option<bool>,
541) -> crate::error::Result<Vec<ContentSummary>> {
542    debug!("Summarizing content from {} URLs", urls.len());
543
544    // Try to get EXA_API_KEY from ApplicationContext extensions first, fall back to env var
545    let exa_api_key = context
546        .get_extension::<String>()
547        .and_then(|s| {
548            if s.starts_with("exa_") {
549                Some(s.as_ref().clone())
550            } else {
551                None
552            }
553        })
554        .unwrap_or_else(|| std::env::var(EXA_API_KEY).unwrap_or_else(|_| String::default()));
555
556    let client = WebClient::default().with_exa_key(exa_api_key);
557
558    let mut summaries = Vec::new();
559
560    // Process each URL
561    for url in urls {
562        match extract_and_summarize_page(&client, &url, &summary_length, &focus_topics).await {
563            Ok(summary) => {
564                summaries.push(summary);
565            }
566            Err(e) => {
567                warn!("Failed to summarize {}: {}", url, e);
568                // Continue with other URLs
569            }
570        }
571    }
572
573    info!(
574        "Successfully summarized {} out of {} pages",
575        summaries.len(),
576        summaries.len()
577    );
578
579    Ok(summaries)
580}
581
582/// Search for recent news and articles on a topic
583///
584/// This tool specifically searches for recent news articles and blog posts,
585/// optimized for finding current information and trending discussions.
586#[tool]
587pub async fn search_recent_news(
588    context: &riglr_core::provider::ApplicationContext,
589    topic: String,
590    time_window: Option<String>,       // "24h", "week", "month"
591    source_types: Option<Vec<String>>, // "news", "blog", "social"
592    max_results: Option<u32>,
593    include_analysis: Option<bool>,
594) -> crate::error::Result<WebSearchResult> {
595    debug!(
596        "Searching recent news for topic: '{}' within {}",
597        topic,
598        time_window.as_deref().unwrap_or("week")
599    );
600
601    // Get EXA_API_KEY from ApplicationContext
602    let exa_api_key = context
603        .config
604        .providers
605        .exa_api_key
606        .clone()
607        .ok_or_else(|| {
608            WebToolError::Config(
609                "EXA_API_KEY not configured. Set EXA_API_KEY in your environment.".to_string(),
610            )
611        })?;
612
613    let client = WebClient::default().with_exa_key(exa_api_key.clone());
614
615    // Build news-specific search parameters
616    let mut params = HashMap::default();
617    params.insert("query".to_string(), topic.clone());
618    params.insert("search_type".to_string(), "news".to_string());
619    params.insert(
620        "num_results".to_string(),
621        max_results.unwrap_or(30).to_string(),
622    );
623    params.insert("include_content".to_string(), "true".to_string());
624
625    // Set time window
626    let time_window = time_window.unwrap_or_else(|| "week".to_string());
627    params.insert(
628        "start_published_date".to_string(),
629        format_date_filter(&time_window),
630    );
631
632    // Filter by source types if specified
633    if let Some(sources) = source_types {
634        if sources.contains(&"news".to_string()) {
635            params.insert("category".to_string(), "news".to_string());
636        }
637    }
638
639    let config = WebSearchConfig::default();
640    let url = format!("{}/search", config.exa_base_url);
641    let mut headers = HashMap::default();
642    headers.insert("x-api-key".to_string(), exa_api_key.clone());
643    headers.insert("accept".to_string(), "application/json".to_string());
644    let response = client
645        .get_with_params_and_headers(&url, &params, headers)
646        .await
647        .map_err(|e| {
648            if e.to_string().contains("timeout") || e.to_string().contains("connection") {
649                WebToolError::Network(format!("Web search request failed: {}", e))
650            } else {
651                WebToolError::Config(format!("Web search request failed: {}", e))
652            }
653        })?;
654
655    // Parse and enhance results for news context
656    let mut results = parse_exa_search_response(&response, &topic)
657        .await
658        .map_err(|e| WebToolError::Config(format!("Failed to parse news response: {}", e)))?;
659
660    // Sort by recency
661    results.sort_by(|a, b| {
662        b.published_date
663            .unwrap_or_else(Utc::now)
664            .cmp(&a.published_date.unwrap_or_else(Utc::now))
665    });
666
667    let insights = if include_analysis.unwrap_or(true) {
668        analyze_news_results(&results)
669            .await
670            .map_err(|e| WebToolError::Config(format!("Failed to analyze news: {}", e)))?
671    } else {
672        SearchInsights {
673            common_topics: vec![],
674            date_distribution: HashMap::default(),
675            content_types: HashMap::default(),
676            avg_quality_score: None,
677            languages: HashMap::default(),
678            sentiment: None,
679        }
680    };
681
682    let search_result = WebSearchResult {
683        query: topic.clone(),
684        search_type: "news".to_string(),
685        results: results.clone(),
686        metadata: WebSearchMetadata {
687            total_results: results.len() as u32,
688            returned_results: results.len() as u32,
689            execution_time_ms: 1200,
690            filtered: true,
691            related_queries: generate_related_queries(&topic).await.map_err(|e| {
692                WebToolError::Config(format!("Failed to generate related queries: {}", e))
693            })?,
694            top_domains: extract_top_domains(&results),
695        },
696        insights,
697        searched_at: Utc::now(),
698    };
699
700    info!(
701        "Recent news search completed: {} results for '{}'",
702        search_result.results.len(),
703        topic
704    );
705
706    Ok(search_result)
707}
708
709/// Parse Exa search API response into structured results
710async fn parse_exa_search_response(
711    response: &str,
712    query: &str,
713) -> crate::error::Result<Vec<SearchResult>> {
714    let json: serde_json::Value = serde_json::from_str(response)
715        .map_err(|e| WebToolError::Parsing(format!("Invalid Exa JSON: {}", e)))?;
716
717    let mut out = Vec::new();
718    let results = json
719        .get("results")
720        .and_then(|v| v.as_array())
721        .cloned()
722        .unwrap_or_default();
723    for r in results {
724        let title = r
725            .get("title")
726            .and_then(|v| v.as_str())
727            .unwrap_or("")
728            .to_string();
729        let url = r
730            .get("url")
731            .and_then(|v| v.as_str())
732            .unwrap_or("")
733            .to_string();
734        if url.is_empty() {
735            continue;
736        }
737        let id = r
738            .get("id")
739            .and_then(|v| v.as_str())
740            .unwrap_or(url.as_str())
741            .to_string();
742        let description = r
743            .get("description")
744            .or_else(|| r.get("snippet"))
745            .and_then(|v| v.as_str())
746            .map(|s| s.to_string());
747        let content = r
748            .get("text")
749            .and_then(|v| v.as_str())
750            .map(|s| s.to_string());
751        let published_date = r
752            .get("publishedDate")
753            .or_else(|| r.get("published_date"))
754            .and_then(|v| v.as_str())
755            .and_then(|s| DateTime::parse_from_rfc3339(s).ok())
756            .map(|dt| dt.with_timezone(&Utc));
757        let domain_name = url::Url::parse(&url)
758            .ok()
759            .and_then(|u| u.host_str().map(|h| h.to_string()))
760            .unwrap_or_default();
761        let score = r.get("score").and_then(|v| v.as_f64()).unwrap_or(0.8);
762        let language = r
763            .get("language")
764            .and_then(|v| v.as_str())
765            .map(|s| s.to_string());
766        let author = r
767            .get("author")
768            .and_then(|v| v.as_str())
769            .map(|s| s.to_string());
770
771        let words = content
772            .as_ref()
773            .map(|c| c.split_whitespace().count() as u32)
774            .unwrap_or(0);
775        let reading_time = if words > 0 {
776            Some((words as f64 / 200.0).ceil() as u32)
777        } else {
778            None
779        };
780        let length_category = match words {
781            0..=200 => "Short",
782            201..=800 => "Medium",
783            801..=2000 => "Long",
784            _ => "Very Long",
785        }
786        .to_string();
787
788        let content_type = ContentType {
789            primary: "Article".to_string(),
790            format: if url.to_lowercase().ends_with(".pdf") {
791                "PDF".to_string()
792            } else {
793                "HTML".to_string()
794            },
795            is_paywalled: None,
796            quality_score: Some(((score * 100.0) as u32).min(100)),
797            length_category,
798        };
799
800        let metadata = PageMetadata {
801            author,
802            tags: vec![query.to_lowercase()],
803            social_meta: SocialMetadata {
804                og_title: None,
805                og_description: None,
806                og_image: None,
807                twitter_card: None,
808                twitter_site: None,
809            },
810            seo_meta: SeoMetadata {
811                meta_description: description.clone(),
812                meta_keywords: vec![],
813                robots: None,
814                schema_types: vec![],
815            },
816            canonical_url: None,
817            last_modified: None,
818        };
819
820        let domain = DomainInfo {
821            name: domain_name,
822            reputation_score: None,
823            category: None,
824            is_trusted: true,
825            authority_score: None,
826        };
827
828        out.push(SearchResult {
829            id,
830            title,
831            url,
832            description,
833            content,
834            summary: None,
835            published_date,
836            domain,
837            metadata,
838            relevance_score: score,
839            content_type,
840            language,
841            reading_time_minutes: reading_time,
842        });
843    }
844    Ok(out)
845}
846
847/// Parse similar pages API response
848async fn parse_similar_pages_response(response: &str) -> crate::error::Result<Vec<SearchResult>> {
849    // Reuse the general Exa parser without query context
850    parse_exa_search_response(response, "").await
851}
852
853/// Extract and summarize content from a single page using extractive summarization
854///
855/// Uses sentence ranking and selection rather than generative AI summarization.
856/// Ranks sentences by importance and selects diverse, representative ones.
857async fn extract_and_summarize_page(
858    client: &WebClient,
859    url: &str,
860    summary_length: &Option<String>,
861    focus_topics: &Option<Vec<String>>,
862) -> crate::error::Result<ContentSummary> {
863    let html = client
864        .get(url)
865        .await
866        .map_err(|e| WebToolError::Network(format!("Failed to fetch {}: {}", url, e)))?;
867    let (title, clean_text, sentences, headings) = extract_main_content(&html, url);
868
869    // Determine target summary length
870    let n = match summary_length.as_deref() {
871        Some("comprehensive") => 8,
872        Some("detailed") => 5,
873        _ => 3,
874    } as usize;
875
876    let topic_set: std::collections::HashSet<String> = focus_topics
877        .clone()
878        .unwrap_or_default()
879        .into_iter()
880        .map(|t| t.to_lowercase())
881        .collect();
882
883    let ranked = rank_sentences(&sentences, &clean_text, &topic_set, &headings);
884    let selected = select_diverse(&ranked, n, 0.6);
885    let executive_summary = selected.join(" ");
886
887    // Key points: top distinct sentences or heading-based bullets
888    let mut key_points = selected.iter().take(5).cloned().collect::<Vec<_>>();
889    if key_points.is_empty() && !headings.is_empty() {
890        key_points = headings.iter().take(5).cloned().collect();
891    }
892
893    let topics = if !topic_set.is_empty() {
894        topic_set.iter().cloned().collect()
895    } else {
896        extract_topics_from_text(&clean_text)
897    };
898
899    // Entities via improved proper-noun pattern
900    let entity_re = regex::Regex::new(r"(?m)(?:^|\s)([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})").unwrap();
901    let mut entities: Vec<ContentEntity> = entity_re
902        .captures_iter(&clean_text)
903        .map(|cap| ContentEntity {
904            name: cap[1].trim().to_string(),
905            entity_type: "ProperNoun".to_string(),
906            confidence: 0.55,
907            context: "".to_string(),
908        })
909        .collect();
910    entities.dedup_by(|a, b| a.name.eq_ignore_ascii_case(&b.name));
911    entities.truncate(8);
912
913    // Confidence based on content richness and heading availability
914    let mut confidence = (clean_text.len().min(8000) as f64 / 8000.0) * 0.6 + 0.3;
915    if !headings.is_empty() {
916        confidence += 0.05;
917    }
918    confidence = confidence.min(0.97);
919
920    Ok(ContentSummary {
921        url: url.to_string(),
922        title,
923        executive_summary,
924        key_points,
925        entities,
926        topics,
927        confidence,
928        generated_at: Utc::now(),
929    })
930}
931
932/// Extract main content using HTML parsing and content-density heuristics
933fn extract_main_content(
934    html: &str,
935    fallback_url: &str,
936) -> (String, String, Vec<String>, Vec<String>) {
937    let document = Html::parse_document(html);
938
939    // Prefer og:title
940    let sel_meta_title = Selector::parse("meta[property=\"og:title\"]").unwrap();
941    let title = document
942        .select(&sel_meta_title)
943        .filter_map(|el| el.value().attr("content"))
944        .map(|s| s.trim().to_string())
945        .find(|s| !s.is_empty())
946        .or_else(|| {
947            // Fallback to <title>
948            let sel_title = Selector::parse("title").unwrap();
949            document
950                .select(&sel_title)
951                .next()
952                .map(|e| e.text().collect::<String>().trim().to_string())
953        })
954        .unwrap_or_else(|| fallback_url.to_string());
955
956    // Candidate containers likely to hold article content
957    let candidates = vec![
958        "article",
959        "main",
960        "div#content",
961        "div#main",
962        "div.post-content",
963        "div.article-content",
964        "section.article",
965        "div.entry-content",
966        "div#main-content",
967    ];
968
969    let mut best_text = String::default();
970    let mut best_headings: Vec<String> = Vec::new();
971    for css in candidates {
972        if let Ok(sel) = Selector::parse(css) {
973            for node in document.select(&sel) {
974                let (text, headings) = extract_text_from_node(node);
975                if text.len() > best_text.len() {
976                    best_text = text;
977                    best_headings = headings;
978                }
979            }
980        }
981    }
982
983    if best_text.is_empty() {
984        // Fallback: collect from body paragraphs
985        if let Ok(sel) = Selector::parse("body") {
986            if let Some(body) = document.select(&sel).next() {
987                let (text, headings) = extract_text_from_node(body);
988                best_text = text;
989                best_headings = headings;
990            }
991        }
992    }
993
994    // Sentence split
995    let sentences: Vec<String> = split_sentences(&best_text)
996        .into_iter()
997        .filter(|s| s.split_whitespace().count() >= 5)
998        .collect();
999
1000    (title, best_text, sentences, best_headings)
1001}
1002
1003/// Extract text and headings from an HTML element node
1004fn extract_text_from_node(root: ElementRef) -> (String, Vec<String>) {
1005    let sel_exclude = [
1006        "script", "style", "noscript", "template", "header", "footer", "nav", "aside",
1007    ];
1008    let sel_p = Selector::parse("p, li").unwrap();
1009    let sel_h = Selector::parse("h1, h2, h3").unwrap();
1010
1011    // Headings
1012    let mut headings: Vec<String> = root
1013        .select(&sel_h)
1014        .map(|h| normalize_whitespace(&h.text().collect::<String>()))
1015        .filter(|s| !s.is_empty())
1016        .collect();
1017    headings.dedup();
1018
1019    // Paragraph-like text
1020    let mut blocks: Vec<String> = Vec::new();
1021    for p in root.select(&sel_p) {
1022        // Skip paragraphs inside excluded parents
1023        if has_excluded_ancestor(p, &sel_exclude) {
1024            continue;
1025        }
1026        let txt = normalize_whitespace(&p.text().collect::<String>());
1027        if txt.len() >= 40 {
1028            blocks.push(txt);
1029        }
1030    }
1031    let full = blocks.join("\n");
1032    (full, headings)
1033}
1034
1035/// Check if a node has any excluded ancestor elements
1036fn has_excluded_ancestor(mut node: ElementRef, excluded: &[&str]) -> bool {
1037    while let Some(parent) = node.ancestors().find_map(ElementRef::wrap) {
1038        let name = parent.value().name();
1039        if excluded.contains(&name) {
1040            return true;
1041        }
1042        node = parent;
1043        // continue up until root
1044        if node.parent().is_none() {
1045            break;
1046        }
1047    }
1048    false
1049}
1050
1051/// Normalize whitespace in text by collapsing multiple spaces into single spaces
1052fn normalize_whitespace(s: &str) -> String {
1053    let s = html_escape::decode_html_entities(s);
1054    let re = regex::Regex::new(r"\s+").unwrap();
1055    re.replace_all(&s, " ").trim().to_string()
1056}
1057
1058/// Split text into sentences based on punctuation
1059fn split_sentences(text: &str) -> Vec<String> {
1060    let mut v = Vec::new();
1061    let mut current = String::default();
1062    for ch in text.chars() {
1063        current.push(ch);
1064        if matches!(ch, '.' | '!' | '?') {
1065            let s = normalize_whitespace(&current);
1066            if !s.is_empty() {
1067                v.push(s);
1068            }
1069            current.clear();
1070        }
1071    }
1072    if !current.trim().is_empty() {
1073        v.push(normalize_whitespace(&current));
1074    }
1075    v
1076}
1077
1078/// Rank sentences with simple TF scoring + positional + heading/topic boosts
1079fn rank_sentences(
1080    sentences: &[String],
1081    full_text: &str,
1082    topics: &std::collections::HashSet<String>,
1083    headings: &[String],
1084) -> Vec<(String, f64)> {
1085    let mut tf: HashMap<String, f64> = HashMap::default();
1086    for w in full_text.split(|c: char| !c.is_alphanumeric()) {
1087        let w = w.to_lowercase();
1088        if w.len() < 3 {
1089            continue;
1090        }
1091        *tf.entry(w).or_insert(0.0) += 1.0;
1092    }
1093    // Normalize
1094    let max_tf = tf.values().cloned().fold(1.0, f64::max);
1095    for v in tf.values_mut() {
1096        *v /= max_tf;
1097    }
1098
1099    let heading_text = headings.join(" ").to_lowercase();
1100
1101    let mut scored: Vec<(String, f64)> = sentences
1102        .iter()
1103        .enumerate()
1104        .map(|(i, s)| {
1105            let words: Vec<String> = s
1106                .split(|c: char| !c.is_alphanumeric())
1107                .map(|w| w.to_lowercase())
1108                .filter(|w| w.len() >= 3)
1109                .collect();
1110            let mut score = 0.0;
1111            for w in &words {
1112                score += *tf.get(w).unwrap_or(&0.0);
1113            }
1114            // Length normalization
1115            let len = s.split_whitespace().count() as f64;
1116            if len > 0.0 {
1117                score /= len.powf(0.3);
1118            }
1119            // Positional boost (earlier sentences)
1120            score += 0.15 * (1.0 / ((i + 1) as f64).sqrt());
1121            // Topic boost
1122            if !topics.is_empty() {
1123                let lower = s.to_lowercase();
1124                for t in topics {
1125                    if lower.contains(t) {
1126                        score += 0.25;
1127                    }
1128                }
1129            }
1130            // Heading proximity boost
1131            for h in headings {
1132                if s.to_lowercase().contains(&h.to_lowercase()) {
1133                    score += 0.2;
1134                    break;
1135                }
1136            }
1137            // Title/headings semantic overlap
1138            if !heading_text.is_empty() {
1139                let overlap = jaccard(&s.to_lowercase(), &heading_text);
1140                score += 0.1 * overlap;
1141            }
1142            (s.clone(), score)
1143        })
1144        .collect();
1145    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
1146    scored
1147}
1148
/// Calculate Jaccard similarity between two strings
///
/// Treats each string as a set of whitespace-separated tokens and returns
/// |intersection| / |union|, or 0.0 when both strings are empty.
fn jaccard(a: &str, b: &str) -> f64 {
    let words_a: std::collections::HashSet<&str> = a.split_whitespace().collect();
    let words_b: std::collections::HashSet<&str> = b.split_whitespace().collect();
    let shared = words_a.intersection(&words_b).count();
    let total = words_a.union(&words_b).count();
    if total == 0 {
        return 0.0;
    }
    shared as f64 / total as f64
}
1161
1162/// Select diverse sentences from ranked list to avoid redundancy
1163fn select_diverse(scored: &[(String, f64)], k: usize, max_sim: f64) -> Vec<String> {
1164    let mut out: Vec<String> = Vec::new();
1165    for (s, _) in scored {
1166        if out.len() >= k {
1167            break;
1168        }
1169        if out
1170            .iter()
1171            .all(|t| jaccard(&s.to_lowercase(), &t.to_lowercase()) < max_sim)
1172        {
1173            out.push(s.clone());
1174        }
1175    }
1176    out
1177}
1178
1179/// Analyze search results to extract insights
1180async fn analyze_search_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
1181    let mut content_types = HashMap::default();
1182    let mut languages = HashMap::default();
1183    let mut date_distribution = HashMap::default();
1184    let mut topics = Vec::new();
1185
1186    for result in results {
1187        // Count content types
1188        *content_types
1189            .entry(result.content_type.primary.clone())
1190            .or_insert(0) += 1;
1191
1192        // Count languages
1193        if let Some(lang) = &result.language {
1194            *languages.entry(lang.clone()).or_insert(0) += 1;
1195        }
1196
1197        // Analyze publication dates
1198        if let Some(pub_date) = result.published_date {
1199            let days_ago = (Utc::now() - pub_date).num_days();
1200            let category = match days_ago {
1201                0..=1 => "today",
1202                2..=7 => "this_week",
1203                8..=30 => "this_month",
1204                _ => "older",
1205            };
1206            *date_distribution.entry(category.to_string()).or_insert(0) += 1;
1207        }
1208
1209        // Extract topics from metadata
1210        topics.extend(result.metadata.tags.clone());
1211    }
1212
1213    // Calculate average quality score
1214    let quality_scores: Vec<u32> = results
1215        .iter()
1216        .filter_map(|r| r.content_type.quality_score)
1217        .collect();
1218    let avg_quality_score = if !quality_scores.is_empty() {
1219        Some(quality_scores.iter().sum::<u32>() as f64 / quality_scores.len() as f64)
1220    } else {
1221        None
1222    };
1223
1224    Ok(SearchInsights {
1225        common_topics: topics,
1226        date_distribution,
1227        content_types,
1228        avg_quality_score,
1229        languages,
1230        sentiment: None, // Would analyze sentiment in production
1231    })
1232}
1233
/// Analyze news-specific results
///
/// Currently delegates to the general result analysis; kept as a separate
/// entry point so news-specific metrics (e.g. recency weighting) can be added
/// later without changing news-search callers.
async fn analyze_news_results(results: &[SearchResult]) -> crate::error::Result<SearchInsights> {
    // Similar to analyze_search_results but with news-specific analysis
    analyze_search_results(results).await
}
1239
1240/// Analyze similarity patterns between pages
1241async fn analyze_similarity(results: &[SearchResult]) -> crate::error::Result<SimilarityMetadata> {
1242    let avg_similarity =
1243        results.iter().map(|r| r.relevance_score).sum::<f64>() / results.len() as f64;
1244
1245    let common_themes = results
1246        .iter()
1247        .flat_map(|r| r.metadata.tags.clone())
1248        .collect::<std::collections::HashSet<_>>()
1249        .into_iter()
1250        .collect();
1251
1252    Ok(SimilarityMetadata {
1253        avg_similarity,
1254        method: "semantic_embeddings".to_string(),
1255        common_themes,
1256        content_overlap: 0.75, // Would calculate actual overlap
1257    })
1258}
1259
1260/// Generate related search queries
1261async fn generate_related_queries(query: &str) -> crate::error::Result<Vec<String>> {
1262    // Heuristic expansion of the query into related intents
1263    let mut variants = vec![
1264        format!("{} news", query),
1265        format!("{} latest", query),
1266        format!("{} guide", query),
1267        format!("{} tutorial", query),
1268        format!("{} best practices", query),
1269        format!("{} examples", query),
1270        format!("how to {}", query),
1271        format!("{} vs alternatives", query),
1272        format!("{} 2025 trends", query),
1273    ];
1274    variants.sort();
1275    variants.dedup();
1276    Ok(variants)
1277}
1278
1279/// Extract top domains from search results
1280fn extract_top_domains(results: &[SearchResult]) -> Vec<String> {
1281    let mut domain_counts: HashMap<String, u32> = HashMap::default();
1282
1283    for result in results {
1284        *domain_counts.entry(result.domain.name.clone()).or_insert(0) += 1;
1285    }
1286
1287    let mut domains: Vec<(String, u32)> = domain_counts.into_iter().collect();
1288    domains.sort_by(|a, b| b.1.cmp(&a.1));
1289
1290    domains
1291        .into_iter()
1292        .take(10)
1293        .map(|(domain, _)| domain)
1294        .collect()
1295}
1296
1297/// Format date filter for API requests
1298fn format_date_filter(window: &str) -> String {
1299    let days_ago = match window {
1300        "24h" | "day" => 1,
1301        "week" => 7,
1302        "month" => 30,
1303        "year" => 365,
1304        _ => 7,
1305    };
1306
1307    let date = Utc::now() - chrono::Duration::days(days_ago);
1308    date.format("%Y-%m-%d").to_string()
1309}
1310
/// Simple keyword topic extraction from text
///
/// Counts words of at least four characters (case-insensitive) that are not
/// stopwords and returns the five most frequent. Ties are broken
/// alphabetically so the output is deterministic.
fn extract_topics_from_text(text: &str) -> Vec<String> {
    let stopwords = [
        "the", "and", "for", "with", "that", "this", "from", "have", "your", "you", "are", "was",
        "were", "has", "had", "not", "but", "all", "any", "can", "will", "just", "into", "about",
        "over", "more", "than", "when", "what", "how", "why", "where", "then", "them", "they",
        "their", "its", "it's", "as", "of", "in", "on", "to", "by", "at", "or", "an", "be",
    ];
    let mut counts: HashMap<String, u32> = HashMap::default();
    for word in text.split(|c: char| !c.is_alphanumeric()) {
        let word = word.to_lowercase();
        if word.len() < 4 || stopwords.contains(&word.as_str()) {
            continue;
        }
        *counts.entry(word).or_insert(0) += 1;
    }
    let mut ranked: Vec<(String, u32)> = counts.into_iter().collect();
    // BUG FIX: sorting by count alone left equal-count words in HashMap
    // iteration order, which varies between runs. Break ties alphabetically
    // so the same text always yields the same topic list.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    ranked.into_iter().take(5).map(|(word, _)| word).collect()
}
1334
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config should point at the public Exa API with sane limits.
    #[test]
    fn test_web_search_config_default() {
        let config = WebSearchConfig::default();
        assert_eq!(config.exa_base_url, "https://api.exa.ai");
        assert_eq!(config.max_results, 20);
    }

    /// A fully-populated `SearchResult` must serialize cleanly via serde_json.
    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            id: "1".to_string(),
            title: "Test Page".to_string(),
            url: "https://example.com".to_string(),
            description: Some("Test description".to_string()),
            content: Some("Test content".to_string()),
            summary: None,
            published_date: Some(Utc::now()),
            domain: DomainInfo {
                name: "example.com".to_string(),
                reputation_score: Some(80),
                category: Some("Test".to_string()),
                is_trusted: true,
                authority_score: Some(70),
            },
            metadata: PageMetadata {
                author: None,
                tags: vec!["test".to_string()],
                social_meta: SocialMetadata {
                    og_title: None,
                    og_description: None,
                    og_image: None,
                    twitter_card: None,
                    twitter_site: None,
                },
                seo_meta: SeoMetadata {
                    meta_description: None,
                    meta_keywords: vec![],
                    robots: None,
                    schema_types: vec![],
                },
                canonical_url: None,
                last_modified: None,
            },
            relevance_score: 0.8,
            content_type: ContentType {
                primary: "Article".to_string(),
                format: "HTML".to_string(),
                is_paywalled: Some(false),
                quality_score: Some(75),
                length_category: "Medium".to_string(),
            },
            language: Some("en".to_string()),
            reading_time_minutes: Some(5),
        };

        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("Test Page"));
    }

    /// Date filters must come out as 10-character `YYYY-MM-DD` strings.
    #[test]
    fn test_format_date_filter() {
        let result = format_date_filter("week");
        assert!(!result.is_empty());
        // assert_eq! over `assert!(a == b)` so a failure prints both values.
        assert_eq!(result.len(), 10); // YYYY-MM-DD format
    }
}