#![cfg(feature = "crawler")]
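//! Feature-gated web crawler built on the [`spider`] crate, with
//! regex-based helpers for extracting titles, metadata, text, and links.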
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
pub use spider;
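/// Tunable crawl behavior; see [`CrawlerConfig::polite`] and
/// [`CrawlerConfig::aggressive`] for presets.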
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerConfig {
pub max_concurrent: usize,
pub timeout_secs: u64,
pub delay_ms: u64,
pub max_depth: usize,
pub max_pages_per_domain: usize,
pub user_agent: String,
pub respect_robots_txt: bool,
pub smart_mode: bool,
pub follow_redirects: bool,
pub max_redirects: usize,
}
impl Default for CrawlerConfig {
fn default() -> Self {
Self {
max_concurrent: 10,
timeout_secs: 30,
delay_ms: 100,
max_depth: 3,
max_pages_per_domain: 1000,
user_agent: "ReasonKit-Web/0.1 (+https://reasonkit.sh)".to_string(),
respect_robots_txt: true,
smart_mode: true,
follow_redirects: true,
max_redirects: 5,
}
}
}
impl CrawlerConfig {
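    /// Conservative preset: low concurrency and a 1s delay between requests.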
pub fn polite() -> Self {
Self {
max_concurrent: 2,
delay_ms: 1000,
..Default::default()
}
}
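    /// High-throughput preset. Note that this disables `robots.txt`
    /// handling; use it only against hosts you control.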
pub fn aggressive() -> Self {
Self {
max_concurrent: 50,
delay_ms: 10,
respect_robots_txt: false,
..Default::default()
}
}
}
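/// A single fetched page plus everything extracted from it.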
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawledPage {
pub url: String,
pub status: u16,
pub title: Option<String>,
pub description: Option<String>,
pub content: String,
pub html: String,
pub links: Vec<ExtractedLink>,
pub metadata: PageMetadata,
pub crawled_at: chrono::DateTime<chrono::Utc>,
}
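/// A hyperlink found in a page, resolved against the page URL.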
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedLink {
pub url: String,
pub text: Option<String>,
pub rel: Option<String>,
pub is_internal: bool,
}
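/// Header- and meta-derived page attributes. The crawl methods currently
/// fill this with defaults; populating it is left to callers.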
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageMetadata {
pub content_type: Option<String>,
pub content_length: Option<usize>,
pub last_modified: Option<String>,
pub etag: Option<String>,
pub language: Option<String>,
pub keywords: Vec<String>,
pub author: Option<String>,
pub canonical_url: Option<String>,
}
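/// Aggregate counters for a crawl run; not yet populated by [`WebCrawler`].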
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CrawlStats {
pub pages_crawled: usize,
pub pages_failed: usize,
pub bytes_downloaded: usize,
pub links_found: usize,
pub duration_ms: u64,
}
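/// Crawler that wraps [`spider`] and tracks visited URLs across calls.
///
/// A minimal usage sketch (marked `ignore`: it needs network access, and the
/// surrounding async setup is assumed rather than shown):
///
/// ```ignore
/// let mut crawler = WebCrawler::new(CrawlerConfig::polite());
/// let page = crawler.crawl_url("https://example.com").await?;
/// println!("{}: {} links found", page.url, page.links.len());
/// ```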
pub struct WebCrawler {
config: CrawlerConfig,
visited: HashSet<String>,
}
impl WebCrawler {
pub fn new(config: CrawlerConfig) -> Self {
Self {
config,
visited: HashSet::new(),
}
}
pub fn config(&self) -> &CrawlerConfig {
&self.config
}
pub fn is_visited(&self, url: &str) -> bool {
self.visited.contains(url)
}
pub fn mark_visited(&mut self, url: &str) {
self.visited.insert(url.to_string());
}
pub fn visited_count(&self) -> usize {
self.visited.len()
}
pub fn clear_visited(&mut self) {
self.visited.clear();
}
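    /// Fetches a single page (`with_limit(1)`) and extracts its content.
    ///
    /// Only `respect_robots_txt` is forwarded to spider here; the timeout,
    /// delay, and redirect settings in [`CrawlerConfig`] are not yet applied.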
pub async fn crawl_url(&mut self, url: &str) -> Result<CrawledPage> {
use spider::website::Website;
let mut website = Website::new(url);
website.with_limit(1);
if !self.config.respect_robots_txt {
website.with_respect_robots_txt(false);
}
website.crawl().await;
        let pages = website.get_pages().context("spider returned no pages")?;
if let Some(page) = pages.first() {
self.mark_visited(url);
let html = page.get_html();
let content = self.extract_content(&html);
let links = self.extract_links(&html, url);
let title = self.extract_title(&html);
let description = self.extract_meta(&html, "description");
Ok(CrawledPage {
url: url.to_string(),
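                // NOTE: spider does not surface the response status here, so
                // a successful fetch (200) is assumed.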
status: 200,
title,
description,
content,
html,
links,
metadata: PageMetadata::default(),
crawled_at: chrono::Utc::now(),
})
} else {
anyhow::bail!("Failed to crawl URL: {}", url)
}
}
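    /// Crawls up to `max_pages_per_domain` pages starting from `start_url`.
    ///
    /// As with [`WebCrawler::crawl_url`], only the page limit and
    /// `respect_robots_txt` are forwarded to spider.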
pub async fn crawl_site(&mut self, start_url: &str) -> Result<Vec<CrawledPage>> {
use spider::website::Website;
let mut website = Website::new(start_url);
website.with_limit(self.config.max_pages_per_domain as u32);
if !self.config.respect_robots_txt {
website.with_respect_robots_txt(false);
}
website.crawl().await;
        let pages = website.get_pages().context("spider returned no pages")?;
let mut crawled_pages = Vec::new();
for page in pages.iter() {
let url = page.get_url();
let html = page.get_html();
self.mark_visited(&url);
let title = self.extract_title(&html);
let description = self.extract_meta(&html, "description");
let content = self.extract_content(&html);
let links = self.extract_links(&html, &url);
crawled_pages.push(CrawledPage {
url: url.to_string(),
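                // NOTE: spider does not surface the response status here, so
                // a successful fetch (200) is assumed.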
status: 200,
title,
description,
content,
html,
links,
metadata: PageMetadata::default(),
crawled_at: chrono::Utc::now(),
});
}
Ok(crawled_pages)
}
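    /// Extracts readable text, preferring the `readability` extractor when
    /// the `content-extraction` feature is enabled and falling back to naive
    /// tag stripping otherwise.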
fn extract_content(&self, html: &str) -> String {
#[cfg(feature = "content-extraction")]
{
use readability::extractor;
use url::Url;
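            // readability wants a base URL for resolving relative links; a
            // placeholder is fine since only the extracted text is kept.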
if let Ok(product) = extractor::extract(
&mut html.as_bytes(),
&Url::parse("http://example.com").unwrap(),
) {
return product.text;
}
}
self.strip_html(html)
}
    fn strip_html(&self, html: &str) -> String {
        // Drop tags, then collapse whitespace runs into single spaces.
        let re = regex::Regex::new(r"<[^>]+>").unwrap();
        let text = re.replace_all(html, " ");
        text.split_whitespace().collect::<Vec<_>>().join(" ")
    }
    fn extract_title(&self, html: &str) -> Option<String> {
        let re = regex::Regex::new(r"<title[^>]*>([^<]+)</title>").ok()?;
        re.captures(html).map(|c| c[1].trim().to_string())
    }
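    /// Extracts a `<meta name="..." content="...">` value. Only the
    /// `name`-before-`content` attribute order is matched; tags written with
    /// `content` first are missed.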
fn extract_meta(&self, html: &str, name: &str) -> Option<String> {
let pattern = format!(
r#"<meta[^>]+name=["']{}["'][^>]+content=["']([^"']+)["']"#,
name
);
let re = regex::Regex::new(&pattern).ok()?;
re.captures(html).map(|c| c[1].to_string())
}
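    /// Finds `<a href>` links with a regex, resolves them against `base_url`,
    /// and flags same-host links as internal. A regex cannot parse arbitrary
    /// HTML, so this is best-effort.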
fn extract_links(&self, html: &str, base_url: &str) -> Vec<ExtractedLink> {
let mut links = Vec::new();
let base = url::Url::parse(base_url).ok();
let re = regex::Regex::new(r#"<a[^>]+href=["']([^"']+)["'][^>]*>([^<]*)</a>"#).unwrap();
for cap in re.captures_iter(html) {
let href = &cap[1];
let text = cap.get(2).map(|m| m.as_str().to_string());
let resolved_url = if let Some(base) = &base {
base.join(href)
.map(|u| u.to_string())
.unwrap_or_else(|_| href.to_string())
} else {
href.to_string()
};
            let is_internal = match (&base, url::Url::parse(&resolved_url)) {
                (Some(base), Ok(resolved)) => resolved.host_str() == base.host_str(),
                _ => false,
            };
links.push(ExtractedLink {
url: resolved_url,
text,
rel: None,
is_internal,
});
}
links
}
}
impl Default for WebCrawler {
fn default() -> Self {
Self::new(CrawlerConfig::default())
}
}
#[cfg(feature = "compliant-crawling")]
pub mod sitemap_parser {
use super::*;
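    /// Parses sitemap XML into a flat list of [`SitemapEntry`] values.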
pub fn parse_sitemap(xml: &str) -> Result<Vec<SitemapEntry>> {
use sitemap::reader::{SiteMapEntity, SiteMapReader};
let mut entries = Vec::new();
let reader = SiteMapReader::new(xml.as_bytes());
for entry in reader {
if let SiteMapEntity::Url(url_entry) = entry {
let change_freq = match &url_entry.changefreq {
sitemap::structs::ChangeFreq::None => None,
freq => Some(format!("{:?}", freq)),
};
entries.push(SitemapEntry {
url: url_entry
.loc
.get_url()
.map(|u| u.to_string())
.unwrap_or_default(),
last_modified: url_entry.lastmod.get_time().map(|d| d.to_string()),
change_frequency: change_freq,
priority: url_entry.priority.get_priority().map(|p| p as f64),
});
}
}
Ok(entries)
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SitemapEntry {
pub url: String,
pub last_modified: Option<String>,
pub change_frequency: Option<String>,
pub priority: Option<f64>,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = CrawlerConfig::default();
assert!(config.respect_robots_txt);
assert!(config.smart_mode);
}
#[test]
fn test_config_polite() {
let config = CrawlerConfig::polite();
assert_eq!(config.max_concurrent, 2);
assert_eq!(config.delay_ms, 1000);
}
#[test]
fn test_crawler_visited() {
let mut crawler = WebCrawler::default();
assert!(!crawler.is_visited("https://example.com"));
crawler.mark_visited("https://example.com");
assert!(crawler.is_visited("https://example.com"));
assert_eq!(crawler.visited_count(), 1);
}
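    // Sketch of a regression test for the regex-based helpers; the HTML
    // snippets are illustrative, not taken from real crawl output.
    #[test]
    fn test_extract_helpers() {
        let crawler = WebCrawler::default();
        let html =
            "<html><head><title>Hello</title></head><body><a href=\"/about\">About</a></body></html>";
        assert_eq!(crawler.extract_title(html).as_deref(), Some("Hello"));
        assert_eq!(crawler.strip_html("<p>Hi   there</p>"), "Hi there");
        let links = crawler.extract_links(html, "https://example.com");
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].url, "https://example.com/about");
        assert!(links[0].is_internal);
    }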
}