essence-engine 0.2.0

use crate::{
    crawler::config::{CircuitBreaker, CrawlerConfig, MemoryMonitor},
    crawler::filter::{is_same_domain, should_crawl_url},
    crawler::pagination::{PaginationConfig, PaginationDetector},
    crawler::prioritizer::{PrioritizedUrl, UrlPrioritizer},
    crawler::rate_limiter::DomainRateLimiter,
    crawler::url_normalization::normalize_url,
    engines::{http::HttpEngine, ScrapeEngine},
    error::{Result, ScrapeError},
    format,
    types::{CrawlRequest, Document, ScrapeRequest},
    utils::robots,
};
use scraper::{Html, Selector};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::Arc;
use tracing::{debug, info, warn};
use url::Url;

/// Crawl a website starting from the given URL
pub async fn crawl_website(request: &CrawlRequest) -> Result<Vec<Document>> {
    info!("Starting crawl from URL: {}", request.url);

    // Parse and validate base URL
    let _base_url = Url::parse(&request.url)
        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;

    // Normalize the base URL to prevent duplicates using comprehensive normalization
    let normalized_base_url = normalize_url(&request.url);

    // Initialize crawler config with bounds
    let config = CrawlerConfig::default();

    // Initialize circuit breaker and memory monitor
    let circuit_breaker = CircuitBreaker::new(config.circuit_breaker_threshold);
    let memory_monitor = MemoryMonitor::new(config.max_memory_mb, config.enable_memory_monitoring);

    // Initialize URL prioritizer for intelligent crawling
    let url_prioritizer = UrlPrioritizer::new();

    // Initialize data structures with capacity hints
    let mut visited = HashSet::new();
    let mut queue = BinaryHeap::with_capacity(config.max_queue_size.min(1000));
    let mut documents = Vec::with_capacity(request.limit.min(1000) as usize);
    let mut url_depths: HashMap<String, u32> = HashMap::new();

    // Add normalized base URL to priority queue
    let base_prioritized_url = PrioritizedUrl::new(normalized_base_url.clone(), 0, &url_prioritizer);
    queue.push(base_prioritized_url);
    url_depths.insert(normalized_base_url.clone(), 0);

    // Check robots.txt for the domain
    let robots_cache = check_robots_txt(&request.url, request.ignore_sitemap).await;

    // Create HTTP engine for scraping
    let engine = HttpEngine::new()?;

    // Initialize pagination detector with configuration
    let pagination_config = PaginationConfig {
        max_pages: request.max_pagination_pages.unwrap_or(50) as usize,
        max_depth: request.max_depth as usize,
        detect_circular: true,
    };
    let mut pagination_detector = PaginationDetector::new(pagination_config);
    let detect_pagination = request.detect_pagination.unwrap_or(true);

    // Create rate limiter
    let rate_limit = std::env::var("CRAWL_RATE_LIMIT_PER_SEC")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(2);
    let rate_limiter = Arc::new(DomainRateLimiter::new(rate_limit));

    info!(
        "Rate limiting enabled: {} requests/second per domain",
        rate_limit
    );

    // Priority-based crawl (BFS with intelligent ordering)
    while let Some(prioritized_url) = queue.pop() {
        let current_url = prioritized_url.url;
        let current_depth = prioritized_url.depth;
        // Check memory limit periodically (every 10 iterations)
        if documents.len() % 10 == 0 {
            if let Err(e) = memory_monitor.check_memory_limit() {
                warn!("Memory limit check failed: {}", e);
                return Err(e);
            }
        }

        // Check if we've reached the document limit
        if documents.len() >= request.limit as usize {
            info!("Reached crawl limit of {} pages", request.limit);
            break;
        }

        // Skip if already visited
        if visited.contains(&current_url) {
            continue;
        }

        // Extract domain for circuit breaker
        let domain = match Url::parse(&current_url) {
            Ok(parsed) => parsed.host_str().unwrap_or("unknown").to_string(),
            Err(_) => "unknown".to_string(),
        };

        // Check circuit breaker
        if config.enable_circuit_breaker && circuit_breaker.should_skip(&domain) {
            warn!(
                "Circuit breaker: Skipping domain {} due to excessive failures ({})",
                domain,
                circuit_breaker.get_failure_count(&domain)
            );
            visited.insert(current_url.clone());
            continue;
        }

        // Skip if depth exceeds max_depth
        if current_depth > request.max_depth {
            debug!("Skipping URL due to depth limit: {}", current_url);
            continue;
        }

        // Check robots.txt
        if !robots_cache {
            match robots::is_allowed_default(&current_url).await {
                Ok(allowed) => {
                    if !allowed {
                        warn!("Robots.txt disallows crawling: {}", current_url);
                        visited.insert(current_url.clone());
                        continue;
                    }
                }
                Err(e) => {
                    warn!("Failed to check robots.txt for {}: {}", current_url, e);
                }
            }
        }

        // Check if URL should be crawled based on filters
        if !should_crawl_url(&current_url, &request.include_paths, &request.exclude_paths) {
            debug!(
                "URL filtered out by include/exclude patterns: {}",
                current_url
            );
            visited.insert(current_url.clone());
            continue;
        }

        // Mark as visited (URL is already normalized from queue)
        visited.insert(current_url.clone());


        // Apply rate limiting before scraping
        if let Err(e) = rate_limiter.wait_for_permission(&current_url).await {
            warn!("Rate limiter error for {}: {}", current_url, e);
            continue;
        }

        // Scrape the URL
        info!(
            "Crawling URL: {} (depth: {}, total: {})",
            current_url,
            current_depth,
            documents.len()
        );

        match scrape_url(&current_url, &engine).await {
            Ok((document, links, html)) => {
                // Record success in circuit breaker
                if config.enable_circuit_breaker {
                    circuit_breaker.record_success(&domain);
                }

                // Add document to results
                documents.push(document);

                // Process discovered links if we haven't reached max depth
                if current_depth < request.max_depth {
                    // First, detect pagination links if enabled
                    let pagination_links = if detect_pagination {
                        pagination_detector.detect_pagination(&html, &current_url)
                    } else {
                        Vec::new()
                    };

                    // Add pagination links with priority (same depth as current)
                    for link in &pagination_links {
                        let normalized_link = normalize_url(link);

                        // Skip if already visited or queued
                        if visited.contains(&normalized_link)
                            || url_depths.contains_key(&normalized_link)
                        {
                            continue;
                        }

                        // Check domain restrictions
                        if is_same_domain(&normalized_link, &normalized_base_url) {
                            // Check queue size limit before adding
                            if queue.len() >= config.max_queue_size {
                                warn!(
                                    "Queue limit reached ({}/{}), skipping pagination link: {}",
                                    queue.len(),
                                    config.max_queue_size,
                                    link
                                );
                                continue;
                            }

                            // Add pagination links with high priority (same depth as current)
                            let prioritized_link = PrioritizedUrl::new(
                                normalized_link.clone(),
                                current_depth,
                                &url_prioritizer,
                            );
                            queue.push(prioritized_link);
                            url_depths.insert(normalized_link, current_depth);
                            debug!("Added pagination link to queue: {}", link);
                        }
                    }

                    // Then process regular links
                    for link in links {
                        // Normalize the link to prevent duplicates using comprehensive normalization
                        let normalized_link = normalize_url(&link);

                        // Skip if already visited or queued
                        if visited.contains(&normalized_link)
                            || url_depths.contains_key(&normalized_link)
                        {
                            continue;
                        }

                        // Skip if this is a pagination link (already processed)
                        if pagination_links.contains(&link) || pagination_links.contains(&normalized_link) {
                            continue;
                        }

                        // Check domain restrictions
                        let allow_link = if request.allow_external_links.unwrap_or(false) {
                            true
                        } else if request.allow_backward_links.unwrap_or(false) {
                            // Allow backward links means crawl entire domain
                            is_same_domain(&normalized_link, &normalized_base_url)
                        } else {
                            // Only allow links that are "forward" (same path or deeper)
                            is_same_domain(&normalized_link, &normalized_base_url)
                                && is_forward_link(&normalized_link, &current_url)
                        };

                        if allow_link {
                            // Check queue size limit before adding
                            if queue.len() >= config.max_queue_size {
                                debug!(
                                    "Queue limit reached ({}/{}), skipping link: {}",
                                    queue.len(),
                                    config.max_queue_size,
                                    link
                                );
                                continue;
                            }

                            // Apply backpressure: when queue is at threshold, skip secondary links
                            let backpressure_limit =
                                (config.max_queue_size * config.backpressure_threshold as usize) / 100;
                            if queue.len() >= backpressure_limit {
                                debug!(
                                    "Queue at backpressure threshold ({}/{}), slowing link discovery",
                                    queue.len(),
                                    config.max_queue_size
                                );
                                // Only add if this is a direct child (depth + 1)
                                // Skip secondary/deeper links
                                if current_depth + 1 < request.max_depth {
                                    let prioritized_link = PrioritizedUrl::new(
                                        normalized_link.clone(),
                                        current_depth + 1,
                                        &url_prioritizer,
                                    );
                                    queue.push(prioritized_link);
                                    url_depths.insert(normalized_link, current_depth + 1);
                                }
                            } else {
                                let prioritized_link = PrioritizedUrl::new(
                                    normalized_link.clone(),
                                    current_depth + 1,
                                    &url_prioritizer,
                                );
                                queue.push(prioritized_link);
                                url_depths.insert(normalized_link, current_depth + 1);
                            }
                        }
                    }
                }
            }
            Err(e) => {
                warn!("Failed to scrape {}: {}", current_url, e);

                // Record failure in circuit breaker
                if config.enable_circuit_breaker {
                    circuit_breaker.record_failure(&domain);
                }

                // Continue crawling despite errors
            }
        }
    }

    // Log final stats
    info!(
        "Crawl stats - Queue size: {}, Circuit breaker failures: {}, Memory: {:.2}%",
        queue.len(),
        circuit_breaker.get_total_failures(),
        memory_monitor.get_memory_percentage()
    );

    info!(
        "Crawl completed. Total pages crawled: {}, visited: {}",
        documents.len(),
        visited.len()
    );

    Ok(documents)
}

/// Check if robots.txt allows crawling
async fn check_robots_txt(url: &str, ignore_sitemap: Option<bool>) -> bool {
    if ignore_sitemap.unwrap_or(false) {
        return true;
    }

    match robots::is_allowed_default(url).await {
        Ok(allowed) => allowed,
        Err(e) => {
            warn!("Failed to check robots.txt: {}, allowing by default", e);
            true
        }
    }
}

/// Scrape a single URL and extract links
async fn scrape_url(url: &str, engine: &HttpEngine) -> Result<(Document, Vec<String>, String)> {
    // Create a scrape request
    let scrape_request = ScrapeRequest {
        url: url.to_string(),
        formats: vec!["markdown".to_string(), "links".to_string()],
        headers: HashMap::new(),
        include_tags: vec![],
        exclude_tags: vec![],
        only_main_content: true,
        timeout: 30000,
        wait_for: 0,
        remove_base64_images: true,
        skip_tls_verification: false,
        engine: "http".to_string(),
        wait_for_selector: None,
        actions: vec![],
        screenshot: false,
        screenshot_format: "png".to_string(),
    };

    // Scrape the URL
    let raw_result = engine.scrape(&scrape_request).await?;

    // Extract links from HTML
    let links = extract_links(&raw_result.html, url)?;

    // Store HTML for pagination detection
    let html = raw_result.html.clone();

    // Process the result into a document
    let document = format::process_scrape_result(raw_result, &scrape_request).await?;

    Ok((document, links, html))
}

/// Extract all links from HTML
fn extract_links(html: &str, base_url: &str) -> Result<Vec<String>> {
    let document = Html::parse_document(html);
    let selector = Selector::parse("a[href]")
        .map_err(|e| ScrapeError::Internal(format!("Failed to create link selector: {:?}", e)))?;

    let base = Url::parse(base_url)
        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;

    let mut links = Vec::new();

    for element in document.select(&selector) {
        if let Some(href) = element.value().attr("href") {
            // Skip javascript:, mailto:, tel:, etc.
            if href.starts_with("javascript:")
                || href.starts_with("mailto:")
                || href.starts_with("tel:")
                || href.starts_with('#')
            {
                continue;
            }

            // Parse and resolve the URL
            match base.join(href) {
                Ok(absolute_url) => {
                    let url_str = absolute_url.to_string();
                    // Remove fragment
                    let url_without_fragment = url_str.split('#').next().unwrap_or(&url_str);
                    links.push(url_without_fragment.to_string());
                }
                Err(_) => {
                    // Skip invalid URLs
                    continue;
                }
            }
        }
    }

    // Deduplicate
    links.sort();
    links.dedup();

    Ok(links)
}

/// Check if a link is "forward" (same path or deeper)
fn is_forward_link(link: &str, current: &str) -> bool {
    let link_parsed = match Url::parse(link) {
        Ok(u) => u,
        Err(_) => return false,
    };

    let current_parsed = match Url::parse(current) {
        Ok(u) => u,
        Err(_) => return false,
    };

    let link_path = link_parsed.path();
    let current_path = current_parsed.path();

    // A link is forward if:
    // 1. It has the same path as current, or
    // 2. It's a subpath of current (starts with current path + '/')
    if link_path == current_path {
        return true;
    }

    // Ensure we're checking path boundaries, not just string prefixes
    // /news/articles should match /news, but /newsletter should NOT match /news
    let current_with_slash = if current_path.ends_with('/') {
        current_path.to_string()
    } else {
        format!("{}/", current_path)
    };

    link_path.starts_with(&current_with_slash)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_links() {
        let html = r##"
            <html>
                <body>
                    <a href="/page1">Page 1</a>
                    <a href="/page2">Page 2</a>
                    <a href="https://example.com/page3">Page 3</a>
                    <a href="javascript:void(0)">JS</a>
                    <a href="mailto:test@example.com">Email</a>
                    <a href="#section">Section</a>
                </body>
            </html>
        "##;

        let links = extract_links(html, "https://example.com").unwrap();

        assert!(links.contains(&"https://example.com/page1".to_string()));
        assert!(links.contains(&"https://example.com/page2".to_string()));
        assert!(links.contains(&"https://example.com/page3".to_string()));
        assert!(!links.iter().any(|l| l.contains("javascript:")));
        assert!(!links.iter().any(|l| l.contains("mailto:")));
        assert!(!links.iter().any(|l| l.contains('#')));
    }

    #[test]
    fn test_is_forward_link() {
        // Forward: subpath of current
        assert!(is_forward_link(
            "https://example.com/blog/post1",
            "https://example.com/blog"
        ));

        // Forward: same path
        assert!(is_forward_link(
            "https://example.com/blog",
            "https://example.com/blog"
        ));

        // NOT forward: different path
        assert!(!is_forward_link(
            "https://example.com/about",
            "https://example.com/blog"
        ));

        // NOT forward: path boundary check (newsletter is not a subpath of news)
        assert!(!is_forward_link(
            "https://example.com/newsletter",
            "https://example.com/news"
        ));

        // Forward: with trailing slash
        assert!(is_forward_link(
            "https://example.com/blog/post1",
            "https://example.com/blog/"
        ));
    }

    #[test]
    fn test_url_deduplication_with_trailing_slash() {
        // Test that URLs with and without trailing slashes are properly deduplicated
        use std::collections::HashSet;

        let mut visited: HashSet<String> = HashSet::new();
        let mut url_depths: HashMap<String, u32> = HashMap::new();

        // Simulate adding URLs that differ only by trailing slash
        let url1 = normalize_url("https://example.com");
        let url2 = normalize_url("https://example.com/");

        // Both should normalize to the same URL
        assert_eq!(url1, url2, "URLs should be identical after normalization");

        // Add first URL
        visited.insert(url1.clone());
        url_depths.insert(url1.clone(), 0);

        // Second URL should be detected as duplicate
        assert!(
            visited.contains(&url2),
            "Second URL should be detected as already visited"
        );
        assert!(
            url_depths.contains_key(&url2),
            "Second URL should be detected as already in depth map"
        );

        // Only one unique URL should exist
        assert_eq!(visited.len(), 1, "Should have exactly one unique URL");
        assert_eq!(
            url_depths.len(),
            1,
            "Should have exactly one URL in depth map"
        );
    }

    #[test]
    fn test_extract_links_deduplication() {
        // Test that extract_links properly handles trailing slashes and fragments
        let html = r##"
            <html>
                <body>
                    <a href="/">Home</a>
                    <a href="/page">Page</a>
                    <a href="/page/">Page with slash</a>
                    <a href="/page#section1">Page section 1</a>
                    <a href="/page#section2">Page section 2</a>
                    <a href="/other">Other</a>
                </body>
            </html>
        "##;

        let links = extract_links(html, "https://example.com").unwrap();

        // Count how many times each normalized URL appears
        let mut normalized_links: Vec<String> = links
            .iter()
            .map(|link| normalize_url(link))
            .collect();
        normalized_links.sort();
        normalized_links.dedup();

        // Should have exactly 3 unique pages: /, /page, /other
        // (All variants of /page should normalize to one URL)
        assert_eq!(
            normalized_links.len(),
            3,
            "Should have exactly 3 unique normalized URLs"
        );
    }
}