halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Decision - Deciding whether a page must be rendered in a headless browser

use scraper::{Html, Selector};

/// Outcome of inspecting a page: can the fetched HTML be used directly, or
/// does the page have to be rendered first?
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RenderDecision {
    /// The HTML as fetched is good enough; no JavaScript rendering is needed.
    Static,
    /// The page should be rendered in a headless browser; the payload says why.
    NeedsBrowser(BrowserReason),
}

/// Why a page was judged to need a headless browser.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BrowserReason {
    /// The page body contains little or no text.
    EmptyContent,
    /// The page pulls in a large number of scripts.
    HeavyScripts,
    /// A JavaScript framework was recognised (React, Vue, Angular, ...);
    /// the payload is the framework's display name.
    JsFramework(String),
    /// Content is lazy-loaded in a way that cannot be resolved statically.
    LazyLoading,
    /// None of the selectors the caller required could be found.
    MissingSelectors,
    /// The page looks like a single-page application shell.
    SinglePageApp,
}

/// Heuristic checker that decides whether a page's HTML can be used as
/// fetched or has to be rendered in a headless browser.
///
/// Configure it through the `with_*` builder methods, then call `check`.
#[derive(Debug, Clone)]
pub struct RenderChecker {
    /// Number of external scripts (`script[src]`) above which the page is
    /// considered script-heavy.
    script_threshold: usize,
    /// Minimum amount of body text; pages with less are considered empty.
    min_content_length: usize,
    /// CSS selectors of which at least one must match; when the list is
    /// non-empty and none matches, a browser is required.
    required_selectors: Vec<String>,
}

impl Default for RenderChecker {
    fn default() -> Self {
        Self {
            script_threshold: 10,
            min_content_length: 500,
            required_selectors: Vec::new(),
        }
    }
}

impl RenderChecker {
    /// New checker
    pub fn new() -> Self {
        Self::default()
    }

    /// Configure the script threshold
    pub fn with_script_threshold(mut self, threshold: usize) -> Self {
        self.script_threshold = threshold;
        self
    }

    /// Configure the minimum content
    pub fn with_min_content(mut self, min_length: usize) -> Self {
        self.min_content_length = min_length;
        self
    }

    /// Configure the required selectors
    pub fn with_required_selectors(mut self, selectors: Vec<String>) -> Self {
        self.required_selectors = selectors;
        self
    }

    /// Check if JS rendering is necessary
    pub fn check(&self, html: &str) -> RenderDecision {
        let document = Html::parse_document(html);

        // 1. Check if page is empty
        if let Some(reason) = self.check_empty_content(&document) {
            return RenderDecision::NeedsBrowser(reason);
        }

        // 2. Check JS frameworks
        if let Some(reason) = self.check_js_frameworks(html) {
            return RenderDecision::NeedsBrowser(reason);
        }

        // 3. Check number of scripts
        if let Some(reason) = self.check_heavy_scripts(&document) {
            return RenderDecision::NeedsBrowser(reason);
        }

        // 4. Check required selectors
        if let Some(reason) = self.check_required_selectors(&document) {
            return RenderDecision::NeedsBrowser(reason);
        }

        // 5. Check SPA patterns
        if let Some(reason) = self.check_spa_patterns(&document) {
            return RenderDecision::NeedsBrowser(reason);
        }

        RenderDecision::Static
    }

    /// Check if content is empty
    fn check_empty_content(&self, document: &Html) -> Option<BrowserReason> {
        // Count text in body
        let body_selector = Selector::parse("body").ok()?;
        let body = document.select(&body_selector).next()?;
        
        // Exclude scripts and styles
        let text: String = body
            .text()
            .collect::<Vec<_>>()
            .join(" ")
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");

        if text.len() < self.min_content_length {
            return Some(BrowserReason::EmptyContent);
        }

        None
    }

    /// Detect JS frameworks
    fn check_js_frameworks(&self, html: &str) -> Option<BrowserReason> {
        let html_lower = html.to_lowercase();

        // React
        if html_lower.contains("__react") 
            || html_lower.contains("data-reactroot")
            || html_lower.contains("_reactrootcontainer") 
        {
            return Some(BrowserReason::JsFramework("React".to_string()));
        }

        // Vue
        if html_lower.contains("data-v-") || html_lower.contains("__vue__") {
            return Some(BrowserReason::JsFramework("Vue".to_string()));
        }

        // Angular
        if html_lower.contains("ng-version") || html_lower.contains("ng-app") {
            return Some(BrowserReason::JsFramework("Angular".to_string()));
        }

        // Next.js
        if html_lower.contains("__next") || html_lower.contains("_next/static") {
            return Some(BrowserReason::JsFramework("Next.js".to_string()));
        }

        // Nuxt.js
        if html_lower.contains("__nuxt") || html_lower.contains("_nuxt/") {
            return Some(BrowserReason::JsFramework("Nuxt.js".to_string()));
        }

        // Svelte
        if html_lower.contains("svelte-") {
            return Some(BrowserReason::JsFramework("Svelte".to_string()));
        }

        None
    }

    /// Check the number of scripts
    fn check_heavy_scripts(&self, document: &Html) -> Option<BrowserReason> {
        let script_selector = Selector::parse("script[src]").ok()?;
        let script_count = document.select(&script_selector).count();

        if script_count > self.script_threshold {
            return Some(BrowserReason::HeavyScripts);
        }

        None
    }

    /// Check required selectors
    fn check_required_selectors(&self, document: &Html) -> Option<BrowserReason> {
        if self.required_selectors.is_empty() {
            return None;
        }

        for selector_str in &self.required_selectors {
            if let Ok(selector) = Selector::parse(selector_str) {
                if document.select(&selector).next().is_some() {
                    return None; // At least one selector found
                }
            }
        }

        // No required selector found
        Some(BrowserReason::MissingSelectors)
    }

    /// Detect SPA patterns
    fn check_spa_patterns(&self, document: &Html) -> Option<BrowserReason> {
        // Single div as main container
        let app_selector = Selector::parse("#app, #root, #__next, #__nuxt").ok()?;
        let app_div = document.select(&app_selector).next()?;

        // Check if the div is almost empty
        let text: String = app_div.text().collect::<Vec<_>>().join("");
        let text = text.trim();

        if text.len() < 100 {
            return Some(BrowserReason::SinglePageApp);
        }

        None
    }
}

/// Raw signals collected from a page that hint at whether it needs
/// JavaScript rendering.
#[derive(Debug, Clone, Default)]
pub struct RenderIndicators {
    /// Number of `<script>` elements that carry a `src` attribute.
    pub external_scripts: usize,
    /// Number of `<script>` elements without a `src` attribute.
    pub inline_scripts: usize,
    /// Whether any lazy-loading marker was found in the document.
    pub has_lazy_loading: bool,
    /// Name of the JavaScript framework that was recognised, if any.
    pub detected_framework: Option<String>,
    /// Length of the text found in the document body.
    pub text_content_length: usize,
    /// Whether an app/root mount container exists and holds almost no text.
    pub empty_app_container: bool,
}

/// Collects the raw render indicators for `html` without making a decision.
///
/// The document is parsed once and inspected for script counts, lazy-loading
/// markers, JavaScript framework markers, the amount of visible body text
/// and an (almost) empty app mount container. Indicators whose selector
/// fails to parse or matches nothing keep their default value.
pub fn analyze_render_indicators(html: &str) -> RenderIndicators {
    let document = Html::parse_document(html);
    let mut indicators = RenderIndicators::default();

    // Count scripts, split into external (`src`) and inline ones.
    if let Ok(sel) = Selector::parse("script[src]") {
        indicators.external_scripts = document.select(&sel).count();
    }
    if let Ok(sel) = Selector::parse("script:not([src])") {
        indicators.inline_scripts = document.select(&sel).count();
    }

    // Lazy loading: any element carrying one of the usual marker attributes.
    if let Ok(sel) = Selector::parse("[data-src], [data-lazy], [loading='lazy']") {
        indicators.has_lazy_loading = document.select(&sel).next().is_some();
    }

    // Framework: reuse the checker's marker scan so both stay in agreement.
    let checker = RenderChecker::new();
    if let Some(BrowserReason::JsFramework(fw)) = checker.check_js_frameworks(html) {
        indicators.detected_framework = Some(fw);
    }

    // Text content: visible body text, measured in characters.
    if let Ok(body_sel) = Selector::parse("body") {
        if let Some(body) = document.select(&body_sel).next() {
            let visible_text: String = body
                .descendants()
                .filter_map(|node| {
                    let text = node.value().as_text()?;
                    // Script and style sources are stored as text nodes
                    // directly under their element; they are not content.
                    let inside_code = node
                        .parent()
                        .and_then(|parent| parent.value().as_element())
                        .map_or(false, |element| matches!(element.name(), "script" | "style"));
                    if inside_code {
                        None
                    } else {
                        Some(&**text)
                    }
                })
                .collect();
            indicators.text_content_length = visible_text.trim().chars().count();
        }
    }

    // Empty container: same mount-point ids as `RenderChecker`'s SPA check.
    if let Ok(sel) = Selector::parse("#app, #root, #__next, #__nuxt") {
        if let Some(container) = document.select(&sel).next() {
            let text: String = container.text().collect();
            indicators.empty_app_container = text.trim().len() < 50;
        }
    }

    indicators
}