halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Browser - Headless browser abstraction for JS rendering

use std::time::Duration;
use url::Url;

use crate::types::error::{Error, Result};

/// Browser response after rendering
///
/// This is the success value of [`BrowserBackend::render`] and
/// [`BrowserPool::render`]. Every field is public, so backends build it with
/// a plain struct literal.
#[derive(Debug, Clone)]
pub struct BrowserResponse {
    /// Final URL after redirects
    pub final_url: Url,
    /// Rendered HTML content
    pub html: String,
    /// Page title, or `None` when the backend did not provide one
    pub title: Option<String>,
    /// Console logs from the page
    pub console_logs: Vec<ConsoleMessage>,
    /// Network requests made during rendering
    pub network_requests: Vec<NetworkRequest>,
    /// Render duration in milliseconds
    pub render_time_ms: u64,
    /// Screenshot (PNG bytes) if requested, otherwise `None`
    pub screenshot: Option<Vec<u8>>,
}

/// Console message from the browser
///
/// Collected into [`BrowserResponse::console_logs`].
#[derive(Debug, Clone)]
pub struct ConsoleMessage {
    /// Message level (log, warn, error, debug)
    pub level: ConsoleLevel,
    /// Message text
    pub text: String,
}

/// Console message level
///
/// The enum carries no data, so it is `Copy` (can be passed and compared by
/// value) and `Hash` (can be used as a `HashSet`/`HashMap` key, e.g. to count
/// messages per level).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ConsoleLevel {
    /// Log level
    Log,
    /// Warning level
    Warn,
    /// Error level
    Error,
    /// Debug level
    Debug,
}

/// Network request captured during rendering
///
/// Collected into [`BrowserResponse::network_requests`].
#[derive(Debug, Clone)]
pub struct NetworkRequest {
    /// Request URL, stored as a plain string rather than a parsed `Url`
    pub url: String,
    /// HTTP method
    pub method: String,
    /// Resource type (document, script, image, etc.)
    pub resource_type: ResourceType,
    /// Response status code (if completed); `None` otherwise
    pub status: Option<u16>,
}

/// Resource type for network requests
///
/// Used both to classify a captured [`NetworkRequest`] and as the entries of
/// [`RenderOptions::block_resources`]. The enum carries no data, so it is
/// `Copy` and `Hash`: values can be passed by value and collected into a
/// `HashSet` for block-list lookups.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceType {
    /// HTML document
    Document,
    /// JavaScript
    Script,
    /// Stylesheet
    Stylesheet,
    /// Image
    Image,
    /// Font
    Font,
    /// XHR/Fetch request
    Xhr,
    /// WebSocket
    WebSocket,
    /// Other resource
    Other,
}

/// Browser render options
///
/// The `Default` impl supplies: a 30 s timeout, network-idle waiting enabled
/// with a 500 ms quiet window, a 1920x1080 viewport, no selector, script,
/// user-agent override or extra headers, no screenshot, and images and fonts
/// in the block list.
///
/// NOTE(review): nothing in this file reads these fields yet (the pool's
/// backends are unimplemented and the stub backend ignores its options), so
/// the precise semantics of each option are defined by whichever backend
/// eventually consumes them.
#[derive(Debug, Clone)]
pub struct RenderOptions {
    /// Timeout for page load
    pub timeout: Duration,
    /// Wait for network idle
    pub wait_for_network_idle: bool,
    /// Network idle timeout (ms with no requests)
    pub network_idle_timeout_ms: u64,
    /// Wait for a specific selector (`None` = do not wait for one)
    pub wait_for_selector: Option<String>,
    /// Execute JavaScript before extraction (`None` = no script)
    pub execute_script: Option<String>,
    /// Capture screenshot
    pub capture_screenshot: bool,
    /// Viewport width
    pub viewport_width: u32,
    /// Viewport height
    pub viewport_height: u32,
    /// User agent override (`None` = no override)
    pub user_agent: Option<String>,
    /// Block resource types
    pub block_resources: Vec<ResourceType>,
    /// Extra HTTP headers as `(name, value)` pairs
    pub extra_headers: Vec<(String, String)>,
}

impl Default for RenderOptions {
    /// Baseline options: 30 s timeout, network-idle waiting with a 500 ms
    /// window, a 1920x1080 viewport, and images and fonts on the block list.
    /// Every optional setting starts out unset.
    fn default() -> Self {
        let timeout = Duration::from_secs(30);
        // Default block list: images and fonts.
        let block_resources = vec![ResourceType::Image, ResourceType::Font];

        Self {
            // Waiting behaviour
            timeout,
            wait_for_network_idle: true,
            network_idle_timeout_ms: 500,
            wait_for_selector: None,
            // Page interaction and capture
            execute_script: None,
            capture_screenshot: false,
            // Viewport
            viewport_width: 1920,
            viewport_height: 1080,
            // Request shaping
            user_agent: None,
            block_resources,
            extra_headers: vec![],
        }
    }
}

/// Browser backend trait for abstraction
///
/// Implementors must be `Send + Sync`. In the code visible in this file the
/// only implementor is [`StubBrowser`]; [`BrowserPool`] does not go through
/// this trait but dispatches on [`BrowserBackendType`] instead.
// `async fn` in a public trait trips the `async_fn_in_trait` lint because
// callers cannot put bounds (such as `Send`) on the returned futures. The lint
// is silenced here.
// NOTE(review): confirm that futures without a `Send` guarantee are acceptable
// for the executor this crate targets.
#[allow(async_fn_in_trait)]
pub trait BrowserBackend: Send + Sync {
    /// Render a URL and return the result
    ///
    /// `options` controls how the page is rendered; an implementation may
    /// ignore it (the stub backend does).
    async fn render(&self, url: &Url, options: &RenderOptions) -> Result<BrowserResponse>;
    
    /// Check if the browser is healthy
    ///
    /// Returns `Ok(())` when healthy and an error otherwise.
    async fn health_check(&self) -> Result<()>;
    
    /// Close the browser
    async fn close(&self) -> Result<()>;
}

/// Headless browser pool for concurrent rendering
///
/// As written, the pool holds no browser handles. It records which backend to
/// dispatch to and counts outstanding [`BrowserSlot`]s against
/// `max_concurrent` using an atomic counter, so it can be shared by reference
/// without a lock.
pub struct BrowserPool {
    /// Backend type
    backend_type: BrowserBackendType,
    /// Maximum concurrent browsers (upper bound on simultaneously held slots)
    max_concurrent: usize,
    /// Default render options, used when a render call passes `None`
    default_options: RenderOptions,
    /// Current active count (number of slots currently held)
    active_count: std::sync::atomic::AtomicUsize,
}

/// Supported browser backends
///
/// Selects which rendering path [`BrowserPool::render`] dispatches to. The
/// enum carries no data, so it is `Copy` and `Hash`, matching the other
/// fieldless enums in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BrowserBackendType {
    /// Chrome DevTools Protocol (headless Chrome)
    ChromeCdp,
    /// Playwright
    Playwright,
    /// Puppeteer-like (via external process)
    Puppeteer,
    /// No browser (stub for testing)
    None,
}

impl Default for BrowserPool {
    /// A pool with no backend (`BrowserBackendType::None`) and four slots.
    fn default() -> Self {
        let slots: usize = 4;
        Self::new(BrowserBackendType::None, slots)
    }
}

impl BrowserPool {
    /// Create a new browser pool.
    ///
    /// The pool starts with no active slots and `RenderOptions::default()` as
    /// its default render options.
    ///
    /// * `backend_type` - backend that [`render`](Self::render) dispatches to.
    /// * `max_concurrent` - upper bound on simultaneously held slots; with `0`
    ///   every [`acquire`](Self::acquire) returns `None`.
    pub fn new(backend_type: BrowserBackendType, max_concurrent: usize) -> Self {
        Self {
            backend_type,
            max_concurrent,
            default_options: RenderOptions::default(),
            active_count: std::sync::atomic::AtomicUsize::new(0),
        }
    }

    /// Configure default options (builder style: consumes and returns the pool).
    pub fn with_options(mut self, options: RenderOptions) -> Self {
        self.default_options = options;
        self
    }

    /// Get the backend type
    pub fn backend_type(&self) -> &BrowserBackendType {
        &self.backend_type
    }

    /// Get max concurrent browsers
    pub fn max_concurrent(&self) -> usize {
        self.max_concurrent
    }

    /// Check if a browser slot is available.
    ///
    /// This is only a snapshot of the counter: another caller may take the
    /// slot immediately afterwards, so use [`acquire`](Self::acquire) to
    /// actually reserve one.
    pub fn has_available_slot(&self) -> bool {
        self.active_count.load(std::sync::atomic::Ordering::Relaxed) < self.max_concurrent
    }

    /// Acquire a browser slot.
    ///
    /// Returns `Some(slot)` when fewer than `max_concurrent` slots are held,
    /// and `None` when the pool is full. The slot is released when the
    /// returned [`BrowserSlot`] is dropped.
    pub fn acquire(&self) -> Option<BrowserSlot<'_>> {
        use std::sync::atomic::Ordering;

        // Reserve with a compare-and-swap loop so the counter is only ever
        // incremented when a slot is really free. An increment-then-rollback
        // scheme lets the counter briefly exceed `max_concurrent`, which can
        // make a concurrent `acquire` fail right after a slot was released
        // and lets `active_count()` report more than the limit.
        self.active_count
            .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |active| {
                // `active < max_concurrent` also rules out overflow of `active + 1`.
                (active < self.max_concurrent).then_some(active + 1)
            })
            .ok()
            .map(|_| BrowserSlot { pool: self })
    }

    /// Render a URL using the pool.
    ///
    /// A slot is held for the duration of the call and released on return.
    /// When `options` is `None` the pool's default options are used.
    ///
    /// # Errors
    ///
    /// Returns `Error::Config` when no slot is available, when the backend
    /// type is `BrowserBackendType::None`, or (currently) for every real
    /// backend, since none of them is implemented yet.
    pub async fn render(&self, url: &Url, options: Option<&RenderOptions>) -> Result<BrowserResponse> {
        // Bound to a named variable so the slot lives until the end of the call.
        let _slot = self.acquire().ok_or_else(|| {
            Error::Config("No browser slots available".to_string())
        })?;

        let opts = options.unwrap_or(&self.default_options);

        match self.backend_type {
            BrowserBackendType::None => {
                // Stub implementation - return error indicating browser needed
                Err(Error::Config("No browser backend configured".to_string()))
            }
            BrowserBackendType::ChromeCdp => {
                self.render_chrome_cdp(url, opts).await
            }
            BrowserBackendType::Playwright => {
                self.render_playwright(url, opts).await
            }
            BrowserBackendType::Puppeteer => {
                self.render_puppeteer(url, opts).await
            }
        }
    }

    /// Chrome CDP implementation (placeholder: always returns `Error::Config`).
    async fn render_chrome_cdp(&self, url: &Url, options: &RenderOptions) -> Result<BrowserResponse> {
        // TODO: Implement Chrome DevTools Protocol
        // This would connect to a running Chrome instance via WebSocket
        // and use CDP commands to navigate and extract content
        let _ = (url, options);
        Err(Error::Config("Chrome CDP backend not yet implemented".to_string()))
    }

    /// Playwright implementation (placeholder: always returns `Error::Config`).
    async fn render_playwright(&self, url: &Url, options: &RenderOptions) -> Result<BrowserResponse> {
        // TODO: Implement Playwright integration
        // This would spawn a Playwright process or use a Playwright server
        let _ = (url, options);
        Err(Error::Config("Playwright backend not yet implemented".to_string()))
    }

    /// Puppeteer implementation (placeholder: always returns `Error::Config`).
    async fn render_puppeteer(&self, url: &Url, options: &RenderOptions) -> Result<BrowserResponse> {
        // TODO: Implement Puppeteer integration
        let _ = (url, options);
        Err(Error::Config("Puppeteer backend not yet implemented".to_string()))
    }

    /// Active browser count (number of slots currently held).
    pub fn active_count(&self) -> usize {
        self.active_count.load(std::sync::atomic::Ordering::Relaxed)
    }
}

/// RAII guard for browser slot
///
/// Handed out by [`BrowserPool::acquire`]. It borrows the pool for `'a`, and
/// its `Drop` impl decrements the pool's active count, so the slot is given
/// back when the guard goes out of scope.
pub struct BrowserSlot<'a> {
    /// Pool whose active count is decremented on drop
    pool: &'a BrowserPool,
}

impl Drop for BrowserSlot<'_> {
    /// Gives the slot back by decrementing the owning pool's active count.
    fn drop(&mut self) {
        use std::sync::atomic::Ordering::SeqCst;

        let counter = &self.pool.active_count;
        counter.fetch_sub(1, SeqCst);
    }
}

/// Test double that answers every render with the same fixed HTML.
pub struct StubBrowser {
    /// Canned markup handed back on each render
    html: String,
}

impl StubBrowser {
    /// Build a stub from anything convertible into a `String`
    /// (for example `&str` or `String`).
    pub fn new(html: impl Into<String>) -> Self {
        let html: String = html.into();
        Self { html }
    }
}

impl BrowserBackend for StubBrowser {
    /// Echo the requested URL back as the final URL and return the canned
    /// HTML. The options are ignored; there is no title, screenshot, console
    /// output or network activity, and the render time is reported as zero.
    async fn render(&self, url: &Url, _options: &RenderOptions) -> Result<BrowserResponse> {
        let response = BrowserResponse {
            // Content
            final_url: url.clone(),
            html: self.html.clone(),
            title: None,
            screenshot: None,
            // Diagnostics: a stub produces none
            console_logs: vec![],
            network_requests: vec![],
            render_time_ms: 0,
        };

        Ok(response)
    }

    /// The stub is always healthy.
    async fn health_check(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to shut down; always succeeds.
    async fn close(&self) -> Result<()> {
        Ok(())
    }
}