// halldyll-core 0.1.0 (crate documentation)
//
// Core scraping engine for Halldyll, a high-performance async web scraper for AI agents.
//
//! Logs - Structured logging of scrape events (requests, responses, errors, parsing)
//! through the `tracing` macros.

use serde::Serialize;
use tracing::{info, warn, error, debug};

/// Structured logger that tags every event it emits with an optional job ID.
///
/// `Debug` and `Clone` are derived so the logger can be embedded in other
/// `Debug` types and handed to several components without extra plumbing.
#[derive(Debug, Clone)]
pub struct StructuredLogger {
    /// ID of the job this logger reports for; `None` when it is not bound to a job.
    job_id: Option<String>,
}

impl Default for StructuredLogger {
    fn default() -> Self {
        Self::new()
    }
}

impl StructuredLogger {
    /// Placeholder written to the `job_id` field when the logger has no job ID.
    const NO_JOB_ID: &'static str = "-";

    /// Creates a logger that is not bound to any job.
    ///
    /// Events emitted by such a logger carry `job_id = "-"`.
    pub fn new() -> Self {
        Self { job_id: None }
    }

    /// Creates a logger whose events are tagged with `job_id`.
    pub fn with_job_id(job_id: &str) -> Self {
        Self {
            job_id: Some(job_id.to_owned()),
        }
    }

    /// Returns the value for the `job_id` field: the configured ID, or `"-"` when unset.
    ///
    /// Single source of truth for the placeholder, so every event uses the same one.
    fn job_label(&self) -> &str {
        self.job_id.as_deref().unwrap_or(Self::NO_JOB_ID)
    }

    /// Emits an `info`-level "Request started" event with the request URL and method.
    pub fn log_request(&self, event: &RequestEvent) {
        let job_id = self.job_label();
        info!(
            job_id = %job_id,
            url = %event.url,
            method = %event.method,
            "Request started"
        );
    }

    /// Emits a response event with URL, status, size and duration.
    ///
    /// Status codes `>= 400` are logged at `warn` level as "Response error";
    /// everything else is logged at `info` level as "Response received".
    pub fn log_response(&self, event: &ResponseEvent) {
        let job_id = self.job_label();
        // The two branches stay separate because each `tracing` macro fixes its own level.
        if event.status_code >= 400 {
            warn!(
                job_id = %job_id,
                url = %event.url,
                status = event.status_code,
                bytes = event.bytes,
                duration_ms = event.duration_ms,
                "Response error"
            );
        } else {
            info!(
                job_id = %job_id,
                url = %event.url,
                status = event.status_code,
                bytes = event.bytes,
                duration_ms = event.duration_ms,
                "Response received"
            );
        }
    }

    /// Emits an `error`-level "Scrape error" event describing a failure.
    pub fn log_error(&self, event: &ErrorEvent) {
        let job_id = self.job_label();
        error!(
            job_id = %job_id,
            url = %event.url,
            error_type = %event.error_type,
            message = %event.message,
            recoverable = event.recoverable,
            "Scrape error"
        );
    }

    /// Emits a `debug`-level "Parsing completed" event with the parse statistics.
    pub fn log_parse(&self, event: &ParseEvent) {
        let job_id = self.job_label();
        debug!(
            job_id = %job_id,
            url = %event.url,
            text_length = event.text_length,
            links_count = event.links_count,
            images_count = event.images_count,
            duration_ms = event.duration_ms,
            "Parsing completed"
        );
    }

    /// Emits a `warn`-level "Retrying request" event.
    ///
    /// `attempt` and `delay_ms` are recorded verbatim as fields of the same name.
    pub fn log_retry(&self, url: &str, attempt: u32, delay_ms: u64) {
        let job_id = self.job_label();
        warn!(
            job_id = %job_id,
            url = %url,
            attempt,
            delay_ms,
            "Retrying request"
        );
    }

    /// Emits a `warn`-level "Rate limited, pausing" event with the pause length in `pause_ms`.
    pub fn log_rate_limited(&self, url: &str, pause_ms: u64) {
        let job_id = self.job_label();
        warn!(
            job_id = %job_id,
            url = %url,
            pause_ms,
            "Rate limited, pausing"
        );
    }

    /// Convenience: builds a [`RequestEvent`] from a parsed URL and a method, then logs it.
    pub fn log_request_url(&self, url: &url::Url, method: &str) {
        let event = RequestEvent {
            url: url.to_string(),
            method: method.to_string(),
        };
        self.log_request(&event);
    }

    /// Convenience: builds a [`ResponseEvent`] from its parts, then logs it.
    ///
    /// The level follows [`StructuredLogger::log_response`] (`warn` for status `>= 400`).
    pub fn log_response_parts(
        &self,
        url: &url::Url,
        status_code: u16,
        bytes: u64,
        duration_ms: u64,
    ) {
        let event = ResponseEvent {
            url: url.to_string(),
            status_code,
            bytes,
            duration_ms,
        };
        self.log_response(&event);
    }

    /// Convenience: builds an [`ErrorEvent`] from its parts, then logs it.
    ///
    /// The event's `error_type` is always `"scrape_error"`; use
    /// [`StructuredLogger::log_error`] directly to supply a different classification.
    pub fn log_error_parts(&self, url: &url::Url, message: &str, recoverable: bool) {
        let event = ErrorEvent {
            url: url.to_string(),
            error_type: "scrape_error".to_string(),
            message: message.to_string(),
            recoverable,
        };
        self.log_error(&event);
    }

    /// Emits `message` as a free-form `info`-level event tagged with the job ID.
    pub fn log_info(&self, message: &str) {
        let job_id = self.job_label();
        info!(
            job_id = %job_id,
            "{}", message
        );
    }

    /// Emits `message` as a free-form `warn`-level event tagged with the job ID.
    pub fn log_warn(&self, message: &str) {
        let job_id = self.job_label();
        warn!(
            job_id = %job_id,
            "{}", message
        );
    }

    /// Emits `message` as a free-form `debug`-level event tagged with the job ID.
    pub fn log_debug(&self, message: &str) {
        let job_id = self.job_label();
        debug!(
            job_id = %job_id,
            "{}", message
        );
    }
}

/// Request event
///
/// Payload consumed by [`StructuredLogger::log_request`], which logs both
/// fields at `info` level when a request starts.
#[derive(Debug, Serialize)]
pub struct RequestEvent {
    /// Request URL, stored as a string and logged as the `url` field.
    pub url: String,
    /// HTTP method, logged as the `method` field (free-form; not validated in this module).
    pub method: String,
}

/// Response event
///
/// Payload consumed by [`StructuredLogger::log_response`]. The status code
/// selects the log level: `>= 400` is logged as a warning, anything else as info.
#[derive(Debug, Serialize)]
pub struct ResponseEvent {
    /// Response URL, logged as the `url` field.
    pub url: String,
    /// HTTP status code, logged as the `status` field.
    pub status_code: u16,
    /// Response size in bytes
    pub bytes: u64,
    /// Request duration in milliseconds
    pub duration_ms: u64,
}

/// Error event
///
/// Payload consumed by [`StructuredLogger::log_error`], which logs all fields
/// at `error` level.
#[derive(Debug, Serialize)]
pub struct ErrorEvent {
    /// URL the error relates to, logged as the `url` field.
    pub url: String,
    /// Error type classification (free-form string;
    /// [`StructuredLogger::log_error_parts`] always uses `"scrape_error"`).
    pub error_type: String,
    /// Error message
    pub message: String,
    /// Whether the error is recoverable (set by the caller; only logged here, never acted upon).
    pub recoverable: bool,
}

/// Parse event
///
/// Payload consumed by [`StructuredLogger::log_parse`], which logs all fields
/// at `debug` level once parsing has completed.
#[derive(Debug, Serialize)]
pub struct ParseEvent {
    /// Parsed URL, logged as the `url` field.
    pub url: String,
    /// Extracted text length in characters
    /// (NOTE(review): unit is whatever the caller measured; confirm chars vs. bytes).
    pub text_length: usize,
    /// Number of links found
    pub links_count: usize,
    /// Number of images found
    pub images_count: usize,
    /// Parse duration in milliseconds
    pub duration_ms: u64,
}