// halldyll-core 0.1.0
//
// Core scraping engine for Halldyll, a high-performance async web scraper for AI agents.
// Documentation
//! Metrics - Metrics collection

use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use std::time::{Duration, Instant};

/// Global request counters.
///
/// Every field is an `AtomicU64`, so the counters can be updated through a
/// shared reference (`&self`). The derived `Default` starts every counter at
/// zero.
#[derive(Debug, Default)]
pub struct Metrics {
    /// Total number of requests counted. Also the denominator used by
    /// `success_rate` and `avg_latency_ms`.
    pub requests_total: AtomicU64,
    /// Number of requests counted as successful.
    pub requests_success: AtomicU64,
    /// Number of requests counted as failed.
    pub requests_failed: AtomicU64,
    /// Number of rate-limit events counted.
    pub requests_rate_limited: AtomicU64,
    /// Total number of bytes downloaded.
    pub bytes_downloaded: AtomicU64,
    /// Number of retries performed.
    pub retries_total: AtomicU64,
    /// Accumulated request time, in milliseconds.
    pub request_time_total_ms: AtomicU64,
    /// Number of documents extracted.
    pub documents_extracted: AtomicU64,
}

impl Metrics {
    /// New metrics
    pub fn new() -> Self {
        Self::default()
    }

    /// Increment total requests
    pub fn inc_requests(&self) {
        self.requests_total.fetch_add(1, Ordering::Relaxed);
    }

    /// Increment successes
    pub fn inc_success(&self) {
        self.requests_success.fetch_add(1, Ordering::Relaxed);
    }

    /// Increment failures
    pub fn inc_failed(&self) {
        self.requests_failed.fetch_add(1, Ordering::Relaxed);
    }

    /// Increment rate limits
    pub fn inc_rate_limited(&self) {
        self.requests_rate_limited.fetch_add(1, Ordering::Relaxed);
    }

    /// Add downloaded bytes
    pub fn add_bytes(&self, bytes: u64) {
        self.bytes_downloaded.fetch_add(bytes, Ordering::Relaxed);
    }

    /// Increment retries
    pub fn inc_retries(&self) {
        self.retries_total.fetch_add(1, Ordering::Relaxed);
    }

    /// Add request time
    pub fn add_request_time(&self, duration_ms: u64) {
        self.request_time_total_ms.fetch_add(duration_ms, Ordering::Relaxed);
    }

    /// Increment extracted documents
    pub fn inc_documents(&self) {
        self.documents_extracted.fetch_add(1, Ordering::Relaxed);
    }

    /// Success rate
    pub fn success_rate(&self) -> f64 {
        let total = self.requests_total.load(Ordering::Relaxed);
        if total == 0 {
            return 0.0;
        }
        let success = self.requests_success.load(Ordering::Relaxed);
        success as f64 / total as f64
    }

    /// Average latency (ms)
    pub fn avg_latency_ms(&self) -> f64 {
        let total = self.requests_total.load(Ordering::Relaxed);
        if total == 0 {
            return 0.0;
        }
        let time = self.request_time_total_ms.load(Ordering::Relaxed);
        time as f64 / total as f64
    }

    /// Metrics snapshot
    pub fn snapshot(&self) -> MetricsSnapshot {
        MetricsSnapshot {
            requests_total: self.requests_total.load(Ordering::Relaxed),
            requests_success: self.requests_success.load(Ordering::Relaxed),
            requests_failed: self.requests_failed.load(Ordering::Relaxed),
            requests_rate_limited: self.requests_rate_limited.load(Ordering::Relaxed),
            bytes_downloaded: self.bytes_downloaded.load(Ordering::Relaxed),
            retries_total: self.retries_total.load(Ordering::Relaxed),
            documents_extracted: self.documents_extracted.load(Ordering::Relaxed),
            success_rate: self.success_rate(),
            avg_latency_ms: self.avg_latency_ms(),
        }
    }
}

/// Plain-value copy of the global counters, as produced by `Metrics::snapshot`.
///
/// Holds ordinary `u64`/`f64` values rather than atomics, so it can be cloned
/// and passed around freely. The accumulated request time is not included;
/// only the derived `avg_latency_ms` is.
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Total number of requests
    pub requests_total: u64,
    /// Number of successful requests
    pub requests_success: u64,
    /// Number of failed requests
    pub requests_failed: u64,
    /// Number of rate-limited requests
    pub requests_rate_limited: u64,
    /// Total bytes downloaded
    pub bytes_downloaded: u64,
    /// Total number of retries
    pub retries_total: u64,
    /// Number of documents extracted
    pub documents_extracted: u64,
    /// Successful requests divided by total requests; `0.0` when no request
    /// had been counted
    pub success_rate: f64,
    /// Accumulated request time in milliseconds divided by total requests;
    /// `0.0` when no request had been counted
    pub avg_latency_ms: f64,
}

/// Metrics collector that keeps global counters plus a per-domain breakdown.
pub struct MetricsCollector {
    /// Global counters; every `record_*` method updates these.
    global: Metrics,
    /// Per-domain counters, keyed by the domain string passed to the
    /// `record_*` methods. Wrapped in a lock because entries are created and
    /// mutated through `&self`.
    by_domain: RwLock<HashMap<String, DomainMetrics>>,
    /// Instant captured when the collector was created; basis for `elapsed`.
    started_at: Instant,
}

impl Default for MetricsCollector {
    fn default() -> Self {
        Self::new()
    }
}

impl MetricsCollector {
    /// Creates an empty collector and captures the current instant as its
    /// start time.
    pub fn new() -> Self {
        Self {
            global: Metrics::new(),
            by_domain: RwLock::new(HashMap::new()),
            started_at: Instant::now(),
        }
    }

    /// Returns the global counters.
    pub fn global(&self) -> &Metrics {
        &self.global
    }

    /// Records one request to `domain`, globally and per domain.
    pub fn record_request(&self, domain: &str) {
        self.global.inc_requests();
        self.with_domain(domain, |m| m.requests += 1);
    }

    /// Records a successful request to `domain` that downloaded `bytes` bytes
    /// and took `duration_ms` milliseconds.
    pub fn record_success(&self, domain: &str, bytes: u64, duration_ms: u64) {
        self.global.inc_success();
        self.global.add_bytes(bytes);
        self.global.add_request_time(duration_ms);

        self.with_domain(domain, |m| {
            m.successes += 1;
            // Caller-supplied amounts use wrapping arithmetic so that an
            // oversized value cannot panic while the write lock is held. The
            // global atomic counters wrap on overflow as well.
            m.bytes = m.bytes.wrapping_add(bytes);
            m.total_time_ms = m.total_time_ms.wrapping_add(duration_ms);
        });
    }

    /// Records a failed request to `domain` that took `duration_ms`
    /// milliseconds.
    pub fn record_failure(&self, domain: &str, duration_ms: u64) {
        self.global.inc_failed();
        self.global.add_request_time(duration_ms);

        self.with_domain(domain, |m| {
            m.failures += 1;
            // Wrapping for the same reason as in `record_success`.
            m.total_time_ms = m.total_time_ms.wrapping_add(duration_ms);
        });
    }

    /// Records one rate-limit event for `domain`.
    pub fn record_rate_limit(&self, domain: &str) {
        self.global.inc_rate_limited();
        self.with_domain(domain, |m| m.rate_limits += 1);
    }

    /// Records one retry for `domain`.
    pub fn record_retry(&self, domain: &str) {
        self.global.inc_retries();
        self.with_domain(domain, |m| m.retries += 1);
    }

    /// Records one extracted document (global counter only).
    pub fn record_document(&self) {
        self.global.inc_documents();
    }

    /// Time elapsed since the collector was created.
    pub fn elapsed(&self) -> Duration {
        self.started_at.elapsed()
    }

    /// Global request count divided by the elapsed time in seconds.
    ///
    /// Returns `0.0` while less than one millisecond has elapsed.
    pub fn requests_per_second(&self) -> f64 {
        let elapsed = self.elapsed().as_secs_f64();
        if elapsed < 0.001 {
            return 0.0;
        }
        self.global.requests_total.load(Ordering::Relaxed) as f64 / elapsed
    }

    /// Returns a cloned copy of the per-domain counters.
    ///
    /// A poisoned lock is recovered rather than propagated as a panic: the
    /// map only holds plain counters, so its contents remain usable.
    pub fn domain_stats(&self) -> HashMap<String, DomainMetrics> {
        self.by_domain
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .clone()
    }

    /// Runs `f` on the counters for `domain`, creating a zeroed entry first
    /// if the domain has not been seen yet.
    ///
    /// The write lock is held for the duration of `f`. A poisoned lock is
    /// recovered instead of panicking, so a panic in one caller cannot
    /// disable metrics collection for every other caller.
    fn with_domain<F>(&self, domain: &str, f: F)
    where
        F: FnOnce(&mut DomainMetrics),
    {
        let mut guard = self
            .by_domain
            .write()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let by_domain = &mut *guard;

        // Look up by `&str` first so the common case (domain already known)
        // does not allocate a new `String` key on every call.
        match by_domain.get_mut(domain) {
            Some(metrics) => f(metrics),
            None => f(by_domain.entry(domain.to_owned()).or_default()),
        }
    }
}

/// Counters for a single domain.
///
/// Plain `u64` fields (no atomics); the derived `Default` starts every
/// counter at zero.
#[derive(Debug, Clone, Default)]
pub struct DomainMetrics {
    /// Total requests to this domain. Also the denominator used by
    /// `success_rate` and `avg_latency_ms`.
    pub requests: u64,
    /// Successful requests
    pub successes: u64,
    /// Failed requests
    pub failures: u64,
    /// Rate-limited requests
    pub rate_limits: u64,
    /// Number of retries
    pub retries: u64,
    /// Bytes downloaded from this domain
    pub bytes: u64,
    /// Total time spent on requests in milliseconds
    pub total_time_ms: u64,
}

impl DomainMetrics {
    /// Successful requests divided by total requests for this domain.
    ///
    /// Returns `0.0` when `requests` is zero.
    pub fn success_rate(&self) -> f64 {
        self.per_request(self.successes)
    }

    /// Total request time (ms) divided by total requests for this domain.
    ///
    /// Returns `0.0` when `requests` is zero.
    pub fn avg_latency_ms(&self) -> f64 {
        self.per_request(self.total_time_ms)
    }

    /// `amount / requests` as a float, or `0.0` when `requests` is zero.
    fn per_request(&self, amount: u64) -> f64 {
        match self.requests {
            0 => 0.0,
            requests => amount as f64 / requests as f64,
        }
    }
}