halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Limits - Resource limits (response size, decompression, parse time and DOM bounds)

use std::time::{Duration, Instant};

/// Thresholds for response size, decompression, parse time and DOM shape.
///
/// All fields are public, so a configuration can be built from
/// [`Default::default`] and adjusted field by field.
///
/// `PartialEq` is derived so two configurations can be compared directly
/// (for example in tests). `Eq` cannot be derived because
/// `max_compression_ratio` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct ResourceLimits {
    /// Largest accepted response size, in bytes.
    pub max_response_size: u64,
    /// Longest accepted parsing time, in milliseconds.
    pub max_parse_time_ms: u64,
    /// Largest accepted size after decompression, in bytes.
    pub max_decompressed_size: u64,
    /// Highest accepted decompressed/compressed size ratio (anti zip-bomb).
    pub max_compression_ratio: f64,
    /// Deepest accepted DOM nesting level.
    pub max_dom_depth: usize,
    /// Largest accepted number of DOM elements.
    pub max_dom_elements: usize,
}

impl Default for ResourceLimits {
    /// Returns the stock limits: 50 MiB response, 30 s parse time,
    /// 100 MiB decompressed, 100:1 compression ratio, DOM depth 100 and
    /// 100 000 DOM elements.
    fn default() -> Self {
        // One mebibyte, so the byte limits below read as "N MiB".
        const MIB: u64 = 1024 * 1024;

        Self {
            max_response_size: 50 * MIB,
            // 30 seconds, expressed in milliseconds.
            max_parse_time_ms: 30_000,
            max_decompressed_size: 100 * MIB,
            max_compression_ratio: 100.0,
            max_dom_depth: 100,
            max_dom_elements: 100_000,
        }
    }
}

impl ResourceLimits {
    /// New limits
    pub fn new() -> Self {
        Self::default()
    }

    /// Check response size (returns main Error type)
    pub fn check_response_size(&self, size: u64) -> crate::types::error::Result<()> {
        if size > self.max_response_size {
            Err(crate::types::error::Error::SizeExceeded {
                max: self.max_response_size,
                actual: size,
            })
        } else {
            Ok(())
        }
    }

    /// Check response size (returns LimitError)
    pub fn check_response_size_limit(&self, size: u64) -> Result<(), LimitError> {
        if size > self.max_response_size {
            Err(LimitError::ResponseSizeExceeded {
                size,
                max: self.max_response_size,
            })
        } else {
            Ok(())
        }
    }

    /// Check decompressed size
    pub fn check_decompressed_size(&self, size: u64) -> Result<(), LimitError> {
        if size > self.max_decompressed_size {
            Err(LimitError::DecompressedSizeExceeded {
                size,
                max: self.max_decompressed_size,
            })
        } else {
            Ok(())
        }
    }

    /// Check compression ratio
    pub fn check_compression_ratio(&self, compressed: u64, decompressed: u64) -> Result<(), LimitError> {
        if compressed == 0 {
            return Ok(());
        }
        let ratio = decompressed as f64 / compressed as f64;
        if ratio > self.max_compression_ratio {
            Err(LimitError::CompressionRatioExceeded {
                ratio,
                max: self.max_compression_ratio,
            })
        } else {
            Ok(())
        }
    }

    /// Check parsing time
    pub fn check_parse_time(&self, elapsed_ms: u64) -> crate::types::error::Result<()> {
        if elapsed_ms > self.max_parse_time_ms {
            Err(crate::types::error::Error::ResourceLimit(format!(
                "Parse time {}ms exceeds limit {}ms",
                elapsed_ms, self.max_parse_time_ms
            )))
        } else {
            Ok(())
        }
    }

    /// Check DOM depth
    pub fn check_dom_depth(&self, depth: usize) -> Result<(), LimitError> {
        if depth > self.max_dom_depth {
            Err(LimitError::DomDepthExceeded {
                depth,
                max: self.max_dom_depth,
            })
        } else {
            Ok(())
        }
    }

    /// Check DOM element count
    pub fn check_dom_elements(&self, count: usize) -> Result<(), LimitError> {
        if count > self.max_dom_elements {
            Err(LimitError::DomElementsExceeded {
                count,
                max: self.max_dom_elements,
            })
        } else {
            Ok(())
        }
    }
}

/// Error describing which resource limit was violated.
///
/// Every variant carries the observed value together with the configured
/// maximum it was compared against.
///
/// `PartialEq` is derived so that `Result<(), LimitError>` values can be
/// compared directly (for example with `assert_eq!`). `Eq` cannot be derived
/// because `CompressionRatioExceeded` holds `f64` fields.
#[derive(Debug, Clone, PartialEq)]
pub enum LimitError {
    /// The response is larger than allowed.
    ResponseSizeExceeded {
        /// Actual size in bytes
        size: u64,
        /// Maximum allowed size in bytes
        max: u64,
    },
    /// The decompressed payload is larger than allowed.
    DecompressedSizeExceeded {
        /// Actual decompressed size in bytes
        size: u64,
        /// Maximum allowed size in bytes
        max: u64,
    },
    /// The decompressed/compressed size ratio is higher than allowed.
    CompressionRatioExceeded {
        /// Actual compression ratio
        ratio: f64,
        /// Maximum allowed ratio
        max: f64,
    },
    /// Parsing took longer than allowed.
    ParseTimeExceeded {
        /// Elapsed time in milliseconds
        elapsed_ms: u64,
        /// Maximum allowed time in milliseconds
        max_ms: u64,
    },
    /// The DOM is nested deeper than allowed.
    DomDepthExceeded {
        /// Actual DOM depth
        depth: usize,
        /// Maximum allowed depth
        max: usize,
    },
    /// The DOM contains more elements than allowed.
    DomElementsExceeded {
        /// Actual element count
        count: usize,
        /// Maximum allowed count
        max: usize,
    },
}

impl std::fmt::Display for LimitError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LimitError::ResponseSizeExceeded { size, max } => {
                write!(f, "Response size {} exceeds limit {}", size, max)
            }
            LimitError::DecompressedSizeExceeded { size, max } => {
                write!(f, "Decompressed size {} exceeds limit {}", size, max)
            }
            LimitError::CompressionRatioExceeded { ratio, max } => {
                write!(f, "Compression ratio {:.2} exceeds limit {:.2}", ratio, max)
            }
            LimitError::ParseTimeExceeded { elapsed_ms, max_ms } => {
                write!(f, "Parse time {}ms exceeds limit {}ms", elapsed_ms, max_ms)
            }
            LimitError::DomDepthExceeded { depth, max } => {
                write!(f, "DOM depth {} exceeds limit {}", depth, max)
            }
            LimitError::DomElementsExceeded { count, max } => {
                write!(f, "DOM elements {} exceeds limit {}", count, max)
            }
        }
    }
}

// Marks `LimitError` as a standard error type. The body is empty, so every
// provided method of `std::error::Error` keeps its default implementation.
impl std::error::Error for LimitError {}

/// Timer used to bound parsing time.
///
/// Stores the instant at which it was started and the time budget it was
/// given. `Debug` and `Clone` are derived so the timer can be logged and
/// copied; a clone keeps the same start instant and budget.
#[derive(Debug, Clone)]
pub struct ParseTimer {
    /// Instant at which the timer was started.
    started_at: Instant,
    /// Time budget measured from `started_at`.
    max_duration: Duration,
}

impl ParseTimer {
    /// Starts a timer now with a budget of `max_ms` milliseconds.
    pub fn new(max_ms: u64) -> Self {
        let started_at = Instant::now();
        let max_duration = Duration::from_millis(max_ms);
        Self {
            started_at,
            max_duration,
        }
    }

    /// Verifies that the budget has not been used up.
    ///
    /// # Errors
    ///
    /// Returns [`LimitError::ParseTimeExceeded`] (with the elapsed time and
    /// the budget, both in milliseconds) once the elapsed time is strictly
    /// greater than the budget.
    pub fn check(&self) -> Result<(), LimitError> {
        let spent = self.started_at.elapsed();
        if spent <= self.max_duration {
            return Ok(());
        }
        Err(LimitError::ParseTimeExceeded {
            elapsed_ms: spent.as_millis() as u64,
            max_ms: self.max_duration.as_millis() as u64,
        })
    }

    /// Whole milliseconds elapsed since the timer was started.
    pub fn elapsed_ms(&self) -> u64 {
        let spent = self.started_at.elapsed();
        spent.as_millis() as u64
    }

    /// Whole milliseconds left in the budget, or `0` once it is used up.
    pub fn remaining_ms(&self) -> u64 {
        // `checked_sub` yields `None` when more time has elapsed than the budget allows.
        match self.max_duration.checked_sub(self.started_at.elapsed()) {
            Some(left) => left.as_millis() as u64,
            None => 0,
        }
    }
}