//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll — a high-performance async web scraper for AI agents.
//!
//! Compression - Secure decompression (gzip, brotli, deflate)

use bytes::Bytes;
use flate2::read::{DeflateDecoder, GzDecoder};
use std::io::Read;

use crate::types::error::{Error, Result};

/// Compression type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionType {
    /// No compression
    None,
    /// Gzip compression
    Gzip,
    /// Deflate compression
    Deflate,
    /// Brotli compression
    Brotli,
}

impl CompressionType {
    /// Detects the compression type from a `Content-Encoding` header value.
    ///
    /// Matching is ASCII-case-insensitive and tolerates surrounding
    /// whitespace, without allocating. Unknown encodings (and a missing
    /// header) fall back to [`CompressionType::None`].
    pub fn from_header(content_encoding: Option<&str>) -> Self {
        let Some(raw) = content_encoding else {
            return CompressionType::None;
        };
        let encoding = raw.trim();
        if encoding.eq_ignore_ascii_case("gzip") || encoding.eq_ignore_ascii_case("x-gzip") {
            CompressionType::Gzip
        } else if encoding.eq_ignore_ascii_case("deflate") {
            CompressionType::Deflate
        } else if encoding.eq_ignore_ascii_case("br") {
            CompressionType::Brotli
        } else {
            CompressionType::None
        }
    }
}

/// Decompressor with anti zip-bomb protection.
///
/// Both limits are enforced incrementally while the stream is read,
/// so an oversized or over-expanding payload is rejected before it is
/// fully expanded in memory.
pub struct Decompressor {
    /// Max decompressed size, in bytes
    max_size: u64,
    /// Max decompressed/compressed ratio (anti zip-bomb)
    max_ratio: f64,
}

impl Default for Decompressor {
    /// Conservative defaults: 100 MB output cap and a 100:1 expansion cap.
    fn default() -> Self {
        const DEFAULT_MAX_SIZE: u64 = 100 * 1024 * 1024; // 100 MB
        const DEFAULT_MAX_RATIO: f64 = 100.0;

        Self {
            max_size: DEFAULT_MAX_SIZE,
            max_ratio: DEFAULT_MAX_RATIO,
        }
    }
}

impl Decompressor {
    /// Creates a new decompressor with explicit limits.
    ///
    /// * `max_size` — maximum decompressed size, in bytes
    /// * `max_ratio` — maximum decompressed/compressed ratio
    pub fn new(max_size: u64, max_ratio: f64) -> Self {
        Self { max_size, max_ratio }
    }

    /// Decompresses `data` according to `compression`.
    ///
    /// [`CompressionType::None`] input is returned as an owned copy without
    /// limit checks; all other encodings are streamed through
    /// [`Self::read_with_limits`].
    ///
    /// # Errors
    ///
    /// Returns `Error::SizeExceeded` when the output grows past `max_size`,
    /// or `Error::Decompression` on a read failure or when the expansion
    /// ratio exceeds `max_ratio`.
    pub fn decompress(&self, data: &[u8], compression: CompressionType) -> Result<Bytes> {
        match compression {
            CompressionType::None => Ok(Bytes::copy_from_slice(data)),
            CompressionType::Gzip => self.decompress_gzip(data),
            CompressionType::Deflate => self.decompress_deflate(data),
            CompressionType::Brotli => self.decompress_brotli(data),
        }
    }

    /// Decompresses a gzip stream.
    fn decompress_gzip(&self, data: &[u8]) -> Result<Bytes> {
        let mut decoder = GzDecoder::new(data);
        self.read_with_limits(&mut decoder, data.len())
    }

    /// Decompresses a raw deflate stream.
    fn decompress_deflate(&self, data: &[u8]) -> Result<Bytes> {
        let mut decoder = DeflateDecoder::new(data);
        self.read_with_limits(&mut decoder, data.len())
    }

    /// Decompresses a brotli stream (4 KiB internal decoder buffer).
    fn decompress_brotli(&self, data: &[u8]) -> Result<Bytes> {
        let mut decoder = brotli::Decompressor::new(data, 4096);
        self.read_with_limits(&mut decoder, data.len())
    }

    /// Reads the whole stream in 8 KiB chunks, checking both limits after
    /// every chunk so a zip bomb is rejected before being fully expanded.
    fn read_with_limits<R: Read>(&self, reader: &mut R, compressed_size: usize) -> Result<Bytes> {
        // Pre-size the output to avoid repeated grow-and-copy on large
        // payloads. The guess (2x the compressed size) is clamped by
        // `max_size`, so a huge input cannot force an unbounded up-front
        // allocation.
        let capacity_hint = (compressed_size as u64)
            .saturating_mul(2)
            .min(self.max_size) as usize;
        let mut output = Vec::with_capacity(capacity_hint);
        let mut buffer = [0u8; 8192];
        let mut total_read: u64 = 0;

        loop {
            let n = reader.read(&mut buffer).map_err(|e| {
                Error::Decompression(format!("Read error: {}", e))
            })?;

            // EOF: the decoder has produced everything it can.
            if n == 0 {
                break;
            }

            total_read += n as u64;

            // Max size verification
            if total_read > self.max_size {
                return Err(Error::SizeExceeded {
                    max: self.max_size,
                    actual: total_read,
                });
            }

            // Ratio verification (anti zip-bomb). Guard against division by
            // zero for an empty compressed input.
            if compressed_size > 0 {
                let ratio = total_read as f64 / compressed_size as f64;
                if ratio > self.max_ratio {
                    return Err(Error::Decompression(format!(
                        "Compression ratio {} exceeds limit {}",
                        ratio, self.max_ratio
                    )));
                }
            }

            // Only append data that has passed both checks.
            output.extend_from_slice(&buffer[..n]);
        }

        Ok(Bytes::from(output))
    }
}

/// Decompression result
#[derive(Debug)]
pub struct DecompressionResult {
    /// Decompressed data
    pub data: Bytes,
    /// Compressed size, in bytes
    pub compressed_size: u64,
    /// Decompressed size, in bytes
    pub decompressed_size: u64,
    /// Compression ratio (decompressed / compressed; 1.0 when the
    /// compressed size is 0)
    pub ratio: f64,
}

impl DecompressionResult {
    /// Builds a result from the decompressed payload and the original
    /// compressed size, deriving the decompressed size and the expansion
    /// ratio. A compressed size of 0 yields a ratio of 1.0 to avoid
    /// division by zero.
    pub fn new(data: Bytes, compressed_size: u64) -> Self {
        let decompressed_size = data.len() as u64;
        let ratio = match compressed_size {
            0 => 1.0,
            n => decompressed_size as f64 / n as f64,
        };

        Self {
            data,
            compressed_size,
            decompressed_size,
            ratio,
        }
    }
}