halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
//! Snapshot - Raw snapshot storage

use sha2::{Sha256, Digest};
use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;


/// Raw snapshot of a page
#[derive(Debug, Clone)]
pub struct RawSnapshot {
    /// Source URL
    pub url: Url,
    /// Raw HTML
    pub html: String,
    /// Raw HTTP headers
    pub headers: HashMap<String, String>,
    /// Encoding used
    pub encoding: String,
    /// Compressed size
    pub compressed_size: Option<u64>,
    /// Decompressed size
    pub decompressed_size: u64,
    /// Timestamp
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// SHA-256 hash of the HTML
    pub hash: String,
}

impl RawSnapshot {
    /// Create a new snapshot from an HTML string
    pub fn new(url: Url, html: String, headers: HashMap<String, String>) -> Self {
        let hash = Self::compute_hash(&html);
        let decompressed_size = html.len() as u64;
        
        Self {
            url,
            html,
            headers,
            encoding: "utf-8".to_string(),
            compressed_size: None,
            decompressed_size,
            timestamp: chrono::Utc::now(),
            hash,
        }
    }

    /// Create a new snapshot from a raw HTTP response; the body is decoded
    /// as UTF-8 lossily, so invalid byte sequences are replaced
    pub fn from_response(
        url: Url,
        status_code: u16,
        headers: HashMap<String, String>,
        body: Vec<u8>,
    ) -> Self {
        let html = String::from_utf8_lossy(&body).to_string();
        let hash = Self::compute_hash(&html);
        let decompressed_size = body.len() as u64;

        let mut snapshot_headers = headers;
        snapshot_headers.insert("x-status-code".to_string(), status_code.to_string());

        Self {
            url,
            html,
            headers: snapshot_headers,
            encoding: "utf-8".to_string(),
            compressed_size: None,
            decompressed_size,
            timestamp: chrono::Utc::now(),
            hash,
        }
    }

    /// Compute SHA-256 hash
    fn compute_hash(content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }

    /// Serialize to bytes
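    ///
    /// The output is a simple text layout: metadata lines (URL, Timestamp,
    /// Hash, Encoding), then the response headers, a triple-newline
    /// separator, and finally the raw HTML. A rough sketch of what the
    /// output looks like (field values are illustrative, not from a real
    /// run):
    ///
    /// ```text
    /// URL: https://example.com/
    /// Timestamp: 2024-01-01T00:00:00+00:00
    /// Hash: 9f86d081884c7d65...
    /// Encoding: utf-8
    /// content-type: text/html
    ///
    ///
    ///
    /// <html>...</html>
    /// ```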
    pub fn serialize(&self) -> Vec<u8> {
        // Simple text format: metadata and headers, a triple-newline separator, then the raw HTML
        let mut output = String::new();
        
        // Metadata
        output.push_str(&format!("URL: {}\n", self.url));
        output.push_str(&format!("Timestamp: {}\n", self.timestamp.to_rfc3339()));
        output.push_str(&format!("Hash: {}\n", self.hash));
        output.push_str(&format!("Encoding: {}\n", self.encoding));
        
        // Headers
        for (key, value) in &self.headers {
            output.push_str(&format!("{}: {}\n", key, value));
        }
        
        // Separator
        output.push_str("\n\n\n");
        
        // HTML
        output.push_str(&self.html);
        
        output.into_bytes()
    }
}

/// In-memory snapshot store
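///
/// Snapshots are keyed by their content hash, so identical HTML stored more
/// than once (even from different URLs) occupies a single entry; the URL
/// index maps each URL to the hash of its most recently stored snapshot.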
pub struct SnapshotStore {
    /// Snapshots by hash
    snapshots: RwLock<HashMap<String, RawSnapshot>>,
    /// URL -> hash index
    url_index: RwLock<HashMap<String, String>>,
    /// Maximum number of snapshots kept in memory
    max_snapshots: usize,
}

impl Default for SnapshotStore {
    fn default() -> Self {
        Self::new(10000)
    }
}

impl SnapshotStore {
    /// Create a new store that keeps at most `max_snapshots` snapshots
    pub fn new(max_snapshots: usize) -> Self {
        Self {
            snapshots: RwLock::new(HashMap::new()),
            url_index: RwLock::new(HashMap::new()),
            max_snapshots,
        }
    }

    /// Store a snapshot
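    ///
    /// Returns the snapshot's SHA-256 hash, which is the key later accepted
    /// by `get_by_hash`. If the store is already at capacity, a batch of
    /// existing entries is evicted before the new snapshot is inserted.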
    pub fn store(&self, snapshot: RawSnapshot) -> String {
        let hash = snapshot.hash.clone();
        let url_key = snapshot.url.to_string();

        // Check the limit
        {
            let snapshots = self.snapshots.read().unwrap();
            if snapshots.len() >= self.max_snapshots {
                // Capacity reached: release the read lock and take write locks
                drop(snapshots);
                let mut snapshots = self.snapshots.write().unwrap();
                let mut url_index = self.url_index.write().unwrap();

                // Evict roughly 10% of the entries (at least one). HashMap
                // iteration order is arbitrary, so this is not FIFO.
                let to_remove: Vec<_> = snapshots
                    .keys()
                    .take((self.max_snapshots / 10).max(1))
                    .cloned()
                    .collect();
                for h in to_remove {
                    snapshots.remove(&h);
                    // Clean up the index
                    url_index.retain(|_, v| v != &h);
                }
            }
        }

        // Store
        self.snapshots.write().unwrap().insert(hash.clone(), snapshot);
        self.url_index.write().unwrap().insert(url_key, hash.clone());

        hash
    }

    /// Get by hash
    pub fn get_by_hash(&self, hash: &str) -> Option<RawSnapshot> {
        self.snapshots.read().unwrap().get(hash).cloned()
    }

    /// Get by URL
    pub fn get_by_url(&self, url: &Url) -> Option<RawSnapshot> {
        let url_key = url.to_string();
        let hash = self.url_index.read().unwrap().get(&url_key)?.clone();
        self.get_by_hash(&hash)
    }

    /// Check if a hash exists
    pub fn has_hash(&self, hash: &str) -> bool {
        self.snapshots.read().unwrap().contains_key(hash)
    }

    /// Number of snapshots
    pub fn len(&self) -> usize {
        self.snapshots.read().unwrap().len()
    }

    /// Is the store empty?
    pub fn is_empty(&self) -> bool {
        self.snapshots.read().unwrap().is_empty()
    }

    /// Clear the store
    pub fn clear(&self) {
        self.snapshots.write().unwrap().clear();
        self.url_index.write().unwrap().clear();
    }
}
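
// A few illustrative unit tests, added as a sketch of how the types above are
// meant to be used; they exercise only the items defined in this module.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use url::Url;

    /// Helper building a snapshot with empty headers (test-only convenience).
    fn snapshot_for(url: &str, html: &str) -> RawSnapshot {
        RawSnapshot::new(Url::parse(url).unwrap(), html.to_string(), HashMap::new())
    }

    #[test]
    fn identical_html_produces_identical_hashes() {
        let a = snapshot_for("https://example.com/a", "<html>same</html>");
        let b = snapshot_for("https://example.com/b", "<html>same</html>");
        assert_eq!(a.hash, b.hash);
    }

    #[test]
    fn store_and_retrieve_by_url_and_hash() {
        let store = SnapshotStore::new(100);
        let snapshot = snapshot_for("https://example.com/", "<html></html>");
        let url = snapshot.url.clone();

        let hash = store.store(snapshot);

        assert!(store.has_hash(&hash));
        assert_eq!(store.len(), 1);
        assert_eq!(store.get_by_url(&url).unwrap().hash, hash);
    }

    #[test]
    fn serialize_contains_metadata_separator_and_html() {
        let snapshot = snapshot_for("https://example.com/", "<html>body</html>");
        let serialized = String::from_utf8(snapshot.serialize()).unwrap();

        assert!(serialized.starts_with("URL: https://example.com/\n"));
        assert!(serialized.contains("\n\n\n"));
        assert!(serialized.ends_with("<html>body</html>"));
    }
}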