halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Conditional - Conditional requests (ETag, If-Modified-Since)

use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;

/// Metadata cache for conditional requests.
///
/// Maps a normalized URL string to the validators (`ETag` / `Last-Modified`)
/// stored for it. The map sits behind an [`RwLock`], so every method takes
/// `&self`.
#[derive(Debug)]
pub struct ConditionalCache {
    /// Entry storage, keyed by the normalized URL string
    entries: RwLock<HashMap<String, CacheEntry>>,
    /// Max cache size (number of entries)
    max_size: usize,
}

/// Validator metadata stored for a single cached URL.
///
/// Both validators are optional; `ConditionalCache::set` stores whatever the
/// caller passes in, unchanged.
#[derive(Debug, Clone)]
pub struct CacheEntry {
    /// ETag value, if one was supplied when the entry was stored
    pub etag: Option<String>,
    /// Last-Modified value, if one was supplied when the entry was stored
    pub last_modified: Option<String>,
    /// Moment the entry was created (`Instant::now()` at the time of `set`)
    pub cached_at: std::time::Instant,
}

impl Default for ConditionalCache {
    fn default() -> Self {
        Self::new(10000)
    }
}

impl ConditionalCache {
    /// Creates an empty cache that holds at most `max_size` entries.
    ///
    /// A `max_size` of `0` yields a cache that never stores anything.
    pub fn new(max_size: usize) -> Self {
        Self {
            entries: RwLock::new(HashMap::new()),
            max_size,
        }
    }

    /// Builds the map key for `url`.
    ///
    /// Only the fragment is stripped. The query string is kept exactly as
    /// given, so URLs that differ only in parameter order get distinct keys.
    fn cache_key(url: &Url) -> String {
        let mut url = url.clone();
        url.set_fragment(None);
        url.to_string()
    }

    /// Returns a copy of the entry stored for `url`, if any.
    ///
    /// Yields `None` when no entry exists or when the lock is poisoned.
    pub fn get(&self, url: &Url) -> Option<CacheEntry> {
        let key = Self::cache_key(url);
        self.entries.read().ok()?.get(&key).cloned()
    }

    /// Stores (or overwrites) the validators for `url`.
    ///
    /// When a *new* key would push the cache past `max_size`, the oldest 10%
    /// of entries (at least one) are evicted first. Overwriting an existing
    /// key never triggers eviction. The call is a silent no-op when
    /// `max_size` is `0` or the lock is poisoned.
    pub fn set(&self, url: &Url, etag: Option<String>, last_modified: Option<String>) {
        // A zero-capacity cache must stay empty rather than grow unbounded.
        if self.max_size == 0 {
            return;
        }

        let key = Self::cache_key(url);
        let entry = CacheEntry {
            etag,
            last_modified,
            cached_at: std::time::Instant::now(),
        };

        if let Ok(mut entries) = self.entries.write() {
            // Replacing an existing entry does not grow the map, so eviction
            // is only needed when a new key arrives at full capacity.
            if !entries.contains_key(&key) && entries.len() >= self.max_size {
                // `max_size / 10` is 0 for small caches; always free one slot.
                let evict_count = (self.max_size / 10).max(1);
                Self::evict_oldest(&mut entries, evict_count);
            }
            entries.insert(key, entry);
        }
    }

    /// Removes the `count` entries with the earliest `cached_at` timestamp.
    ///
    /// `HashMap` iteration order is arbitrary, so the entries are ordered by
    /// age explicitly before picking the victims.
    fn evict_oldest(entries: &mut HashMap<String, CacheEntry>, count: usize) {
        let mut by_age: Vec<(std::time::Instant, &String)> = entries
            .iter()
            .map(|(key, entry)| (entry.cached_at, key))
            .collect();
        by_age.sort_unstable_by_key(|&(cached_at, _)| cached_at);

        // Clone only the keys that are actually evicted; this also ends the
        // shared borrow of the map before it is mutated.
        let stale: Vec<String> = by_age
            .into_iter()
            .take(count)
            .map(|(_, key)| key.clone())
            .collect();
        for key in stale {
            entries.remove(&key);
        }
    }

    /// Removes the entry stored for `url`, if any.
    ///
    /// Does nothing when the lock is poisoned.
    pub fn remove(&self, url: &Url) {
        let key = Self::cache_key(url);
        if let Ok(mut entries) = self.entries.write() {
            entries.remove(&key);
        }
    }

    /// Drops every entry.
    ///
    /// Does nothing when the lock is poisoned.
    pub fn clear(&self) {
        if let Ok(mut entries) = self.entries.write() {
            entries.clear();
        }
    }

    /// Number of entries currently stored (`0` if the lock is poisoned).
    pub fn len(&self) -> usize {
        self.entries.read().map(|e| e.len()).unwrap_or(0)
    }

    /// Returns `true` when [`len`](Self::len) is `0`.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Builder for a conditional request.
///
/// Holds the validators a caller may attach to an outgoing request; either,
/// both, or neither may be present.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConditionalRequest {
    /// ETag for If-None-Match
    pub etag: Option<String>,
    /// Date for If-Modified-Since
    pub last_modified: Option<String>,
}

impl ConditionalRequest {
    /// New empty conditional request
    pub fn new() -> Self {
        Self {
            etag: None,
            last_modified: None,
        }
    }

    /// From a cache entry
    pub fn from_cache(entry: &CacheEntry) -> Self {
        Self {
            etag: entry.etag.clone(),
            last_modified: entry.last_modified.clone(),
        }
    }

    /// Do we have conditions?
    pub fn has_conditions(&self) -> bool {
        self.etag.is_some() || self.last_modified.is_some()
    }
}

impl Default for ConditionalRequest {
    fn default() -> Self {
        Self::new()
    }
}