//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll — high-performance async web scraper for AI agents.
//!
//! Normalized — storage of normalized documents.

use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;

use crate::types::Document;

/// Store for normalized documents
/// Thread-safe, in-memory store for normalized documents, keyed by URL.
///
/// Reads and writes go through an interior `RwLock`, so a shared
/// `&NormalizedStore` can be used from multiple threads.
pub struct NormalizedStore {
    /// Documents keyed by the string form of their source URL
    /// (`Url::to_string()` of `Document::source_url`).
    documents: RwLock<HashMap<String, Document>>,
    /// Soft cap on the number of stored documents; `store` evicts a batch
    /// of entries once this size is reached.
    max_documents: usize,
}

impl Default for NormalizedStore {
    /// Builds a store with the default capacity of 10,000 documents.
    fn default() -> Self {
        /// Default soft cap on stored documents.
        const DEFAULT_MAX_DOCUMENTS: usize = 10_000;
        Self::new(DEFAULT_MAX_DOCUMENTS)
    }
}

impl NormalizedStore {
    /// Creates an empty store that evicts once `max_documents` is reached.
    pub fn new(max_documents: usize) -> Self {
        Self {
            documents: RwLock::new(HashMap::new()),
            max_documents,
        }
    }

    /// Stores a document, keyed by its `source_url`.
    ///
    /// If inserting a *new* URL would exceed `max_documents`, roughly 10% of
    /// the existing entries (at least one) are evicted first. `HashMap`
    /// iteration order is unspecified, so the evicted entries are arbitrary —
    /// not the oldest. Replacing an already-stored URL never triggers
    /// eviction, since it does not grow the map.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn store(&self, document: Document) {
        let url_key = document.source_url.to_string();

        // Hold a single write lock for the whole check-evict-insert sequence.
        // Checking the size under a read lock and mutating under a separate
        // write lock (as separate acquisitions) is a TOCTOU race: the map can
        // change between the locks.
        let mut documents = self.documents.write().unwrap();

        if documents.len() >= self.max_documents && !documents.contains_key(&url_key) {
            // Evict at least one entry so small caps (max_documents < 10,
            // where max_documents / 10 == 0) still shrink instead of
            // growing without bound.
            let evict_count = (self.max_documents / 10).max(1);
            let victims: Vec<String> = documents.keys().take(evict_count).cloned().collect();
            for key in victims {
                documents.remove(&key);
            }
        }

        documents.insert(url_key, document);
    }

    /// Returns a clone of the document stored for `url`, if any.
    pub fn get(&self, url: &Url) -> Option<Document> {
        let url_key = url.to_string();
        self.documents.read().unwrap().get(&url_key).cloned()
    }

    /// Returns `true` if a document is stored for `url`.
    pub fn has(&self, url: &Url) -> bool {
        let url_key = url.to_string();
        self.documents.read().unwrap().contains_key(&url_key)
    }

    /// Returns the number of stored documents.
    pub fn len(&self) -> usize {
        self.documents.read().unwrap().len()
    }

    /// Returns `true` if no documents are stored.
    pub fn is_empty(&self) -> bool {
        self.documents.read().unwrap().is_empty()
    }

    /// Removes all documents from the store.
    pub fn clear(&self) {
        self.documents.write().unwrap().clear();
    }

    /// Calls `f` on every stored document while holding the read lock.
    ///
    /// The callback must not call back into this store's write methods
    /// (`store`, `clear`) or the thread would deadlock on the lock.
    pub fn iter<F>(&self, mut f: F)
    where
        F: FnMut(&Document),
    {
        let documents = self.documents.read().unwrap();
        for doc in documents.values() {
            f(doc);
        }
    }

    /// Returns clones of all stored documents, in unspecified order.
    pub fn export_all(&self) -> Vec<Document> {
        self.documents.read().unwrap().values().cloned().collect()
    }
}