// halldyll-core 0.1.0
//
// Core scraping engine for Halldyll - high-performance async web scraper for AI agents
//! Frontier - Priority queue for crawling

use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::RwLock;
use url::Url;

use crate::types::config::CrawlStrategy;

/// Entry in the frontier
/// A single unit of work in the crawl frontier: one URL plus the metadata
/// used to order it in the queue and trace where it came from.
#[derive(Debug, Clone)]
pub struct CrawlEntry {
    /// URL to crawl.
    pub url: Url,
    /// Crawl depth at which this URL was discovered.
    pub depth: u32,
    /// Priority score; higher values are dequeued sooner (see the `Ord` impl).
    pub priority: i32,
    /// URL of the page this link was found on, if any.
    pub parent_url: Option<Url>,
    /// Instant this entry was created (set by `CrawlEntry::new`).
    pub added_at: std::time::Instant,
}

impl CrawlEntry {
    /// New entry
    pub fn new(url: Url, depth: u32, priority: i32) -> Self {
        Self {
            url,
            depth,
            priority,
            parent_url: None,
            added_at: std::time::Instant::now(),
        }
    }

    /// With parent URL
    pub fn with_parent(mut self, parent: Url) -> Self {
        self.parent_url = Some(parent);
        self
    }
}

// Ordering for `BinaryHeap` (a max-heap): higher `priority` pops first,
// ties broken by lower `depth` (BFS-like), then by URL text.
//
// NOTE(fix): `PartialEq` previously compared only `url` while `Ord` compared
// `priority`/`depth`. That violates the `Ord` contract, which requires
// `a == b` exactly when `a.cmp(&b) == Ordering::Equal`. Equality is now
// derived from `cmp`, and `cmp` gained a URL tie-breaker, so `Eq` and `Ord`
// agree and the ordering is a proper total order.
impl PartialEq for CrawlEntry {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for CrawlEntry {}

impl PartialOrd for CrawlEntry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for CrawlEntry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Priority first (higher wins), then reversed depth (lower depth
        // wins, BFS), then URL string as a deterministic final tie-breaker.
        self.priority
            .cmp(&other.priority)
            .then_with(|| other.depth.cmp(&self.depth))
            .then_with(|| self.url.as_str().cmp(other.url.as_str()))
    }
}

/// Thread-safe frontier
/// Thread-safe crawl frontier: a deduplicating priority queue with
/// per-domain and global quotas. All state lives behind `RwLock`s and every
/// method takes `&self`, so the frontier can be shared between threads.
#[allow(dead_code)] // `strategy` is stored but never read in this file.
pub struct Frontier {
    /// Pending entries, ordered by `CrawlEntry`'s `Ord` impl.
    queue: RwLock<BinaryHeap<CrawlEntry>>,
    /// Every URL string ever pushed or marked seen; used for dedup and for
    /// the global `max_total` quota.
    seen: RwLock<HashSet<String>>,
    /// Number of accepted URLs per host (key is `Url::host_str`).
    domain_counts: RwLock<HashMap<String, u32>>,
    /// Crawl strategy — unused here; presumably consumed by the crawler
    /// driving this frontier. TODO(review): confirm against callers.
    strategy: CrawlStrategy,
    /// Entries deeper than this are rejected by `push`.
    max_depth: u32,
    /// Maximum number of accepted URLs per host.
    max_per_domain: u32,
    /// Maximum size of `seen` before further pushes are rejected.
    max_total: u32,
}

impl Frontier {
    /// Creates an empty frontier with the given strategy and limits.
    pub fn new(strategy: CrawlStrategy, max_depth: u32, max_per_domain: u32, max_total: u32) -> Self {
        Self {
            queue: RwLock::new(BinaryHeap::new()),
            seen: RwLock::new(HashSet::new()),
            domain_counts: RwLock::new(HashMap::new()),
            strategy,
            max_depth,
            max_per_domain,
            max_total,
        }
    }

    /// Attempts to add a URL to the frontier.
    ///
    /// Returns `false` when the entry is rejected: deeper than `max_depth`,
    /// already seen, over the per-domain quota, or over the global quota.
    ///
    /// Fixes over the previous version:
    /// - The checks ran under read locks that were released before the write
    ///   locks were taken, so two threads pushing the same URL could both
    ///   pass the checks, enqueue it twice, and double-count the domain. The
    ///   check and the insert now happen under the same write locks, with
    ///   `HashSet::insert`'s return value as the authoritative dedup test.
    /// - A `max_per_domain` of 0 previously still accepted the first URL of
    ///   each domain (unknown domains skipped the quota check entirely).
    pub fn push(&self, entry: CrawlEntry) -> bool {
        // Depth check needs no locks.
        if entry.depth > self.max_depth {
            return false;
        }

        let url_key = entry.url.to_string();
        let domain = entry.url.host_str().unwrap_or("").to_string();

        // Lock order everywhere in this impl: seen -> queue -> domain_counts.
        // A single consistent order prevents deadlock between concurrent
        // callers (the old code used a different order in `clear`).
        let mut seen = self.seen.write().unwrap();
        let mut queue = self.queue.write().unwrap();
        let mut counts = self.domain_counts.write().unwrap();

        // Global quota: `seen` counts every URL ever accepted or marked.
        if seen.len() >= self.max_total as usize {
            return false;
        }

        // Per-domain quota; an unknown domain has an effective count of 0.
        let domain_count = counts.get(&domain).copied().unwrap_or(0);
        if domain_count >= self.max_per_domain {
            return false;
        }

        // Dedup: `insert` returns false when the key was already present.
        if !seen.insert(url_key) {
            return false;
        }

        queue.push(entry);
        counts.insert(domain, domain_count + 1);
        true
    }

    /// Adds multiple entries; returns how many were accepted.
    pub fn push_many(&self, entries: Vec<CrawlEntry>) -> usize {
        // Consume each entry directly — the old code cloned every one just
        // to satisfy the `filter` borrow.
        entries.into_iter().map(|e| usize::from(self.push(e))).sum()
    }

    /// Pops the highest-priority entry, or `None` when the queue is empty.
    pub fn pop(&self) -> Option<CrawlEntry> {
        self.queue.write().unwrap().pop()
    }

    /// Number of entries currently queued.
    pub fn len(&self) -> usize {
        self.queue.read().unwrap().len()
    }

    /// True when no entries are queued.
    pub fn is_empty(&self) -> bool {
        self.queue.read().unwrap().is_empty()
    }

    /// Number of distinct URLs ever pushed or marked seen.
    pub fn seen_count(&self) -> usize {
        self.seen.read().unwrap().len()
    }

    /// Has this URL already been pushed or marked seen?
    pub fn has_seen(&self, url: &Url) -> bool {
        // `HashSet<String>` looks up by `&str` via `Borrow<str>` — no need
        // to allocate a fresh `String` as the old code did.
        self.seen.read().unwrap().contains(url.as_str())
    }

    /// Marks a URL as seen without queueing it.
    pub fn mark_seen(&self, url: &Url) {
        self.seen.write().unwrap().insert(url.to_string());
    }

    /// Removes all queued entries, seen URLs, and domain counters.
    pub fn clear(&self) {
        // Same lock order as `push`: seen -> queue -> domain_counts.
        let mut seen = self.seen.write().unwrap();
        let mut queue = self.queue.write().unwrap();
        let mut counts = self.domain_counts.write().unwrap();
        queue.clear();
        seen.clear();
        counts.clear();
    }

    /// Snapshot of accepted-URL counts per domain.
    pub fn domain_stats(&self) -> HashMap<String, u32> {
        self.domain_counts.read().unwrap().clone()
    }
}