halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Dedup - Deduplication by URL and by content

use sha2::{Sha256, Digest};
use std::collections::HashSet;
use std::sync::RwLock;

/// Deduplication by URL
///
/// Thread-safe set of URL strings that have already been seen. URLs are
/// compared byte-for-byte exactly as given; this type performs no
/// normalization itself, so callers should pass already-normalized URLs.
#[derive(Debug, Default)]
pub struct UrlDedup {
    /// URLs seen so far, stored verbatim as passed in by the caller
    seen: RwLock<HashSet<String>>,
}

impl UrlDedup {
    /// Creates an empty deduplicator
    pub fn new() -> Self {
        Self::default()
    }

    /// Takes the guard out of a lock result even when the lock is poisoned.
    ///
    /// Every critical section in this type is a single `HashSet` call, so a
    /// panic in another thread cannot leave the set half-updated. Recovering
    /// the guard keeps one crashed worker from making every later call panic.
    fn recover<G>(result: std::sync::LockResult<G>) -> G {
        result.unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Returns `true` if `url` has already been marked as seen
    pub fn is_duplicate(&self, url: &str) -> bool {
        Self::recover(self.seen.read()).contains(url)
    }

    /// Marks `url` as seen; returns `true` if it was not present before
    pub fn mark_seen(&self, url: &str) -> bool {
        Self::recover(self.seen.write()).insert(url.to_owned())
    }

    /// Checks and marks `url` under a single write lock.
    ///
    /// Returns `true` if the URL is new (and is now recorded), `false` if it
    /// had already been seen.
    pub fn check_and_mark(&self, url: &str) -> bool {
        let mut seen = Self::recover(self.seen.write());
        // Look up by `&str` first so a duplicate costs no allocation.
        if seen.contains(url) {
            return false;
        }
        seen.insert(url.to_owned())
    }

    /// Number of distinct URLs seen
    pub fn count(&self) -> usize {
        Self::recover(self.seen.read()).len()
    }

    /// Forgets every URL seen so far
    pub fn clear(&self) {
        Self::recover(self.seen.write()).clear();
    }
}

/// Deduplication by content (hash + simhash)
///
/// Keeps two independent, lock-protected records of the content seen so far:
/// exact hashes for identical content and 64-bit simhashes for content that
/// differs only slightly.
#[derive(Debug)]
pub struct ContentDedup {
    /// Exact content hashes seen so far
    exact_hashes: RwLock<HashSet<String>>,
    /// Simhashes of the content seen so far; scanned linearly for near-duplicates
    simhashes: RwLock<Vec<u64>>,
    /// Hamming distance threshold for simhash
    hamming_threshold: u32,
}

impl Default for ContentDedup {
    /// Builds a deduplicator that uses a Hamming threshold of 3
    fn default() -> Self {
        const DEFAULT_HAMMING_THRESHOLD: u32 = 3;
        Self::new(DEFAULT_HAMMING_THRESHOLD)
    }
}

impl ContentDedup {
    /// Creates an empty deduplicator.
    ///
    /// `hamming_threshold` is the largest Hamming distance (inclusive) between
    /// two simhashes at which contents still count as near-duplicates.
    pub fn new(hamming_threshold: u32) -> Self {
        Self {
            exact_hashes: RwLock::new(HashSet::new()),
            simhashes: RwLock::new(Vec::new()),
            hamming_threshold,
        }
    }

    /// Takes the guard out of a lock result even when the lock is poisoned.
    ///
    /// The critical sections here only call `HashSet`/`Vec` methods, so the
    /// collections stay usable after a panic in another thread. Recovering the
    /// guard keeps one crashed worker from making every later call panic.
    fn recover<G>(result: std::sync::LockResult<G>) -> G {
        result.unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Returns `true` if any simhash in `known` lies within the configured
    /// Hamming threshold of `simhash`.
    fn has_near_match(&self, known: &[u64], simhash: u64) -> bool {
        known
            .iter()
            .any(|&existing| Self::hamming_distance(simhash, existing) <= self.hamming_threshold)
    }

    /// SHA-256 hash of content, rendered as lowercase hexadecimal
    pub fn hash_content(content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }

    /// Simhash of content (for near-duplicates).
    ///
    /// Each whitespace-separated token is hashed with FNV-1a. For every one of
    /// the 64 bit positions, a token votes +1 if its hash has that bit set and
    /// -1 otherwise. The fingerprint has a bit set wherever the vote total is
    /// positive. Content with no tokens yields `0`.
    pub fn simhash(content: &str) -> u64 {
        let mut votes = [0i32; 64];

        for word in content.split_whitespace() {
            let hash = Self::fnv_hash(word);
            for (bit, vote) in votes.iter_mut().enumerate() {
                if (hash >> bit) & 1 == 1 {
                    *vote += 1;
                } else {
                    *vote -= 1;
                }
            }
        }

        // Fold the positive vote totals into the fingerprint.
        votes
            .iter()
            .enumerate()
            .filter(|&(_, &vote)| vote > 0)
            .fold(0u64, |fingerprint, (bit, _)| fingerprint | (1u64 << bit))
    }

    /// 64-bit FNV-1a hash of a token's UTF-8 bytes
    fn fnv_hash(s: &str) -> u64 {
        const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
        const FNV_PRIME: u64 = 0x100000001b3;

        s.bytes().fold(FNV_OFFSET_BASIS, |hash, byte| {
            (hash ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        })
    }

    /// Hamming distance between two simhashes (number of differing bits)
    pub fn hamming_distance(a: u64, b: u64) -> u32 {
        (a ^ b).count_ones()
    }

    /// Check if content is an exact duplicate (same SHA-256 as content seen before)
    pub fn is_exact_duplicate(&self, content: &str) -> bool {
        let hash = Self::hash_content(content);
        Self::recover(self.exact_hashes.read()).contains(&hash)
    }

    /// Check if content is a near-duplicate, i.e. its simhash lies within the
    /// Hamming threshold of any simhash recorded so far.
    pub fn is_near_duplicate(&self, content: &str) -> bool {
        let simhash = Self::simhash(content);
        let known = Self::recover(self.simhashes.read());
        self.has_near_match(&known, simhash)
    }

    /// Check if duplicate (exact or near)
    pub fn is_duplicate(&self, content: &str) -> bool {
        self.is_exact_duplicate(content) || self.is_near_duplicate(content)
    }

    /// Mark content as seen.
    ///
    /// Both records are updated while holding both write locks, so readers
    /// never see one updated without the other. The simhash is stored only
    /// when the exact hash is new, so marking the same content repeatedly
    /// does not grow the simhash list.
    pub fn mark_seen(&self, content: &str) {
        // Hash before locking to keep the critical section short.
        let hash = Self::hash_content(content);
        let simhash = Self::simhash(content);

        // Lock order everywhere in this type: exact_hashes, then simhashes.
        let mut exact = Self::recover(self.exact_hashes.write());
        let mut sims = Self::recover(self.simhashes.write());
        if exact.insert(hash) {
            sims.push(simhash);
        }
    }

    /// Check and mark in one atomic operation.
    ///
    /// Returns `true` if the content is new (neither an exact nor a near
    /// duplicate) and records it. Returns `false` otherwise and records
    /// nothing. The decisive check and the insertion happen under the same
    /// write locks, so two threads racing on equivalent content cannot both
    /// get `true`.
    pub fn check_and_mark(&self, content: &str) -> bool {
        let hash = Self::hash_content(content);

        // Fast path: exact duplicates are rejected under a read lock only.
        // The guard is dropped at the end of this statement, before any
        // write lock is requested.
        let already_seen = Self::recover(self.exact_hashes.read()).contains(&hash);
        if already_seen {
            return false;
        }

        let simhash = Self::simhash(content);

        // Lock order everywhere in this type: exact_hashes, then simhashes.
        let mut exact = Self::recover(self.exact_hashes.write());
        let mut sims = Self::recover(self.simhashes.write());

        // Re-check under the write locks: another thread may have marked
        // equivalent content since the fast-path check.
        if exact.contains(&hash) || self.has_near_match(&sims, simhash) {
            return false;
        }

        exact.insert(hash);
        sims.push(simhash);
        true
    }

    /// Number of contents seen (distinct exact hashes)
    pub fn count(&self) -> usize {
        Self::recover(self.exact_hashes.read()).len()
    }

    /// Clear the deduplicator, emptying both records under both write locks
    pub fn clear(&self) {
        // Lock order everywhere in this type: exact_hashes, then simhashes.
        let mut exact = Self::recover(self.exact_hashes.write());
        let mut sims = Self::recover(self.simhashes.write());
        exact.clear();
        sims.clear();
    }
}

/// Normalize text before deduplication.
///
/// Lowercases the input, drops leading and trailing whitespace, and collapses
/// every run of whitespace into a single ASCII space.
pub fn normalize_text_for_dedup(text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());

    for word in lowered.split_whitespace() {
        // `split_whitespace` never yields empty items, so an empty buffer
        // means this is the first word and needs no separator.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}