//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll - high-performance async web scraper for AI agents
//!
//! Text - Main text extraction (boilerplate removal)

use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;

/// Text extractor
/// Extracts readable text from HTML, discarding boilerplate
/// (navigation, headers, footers, sidebars, ads, ...).
pub struct TextExtractor {
    /// CSS selectors tried in order to locate the main content element.
    content_selectors: Vec<String>,
    /// CSS selectors for boilerplate elements whose text is excluded.
    exclude_selectors: Vec<String>,
    /// Whether extraction also splits the text into chunks.
    segment: bool,
    /// Target maximum chunk size, measured via `String::len` (bytes).
    chunk_size: usize,
}

impl Default for TextExtractor {
    /// Default configuration: common main-content selectors, a broad
    /// boilerplate exclusion list, and chunking enabled at 1000 bytes.
    fn default() -> Self {
        // Tried in order; first match wins.
        let content = [
            "article",
            "main",
            "[role=\"main\"]",
            ".post-content",
            ".entry-content",
            ".article-content",
            ".content",
            "#content",
        ];
        // Anything matching these is treated as boilerplate and dropped.
        let exclude = [
            "nav",
            "header",
            "footer",
            "aside",
            ".sidebar",
            ".navigation",
            ".menu",
            ".breadcrumb",
            ".pagination",
            ".comments",
            ".related",
            ".share",
            ".social",
            ".ad",
            ".advertisement",
            "[role=\"navigation\"]",
            "[role=\"banner\"]",
            "[role=\"contentinfo\"]",
            "[role=\"complementary\"]",
        ];

        Self {
            content_selectors: content.iter().map(|s| s.to_string()).collect(),
            exclude_selectors: exclude.iter().map(|s| s.to_string()).collect(),
            segment: true,
            chunk_size: 1000,
        }
    }
}

impl TextExtractor {
    /// New extractor
    pub fn new() -> Self {
        Self::default()
    }

    /// Configure segmentation
    pub fn with_chunking(mut self, enabled: bool, chunk_size: usize) -> Self {
        self.segment = enabled;
        self.chunk_size = chunk_size;
        self
    }

    /// Extract the main text
    pub fn extract(&self, html: &str) -> ExtractedText {
        let document = Html::parse_document(html);
        
        // Try main content selectors
        let main_element = self.find_main_content(&document);
        
        let text = if let Some(element) = main_element {
            self.extract_from_element(&element)
        } else {
            // Fallback: extract entire body
            self.extract_full_text(&document)
        };

        let chunks = if self.segment {
            self.segment_text(&text)
        } else {
            vec![text.clone()]
        };

        // Extract sections (headings + paragraphs)
        let sections = self.extract_sections(&document);

        ExtractedText {
            full_text: text,
            chunks,
            sections,
        }
    }

    /// Find the main content element
    fn find_main_content<'a>(&self, document: &'a Html) -> Option<ElementRef<'a>> {
        for selector_str in &self.content_selectors {
            if let Ok(selector) = Selector::parse(selector_str) {
                if let Some(element) = document.select(&selector).next() {
                    return Some(element);
                }
            }
        }
        None
    }

    /// Extract text from an element (without boilerplate)
    fn extract_from_element(&self, element: &ElementRef) -> String {
        let html = element.inner_html();
        let sub_doc = Html::parse_fragment(&html);
        
        // Build a set of elements to exclude
        let selectors: Vec<_> = self.exclude_selectors
            .iter()
            .filter_map(|s| Selector::parse(s).ok())
            .collect();
        
        let exclude_set: HashSet<_> = selectors.iter()
            .flat_map(|sel| sub_doc.select(sel))
            .map(|el| el.id())
            .collect();

        // Extract text from non-excluded elements
        let mut text_parts = Vec::new();
        
        for node in sub_doc.root_element().descendants() {
            if let Some(text) = node.value().as_text() {
                // Check if this element is inside an excluded element
                let mut excluded = false;
                let mut parent = node.parent();
                while let Some(p) = parent {
                    if exclude_set.contains(&p.id()) {
                        excluded = true;
                        break;
                    }
                    parent = p.parent();
                }

                if !excluded {
                    let t = text.trim();
                    if !t.is_empty() {
                        text_parts.push(t.to_string());
                    }
                }
            }
        }

        text_parts.join(" ")
    }

    /// Extract all text (fallback)
    fn extract_full_text(&self, document: &Html) -> String {
        // Exclude boilerplate
        let selectors: Vec<_> = self.exclude_selectors
            .iter()
            .filter_map(|s| Selector::parse(s).ok())
            .collect();
        
        let exclude_set: HashSet<_> = selectors.iter()
            .flat_map(|sel| document.select(sel))
            .map(|el| el.id())
            .collect();

        // Also exclude script, style, etc.
        let script_sel = Selector::parse("script, style, noscript").unwrap();
        let script_ids: HashSet<_> = document.select(&script_sel).map(|el| el.id()).collect();

        let mut text_parts = Vec::new();

        for node in document.root_element().descendants() {
            if let Some(text) = node.value().as_text() {
                let mut excluded = false;
                let mut parent = node.parent();
                while let Some(p) = parent {
                    if exclude_set.contains(&p.id()) || script_ids.contains(&p.id()) {
                        excluded = true;
                        break;
                    }
                    parent = p.parent();
                }

                if !excluded {
                    let t = text.trim();
                    if !t.is_empty() {
                        text_parts.push(t.to_string());
                    }
                }
            }
        }

        text_parts.join(" ")
    }

    /// Segment text into chunks
    fn segment_text(&self, text: &str) -> Vec<String> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();

        for sentence in text.split(|c| c == '.' || c == '!' || c == '?') {
            let sentence = sentence.trim();
            if sentence.is_empty() {
                continue;
            }

            let sentence_with_punct = format!("{}. ", sentence);

            if current_chunk.len() + sentence_with_punct.len() > self.chunk_size {
                if !current_chunk.is_empty() {
                    chunks.push(current_chunk.trim().to_string());
                }
                current_chunk = sentence_with_punct;
            } else {
                current_chunk.push_str(&sentence_with_punct);
            }
        }

        if !current_chunk.is_empty() {
            chunks.push(current_chunk.trim().to_string());
        }

        chunks
    }

    /// Extrait les sections (headings + contenu)
    fn extract_sections(&self, document: &Html) -> Vec<TextSection> {
        let mut sections = Vec::new();
        let heading_sel = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap();

        for heading in document.select(&heading_sel) {
            let level = heading.value().name().chars().nth(1)
                .and_then(|c| c.to_digit(10))
                .unwrap_or(1) as u8;
            
            let title = heading.text().collect::<Vec<_>>().join(" ").trim().to_string();
            
            // Récupérer le contenu jusqu'au prochain heading
            let content = self.extract_section_content(&heading);

            sections.push(TextSection {
                level,
                title,
                content,
            });
        }

        sections
    }

    /// Extrait le contenu après un heading
    fn extract_section_content(&self, heading: &ElementRef) -> String {
        let mut content = String::new();
        let mut current = heading.next_sibling();

        while let Some(sibling) = current {
            // Arrêter si on atteint un autre heading
            if let Some(element) = sibling.value().as_element() {
                let name = element.name();
                if name.starts_with('h') && name.len() == 2 {
                    break;
                }
            }

            // Récupérer le texte
            for node in sibling.descendants() {
                if let Some(text) = node.value().as_text() {
                    let t = text.trim();
                    if !t.is_empty() {
                        content.push_str(t);
                        content.push(' ');
                    }
                }
            }

            current = sibling.next_sibling();
        }

        content.trim().to_string()
    }
}

/// Texte extrait
/// Result of a text extraction.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// Full cleaned text.
    pub full_text: String,
    /// Sentence-aligned chunks (e.g. for embeddings); a single entry
    /// containing the full text when chunking is disabled.
    pub chunks: Vec<String>,
    /// Heading-delimited sections.
    pub sections: Vec<TextSection>,
}

/// Section de texte
/// A heading together with the text that follows it.
#[derive(Debug, Clone)]
pub struct TextSection {
    /// Heading level (1-6).
    pub level: u8,
    /// Heading text.
    pub title: String,
    /// Text between this heading and the next one.
    pub content: String,
}

/// Count the whitespace-separated words in `text`.
pub fn word_count(text: &str) -> usize {
    let mut total = 0;
    for _word in text.split_whitespace() {
        total += 1;
    }
    total
}

/// Count the non-whitespace characters in `text`.
pub fn char_count(text: &str) -> usize {
    text.chars()
        .fold(0, |acc, c| if c.is_whitespace() { acc } else { acc + 1 })
}

/// Detect the dominant language of `text` with a stopword heuristic.
///
/// Samples the first 1000 characters, lowercases them, tokenizes on
/// non-alphabetic boundaries, and counts how many common stopwords of
/// each supported language (fr/en/de/es) occur as whole words.
/// Returns `None` when fewer than 3 stopwords of any language are
/// found; ties are resolved in the order fr, en, de, es.
pub fn detect_language(text: &str) -> Option<String> {
    let sample = text.chars().take(1000).collect::<String>().to_lowercase();

    // Whole-word tokens only: the previous substring check wrongly
    // matched "the" inside "other" or "le" inside "apple", inflating
    // counts and mis-detecting languages.
    let tokens: HashSet<&str> = sample
        .split(|c: char| !c.is_alphabetic())
        .filter(|w| !w.is_empty())
        .collect();

    // Common stopwords per language.
    let french = ["le", "la", "les", "de", "du", "un", "une", "et", "est", "que"];
    let english = ["the", "a", "an", "of", "to", "in", "is", "and", "that", "for"];
    let german = ["der", "die", "das", "und", "ist", "ein", "eine", "für", "mit", "auf"];
    let spanish = ["el", "la", "los", "de", "un", "una", "que", "es", "en", "por"];

    let hits = |words: &[&str]| words.iter().filter(|w| tokens.contains(**w)).count();

    let fr = hits(&french);
    let en = hits(&english);
    let de = hits(&german);
    let es = hits(&spanish);

    let max = fr.max(en).max(de).max(es);
    // Too little evidence to decide.
    if max < 3 {
        return None;
    }

    let lang = if max == fr {
        "fr"
    } else if max == en {
        "en"
    } else if max == de {
        "de"
    } else {
        "es"
    };
    Some(lang.to_string())
}