halldyll_core/parse/
text.rs

1//! Text - Main text extraction (boilerplate removal)
2
3use scraper::{Html, Selector, ElementRef};
4use std::collections::HashSet;
5
/// Extracts the main readable text from an HTML document.
///
/// The extractor first looks for a main-content element using
/// `content_selectors`, skips text nested under elements matching
/// `exclude_selectors`, and can optionally split the result into chunks.
///
/// Derives `Debug` and `Clone` like the other public types of this module.
#[derive(Debug, Clone)]
pub struct TextExtractor {
    /// CSS selectors tried in order to locate the main content element.
    content_selectors: Vec<String>,
    /// CSS selectors for boilerplate elements whose text is skipped.
    exclude_selectors: Vec<String>,
    /// Whether the extracted text is segmented into chunks.
    segment: bool,
    /// Chunk size limit used when segmentation is enabled (compared against `str::len`, i.e. bytes).
    chunk_size: usize,
}
17
18impl Default for TextExtractor {
19    fn default() -> Self {
20        Self {
21            content_selectors: vec![
22                "article".to_string(),
23                "main".to_string(),
24                "[role=\"main\"]".to_string(),
25                ".post-content".to_string(),
26                ".entry-content".to_string(),
27                ".article-content".to_string(),
28                ".content".to_string(),
29                "#content".to_string(),
30            ],
31            exclude_selectors: vec![
32                "nav".to_string(),
33                "header".to_string(),
34                "footer".to_string(),
35                "aside".to_string(),
36                ".sidebar".to_string(),
37                ".navigation".to_string(),
38                ".menu".to_string(),
39                ".breadcrumb".to_string(),
40                ".pagination".to_string(),
41                ".comments".to_string(),
42                ".related".to_string(),
43                ".share".to_string(),
44                ".social".to_string(),
45                ".ad".to_string(),
46                ".advertisement".to_string(),
47                "[role=\"navigation\"]".to_string(),
48                "[role=\"banner\"]".to_string(),
49                "[role=\"contentinfo\"]".to_string(),
50                "[role=\"complementary\"]".to_string(),
51            ],
52            segment: true,
53            chunk_size: 1000,
54        }
55    }
56}
57
58impl TextExtractor {
59    /// New extractor
60    pub fn new() -> Self {
61        Self::default()
62    }
63
64    /// Configure segmentation
65    pub fn with_chunking(mut self, enabled: bool, chunk_size: usize) -> Self {
66        self.segment = enabled;
67        self.chunk_size = chunk_size;
68        self
69    }
70
71    /// Extract the main text
72    pub fn extract(&self, html: &str) -> ExtractedText {
73        let document = Html::parse_document(html);
74        
75        // Try main content selectors
76        let main_element = self.find_main_content(&document);
77        
78        let text = if let Some(element) = main_element {
79            self.extract_from_element(&element)
80        } else {
81            // Fallback: extract entire body
82            self.extract_full_text(&document)
83        };
84
85        let chunks = if self.segment {
86            self.segment_text(&text)
87        } else {
88            vec![text.clone()]
89        };
90
91        // Extract sections (headings + paragraphs)
92        let sections = self.extract_sections(&document);
93
94        ExtractedText {
95            full_text: text,
96            chunks,
97            sections,
98        }
99    }
100
101    /// Find the main content element
102    fn find_main_content<'a>(&self, document: &'a Html) -> Option<ElementRef<'a>> {
103        for selector_str in &self.content_selectors {
104            if let Ok(selector) = Selector::parse(selector_str) {
105                if let Some(element) = document.select(&selector).next() {
106                    return Some(element);
107                }
108            }
109        }
110        None
111    }
112
113    /// Extract text from an element (without boilerplate)
114    fn extract_from_element(&self, element: &ElementRef) -> String {
115        let html = element.inner_html();
116        let sub_doc = Html::parse_fragment(&html);
117        
118        // Build a set of elements to exclude
119        let selectors: Vec<_> = self.exclude_selectors
120            .iter()
121            .filter_map(|s| Selector::parse(s).ok())
122            .collect();
123        
124        let exclude_set: HashSet<_> = selectors.iter()
125            .flat_map(|sel| sub_doc.select(sel))
126            .map(|el| el.id())
127            .collect();
128
129        // Extract text from non-excluded elements
130        let mut text_parts = Vec::new();
131        
132        for node in sub_doc.root_element().descendants() {
133            if let Some(text) = node.value().as_text() {
134                // Check if this element is inside an excluded element
135                let mut excluded = false;
136                let mut parent = node.parent();
137                while let Some(p) = parent {
138                    if exclude_set.contains(&p.id()) {
139                        excluded = true;
140                        break;
141                    }
142                    parent = p.parent();
143                }
144
145                if !excluded {
146                    let t = text.trim();
147                    if !t.is_empty() {
148                        text_parts.push(t.to_string());
149                    }
150                }
151            }
152        }
153
154        text_parts.join(" ")
155    }
156
157    /// Extract all text (fallback)
158    fn extract_full_text(&self, document: &Html) -> String {
159        // Exclude boilerplate
160        let selectors: Vec<_> = self.exclude_selectors
161            .iter()
162            .filter_map(|s| Selector::parse(s).ok())
163            .collect();
164        
165        let exclude_set: HashSet<_> = selectors.iter()
166            .flat_map(|sel| document.select(sel))
167            .map(|el| el.id())
168            .collect();
169
170        // Also exclude script, style, etc.
171        let script_sel = Selector::parse("script, style, noscript").unwrap();
172        let script_ids: HashSet<_> = document.select(&script_sel).map(|el| el.id()).collect();
173
174        let mut text_parts = Vec::new();
175
176        for node in document.root_element().descendants() {
177            if let Some(text) = node.value().as_text() {
178                let mut excluded = false;
179                let mut parent = node.parent();
180                while let Some(p) = parent {
181                    if exclude_set.contains(&p.id()) || script_ids.contains(&p.id()) {
182                        excluded = true;
183                        break;
184                    }
185                    parent = p.parent();
186                }
187
188                if !excluded {
189                    let t = text.trim();
190                    if !t.is_empty() {
191                        text_parts.push(t.to_string());
192                    }
193                }
194            }
195        }
196
197        text_parts.join(" ")
198    }
199
200    /// Segment text into chunks
201    fn segment_text(&self, text: &str) -> Vec<String> {
202        let mut chunks = Vec::new();
203        let mut current_chunk = String::new();
204
205        for sentence in text.split(|c| c == '.' || c == '!' || c == '?') {
206            let sentence = sentence.trim();
207            if sentence.is_empty() {
208                continue;
209            }
210
211            let sentence_with_punct = format!("{}. ", sentence);
212
213            if current_chunk.len() + sentence_with_punct.len() > self.chunk_size {
214                if !current_chunk.is_empty() {
215                    chunks.push(current_chunk.trim().to_string());
216                }
217                current_chunk = sentence_with_punct;
218            } else {
219                current_chunk.push_str(&sentence_with_punct);
220            }
221        }
222
223        if !current_chunk.is_empty() {
224            chunks.push(current_chunk.trim().to_string());
225        }
226
227        chunks
228    }
229
230    /// Extrait les sections (headings + contenu)
231    fn extract_sections(&self, document: &Html) -> Vec<TextSection> {
232        let mut sections = Vec::new();
233        let heading_sel = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap();
234
235        for heading in document.select(&heading_sel) {
236            let level = heading.value().name().chars().nth(1)
237                .and_then(|c| c.to_digit(10))
238                .unwrap_or(1) as u8;
239            
240            let title = heading.text().collect::<Vec<_>>().join(" ").trim().to_string();
241            
242            // Récupérer le contenu jusqu'au prochain heading
243            let content = self.extract_section_content(&heading);
244
245            sections.push(TextSection {
246                level,
247                title,
248                content,
249            });
250        }
251
252        sections
253    }
254
255    /// Extrait le contenu après un heading
256    fn extract_section_content(&self, heading: &ElementRef) -> String {
257        let mut content = String::new();
258        let mut current = heading.next_sibling();
259
260        while let Some(sibling) = current {
261            // Arrêter si on atteint un autre heading
262            if let Some(element) = sibling.value().as_element() {
263                let name = element.name();
264                if name.starts_with('h') && name.len() == 2 {
265                    break;
266                }
267            }
268
269            // Récupérer le texte
270            for node in sibling.descendants() {
271                if let Some(text) = node.value().as_text() {
272                    let t = text.trim();
273                    if !t.is_empty() {
274                        content.push_str(t);
275                        content.push(' ');
276                    }
277                }
278            }
279
280            current = sibling.next_sibling();
281        }
282
283        content.trim().to_string()
284    }
285}
286
/// Extracted text
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// Full text
    pub full_text: String,
    /// Chunks intended for embeddings
    pub chunks: Vec<String>,
    /// Sections with their headings
    pub sections: Vec<TextSection>,
}
297
/// Text section
#[derive(Debug, Clone)]
pub struct TextSection {
    /// Heading level (1-6)
    pub level: u8,
    /// Section title
    pub title: String,
    /// Section content
    pub content: String,
}
308
/// Counts the words in `text`, where a word is a maximal run of
/// non-whitespace characters.
pub fn word_count(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|word| !word.is_empty())
        .count()
}
313
/// Counts the characters in `text`, ignoring all whitespace.
pub fn char_count(text: &str) -> usize {
    text.split_whitespace()
        .map(|word| word.chars().count())
        .sum()
}
318
/// Guesses the language of `text` with a simple stop-word heuristic.
///
/// The first 1000 characters are lowercased and split into alphabetic
/// tokens. For each supported language (French, English, German, Spanish)
/// the function counts how many of its ten common words occur as whole
/// tokens. Matching whole tokens rather than substrings keeps "le" from
/// matching "please" or "a" from matching almost any text.
///
/// Returns the ISO 639-1 code (`"fr"`, `"en"`, `"de"`, `"es"`) of the language
/// with the most distinct matches, or `None` when the best language matches
/// fewer than three words. Ties are resolved in the order fr, en, de, es.
pub fn detect_language(text: &str) -> Option<String> {
    // Language code plus common words; the order doubles as tie-break priority.
    const STOP_WORDS: [(&str, [&str; 10]); 4] = [
        ("fr", ["le", "la", "les", "de", "du", "un", "une", "et", "est", "que"]),
        ("en", ["the", "a", "an", "of", "to", "in", "is", "and", "that", "for"]),
        ("de", ["der", "die", "das", "und", "ist", "ein", "eine", "für", "mit", "auf"]),
        ("es", ["el", "la", "los", "de", "un", "una", "que", "es", "en", "por"]),
    ];
    // Minimum number of distinct stop words required to trust the guess.
    const MIN_MATCHES: usize = 3;

    let sample = text.chars().take(1000).collect::<String>().to_lowercase();
    let tokens: HashSet<&str> = sample
        .split(|c: char| !c.is_alphabetic())
        .filter(|token| !token.is_empty())
        .collect();

    let mut best: Option<(&str, usize)> = None;
    for &(code, words) in &STOP_WORDS {
        let hits = words.iter().filter(|w| tokens.contains(*w)).count();
        // Strictly greater: on a tie the earlier language keeps the lead.
        if best.map_or(true, |(_, top)| hits > top) {
            best = Some((code, hits));
        }
    }

    best.filter(|&(_, hits)| hits >= MIN_MATCHES)
        .map(|(code, _)| code.to_string())
}
halldyll_core/parse/text.rs

halldyll_core/parse/
text.rs