halldyll_parser/
text.rs

//! Text extraction and processing for halldyll-parser
//!
//! This module handles:
//! - Text extraction from HTML documents
//! - Boilerplate removal (nav, footer, ads, etc.)
//! - Text cleaning and normalization
//! - Readability scoring
//! - Language detection (basic)
9
10use scraper::{Html, ElementRef, Node};
11use std::collections::HashSet;
12
13use crate::selector::{SELECTORS, try_parse_selector, BOILERPLATE_SELECTORS, CONTENT_SELECTORS};
14use crate::types::{TextContent, ParserConfig, ParserResult};
15
16// ============================================================================
17// MAIN TEXT EXTRACTION
18// ============================================================================
19
20/// Extract main text content from HTML document
21pub fn extract_text(document: &Html, config: &ParserConfig) -> ParserResult<TextContent> {
22    // First, try to find main content area
23    let main_text = extract_main_content(document, config);
24    
25    // If we got substantial content, use it
26    if !main_text.trim().is_empty() && main_text.split_whitespace().count() > 20 {
27        return Ok(TextContent::from_raw(&main_text));
28    }
29    
30    // Fallback to body with boilerplate removal
31    let body_text = extract_body_text(document, config);
32    Ok(TextContent::from_raw(&body_text))
33}
34
35/// Extract text from main content area (article, main, etc.)
36fn extract_main_content(document: &Html, config: &ParserConfig) -> String {
37    // Try configured content selectors first
38    for selector_str in &config.content_selectors {
39        if let Some(sel) = try_parse_selector(selector_str) {
40            if let Some(element) = document.select(&sel).next() {
41                let text = extract_element_text(&element, config);
42                if !text.trim().is_empty() {
43                    return text;
44                }
45            }
46        }
47    }
48    
49    // Try default content selectors
50    for selector_str in CONTENT_SELECTORS {
51        if let Some(sel) = try_parse_selector(selector_str) {
52            if let Some(element) = document.select(&sel).next() {
53                let text = extract_element_text(&element, config);
54                if !text.trim().is_empty() {
55                    return text;
56                }
57            }
58        }
59    }
60    
61    String::new()
62}
63
64/// Extract text from body with boilerplate removal
65fn extract_body_text(document: &Html, config: &ParserConfig) -> String {
66    if let Some(body) = document.select(&SELECTORS.body).next() {
67        extract_element_text_filtered(&body, config)
68    } else {
69        String::new()
70    }
71}
72
73/// Extract text from an element, preserving structure
74fn extract_element_text(element: &ElementRef, config: &ParserConfig) -> String {
75    let mut text = String::new();
76    
77    for node in element.descendants() {
78        match node.value() {
79            Node::Text(t) => {
80                let content = t.text.trim();
81                if !content.is_empty() {
82                    if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
83                        text.push(' ');
84                    }
85                    text.push_str(content);
86                }
87            }
88            Node::Element(el) => {
89                // Add line breaks for block elements
90                let tag_name = el.name();
91                if is_block_element(tag_name) && !text.is_empty() && !text.ends_with('\n') {
92                    text.push('\n');
93                }
94            }
95            _ => {}
96        }
97    }
98    
99    if config.preserve_whitespace {
100        text
101    } else {
102        normalize_text(&text)
103    }
104}
105
106/// Extract text from element, filtering out boilerplate
107fn extract_element_text_filtered(element: &ElementRef, config: &ParserConfig) -> String {
108    // Collect IDs/classes of elements to skip
109    let skip_selectors: Vec<_> = config.remove_selectors.iter()
110        .chain(BOILERPLATE_SELECTORS.iter().map(|s| s.to_string()).collect::<Vec<_>>().iter())
111        .filter_map(|s| try_parse_selector(s))
112        .collect();
113    
114    let mut text = String::new();
115    extract_text_recursive(element, &skip_selectors, &mut text, config);
116    
117    if config.preserve_whitespace {
118        text
119    } else {
120        normalize_text(&text)
121    }
122}
123
124/// Recursively extract text, skipping boilerplate elements
125fn extract_text_recursive(
126    element: &ElementRef,
127    skip_selectors: &[scraper::Selector],
128    text: &mut String,
129    _config: &ParserConfig,
130) {
131    // Check if this element should be skipped
132    for sel in skip_selectors {
133        if element.select(sel).next().map(|e| e.id() == element.id()).unwrap_or(false) {
134            return;
135        }
136    }
137    
138    // Check element name
139    let tag_name = element.value().name();
140    if should_skip_element(tag_name) {
141        return;
142    }
143    
144    // Add block element spacing
145    if is_block_element(tag_name) && !text.is_empty() && !text.ends_with('\n') {
146        text.push('\n');
147    }
148    
149    for child in element.children() {
150        match child.value() {
151            Node::Text(t) => {
152                let content = t.text.trim();
153                if !content.is_empty() {
154                    if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
155                        text.push(' ');
156                    }
157                    text.push_str(content);
158                }
159            }
160            Node::Element(_) => {
161                if let Some(child_el) = ElementRef::wrap(child) {
162                    extract_text_recursive(&child_el, skip_selectors, text, _config);
163                }
164            }
165            _ => {}
166        }
167    }
168}
169
170// ============================================================================
171// TEXT PROCESSING
172// ============================================================================
173
174/// Normalize text (collapse whitespace, trim)
175pub fn normalize_text(text: &str) -> String {
176    let mut result = String::with_capacity(text.len());
177    let mut prev_whitespace = false;
178    let mut in_line_start = true;
179    
180    for c in text.chars() {
181        if c == '\n' {
182            // Preserve single newlines, collapse multiple
183            if !result.ends_with('\n') {
184                result.push('\n');
185            }
186            prev_whitespace = false;
187            in_line_start = true;
188        } else if c.is_whitespace() {
189            if !prev_whitespace && !in_line_start {
190                result.push(' ');
191                prev_whitespace = true;
192            }
193        } else {
194            result.push(c);
195            prev_whitespace = false;
196            in_line_start = false;
197        }
198    }
199    
200    // Trim and collapse multiple newlines
201    let trimmed = result.trim();
202    collapse_newlines(trimmed)
203}
204
/// Limit every run of consecutive `'\n'` characters to at most two.
///
/// All other characters are copied through unchanged.
fn collapse_newlines(text: &str) -> String {
    // Length of the newline run that ends at the current character.
    let mut run = 0usize;
    let mut collapsed = String::with_capacity(text.len());

    collapsed.extend(text.chars().filter(|&ch| {
        run = if ch == '\n' { run + 1 } else { 0 };
        run <= 2
    }));

    collapsed
}
224
/// Remove control characters from `text`, keeping newlines and tabs.
///
/// Every character for which `char::is_control` is true is dropped, except
/// `'\n'` and `'\t'`. Note that `'\r'` is therefore removed.
pub fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        let keep = matches!(ch, '\n' | '\t') || !ch.is_control();
        if keep {
            cleaned.push(ch);
        }
    }
    cleaned
}
231
232/// Strip HTML tags from text (for cases where we have HTML strings)
233pub fn strip_html_tags(html: &str) -> String {
234    let doc = Html::parse_fragment(html);
235    let mut text = String::new();
236    
237    for node in doc.tree.nodes() {
238        if let Some(t) = node.value().as_text() {
239            text.push_str(&t.text);
240        }
241    }
242    
243    normalize_text(&text)
244}
245
246// ============================================================================
247// ELEMENT CLASSIFICATION
248// ============================================================================
249
/// Report whether an element with this tag name should be skipped entirely,
/// together with everything inside it.
///
/// The comparison is exact (case-sensitive) against a fixed list of
/// non-textual tags.
fn should_skip_element(tag_name: &str) -> bool {
    const SKIPPED_TAGS: &[&str] = &[
        "script", "style", "noscript", "iframe", "object", "embed", "applet", "svg", "canvas",
        "map", "template",
    ];

    SKIPPED_TAGS.iter().any(|tag| *tag == tag_name)
}
257
/// Report whether a tag name denotes a block-level element, i.e. one that
/// should start on a new line in extracted text.
///
/// The comparison is exact (case-sensitive) against a fixed list.
fn is_block_element(tag_name: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "p", "div", "h1", "h2", "h3", "h4", "h5", "h6",
        "blockquote", "pre", "ul", "ol", "li", "dl", "dt", "dd",
        "table", "tr", "article", "section", "aside",
        "header", "footer", "nav", "main", "figure", "figcaption",
        "address", "hr", "br", "form", "fieldset",
    ];

    BLOCK_TAGS.iter().any(|tag| *tag == tag_name)
}
268
/// Report whether a tag name denotes an inline element.
///
/// The comparison is exact (case-sensitive) against a fixed list.
pub fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        "a", "span", "em", "strong", "b", "i", "u", "s",
        "mark", "small", "sub", "sup", "code", "kbd", "samp", "var",
        "abbr", "cite", "dfn", "time", "q", "label",
    ];

    INLINE_TAGS.iter().any(|tag| *tag == tag_name)
}
277
278// ============================================================================
279// READABILITY SCORING
280// ============================================================================
281
282/// Calculate Flesch-Kincaid Reading Ease score
283/// Higher score = easier to read (0-100+)
284pub fn flesch_reading_ease(text: &str) -> f64 {
285    let words = count_words(text);
286    let sentences = count_sentences(text);
287    let syllables = count_syllables(text);
288    
289    if words == 0 || sentences == 0 {
290        return 0.0;
291    }
292    
293    let words_f = words as f64;
294    let sentences_f = sentences as f64;
295    let syllables_f = syllables as f64;
296    
297    206.835 - 1.015 * (words_f / sentences_f) - 84.6 * (syllables_f / words_f)
298}
299
300/// Calculate Flesch-Kincaid Grade Level
301/// Returns US school grade level needed to understand text
302pub fn flesch_kincaid_grade(text: &str) -> f64 {
303    let words = count_words(text);
304    let sentences = count_sentences(text);
305    let syllables = count_syllables(text);
306    
307    if words == 0 || sentences == 0 {
308        return 0.0;
309    }
310    
311    let words_f = words as f64;
312    let sentences_f = sentences as f64;
313    let syllables_f = syllables as f64;
314    
315    0.39 * (words_f / sentences_f) + 11.8 * (syllables_f / words_f) - 15.59
316}
317
/// Count the words in `text`, where a word is any maximal run of
/// non-whitespace characters.
pub fn count_words(text: &str) -> usize {
    let mut total = 0;
    for _word in text.split_whitespace() {
        total += 1;
    }
    total
}
322
/// Estimate the number of sentences in `text`.
///
/// A sentence end is a run of one or more terminator characters (`.`, `!`, `?`),
/// so an ellipsis (`...`) or `?!` closes a single sentence instead of being
/// counted once per character. The result is never less than 1, so text with no
/// terminator at all counts as one sentence.
///
/// This is a heuristic: periods inside abbreviations or decimal numbers still
/// count as sentence ends.
pub fn count_sentences(text: &str) -> usize {
    let mut sentences = 0usize;
    let mut in_terminator_run = false;

    for c in text.chars() {
        let is_terminator = matches!(c, '.' | '!' | '?');
        // Count only the first character of each terminator run.
        if is_terminator && !in_terminator_run {
            sentences += 1;
        }
        in_terminator_run = is_terminator;
    }

    sentences.max(1)
}
330
331/// Estimate syllable count (English approximation)
332fn count_syllables(text: &str) -> usize {
333    text.split_whitespace()
334        .map(count_word_syllables)
335        .sum()
336}
337
/// Estimate the number of syllables in a single word (rough English heuristic).
///
/// The word is lowercased and stripped of leading/trailing non-alphabetic
/// characters. Then:
/// - an empty remainder yields 0;
/// - a remainder of at most 3 bytes yields 1;
/// - otherwise each group of consecutive vowels (`a e i o u y`) counts as one
///   syllable, a trailing `e` is treated as silent when more than one group was
///   found, and the result is at least 1.
fn count_word_syllables(word: &str) -> usize {
    let lowered = word.to_lowercase();
    let letters = lowered.trim_matches(|c: char| !c.is_alphabetic());

    if letters.is_empty() {
        return 0;
    }

    // `len()` is a byte length, so this shortcut covers words of up to three
    // ASCII letters; short words with multi-byte letters go through the scan below.
    if letters.len() <= 3 {
        return 1;
    }

    let mut groups = 0usize;
    let mut prev_vowel = false;

    for c in letters.chars() {
        // A plain pattern match avoids building a lookup set for every word.
        let is_vowel = matches!(c, 'a' | 'e' | 'i' | 'o' | 'u' | 'y');
        if is_vowel && !prev_vowel {
            groups += 1;
        }
        prev_vowel = is_vowel;
    }

    // Adjust for a silent trailing 'e' ("make", "house").
    if letters.ends_with('e') && groups > 1 {
        groups -= 1;
    }

    groups.max(1)
}
370
371// ============================================================================
372// LANGUAGE DETECTION (BASIC)
373// ============================================================================
374
/// Simple language detection based on common words.
///
/// The first 100 whitespace-separated words of `text` are lowercased and
/// stripped of leading/trailing non-alphanumeric characters (so `"dog."` counts
/// as `"dog"`). For each supported language the score is the number of distinct
/// entries of its common-word list that occur as whole words in that sample.
///
/// Returns the ISO 639-1 code (`"en"`, `"fr"`, `"de"` or `"es"`) of the
/// best-scoring language, or `None` when the sample is empty or the best score
/// is below 3. Ties are resolved in the order English, French, German, Spanish.
pub fn detect_language(text: &str) -> Option<String> {
    // Common words by language.
    const ENGLISH: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "of", "to", "in",
        "for", "on", "with", "at", "by", "from", "and", "or", "but", "not",
    ];
    const FRENCH: &[&str] = &[
        "le", "la", "les", "un", "une", "des", "de", "du", "est", "sont",
        "était", "étaient", "être", "avoir", "a", "ont", "fait", "faire",
        "dit", "dire", "que", "qui", "quoi", "où", "quand", "comment",
        "pour", "sur", "avec", "dans", "par", "et", "ou", "mais", "ne", "pas",
    ];
    const GERMAN: &[&str] = &[
        "der", "die", "das", "ein", "eine", "ist", "sind", "war", "waren",
        "sein", "haben", "hat", "hatte", "hatten", "werden", "wird", "wurde",
        "und", "oder", "aber", "nicht", "für", "auf", "mit", "in", "an", "von",
        "zu", "bei", "nach", "aus", "über", "durch", "wenn", "als", "ob",
    ];
    const SPANISH: &[&str] = &[
        "el", "la", "los", "las", "un", "una", "unos", "unas", "es", "son",
        "era", "eran", "ser", "estar", "tener", "tiene", "hacer", "hecho",
        "que", "qué", "quien", "quién", "donde", "dónde", "cuando", "cuándo",
        "para", "por", "con", "en", "de", "y", "o", "pero", "no", "si",
    ];

    // Whole-word sample: substring matching would let short entries such as
    // "a", "an" or "in" match inside unrelated words.
    let sample: HashSet<String> = text
        .split_whitespace()
        .take(100) // Sample first 100 words
        .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase())
        .filter(|w| !w.is_empty())
        .collect();

    if sample.is_empty() {
        return None;
    }

    // Number of distinct list entries present in the sample.
    let hits = |stopwords: &[&str]| stopwords.iter().filter(|w| sample.contains(**w)).count();

    // Order matters: it is the tie-break order.
    let scores = [
        ("en", hits(ENGLISH)),
        ("fr", hits(FRENCH)),
        ("de", hits(GERMAN)),
        ("es", hits(SPANISH)),
    ];

    let best = scores.iter().map(|&(_, n)| n).max().unwrap_or(0);
    if best < 3 {
        return None; // Not enough confidence
    }

    scores
        .iter()
        .find(|&&(_, n)| n == best)
        .map(|&(code, _)| code.to_string())
}
436
437// ============================================================================
438// TESTS
439// ============================================================================
440
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse a complete HTML document for a test case.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_text_simple() {
        let config = ParserConfig::default();
        let doc = parse_html("<html><body><p>Hello world</p></body></html>");
        let extracted = extract_text(&doc, &config).unwrap();
        assert!(extracted.cleaned_text.contains("Hello world"));
    }

    #[test]
    fn test_extract_text_from_article() {
        let config = ParserConfig::default();
        let doc = parse_html(r#"
            <html>
            <body>
                <nav>Navigation here</nav>
                <article>
                    <h1>Title</h1>
                    <p>This is the main content of the article.</p>
                    <p>Another paragraph with more content.</p>
                </article>
                <footer>Footer here</footer>
            </body>
            </html>
        "#);
        let extracted = extract_text(&doc, &config).unwrap();
        assert!(extracted.cleaned_text.contains("main content"));
    }

    #[test]
    fn test_extract_text_skips_script() {
        let config = ParserConfig::default();
        let doc = parse_html(r#"
            <html>
            <body>
                <p>Visible text</p>
                <script>var x = "invisible";</script>
                <p>More visible text</p>
            </body>
            </html>
        "#);
        let extracted = extract_text(&doc, &config).unwrap();
        assert!(extracted.cleaned_text.contains("Visible text"));
        assert!(!extracted.cleaned_text.contains("invisible"));
    }

    #[test]
    fn test_normalize_text() {
        // Outer whitespace is trimmed, runs of spaces collapse to one, and the
        // run of newlines collapses to a single newline. The space that
        // precedes the newline is kept.
        let normalized = normalize_text("  Hello   world  \n\n\n  multiple   spaces  ");
        assert_eq!(normalized, "Hello world \nmultiple spaces");
    }

    #[test]
    fn test_clean_text() {
        // NUL and SOH are dropped; the newline is kept.
        let cleaned = clean_text("Hello\x00World\x01Test\nNewline");
        assert_eq!(cleaned, "HelloWorldTest\nNewline");
    }

    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("<p>Hello <strong>world</strong></p>");
        assert_eq!(stripped, "Hello world");
    }

    #[test]
    fn test_count_words() {
        let cases = [("Hello world test", 3), ("One", 1), ("   ", 0)];
        for (input, expected) in cases.iter() {
            assert_eq!(count_words(input), *expected);
        }
    }

    #[test]
    fn test_count_sentences() {
        let cases = [("Hello. World! How?", 3), ("No punctuation", 1)];
        for (input, expected) in cases.iter() {
            assert_eq!(count_sentences(input), *expected);
        }
    }

    #[test]
    fn test_flesch_reading_ease() {
        // Short words in short sentences should score as easy to read.
        let score = flesch_reading_ease("The cat sat on the mat. The dog ran fast.");
        assert!(score > 60.0, "Simple text should be easy to read: {}", score);
    }

    #[test]
    fn test_flesch_kincaid_grade() {
        let grade = flesch_kincaid_grade("The cat sat. The dog ran.");
        assert!(grade < 6.0, "Simple text should be low grade level: {}", grade);
    }

    #[test]
    fn test_count_word_syllables() {
        // Expected values are what the vowel-group heuristic produces, which is
        // not always the dictionary syllable count ("extraordinary" yields 5).
        let cases = [
            ("cat", 1),
            ("hello", 2),
            ("beautiful", 3),
            ("extraordinary", 5),
        ];
        for (word, expected) in cases.iter() {
            assert_eq!(count_word_syllables(word), *expected);
        }
    }

    #[test]
    fn test_detect_language_english() {
        let sample = "The quick brown fox jumps over the lazy dog. This is a test of the English language detection system.";
        assert_eq!(detect_language(sample), Some("en".to_string()));
    }

    #[test]
    fn test_detect_language_french() {
        let sample = "Le chat est sur la table. C'est un beau jour pour une promenade dans le parc.";
        assert_eq!(detect_language(sample), Some("fr".to_string()));
    }

    #[test]
    fn test_detect_language_german() {
        let sample = "Der Hund ist auf dem Tisch. Das ist ein schöner Tag für einen Spaziergang im Park.";
        assert_eq!(detect_language(sample), Some("de".to_string()));
    }

    #[test]
    fn test_detect_language_spanish() {
        let sample = "El gato está en la mesa. Es un buen día para un paseo en el parque.";
        assert_eq!(detect_language(sample), Some("es".to_string()));
    }

    #[test]
    fn test_detect_language_insufficient() {
        assert_eq!(detect_language("xyz abc 123"), None);
    }

    #[test]
    fn test_is_block_element() {
        for tag in ["p", "div", "h1"].iter() {
            assert!(is_block_element(tag));
        }
        for tag in ["span", "a"].iter() {
            assert!(!is_block_element(tag));
        }
    }

    #[test]
    fn test_is_inline_element() {
        for tag in ["span", "a", "strong"].iter() {
            assert!(is_inline_element(tag));
        }
        for tag in ["div", "p"].iter() {
            assert!(!is_inline_element(tag));
        }
    }

    #[test]
    fn test_should_skip_element() {
        for tag in ["script", "style", "noscript"].iter() {
            assert!(should_skip_element(tag));
        }
        for tag in ["p", "div"].iter() {
            assert!(!should_skip_element(tag));
        }
    }

    #[test]
    fn test_text_content_reading_time() {
        // 225 words should come out at roughly one minute of reading time.
        let raw = "word ".repeat(225);
        let content = TextContent::from_raw(&raw);
        let minutes = content.reading_time_minutes.unwrap();
        assert!((minutes - 1.0).abs() < 0.1);
    }

    #[test]
    fn test_text_content_word_count() {
        let content = TextContent::from_raw("Hello world test");
        assert_eq!(content.word_count, 3);
    }
}