1use scraper::{Html, ElementRef, Node};
11use std::collections::HashSet;
12
13use crate::selector::{SELECTORS, try_parse_selector, BOILERPLATE_SELECTORS, CONTENT_SELECTORS};
14use crate::types::{TextContent, ParserConfig, ParserResult};
15
16pub fn extract_text(document: &Html, config: &ParserConfig) -> ParserResult<TextContent> {
22 let main_text = extract_main_content(document, config);
24
25 if !main_text.trim().is_empty() && main_text.split_whitespace().count() > 20 {
27 return Ok(TextContent::from_raw(&main_text));
28 }
29
30 let body_text = extract_body_text(document, config);
32 Ok(TextContent::from_raw(&body_text))
33}
34
35fn extract_main_content(document: &Html, config: &ParserConfig) -> String {
37 for selector_str in &config.content_selectors {
39 if let Some(sel) = try_parse_selector(selector_str) {
40 if let Some(element) = document.select(&sel).next() {
41 let text = extract_element_text(&element, config);
42 if !text.trim().is_empty() {
43 return text;
44 }
45 }
46 }
47 }
48
49 for selector_str in CONTENT_SELECTORS {
51 if let Some(sel) = try_parse_selector(selector_str) {
52 if let Some(element) = document.select(&sel).next() {
53 let text = extract_element_text(&element, config);
54 if !text.trim().is_empty() {
55 return text;
56 }
57 }
58 }
59 }
60
61 String::new()
62}
63
64fn extract_body_text(document: &Html, config: &ParserConfig) -> String {
66 if let Some(body) = document.select(&SELECTORS.body).next() {
67 extract_element_text_filtered(&body, config)
68 } else {
69 String::new()
70 }
71}
72
73fn extract_element_text(element: &ElementRef, config: &ParserConfig) -> String {
75 let mut text = String::new();
76
77 for node in element.descendants() {
78 match node.value() {
79 Node::Text(t) => {
80 let content = t.text.trim();
81 if !content.is_empty() {
82 if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
83 text.push(' ');
84 }
85 text.push_str(content);
86 }
87 }
88 Node::Element(el) => {
89 let tag_name = el.name();
91 if is_block_element(tag_name) && !text.is_empty() && !text.ends_with('\n') {
92 text.push('\n');
93 }
94 }
95 _ => {}
96 }
97 }
98
99 if config.preserve_whitespace {
100 text
101 } else {
102 normalize_text(&text)
103 }
104}
105
106fn extract_element_text_filtered(element: &ElementRef, config: &ParserConfig) -> String {
108 let skip_selectors: Vec<_> = config.remove_selectors.iter()
110 .chain(BOILERPLATE_SELECTORS.iter().map(|s| s.to_string()).collect::<Vec<_>>().iter())
111 .filter_map(|s| try_parse_selector(s))
112 .collect();
113
114 let mut text = String::new();
115 extract_text_recursive(element, &skip_selectors, &mut text, config);
116
117 if config.preserve_whitespace {
118 text
119 } else {
120 normalize_text(&text)
121 }
122}
123
124fn extract_text_recursive(
126 element: &ElementRef,
127 skip_selectors: &[scraper::Selector],
128 text: &mut String,
129 _config: &ParserConfig,
130) {
131 for sel in skip_selectors {
133 if element.select(sel).next().map(|e| e.id() == element.id()).unwrap_or(false) {
134 return;
135 }
136 }
137
138 let tag_name = element.value().name();
140 if should_skip_element(tag_name) {
141 return;
142 }
143
144 if is_block_element(tag_name) && !text.is_empty() && !text.ends_with('\n') {
146 text.push('\n');
147 }
148
149 for child in element.children() {
150 match child.value() {
151 Node::Text(t) => {
152 let content = t.text.trim();
153 if !content.is_empty() {
154 if !text.is_empty() && !text.ends_with(' ') && !text.ends_with('\n') {
155 text.push(' ');
156 }
157 text.push_str(content);
158 }
159 }
160 Node::Element(_) => {
161 if let Some(child_el) = ElementRef::wrap(child) {
162 extract_text_recursive(&child_el, skip_selectors, text, _config);
163 }
164 }
165 _ => {}
166 }
167 }
168}
169
170pub fn normalize_text(text: &str) -> String {
176 let mut result = String::with_capacity(text.len());
177 let mut prev_whitespace = false;
178 let mut in_line_start = true;
179
180 for c in text.chars() {
181 if c == '\n' {
182 if !result.ends_with('\n') {
184 result.push('\n');
185 }
186 prev_whitespace = false;
187 in_line_start = true;
188 } else if c.is_whitespace() {
189 if !prev_whitespace && !in_line_start {
190 result.push(' ');
191 prev_whitespace = true;
192 }
193 } else {
194 result.push(c);
195 prev_whitespace = false;
196 in_line_start = false;
197 }
198 }
199
200 let trimmed = result.trim();
202 collapse_newlines(trimmed)
203}
204
/// Limits every run of consecutive `'\n'` characters to at most two.
///
/// All other characters are copied unchanged.
fn collapse_newlines(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    // Length of the newline run that ends at the current character.
    let mut run = 0;

    for ch in text.chars() {
        run = if ch == '\n' { run + 1 } else { 0 };
        if run <= 2 {
            out.push(ch);
        }
    }

    out
}
224
/// Removes control characters from `text`, keeping newlines and tabs.
pub fn clean_text(text: &str) -> String {
    let mut kept = String::with_capacity(text.len());

    for ch in text.chars() {
        let keep = match ch {
            '\n' | '\t' => true,
            other => !other.is_control(),
        };
        if keep {
            kept.push(ch);
        }
    }

    kept
}
231
232pub fn strip_html_tags(html: &str) -> String {
234 let doc = Html::parse_fragment(html);
235 let mut text = String::new();
236
237 for node in doc.tree.nodes() {
238 if let Some(t) = node.value().as_text() {
239 text.push_str(&t.text);
240 }
241 }
242
243 normalize_text(&text)
244}
245
/// Returns `true` for tags whose subtree is skipped during filtered text
/// extraction. The comparison is case-sensitive.
fn should_skip_element(tag_name: &str) -> bool {
    const SKIPPED_TAGS: &[&str] = &[
        "script", "style", "noscript", "iframe", "object", "embed", "applet", "svg", "canvas",
        "map", "template",
    ];

    SKIPPED_TAGS.iter().any(|tag| *tag == tag_name)
}
257
/// Returns `true` for tags treated as block-level, i.e. those that start a
/// new line in the extracted text. The comparison is case-sensitive.
fn is_block_element(tag_name: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "p", "div", "h1", "h2", "h3", "h4", "h5", "h6",
        "blockquote", "pre", "ul", "ol", "li", "dl", "dt", "dd",
        "table", "tr", "article", "section", "aside",
        "header", "footer", "nav", "main", "figure", "figcaption",
        "address", "hr", "br", "form", "fieldset",
    ];

    BLOCK_TAGS.iter().any(|tag| *tag == tag_name)
}
268
/// Returns `true` for tags treated as inline (phrasing) elements.
/// The comparison is case-sensitive.
pub fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        "a", "span", "em", "strong", "b", "i", "u", "s",
        "mark", "small", "sub", "sup", "code", "kbd", "samp", "var",
        "abbr", "cite", "dfn", "time", "q", "label",
    ];

    INLINE_TAGS.iter().any(|tag| *tag == tag_name)
}
277
278pub fn flesch_reading_ease(text: &str) -> f64 {
285 let words = count_words(text);
286 let sentences = count_sentences(text);
287 let syllables = count_syllables(text);
288
289 if words == 0 || sentences == 0 {
290 return 0.0;
291 }
292
293 let words_f = words as f64;
294 let sentences_f = sentences as f64;
295 let syllables_f = syllables as f64;
296
297 206.835 - 1.015 * (words_f / sentences_f) - 84.6 * (syllables_f / words_f)
298}
299
300pub fn flesch_kincaid_grade(text: &str) -> f64 {
303 let words = count_words(text);
304 let sentences = count_sentences(text);
305 let syllables = count_syllables(text);
306
307 if words == 0 || sentences == 0 {
308 return 0.0;
309 }
310
311 let words_f = words as f64;
312 let sentences_f = sentences as f64;
313 let syllables_f = syllables as f64;
314
315 0.39 * (words_f / sentences_f) + 11.8 * (syllables_f / words_f) - 15.59
316}
317
/// Counts the whitespace-separated words in `text`.
pub fn count_words(text: &str) -> usize {
    let mut total = 0;
    for _word in text.split_whitespace() {
        total += 1;
    }
    total
}
322
/// Estimates the number of sentences in `text`.
///
/// A sentence is counted for every run of consecutive terminator characters
/// (`.`, `!`, `?`), so an ellipsis (`...`) or a combination such as `?!`
/// closes one sentence rather than several. The result is never less than 1,
/// even for text without any terminator.
pub fn count_sentences(text: &str) -> usize {
    let mut sentences = 0usize;
    let mut in_terminator_run = false;

    for c in text.chars() {
        let is_terminator = matches!(c, '.' | '!' | '?');
        // Only the first character of a terminator run ends a sentence.
        if is_terminator && !in_terminator_run {
            sentences += 1;
        }
        in_terminator_run = is_terminator;
    }

    sentences.max(1)
}
330
331fn count_syllables(text: &str) -> usize {
333 text.split_whitespace()
334 .map(count_word_syllables)
335 .sum()
336}
337
/// Estimates the number of syllables in a single word.
///
/// The word is lower-cased and stripped of leading and trailing
/// non-alphabetic characters. An empty remainder counts as 0 syllables and a
/// remainder of at most 3 bytes as 1. Otherwise every group of consecutive
/// vowels (`a`, `e`, `i`, `o`, `u`, `y`) counts as one syllable, a trailing
/// `e` removes one when more than one group was found, and the result is at
/// least 1.
fn count_word_syllables(word: &str) -> usize {
    let lowered = word.to_lowercase();
    let letters = lowered.trim_matches(|ch: char| !ch.is_alphabetic());

    match letters.len() {
        0 => return 0,
        1..=3 => return 1,
        _ => {}
    }

    let vowel_set: HashSet<char> = "aeiouy".chars().collect();

    // Count transitions from a non-vowel (or the word start) into a vowel.
    let (mut groups, _) = letters
        .chars()
        .fold((0usize, false), |(groups, in_group), ch| {
            let vowel = vowel_set.contains(&ch);
            let groups = if vowel && !in_group { groups + 1 } else { groups };
            (groups, vowel)
        });

    // A trailing `e` is treated as silent when another vowel group exists.
    if groups > 1 && letters.ends_with('e') {
        groups -= 1;
    }

    groups.max(1)
}
370
/// Guesses the language of `text` from the stopwords it contains.
///
/// Only the first 100 whitespace-separated words are inspected. Each word is
/// stripped of leading and trailing non-alphabetic characters and lower-cased.
/// The score of a language is the number of distinct entries of its stopword
/// list that occur as whole words in that sample.
///
/// Returns the code of the best-scoring language (`"en"`, `"fr"`, `"de"` or
/// `"es"`), or `None` when no language reaches a score of 3. Ties are
/// resolved in the order English, French, German, Spanish.
pub fn detect_language(text: &str) -> Option<String> {
    const ENGLISH: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "of", "to", "in",
        "for", "on", "with", "at", "by", "from", "and", "or", "but", "not",
    ];
    const FRENCH: &[&str] = &[
        "le", "la", "les", "un", "une", "des", "de", "du", "est", "sont",
        "était", "étaient", "être", "avoir", "a", "ont", "fait", "faire",
        "dit", "dire", "que", "qui", "quoi", "où", "quand", "comment",
        "pour", "sur", "avec", "dans", "par", "et", "ou", "mais", "ne", "pas",
    ];
    const GERMAN: &[&str] = &[
        "der", "die", "das", "ein", "eine", "ist", "sind", "war", "waren",
        "sein", "haben", "hat", "hatte", "hatten", "werden", "wird", "wurde",
        "und", "oder", "aber", "nicht", "für", "auf", "mit", "in", "an", "von",
        "zu", "bei", "nach", "aus", "über", "durch", "wenn", "als", "ob",
    ];
    const SPANISH: &[&str] = &[
        "el", "la", "los", "las", "un", "una", "unos", "unas", "es", "son",
        "era", "eran", "ser", "estar", "tener", "tiene", "hacer", "hecho",
        "que", "qué", "quien", "quién", "donde", "dónde", "cuando", "cuándo",
        "para", "por", "con", "en", "de", "y", "o", "pero", "no", "si",
    ];
    // Minimum number of distinct stopwords required to report a language.
    const MIN_HITS: usize = 3;

    // Whole-word sample. Substring search on the raw text would let short
    // stopwords such as "a", "o" or "in" match inside unrelated words, and
    // attached punctuation would hide real matches.
    let sample: HashSet<String> = text
        .split_whitespace()
        .take(100)
        .map(|word| {
            word.trim_matches(|c: char| !c.is_alphabetic())
                .to_lowercase()
        })
        .filter(|word| !word.is_empty())
        .collect();

    if sample.is_empty() {
        return None;
    }

    let score = |stopwords: &[&str]| {
        stopwords
            .iter()
            .filter(|stopword| sample.contains(**stopword))
            .count()
    };

    // Listed in tie-break priority order.
    let candidates = [
        ("en", score(ENGLISH)),
        ("fr", score(FRENCH)),
        ("de", score(GERMAN)),
        ("es", score(SPANISH)),
    ];

    let mut best: Option<(&str, usize)> = None;
    for &(code, hits) in candidates.iter() {
        // Strictly greater, so an earlier language wins a tie.
        let improves = match best {
            Some((_, top)) => hits > top,
            None => true,
        };
        if hits >= MIN_HITS && improves {
            best = Some((code, hits));
        }
    }

    best.map(|(code, _)| code.to_string())
}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a complete HTML document for the extraction tests.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // ---- text extraction -------------------------------------------------

    #[test]
    fn test_extract_text_simple() {
        let document = parse_html("<html><body><p>Hello world</p></body></html>");
        let extracted = extract_text(&document, &ParserConfig::default()).unwrap();
        assert!(extracted.cleaned_text.contains("Hello world"));
    }

    #[test]
    fn test_extract_text_from_article() {
        let markup = r#"
            <html>
            <body>
            <nav>Navigation here</nav>
            <article>
            <h1>Title</h1>
            <p>This is the main content of the article.</p>
            <p>Another paragraph with more content.</p>
            </article>
            <footer>Footer here</footer>
            </body>
            </html>
            "#;
        let document = parse_html(markup);
        let extracted = extract_text(&document, &ParserConfig::default()).unwrap();
        assert!(extracted.cleaned_text.contains("main content"));
    }

    #[test]
    fn test_extract_text_skips_script() {
        let markup = r#"
            <html>
            <body>
            <p>Visible text</p>
            <script>var x = "invisible";</script>
            <p>More visible text</p>
            </body>
            </html>
            "#;
        let document = parse_html(markup);
        let extracted = extract_text(&document, &ParserConfig::default()).unwrap();
        assert!(extracted.cleaned_text.contains("Visible text"));
        assert!(!extracted.cleaned_text.contains("invisible"));
    }

    // ---- text clean-up ---------------------------------------------------

    #[test]
    fn test_normalize_text() {
        let normalized = normalize_text(" Hello world \n\n\n multiple spaces ");
        assert_eq!(normalized, "Hello world \nmultiple spaces");
    }

    #[test]
    fn test_clean_text() {
        let raw = "Hello\x00World\x01Test\nNewline";
        assert_eq!(clean_text(raw), "HelloWorldTest\nNewline");
    }

    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("<p>Hello <strong>world</strong></p>");
        assert_eq!(stripped, "Hello world");
    }

    // ---- counting and readability ----------------------------------------

    #[test]
    fn test_count_words() {
        let cases = [("Hello world test", 3), ("One", 1), (" ", 0)];
        for (input, expected) in cases.iter() {
            assert_eq!(count_words(input), *expected);
        }
    }

    #[test]
    fn test_count_sentences() {
        let cases = [("Hello. World! How?", 3), ("No punctuation", 1)];
        for (input, expected) in cases.iter() {
            assert_eq!(count_sentences(input), *expected);
        }
    }

    #[test]
    fn test_flesch_reading_ease() {
        let score = flesch_reading_ease("The cat sat on the mat. The dog ran fast.");
        assert!(score > 60.0, "Simple text should be easy to read: {}", score);
    }

    #[test]
    fn test_flesch_kincaid_grade() {
        let grade = flesch_kincaid_grade("The cat sat. The dog ran.");
        assert!(grade < 6.0, "Simple text should be low grade level: {}", grade);
    }

    #[test]
    fn test_count_word_syllables() {
        let cases = [
            ("cat", 1),
            ("hello", 2),
            ("beautiful", 3),
            ("extraordinary", 5),
        ];
        for (word, expected) in cases.iter() {
            assert_eq!(count_word_syllables(word), *expected);
        }
    }

    // ---- language detection ----------------------------------------------

    #[test]
    fn test_detect_language_english() {
        let sample = "The quick brown fox jumps over the lazy dog. This is a test of the English language detection system.";
        assert_eq!(detect_language(sample), Some("en".to_string()));
    }

    #[test]
    fn test_detect_language_french() {
        let sample = "Le chat est sur la table. C'est un beau jour pour une promenade dans le parc.";
        assert_eq!(detect_language(sample), Some("fr".to_string()));
    }

    #[test]
    fn test_detect_language_german() {
        let sample = "Der Hund ist auf dem Tisch. Das ist ein schöner Tag für einen Spaziergang im Park.";
        assert_eq!(detect_language(sample), Some("de".to_string()));
    }

    #[test]
    fn test_detect_language_spanish() {
        let sample = "El gato está en la mesa. Es un buen día para un paseo en el parque.";
        assert_eq!(detect_language(sample), Some("es".to_string()));
    }

    #[test]
    fn test_detect_language_insufficient() {
        assert_eq!(detect_language("xyz abc 123"), None);
    }

    // ---- tag classification ----------------------------------------------

    #[test]
    fn test_is_block_element() {
        for tag in ["p", "div", "h1"].iter() {
            assert!(is_block_element(tag));
        }
        for tag in ["span", "a"].iter() {
            assert!(!is_block_element(tag));
        }
    }

    #[test]
    fn test_is_inline_element() {
        for tag in ["span", "a", "strong"].iter() {
            assert!(is_inline_element(tag));
        }
        for tag in ["div", "p"].iter() {
            assert!(!is_inline_element(tag));
        }
    }

    #[test]
    fn test_should_skip_element() {
        for tag in ["script", "style", "noscript"].iter() {
            assert!(should_skip_element(tag));
        }
        for tag in ["p", "div"].iter() {
            assert!(!should_skip_element(tag));
        }
    }

    // ---- TextContent -----------------------------------------------------

    #[test]
    fn test_text_content_reading_time() {
        let body = "word ".repeat(225);
        let content = TextContent::from_raw(&body);
        let minutes = content.reading_time_minutes.unwrap();
        assert!((minutes - 1.0).abs() < 0.1);
    }

    #[test]
    fn test_text_content_word_count() {
        let content = TextContent::from_raw("Hello world test");
        assert_eq!(content.word_count, 3);
    }
}