use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
/// Extracts readable text from an HTML document, preferring a "main
/// content" element and stripping boilerplate (navigation, ads, etc.).
pub struct TextExtractor {
// CSS selectors tried in priority order to locate the main content element.
content_selectors: Vec<String>,
// CSS selectors for boilerplate; text under any matching element is dropped.
exclude_selectors: Vec<String>,
// When true, `extract` also splits the text into sentence-based chunks.
segment: bool,
// Approximate maximum chunk length, measured in bytes (`String::len`).
chunk_size: usize,
}
impl Default for TextExtractor {
    /// Default configuration: common main-content selectors, a broad set of
    /// boilerplate exclusions, and chunking enabled at 1000 bytes.
    fn default() -> Self {
        // Ordered by priority: semantic containers first, generic ids last.
        let content = [
            "article",
            "main",
            "[role=\"main\"]",
            ".post-content",
            ".entry-content",
            ".article-content",
            ".content",
            "#content",
        ];
        // Boilerplate that should never contribute to the extracted text.
        let exclude = [
            "nav",
            "header",
            "footer",
            "aside",
            ".sidebar",
            ".navigation",
            ".menu",
            ".breadcrumb",
            ".pagination",
            ".comments",
            ".related",
            ".share",
            ".social",
            ".ad",
            ".advertisement",
            "[role=\"navigation\"]",
            "[role=\"banner\"]",
            "[role=\"contentinfo\"]",
            "[role=\"complementary\"]",
        ];
        Self {
            content_selectors: content.iter().map(|s| s.to_string()).collect(),
            exclude_selectors: exclude.iter().map(|s| s.to_string()).collect(),
            segment: true,
            chunk_size: 1000,
        }
    }
}
impl TextExtractor {
pub fn new() -> Self {
Self::default()
}
pub fn with_chunking(mut self, enabled: bool, chunk_size: usize) -> Self {
self.segment = enabled;
self.chunk_size = chunk_size;
self
}
pub fn extract(&self, html: &str) -> ExtractedText {
let document = Html::parse_document(html);
let main_element = self.find_main_content(&document);
let text = if let Some(element) = main_element {
self.extract_from_element(&element)
} else {
self.extract_full_text(&document)
};
let chunks = if self.segment {
self.segment_text(&text)
} else {
vec![text.clone()]
};
let sections = self.extract_sections(&document);
ExtractedText {
full_text: text,
chunks,
sections,
}
}
fn find_main_content<'a>(&self, document: &'a Html) -> Option<ElementRef<'a>> {
for selector_str in &self.content_selectors {
if let Ok(selector) = Selector::parse(selector_str) {
if let Some(element) = document.select(&selector).next() {
return Some(element);
}
}
}
None
}
fn extract_from_element(&self, element: &ElementRef) -> String {
let html = element.inner_html();
let sub_doc = Html::parse_fragment(&html);
let selectors: Vec<_> = self.exclude_selectors
.iter()
.filter_map(|s| Selector::parse(s).ok())
.collect();
let exclude_set: HashSet<_> = selectors.iter()
.flat_map(|sel| sub_doc.select(sel))
.map(|el| el.id())
.collect();
let mut text_parts = Vec::new();
for node in sub_doc.root_element().descendants() {
if let Some(text) = node.value().as_text() {
let mut excluded = false;
let mut parent = node.parent();
while let Some(p) = parent {
if exclude_set.contains(&p.id()) {
excluded = true;
break;
}
parent = p.parent();
}
if !excluded {
let t = text.trim();
if !t.is_empty() {
text_parts.push(t.to_string());
}
}
}
}
text_parts.join(" ")
}
fn extract_full_text(&self, document: &Html) -> String {
let selectors: Vec<_> = self.exclude_selectors
.iter()
.filter_map(|s| Selector::parse(s).ok())
.collect();
let exclude_set: HashSet<_> = selectors.iter()
.flat_map(|sel| document.select(sel))
.map(|el| el.id())
.collect();
let script_sel = Selector::parse("script, style, noscript").unwrap();
let script_ids: HashSet<_> = document.select(&script_sel).map(|el| el.id()).collect();
let mut text_parts = Vec::new();
for node in document.root_element().descendants() {
if let Some(text) = node.value().as_text() {
let mut excluded = false;
let mut parent = node.parent();
while let Some(p) = parent {
if exclude_set.contains(&p.id()) || script_ids.contains(&p.id()) {
excluded = true;
break;
}
parent = p.parent();
}
if !excluded {
let t = text.trim();
if !t.is_empty() {
text_parts.push(t.to_string());
}
}
}
}
text_parts.join(" ")
}
fn segment_text(&self, text: &str) -> Vec<String> {
let mut chunks = Vec::new();
let mut current_chunk = String::new();
for sentence in text.split(|c| c == '.' || c == '!' || c == '?') {
let sentence = sentence.trim();
if sentence.is_empty() {
continue;
}
let sentence_with_punct = format!("{}. ", sentence);
if current_chunk.len() + sentence_with_punct.len() > self.chunk_size {
if !current_chunk.is_empty() {
chunks.push(current_chunk.trim().to_string());
}
current_chunk = sentence_with_punct;
} else {
current_chunk.push_str(&sentence_with_punct);
}
}
if !current_chunk.is_empty() {
chunks.push(current_chunk.trim().to_string());
}
chunks
}
fn extract_sections(&self, document: &Html) -> Vec<TextSection> {
let mut sections = Vec::new();
let heading_sel = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap();
for heading in document.select(&heading_sel) {
let level = heading.value().name().chars().nth(1)
.and_then(|c| c.to_digit(10))
.unwrap_or(1) as u8;
let title = heading.text().collect::<Vec<_>>().join(" ").trim().to_string();
let content = self.extract_section_content(&heading);
sections.push(TextSection {
level,
title,
content,
});
}
sections
}
fn extract_section_content(&self, heading: &ElementRef) -> String {
let mut content = String::new();
let mut current = heading.next_sibling();
while let Some(sibling) = current {
if let Some(element) = sibling.value().as_element() {
let name = element.name();
if name.starts_with('h') && name.len() == 2 {
break;
}
}
for node in sibling.descendants() {
if let Some(text) = node.value().as_text() {
let t = text.trim();
if !t.is_empty() {
content.push_str(t);
content.push(' ');
}
}
}
current = sibling.next_sibling();
}
content.trim().to_string()
}
}
/// Result of `TextExtractor::extract`.
#[derive(Debug, Clone)]
pub struct ExtractedText {
/// Cleaned, space-joined text of the main content (or the whole page).
pub full_text: String,
/// `full_text` split into sentence-packed chunks; when chunking is
/// disabled this is a single-element vec holding a copy of `full_text`.
pub chunks: Vec<String>,
/// Heading-delimited sections found in the document.
pub sections: Vec<TextSection>,
}
/// One heading and the text that follows it, as produced by
/// `TextExtractor::extract`.
#[derive(Debug, Clone)]
pub struct TextSection {
/// Heading level derived from the tag name (h1 -> 1 ... h6 -> 6).
pub level: u8,
/// The heading's own text content.
pub title: String,
/// Text of the sibling nodes between this heading and the next one.
pub content: String,
}
/// Number of whitespace-separated words in `text`.
pub fn word_count(text: &str) -> usize {
    text.split_whitespace().fold(0, |total, _| total + 1)
}
/// Number of non-whitespace characters (Unicode scalar values) in `text`.
pub fn char_count(text: &str) -> usize {
    let mut total = 0;
    for ch in text.chars() {
        if !ch.is_whitespace() {
            total += 1;
        }
    }
    total
}
/// Naive stopword-based language detection over the first 1000 characters.
///
/// Returns an ISO 639-1 code (`"fr"`, `"en"`, `"de"`, `"es"`) or `None`
/// when fewer than three distinct stopwords of any language are present.
/// Ties are resolved in the order fr, en, de, es (as before).
pub fn detect_language(text: &str) -> Option<String> {
    let sample = text.chars().take(1000).collect::<String>().to_lowercase();
    // Bug fix: the previous implementation used substring matching
    // (`sample.contains(w)`), so stopwords like "a", "in", "la" or "es"
    // matched inside almost every word and badly skewed the counts.
    // Tokenize on non-alphabetic characters and match whole words instead.
    let words: HashSet<&str> = sample
        .split(|c: char| !c.is_alphabetic())
        .filter(|w| !w.is_empty())
        .collect();
    let count = |list: &[&str]| list.iter().filter(|w| words.contains(**w)).count();
    let fr_count = count(&["le", "la", "les", "de", "du", "un", "une", "et", "est", "que"]);
    let en_count = count(&["the", "a", "an", "of", "to", "in", "is", "and", "that", "for"]);
    let de_count = count(&["der", "die", "das", "und", "ist", "ein", "eine", "für", "mit", "auf"]);
    let es_count = count(&["el", "la", "los", "de", "un", "una", "que", "es", "en", "por"]);
    let max = fr_count.max(en_count).max(de_count).max(es_count);
    // Require a minimum signal before committing to a language.
    if max < 3 {
        return None;
    }
    if max == fr_count {
        Some("fr".to_string())
    } else if max == en_count {
        Some("en".to_string())
    } else if max == de_count {
        Some("de".to_string())
    } else {
        // `max` is one of the four counts, so this branch is es; the old
        // trailing `None` arm was unreachable.
        Some("es".to_string())
    }
}