use scraper::{ElementRef, Html, Selector};
use std::collections::HashSet;
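/// Extracts readable text from HTML documents, preferring a recognizable
/// main-content region and skipping navigational and boilerplate elements.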
pub struct TextExtractor {
    content_selectors: Vec<String>,
    exclude_selectors: Vec<String>,
    segment: bool,
    chunk_size: usize,
}
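// The defaults target common article/CMS markup: selectors that usually wrap
// the main content, and selectors that usually wrap boilerplate to exclude.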
impl Default for TextExtractor {
    fn default() -> Self {
        Self {
            content_selectors: vec![
                "article".to_string(),
                "main".to_string(),
                "[role=\"main\"]".to_string(),
                ".post-content".to_string(),
                ".entry-content".to_string(),
                ".article-content".to_string(),
                ".content".to_string(),
                "#content".to_string(),
            ],
            exclude_selectors: vec![
                "nav".to_string(),
                "header".to_string(),
                "footer".to_string(),
                "aside".to_string(),
                ".sidebar".to_string(),
                ".navigation".to_string(),
                ".menu".to_string(),
                ".breadcrumb".to_string(),
                ".pagination".to_string(),
                ".comments".to_string(),
                ".related".to_string(),
                ".share".to_string(),
                ".social".to_string(),
                ".ad".to_string(),
                ".advertisement".to_string(),
                "[role=\"navigation\"]".to_string(),
                "[role=\"banner\"]".to_string(),
                "[role=\"contentinfo\"]".to_string(),
                "[role=\"complementary\"]".to_string(),
            ],
            segment: true,
            chunk_size: 1000,
        }
    }
}

impl TextExtractor {
    pub fn new() -> Self {
        Self::default()
    }
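    /// Enables or disables sentence-based chunking and sets the target chunk
    /// size (in bytes, as compared against `String::len`).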
    pub fn with_chunking(mut self, enabled: bool, chunk_size: usize) -> Self {
        self.segment = enabled;
        self.chunk_size = chunk_size;
        self
    }
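    /// Extracts text from `html`: the full text (taken from the main content
    /// region when one is found, otherwise from the whole page), sentence-based
    /// chunks, and heading-delimited sections.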
    pub fn extract(&self, html: &str) -> ExtractedText {
        let document = Html::parse_document(html);

        let main_element = self.find_main_content(&document);

        let text = if let Some(element) = main_element {
            self.extract_from_element(&element)
        } else {
            self.extract_full_text(&document)
        };

        let chunks = if self.segment {
            self.segment_text(&text)
        } else {
            vec![text.clone()]
        };

        let sections = self.extract_sections(&document);

        ExtractedText {
            full_text: text,
            chunks,
            sections,
        }
    }
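    /// Returns the first element matching any of the content selectors, in
    /// priority order.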
    fn find_main_content<'a>(&self, document: &'a Html) -> Option<ElementRef<'a>> {
        for selector_str in &self.content_selectors {
            if let Ok(selector) = Selector::parse(selector_str) {
                if let Some(element) = document.select(&selector).next() {
                    return Some(element);
                }
            }
        }
        None
    }
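    /// Re-parses the element's inner HTML as a fragment and collects its text
    /// nodes, skipping any that sit inside an excluded element.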
    fn extract_from_element(&self, element: &ElementRef) -> String {
        let html = element.inner_html();
        let sub_doc = Html::parse_fragment(&html);

        let selectors: Vec<_> = self.exclude_selectors
            .iter()
            .filter_map(|s| Selector::parse(s).ok())
            .collect();

        let exclude_set: HashSet<_> = selectors.iter()
            .flat_map(|sel| sub_doc.select(sel))
            .map(|el| el.id())
            .collect();

        let mut text_parts = Vec::new();

        for node in sub_doc.root_element().descendants() {
            if let Some(text) = node.value().as_text() {
                let mut excluded = false;
                let mut parent = node.parent();
                while let Some(p) = parent {
                    if exclude_set.contains(&p.id()) {
                        excluded = true;
                        break;
                    }
                    parent = p.parent();
                }

                if !excluded {
                    let t = text.trim();
                    if !t.is_empty() {
                        text_parts.push(t.to_string());
                    }
                }
            }
        }

        text_parts.join(" ")
    }
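    /// Fallback when no main-content element is found: collects text from the
    /// whole document, skipping excluded elements plus scripts and styles.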
    fn extract_full_text(&self, document: &Html) -> String {
        let selectors: Vec<_> = self.exclude_selectors
            .iter()
            .filter_map(|s| Selector::parse(s).ok())
            .collect();

        let exclude_set: HashSet<_> = selectors.iter()
            .flat_map(|sel| document.select(sel))
            .map(|el| el.id())
            .collect();

        let script_sel = Selector::parse("script, style, noscript").unwrap();
        let script_ids: HashSet<_> = document.select(&script_sel).map(|el| el.id()).collect();

        let mut text_parts = Vec::new();

        for node in document.root_element().descendants() {
            if let Some(text) = node.value().as_text() {
                let mut excluded = false;
                let mut parent = node.parent();
                while let Some(p) = parent {
                    if exclude_set.contains(&p.id()) || script_ids.contains(&p.id()) {
                        excluded = true;
                        break;
                    }
                    parent = p.parent();
                }

                if !excluded {
                    let t = text.trim();
                    if !t.is_empty() {
                        text_parts.push(t.to_string());
                    }
                }
            }
        }

        text_parts.join(" ")
    }
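    /// Splits text into chunks of roughly `chunk_size` bytes on sentence
    /// boundaries. Sentence-final `!` and `?` are normalized to `.`.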
    fn segment_text(&self, text: &str) -> Vec<String> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();

        for sentence in text.split(|c| c == '.' || c == '!' || c == '?') {
            let sentence = sentence.trim();
            if sentence.is_empty() {
                continue;
            }

            let sentence_with_punct = format!("{}. ", sentence);

            if current_chunk.len() + sentence_with_punct.len() > self.chunk_size {
                if !current_chunk.is_empty() {
                    chunks.push(current_chunk.trim().to_string());
                }
                current_chunk = sentence_with_punct;
            } else {
                current_chunk.push_str(&sentence_with_punct);
            }
        }

        if !current_chunk.is_empty() {
            chunks.push(current_chunk.trim().to_string());
        }

        chunks
    }
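    /// Builds one section per heading (`h1`-`h6`), pairing the heading text
    /// with the text of the siblings that follow it up to the next heading.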
    fn extract_sections(&self, document: &Html) -> Vec<TextSection> {
        let mut sections = Vec::new();
        let heading_sel = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap();

        for heading in document.select(&heading_sel) {
            let level = heading.value().name().chars().nth(1)
                .and_then(|c| c.to_digit(10))
                .unwrap_or(1) as u8;

            let title = heading.text().collect::<Vec<_>>().join(" ").trim().to_string();

            let content = self.extract_section_content(&heading);

            sections.push(TextSection {
                level,
                title,
                content,
            });
        }

        sections
    }
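    /// Collects the text of the heading's following siblings until the next
    /// heading element is reached.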
    fn extract_section_content(&self, heading: &ElementRef) -> String {
        let mut content = String::new();
        let mut current = heading.next_sibling();

        while let Some(sibling) = current {
            if let Some(element) = sibling.value().as_element() {
                let name = element.name();
                // Stop at the next heading; the digit check keeps <hr> from matching.
                if name.len() == 2
                    && name.starts_with('h')
                    && name.as_bytes()[1].is_ascii_digit()
                {
                    break;
                }
            }

            for node in sibling.descendants() {
                if let Some(text) = node.value().as_text() {
                    let t = text.trim();
                    if !t.is_empty() {
                        content.push_str(t);
                        content.push(' ');
                    }
                }
            }

            current = sibling.next_sibling();
        }

        content.trim().to_string()
    }
}
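/// The result of `TextExtractor::extract`: the full text, the sentence-based
/// chunks (a single chunk when chunking is disabled), and heading sections.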
#[derive(Debug, Clone)]
pub struct ExtractedText {
    pub full_text: String,
    pub chunks: Vec<String>,
    pub sections: Vec<TextSection>,
}
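/// A heading (`level` 1-6, `title`) together with the text that follows it.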
#[derive(Debug, Clone)]
pub struct TextSection {
    pub level: u8,
    pub title: String,
    pub content: String,
}
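/// Counts whitespace-separated words.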
pub fn word_count(text: &str) -> usize {
    text.split_whitespace().count()
}
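/// Counts non-whitespace characters.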
pub fn char_count(text: &str) -> usize {
    text.chars().filter(|c| !c.is_whitespace()).count()
}
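/// A lightweight stop-word heuristic for French, English, German, and Spanish.
/// Returns an ISO 639-1 code, or `None` when the signal is too weak.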
pub fn detect_language(text: &str) -> Option<String> {
    let sample = text.chars().take(1000).collect::<String>().to_lowercase();
    // Tokenize so marker words match whole words only (e.g. "the" should not
    // match inside "there", nor "de" inside "garden").
    let words: HashSet<&str> = sample
        .split(|c: char| !c.is_alphabetic())
        .filter(|w| !w.is_empty())
        .collect();

    let french_words = ["le", "la", "les", "de", "du", "un", "une", "et", "est", "que"];
    let english_words = ["the", "a", "an", "of", "to", "in", "is", "and", "that", "for"];
    let german_words = ["der", "die", "das", "und", "ist", "ein", "eine", "für", "mit", "auf"];
    let spanish_words = ["el", "la", "los", "de", "un", "una", "que", "es", "en", "por"];

    let fr_count = french_words.iter().filter(|w| words.contains(**w)).count();
    let en_count = english_words.iter().filter(|w| words.contains(**w)).count();
    let de_count = german_words.iter().filter(|w| words.contains(**w)).count();
    let es_count = spanish_words.iter().filter(|w| words.contains(**w)).count();

    let max = fr_count.max(en_count).max(de_count).max(es_count);
    if max < 3 {
        return None;
    }

    if max == fr_count {
        Some("fr".to_string())
    } else if max == en_count {
        Some("en".to_string())
    } else if max == de_count {
        Some("de".to_string())
    } else if max == es_count {
        Some("es".to_string())
    } else {
        None
    }
}
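// A minimal usage sketch for this module, written as tests; the HTML snippet
// and the sample sentences below are illustrative inputs, not project fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prefers_article_content_over_boilerplate() {
        let html = r#"
            <html><body>
                <nav>Home | About</nav>
                <article><p>Hello world. This is the body text.</p></article>
                <footer>Copyright</footer>
            </body></html>
        "#;

        // The <article> element matches a content selector, so only its text
        // is extracted; the nav and footer are never visited.
        let extracted = TextExtractor::new().extract(html);
        assert!(extracted.full_text.contains("Hello world"));
        assert!(!extracted.full_text.contains("Copyright"));
    }

    #[test]
    fn counts_words_and_detects_english() {
        assert_eq!(word_count("one two  three"), 3);

        let text = "The cat is in the garden and the dog is at the door, waiting for a treat.";
        assert_eq!(detect_language(text), Some("en".to_string()));
    }
}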