// essence/format/advanced_extraction.rs
use std::collections::HashMap;

use readability::extractor::extract;
use scraper::{Html, Selector};
use whatlang::{detect, Lang};

use crate::{error::Result, error::ScrapeError};
6
/// Stateless namespace type for the content-extraction helpers.
///
/// It holds no data; all functionality is exposed as associated functions.
#[derive(Debug, Clone, Copy, Default)]
pub struct AdvancedExtractor;
9
/// Readable article content extracted from an HTML page, plus derived metadata.
#[derive(Debug, Clone)]
pub struct ArticleContent {
    /// Article title as reported by the readability extractor.
    pub title: Option<String>,
    /// Plain-text body with surrounding whitespace trimmed.
    pub text: String,
    /// Article markup produced by the readability extractor.
    pub html: String,
    /// Short lead-in derived from the start of `text`.
    pub excerpt: Option<String>,
    /// Short language code for `text` (e.g. `"en"`), when detection succeeds.
    pub language: Option<String>,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Estimated reading time in minutes.
    pub reading_time: usize,
}
28
29#[derive(Debug, Clone, serde::Serialize)]
31pub struct TableData {
32 pub headers: Vec<String>,
34 pub rows: Vec<HashMap<String, String>>,
36}
37
38impl AdvancedExtractor {
39 pub fn extract_article(html: &str, url: &str) -> Result<ArticleContent> {
41 let article = extract(&mut html.as_bytes(), &url::Url::parse(url).map_err(|e| {
43 ScrapeError::ParseError(format!("Invalid URL for readability: {}", e))
44 })?)
45 .map_err(|e| ScrapeError::ParseError(format!("Readability extraction failed: {}", e)))?;
46
47 let text = article.text.trim().to_string();
48 let word_count = Self::count_words(&text);
49 let reading_time = Self::estimate_reading_time(word_count);
50 let excerpt = Self::generate_excerpt(&text);
51 let language = Self::detect_language(&text);
52
53 Ok(ArticleContent {
54 title: Some(article.title),
55 text,
56 html: article.content,
57 excerpt,
58 language,
59 word_count,
60 reading_time,
61 })
62 }
63
/// Builds a short excerpt from the start of `text`.
///
/// The excerpt window is 200 bytes of the trimmed input. Rules, in order:
///
/// 1. Blank (empty or whitespace-only) input yields `None`.
/// 2. Trimmed text that fits in the window is returned whole.
/// 3. Otherwise the text is cut after the last sentence ending
///    (`". "`, `"! "` or `"? "`) found inside the window.
/// 4. Failing that, it is cut at the last space in the window and `...` is
///    appended.
/// 5. Failing that, the whole window is used and `...` is appended.
pub fn generate_excerpt(text: &str) -> Option<String> {
    const MAX_EXCERPT_LENGTH: usize = 200;
    const SENTENCE_ENDINGS: [&str; 3] = [". ", "! ", "? "];

    // Trim before the emptiness check so whitespace-only input yields `None`
    // rather than `Some("")`.
    let text = text.trim();
    if text.is_empty() {
        return None;
    }
    if text.len() <= MAX_EXCERPT_LENGTH {
        return Some(text.to_string());
    }

    // The limit is measured in bytes. Back off to the nearest char boundary so
    // slicing cannot panic in the middle of a multi-byte character.
    let mut cut = MAX_EXCERPT_LENGTH;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    let window = &text[..cut];

    // Prefer ending on a complete sentence: take the latest sentence ending
    // of any kind inside the window.
    let sentence_end = SENTENCE_ENDINGS
        .iter()
        .filter_map(|ending| window.rfind(*ending).map(|pos| pos + ending.len()))
        .max();
    if let Some(end) = sentence_end {
        return Some(window[..end].trim().to_string());
    }

    // No sentence boundary: fall back to a word boundary, then to a hard cut.
    let body = match window.rfind(' ') {
        Some(last_space) => window[..last_space].trim(),
        None => window,
    };
    Some(format!("{}...", body))
}
104
105 pub fn detect_language(text: &str) -> Option<String> {
107 if text.is_empty() {
108 return None;
109 }
110
111 detect(text).map(|info| {
112 match info.lang() {
113 Lang::Eng => "en",
114 Lang::Spa => "es",
115 Lang::Fra => "fr",
116 Lang::Deu => "de",
117 Lang::Ita => "it",
118 Lang::Por => "pt",
119 Lang::Rus => "ru",
120 Lang::Jpn => "ja",
121 Lang::Kor => "ko",
122 Lang::Cmn => "zh",
123 Lang::Ara => "ar",
124 Lang::Hin => "hi",
125 Lang::Tur => "tr",
126 Lang::Nld => "nl",
127 Lang::Pol => "pl",
128 Lang::Swe => "sv",
129 Lang::Dan => "da",
130 Lang::Fin => "fi",
131 Lang::Ces => "cs",
132 Lang::Ron => "ro",
133 Lang::Ukr => "uk",
134 Lang::Ell => "el",
135 Lang::Hun => "hu",
136 Lang::Heb => "he",
137 Lang::Tha => "th",
138 Lang::Vie => "vi",
139 _ => "unknown",
140 }
141 .to_string()
142 })
143 }
144
145 pub fn extract_tables_as_json(html: &str) -> Result<Vec<TableData>> {
147 let document = Html::parse_document(html);
148 let table_selector = Selector::parse("table")
149 .map_err(|e| ScrapeError::ParseError(format!("Invalid table selector: {:?}", e)))?;
150
151 let mut tables = Vec::new();
152
153 for table in document.select(&table_selector) {
154 let header_selector = Selector::parse("thead th, thead td")
156 .map_err(|e| ScrapeError::ParseError(format!("Invalid header selector: {:?}", e)))?;
157
158 let headers: Vec<String> = table
159 .select(&header_selector)
160 .map(|th| th.text().collect::<String>().trim().to_string())
161 .filter(|h| !h.is_empty())
162 .collect();
163
164 let headers = if headers.is_empty() {
166 let first_row_selector = Selector::parse("tr:first-child th, tr:first-child td")
167 .map_err(|e| ScrapeError::ParseError(format!("Invalid first row selector: {:?}", e)))?;
168
169 table
170 .select(&first_row_selector)
171 .map(|td| td.text().collect::<String>().trim().to_string())
172 .filter(|h| !h.is_empty())
173 .collect()
174 } else {
175 headers
176 };
177
178 let headers = if headers.is_empty() {
180 vec!["Column 1".to_string()]
181 } else {
182 headers
183 };
184
185 let has_thead = table.select(&Selector::parse("thead").unwrap()).count() > 0;
187 let has_tbody = table.select(&Selector::parse("tbody").unwrap()).count() > 0;
188
189 let row_selector = if has_tbody {
190 Selector::parse("tbody tr")
191 } else {
192 Selector::parse("tr")
193 }.map_err(|e| ScrapeError::ParseError(format!("Invalid row selector: {:?}", e)))?;
194
195 let cell_selector = Selector::parse("td, th")
196 .map_err(|e| ScrapeError::ParseError(format!("Invalid cell selector: {:?}", e)))?;
197
198 let mut rows = Vec::new();
199
200 for (i, row) in table.select(&row_selector).enumerate() {
201 if i == 0 && !has_thead && !has_tbody {
203 continue;
204 }
205
206 let cells: Vec<String> = row
207 .select(&cell_selector)
208 .map(|td| td.text().collect::<String>().trim().to_string())
209 .collect();
210
211 if !cells.is_empty() {
212 let mut row_map = HashMap::new();
213 for (j, cell) in cells.iter().enumerate() {
214 let header = headers.get(j).cloned()
215 .unwrap_or_else(|| format!("Column {}", j + 1));
216 row_map.insert(header, cell.clone());
217 }
218 rows.push(row_map);
219 }
220 }
221
222 if !rows.is_empty() {
223 tables.push(TableData { headers, rows });
224 }
225 }
226
227 Ok(tables)
228 }
229
/// Counts the whitespace-separated words in `text`.
///
/// Runs of whitespace count as a single separator, so leading, trailing and
/// repeated whitespace do not produce empty words.
pub fn count_words(text: &str) -> usize {
    text.split_whitespace().count()
}
234
/// Estimates the reading time in whole minutes for `word_count` words.
///
/// Assumes 200 words per minute and rounds up, so any non-zero word count
/// yields at least one minute; zero words yields zero.
pub fn estimate_reading_time(word_count: usize) -> usize {
    const WORDS_PER_MINUTE: usize = 200;
    word_count.div_ceil(WORDS_PER_MINUTE)
}
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    /// Text that fits in the excerpt window is returned unchanged.
    #[test]
    fn test_generate_excerpt_short() {
        let text = "This is a short text.";
        let excerpt = AdvancedExtractor::generate_excerpt(text);
        assert_eq!(excerpt, Some(text.to_string()));
    }

    /// Text longer than the window is cut after the last complete sentence.
    #[test]
    fn test_generate_excerpt_with_sentence() {
        let text = "This is the first sentence. This is the second sentence. This is a very long third sentence that keeps going well past the maximum excerpt length, adding clause after clause without ever reaching a full stop, so that the excerpt has to be cut off early.";
        // Guard: the input must actually exceed the 200-byte window, otherwise
        // truncation is never exercised.
        assert!(text.len() > 200);

        let excerpt = AdvancedExtractor::generate_excerpt(text).unwrap();
        assert_eq!(
            excerpt,
            "This is the first sentence. This is the second sentence."
        );
    }

    #[test]
    fn test_detect_language_english() {
        let text = "This is an English text. It contains several sentences to help with language detection.";
        let lang = AdvancedExtractor::detect_language(text);
        assert_eq!(lang, Some("en".to_string()));
    }

    #[test]
    fn test_detect_language_spanish() {
        let text = "Este es un texto en español. Contiene varias oraciones para ayudar con la detección del idioma.";
        let lang = AdvancedExtractor::detect_language(text);
        assert_eq!(lang, Some("es".to_string()));
    }

    #[test]
    fn test_word_count() {
        let text = "This is a test with seven words";
        assert_eq!(AdvancedExtractor::count_words(text), 7);
    }

    /// Reading time rounds up to the next whole minute.
    #[test]
    fn test_reading_time() {
        assert_eq!(AdvancedExtractor::estimate_reading_time(200), 1);
        assert_eq!(AdvancedExtractor::estimate_reading_time(400), 2);
        assert_eq!(AdvancedExtractor::estimate_reading_time(250), 2);
    }

    #[test]
    fn test_extract_tables() {
        let html = r#"
            <table>
                <thead>
                    <tr><th>Name</th><th>Age</th></tr>
                </thead>
                <tbody>
                    <tr><td>Alice</td><td>30</td></tr>
                    <tr><td>Bob</td><td>25</td></tr>
                </tbody>
            </table>
        "#;

        let tables = AdvancedExtractor::extract_tables_as_json(html).unwrap();
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.headers, vec!["Name".to_string(), "Age".to_string()]);
        assert_eq!(table.rows.len(), 2);
        assert_eq!(table.rows[0].get("Name"), Some(&"Alice".to_string()));
        assert_eq!(table.rows[0].get("Age"), Some(&"30".to_string()));
        assert_eq!(table.rows[1].get("Name"), Some(&"Bob".to_string()));
        assert_eq!(table.rows[1].get("Age"), Some(&"25".to_string()));
    }
}