essence/format/
advanced_extraction.rs

1use crate::{error::Result, error::ScrapeError};
2use readability::extractor::extract;
3use scraper::{Html, Selector};
4use std::collections::HashMap;
5use whatlang::{detect, Lang};
6
/// Namespace type for the advanced extraction helpers.
///
/// The type carries no state; article extraction, excerpt generation,
/// language detection and table extraction are all exposed as associated
/// functions, e.g. `AdvancedExtractor::count_words(text)`.
pub struct AdvancedExtractor;
9
/// Result of running article extraction over an HTML page.
#[derive(Clone, Debug)]
pub struct ArticleContent {
    /// Title of the article, when one is available.
    pub title: Option<String>,
    /// Plain-text body of the article.
    pub text: String,
    /// HTML markup of the extracted article body.
    pub html: String,
    /// Short excerpt derived from `text`.
    pub excerpt: Option<String>,
    /// Language code detected for `text` (for example `"en"`).
    pub language: Option<String>,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Estimated reading time, in whole minutes.
    pub reading_time: usize,
}
28
29/// Table data structure
30#[derive(Debug, Clone, serde::Serialize)]
31pub struct TableData {
32    /// Table headers
33    pub headers: Vec<String>,
34    /// Table rows as maps of header -> value
35    pub rows: Vec<HashMap<String, String>>,
36}
37
38impl AdvancedExtractor {
39    /// Extract article content using Mozilla's Readability algorithm
40    pub fn extract_article(html: &str, url: &str) -> Result<ArticleContent> {
41        // Use readability to extract main article content
42        let article = extract(&mut html.as_bytes(), &url::Url::parse(url).map_err(|e| {
43            ScrapeError::ParseError(format!("Invalid URL for readability: {}", e))
44        })?)
45        .map_err(|e| ScrapeError::ParseError(format!("Readability extraction failed: {}", e)))?;
46
47        let text = article.text.trim().to_string();
48        let word_count = Self::count_words(&text);
49        let reading_time = Self::estimate_reading_time(word_count);
50        let excerpt = Self::generate_excerpt(&text);
51        let language = Self::detect_language(&text);
52
53        Ok(ArticleContent {
54            title: Some(article.title),
55            text,
56            html: article.content,
57            excerpt,
58            language,
59            word_count,
60            reading_time,
61        })
62    }
63
64    /// Generate a smart excerpt with sentence boundary detection
65    pub fn generate_excerpt(text: &str) -> Option<String> {
66        if text.is_empty() {
67            return None;
68        }
69
70        const MAX_EXCERPT_LENGTH: usize = 200;
71        let text = text.trim();
72
73        // If text is shorter than max length, return it as is
74        if text.len() <= MAX_EXCERPT_LENGTH {
75            return Some(text.to_string());
76        }
77
78        // Find the last sentence boundary before MAX_EXCERPT_LENGTH
79        let excerpt_candidate = &text[..MAX_EXCERPT_LENGTH];
80        
81        // Look for sentence endings: . ! ?
82        let sentence_endings = [". ", "! ", "? "];
83        let mut last_sentence_end = 0;
84        
85        for ending in &sentence_endings {
86            if let Some(pos) = excerpt_candidate.rfind(ending) {
87                last_sentence_end = last_sentence_end.max(pos + ending.len());
88            }
89        }
90
91        // If we found a sentence boundary, use it
92        if last_sentence_end > 0 {
93            return Some(text[..last_sentence_end].trim().to_string());
94        }
95
96        // Otherwise, find the last word boundary
97        if let Some(last_space) = excerpt_candidate.rfind(' ') {
98            return Some(format!("{}...", text[..last_space].trim()));
99        }
100
101        // Fallback: just truncate with ellipsis
102        Some(format!("{}...", &text[..MAX_EXCERPT_LENGTH.min(text.len())]))
103    }
104
105    /// Detect language using whatlang
106    pub fn detect_language(text: &str) -> Option<String> {
107        if text.is_empty() {
108            return None;
109        }
110
111        detect(text).map(|info| {
112            match info.lang() {
113                Lang::Eng => "en",
114                Lang::Spa => "es",
115                Lang::Fra => "fr",
116                Lang::Deu => "de",
117                Lang::Ita => "it",
118                Lang::Por => "pt",
119                Lang::Rus => "ru",
120                Lang::Jpn => "ja",
121                Lang::Kor => "ko",
122                Lang::Cmn => "zh",
123                Lang::Ara => "ar",
124                Lang::Hin => "hi",
125                Lang::Tur => "tr",
126                Lang::Nld => "nl",
127                Lang::Pol => "pl",
128                Lang::Swe => "sv",
129                Lang::Dan => "da",
130                Lang::Fin => "fi",
131                Lang::Ces => "cs",
132                Lang::Ron => "ro",
133                Lang::Ukr => "uk",
134                Lang::Ell => "el",
135                Lang::Hun => "hu",
136                Lang::Heb => "he",
137                Lang::Tha => "th",
138                Lang::Vie => "vi",
139                _ => "unknown",
140            }
141            .to_string()
142        })
143    }
144
145    /// Extract tables as structured JSON with header mapping
146    pub fn extract_tables_as_json(html: &str) -> Result<Vec<TableData>> {
147        let document = Html::parse_document(html);
148        let table_selector = Selector::parse("table")
149            .map_err(|e| ScrapeError::ParseError(format!("Invalid table selector: {:?}", e)))?;
150
151        let mut tables = Vec::new();
152
153        for table in document.select(&table_selector) {
154            // Extract headers
155            let header_selector = Selector::parse("thead th, thead td")
156                .map_err(|e| ScrapeError::ParseError(format!("Invalid header selector: {:?}", e)))?;
157            
158            let headers: Vec<String> = table
159                .select(&header_selector)
160                .map(|th| th.text().collect::<String>().trim().to_string())
161                .filter(|h| !h.is_empty())
162                .collect();
163
164            // If no headers in thead, try first tr
165            let headers = if headers.is_empty() {
166                let first_row_selector = Selector::parse("tr:first-child th, tr:first-child td")
167                    .map_err(|e| ScrapeError::ParseError(format!("Invalid first row selector: {:?}", e)))?;
168                
169                table
170                    .select(&first_row_selector)
171                    .map(|td| td.text().collect::<String>().trim().to_string())
172                    .filter(|h| !h.is_empty())
173                    .collect()
174            } else {
175                headers
176            };
177
178            // If still no headers, generate generic ones
179            let headers = if headers.is_empty() {
180                vec!["Column 1".to_string()]
181            } else {
182                headers
183            };
184
185            // Extract rows - prefer tbody tr, but fall back to all tr if no tbody
186            let has_thead = table.select(&Selector::parse("thead").unwrap()).count() > 0;
187            let has_tbody = table.select(&Selector::parse("tbody").unwrap()).count() > 0;
188
189            let row_selector = if has_tbody {
190                Selector::parse("tbody tr")
191            } else {
192                Selector::parse("tr")
193            }.map_err(|e| ScrapeError::ParseError(format!("Invalid row selector: {:?}", e)))?;
194
195            let cell_selector = Selector::parse("td, th")
196                .map_err(|e| ScrapeError::ParseError(format!("Invalid cell selector: {:?}", e)))?;
197
198            let mut rows = Vec::new();
199
200            for (i, row) in table.select(&row_selector).enumerate() {
201                // Skip the first row if it was used as headers (only when no thead/tbody)
202                if i == 0 && !has_thead && !has_tbody {
203                    continue;
204                }
205
206                let cells: Vec<String> = row
207                    .select(&cell_selector)
208                    .map(|td| td.text().collect::<String>().trim().to_string())
209                    .collect();
210
211                if !cells.is_empty() {
212                    let mut row_map = HashMap::new();
213                    for (j, cell) in cells.iter().enumerate() {
214                        let header = headers.get(j).cloned()
215                            .unwrap_or_else(|| format!("Column {}", j + 1));
216                        row_map.insert(header, cell.clone());
217                    }
218                    rows.push(row_map);
219                }
220            }
221
222            if !rows.is_empty() {
223                tables.push(TableData { headers, rows });
224            }
225        }
226
227        Ok(tables)
228    }
229
230    /// Count words in text
231    pub fn count_words(text: &str) -> usize {
232        text.split_whitespace().count()
233    }
234
235    /// Estimate reading time in minutes (assuming 200 words per minute)
236    pub fn estimate_reading_time(word_count: usize) -> usize {
237        const WORDS_PER_MINUTE: usize = 200;
238        word_count.div_ceil(WORDS_PER_MINUTE)
239    }
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_generate_excerpt_short() {
        let text = "This is a short text.";
        let excerpt = AdvancedExtractor::generate_excerpt(text);
        assert_eq!(excerpt, Some(text.to_string()));
    }

    #[test]
    fn test_generate_excerpt_empty() {
        assert_eq!(AdvancedExtractor::generate_excerpt(""), None);
    }

    #[test]
    fn test_generate_excerpt_with_sentence() {
        // The fixture must be longer than the 200-byte excerpt limit,
        // otherwise the text is returned unchanged and truncation is never
        // exercised.
        let text = "This is the first sentence. This is the second sentence. \
                    This is a very long third sentence that keeps going without any terminal \
                    punctuation so that it runs well past the maximum excerpt length and \
                    therefore has to be cut off somewhere.";
        assert!(text.len() > 200);

        let excerpt = AdvancedExtractor::generate_excerpt(text).unwrap();

        // The cut lands on the last sentence boundary inside the limit.
        assert_eq!(
            excerpt,
            "This is the first sentence. This is the second sentence."
        );
    }

    #[test]
    fn test_generate_excerpt_word_boundary() {
        // No sentence ending anywhere: fall back to the last whole word and
        // mark the truncation with an ellipsis.
        let text = "word ".repeat(60);
        let excerpt = AdvancedExtractor::generate_excerpt(&text).unwrap();
        assert!(excerpt.ends_with("word..."));
        assert!(excerpt.len() <= 203);
    }

    #[test]
    fn test_detect_language_english() {
        let text = "This is an English text. It contains several sentences to help with language detection.";
        let lang = AdvancedExtractor::detect_language(text);
        assert_eq!(lang, Some("en".to_string()));
    }

    #[test]
    fn test_detect_language_spanish() {
        let text = "Este es un texto en español. Contiene varias oraciones para ayudar con la detección del idioma.";
        let lang = AdvancedExtractor::detect_language(text);
        assert_eq!(lang, Some("es".to_string()));
    }

    #[test]
    fn test_detect_language_empty() {
        assert_eq!(AdvancedExtractor::detect_language(""), None);
    }

    #[test]
    fn test_word_count() {
        let text = "This is a test with seven words";
        assert_eq!(AdvancedExtractor::count_words(text), 7);
        assert_eq!(AdvancedExtractor::count_words(""), 0);
        assert_eq!(AdvancedExtractor::count_words("  spaced \n out\ttext  "), 3);
    }

    #[test]
    fn test_reading_time() {
        assert_eq!(AdvancedExtractor::estimate_reading_time(0), 0);
        assert_eq!(AdvancedExtractor::estimate_reading_time(1), 1);
        assert_eq!(AdvancedExtractor::estimate_reading_time(200), 1);
        assert_eq!(AdvancedExtractor::estimate_reading_time(250), 2);
        assert_eq!(AdvancedExtractor::estimate_reading_time(400), 2);
    }

    #[test]
    fn test_extract_tables() {
        let html = r#"
            <table>
                <thead>
                    <tr><th>Name</th><th>Age</th></tr>
                </thead>
                <tbody>
                    <tr><td>Alice</td><td>30</td></tr>
                    <tr><td>Bob</td><td>25</td></tr>
                </tbody>
            </table>
        "#;

        let tables = AdvancedExtractor::extract_tables_as_json(html).unwrap();
        assert_eq!(tables.len(), 1);

        let table = &tables[0];
        assert_eq!(table.headers, vec!["Name", "Age"]);
        assert_eq!(table.rows.len(), 2);
        assert_eq!(table.rows[0].get("Name"), Some(&"Alice".to_string()));
        assert_eq!(table.rows[0].get("Age"), Some(&"30".to_string()));
        assert_eq!(table.rows[1].get("Name"), Some(&"Bob".to_string()));
        assert_eq!(table.rows[1].get("Age"), Some(&"25".to_string()));
    }

    #[test]
    fn test_extract_tables_none_present() {
        let tables = AdvancedExtractor::extract_tables_as_json("<p>No tables here</p>").unwrap();
        assert!(tables.is_empty());
    }
}
essence/format/advanced_extraction.rs

essence/format/
advanced_extraction.rs