Skip to main content

hanzo_extract/
web.rs

1//! Web page content extraction
2
3use crate::{config::ExtractorConfig, error::Result, ExtractError, ExtractResult, Extractor};
4use reqwest::Client;
5use scraper::{Html, Selector};
6use std::time::Duration;
7
/// Web page content extractor
///
/// Holds the extraction settings together with the HTTP client that was
/// configured from them.
pub struct WebExtractor {
    /// Extraction settings; read when building `client` and again on every extraction
    config: ExtractorConfig,
    /// HTTP client built once from `config` in [`WebExtractor::new`]
    client: Client,
}
13
14impl Default for WebExtractor {
15    fn default() -> Self {
16        Self::new(ExtractorConfig::default())
17    }
18}
19
20impl WebExtractor {
21    /// Create a new web extractor with the given configuration
22    pub fn new(config: ExtractorConfig) -> Self {
23        let client = Client::builder()
24            .timeout(Duration::from_secs(config.timeout_secs))
25            .redirect(if config.follow_redirects {
26                reqwest::redirect::Policy::limited(config.max_redirects)
27            } else {
28                reqwest::redirect::Policy::none()
29            })
30            .user_agent(&config.user_agent)
31            .build()
32            .expect("Failed to create HTTP client");
33
34        Self { config, client }
35    }
36
37    /// Extract clean text from HTML content
38    fn extract_text_from_html(&self, html: &str) -> (String, Option<String>) {
39        let document = Html::parse_document(html);
40
41        // Extract title
42        let title_selector = Selector::parse("title").unwrap();
43        let title = document
44            .select(&title_selector)
45            .next()
46            .map(|el| el.text().collect::<String>().trim().to_string());
47
48        // Remove script and style elements
49        let mut text_parts = Vec::new();
50
51        // Try to get main content areas first
52        let content_selectors = [
53            "article",
54            "main",
55            "[role='main']",
56            ".content",
57            ".post-content",
58            ".article-content",
59            "#content",
60            "#main",
61        ];
62
63        let mut found_main_content = false;
64        for selector_str in content_selectors {
65            if let Ok(selector) = Selector::parse(selector_str) {
66                for element in document.select(&selector) {
67                    let text = self.extract_element_text(&element);
68                    if !text.is_empty() {
69                        text_parts.push(text);
70                        found_main_content = true;
71                    }
72                }
73                if found_main_content {
74                    break;
75                }
76            }
77        }
78
79        // Fall back to body if no main content found
80        if !found_main_content {
81            if let Ok(body_selector) = Selector::parse("body") {
82                for element in document.select(&body_selector) {
83                    text_parts.push(self.extract_element_text(&element));
84                }
85            }
86        }
87
88        let text = text_parts.join("\n\n");
89        let clean_text = self.clean_text(&text);
90
91        (clean_text, title)
92    }
93
94    /// Extract text from an HTML element, skipping script/style
95    #[allow(clippy::only_used_in_recursion)]
96    fn extract_element_text(&self, element: &scraper::ElementRef) -> String {
97        let skip_tags = [
98            "script", "style", "noscript", "nav", "header", "footer", "aside",
99        ];
100
101        let mut text = String::new();
102        for child in element.children() {
103            if let Some(element) = child.value().as_element() {
104                let tag = element.name();
105                if skip_tags.contains(&tag) {
106                    continue;
107                }
108                if let Some(child_element) = scraper::ElementRef::wrap(child) {
109                    text.push_str(&self.extract_element_text(&child_element));
110                    if matches!(
111                        tag,
112                        "p" | "div" | "br" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li"
113                    ) {
114                        text.push('\n');
115                    }
116                }
117            } else if let Some(text_node) = child.value().as_text() {
118                text.push_str(text_node);
119            }
120        }
121        text
122    }
123
124    /// Clean extracted text
125    fn clean_text(&self, text: &str) -> String {
126        let mut result = String::with_capacity(text.len());
127        let mut prev_was_whitespace = false;
128        let mut prev_was_newline = false;
129        let mut newline_count = 0;
130
131        for c in text.chars() {
132            if c == '\n' {
133                newline_count += 1;
134                if newline_count <= 2 && !prev_was_newline {
135                    result.push('\n');
136                    prev_was_newline = true;
137                }
138                prev_was_whitespace = true;
139            } else if c.is_whitespace() {
140                if !prev_was_whitespace {
141                    result.push(' ');
142                    prev_was_whitespace = true;
143                }
144                newline_count = 0;
145            } else {
146                result.push(c);
147                prev_was_whitespace = false;
148                prev_was_newline = false;
149                newline_count = 0;
150            }
151        }
152
153        result.trim().to_string()
154    }
155}
156
157#[async_trait::async_trait]
158impl Extractor for WebExtractor {
159    async fn extract(&self, source: &str) -> Result<ExtractResult> {
160        // Validate URL
161        let url =
162            url::Url::parse(source).map_err(|_| ExtractError::InvalidUrl(source.to_string()))?;
163
164        // Fetch the page
165        let response = self.client.get(url.as_str()).send().await?;
166
167        let status = response.status();
168        if !status.is_success() {
169            return Err(ExtractError::Http {
170                status: status.as_u16(),
171                message: status.to_string(),
172            });
173        }
174
175        let content_type = response
176            .headers()
177            .get("content-type")
178            .and_then(|v| v.to_str().ok())
179            .map(|s| s.to_string());
180
181        let body = response.text().await?;
182        let original_length = body.len();
183
184        // Check size limit
185        if original_length > self.config.max_length {
186            return Err(ExtractError::ContentTooLarge {
187                size: original_length,
188                max: self.config.max_length,
189            });
190        }
191
192        // Extract text
193        let (text, title) = if self.config.clean_text {
194            self.extract_text_from_html(&body)
195        } else {
196            (body, None)
197        };
198
199        let mut result =
200            ExtractResult::new(text, source.to_string()).with_original_length(original_length);
201
202        if let Some(ct) = content_type {
203            result = result.with_content_type(ct);
204        }
205
206        if let Some(t) = title {
207            result = result.with_title(t);
208        }
209
210        Ok(result)
211    }
212
213    #[cfg(feature = "sanitize")]
214    async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult> {
215        let result = self.extract(source).await?;
216        crate::sanitize::sanitize_result(result, &self.config).await
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace runs collapse to one space and blank lines to one newline;
    /// the space that precedes the newline is retained.
    #[test]
    fn test_clean_text() {
        let cleaned = WebExtractor::default().clean_text("  Hello   World  \n\n\n\n  Test  ");
        // Consecutive whitespace collapsed, preserving single newline
        assert_eq!(cleaned, "Hello World \nTest");
    }

    /// The title is taken from `<title>`, body text is kept, and script
    /// content is dropped.
    #[test]
    fn test_extract_text_from_html() {
        let page = r#"
        <!DOCTYPE html>
        <html>
        <head><title>Test Page</title></head>
        <body>
            <script>alert('ignore me')</script>
            <h1>Hello World</h1>
            <p>This is a test paragraph.</p>
        </body>
        </html>
        "#;

        let (body_text, page_title) = WebExtractor::default().extract_text_from_html(page);

        assert_eq!(page_title, Some(String::from("Test Page")));
        assert!(body_text.contains("Hello World"));
        assert!(body_text.contains("This is a test paragraph"));
        assert!(!body_text.contains("alert"));
    }
}