1use crate::{config::ExtractorConfig, error::Result, ExtractError, ExtractResult, Extractor};
4use reqwest::Client;
5use scraper::{Html, Selector};
6use std::time::Duration;
7
/// Extractor that downloads a page over HTTP and pulls readable text out of it.
///
/// The HTTP client is built once in [`WebExtractor::new`] from the supplied
/// configuration and reused for every request made by this extractor.
pub struct WebExtractor {
    /// Settings this extractor was built with; read again at extraction time
    /// (for example the `max_length` and `clean_text` fields).
    config: ExtractorConfig,
    /// `reqwest` client pre-configured with the timeout, redirect policy and
    /// user agent taken from `config`.
    client: Client,
}
13
14impl Default for WebExtractor {
15 fn default() -> Self {
16 Self::new(ExtractorConfig::default())
17 }
18}
19
20impl WebExtractor {
21 pub fn new(config: ExtractorConfig) -> Self {
23 let client = Client::builder()
24 .timeout(Duration::from_secs(config.timeout_secs))
25 .redirect(if config.follow_redirects {
26 reqwest::redirect::Policy::limited(config.max_redirects)
27 } else {
28 reqwest::redirect::Policy::none()
29 })
30 .user_agent(&config.user_agent)
31 .build()
32 .expect("Failed to create HTTP client");
33
34 Self { config, client }
35 }
36
37 fn extract_text_from_html(&self, html: &str) -> (String, Option<String>) {
39 let document = Html::parse_document(html);
40
41 let title_selector = Selector::parse("title").unwrap();
43 let title = document
44 .select(&title_selector)
45 .next()
46 .map(|el| el.text().collect::<String>().trim().to_string());
47
48 let mut text_parts = Vec::new();
50
51 let content_selectors = [
53 "article",
54 "main",
55 "[role='main']",
56 ".content",
57 ".post-content",
58 ".article-content",
59 "#content",
60 "#main",
61 ];
62
63 let mut found_main_content = false;
64 for selector_str in content_selectors {
65 if let Ok(selector) = Selector::parse(selector_str) {
66 for element in document.select(&selector) {
67 let text = self.extract_element_text(&element);
68 if !text.is_empty() {
69 text_parts.push(text);
70 found_main_content = true;
71 }
72 }
73 if found_main_content {
74 break;
75 }
76 }
77 }
78
79 if !found_main_content {
81 if let Ok(body_selector) = Selector::parse("body") {
82 for element in document.select(&body_selector) {
83 text_parts.push(self.extract_element_text(&element));
84 }
85 }
86 }
87
88 let text = text_parts.join("\n\n");
89 let clean_text = self.clean_text(&text);
90
91 (clean_text, title)
92 }
93
94 #[allow(clippy::only_used_in_recursion)]
96 fn extract_element_text(&self, element: &scraper::ElementRef) -> String {
97 let skip_tags = [
98 "script", "style", "noscript", "nav", "header", "footer", "aside",
99 ];
100
101 let mut text = String::new();
102 for child in element.children() {
103 if let Some(element) = child.value().as_element() {
104 let tag = element.name();
105 if skip_tags.contains(&tag) {
106 continue;
107 }
108 if let Some(child_element) = scraper::ElementRef::wrap(child) {
109 text.push_str(&self.extract_element_text(&child_element));
110 if matches!(
111 tag,
112 "p" | "div" | "br" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li"
113 ) {
114 text.push('\n');
115 }
116 }
117 } else if let Some(text_node) = child.value().as_text() {
118 text.push_str(text_node);
119 }
120 }
121 text
122 }
123
124 fn clean_text(&self, text: &str) -> String {
126 let mut result = String::with_capacity(text.len());
127 let mut prev_was_whitespace = false;
128 let mut prev_was_newline = false;
129 let mut newline_count = 0;
130
131 for c in text.chars() {
132 if c == '\n' {
133 newline_count += 1;
134 if newline_count <= 2 && !prev_was_newline {
135 result.push('\n');
136 prev_was_newline = true;
137 }
138 prev_was_whitespace = true;
139 } else if c.is_whitespace() {
140 if !prev_was_whitespace {
141 result.push(' ');
142 prev_was_whitespace = true;
143 }
144 newline_count = 0;
145 } else {
146 result.push(c);
147 prev_was_whitespace = false;
148 prev_was_newline = false;
149 newline_count = 0;
150 }
151 }
152
153 result.trim().to_string()
154 }
155}
156
157#[async_trait::async_trait]
158impl Extractor for WebExtractor {
159 async fn extract(&self, source: &str) -> Result<ExtractResult> {
160 let url =
162 url::Url::parse(source).map_err(|_| ExtractError::InvalidUrl(source.to_string()))?;
163
164 let response = self.client.get(url.as_str()).send().await?;
166
167 let status = response.status();
168 if !status.is_success() {
169 return Err(ExtractError::Http {
170 status: status.as_u16(),
171 message: status.to_string(),
172 });
173 }
174
175 let content_type = response
176 .headers()
177 .get("content-type")
178 .and_then(|v| v.to_str().ok())
179 .map(|s| s.to_string());
180
181 let body = response.text().await?;
182 let original_length = body.len();
183
184 if original_length > self.config.max_length {
186 return Err(ExtractError::ContentTooLarge {
187 size: original_length,
188 max: self.config.max_length,
189 });
190 }
191
192 let (text, title) = if self.config.clean_text {
194 self.extract_text_from_html(&body)
195 } else {
196 (body, None)
197 };
198
199 let mut result =
200 ExtractResult::new(text, source.to_string()).with_original_length(original_length);
201
202 if let Some(ct) = content_type {
203 result = result.with_content_type(ct);
204 }
205
206 if let Some(t) = title {
207 result = result.with_title(t);
208 }
209
210 Ok(result)
211 }
212
213 #[cfg(feature = "sanitize")]
214 async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult> {
215 let result = self.extract(source).await?;
216 crate::sanitize::sanitize_result(result, &self.config).await
217 }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs of spaces collapse to one space and a run of newlines collapses to
    /// a single newline; the space before the newline is retained.
    #[test]
    fn test_clean_text() {
        let cleaned = WebExtractor::default().clean_text(" Hello World \n\n\n\n Test ");
        assert_eq!(cleaned, "Hello World \nTest");
    }

    /// The title comes from `<title>`, body text is kept, and `<script>`
    /// content is dropped.
    #[test]
    fn test_extract_text_from_html() {
        let page = r#"
 <!DOCTYPE html>
 <html>
 <head><title>Test Page</title></head>
 <body>
 <script>alert('ignore me')</script>
 <h1>Hello World</h1>
 <p>This is a test paragraph.</p>
 </body>
 </html>
 "#;

        let web = WebExtractor::default();
        let (body_text, page_title) = web.extract_text_from_html(page);

        assert_eq!(page_title, Some("Test Page".to_string()));
        for expected in ["Hello World", "This is a test paragraph"] {
            assert!(body_text.contains(expected));
        }
        assert!(!body_text.contains("alert"));
    }
}