1pub mod browser;
2pub mod detection;
3pub mod http;
4pub mod racer;
5pub mod stealth;
6
7
8use crate::{error::Result, types::ScrapeRequest};
9use async_trait::async_trait;
10use detection::{DetectionResult, RenderingDetector};
11use tracing::debug;
12
/// Raw output of a [`ScrapeEngine`] before any post-processing.
///
/// The quality checks in this module read `status_code` and `html`; the
/// remaining fields are carried along untouched.
///
/// `PartialEq`/`Eq` are derived so two results can be compared directly
/// (for example in assertions) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RawScrapeResult {
    /// URL this result belongs to.
    /// NOTE(review): whether this is the requested URL or the post-redirect
    /// URL is decided by the engines — confirm there.
    pub url: String,
    /// HTTP status code of the response.
    pub status_code: u16,
    /// Content type of the response, if one was available
    /// (presumably the `Content-Type` header value — confirm with the engines).
    pub content_type: Option<String>,
    /// Page markup; the quality checks parse this as an HTML document.
    pub html: String,
    /// Header `(name, value)` pairs (presumably response headers — confirm).
    pub headers: Vec<(String, String)>,
}
27
/// Common interface implemented by scraping backends.
///
/// The `Send + Sync` supertraits let an engine be shared between threads
/// (and therefore between concurrently running tasks).
#[async_trait]
pub trait ScrapeEngine: Send + Sync {
    /// Scrapes the page described by `request`.
    ///
    /// Returns the unprocessed [`RawScrapeResult`] on success, or the crate's
    /// error type on failure.
    async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult>;
}
34
/// Which kind of engine should be used to scrape a page.
///
/// The enum has no payload, so it is `Copy` and can be passed by value, and
/// `Eq`/`Hash` allow it to be used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EngineType {
    /// The page does not need JavaScript rendering.
    Http,
    /// The page needs JavaScript rendering.
    Browser,
}
43
44pub fn detect_engine_needed(url: &str, html: &str) -> EngineType {
46 debug!("Detecting engine for URL: {}", url);
47
48 let detection_result = RenderingDetector::needs_javascript(html, url);
50
51 if detection_result.needs_js {
52 debug!(
53 "JavaScript rendering needed: {} (frameworks: {:?})",
54 detection_result.reason, detection_result.detected_frameworks
55 );
56 return EngineType::Browser;
57 }
58
59 debug!("No JavaScript rendering needed: {}", detection_result.reason);
60 EngineType::Http
61}
62
63pub fn detect_engine_with_reason(url: &str, html: &str) -> (EngineType, DetectionResult) {
65 let detection_result = RenderingDetector::needs_javascript(html, url);
66 let engine_type = if detection_result.needs_js {
67 EngineType::Browser
68 } else {
69 EngineType::Http
70 };
71 (engine_type, detection_result)
72}
73
74pub fn validate_scrape_quality(result: &RawScrapeResult, markdown: &str) -> Result<()> {
78 use crate::error::ScrapeError;
79
80 let is_good_status = (200..300).contains(&result.status_code) || result.status_code == 304; let has_content = markdown.trim().len() > 100;
85
86 let content_density = calculate_content_density(&result.html);
88
89 let looks_like_error = is_likely_error_page(&result.html, result.status_code);
91
92 if !has_content {
93 return Err(ScrapeError::EmptyContent(format!(
94 "Markdown output is too short (length: {})",
95 markdown.len()
96 )));
97 }
98
99 if content_density < 0.05 && !is_good_status {
100 return Err(ScrapeError::LowQuality(format!(
101 "Very low content density: {:.2}% with status {}",
102 content_density * 100.0,
103 result.status_code
104 )));
105 }
106
107 if looks_like_error {
108 return Err(ScrapeError::ErrorPage(format!(
109 "Page appears to be an error page (status: {})",
110 result.status_code
111 )));
112 }
113
114 Ok(())
115}
116
117fn calculate_content_density(html: &str) -> f64 {
119 use scraper::Html;
120
121 let document = Html::parse_document(html);
122
123 let text = document.root_element().text().collect::<String>();
125
126 let text_len = text.trim().len() as f64;
127 let html_len = html.len() as f64;
128
129 if html_len > 0.0 {
130 text_len / html_len
131 } else {
132 0.0
133 }
134}
135
136fn is_likely_error_page(html: &str, status_code: u16) -> bool {
138 if status_code >= 400 {
140 return true;
141 }
142
143 if (200..300).contains(&status_code) {
146 if has_valid_page_metadata(html) {
148 return false;
149 }
150
151 let title_indicators = [
154 "<title>404",
155 "<title>error",
156 "<title>not found",
157 "<title>access denied",
158 "<title>forbidden",
159 ];
160
161 let lower = html.to_lowercase();
162
163 let mut error_count = 0;
165
166 if title_indicators.iter().any(|&indicator| lower.contains(indicator)) {
168 error_count += 1;
169 }
170
171 let heading_indicators = [
173 "<h1>404",
174 "<h1>error",
175 "<h1>not found",
176 "<h1>access denied",
177 "<h2>404",
178 "<h2>error",
179 "<h2>not found",
180 ];
181
182 if heading_indicators.iter().any(|&indicator| lower.contains(indicator)) {
183 error_count += 1;
184 }
185
186 let body_indicators = [
188 "this page doesn't exist",
189 "the page you are looking for does not exist",
190 "the page you requested could not be found",
191 ];
192
193 if body_indicators.iter().any(|&indicator| lower.contains(indicator)) {
194 error_count += 1;
195 }
196
197 return error_count >= 2;
199 }
200
201 false
203}
204
/// Returns `true` when `html` carries at least two distinct metadata signals
/// typical of a real content page.
///
/// The signals are: Open Graph `og:type`, Open Graph `og:title`, a
/// `description` meta tag, a JSON-LD block (`application/ld+json`), and a
/// Twitter card tag. The Twitter card is recognised in both spellings seen in
/// the wild — `name="twitter:card"` (the documented form) and
/// `property="twitter:card"` — and counts as a single signal even if both
/// appear.
///
/// Matching is a plain, case-sensitive substring search on the raw markup.
fn has_valid_page_metadata(html: &str) -> bool {
    const REQUIRED_SIGNALS: usize = 2;

    // One entry per signal; each entry lists the accepted spellings.
    let signals: [&[&str]; 5] = [
        &["<meta property=\"og:type\""],
        &["<meta property=\"og:title\""],
        &["<meta name=\"description\""],
        &["application/ld+json"],
        &[
            "<meta property=\"twitter:card\"",
            "<meta name=\"twitter:card\"",
        ],
    ];

    signals
        .iter()
        .filter(|spellings| spellings.iter().any(|&needle| html.contains(needle)))
        // Stop scanning as soon as enough signals have been seen.
        .take(REQUIRED_SIGNALS)
        .count()
        >= REQUIRED_SIGNALS
}
222
#[cfg(test)]
mod tests {
    use super::*;

    /// URL used by every fixture in this module.
    const FIXTURE_URL: &str = "https://example.com";

    /// Builds a header-less `text/html` result for [`FIXTURE_URL`].
    fn html_result(status_code: u16, html: &str) -> RawScrapeResult {
        RawScrapeResult {
            url: FIXTURE_URL.to_string(),
            status_code,
            content_type: Some("text/html".to_string()),
            html: html.to_string(),
            headers: vec![],
        }
    }

    // ---- engine detection -------------------------------------------------

    /// An empty `#root` mount point plus a `__NEXT_DATA__` script needs a browser.
    #[test]
    fn test_detect_react_app() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
            <div id="root"></div>
            <script>window.__NEXT_DATA__ = {}</script>
            </body>
            </html>
        "#;

        let engine = detect_engine_needed(FIXTURE_URL, page);
        assert_eq!(engine, EngineType::Browser);
    }

    /// An `#app` mount point carrying a `data-v-*` attribute needs a browser.
    #[test]
    fn test_detect_vue_app() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
            <div id="app" data-v-123></div>
            </body>
            </html>
        "#;

        let engine = detect_engine_needed(FIXTURE_URL, page);
        assert_eq!(engine, EngineType::Browser);
    }

    /// A shell page with nothing but an empty mount point needs a browser.
    #[test]
    fn test_detect_minimal_html() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head><title>App</title></head>
            <body>
            <div id="root"></div>
            </body>
            </html>
        "#;

        let engine = detect_engine_needed(FIXTURE_URL, page);
        assert_eq!(engine, EngineType::Browser);
    }

    /// A static page with real text content is served by the HTTP engine.
    #[test]
    fn test_detect_regular_html() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Regular Page</title></head>
            <body>
            <h1>Welcome</h1>
            <p>This is a regular HTML page with plenty of content that is not a SPA.</p>
            <p>It has multiple paragraphs and elements.</p>
            </body>
            </html>
        "#;

        let engine = detect_engine_needed(FIXTURE_URL, page);
        assert_eq!(engine, EngineType::Http);
    }

    // ---- validate_scrape_quality ------------------------------------------

    /// Markdown far below the length threshold is rejected as empty content.
    #[test]
    fn test_validate_empty_content() {
        let scraped = html_result(200, "<html><body>Test</body></html>");
        let too_short = "Short";

        let outcome = validate_scrape_quality(&scraped, too_short);

        assert!(outcome.is_err());
        assert!(matches!(
            outcome.unwrap_err(),
            crate::error::ScrapeError::EmptyContent(_)
        ));
    }

    /// A 404 with sufficiently long markdown is still reported as an error page.
    #[test]
    fn test_validate_error_page_by_status() {
        let scraped = html_result(404, "<html><body><h1>Not Found</h1></body></html>");
        let long_enough = "# Not Found\n\nThis is a longer markdown content that meets the minimum length requirement but is still an error page.";

        let outcome = validate_scrape_quality(&scraped, long_enough);

        assert!(outcome.is_err());
        assert!(matches!(
            outcome.unwrap_err(),
            crate::error::ScrapeError::ErrorPage(_)
        ));
    }

    /// A "soft 404" (status 200, error heading and error phrase) is an error page.
    #[test]
    fn test_validate_error_page_by_content() {
        let scraped = html_result(
            200,
            "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist.</p></body></html>",
        );
        let long_enough =
            "# 404 Not Found\n\nThe page you are looking for does not exist. This is long enough to pass the length check.";

        let outcome = validate_scrape_quality(&scraped, long_enough);

        assert!(outcome.is_err());
        assert!(matches!(
            outcome.unwrap_err(),
            crate::error::ScrapeError::ErrorPage(_)
        ));
    }

    /// An ordinary page with matching markdown passes every check.
    #[test]
    fn test_validate_good_content() {
        let scraped = html_result(
            200,
            r#"
            <html>
            <head><title>Good Page</title></head>
            <body>
            <h1>Welcome to our site</h1>
            <p>This is a well-formed page with plenty of content.</p>
            <p>It has multiple paragraphs and meaningful information.</p>
            <p>The content density is reasonable.</p>
            </body>
            </html>
        "#,
        );

        let markdown = r#"
# Welcome to our site

This is a well-formed page with plenty of content.

It has multiple paragraphs and meaningful information.

The content density is reasonable.
        "#;

        let outcome = validate_scrape_quality(&scraped, markdown);
        assert!(outcome.is_ok());
    }

    /// A 500 response whose markup is almost entirely CSS and HTML comments
    /// must be rejected; either rejection reason is accepted.
    #[test]
    fn test_validate_low_quality_content() {
        let css_comments = "/* ".repeat(500);
        let html_comments = "<!--".repeat(500);
        let html = [
            r#"<html><head><style>"#,
            css_comments.as_str(),
            r#"*/ body { margin: 0; } </style></head><body>"#,
            "T",
            html_comments.as_str(),
            r#"--> </body></html>"#,
        ]
        .concat();

        // Diagnostic output; visible with `--nocapture` or when the test fails.
        let density = calculate_content_density(&html);
        let html_len = html.len();
        eprintln!(
            "Content density: {} (html len: {}, text len: ~{})",
            density,
            html_len,
            (density * html_len as f64) as usize
        );

        let scraped = html_result(500, &html);
        let long_enough = "This markdown is long enough to pass the minimum length requirement of 100 characters but still represents very low density content.";

        let outcome = validate_scrape_quality(&scraped, long_enough);
        if let Err(ref e) = outcome {
            eprintln!("Validation error: {:?}", e);
        }
        assert!(outcome.is_err(), "Expected validation to fail");

        let failure = outcome.unwrap_err();
        assert!(
            matches!(
                &failure,
                crate::error::ScrapeError::LowQuality(_) | crate::error::ScrapeError::ErrorPage(_)
            ),
            "Expected LowQuality or ErrorPage, got: {:?}",
            failure
        );
    }

    // ---- helpers ----------------------------------------------------------

    /// Four text bytes in thirty bytes of markup; empty input yields zero.
    #[test]
    fn test_calculate_content_density() {
        let density = calculate_content_density("<html><body>Test</body></html>");
        assert!(density > 0.1);
        assert!(density < 0.2);

        let empty_density = calculate_content_density("");
        assert_eq!(empty_density, 0.0);
    }

    #[test]
    fn test_is_likely_error_page() {
        // Status codes of 400 and above decide on their own.
        assert!(is_likely_error_page("Some content", 404));
        assert!(is_likely_error_page("Some content", 500));

        // Rich metadata vouches for a 200 page even if it mentions errors.
        let page_with_metadata = r#"
            <html>
            <head>
            <meta property="og:type" content="website">
            <meta property="og:title" content="IMDb">
            <meta name="description" content="Movie database">
            </head>
            <body>Error occurred in JavaScript code</body>
            </html>
        "#;
        assert!(!is_likely_error_page(page_with_metadata, 200));

        // Title + heading markers: two groups match.
        let title_and_heading = "<html><head><title>404 Not Found</title></head><body><h1>404 Not Found</h1></body></html>";
        assert!(is_likely_error_page(title_and_heading, 200));

        // Heading + body-phrase markers: two groups match.
        let heading_and_phrase = "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist</p></body></html>";
        assert!(is_likely_error_page(heading_and_phrase, 200));

        // A passing mention of "error" in body text matches no group.
        let ordinary_page =
            "<html><body><p>Welcome to our site. Error handling is important.</p></body></html>";
        assert!(!is_likely_error_page(ordinary_page, 200));

        // Real-world shape: metadata present, "error" only inside a script.
        let imdb_like = r#"
            <html>
            <head>
            <meta property="og:type" content="website">
            <meta name="description" content="IMDb content">
            <title>IMDb: Ratings, Reviews, and Where to Watch</title>
            </head>
            <body>
            <script>
            if (error occurred) { console.log("error occurred"); }
            </script>
            <h1>Welcome to IMDb</h1>
            </body>
            </html>
        "#;
        assert!(!is_likely_error_page(imdb_like, 200));
    }

    #[test]
    fn test_has_valid_page_metadata() {
        // Three signals: comfortably above the two-signal threshold.
        let three_signals = r#"
            <meta property="og:type" content="website">
            <meta property="og:title" content="Test">
            <meta name="description" content="Test page">
        "#;
        assert!(has_valid_page_metadata(three_signals));

        // A single signal is not enough.
        let one_signal = r#"
            <meta name="description" content="Test page">
        "#;
        assert!(!has_valid_page_metadata(one_signal));

        // No signals at all.
        let no_signals = "<html><body>Test</body></html>";
        assert!(!has_valid_page_metadata(no_signals));
    }
}