essence/engines/
mod.rs

1pub mod browser;
2pub mod detection;
3pub mod http;
4pub mod racer;
5pub mod stealth;
6
7
8use crate::{error::Result, types::ScrapeRequest};
9use async_trait::async_trait;
10use detection::{DetectionResult, RenderingDetector};
11use tracing::debug;
12
/// Unformatted output of a single scrape, as handed over by an engine
/// before any conversion (e.g. to markdown) takes place.
#[derive(Clone, Debug)]
pub struct RawScrapeResult {
    /// URL the content was ultimately served from, i.e. after redirects.
    pub url: String,
    /// HTTP status code of the response.
    pub status_code: u16,
    /// Value of the `Content-Type` header, when one was present.
    pub content_type: Option<String>,
    /// Raw HTML body of the page.
    pub html: String,
    /// Response headers (presumably `(name, value)` pairs — confirm against the engines).
    pub headers: Vec<(String, String)>,
}
27
28/// Trait for scraping engines
29#[async_trait]
30pub trait ScrapeEngine: Send + Sync {
31    /// Scrape a URL and return raw results
32    async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult>;
33}
34
/// The kind of engine used to fetch a page.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so the value can be
/// used as a key in hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum EngineType {
    /// Standard HTTP engine
    Http,
    /// Browser-based engine for JavaScript-heavy sites
    Browser,
}
43
44/// Detect which engine to use based on URL and HTML content
45pub fn detect_engine_needed(url: &str, html: &str) -> EngineType {
46    debug!("Detecting engine for URL: {}", url);
47
48    // Use the new RenderingDetector for more sophisticated analysis
49    let detection_result = RenderingDetector::needs_javascript(html, url);
50
51    if detection_result.needs_js {
52        debug!(
53            "JavaScript rendering needed: {} (frameworks: {:?})",
54            detection_result.reason, detection_result.detected_frameworks
55        );
56        return EngineType::Browser;
57    }
58
59    debug!("No JavaScript rendering needed: {}", detection_result.reason);
60    EngineType::Http
61}
62
63/// Get detailed detection result (for metadata/debugging)
64pub fn detect_engine_with_reason(url: &str, html: &str) -> (EngineType, DetectionResult) {
65    let detection_result = RenderingDetector::needs_javascript(html, url);
66    let engine_type = if detection_result.needs_js {
67        EngineType::Browser
68    } else {
69        EngineType::Http
70    };
71    (engine_type, detection_result)
72}
73
74// Old detection functions removed - now using RenderingDetector from detection module
75
76/// Validate that scrape result contains meaningful content
77pub fn validate_scrape_quality(result: &RawScrapeResult, markdown: &str) -> Result<()> {
78    use crate::error::ScrapeError;
79
80    // Check status code
81    let is_good_status = (200..300).contains(&result.status_code) || result.status_code == 304; // Not Modified is OK
82
83    // Check markdown content length
84    let has_content = markdown.trim().len() > 100;
85
86    // Calculate content density (text vs HTML ratio)
87    let content_density = calculate_content_density(&result.html);
88
89    // Check for error indicators
90    let looks_like_error = is_likely_error_page(&result.html, result.status_code);
91
92    if !has_content {
93        return Err(ScrapeError::EmptyContent(format!(
94            "Markdown output is too short (length: {})",
95            markdown.len()
96        )));
97    }
98
99    if content_density < 0.05 && !is_good_status {
100        return Err(ScrapeError::LowQuality(format!(
101            "Very low content density: {:.2}% with status {}",
102            content_density * 100.0,
103            result.status_code
104        )));
105    }
106
107    if looks_like_error {
108        return Err(ScrapeError::ErrorPage(format!(
109            "Page appears to be an error page (status: {})",
110            result.status_code
111        )));
112    }
113
114    Ok(())
115}
116
117/// Calculate text content density (text length / HTML length)
118fn calculate_content_density(html: &str) -> f64 {
119    use scraper::Html;
120
121    let document = Html::parse_document(html);
122
123    // Extract all text
124    let text = document.root_element().text().collect::<String>();
125
126    let text_len = text.trim().len() as f64;
127    let html_len = html.len() as f64;
128
129    if html_len > 0.0 {
130        text_len / html_len
131    } else {
132        0.0
133    }
134}
135
136/// Detect if page is likely an error page
137fn is_likely_error_page(html: &str, status_code: u16) -> bool {
138    // For error status codes, it's definitely an error page
139    if status_code >= 400 {
140        return true;
141    }
142
143    // For successful status codes (200-299), be VERY conservative
144    // Only flag as error if we have strong evidence
145    if (200..300).contains(&status_code) {
146        // Check for valid page metadata - if present, likely NOT an error page
147        if has_valid_page_metadata(html) {
148            return false;
149        }
150
151        // Check for error indicators in prominent places (title, headings)
152        // Not in script tags or JSON data
153        let title_indicators = [
154            "<title>404",
155            "<title>error",
156            "<title>not found",
157            "<title>access denied",
158            "<title>forbidden",
159        ];
160
161        let lower = html.to_lowercase();
162
163        // Count how many strong error indicators we find
164        let mut error_count = 0;
165
166        // Check title for error indicators
167        if title_indicators.iter().any(|&indicator| lower.contains(indicator)) {
168            error_count += 1;
169        }
170
171        // Check for prominent error messages in heading tags
172        let heading_indicators = [
173            "<h1>404",
174            "<h1>error",
175            "<h1>not found",
176            "<h1>access denied",
177            "<h2>404",
178            "<h2>error",
179            "<h2>not found",
180        ];
181
182        if heading_indicators.iter().any(|&indicator| lower.contains(indicator)) {
183            error_count += 1;
184        }
185
186        // Check for very specific error phrases that are unlikely to appear in JS code
187        let body_indicators = [
188            "this page doesn't exist",
189            "the page you are looking for does not exist",
190            "the page you requested could not be found",
191        ];
192
193        if body_indicators.iter().any(|&indicator| lower.contains(indicator)) {
194            error_count += 1;
195        }
196
197        // Only flag as error if we have at least 2 strong indicators
198        return error_count >= 2;
199    }
200
201    // For 3xx status codes, not an error page (redirects)
202    false
203}
204
/// Check whether the HTML carries metadata typical of a real content page.
///
/// Five independent signals are recognised: OpenGraph type, OpenGraph title,
/// meta description, JSON-LD structured data and a Twitter card. The page is
/// considered valid when at least two different signals are present.
///
/// Matching is ASCII case-insensitive because HTML tag and attribute names
/// are. The Twitter card is accepted both as `name="twitter:card"` (the
/// documented markup) and as `property="twitter:card"`; the two spellings
/// count as a single signal.
fn has_valid_page_metadata(html: &str) -> bool {
    // One entry per signal; each entry lists the spellings that satisfy it.
    const METADATA_SIGNALS: [&[&str]; 5] = [
        &["<meta property=\"og:type\""],  // OpenGraph type
        &["<meta property=\"og:title\""], // OpenGraph title
        &["<meta name=\"description\""],  // Meta description
        &["application/ld+json"],         // Structured data
        &[
            "<meta name=\"twitter:card\"",
            "<meta property=\"twitter:card\"",
        ], // Twitter card
    ];

    // All patterns are lowercase ASCII, so lowering the haystack is enough.
    let lowered = html.to_ascii_lowercase();

    let signal_count = METADATA_SIGNALS
        .iter()
        .filter(|spellings| spellings.iter().any(|&spelling| lowered.contains(spelling)))
        .count();

    // A single signal is too weak; require at least two distinct ones.
    signal_count >= 2
}
222
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ScrapeError;

    /// Builds a `RawScrapeResult` for `https://example.com` with a
    /// `text/html` content type, no headers, and the given status and body.
    fn raw_result(status_code: u16, html: &str) -> RawScrapeResult {
        RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code,
            content_type: Some("text/html".to_string()),
            html: html.to_string(),
            headers: vec![],
        }
    }

    #[test]
    fn test_detect_react_app() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="root"></div>
                <script>window.__NEXT_DATA__ = {}</script>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_vue_app() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="app" data-v-123></div>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_minimal_html() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>App</title></head>
            <body>
                <div id="root"></div>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_regular_html() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Regular Page</title></head>
            <body>
                <h1>Welcome</h1>
                <p>This is a regular HTML page with plenty of content that is not a SPA.</p>
                <p>It has multiple paragraphs and elements.</p>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Http
        );
    }

    #[test]
    fn test_validate_empty_content() {
        let result = raw_result(200, "<html><body>Test</body></html>");

        let markdown = "Short"; // Too short, < 100 chars

        let validation = validate_scrape_quality(&result, markdown);
        assert!(matches!(validation, Err(ScrapeError::EmptyContent(_))));
    }

    #[test]
    fn test_validate_error_page_by_status() {
        let result = raw_result(404, "<html><body><h1>Not Found</h1></body></html>");

        let markdown = "# Not Found\n\nThis is a longer markdown content that meets the minimum length requirement but is still an error page.";

        let validation = validate_scrape_quality(&result, markdown);
        assert!(matches!(validation, Err(ScrapeError::ErrorPage(_))));
    }

    #[test]
    fn test_validate_error_page_by_content() {
        let result = raw_result(
            200,
            "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist.</p></body></html>",
        );

        let markdown =
            "# 404 Not Found\n\nThe page you are looking for does not exist. This is long enough to pass the length check.";

        let validation = validate_scrape_quality(&result, markdown);
        assert!(matches!(validation, Err(ScrapeError::ErrorPage(_))));
    }

    #[test]
    fn test_validate_good_content() {
        let result = raw_result(
            200,
            r#"
                <html>
                <head><title>Good Page</title></head>
                <body>
                    <h1>Welcome to our site</h1>
                    <p>This is a well-formed page with plenty of content.</p>
                    <p>It has multiple paragraphs and meaningful information.</p>
                    <p>The content density is reasonable.</p>
                </body>
                </html>
            "#,
        );

        let markdown = r#"
# Welcome to our site

This is a well-formed page with plenty of content.

It has multiple paragraphs and meaningful information.

The content density is reasonable.
        "#;

        let validation = validate_scrape_quality(&result, markdown);
        assert!(validation.is_ok());
    }

    #[test]
    fn test_validate_low_quality_content() {
        // HTML dominated by non-content bytes (style block, HTML comment)
        // around a single character of body text.
        let css_comments = "/* ".repeat(500); // 1500 bytes inside <style>
        let html_comments = "<!--".repeat(500); // 2000 bytes inside one HTML comment
        let html = [
            r#"<html><head><style>"#,
            css_comments.as_str(),
            r#"*/ body { margin: 0; } </style></head><body>"#,
            "T", // Just 1 char of actual text
            html_comments.as_str(),
            r#"--> </body></html>"#,
        ]
        .concat();

        let result = raw_result(500, &html);

        // Markdown is longer than 100 chars to pass the length check
        let markdown = "This markdown is long enough to pass the minimum length requirement of 100 characters but still represents very low density content.";

        let err = validate_scrape_quality(&result, markdown)
            .expect_err("Expected validation to fail");

        // LowQuality is the intended outcome; ErrorPage is acceptable too
        // because the status is 500.
        assert!(
            matches!(err, ScrapeError::LowQuality(_) | ScrapeError::ErrorPage(_)),
            "Expected LowQuality or ErrorPage, got: {:?}",
            err
        );
    }

    #[test]
    fn test_calculate_content_density() {
        let html = "<html><body>Test</body></html>";
        let density = calculate_content_density(html);
        // "Test" is 4 chars, HTML is 30 chars, so density should be ~0.133
        assert!(density > 0.1 && density < 0.2);

        let empty_density = calculate_content_density("");
        assert_eq!(empty_density, 0.0);
    }

    #[test]
    fn test_is_likely_error_page() {
        // Error status codes are always error pages
        assert!(is_likely_error_page("Some content", 404));
        assert!(is_likely_error_page("Some content", 500));

        // Pages with valid metadata should NOT be flagged as errors
        let valid_page_with_metadata = r#"
            <html>
            <head>
                <meta property="og:type" content="website">
                <meta property="og:title" content="IMDb">
                <meta name="description" content="Movie database">
            </head>
            <body>Error occurred in JavaScript code</body>
            </html>
        "#;
        assert!(!is_likely_error_page(valid_page_with_metadata, 200));

        // Page with error in title AND heading should be flagged (multiple indicators, no valid metadata)
        let error_in_title = "<html><head><title>404 Not Found</title></head><body><h1>404 Not Found</h1></body></html>";
        assert!(is_likely_error_page(error_in_title, 200));

        // Page with error in h1 and body should be flagged (multiple indicators)
        let error_in_heading = "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist</p></body></html>";
        assert!(is_likely_error_page(error_in_heading, 200));

        // Page that merely mentions "error" in body text matches no indicator and should NOT be flagged
        let normal_page = "<html><body><p>Welcome to our site. Error handling is important.</p></body></html>";
        assert!(!is_likely_error_page(normal_page, 200));

        // IMDb-like page with "error occurred" in JavaScript should NOT be flagged
        let imdb_like = r#"
            <html>
            <head>
                <meta property="og:type" content="website">
                <meta name="description" content="IMDb content">
                <title>IMDb: Ratings, Reviews, and Where to Watch</title>
            </head>
            <body>
                <script>
                    if (error occurred) { console.log("error occurred"); }
                </script>
                <h1>Welcome to IMDb</h1>
            </body>
            </html>
        "#;
        assert!(!is_likely_error_page(imdb_like, 200));
    }

    #[test]
    fn test_has_valid_page_metadata() {
        let with_metadata = r#"
            <meta property="og:type" content="website">
            <meta property="og:title" content="Test">
            <meta name="description" content="Test page">
        "#;
        assert!(has_valid_page_metadata(with_metadata));

        let with_one_metadata = r#"
            <meta name="description" content="Test page">
        "#;
        assert!(!has_valid_page_metadata(with_one_metadata)); // Need at least 2

        let no_metadata = "<html><body>Test</body></html>";
        assert!(!has_valid_page_metadata(no_metadata));
    }
}
essence/engines/mod.rs

essence/engines/
mod.rs