essence-engine 0.2.0

A fast web retrieval engine with HTTP-to-browser fallback, producing LLM-ready Markdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
pub mod browser;
pub mod detection;
pub mod http;
pub mod racer;
pub mod stealth;


use crate::{error::Result, types::ScrapeRequest};
use async_trait::async_trait;
use detection::{DetectionResult, RenderingDetector};
use tracing::debug;

/// Raw scrape result before formatting
#[derive(Debug, Clone)]
pub struct RawScrapeResult {
    /// Final URL after redirects
    pub url: String,
    /// HTTP status code
    pub status_code: u16,
    /// Content-Type header
    pub content_type: Option<String>,
    /// Raw HTML content
    pub html: String,
    /// Response headers
    pub headers: Vec<(String, String)>,
}

/// Trait for scraping engines
#[async_trait]
pub trait ScrapeEngine: Send + Sync {
    /// Scrape a URL and return raw results
    async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult>;
}

/// Engine types
#[derive(Debug, Clone, PartialEq)]
pub enum EngineType {
    /// Standard HTTP engine
    Http,
    /// Browser-based engine for JavaScript-heavy sites
    Browser,
}

/// Detect which engine to use based on URL and HTML content
pub fn detect_engine_needed(url: &str, html: &str) -> EngineType {
    debug!("Detecting engine for URL: {}", url);

    // Use the new RenderingDetector for more sophisticated analysis
    let detection_result = RenderingDetector::needs_javascript(html, url);

    if detection_result.needs_js {
        debug!(
            "JavaScript rendering needed: {} (frameworks: {:?})",
            detection_result.reason, detection_result.detected_frameworks
        );
        return EngineType::Browser;
    }

    debug!("No JavaScript rendering needed: {}", detection_result.reason);
    EngineType::Http
}

/// Get detailed detection result (for metadata/debugging)
pub fn detect_engine_with_reason(url: &str, html: &str) -> (EngineType, DetectionResult) {
    let detection_result = RenderingDetector::needs_javascript(html, url);
    let engine_type = if detection_result.needs_js {
        EngineType::Browser
    } else {
        EngineType::Http
    };
    (engine_type, detection_result)
}

// Old detection functions removed - now using RenderingDetector from detection module

/// Validate that scrape result contains meaningful content
pub fn validate_scrape_quality(result: &RawScrapeResult, markdown: &str) -> Result<()> {
    use crate::error::ScrapeError;

    // Check status code
    let is_good_status = (200..300).contains(&result.status_code) || result.status_code == 304; // Not Modified is OK

    // Check markdown content length
    let has_content = markdown.trim().len() > 100;

    // Calculate content density (text vs HTML ratio)
    let content_density = calculate_content_density(&result.html);

    // Check for error indicators
    let looks_like_error = is_likely_error_page(&result.html, result.status_code);

    if !has_content {
        return Err(ScrapeError::EmptyContent(format!(
            "Markdown output is too short (length: {})",
            markdown.len()
        )));
    }

    if content_density < 0.05 && !is_good_status {
        return Err(ScrapeError::LowQuality(format!(
            "Very low content density: {:.2}% with status {}",
            content_density * 100.0,
            result.status_code
        )));
    }

    if looks_like_error {
        return Err(ScrapeError::ErrorPage(format!(
            "Page appears to be an error page (status: {})",
            result.status_code
        )));
    }

    Ok(())
}

/// Calculate text content density (text length / HTML length)
fn calculate_content_density(html: &str) -> f64 {
    use scraper::Html;

    let document = Html::parse_document(html);

    // Extract all text
    let text = document.root_element().text().collect::<String>();

    let text_len = text.trim().len() as f64;
    let html_len = html.len() as f64;

    if html_len > 0.0 {
        text_len / html_len
    } else {
        0.0
    }
}

/// Detect if page is likely an error page
fn is_likely_error_page(html: &str, status_code: u16) -> bool {
    // For error status codes, it's definitely an error page
    if status_code >= 400 {
        return true;
    }

    // For successful status codes (200-299), be VERY conservative
    // Only flag as error if we have strong evidence
    if (200..300).contains(&status_code) {
        // Check for valid page metadata - if present, likely NOT an error page
        if has_valid_page_metadata(html) {
            return false;
        }

        // Check for error indicators in prominent places (title, headings)
        // Not in script tags or JSON data
        let title_indicators = [
            "<title>404",
            "<title>error",
            "<title>not found",
            "<title>access denied",
            "<title>forbidden",
        ];

        let lower = html.to_lowercase();

        // Count how many strong error indicators we find
        let mut error_count = 0;

        // Check title for error indicators
        if title_indicators.iter().any(|&indicator| lower.contains(indicator)) {
            error_count += 1;
        }

        // Check for prominent error messages in heading tags
        let heading_indicators = [
            "<h1>404",
            "<h1>error",
            "<h1>not found",
            "<h1>access denied",
            "<h2>404",
            "<h2>error",
            "<h2>not found",
        ];

        if heading_indicators.iter().any(|&indicator| lower.contains(indicator)) {
            error_count += 1;
        }

        // Check for very specific error phrases that are unlikely to appear in JS code
        let body_indicators = [
            "this page doesn't exist",
            "the page you are looking for does not exist",
            "the page you requested could not be found",
        ];

        if body_indicators.iter().any(|&indicator| lower.contains(indicator)) {
            error_count += 1;
        }

        // Only flag as error if we have at least 2 strong indicators
        return error_count >= 2;
    }

    // For 3xx status codes, not an error page (redirects)
    false
}

/// Check if HTML has valid page metadata indicating it's a real page
fn has_valid_page_metadata(html: &str) -> bool {
    let valid_patterns = [
        "<meta property=\"og:type\"",        // OpenGraph type
        "<meta property=\"og:title\"",      // OpenGraph title
        "<meta name=\"description\"",       // Meta description
        "application/ld+json",              // Structured data
        "<meta property=\"twitter:card\"",  // Twitter card
    ];

    // If page has 2+ valid metadata patterns, it's likely a real page
    let metadata_count = valid_patterns.iter()
        .filter(|&&pattern| html.contains(pattern))
        .count();

    metadata_count >= 2
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_react_app() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="root"></div>
                <script>window.__NEXT_DATA__ = {}</script>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_vue_app() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="app" data-v-123></div>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_minimal_html() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>App</title></head>
            <body>
                <div id="root"></div>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Browser
        );
    }

    #[test]
    fn test_detect_regular_html() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Regular Page</title></head>
            <body>
                <h1>Welcome</h1>
                <p>This is a regular HTML page with plenty of content that is not a SPA.</p>
                <p>It has multiple paragraphs and elements.</p>
            </body>
            </html>
        "#;
        assert_eq!(
            detect_engine_needed("https://example.com", html),
            EngineType::Http
        );
    }

    #[test]
    fn test_validate_empty_content() {
        let result = RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            html: "<html><body>Test</body></html>".to_string(),
            headers: vec![],
        };

        let markdown = "Short"; // Too short, < 100 chars

        let validation = validate_scrape_quality(&result, markdown);
        assert!(validation.is_err());
        assert!(matches!(
            validation.unwrap_err(),
            crate::error::ScrapeError::EmptyContent(_)
        ));
    }

    #[test]
    fn test_validate_error_page_by_status() {
        let result = RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code: 404,
            content_type: Some("text/html".to_string()),
            html: "<html><body><h1>Not Found</h1></body></html>".to_string(),
            headers: vec![],
        };

        let markdown = "# Not Found\n\nThis is a longer markdown content that meets the minimum length requirement but is still an error page.";

        let validation = validate_scrape_quality(&result, markdown);
        assert!(validation.is_err());
        assert!(matches!(
            validation.unwrap_err(),
            crate::error::ScrapeError::ErrorPage(_)
        ));
    }

    #[test]
    fn test_validate_error_page_by_content() {
        let result = RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            html: "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist.</p></body></html>".to_string(),
            headers: vec![],
        };

        let markdown =
            "# 404 Not Found\n\nThe page you are looking for does not exist. This is long enough to pass the length check.";

        let validation = validate_scrape_quality(&result, markdown);
        assert!(validation.is_err());
        assert!(matches!(
            validation.unwrap_err(),
            crate::error::ScrapeError::ErrorPage(_)
        ));
    }

    #[test]
    fn test_validate_good_content() {
        let result = RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            html: r#"
                <html>
                <head><title>Good Page</title></head>
                <body>
                    <h1>Welcome to our site</h1>
                    <p>This is a well-formed page with plenty of content.</p>
                    <p>It has multiple paragraphs and meaningful information.</p>
                    <p>The content density is reasonable.</p>
                </body>
                </html>
            "#
            .to_string(),
            headers: vec![],
        };

        let markdown = r#"
# Welcome to our site

This is a well-formed page with plenty of content.

It has multiple paragraphs and meaningful information.

The content density is reasonable.
        "#;

        let validation = validate_scrape_quality(&result, markdown);
        assert!(validation.is_ok());
    }

    #[test]
    fn test_validate_low_quality_content() {
        // Create HTML with lots of non-text content (comments, styles, etc.) to make density low
        let css_comments = "/* ".repeat(500); // 1000 chars of CSS comments
        let html_comments = "<!--".repeat(500); // 2000 chars of HTML comments
        let html_parts = vec![
            r#"<html><head><style>"#,
            &css_comments,
            r#"*/ body { margin: 0; } </style></head><body>"#,
            "T", // Just 1 char of actual text
            &html_comments,
            r#"--> </body></html>"#,
        ];
        let html = html_parts.join("");

        // Debug: check the density
        let density = calculate_content_density(&html);
        let html_len = html.len();
        eprintln!(
            "Content density: {} (html len: {}, text len: ~{})",
            density,
            html_len,
            (density * html_len as f64) as usize
        );

        let result = RawScrapeResult {
            url: "https://example.com".to_string(),
            status_code: 500,
            content_type: Some("text/html".to_string()),
            html,
            headers: vec![],
        };

        // Markdown is longer than 100 chars to pass length check
        let markdown = "This markdown is long enough to pass the minimum length requirement of 100 characters but still represents very low density content.";

        let validation = validate_scrape_quality(&result, markdown);
        if let Err(ref e) = validation {
            eprintln!("Validation error: {:?}", e);
        }
        assert!(validation.is_err(), "Expected validation to fail");

        // Check what error we got
        match validation.unwrap_err() {
            crate::error::ScrapeError::LowQuality(_) => {
                // This is what we expect
            }
            crate::error::ScrapeError::ErrorPage(_) => {
                // This is acceptable too since status is 500
            }
            other => {
                panic!("Expected LowQuality or ErrorPage, got: {:?}", other);
            }
        }
    }

    #[test]
    fn test_calculate_content_density() {
        let html = "<html><body>Test</body></html>";
        let density = calculate_content_density(html);
        // "Test" is 4 chars, HTML is 30 chars, so density should be ~0.133
        assert!(density > 0.1 && density < 0.2);

        let empty_html = "";
        let empty_density = calculate_content_density(empty_html);
        assert_eq!(empty_density, 0.0);
    }

    #[test]
    fn test_is_likely_error_page() {
        // Error status codes are always error pages
        assert!(is_likely_error_page("Some content", 404));
        assert!(is_likely_error_page("Some content", 500));

        // Pages with valid metadata should NOT be flagged as errors
        let valid_page_with_metadata = r#"
            <html>
            <head>
                <meta property="og:type" content="website">
                <meta property="og:title" content="IMDb">
                <meta name="description" content="Movie database">
            </head>
            <body>Error occurred in JavaScript code</body>
            </html>
        "#;
        assert!(!is_likely_error_page(valid_page_with_metadata, 200));

        // Page with error in title AND heading should be flagged (multiple indicators, no valid metadata)
        let error_in_title = "<html><head><title>404 Not Found</title></head><body><h1>404 Not Found</h1></body></html>";
        assert!(is_likely_error_page(error_in_title, 200));

        // Page with error in h1 and body should be flagged (multiple indicators)
        let error_in_heading = "<html><body><h1>404 Not Found</h1><p>The page you are looking for does not exist</p></body></html>";
        assert!(is_likely_error_page(error_in_heading, 200));

        // Page with just "error" in body text (only 1 indicator) should NOT be flagged
        let normal_page = "<html><body><p>Welcome to our site. Error handling is important.</p></body></html>";
        assert!(!is_likely_error_page(normal_page, 200));

        // IMDb-like page with "error occurred" in JavaScript should NOT be flagged
        let imdb_like = r#"
            <html>
            <head>
                <meta property="og:type" content="website">
                <meta name="description" content="IMDb content">
                <title>IMDb: Ratings, Reviews, and Where to Watch</title>
            </head>
            <body>
                <script>
                    if (error occurred) { console.log("error occurred"); }
                </script>
                <h1>Welcome to IMDb</h1>
            </body>
            </html>
        "#;
        assert!(!is_likely_error_page(imdb_like, 200));
    }

    #[test]
    fn test_has_valid_page_metadata() {
        let with_metadata = r#"
            <meta property="og:type" content="website">
            <meta property="og:title" content="Test">
            <meta name="description" content="Test page">
        "#;
        assert!(has_valid_page_metadata(with_metadata));

        let with_one_metadata = r#"
            <meta name="description" content="Test page">
        "#;
        assert!(!has_valid_page_metadata(with_one_metadata)); // Need at least 2

        let no_metadata = "<html><body>Test</body></html>";
        assert!(!has_valid_page_metadata(no_metadata));
    }
}