void_crawl_core 0.3.7

Rust-native CDP browser automation core — stealth-patched headless Chrome, profile leasing, captcha detection
Documentation
//! Offline accuracy benchmark for the anti-bot vendor detector.
//!
//! A held-out, hermetic corpus of labeled responses — **disjoint** from the
//! cases in `antibot.rs`'s unit tests and from the live `fortress`/`sannysoft`
//! canaries — scored for precision/recall on the vendor label and for plain
//! accuracy on the challenged flag. This measures the detector against
//! representative response shapes rather than overfitting to one live target.
//!
//! Run: `cargo test -p void_crawl_core --test antibot_accuracy -- --nocapture`

use void_crawl_core::classify_antibot;

/// One labeled fixture: a response shape and what the detector should conclude.
///
/// Every field is `'static` so the whole corpus can be written as literals.
struct Case {
    /// Short identifier for the fixture; quoted in failure messages.
    name:              &'static str,
    /// HTTP status code handed to the classifier.
    status:            u16,
    /// Response headers as `(name, value)` pairs; the test converts them to
    /// owned `String` pairs before calling the classifier.
    headers:           &'static [(&'static str, &'static str)],
    /// Response body handed to the classifier.
    body:              &'static str,
    /// Vendor label that must appear among the detected vendors, or `None`
    /// when the classifier is expected to detect nothing at all.
    expect_vendor:     Option<&'static str>,
    /// Expected value of the classifier's `challenged` flag.
    expect_challenged: bool,
}

/// Builds the labeled corpus, grouped into three bands: responses that are
/// actively challenging, responses where a vendor is merely present, and
/// clean responses with no vendor at all.
fn cases() -> Vec<Case> {
    /// Header pairs exactly as they are written in the fixtures below.
    type HeaderPairs = &'static [(&'static str, &'static str)];

    /// Positional shorthand for a `Case` literal; the argument order mirrors
    /// the struct's field order.
    fn fixture(
        name: &'static str,
        status: u16,
        headers: HeaderPairs,
        body: &'static str,
        expect_vendor: Option<&'static str>,
        expect_challenged: bool,
    ) -> Case {
        Case { name, status, headers, body, expect_vendor, expect_challenged }
    }

    vec![
        // ── Walls, actively challenging ──────────────────────────────
        fixture(
            "datadome_block_page",
            403,
            &[("server", "DataDome"), ("x-datadome", "protected"), ("x-dd-b", "1")],
            "<html><body>Please enable JS and cookies. <script src=\"https://geo.captcha-delivery.com/captcha/\"></script></body></html>",
            Some("datadome"),
            true,
        ),
        fixture(
            "imperva_incapsula_incident",
            403,
            &[("x-iinfo", "12-3456-7890"), ("set-cookie", "visid_incap_123=abc; path=/")],
            "<html>Request unsuccessful. Incident ID: 1234-5678. Powered by _Incapsula_Resource</html>",
            Some("imperva"),
            true,
        ),
        fixture(
            "akamai_access_denied",
            403,
            &[("server", "AkamaiGHost"), ("mime-version", "1.0")],
            "<html><head><title>Access Denied</title></head><body>Reference #18.abcd1234 generated by errors.edgesuite.net</body></html>",
            Some("akamai"),
            true,
        ),
        fixture(
            "perimeterx_captcha",
            403,
            &[("set-cookie", "_pxhd=abc; path=/")],
            "<html><div id=\"px-captcha\"></div><script>window._pxAppId='PXabc'</script></html>",
            Some("perimeterx"),
            true,
        ),
        fixture(
            "f5_bigip_rejected",
            200,
            &[("set-cookie", "BIGipServerpool=123.456.789; path=/")],
            "<html><body>The requested URL was rejected. Please consult with your administrator. Your support ID is: 1234567890</body></html>",
            Some("f5"),
            true,
        ),
        fixture(
            "awswaf_token_challenge",
            202,
            &[("x-amzn-waf-action", "challenge")],
            "<html><script src=\"https://abc.token.awswaf.com/abc/challenge.js\"></script></html>",
            Some("awswaf"),
            true,
        ),
        fixture(
            "hcaptcha_wall",
            200,
            &[("content-type", "text/html")],
            "<html><div class=\"h-captcha\" data-sitekey=\"abc\"></div><script src=\"https://hcaptcha.com/1/api.js\"></script></html>",
            Some("hcaptcha"),
            true,
        ),
        fixture(
            "sucuri_firewall_block",
            403,
            &[("server", "Sucuri/Cloudproxy"), ("x-sucuri-id", "12001"), ("x-sucuri-block", "RBL")],
            "<html>Access Denied - Sucuri Website Firewall</html>",
            Some("sucuri"),
            true,
        ),
        // ── Presence only — must NOT be flagged as challenged ─────────
        fixture(
            "cloudflare_cdn_passthrough",
            200,
            &[("server", "cloudflare"), ("cf-ray", "8aabbccdd1122-IAD"), ("cf-cache-status", "HIT")],
            "<html><body>Real content served fine.</body></html>",
            Some("cloudflare"),
            false,
        ),
        fixture(
            "akamai_cdn_passthrough",
            200,
            &[("server", "AkamaiGHost"), ("x-akamai-transformed", "9 1234 0 pmb=mTOPb")],
            "<html><body>Article text, no wall.</body></html>",
            Some("akamai"),
            false,
        ),
        fixture(
            "cloudfront_cdn_presence",
            200,
            &[("via", "1.1 abcdef.cloudfront.net (CloudFront)"), ("x-amz-cf-id", "abcdef==")],
            "<html><body>Static asset.</body></html>",
            Some("cloudfront"),
            false,
        ),
        // ── Clean — no vendor at all ─────────────────────────────────
        fixture(
            "plain_nginx",
            200,
            &[("server", "nginx/1.25.3"), ("content-type", "text/html; charset=utf-8")],
            "<html><body><h1>Welcome</h1></body></html>",
            None,
            false,
        ),
        fixture(
            "plain_apache_404",
            404,
            &[("server", "Apache/2.4.59")],
            "<html><body>Not Found</body></html>",
            None,
            false,
        ),
    ]
}

/// Runs every fixture through `classify_antibot` and requires vendor
/// precision, vendor recall, and challenged-flag accuracy to each reach 0.95.
///
/// Every individual mismatch is appended to the assertion message so a failing
/// run names the offending fixtures.
#[test]
#[allow(clippy::cast_precision_loss)] // small fixed corpus; counts fit f64 exactly
fn detection_accuracy_meets_threshold() {
    // Fraction `hits / out_of`, with the denominator clamped to 1 so an empty
    // tally yields 0.0 instead of NaN.
    let ratio = |hits: usize, out_of: usize| hits as f64 / out_of.max(1) as f64;

    let corpus = cases();

    // Tallies for the vendor label and the challenged flag. Mismatches are
    // gathered as text and reported through the assert messages, because print
    // macros are disallowed workspace-wide.
    let mut matched = 0usize; // a vendor was expected and is among the detections
    let mut missed = 0usize; // a vendor was expected but not detected
    let mut spurious = 0usize; // nothing was expected, yet something was detected
    let mut flag_agreements = 0usize;
    let mut mismatches: Vec<String> = Vec::new();

    for case in &corpus {
        let owned_headers: Vec<(String, String)> = case
            .headers
            .iter()
            .map(|&(key, value)| (key.to_owned(), value.to_owned()))
            .collect();
        let verdict = classify_antibot(case.status, &owned_headers, case.body);

        if let Some(want) = case.expect_vendor {
            if verdict.vendors.iter().any(|d| d == want) {
                matched += 1;
            } else {
                missed += 1;
                mismatches.push(format!(
                    "MISS [{}] expected {want:?}, got {:?}",
                    case.name, verdict.vendors
                ));
            }
        } else if verdict.detected() {
            spurious += 1;
            mismatches.push(format!(
                "FALSE [{}] expected clean, got {:?}",
                case.name, verdict.vendors
            ));
        }

        if verdict.challenged != case.expect_challenged {
            mismatches.push(format!(
                "FLAG [{}] challenged: expected {}, got {}",
                case.name, case.expect_challenged, verdict.challenged
            ));
        } else {
            flag_agreements += 1;
        }
    }

    let precision = ratio(matched, matched + spurious);
    let recall = ratio(matched, matched + missed);
    let challenged_acc = ratio(flag_agreements, corpus.len());
    let detail = mismatches.join("\n  ");

    // The corpus is hand-curated, hence the strict thresholds. Relax them only
    // with a documented reason: a drop here points at a real signature defect.
    assert!(precision >= 0.95, "vendor precision {precision:.3} below 0.95\n  {detail}");
    assert!(recall >= 0.95, "vendor recall {recall:.3} below 0.95\n  {detail}");
    assert!(
        challenged_acc >= 0.95,
        "challenged-flag accuracy {challenged_acc:.3} below 0.95\n  {detail}"
    );
}