// crawlex 1.0.4
//
// Stealth crawler with Chrome-perfect TLS/H2 fingerprint, render pool, hooks, and persistent queue.
// Documentation
use crawlex::discovery::cert::PeerCert;
use crawlex::discovery::tech_fingerprint::{analyze, analyze_with_facts, TechFingerprintFacts};
use http::{HeaderMap, HeaderValue};
use url::Url;

/// Runs `analyze` on a response built from the headers and HTML document
/// defined below and checks the resulting report.
///
/// The report must contain the `cloudflare`, `express`, `wordpress`, `nextjs`
/// and `google-tag-manager` slugs, and every reported technology must carry a
/// positive confidence and at least one piece of evidence.
#[test]
fn detects_common_header_and_dom_technologies() {
    let url = Url::parse("https://shop.example/").unwrap();
    let mut headers = HeaderMap::new();
    headers.insert("server", HeaderValue::from_static("cloudflare"));
    headers.insert("cf-ray", HeaderValue::from_static("abc-SFO"));
    headers.insert("x-powered-by", HeaderValue::from_static("Express"));
    headers.insert("set-cookie", HeaderValue::from_static("__cf_bm=1; Path=/"));
    let html = r#"<!doctype html>
        <html><head>
          <meta name="generator" content="WordPress 6.5">
          <script src="/_next/static/app.js"></script>
          <script src="https://www.googletagmanager.com/gtm.js?id=GTM-X"></script>
        </head><body><div id="__NEXT_DATA__"></div></body></html>"#;

    let report = analyze(&url, &url, Some(&headers), Some(html));
    let slugs: Vec<_> = report
        .technologies
        .iter()
        .map(|t| t.slug.as_str())
        .collect();

    // Same loop-with-message shape as the sibling tests, so a failure names
    // the missing slug and shows what was actually detected.
    for expected in [
        "cloudflare",
        "express",
        "wordpress",
        "nextjs",
        "google-tag-manager",
    ] {
        assert!(slugs.contains(&expected), "missing {expected}: {slugs:?}");
    }

    // Checked per technology (rather than one combined `all`) so a failure
    // points at the offending entry and at which of the two properties broke.
    for tech in report.technologies.iter() {
        assert!(
            tech.confidence > 0,
            "{} has non-positive confidence {}",
            tech.slug.as_str(),
            tech.confidence
        );
        assert!(
            !tech.evidence.is_empty(),
            "{} has no evidence",
            tech.slug.as_str()
        );
    }
}

/// Runs `analyze` on the headers and HTML document built below and checks
/// that every slug in `EXPECTED_SLUGS` is reported, and that the reported
/// entries for `rails`, `woocommerce` and `vite` have a confidence of at
/// least 80.
#[test]
fn detects_required_framework_and_ecommerce_signatures() {
    // Slugs that must all appear in the report.
    const EXPECTED_SLUGS: [&str; 7] = [
        "laravel",
        "rails",
        "django",
        "woocommerce",
        "magento",
        "vite",
        "vue",
    ];
    // Subset of slugs whose reported confidence must reach the threshold.
    const HIGH_CONFIDENCE_SLUGS: [&str; 3] = ["rails", "woocommerce", "vite"];

    let html = r#"<!doctype html>
        <html data-v-app data-server-rendered="true">
          <head>
            <meta name="generator" content="WooCommerce">
            <meta name="csrf-token" content="abc">
            <script type="module" src="/@vite/client"></script>
            <script src="/assets/rails-ujs.js"></script>
            <script src="/wp-content/plugins/woocommerce/assets/js/frontend/cart-fragments.js"></script>
            <script src="/static/version123/frontend/Magento/luma/en_US/mage/cookies.js"></script>
          </head>
          <body>
            <input type="hidden" name="csrfmiddlewaretoken" value="x">
            <div data-v-abc123></div>
          </body>
        </html>"#;

    let cookies =
        HeaderValue::from_static("_rails_session=1; csrftoken=2; woocommerce_cart_hash=3");
    let mut headers = HeaderMap::new();
    headers.insert("x-powered-by", HeaderValue::from_static("Laravel"));
    headers.insert("set-cookie", cookies);

    let url = Url::parse("https://app.example/").unwrap();
    let report = analyze(&url, &url, Some(&headers), Some(html));

    let slugs = report
        .technologies
        .iter()
        .map(|tech| tech.slug.as_str())
        .collect::<Vec<_>>();
    for expected in EXPECTED_SLUGS {
        assert!(slugs.contains(&expected), "missing {expected}: {slugs:?}");
    }

    // A technology passes if it is outside the high-confidence subset or its
    // confidence reaches the threshold.
    let confident_enough = report
        .technologies
        .iter()
        .all(|tech| !HIGH_CONFIDENCE_SLUGS.contains(&tech.slug.as_str()) || tech.confidence >= 80);
    assert!(confident_enough);
}

/// Runs `analyze_with_facts` with no headers and no HTML, supplying only the
/// out-of-band facts built below (peer certificate, DNS JSON, open ports,
/// manifest and service-worker flags), and checks that every slug in
/// `EXPECTED_SLUGS` is reported.
#[test]
fn detects_infra_facts_from_dns_tls_ports_and_pwa_markers() {
    // Slugs that must all appear in the report.
    const EXPECTED_SLUGS: [&str; 9] = [
        "cloudflare",
        "fastly",
        "netlify",
        "cloudfront",
        "shopify",
        "ssh",
        "postgresql",
        "web-app-manifest",
        "service-worker",
    ];

    let dns_records = serde_json::json!({
        "cname": ["infra.global.ssl.fastly.net"],
        "ns": ["ns1.netlifydns.com"],
        "txt": ["v=spf1 include:_spf.google.com"],
        "cloud": ["aws:cloudfront"],
    });
    let dns_json = dns_records.to_string();

    let cert = PeerCert {
        issuer_cn: Some(String::from("Cloudflare Inc ECC")),
        subject_cn: Some(String::from("sni.cloudflaressl.com")),
        sans: vec![
            String::from("infra.example"),
            String::from("*.myshopify.com"),
        ],
        ..PeerCert::default()
    };

    let facts = TechFingerprintFacts {
        peer_cert: Some(&cert),
        dns_json: Some(&dns_json),
        open_ports: &[22, 5432],
        manifest_present: true,
        service_worker_present: true,
    };

    let url = Url::parse("https://infra.example/").unwrap();
    let report = analyze_with_facts(&url, &url, None, None, facts);

    let slugs = report
        .technologies
        .iter()
        .map(|tech| tech.slug.as_str())
        .collect::<Vec<_>>();
    for expected in EXPECTED_SLUGS {
        assert!(slugs.contains(&expected), "missing {expected}: {slugs:?}");
    }
}