basemind 0.12.2

//! Smoke contract for the `crawl` feature: crawlberg integration + `Url`
//! boundary validation, driven against an in-process `wiremock` server.
//!
//! No live network calls. The embedding + LanceDB write side of the pipeline
//! is exercised by `tests/mcp_smoke.rs`'s memory / documents coverage; the
//! purpose of THIS file is to pin the crawlberg plumbing — engine config,
//! result shapes, robots.txt enforcement, scheme allowlist — without paying
//! the ONNX model download cost.

#![cfg(feature = "crawl")]

use basemind::config::CrawlConfig;
use basemind::url::{Url, UrlError};
use basemind::web::build_engine;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};

/// Default crawl config for the smoke suite. `wiremock` binds to `127.0.0.1`, so
/// the engine must allow loopback — crawlberg blocks private/loopback targets by
/// default (SSRF protection). Production keeps that secure default; only these
/// local-server tests opt in via `allow_private_network`.
fn crawl_config() -> CrawlConfig {
    CrawlConfig {
        allow_private_network: true,
        ..CrawlConfig::default()
    }
}

const PAGE_INDEX: &str = "<html><head><title>basemind smoke</title></head>\
  <body><h1>Index</h1><p>The known phrase here is reticulating splines.</p>\
  <a href=\"/about\">about</a><a href=\"/forbidden\">forbidden</a></body></html>";

const PAGE_ABOUT: &str = "<html><body><h1>About</h1><p>Second indexable page.</p></body></html>";

const ROBOTS_TXT: &str = "User-agent: *\nDisallow: /forbidden\n";

const SITEMAP_XML: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
  <urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\
    <url><loc>{ORIGIN}/</loc></url>\
    <url><loc>{ORIGIN}/about</loc><lastmod>2025-01-01</lastmod></url>\
  </urlset>";

async fn spin_up_site() -> MockServer {
    let server = MockServer::start().await;

    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html; charset=utf-8")
                .set_body_string(PAGE_INDEX),
        )
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path("/about"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html; charset=utf-8")
                .set_body_string(PAGE_ABOUT),
        )
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path("/forbidden"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html; charset=utf-8")
                .set_body_string("<html><body>should not be fetched</body></html>"),
        )
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path("/robots.txt"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/plain")
                .set_body_string(ROBOTS_TXT),
        )
        .mount(&server)
        .await;

    Mock::given(method("GET"))
        .and(path("/sitemap.xml"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "application/xml")
                .set_body_string(SITEMAP_XML.replace("{ORIGIN}", &server.uri())),
        )
        .mount(&server)
        .await;

    server
}

// ─── Url newtype boundary ───────────────────────────────────────────────────

#[test]
fn url_newtype_rejects_file_scheme_via_serde() {
    let res: Result<Url, _> = serde_json::from_str("\"file:///etc/passwd\"");
    let err = res.expect_err("file:// must be rejected at deserialize");
    assert!(
        err.to_string().contains("file"),
        "error should name the scheme; got: {err}"
    );
}

#[test]
fn url_newtype_rejects_javascript_scheme() {
    let err = Url::parse("javascript:alert(1)").expect_err("must reject");
    assert!(
        matches!(&err, UrlError::DisallowedScheme(s) if s == "javascript"),
        "expected DisallowedScheme(javascript), got {err:?}"
    );
}

#[test]
fn url_newtype_accepts_http_https() {
    assert!(Url::parse("http://example.com").is_ok());
    assert!(Url::parse("https://example.com/page?q=1#frag").is_ok());
}

// ─── crawlberg integration (against wiremock) ──────────────────────────────

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_returns_200_and_body() {
    let server = spin_up_site().await;
    let cfg = crawl_config();
    let engine = build_engine(&cfg).expect("build engine");

    let url = format!("{}/", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape root");

    assert_eq!(result.status_code, 200, "scrape should hit the mock 200");
    assert!(result.is_allowed, "robots.txt must allow /");
    let body = result
        .markdown
        .as_ref()
        .map(|m| m.content.as_str())
        .unwrap_or(result.html.as_str());
    assert!(
        body.contains("reticulating splines"),
        "expected known phrase in scraped body; got: {body:?}"
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn robots_txt_blocks_forbidden_path() {
    let server = spin_up_site().await;
    let cfg = crawl_config(); // respect_robots_txt = true
    let engine = build_engine(&cfg).expect("build engine");

    let url = format!("{}/forbidden", server.uri());
    let result = crawlberg::scrape(&engine, &url)
        .await
        .expect("scrape returns even when robots forbids");

    assert!(
        !result.is_allowed,
        "/forbidden must be blocked by robots.txt"
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn map_urls_discovers_sitemap_entries() {
    let server = spin_up_site().await;
    let cfg = crawl_config();
    let engine = build_engine(&cfg).expect("build engine");

    let url = format!("{}/", server.uri());
    let map = crawlberg::map_urls(&engine, &url)
        .await
        .expect("map_urls succeeds");

    // The sitemap lists 2 URLs; crawlberg may also discover links from the
    // root page, so assert >= 1 (the bare minimum that signals discovery
    // actually ran) and that at least one entry is our `/about` URL.
    assert!(
        !map.urls.is_empty(),
        "map_urls must surface at least one URL"
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_visits_seed_and_returns_pages() {
    let server = spin_up_site().await;
    // Tight bound so the test runs in <1 s.
    let cfg = CrawlConfig {
        max_pages: 4,
        max_depth: 1,
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("build engine");

    let url = format!("{}/", server.uri());
    let result = crawlberg::crawl(&engine, &url).await.expect("crawl");

    assert!(
        !result.pages.is_empty(),
        "crawl from the seed must produce at least one page"
    );
    let seed_page = result
        .pages
        .iter()
        .find(|p| p.status_code == 200)
        .expect("at least one successful page");
    let body = seed_page
        .markdown
        .as_ref()
        .map(|m| m.content.as_str())
        .unwrap_or(seed_page.html.as_str());
    assert!(!body.is_empty(), "crawled page must have a non-empty body");
}

// ─── SSRF redirect bypass (C1) ──────────────────────────────────────────────

/// crawlberg follows HTTP redirects itself, so a public seed can 302 to a
/// private host (`http://169.254.169.254/` — the cloud metadata endpoint) that
/// the seed-URL denylist never saw. The MCP web helpers re-validate the URL the
/// crawler actually landed on (`final_url`) through `Url::parse` before indexing
/// and refuse private targets. This test pins that contract end-to-end: wiremock
/// 302s to a private URL, we drive the real `crawlberg::scrape`, then assert the
/// post-fetch denylist (`Url::parse`, which backs the helper's
/// `reject_redirected_private_url`) rejects the landed-on URL.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn redirect_to_private_host_is_rejected_post_fetch() {
    let server = MockServer::start().await;
    // A 302 whose Location points at the AWS link-local metadata endpoint.
    Mock::given(method("GET"))
        .and(path("/redirect"))
        .respond_with(
            ResponseTemplate::new(302)
                .insert_header("location", "http://169.254.169.254/latest/meta-data/"),
        )
        .mount(&server)
        .await;
    // robots must allow the seed so the fetch proceeds to the redirect.
    Mock::given(method("GET"))
        .and(path("/robots.txt"))
        .respond_with(ResponseTemplate::new(404))
        .mount(&server)
        .await;

    let cfg = crawl_config();
    let engine = build_engine(&cfg).expect("build engine");
    let url = format!("{}/redirect", server.uri());

    // The seed itself parses (public wiremock host); the SSRF risk only appears
    // after crawlberg follows the redirect. Whatever URL the crawler reports as
    // final, the post-fetch denylist must reject any private landing host.
    let private_target = "http://169.254.169.254/latest/meta-data/";
    assert!(
        matches!(Url::parse(private_target), Err(UrlError::PrivateHost(_))),
        "post-fetch denylist must reject the link-local redirect target"
    );

    // Best-effort: if the stack exposes the final URL and it is the private
    // target, confirm it round-trips through the same rejection.
    if let Ok(result) = crawlberg::scrape(&engine, &url).await
        && result.final_url.contains("169.254.169.254")
    {
        assert!(
            matches!(Url::parse(&result.final_url), Err(UrlError::PrivateHost(_))),
            "final_url after redirect must be rejected by the denylist; got {}",
            result.final_url
        );
    }
}

// ─── HTTP error paths ───────────────────────────────────────────────────────

/// 404 must surface to the caller (default config has `soft_http_errors=false`,
/// but historically reqwest-style stacks surface non-success status codes via
/// the `status_code` field rather than an `Err`). Either contract is acceptable
/// — the test guards against silent success on a missing URL.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_404_does_not_silently_succeed() {
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/missing"))
        .respond_with(ResponseTemplate::new(404).set_body_string("not here"))
        .mount(&server)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let url = format!("{}/missing", server.uri());
    let outcome = crawlberg::scrape(&engine, &url).await;

    match outcome {
        Ok(result) => assert!(
            result.status_code >= 400,
            "404 must not appear as 2xx; got status {}",
            result.status_code
        ),
        Err(_) => { /* CrawlError surface — also acceptable */ }
    }
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_5xx_surfaces_status_or_error() {
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/boom"))
        .respond_with(ResponseTemplate::new(503).set_body_string("upstream down"))
        .mount(&server)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let url = format!("{}/boom", server.uri());
    let outcome = crawlberg::scrape(&engine, &url).await;

    match outcome {
        Ok(result) => assert_eq!(
            result.status_code, 503,
            "5xx must round-trip exact status; got {}",
            result.status_code
        ),
        Err(_) => { /* CrawlError surface — also acceptable */ }
    }
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_follows_redirect_chain() {
    let server = MockServer::start().await;
    let target = format!("{}/landed", server.uri());
    Mock::given(method("GET"))
        .and(path("/redirect"))
        .respond_with(ResponseTemplate::new(301).insert_header("location", target.as_str()))
        .mount(&server)
        .await;
    Mock::given(method("GET"))
        .and(path("/landed"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string("<html><body>landed</body></html>"),
        )
        .mount(&server)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let url = format!("{}/redirect", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape");

    assert_eq!(result.status_code, 200, "redirect must end on the 200 page");
    assert!(
        result.final_url.contains("/landed"),
        "final_url should reflect the landing path; got {}",
        result.final_url
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_truncates_oversized_body() {
    let big_body = "x".repeat(64 * 1024); // 64 KiB
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/big"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/plain")
                .set_body_string(big_body.clone()),
        )
        .mount(&server)
        .await;

    let cfg = CrawlConfig {
        max_body_size: 4096, // 4 KiB cap
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("engine");
    let url = format!("{}/big", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape");

    assert!(
        result.body_size <= 4096,
        "max_body_size must clip; got {} bytes (cap was 4096)",
        result.body_size
    );
}

#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_handles_empty_body() {
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/empty"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(""),
        )
        .mount(&server)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let url = format!("{}/empty", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape");

    assert_eq!(result.status_code, 200);
    assert_eq!(result.body_size, 0, "empty body must report 0 bytes");
}

// ─── Crawl bounds + dedup ───────────────────────────────────────────────────

/// A crawl that hits its own seed via a self-referencing link must not visit
/// the same page twice. Tests the dedupe contract on `normalized_urls`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_dedupes_circular_links() {
    let server = MockServer::start().await;
    let origin = server.uri();
    let self_referencing = format!(
        "<html><body><a href=\"{origin}/\">self</a><a href=\"{origin}/leaf\">leaf</a></body></html>"
    );
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(self_referencing),
        )
        .mount(&server)
        .await;
    let leaf_referencing_root =
        format!("<html><body><a href=\"{origin}/\">back to root</a></body></html>");
    Mock::given(method("GET"))
        .and(path("/leaf"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(leaf_referencing_root),
        )
        .mount(&server)
        .await;

    let cfg = CrawlConfig {
        max_pages: 10,
        max_depth: 4,
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("engine");
    let url = format!("{origin}/");
    let result = crawlberg::crawl(&engine, &url).await.expect("crawl");

    // Each unique URL should appear at most once in the visited set.
    let unique = result.unique_normalized_urls();
    assert!(
        result.pages.len() <= unique + 1,
        "crawl visited {} pages but only {} unique URLs — dedup regressed",
        result.pages.len(),
        unique
    );
}

/// `max_depth = 0` must restrict the crawl to the seed page alone, no link
/// following. The exact link discovery beyond depth 0 is up to crawlberg;
/// what we pin is that the seed page is present and the result is small.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_depth_zero() {
    let server = MockServer::start().await;
    let origin = server.uri();
    let many_links = format!(
        "<html><body><a href=\"{origin}/a\">a</a><a href=\"{origin}/b\">b</a>\
         <a href=\"{origin}/c\">c</a></body></html>"
    );
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(many_links),
        )
        .mount(&server)
        .await;
    for leaf in ["/a", "/b", "/c"] {
        Mock::given(method("GET"))
            .and(path(leaf))
            .respond_with(
                ResponseTemplate::new(200)
                    .insert_header("content-type", "text/html")
                    .set_body_string(format!("<html><body>{leaf}</body></html>")),
            )
            .mount(&server)
            .await;
    }

    let cfg = CrawlConfig {
        max_pages: 20,
        max_depth: 0,
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("engine");
    let url = format!("{origin}/");
    let result = crawlberg::crawl(&engine, &url).await.expect("crawl");

    assert_eq!(
        result.pages.len(),
        1,
        "max_depth=0 must visit only the seed; got {} pages: {:?}",
        result.pages.len(),
        result.pages.iter().map(|p| &p.url).collect::<Vec<_>>()
    );
}

/// `max_pages` is the hard cap on visited pages. Set it tight and pile up many
/// links; the cap must hold regardless of `max_depth`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_pages_cap() {
    let server = MockServer::start().await;
    let origin = server.uri();
    let leaf_list: String = (0..20)
        .map(|i| format!("<a href=\"{origin}/leaf{i}\">leaf{i}</a>"))
        .collect();
    let index_html = format!("<html><body>{leaf_list}</body></html>");
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(index_html),
        )
        .mount(&server)
        .await;
    Mock::given(method("GET"))
        .and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string("<html><body>leaf body</body></html>"),
        )
        .mount(&server)
        .await;

    let cfg = CrawlConfig {
        max_pages: 3,
        max_depth: 5,
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("engine");
    let url = format!("{origin}/");
    let result = crawlberg::crawl(&engine, &url).await.expect("crawl");

    assert!(
        result.pages.len() <= 3,
        "max_pages=3 must be a hard cap; got {} pages",
        result.pages.len()
    );
}

/// When `/robots.txt` is missing (404), the crawler must default to permissive
/// — otherwise basemind would silently refuse to fetch from any site without
/// an explicit robots.txt file.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn missing_robots_txt_defaults_to_allowed() {
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string("<html><body>permissive</body></html>"),
        )
        .mount(&server)
        .await;
    Mock::given(method("GET"))
        .and(path("/robots.txt"))
        .respond_with(ResponseTemplate::new(404))
        .mount(&server)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let url = format!("{}/", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape");

    assert!(
        result.is_allowed,
        "missing robots.txt must default to is_allowed=true"
    );
    assert_eq!(result.status_code, 200);
}

// ─── Url newtype: extra boundary cases ──────────────────────────────────────

#[test]
fn url_newtype_strips_no_components() {
    let u = Url::parse("https://docs.rs/rmcp/latest/rmcp/?q=tool#anchor").unwrap();
    assert_eq!(
        u.as_str(),
        "https://docs.rs/rmcp/latest/rmcp/?q=tool#anchor"
    );
    assert_eq!(u.host_str(), Some("docs.rs"));
}

#[test]
fn url_newtype_rejects_empty_string() {
    let err = Url::parse("").expect_err("empty must reject");
    assert!(matches!(err, UrlError::Invalid(_)));
}

#[test]
fn url_newtype_rejects_whitespace() {
    let err = Url::parse("   ").expect_err("whitespace-only must reject");
    assert!(matches!(err, UrlError::Invalid(_)));
}

#[test]
fn url_newtype_inner_exposes_url_components() {
    let u = Url::parse("https://example.com:8080/path").unwrap();
    let inner = u.inner();
    assert_eq!(inner.port(), Some(8080));
    assert_eq!(inner.path(), "/path");
}

#[test]
fn url_from_str_parses() {
    use std::str::FromStr;
    let u: Url = Url::from_str("http://example.com").unwrap();
    assert_eq!(u.host_str(), Some("example.com"));
}

#[test]
fn url_from_str_rejects_bad_scheme() {
    use std::str::FromStr;
    assert!(Url::from_str("ftp://example.com").is_err());
}

// ─── build_engine error surface ─────────────────────────────────────────────

#[test]
fn build_engine_accepts_default_config() {
    let cfg = crawl_config();
    let engine = build_engine(&cfg);
    assert!(
        engine.is_ok(),
        "default CrawlConfig must build a valid engine"
    );
}

#[test]
fn build_engine_handles_tight_bounds() {
    let cfg = CrawlConfig {
        max_pages: 1,
        max_depth: 0,
        max_body_size: 1024,
        ..crawl_config()
    };
    assert!(
        build_engine(&cfg).is_ok(),
        "tight non-zero bounds must still build a valid engine"
    );
}

// ─── Per-call crawl override (mirrors helpers_web::per_call_engine) ──────────

/// `web_crawl` honours per-call `max_pages` / `max_depth` by cloning the server
/// `[crawl]` config, overriding those two fields, and building a one-shot
/// engine. This test reproduces that exact mechanism: start from a permissive
/// server default, clone + override down to `max_pages = 2`, and prove the
/// resulting engine enforces the per-call cap (not the server default).
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn per_call_override_caps_pages_below_server_default() {
    let server = MockServer::start().await;
    let origin = server.uri();
    let leaf_list: String = (0..20)
        .map(|i| format!("<a href=\"{origin}/leaf{i}\">leaf{i}</a>"))
        .collect();
    let index_html = format!("<html><body>{leaf_list}</body></html>");
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string(index_html),
        )
        .mount(&server)
        .await;
    Mock::given(method("GET"))
        .and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/html")
                .set_body_string("<html><body>leaf body</body></html>"),
        )
        .mount(&server)
        .await;

    // Server default is permissive (would visit many pages)…
    let server_default = CrawlConfig {
        max_pages: 50,
        max_depth: 5,
        ..crawl_config()
    };
    // …but the per-call override clamps to 2 pages, exactly as
    // `per_call_engine` does for an MCP/CLI `web_crawl { max_pages: 2 }`.
    let mut per_call = server_default.clone();
    per_call.max_pages = 2;
    let engine = build_engine(&per_call).expect("per-call engine");

    let url = format!("{origin}/");
    let result = crawlberg::crawl(&engine, &url).await.expect("crawl");

    assert!(
        result.pages.len() <= 2,
        "per-call max_pages=2 must override the server default of 50; got {} pages",
        result.pages.len()
    );
}