#![cfg(feature = "crawl")]
use basemind::config::CrawlConfig;
use basemind::url::{Url, UrlError};
use basemind::web::build_engine;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};
fn crawl_config() -> CrawlConfig {
CrawlConfig {
allow_private_network: true,
..CrawlConfig::default()
}
}
const PAGE_INDEX: &str = "<html><head><title>basemind smoke</title></head>\
<body><h1>Index</h1><p>The known phrase here is reticulating splines.</p>\
<a href=\"/about\">about</a><a href=\"/forbidden\">forbidden</a></body></html>";
const PAGE_ABOUT: &str = "<html><body><h1>About</h1><p>Second indexable page.</p></body></html>";
const ROBOTS_TXT: &str = "User-agent: *\nDisallow: /forbidden\n";
const SITEMAP_XML: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\
<url><loc>{ORIGIN}/</loc></url>\
<url><loc>{ORIGIN}/about</loc><lastmod>2025-01-01</lastmod></url>\
</urlset>";
async fn spin_up_site() -> MockServer {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html; charset=utf-8")
.set_body_string(PAGE_INDEX),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/about"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html; charset=utf-8")
.set_body_string(PAGE_ABOUT),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/forbidden"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html; charset=utf-8")
.set_body_string("<html><body>should not be fetched</body></html>"),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/robots.txt"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/plain")
.set_body_string(ROBOTS_TXT),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/sitemap.xml"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "application/xml")
.set_body_string(SITEMAP_XML.replace("{ORIGIN}", &server.uri())),
)
.mount(&server)
.await;
server
}
#[test]
fn url_newtype_rejects_file_scheme_via_serde() {
let res: Result<Url, _> = serde_json::from_str("\"file:///etc/passwd\"");
let err = res.expect_err("file:// must be rejected at deserialize");
assert!(
err.to_string().contains("file"),
"error should name the scheme; got: {err}"
);
}
#[test]
fn url_newtype_rejects_javascript_scheme() {
let err = Url::parse("javascript:alert(1)").expect_err("must reject");
assert!(
matches!(&err, UrlError::DisallowedScheme(s) if s == "javascript"),
"expected DisallowedScheme(javascript), got {err:?}"
);
}
#[test]
fn url_newtype_accepts_http_https() {
assert!(Url::parse("http://example.com").is_ok());
assert!(Url::parse("https://example.com/page?q=1#frag").is_ok());
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_returns_200_and_body() {
let server = spin_up_site().await;
let cfg = crawl_config();
let engine = build_engine(&cfg).expect("build engine");
let url = format!("{}/", server.uri());
let result = crawlberg::scrape(&engine, &url).await.expect("scrape root");
assert_eq!(result.status_code, 200, "scrape should hit the mock 200");
assert!(result.is_allowed, "robots.txt must allow /");
let body = result
.markdown
.as_ref()
.map(|m| m.content.as_str())
.unwrap_or(result.html.as_str());
assert!(
body.contains("reticulating splines"),
"expected known phrase in scraped body; got: {body:?}"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn robots_txt_blocks_forbidden_path() {
let server = spin_up_site().await;
let cfg = crawl_config(); let engine = build_engine(&cfg).expect("build engine");
let url = format!("{}/forbidden", server.uri());
let result = crawlberg::scrape(&engine, &url)
.await
.expect("scrape returns even when robots forbids");
assert!(
!result.is_allowed,
"/forbidden must be blocked by robots.txt"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn map_urls_discovers_sitemap_entries() {
let server = spin_up_site().await;
let cfg = crawl_config();
let engine = build_engine(&cfg).expect("build engine");
let url = format!("{}/", server.uri());
let map = crawlberg::map_urls(&engine, &url)
.await
.expect("map_urls succeeds");
assert!(
!map.urls.is_empty(),
"map_urls must surface at least one URL"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_visits_seed_and_returns_pages() {
let server = spin_up_site().await;
let cfg = CrawlConfig {
max_pages: 4,
max_depth: 1,
..crawl_config()
};
let engine = build_engine(&cfg).expect("build engine");
let url = format!("{}/", server.uri());
let result = crawlberg::crawl(&engine, &url).await.expect("crawl");
assert!(
!result.pages.is_empty(),
"crawl from the seed must produce at least one page"
);
let seed_page = result
.pages
.iter()
.find(|p| p.status_code == 200)
.expect("at least one successful page");
let body = seed_page
.markdown
.as_ref()
.map(|m| m.content.as_str())
.unwrap_or(seed_page.html.as_str());
assert!(!body.is_empty(), "crawled page must have a non-empty body");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn redirect_to_private_host_is_rejected_post_fetch() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/redirect"))
.respond_with(
ResponseTemplate::new(302)
.insert_header("location", "http://169.254.169.254/latest/meta-data/"),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/robots.txt"))
.respond_with(ResponseTemplate::new(404))
.mount(&server)
.await;
let cfg = crawl_config();
let engine = build_engine(&cfg).expect("build engine");
let url = format!("{}/redirect", server.uri());
let private_target = "http://169.254.169.254/latest/meta-data/";
assert!(
matches!(Url::parse(private_target), Err(UrlError::PrivateHost(_))),
"post-fetch denylist must reject the link-local redirect target"
);
if let Ok(result) = crawlberg::scrape(&engine, &url).await
&& result.final_url.contains("169.254.169.254")
{
assert!(
matches!(Url::parse(&result.final_url), Err(UrlError::PrivateHost(_))),
"final_url after redirect must be rejected by the denylist; got {}",
result.final_url
);
}
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_404_does_not_silently_succeed() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/missing"))
.respond_with(ResponseTemplate::new(404).set_body_string("not here"))
.mount(&server)
.await;
let engine = build_engine(&crawl_config()).expect("engine");
let url = format!("{}/missing", server.uri());
let outcome = crawlberg::scrape(&engine, &url).await;
match outcome {
Ok(result) => assert!(
result.status_code >= 400,
"404 must not appear as 2xx; got status {}",
result.status_code
),
Err(_) => { }
}
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_5xx_surfaces_status_or_error() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/boom"))
.respond_with(ResponseTemplate::new(503).set_body_string("upstream down"))
.mount(&server)
.await;
let engine = build_engine(&crawl_config()).expect("engine");
let url = format!("{}/boom", server.uri());
let outcome = crawlberg::scrape(&engine, &url).await;
match outcome {
Ok(result) => assert_eq!(
result.status_code, 503,
"5xx must round-trip exact status; got {}",
result.status_code
),
Err(_) => { }
}
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_follows_redirect_chain() {
let server = MockServer::start().await;
let target = format!("{}/landed", server.uri());
Mock::given(method("GET"))
.and(path("/redirect"))
.respond_with(ResponseTemplate::new(301).insert_header("location", target.as_str()))
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/landed"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string("<html><body>landed</body></html>"),
)
.mount(&server)
.await;
let engine = build_engine(&crawl_config()).expect("engine");
let url = format!("{}/redirect", server.uri());
let result = crawlberg::scrape(&engine, &url).await.expect("scrape");
assert_eq!(result.status_code, 200, "redirect must end on the 200 page");
assert!(
result.final_url.contains("/landed"),
"final_url should reflect the landing path; got {}",
result.final_url
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_truncates_oversized_body() {
let big_body = "x".repeat(64 * 1024); let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/big"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/plain")
.set_body_string(big_body.clone()),
)
.mount(&server)
.await;
let cfg = CrawlConfig {
max_body_size: 4096, ..crawl_config()
};
let engine = build_engine(&cfg).expect("engine");
let url = format!("{}/big", server.uri());
let result = crawlberg::scrape(&engine, &url).await.expect("scrape");
assert!(
result.body_size <= 4096,
"max_body_size must clip; got {} bytes (cap was 4096)",
result.body_size
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_handles_empty_body() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/empty"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(""),
)
.mount(&server)
.await;
let engine = build_engine(&crawl_config()).expect("engine");
let url = format!("{}/empty", server.uri());
let result = crawlberg::scrape(&engine, &url).await.expect("scrape");
assert_eq!(result.status_code, 200);
assert_eq!(result.body_size, 0, "empty body must report 0 bytes");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_dedupes_circular_links() {
let server = MockServer::start().await;
let origin = server.uri();
let self_referencing = format!(
"<html><body><a href=\"{origin}/\">self</a><a href=\"{origin}/leaf\">leaf</a></body></html>"
);
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(self_referencing),
)
.mount(&server)
.await;
let leaf_referencing_root =
format!("<html><body><a href=\"{origin}/\">back to root</a></body></html>");
Mock::given(method("GET"))
.and(path("/leaf"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(leaf_referencing_root),
)
.mount(&server)
.await;
let cfg = CrawlConfig {
max_pages: 10,
max_depth: 4,
..crawl_config()
};
let engine = build_engine(&cfg).expect("engine");
let url = format!("{origin}/");
let result = crawlberg::crawl(&engine, &url).await.expect("crawl");
let unique = result.unique_normalized_urls();
assert!(
result.pages.len() <= unique + 1,
"crawl visited {} pages but only {} unique URLs — dedup regressed",
result.pages.len(),
unique
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_depth_zero() {
let server = MockServer::start().await;
let origin = server.uri();
let many_links = format!(
"<html><body><a href=\"{origin}/a\">a</a><a href=\"{origin}/b\">b</a>\
<a href=\"{origin}/c\">c</a></body></html>"
);
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(many_links),
)
.mount(&server)
.await;
for leaf in ["/a", "/b", "/c"] {
Mock::given(method("GET"))
.and(path(leaf))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(format!("<html><body>{leaf}</body></html>")),
)
.mount(&server)
.await;
}
let cfg = CrawlConfig {
max_pages: 20,
max_depth: 0,
..crawl_config()
};
let engine = build_engine(&cfg).expect("engine");
let url = format!("{origin}/");
let result = crawlberg::crawl(&engine, &url).await.expect("crawl");
assert_eq!(
result.pages.len(),
1,
"max_depth=0 must visit only the seed; got {} pages: {:?}",
result.pages.len(),
result.pages.iter().map(|p| &p.url).collect::<Vec<_>>()
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_pages_cap() {
let server = MockServer::start().await;
let origin = server.uri();
let leaf_list: String = (0..20)
.map(|i| format!("<a href=\"{origin}/leaf{i}\">leaf{i}</a>"))
.collect();
let index_html = format!("<html><body>{leaf_list}</body></html>");
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(index_html),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string("<html><body>leaf body</body></html>"),
)
.mount(&server)
.await;
let cfg = CrawlConfig {
max_pages: 3,
max_depth: 5,
..crawl_config()
};
let engine = build_engine(&cfg).expect("engine");
let url = format!("{origin}/");
let result = crawlberg::crawl(&engine, &url).await.expect("crawl");
assert!(
result.pages.len() <= 3,
"max_pages=3 must be a hard cap; got {} pages",
result.pages.len()
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn missing_robots_txt_defaults_to_allowed() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string("<html><body>permissive</body></html>"),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/robots.txt"))
.respond_with(ResponseTemplate::new(404))
.mount(&server)
.await;
let engine = build_engine(&crawl_config()).expect("engine");
let url = format!("{}/", server.uri());
let result = crawlberg::scrape(&engine, &url).await.expect("scrape");
assert!(
result.is_allowed,
"missing robots.txt must default to is_allowed=true"
);
assert_eq!(result.status_code, 200);
}
#[test]
fn url_newtype_strips_no_components() {
let u = Url::parse("https://docs.rs/rmcp/latest/rmcp/?q=tool#anchor").unwrap();
assert_eq!(
u.as_str(),
"https://docs.rs/rmcp/latest/rmcp/?q=tool#anchor"
);
assert_eq!(u.host_str(), Some("docs.rs"));
}
#[test]
fn url_newtype_rejects_empty_string() {
let err = Url::parse("").expect_err("empty must reject");
assert!(matches!(err, UrlError::Invalid(_)));
}
#[test]
fn url_newtype_rejects_whitespace() {
let err = Url::parse(" ").expect_err("whitespace-only must reject");
assert!(matches!(err, UrlError::Invalid(_)));
}
#[test]
fn url_newtype_inner_exposes_url_components() {
let u = Url::parse("https://example.com:8080/path").unwrap();
let inner = u.inner();
assert_eq!(inner.port(), Some(8080));
assert_eq!(inner.path(), "/path");
}
#[test]
fn url_from_str_parses() {
use std::str::FromStr;
let u: Url = Url::from_str("http://example.com").unwrap();
assert_eq!(u.host_str(), Some("example.com"));
}
#[test]
fn url_from_str_rejects_bad_scheme() {
use std::str::FromStr;
assert!(Url::from_str("ftp://example.com").is_err());
}
#[test]
fn build_engine_accepts_default_config() {
let cfg = crawl_config();
let engine = build_engine(&cfg);
assert!(
engine.is_ok(),
"default CrawlConfig must build a valid engine"
);
}
#[test]
fn build_engine_handles_tight_bounds() {
let cfg = CrawlConfig {
max_pages: 1,
max_depth: 0,
max_body_size: 1024,
..crawl_config()
};
assert!(
build_engine(&cfg).is_ok(),
"tight non-zero bounds must still build a valid engine"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn per_call_override_caps_pages_below_server_default() {
let server = MockServer::start().await;
let origin = server.uri();
let leaf_list: String = (0..20)
.map(|i| format!("<a href=\"{origin}/leaf{i}\">leaf{i}</a>"))
.collect();
let index_html = format!("<html><body>{leaf_list}</body></html>");
Mock::given(method("GET"))
.and(path("/"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string(index_html),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
.respond_with(
ResponseTemplate::new(200)
.insert_header("content-type", "text/html")
.set_body_string("<html><body>leaf body</body></html>"),
)
.mount(&server)
.await;
let server_default = CrawlConfig {
max_pages: 50,
max_depth: 5,
..crawl_config()
};
let mut per_call = server_default.clone();
per_call.max_pages = 2;
let engine = build_engine(&per_call).expect("per-call engine");
let url = format!("{origin}/");
let result = crawlberg::crawl(&engine, &url).await.expect("crawl");
assert!(
result.pages.len() <= 2,
"per-call max_pages=2 must override the server default of 50; got {} pages",
result.pages.len()
);
}