basemind 0.12.2

Full AI context layer over MCP — tree-sitter code-map, document RAG (PDF/Office/HTML/email + OCR + reranker), shared agent memory, on-demand web crawl, git history + blame + per-symbol diff. 300+ languages, 10+ coding-agent harnesses, content-addressed Fjall + LanceDB.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
//! Smoke contract for the `crawl` feature: crawlberg integration + `Url`
//! boundary validation, driven against an in-process `wiremock` server.
//!
//! No live network calls. The embedding + LanceDB write side of the pipeline
//! is exercised by `tests/mcp_smoke.rs`'s memory / documents coverage; the
//! purpose of THIS file is to pin the crawlberg plumbing — engine config,
//! result shapes, robots.txt enforcement, scheme allowlist — without paying
//! the ONNX model download cost.

#![cfg(feature = "crawl")]

use basemind::config::CrawlConfig;
use basemind::url::{Url, UrlError};
use basemind::web::build_engine;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};

/// Builds the baseline [`CrawlConfig`] shared by every test in this suite.
///
/// The mock servers used here listen on `127.0.0.1`, and crawlberg refuses
/// private/loopback targets unless told otherwise (its SSRF guard). That secure
/// default stays in place for production; these local-server tests are the only
/// callers that switch `allow_private_network` on.
fn crawl_config() -> CrawlConfig {
    let defaults = CrawlConfig::default();
    CrawlConfig {
        allow_private_network: true,
        ..defaults
    }
}

/// HTML served at `/` by `spin_up_site`. Contains the phrase
/// "reticulating splines" that the scrape test searches for, plus links to
/// `/about` and `/forbidden`.
const PAGE_INDEX: &str = "<html><head><title>basemind smoke</title></head>\
  <body><h1>Index</h1><p>The known phrase here is reticulating splines.</p>\
  <a href=\"/about\">about</a><a href=\"/forbidden\">forbidden</a></body></html>";

/// HTML served at `/about` by `spin_up_site`.
const PAGE_ABOUT: &str = "<html><body><h1>About</h1><p>Second indexable page.</p></body></html>";

/// robots.txt body served by `spin_up_site`: disallows `/forbidden` for every
/// user agent.
const ROBOTS_TXT: &str = "User-agent: *\nDisallow: /forbidden\n";

/// Sitemap template listing `/` and `/about`. The `{ORIGIN}` placeholder is
/// replaced with the mock server's URI in `spin_up_site` before serving.
const SITEMAP_XML: &str = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
  <urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\
    <url><loc>{ORIGIN}/</loc></url>\
    <url><loc>{ORIGIN}/about</loc><lastmod>2025-01-01</lastmod></url>\
  </urlset>";

/// Starts a `wiremock` server and mounts the fixture site on it.
///
/// Five `GET` routes are registered, each answering 200: `/`, `/about`,
/// `/forbidden`, `/robots.txt` and `/sitemap.xml`. The started server is
/// returned so callers can read its URI.
async fn spin_up_site() -> MockServer {
    let server = MockServer::start().await;

    // The sitemap template carries an `{ORIGIN}` placeholder; fill it in now
    // that the server's address is known.
    let sitemap = SITEMAP_XML.replace("{ORIGIN}", &server.uri());

    let html = "text/html; charset=utf-8";
    // (request path, content-type header, response body), in mount order.
    let routes: [(&str, &str, &str); 5] = [
        ("/", html, PAGE_INDEX),
        ("/about", html, PAGE_ABOUT),
        (
            "/forbidden",
            html,
            "<html><body>should not be fetched</body></html>",
        ),
        ("/robots.txt", "text/plain", ROBOTS_TXT),
        ("/sitemap.xml", "application/xml", sitemap.as_str()),
    ];

    for (route, content_type, body) in routes {
        let response = ResponseTemplate::new(200)
            .insert_header("content-type", content_type)
            .set_body_string(body);
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(&server)
            .await;
    }

    server
}

// ─── Url newtype boundary ───────────────────────────────────────────────────

/// Deserializing a `file://` URL through serde must fail, and the error text
/// must mention the offending scheme.
#[test]
fn url_newtype_rejects_file_scheme_via_serde() {
    // The input is a JSON string literal, hence the escaped inner quotes.
    let err = serde_json::from_str::<Url>("\"file:///etc/passwd\"")
        .expect_err("file:// must be rejected at deserialize");
    let mentions_scheme = err.to_string().contains("file");
    assert!(
        mentions_scheme,
        "error should name the scheme; got: {err}"
    );
}

/// `javascript:` URLs must be refused with `UrlError::DisallowedScheme`
/// carrying the scheme name.
#[test]
fn url_newtype_rejects_javascript_scheme() {
    let err = Url::parse("javascript:alert(1)").expect_err("must reject");
    let is_javascript_rejection =
        matches!(&err, UrlError::DisallowedScheme(scheme) if scheme == "javascript");
    assert!(
        is_javascript_rejection,
        "expected DisallowedScheme(javascript), got {err:?}"
    );
}

/// Plain `http` and `https` URLs (including query and fragment) must parse.
#[test]
fn url_newtype_accepts_http_https() {
    for candidate in ["http://example.com", "https://example.com/page?q=1#frag"] {
        assert!(Url::parse(candidate).is_ok());
    }
}

// ─── crawlberg integration (against wiremock) ──────────────────────────────

/// Scraping the fixture root must return status 200, be allowed by
/// robots.txt, and carry the known phrase from the index page.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_returns_200_and_body() {
    let site = spin_up_site().await;
    let config = crawl_config();
    let engine = build_engine(&config).expect("build engine");

    let root = format!("{}/", site.uri());
    let page = crawlberg::scrape(&engine, &root).await.expect("scrape root");

    assert_eq!(page.status_code, 200, "scrape should hit the mock 200");
    assert!(page.is_allowed, "robots.txt must allow /");

    // Prefer the markdown rendering when present; otherwise use the raw HTML.
    let body = match page.markdown.as_ref() {
        Some(markdown) => markdown.content.as_str(),
        None => page.html.as_str(),
    };
    assert!(
        body.contains("reticulating splines"),
        "expected known phrase in scraped body; got: {body:?}"
    );
}

/// The fixture robots.txt disallows `/forbidden`; a scrape of that path must
/// still return a result, but flagged as not allowed.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn robots_txt_blocks_forbidden_path() {
    let site = spin_up_site().await;
    // Relies on robots.txt enforcement being active in the suite's config.
    let config = crawl_config();
    let engine = build_engine(&config).expect("build engine");

    let forbidden = format!("{}/forbidden", site.uri());
    let scraped = crawlberg::scrape(&engine, &forbidden)
        .await
        .expect("scrape returns even when robots forbids");

    let allowed = scraped.is_allowed;
    assert!(
        !allowed,
        "/forbidden must be blocked by robots.txt"
    );
}

/// URL discovery against the fixture site must surface at least one URL.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn map_urls_discovers_sitemap_entries() {
    let server = spin_up_site().await;
    let cfg = crawl_config();
    let engine = build_engine(&cfg).expect("build engine");

    let url = format!("{}/", server.uri());
    let map = crawlberg::map_urls(&engine, &url)
        .await
        .expect("map_urls succeeds");

    // The sitemap lists 2 URLs; crawlberg may also discover links from the
    // root page, so only assert >= 1 (the bare minimum that signals discovery
    // actually ran).
    // NOTE(review): nothing here checks that `/about` specifically is among
    // the results — add that assertion if the contract should pin it.
    assert!(
        !map.urls.is_empty(),
        "map_urls must surface at least one URL"
    );
}

/// A bounded crawl from the fixture root must return at least one page, and
/// at least one 200 page with a non-empty body.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_visits_seed_and_returns_pages() {
    let site = spin_up_site().await;
    // Small page/depth limits keep the crawl short.
    let bounded = CrawlConfig {
        max_pages: 4,
        max_depth: 1,
        ..crawl_config()
    };
    let engine = build_engine(&bounded).expect("build engine");

    let seed = format!("{}/", site.uri());
    let outcome = crawlberg::crawl(&engine, &seed).await.expect("crawl");

    let got_pages = !outcome.pages.is_empty();
    assert!(
        got_pages,
        "crawl from the seed must produce at least one page"
    );

    // Any page that came back 200 will do; it is not necessarily the seed.
    let ok_page = outcome
        .pages
        .iter()
        .find(|page| page.status_code == 200)
        .expect("at least one successful page");
    let body = match ok_page.markdown.as_ref() {
        Some(markdown) => markdown.content.as_str(),
        None => ok_page.html.as_str(),
    };
    assert!(!body.is_empty(), "crawled page must have a non-empty body");
}

// ─── SSRF redirect bypass (C1) ──────────────────────────────────────────────

/// Pins the post-fetch SSRF check for redirects.
///
/// crawlberg follows HTTP redirects on its own, so a seed that passed the
/// seed-URL denylist can still 302 to a private host such as
/// `http://169.254.169.254/` (the cloud metadata endpoint). The MCP web helpers
/// therefore re-validate the URL the crawler actually landed on (`final_url`)
/// through `Url::parse` before indexing. Here wiremock 302s to that private
/// URL, the real `crawlberg::scrape` is driven, and `Url::parse` (which backs
/// the helper's `reject_redirected_private_url`) must reject the landing URL.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn redirect_to_private_host_is_rejected_post_fetch() {
    // The AWS link-local metadata endpoint: used both as the redirect target
    // and as the URL the denylist must refuse.
    let private_target = "http://169.254.169.254/latest/meta-data/";

    let mock = MockServer::start().await;
    let redirect = ResponseTemplate::new(302).insert_header("location", private_target);
    Mock::given(method("GET"))
        .and(path("/redirect"))
        .respond_with(redirect)
        .mount(&mock)
        .await;
    // A 404 for robots.txt, so the seed fetch is not blocked before it reaches
    // the redirect.
    Mock::given(method("GET"))
        .and(path("/robots.txt"))
        .respond_with(ResponseTemplate::new(404))
        .mount(&mock)
        .await;

    let config = crawl_config();
    let engine = build_engine(&config).expect("build engine");
    let seed = format!("{}/redirect", mock.uri());

    // The seed is the local wiremock server; the SSRF risk being modelled only
    // appears once crawlberg follows the redirect. Whatever the crawler reports
    // as final, the post-fetch denylist must refuse a private landing host.
    let target_rejected = matches!(Url::parse(private_target), Err(UrlError::PrivateHost(_)));
    assert!(
        target_rejected,
        "post-fetch denylist must reject the link-local redirect target"
    );

    // Best-effort: only when the scrape succeeds AND reports the private
    // target as its final URL do we check that it is rejected the same way.
    if let Ok(scraped) = crawlberg::scrape(&engine, &seed).await
        && scraped.final_url.contains("169.254.169.254")
    {
        assert!(
            matches!(Url::parse(&scraped.final_url), Err(UrlError::PrivateHost(_))),
            "final_url after redirect must be rejected by the denylist; got {}",
            scraped.final_url
        );
    }
}

// ─── HTTP error paths ───────────────────────────────────────────────────────

/// A 404 must be visible to the caller. Two surfaces are accepted: an `Ok`
/// result whose `status_code` is an error code, or an `Err`. What this guards
/// against is a missing URL being reported as a success.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_404_does_not_silently_succeed() {
    let mock = MockServer::start().await;
    let not_found = ResponseTemplate::new(404).set_body_string("not here");
    Mock::given(method("GET"))
        .and(path("/missing"))
        .respond_with(not_found)
        .mount(&mock)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let missing = format!("{}/missing", mock.uri());

    // An `Err` (the CrawlError surface) is also acceptable, so only the `Ok`
    // case needs checking.
    if let Ok(scraped) = crawlberg::scrape(&engine, &missing).await {
        assert!(
            scraped.status_code >= 400,
            "404 must not appear as 2xx; got status {}",
            scraped.status_code
        );
    }
}

/// A 503 must either come back as an `Err` or as an `Ok` result carrying
/// exactly status 503.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_5xx_surfaces_status_or_error() {
    let mock = MockServer::start().await;
    let unavailable = ResponseTemplate::new(503).set_body_string("upstream down");
    Mock::given(method("GET"))
        .and(path("/boom"))
        .respond_with(unavailable)
        .mount(&mock)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let failing = format!("{}/boom", mock.uri());

    // An `Err` (the CrawlError surface) is also acceptable, so only the `Ok`
    // case needs checking.
    if let Ok(scraped) = crawlberg::scrape(&engine, &failing).await {
        assert_eq!(
            scraped.status_code, 503,
            "5xx must round-trip exact status; got {}",
            scraped.status_code
        );
    }
}

/// A 301 from `/redirect` to `/landed` must be followed: the scrape ends on
/// the 200 page and `final_url` names the landing path.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_follows_redirect_chain() {
    let mock = MockServer::start().await;
    let landing = format!("{}/landed", mock.uri());

    let hop = ResponseTemplate::new(301).insert_header("location", landing.as_str());
    Mock::given(method("GET"))
        .and(path("/redirect"))
        .respond_with(hop)
        .mount(&mock)
        .await;

    let destination = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string("<html><body>landed</body></html>");
    Mock::given(method("GET"))
        .and(path("/landed"))
        .respond_with(destination)
        .mount(&mock)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let start = format!("{}/redirect", mock.uri());
    let scraped = crawlberg::scrape(&engine, &start).await.expect("scrape");

    assert_eq!(scraped.status_code, 200, "redirect must end on the 200 page");
    let landed = scraped.final_url.contains("/landed");
    assert!(
        landed,
        "final_url should reflect the landing path; got {}",
        scraped.final_url
    );
}

/// A response larger than `max_body_size` must be clipped: the server sends
/// 64 KiB, the config caps at 4 KiB, and the reported `body_size` must not
/// exceed the cap.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_truncates_oversized_body() {
    let big_body = "x".repeat(64 * 1024); // 64 KiB
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .and(path("/big"))
        .respond_with(
            ResponseTemplate::new(200)
                .insert_header("content-type", "text/plain")
                // `big_body` is never used again, so move it into the template
                // instead of cloning 64 KiB.
                .set_body_string(big_body),
        )
        .mount(&server)
        .await;

    let cfg = CrawlConfig {
        max_body_size: 4096, // 4 KiB cap
        ..crawl_config()
    };
    let engine = build_engine(&cfg).expect("engine");
    let url = format!("{}/big", server.uri());
    let result = crawlberg::scrape(&engine, &url).await.expect("scrape");

    assert!(
        result.body_size <= 4096,
        "max_body_size must clip; got {} bytes (cap was 4096)",
        result.body_size
    );
}

/// A 200 response with an empty body must scrape successfully and report a
/// body size of zero.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn scrape_handles_empty_body() {
    let mock = MockServer::start().await;
    let blank = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string("");
    Mock::given(method("GET"))
        .and(path("/empty"))
        .respond_with(blank)
        .mount(&mock)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let target = format!("{}/empty", mock.uri());
    let scraped = crawlberg::scrape(&engine, &target).await.expect("scrape");

    assert_eq!(scraped.status_code, 200);
    assert_eq!(scraped.body_size, 0, "empty body must report 0 bytes");
}

// ─── Crawl bounds + dedup ───────────────────────────────────────────────────

/// Circular links must not inflate the visited set: the root links to itself
/// and to `/leaf`, and `/leaf` links back to the root. The page count is
/// compared against the result's `unique_normalized_urls()`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_dedupes_circular_links() {
    let mock = MockServer::start().await;
    let origin = mock.uri();

    let root_html = format!(
        "<html><body><a href=\"{origin}/\">self</a><a href=\"{origin}/leaf\">leaf</a></body></html>"
    );
    let root_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string(root_html);
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(root_page)
        .mount(&mock)
        .await;

    let leaf_html = format!("<html><body><a href=\"{origin}/\">back to root</a></body></html>");
    let leaf_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string(leaf_html);
    Mock::given(method("GET"))
        .and(path("/leaf"))
        .respond_with(leaf_page)
        .mount(&mock)
        .await;

    let roomy = CrawlConfig {
        max_pages: 10,
        max_depth: 4,
        ..crawl_config()
    };
    let engine = build_engine(&roomy).expect("engine");
    let seed = format!("{origin}/");
    let outcome = crawlberg::crawl(&engine, &seed).await.expect("crawl");

    // The bound allows one page of slack above the unique-URL count.
    let unique = outcome.unique_normalized_urls();
    let visited = outcome.pages.len();
    assert!(
        visited <= unique + 1,
        "crawl visited {} pages but only {} unique URLs — dedup regressed",
        outcome.pages.len(),
        unique
    );
}

/// With `max_depth = 0` the crawl must stop at the seed: the root page links
/// to three leaves, yet exactly one page may be returned.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_depth_zero() {
    let mock = MockServer::start().await;
    let origin = mock.uri();

    let root_html = format!(
        "<html><body><a href=\"{origin}/a\">a</a><a href=\"{origin}/b\">b</a>\
         <a href=\"{origin}/c\">c</a></body></html>"
    );
    let root_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string(root_html);
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(root_page)
        .mount(&mock)
        .await;

    // The leaves are served too, so a crawl that wrongly followed links would
    // find real pages rather than errors.
    for leaf in ["/a", "/b", "/c"] {
        let leaf_page = ResponseTemplate::new(200)
            .insert_header("content-type", "text/html")
            .set_body_string(format!("<html><body>{leaf}</body></html>"));
        Mock::given(method("GET"))
            .and(path(leaf))
            .respond_with(leaf_page)
            .mount(&mock)
            .await;
    }

    let seed_only = CrawlConfig {
        max_pages: 20,
        max_depth: 0,
        ..crawl_config()
    };
    let engine = build_engine(&seed_only).expect("engine");
    let seed = format!("{origin}/");
    let outcome = crawlberg::crawl(&engine, &seed).await.expect("crawl");

    assert_eq!(
        outcome.pages.len(),
        1,
        "max_depth=0 must visit only the seed; got {} pages: {:?}",
        outcome.pages.len(),
        outcome.pages.iter().map(|page| &page.url).collect::<Vec<_>>()
    );
}

/// `max_pages` caps the number of visited pages. The index links to 20 leaves
/// and `max_depth` is generous, so only the cap of 3 can keep the result small.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn crawl_respects_max_pages_cap() {
    let mock = MockServer::start().await;
    let origin = mock.uri();

    // Twenty anchors, `/leaf0` through `/leaf19`, concatenated into one string.
    let anchors = (0..20)
        .map(|n| format!("<a href=\"{origin}/leaf{n}\">leaf{n}</a>"))
        .collect::<String>();
    let index_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string(format!("<html><body>{anchors}</body></html>"));
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(index_page)
        .mount(&mock)
        .await;

    // One regex route answers for every leaf.
    let leaf_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string("<html><body>leaf body</body></html>");
    Mock::given(method("GET"))
        .and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
        .respond_with(leaf_page)
        .mount(&mock)
        .await;

    let capped = CrawlConfig {
        max_pages: 3,
        max_depth: 5,
        ..crawl_config()
    };
    let engine = build_engine(&capped).expect("engine");
    let seed = format!("{origin}/");
    let outcome = crawlberg::crawl(&engine, &seed).await.expect("crawl");

    let visited = outcome.pages.len();
    assert!(
        visited <= 3,
        "max_pages=3 must be a hard cap; got {} pages",
        outcome.pages.len()
    );
}

/// A 404 for `/robots.txt` must be treated as "everything allowed". Otherwise
/// basemind would silently refuse every site that ships no robots.txt.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn missing_robots_txt_defaults_to_allowed() {
    let mock = MockServer::start().await;

    let home = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string("<html><body>permissive</body></html>");
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(home)
        .mount(&mock)
        .await;
    Mock::given(method("GET"))
        .and(path("/robots.txt"))
        .respond_with(ResponseTemplate::new(404))
        .mount(&mock)
        .await;

    let engine = build_engine(&crawl_config()).expect("engine");
    let root = format!("{}/", mock.uri());
    let scraped = crawlberg::scrape(&engine, &root).await.expect("scrape");

    let allowed = scraped.is_allowed;
    assert!(
        allowed,
        "missing robots.txt must default to is_allowed=true"
    );
    assert_eq!(scraped.status_code, 200);
}

// ─── Url newtype: extra boundary cases ──────────────────────────────────────

/// Parsing must keep path, query and fragment intact: `as_str` returns the
/// input unchanged and `host_str` exposes the host.
#[test]
fn url_newtype_strips_no_components() {
    let raw = "https://docs.rs/rmcp/latest/rmcp/?q=tool#anchor";
    let parsed = Url::parse(raw).unwrap();
    assert_eq!(parsed.as_str(), raw);
    assert_eq!(parsed.host_str(), Some("docs.rs"));
}

/// The empty string is not a URL and must yield `UrlError::Invalid`.
#[test]
fn url_newtype_rejects_empty_string() {
    let outcome = Url::parse("");
    let err = outcome.expect_err("empty must reject");
    let is_invalid = matches!(err, UrlError::Invalid(_));
    assert!(is_invalid);
}

/// A whitespace-only string must yield `UrlError::Invalid`.
#[test]
fn url_newtype_rejects_whitespace() {
    let outcome = Url::parse("   ");
    let err = outcome.expect_err("whitespace-only must reject");
    let is_invalid = matches!(err, UrlError::Invalid(_));
    assert!(is_invalid);
}

/// `inner()` must give access to the parsed components, checked here via the
/// port and the path.
#[test]
fn url_newtype_inner_exposes_url_components() {
    let parsed = Url::parse("https://example.com:8080/path").unwrap();
    let components = parsed.inner();
    assert_eq!(components.port(), Some(8080));
    assert_eq!(components.path(), "/path");
}

/// The `FromStr` implementation must accept a plain `http` URL.
#[test]
fn url_from_str_parses() {
    use std::str::FromStr;
    let parsed = <Url as FromStr>::from_str("http://example.com").unwrap();
    assert_eq!(parsed.host_str(), Some("example.com"));
}

/// The `FromStr` implementation must refuse an `ftp` URL.
#[test]
fn url_from_str_rejects_bad_scheme() {
    use std::str::FromStr;
    let outcome = Url::from_str("ftp://example.com");
    assert!(outcome.is_err());
}

// ─── build_engine error surface ─────────────────────────────────────────────

/// The suite's baseline config (see `crawl_config`) must produce an engine.
#[test]
fn build_engine_accepts_default_config() {
    let built = build_engine(&crawl_config());
    assert!(
        built.is_ok(),
        "default CrawlConfig must build a valid engine"
    );
}

/// Very small limits (one page, depth zero, 1 KiB body cap) must still yield
/// an engine.
#[test]
fn build_engine_handles_tight_bounds() {
    let tight = CrawlConfig {
        max_pages: 1,
        max_depth: 0,
        max_body_size: 1024,
        ..crawl_config()
    };
    let built = build_engine(&tight);
    assert!(
        built.is_ok(),
        "tight non-zero bounds must still build a valid engine"
    );
}

// ─── Per-call crawl override (mirrors helpers_web::per_call_engine) ──────────

/// Reproduces how `web_crawl` applies per-call limits: the server's `[crawl]`
/// config is cloned, `max_pages` / `max_depth` are overridden on the copy, and
/// a one-shot engine is built from it. Starting from a permissive server
/// default and overriding down to `max_pages = 2`, the crawl must obey the
/// per-call cap rather than the server default.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn per_call_override_caps_pages_below_server_default() {
    let mock = MockServer::start().await;
    let origin = mock.uri();

    // Twenty anchors, `/leaf0` through `/leaf19`, concatenated into one string.
    let anchors = (0..20)
        .map(|n| format!("<a href=\"{origin}/leaf{n}\">leaf{n}</a>"))
        .collect::<String>();
    let index_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string(format!("<html><body>{anchors}</body></html>"));
    Mock::given(method("GET"))
        .and(path("/"))
        .respond_with(index_page)
        .mount(&mock)
        .await;

    // One regex route answers for every leaf.
    let leaf_page = ResponseTemplate::new(200)
        .insert_header("content-type", "text/html")
        .set_body_string("<html><body>leaf body</body></html>");
    Mock::given(method("GET"))
        .and(wiremock::matchers::path_regex("^/leaf[0-9]+$"))
        .respond_with(leaf_page)
        .mount(&mock)
        .await;

    // The server-wide default is permissive (it would allow many pages)…
    let server_default = CrawlConfig {
        max_pages: 50,
        max_depth: 5,
        ..crawl_config()
    };
    // …while the per-call copy is clamped to 2 pages, mirroring what
    // `per_call_engine` does for an MCP/CLI `web_crawl { max_pages: 2 }`.
    let per_call = CrawlConfig {
        max_pages: 2,
        ..server_default.clone()
    };
    let engine = build_engine(&per_call).expect("per-call engine");

    let seed = format!("{origin}/");
    let outcome = crawlberg::crawl(&engine, &seed).await.expect("crawl");

    let visited = outcome.pages.len();
    assert!(
        visited <= 2,
        "per-call max_pages=2 must override the server default of 50; got {} pages",
        outcome.pages.len()
    );
}