basemind 0.4.0-rc.2

Full AI context layer over MCP — tree-sitter code-map, document RAG (PDF/Office/HTML/email + OCR + reranker), shared agent memory, on-demand web crawl, git history + blame + per-symbol diff. 300+ languages, 8 coding-agent harnesses, content-addressed Fjall + LanceDB.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
//! Helper bodies for the three web ingestion MCP tools.
//!
//! Each helper:
//!  1. resolves the shared crawl engine + embedder + LanceDB store from
//!     `ServerState` (returning an MCP error when the feature was compiled in
//!     but the engine failed to initialize),
//!  2. runs the kreuzcrawl operation on the request URL,
//!  3. routes resulting page bodies through [`crate::web::ingest::index_page`]
//!     to land them in the existing `documents` LanceDB table.
//!
//! The whole module is gated on `feature = "crawl"` — when the feature is off
//! the file is excluded from compilation entirely, and the corresponding tool
//! router is not registered on `BasemindServer`.

#![cfg(feature = "crawl")]

use std::sync::Arc;

use rmcp::ErrorData as McpError;
use rmcp::model::CallToolResult;

use super::ServerState;
use super::helpers::json_result;
use super::memory::lance_store;
use super::types::{
    WebCrawlPageOutcome, WebCrawlParams, WebCrawlResponse, WebMapEntry, WebMapParams,
    WebMapResponse, WebScrapeParams, WebScrapeResponse,
};
use crate::embeddings::SharedEmbedder;
use crate::web::ingest::{default_scope, index_page};

/// Wrap any displayable error as an MCP internal error whose message reads
/// `"<prefix>: <err>"`. No structured error data is attached.
fn mcp_internal(prefix: &str, err: impl std::fmt::Display) -> McpError {
    let message = format!("{prefix}: {err}");
    McpError::internal_error(message, None)
}

/// Post-fetch SSRF guard (defence-in-depth) for a URL kreuzcrawl actually
/// landed on.
///
/// Requested URLs are already checked against the private-host denylist by
/// `Url::parse`, but kreuzcrawl follows HTTP redirects on its own. A public
/// seed can therefore 30x-redirect to a private target
/// (`https://evil.com` → 302 → `http://169.254.169.254/`) that seed validation
/// never saw. This function pushes the *fetched* URL back through the same
/// parser and refuses to index it when the parse fails.
///
/// Limits: the GET against the private host has already happened by the time
/// `final_url` / `normalized_url` is visible here. Blocking the redirect fetch
/// itself would need a redirect-policy hook inside kreuzcrawl's HTTP client,
/// which is un-vendored here. What this layer guarantees is that private-host
/// content never lands in the index.
///
/// # Errors
///
/// Returns `invalid_params`, prefixed with `context`, when `fetched_url` hits
/// the private-host denylist or fails to parse for any other reason.
fn reject_redirected_private_url(context: &str, fetched_url: &str) -> Result<(), McpError> {
    let parse_error = match crate::url::Url::parse(fetched_url) {
        Ok(_) => return Ok(()),
        Err(parse_error) => parse_error,
    };

    let message = match parse_error {
        crate::url::UrlError::PrivateHost(host) => format!(
            "{context}: refusing to index private/loopback host reached via redirect: {host} \
             (set BASEMIND_ALLOW_PRIVATE_HOSTS=1 to allow)"
        ),
        // Any other parse failure (e.g. an exotic scheme the crawler normalised
        // to) is equally unsafe to index, so fail closed rather than open.
        other => {
            format!("{context}: refusing to index unparsable fetched URL {fetched_url:?}: {other}")
        }
    };

    Err(McpError::invalid_params(message, None))
}

/// Pick the LanceDB scope tag under which a fetched page is stored.
///
/// Precedence:
///  1. an explicit caller-supplied `scope`, returned verbatim;
///  2. the default scope of the page's *final* URL (after redirects), so a
///     cross-host redirect (e.g. `example.com` → `cdn.example.net`) files the
///     rows under the host they actually came from and
///     `search_documents { scope: "web:<host>" }` finds them;
///  3. the default scope of the *requested* URL, used only when the final URL
///     does not parse. That should not happen for an http/https response, but a
///     server-supplied string must never cause a panic.
fn resolve_scope(explicit: Option<&str>, requested: &crate::url::Url, final_url: &str) -> String {
    explicit.map_or_else(
        || {
            let landed = crate::url::Url::parse(final_url);
            default_scope(landed.as_ref().unwrap_or(requested))
        },
        str::to_owned,
    )
}

/// Validate a per-call crawl override against the schema's `min = 1` bound.
///
/// `None` (inherit the server default) and any `Some(n)` with `n >= 1` are
/// accepted; `Some(0)` is rejected with an `invalid_params` error that names
/// `field`.
#[cfg(feature = "crawl")]
fn reject_zero_override(field: &str, value: Option<u32>) -> Result<(), McpError> {
    match value {
        Some(0) => Err(McpError::invalid_params(
            format!("{field} must be >= 1"),
            None,
        )),
        Some(_) | None => Ok(()),
    }
}

/// Construct a one-shot kreuzcrawl engine whose `max_pages` / `max_depth` caps
/// apply to this request only.
///
/// kreuzcrawl bakes the page/depth caps into the engine handle, so a per-call
/// override cannot be applied to the shared engine. Instead the server's
/// `[crawl]` config is cloned, the supplied overrides are written into the
/// clone, and a fresh engine is built from it. The shared config is never
/// mutated. A `None` override keeps the server default for that field.
///
/// # Errors
///
/// Returns an MCP internal error when the engine cannot be built from the
/// adjusted config.
#[cfg(feature = "crawl")]
fn per_call_engine(
    state: &ServerState,
    max_pages: Option<u32>,
    max_depth: Option<u32>,
) -> Result<kreuzcrawl::CrawlEngineHandle, McpError> {
    let mut overridden = state.config.crawl.clone();
    overridden.max_pages = max_pages.unwrap_or(overridden.max_pages);
    overridden.max_depth = max_depth.unwrap_or(overridden.max_depth);

    match crate::web::build_engine(&overridden) {
        Ok(handle) => Ok(handle),
        Err(e) => Err(mcp_internal("build per-call crawl engine", e)),
    }
}

/// Return the process-wide embedder, loading it on first use.
///
/// The model is selected by `documents.embedding_preset` from the server
/// config rather than a hardcoded preset. The disk scanner embeds with that
/// same setting; if serve loaded a different model, the LanceStore
/// (dim, model) mismatch would wipe the table on the next open, so serve and
/// disk scans must agree on the preset.
///
/// # Errors
///
/// Returns an MCP internal error carrying the `load embedder: …` message when
/// the model fails to load.
async fn embedder(state: &ServerState) -> Result<Arc<SharedEmbedder>, McpError> {
    let preset = state.config.documents.embedding_preset.clone();

    let initialised = state
        .embedder
        .get_or_try_init(|| async {
            match SharedEmbedder::load(&preset) {
                Ok(loaded) => Ok(Arc::new(loaded)),
                Err(e) => Err(format!("load embedder: {e}")),
            }
        })
        .await;

    match initialised {
        Ok(shared) => Ok(Arc::clone(shared)),
        Err(message) => Err(McpError::internal_error(message.clone(), None)),
    }
}

/// Borrow the shared crawl engine from `ServerState`.
///
/// # Errors
///
/// Returns an MCP internal error when `state.crawl_engine` is `None`, i.e. the
/// engine was not initialised.
fn engine(state: &ServerState) -> Result<&kreuzcrawl::CrawlEngineHandle, McpError> {
    match state.crawl_engine.as_ref() {
        Some(handle) => Ok(handle),
        None => Err(McpError::internal_error(
            "crawl engine not initialised; check basemind serve startup logs",
            None,
        )),
    }
}

/// Body of the `web_scrape` tool: fetch one URL and optionally index it.
///
/// Steps:
///  1. scrape `params.url` with the shared crawl engine;
///  2. re-validate the URL the crawler actually landed on (post-redirect SSRF
///     guard) and bail out before touching the index if it is rejected;
///  3. resolve the scope from the explicit `params.scope` or the final URL;
///  4. when `params.index` is set, embed and store the page body on a blocking
///     thread; otherwise only report the body size.
///
/// The page body is the markdown rendering when it is non-blank, else the raw
/// HTML.
///
/// # Errors
///
/// Returns an MCP error when the engine is missing, the scrape fails, the
/// final URL is rejected by the SSRF guard, or (when indexing) the store,
/// embedder, blocking task, or `index_page` fails.
pub(super) async fn run_web_scrape(
    state: &ServerState,
    params: WebScrapeParams,
) -> Result<CallToolResult, McpError> {
    let handle = engine(state)?;
    let requested = params.url.as_str().to_string();

    let page = match kreuzcrawl::scrape(handle, &requested).await {
        Ok(page) => page,
        Err(e) => return Err(mcp_internal("kreuzcrawl scrape", e)),
    };

    // kreuzcrawl may have followed a redirect from the (validated) seed to a
    // private host; check where we actually ended up before indexing anything.
    reject_redirected_private_url("web_scrape", &page.final_url)?;

    // Rows are keyed by `page.final_url`, so the scope has to describe that
    // host rather than the one that was requested.
    let scope = resolve_scope(params.scope.as_deref(), &params.url, &page.final_url);

    // Prefer the markdown rendering; fall back to raw HTML when it is absent
    // or blank.
    let body_text: String = match page.markdown.as_ref() {
        Some(md) if !md.content.trim().is_empty() => md.content.clone(),
        _ => page.html.clone(),
    };

    let (bytes, chunks_indexed) = if params.index {
        let lance = lance_store(state).await?;
        let embed = embedder(state).await?;
        let documents_cfg = state.config.documents.clone();
        // Owned copies for the blocking closure; the originals are still
        // needed for the response below.
        let task_scope = scope.clone();
        let task_url = page.final_url.clone();
        let task_mime = page.content_type.clone();

        let indexed = tokio::task::spawn_blocking(move || {
            index_page(
                lance.as_ref(),
                &embed,
                &documents_cfg,
                &task_scope,
                &task_url,
                &task_mime,
                &body_text,
            )
        })
        .await
        .map_err(|e| mcp_internal("spawn_blocking", e))?
        .map_err(|e| mcp_internal("index_page", e))?;

        (indexed.bytes, indexed.chunks_indexed)
    } else {
        // Not indexing: report the size of the body we would have stored.
        (body_text.len(), 0)
    };

    json_result(&WebScrapeResponse {
        url: requested,
        final_url: page.final_url,
        status_code: page.status_code,
        content_type: page.content_type,
        bytes,
        chunks_indexed,
        indexed: chunks_indexed > 0,
        scope,
    })
}

pub(super) async fn run_web_crawl(
    state: &ServerState,
    params: WebCrawlParams,
) -> Result<CallToolResult, McpError> {
    // Apply the per-call max_pages / max_depth overrides by building a one-shot
    // engine from the server config with those fields replaced. kreuzcrawl bakes
    // the caps into the engine handle, so a per-call override needs its own
    // engine; `None` overrides inherit the server `[crawl]` default. Make sure
    // the shared engine exists before paying for a per-call one so the error
    // surface matches the other web tools.
    engine(state)?;
    // Reject zero overrides at the boundary: the JSON schema declares `min = 1`
    // for both, but a hand-crafted MCP request can still send `0`, which would
    // bake a degenerate crawl (0 pages / 0 depth) into the per-call engine.
    // `None` keeps the server default.
    reject_zero_override("max_pages", params.max_pages)?;
    reject_zero_override("max_depth", params.max_depth)?;
    let engine = per_call_engine(state, params.max_pages, params.max_depth)?;
    let url_str = params.url.as_str().to_string();

    let crawl_outcome = kreuzcrawl::crawl(&engine, &url_str)
        .await
        .map_err(|e| mcp_internal("kreuzcrawl crawl", e))?;

    // Top-level scope echoed in the response: explicit when supplied, else
    // derived from the seed URL's host. Per-page rows derive their own scope
    // from the page's final URL below (a crawl can span subdomains).
    let scope = params
        .scope
        .clone()
        .unwrap_or_else(|| default_scope(&params.url));

    let pages_visited = crawl_outcome.pages.len();
    let lance = lance_store(state).await?;
    let embedder = embedder(state).await?;
    let documents_cfg = state.config.documents.clone();

    let mut total_chunks = 0usize;
    let mut pages_indexed = 0usize;
    let mut outcomes: Vec<WebCrawlPageOutcome> = Vec::with_capacity(crawl_outcome.pages.len());

    for page in crawl_outcome.pages {
        // POST-FETCH SSRF guard (defence-in-depth): a crawl can follow links /
        // redirects from a public seed onto a private host. Re-validate the URL
        // each page actually came from and skip indexing it when it resolves to
        // a private / loopback / link-local host. See
        // `reject_redirected_private_url` for why this can't block the GET
        // itself (kreuzcrawl owns the redirect policy, un-vendored here).
        if let Err(error) = reject_redirected_private_url("web_crawl", &page.normalized_url) {
            tracing::warn!(
                url = %page.normalized_url,
                "web_crawl: skipping private/loopback page reached via crawl"
            );
            outcomes.push(WebCrawlPageOutcome {
                url: page.normalized_url,
                status_code: page.status_code,
                chunks_indexed: 0,
                indexed: false,
                error: Some(error.message.to_string()),
            });
            continue;
        }

        let body_text = page
            .markdown
            .as_ref()
            .map(|m| m.content.clone())
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| page.html.clone());

        // Each page is stored under `page.normalized_url`; derive its scope from
        // that same URL's host so rows land under the host they came from, not
        // the seed host (a crawl can follow links across subdomains). An
        // explicit caller scope still wins for every page.
        let page_scope = match params.scope.as_deref() {
            Some(s) => s.to_string(),
            None => match crate::url::Url::parse(&page.normalized_url) {
                Ok(u) => default_scope(&u),
                Err(_) => scope.clone(),
            },
        };

        let lance_for_block = Arc::clone(&lance);
        let embedder_for_block = Arc::clone(&embedder);
        let docs_for_block = documents_cfg.clone();
        let scope_for_block = page_scope;
        let path_for_block = page.normalized_url.clone();
        let mime_for_block = page.content_type.clone();

        let res = tokio::task::spawn_blocking(move || {
            index_page(
                lance_for_block.as_ref(),
                &embedder_for_block,
                &docs_for_block,
                &scope_for_block,
                &path_for_block,
                &mime_for_block,
                &body_text,
            )
        })
        .await;

        let outcome = match res {
            Ok(Ok(indexed)) => {
                if indexed.chunks_indexed > 0 {
                    pages_indexed += 1;
                    total_chunks += indexed.chunks_indexed;
                }
                WebCrawlPageOutcome {
                    url: page.normalized_url,
                    status_code: page.status_code,
                    chunks_indexed: indexed.chunks_indexed,
                    indexed: indexed.chunks_indexed > 0,
                    error: None,
                }
            }
            Ok(Err(error)) => {
                tracing::warn!(url = %page.normalized_url, ?error, "web_crawl index_page failed");
                WebCrawlPageOutcome {
                    url: page.normalized_url,
                    status_code: page.status_code,
                    chunks_indexed: 0,
                    indexed: false,
                    error: Some(error.to_string()),
                }
            }
            Err(join_err) => WebCrawlPageOutcome {
                url: page.normalized_url,
                status_code: page.status_code,
                chunks_indexed: 0,
                indexed: false,
                error: Some(format!("spawn_blocking: {join_err}")),
            },
        };
        outcomes.push(outcome);
    }

    json_result(&WebCrawlResponse {
        seed_url: url_str,
        pages_visited,
        pages_indexed,
        total_chunks,
        scope,
        pages: outcomes,
        error: crawl_outcome.error,
    })
}

/// Body of the `web_map` tool: list the URLs kreuzcrawl discovers for
/// `params.url`.
///
/// Each discovered entry is copied field-for-field (`url`, `lastmod`,
/// `changefreq`, `priority`) into a `WebMapEntry`. Nothing is fetched for
/// indexing and nothing is written to the store here.
///
/// # Errors
///
/// Returns an MCP internal error when the shared engine is missing or
/// `map_urls` fails.
pub(super) async fn run_web_map(
    state: &ServerState,
    params: WebMapParams,
) -> Result<CallToolResult, McpError> {
    let handle = engine(state)?;
    let seed = params.url.as_str().to_string();

    let discovered = match kreuzcrawl::map_urls(handle, &seed).await {
        Ok(found) => found,
        Err(e) => return Err(mcp_internal("kreuzcrawl map_urls", e)),
    };

    let entries = discovered
        .urls
        .into_iter()
        .map(|item| WebMapEntry {
            url: item.url,
            lastmod: item.lastmod,
            changefreq: item.changefreq,
            priority: item.priority,
        })
        .collect::<Vec<WebMapEntry>>();
    let total_urls = entries.len();

    json_result(&WebMapResponse {
        url: seed,
        total_urls,
        urls: entries,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Env override consulted by `Url::parse` and therefore by
    /// `reject_redirected_private_url`.
    const ALLOW_PRIVATE_HOSTS: &str = "BASEMIND_ALLOW_PRIVATE_HOSTS";

    /// Take the CRATE-WIDE env lock shared with the `url` and `web::ingest`
    /// test modules, so a setter in one module never observes a remover here.
    /// A poisoned lock is recovered rather than propagated: the guarded value
    /// is `()`, so there is no state to be left inconsistent.
    fn env_lock() -> std::sync::MutexGuard<'static, ()> {
        match crate::url::PRIVATE_HOSTS_ENV_LOCK.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        }
    }

    /// Acquire the env lock and clear the private-host override so the
    /// denylist is active. The returned guard must be held for the rest of the
    /// test.
    fn deny_private_hosts() -> std::sync::MutexGuard<'static, ()> {
        let guard = env_lock();
        // SAFETY: mutating the process environment is only sound when no other
        // thread accesses it concurrently. The crate-wide lock held above is
        // the mechanism the test modules use to serialize access to this
        // variable; this relies on every test that touches it taking that lock.
        unsafe { std::env::remove_var(ALLOW_PRIVATE_HOSTS) };
        guard
    }

    #[test]
    fn rejects_zero_max_pages_and_depth() {
        for field in ["max_pages", "max_depth"] {
            assert!(reject_zero_override(field, Some(0)).is_err());
        }
        // None (inherit server default) and >= 1 pass through.
        assert!(reject_zero_override("max_pages", None).is_ok());
        assert!(reject_zero_override("max_pages", Some(1)).is_ok());
        assert!(reject_zero_override("max_depth", Some(50)).is_ok());
    }

    #[test]
    fn zero_override_error_names_the_field_and_bound() {
        let err = reject_zero_override("max_pages", Some(0)).expect_err("0 must reject");
        let names_field = err.message.contains("max_pages");
        let names_bound = err.message.contains(">= 1");
        assert!(
            names_field && names_bound,
            "error should name the field and the >= 1 bound; got: {}",
            err.message
        );
    }

    #[test]
    fn rejects_private_redirect_target() {
        let _g = deny_private_hosts();
        // Simulates the URL kreuzcrawl landed on AFTER following a redirect from a
        // public seed to the AWS metadata endpoint — the canonical SSRF target.
        let landed = "http://169.254.169.254/latest/meta-data/";
        let err = reject_redirected_private_url("web_scrape", landed)
            .expect_err("link-local redirect target must be rejected");
        assert!(
            err.message.contains("169.254.169.254"),
            "rejection should name the private host; got: {}",
            err.message
        );
    }

    #[test]
    fn rejects_loopback_redirect_target() {
        let _g = deny_private_hosts();
        assert!(reject_redirected_private_url("web_crawl", "http://127.0.0.1:9000/").is_err());
        assert!(reject_redirected_private_url("web_crawl", "http://localhost/admin").is_err());
    }

    #[test]
    fn allows_public_redirect_target() {
        let _g = deny_private_hosts();
        assert!(reject_redirected_private_url("web_scrape", "https://example.com/landing").is_ok());
    }
}