basemind 0.0.1

Code-map MCP server + scanner — content-addressed, Fjall-backed inverted index over tree-sitter outlines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
use std::cell::RefCell;
use std::ops::ControlFlow;
use std::path::{Path, PathBuf};
use std::sync::{Arc, OnceLock, RwLock};
use std::time::{Duration, Instant};

use ahash::{AHashMap, AHashSet};
use thiserror::Error;
use tree_sitter::{Language, ParseOptions, Parser, Query, Tree};

/// Hard ceiling on a single tree-sitter parse. Defends against pathological inputs that
/// hang the recovery loop (e.g. multi-megabyte minified bundles with deep arrow chains).
///
/// Override per-process with `BASEMIND_PARSE_TIMEOUT_MS`. The default — 5 seconds — sits
/// well above any well-formed file's parse time on the supported languages (sub-second
/// for the TypeScript compiler's biggest files) but reliably aborts known hangers.
pub const DEFAULT_PARSE_TIMEOUT: Duration = Duration::from_millis(5_000);

fn parse_timeout_from_env() -> Duration {
    std::env::var("BASEMIND_PARSE_TIMEOUT_MS")
        .ok()
        .and_then(|s| s.parse::<u64>().ok())
        .map(Duration::from_millis)
        .unwrap_or(DEFAULT_PARSE_TIMEOUT)
}

#[derive(Debug, Error)]
pub enum LangError {
    #[error("language pack error: {0}")]
    Pack(String),
    #[error("grammar download failed: {0}")]
    Download(String),
    #[error("query compile error for {lang}/{kind}: {msg}")]
    QueryCompile {
        lang: &'static str,
        kind: &'static str,
        msg: String,
    },
    #[error("failed to set language {0} on parser")]
    ParserSetLanguage(String),
}

/// Stable language identifier used as the key everywhere (parser pool, query pool, FileMap.language).
///
/// `LangId` is the tree-sitter-language-pack identifier (e.g. `"rust"`, `"cpp"`, `"ruby"`),
/// always sourced from TSLP's static registry. Any string handed to `with_parser` / `get_query`
/// must come from [`detect`] or [`intern`] so the lifetime guarantee holds and TSLP can resolve it.
pub type LangId = &'static str;

/// Languages we ship hand-written `.scm` query overrides for. Anything outside this set falls
/// back to TSLP's vendored `tags.scm` (when wired) and produces best-effort extraction.
///
/// Order is the bootstrap download order — keep `rust` first so the most common cold-start case
/// stays fast.
pub const OVERRIDE_LANGUAGES: &[LangId] =
    &["rust", "python", "typescript", "tsx", "javascript", "go"];

/// Back-compat alias used by `basemind lang install` and tests that pre-warm the cache.
pub const SUPPORTED_LANGUAGES: &[LangId] = OVERRIDE_LANGUAGES;

/// Static map of override `(LangId, .scm source)` pairs. Tail of the lookup chain in
/// [`get_query`]. Adding a language here means dropping a new file in `src/queries/<lang>.scm`
/// using the same `;; section: <name>` convention.
fn override_query_source(lang: LangId) -> Option<&'static str> {
    Some(match lang {
        "rust" => include_str!("queries/rust.scm"),
        "python" => include_str!("queries/python.scm"),
        "typescript" => include_str!("queries/typescript.scm"),
        "tsx" => include_str!("queries/tsx.scm"),
        "javascript" => include_str!("queries/javascript.scm"),
        "go" => include_str!("queries/go.scm"),
        _ => return None,
    })
}

/// Whether basemind ships a hand-written override `.scm` file for this language.
pub fn has_override(lang: LangId) -> bool {
    override_query_source(lang).is_some()
}

/// Intern a (possibly non-static) language name into the static `LangId` form.
///
/// Used by code paths that load a language tag out of persisted state (`FileEntry.language`,
/// `FileMapL1.language`) and need to feed it back into the parser / query pool. Returns
/// `Some` only when the name resolves through TSLP — unknown strings stay `None` so callers
/// can fail loud instead of leaking arbitrary input.
///
/// Interning is monotonic: each new name is leaked once via `Box::leak` and cached. Cap is
/// bounded by the size of TSLP's registry (~306 grammars × ~10 bytes), well under the cost
/// of a single open file.
pub fn intern(name: &str) -> Option<LangId> {
    // Hot path: known override names are static literals — return them without touching the
    // interner lock. Cheap branch that absorbs 99% of indexed-file lookups.
    for &lid in OVERRIDE_LANGUAGES {
        if lid == name {
            return Some(lid);
        }
    }
    // Already interned? Fast read path.
    let lock = INTERNED.get_or_init(|| RwLock::new(AHashSet::new()));
    if let Some(&existing) = lock
        .read()
        .expect("intern pool poisoned")
        .iter()
        .find(|s| **s == name)
    {
        return Some(existing);
    }
    // Cold path: validate against TSLP's registry before leaking the bytes. Unknown names
    // should not pin memory.
    if !tree_sitter_language_pack::has_language(name) {
        return None;
    }
    let leaked: &'static str = Box::leak(name.to_string().into_boxed_str());
    lock.write().expect("intern pool poisoned").insert(leaked);
    Some(leaked)
}

static INTERNED: OnceLock<RwLock<AHashSet<&'static str>>> = OnceLock::new();

/// Result of the one-shot grammar bootstrap.
#[derive(Debug, Clone)]
pub struct BootstrapSummary {
    /// Languages that were already on disk before this run.
    pub already_cached: Vec<String>,
    /// Languages we just downloaded.
    pub downloaded: Vec<String>,
    /// tslp cache directory (where grammar `.so/.dylib`s live).
    pub cache_dir: Option<PathBuf>,
}

impl BootstrapSummary {
    pub fn did_download(&self) -> bool {
        !self.downloaded.is_empty()
    }
}

/// OnceLock holding the bootstrap outcome. `Arc` so callers can inspect without re-running.
static GRAMMAR_BOOTSTRAP: OnceLock<Result<Arc<BootstrapSummary>, Arc<LangError>>> = OnceLock::new();

/// Parse the tslp version out of its `cache_dir()` (`.../v<version>/libs`).
/// Returns `None` if the path is shaped unexpectedly — caller falls back gracefully.
fn tslp_version_from_cache_dir(p: &Path) -> Option<String> {
    let parent = p.parent()?;
    let leaf = parent.file_name()?.to_str()?;
    leaf.strip_prefix('v').map(str::to_string)
}

/// Ensure all `OVERRIDE_LANGUAGES` grammars are present in the tslp cache, downloading any
/// missing ones. Idempotent across the process — runs at most once.
///
/// Only the override-supported set is pre-warmed; dynamic-path languages are pulled on first
/// use of a file with that extension. Keeps cold-start small while still guaranteeing the
/// common cases parse instantly.
///
/// Uses `DownloadManager::ensure_languages` directly rather than the top-level
/// `tree_sitter_language_pack::download()` because the latter has a bug in 1.9.0-rc.22 where
/// in-memory REGISTRY membership short-circuits the actual download (returns Ok with no
/// disk side-effect).
pub fn ensure_grammars() -> Result<Arc<BootstrapSummary>, Arc<LangError>> {
    GRAMMAR_BOOTSTRAP
        .get_or_init(|| {
            let cache_dir_str = tree_sitter_language_pack::cache_dir()
                .map_err(|e| Arc::new(LangError::Pack(format!("resolve cache dir: {e}"))))?;
            let cache_dir = PathBuf::from(&cache_dir_str);
            let version = tslp_version_from_cache_dir(&cache_dir).ok_or_else(|| {
                Arc::new(LangError::Pack(format!(
                    "could not parse tslp version out of {cache_dir_str:?}"
                )))
            })?;

            let dm = tree_sitter_language_pack::DownloadManager::with_cache_dir(
                &version,
                cache_dir.clone(),
            );

            let installed: Vec<String> = dm.installed_languages();
            let mut already_cached: Vec<String> = Vec::new();
            let mut missing: Vec<&'static str> = Vec::new();
            for &name in OVERRIDE_LANGUAGES {
                if installed.iter().any(|n| n == name) {
                    already_cached.push(name.to_string());
                } else {
                    missing.push(name);
                }
            }
            if !missing.is_empty() {
                // Offline mode: don't reach the network. If grammars are missing, surface a
                // clean typed error so MCP clients / CLI users see a useful message instead of
                // silent empty parses. Set `BASEMIND_GRAMMAR_OFFLINE=1` to opt in (e.g. CI
                // environments where the cache is pre-warmed and outbound traffic is blocked).
                if std::env::var("BASEMIND_GRAMMAR_OFFLINE")
                    .is_ok_and(|v| v != "0" && !v.is_empty())
                {
                    return Err(Arc::new(LangError::Download(format!(
                        "offline mode: missing grammars {missing:?} and \
                         BASEMIND_GRAMMAR_OFFLINE is set",
                    ))));
                }
                dm.ensure_languages(&missing)
                    .map_err(|e| Arc::new(LangError::Download(format!("{e}"))))?;
            }
            Ok(Arc::new(BootstrapSummary {
                already_cached,
                downloaded: missing.into_iter().map(str::to_string).collect(),
                cache_dir: Some(cache_dir),
            }))
        })
        .clone()
}

/// Languages currently downloaded in the tslp cache (does not hit the network).
pub fn downloaded_languages() -> Vec<String> {
    // tslp's `downloaded_languages()` reads via a DownloadManager keyed by its own
    // CARGO_PKG_VERSION, which matches the cache layout — same source-of-truth either way.
    tree_sitter_language_pack::downloaded_languages()
}

/// Path to the tslp cache directory, if it can be resolved.
pub fn grammar_cache_dir() -> Option<PathBuf> {
    tree_sitter_language_pack::cache_dir()
        .ok()
        .map(PathBuf::from)
}

/// Clear the tslp grammar cache. Forces re-download on next use.
pub fn clean_grammar_cache() -> Result<(), LangError> {
    tree_sitter_language_pack::clean_cache().map_err(|e| LangError::Pack(format!("{e}")))
}

/// Detect the language for a path. Returns the TSLP pack name (a `'static` slice) for any
/// extension TSLP can resolve — across all 306 bundled grammars. Returns `None` for unknown
/// extensions; the scanner skips those files entirely.
pub fn detect(path: &Path) -> Option<LangId> {
    tree_sitter_language_pack::detect_language(path.to_str()?)
}

/// Fetch the underlying tree-sitter Language for a given `LangId`.
pub fn language(lang: LangId) -> Result<Language, LangError> {
    tree_sitter_language_pack::get_language(lang).map_err(|e| LangError::Pack(format!("{e}")))
}

// ─── Parser pool ──────────────────────────────────────────────────────────────
//
// Parser is !Sync and stateful — one per thread per language, kept hot in TLS.

thread_local! {
    static PARSERS: RefCell<AHashMap<LangId, Parser>> = RefCell::new(AHashMap::new());
}

/// Run a closure with a per-thread Parser for the given language.
/// The parser is reused across calls on the same thread.
pub fn with_parser<F, R>(lang: LangId, f: F) -> Result<R, LangError>
where
    F: FnOnce(&mut Parser) -> R,
{
    PARSERS.with(|cell| {
        let mut map = cell.borrow_mut();
        if !map.contains_key(&lang) {
            let mut p = Parser::new();
            let ts_lang = language(lang)?;
            p.set_language(&ts_lang)
                .map_err(|_| LangError::ParserSetLanguage(lang.to_string()))?;
            map.insert(lang, p);
        }
        Ok(f(map.get_mut(&lang).expect("just inserted")))
    })
}

/// Outcome of a single bounded parse.
#[derive(Debug)]
pub enum ParseOutcome {
    Ok(Tree),
    /// Parser returned `None` for a reason other than timeout (rare — typically a malformed
    /// input the grammar can't even start on).
    Failed,
    /// Progress callback aborted because `timeout` elapsed.
    TimedOut,
}

/// Run `parser.parse_with_options` with a progress callback that aborts after `timeout`.
///
/// tree-sitter 0.26 removed the C-side `ts_parser_set_timeout_micros` shortcut in favor of
/// progress-callback-driven cancellation — this helper reinstates the ergonomics. Uses a
/// monotonic clock so it's robust against wall-clock jumps.
pub fn parse_timed(parser: &mut Parser, source: &[u8], timeout: Duration) -> ParseOutcome {
    let started = Instant::now();
    let mut timed_out = false;
    let len = source.len();
    let mut input = |i: usize, _| -> &[u8] { if i < len { &source[i..] } else { &[] } };
    let mut progress = |_state: &tree_sitter::ParseState| -> ControlFlow<()> {
        if started.elapsed() > timeout {
            timed_out = true;
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(())
        }
    };
    let opts = ParseOptions::new().progress_callback(&mut progress);
    let tree = parser.parse_with_options(&mut input, None, Some(opts));
    match (tree, timed_out) {
        (Some(t), _) => ParseOutcome::Ok(t),
        (None, true) => ParseOutcome::TimedOut,
        (None, false) => ParseOutcome::Failed,
    }
}

/// Convenience: `parse_timed` with the env-configurable default timeout.
pub fn parse_with_default_timeout(parser: &mut Parser, source: &[u8]) -> ParseOutcome {
    parse_timed(parser, source, parse_timeout_from_env())
}

// ─── Query pool ───────────────────────────────────────────────────────────────
//
// Query is Send + Sync and not Clone; one Arc<Query> per (lang, kind) globally.

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// Captures: @symbol.name, @symbol.kind, @symbol.range, @symbol.signature
    Symbols,
    /// Captures: @import.module, @import.alias, @import.range
    Imports,
    /// Captures: @call.callee, @call.range  (L2)
    Calls,
    /// Captures: @doc.text, @doc.target  (L2)
    Docs,
}

impl QueryKind {
    pub fn name(self) -> &'static str {
        match self {
            QueryKind::Symbols => "symbols",
            QueryKind::Imports => "imports",
            QueryKind::Calls => "calls",
            QueryKind::Docs => "docs",
        }
    }
}

/// Two-state query cache value: `Some` when a query was found and compiled; `None` when the
/// language has no override section + no TSLP fallback for this kind. The `None` is cached
/// to avoid re-doing the negative lookup for every file in that language.
type CachedQuery = Option<Arc<Query>>;
type QueryMap = AHashMap<(LangId, QueryKind), CachedQuery>;
static QUERIES: OnceLock<RwLock<QueryMap>> = OnceLock::new();

/// Extract a single named query (S-expression `;; @section name`) from the .scm source.
///
/// Convention: each .scm file is divided into sections marked by `;; section: <name>` lines.
/// Sections we look for: `symbols`, `imports`, `calls`, `docs`.
fn extract_section(source: &str, name: &str) -> Option<String> {
    let marker_open = format!(";; section: {name}");
    let mut out = String::new();
    let mut in_section = false;
    for line in source.lines() {
        let trimmed = line.trim_start();
        if trimmed.starts_with(";; section:") {
            in_section = trimmed.starts_with(&marker_open);
            continue;
        }
        if in_section {
            out.push_str(line);
            out.push('\n');
        }
    }
    if out.trim().is_empty() {
        None
    } else {
        Some(out)
    }
}

/// Adapt an upstream TSLP `tags.scm` source into basemind's override-shaped section convention.
///
/// TSLP's `tags.scm` uses the GitHub-standard capture names `@definition.<kind>` / `@reference.call`
/// with the identifier name captured as `@name`. Basemind's extractors look for
/// `@symbol.<kind>` / `@symbol.name` (l1) and `@call.range` / `@call.callee` (l2). This walks
/// top-level S-expression patterns, classifies each by its root capture, and emits the
/// rewritten pattern into either the `;; section: symbols` or `;; section: calls` block.
///
/// Patterns whose root capture is neither `@definition.*` nor `@reference.call` (e.g.
/// `@reference.class`, `@reference.interface`, `@reference.send`, `@reference.type`,
/// `@reference.implementation`) are dropped — basemind has no consumer for them today.
fn adapt_tslp_tags(source: &str) -> String {
    let mut sym_buf = String::new();
    let mut call_buf = String::new();
    for pattern in split_top_level_patterns(source) {
        let kind = classify_pattern(pattern);
        match kind {
            PatternKind::Definition => sym_buf.push_str(&rewrite_pattern(pattern, kind)),
            PatternKind::ReferenceCall => call_buf.push_str(&rewrite_pattern(pattern, kind)),
            PatternKind::Other => {}
        }
    }
    let mut out = String::with_capacity(sym_buf.len() + call_buf.len() + 64);
    out.push_str(";; section: symbols\n");
    out.push_str(&sym_buf);
    out.push_str("\n;; section: calls\n");
    out.push_str(&call_buf);
    out
}

#[derive(Clone, Copy)]
enum PatternKind {
    Definition,
    ReferenceCall,
    Other,
}

/// Yield each top-level S-expression pattern from the source as a `&str` slice.
///
/// Walks paren depth char-by-char, skipping `;`-to-EOL comments and `"..."` string literals
/// where parens carry no structural meaning. A "pattern" is the substring from a depth-0
/// `(` (along with any trailing `@root.capture` annotation) to the next paren that lands
/// back at depth 0. Free-standing comments and whitespace between patterns are skipped.
fn split_top_level_patterns(source: &str) -> Vec<&str> {
    let bytes = source.as_bytes();
    let mut patterns: Vec<&str> = Vec::new();
    let mut i = 0;
    let mut start: Option<usize> = None;
    let mut depth: i32 = 0;
    while i < bytes.len() {
        let b = bytes[i];
        match b {
            b';' => {
                // Comment to end of line, regardless of depth.
                while i < bytes.len() && bytes[i] != b'\n' {
                    i += 1;
                }
            }
            b'"' => {
                // Skip string literal — escapes preserved minimally.
                i += 1;
                while i < bytes.len() {
                    if bytes[i] == b'\\' && i + 1 < bytes.len() {
                        i += 2;
                        continue;
                    }
                    if bytes[i] == b'"' {
                        i += 1;
                        break;
                    }
                    i += 1;
                }
            }
            b'(' => {
                if depth == 0 && start.is_none() {
                    start = Some(i);
                }
                depth += 1;
                i += 1;
            }
            b')' => {
                depth -= 1;
                i += 1;
                if depth == 0 {
                    // Consume any trailing `@root.capture` annotation that belongs to the
                    // just-closed pattern, including whitespace/newlines between `)` and `@`.
                    let mut j = i;
                    while j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
                        j += 1;
                    }
                    if j < bytes.len() && bytes[j] == b'@' {
                        // Skip `@capture.name` token.
                        j += 1;
                        while j < bytes.len() && is_capture_ident_byte(bytes[j]) {
                            j += 1;
                        }
                        i = j;
                    }
                    if let Some(s) = start {
                        patterns.push(&source[s..i]);
                    }
                    start = None;
                }
            }
            _ => i += 1,
        }
    }
    patterns
}

fn is_capture_ident_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'?' || b == b'!'
}

/// Find the root capture of a pattern: the rightmost `@<root>.<sub>` at the outermost
/// closing paren (or top level). Returns `Definition` if root is `definition.*`,
/// `ReferenceCall` if root is `reference.call`, `Other` otherwise.
fn classify_pattern(pattern: &str) -> PatternKind {
    // The root capture is the LAST `@...` token in the pattern. Scan from the end.
    let bytes = pattern.as_bytes();
    let mut i = bytes.len();
    while i > 0 {
        i -= 1;
        if bytes[i] == b'@' {
            // Found a `@` — read forward to extract the capture name.
            let cap_start = i + 1;
            let mut j = cap_start;
            while j < bytes.len() && is_capture_ident_byte(bytes[j]) {
                j += 1;
            }
            let cap = &pattern[cap_start..j];
            return classify_capture(cap);
        }
    }
    PatternKind::Other
}

fn classify_capture(cap: &str) -> PatternKind {
    if let Some(suffix) = cap.strip_prefix("definition.") {
        let _ = suffix;
        PatternKind::Definition
    } else if cap == "reference.call" {
        PatternKind::ReferenceCall
    } else {
        PatternKind::Other
    }
}

/// Rewrite a pattern's capture names from TSLP convention to basemind convention. The trailing
/// `\n` is included so consecutive patterns stay separated in the emitted section.
fn rewrite_pattern(pattern: &str, kind: PatternKind) -> String {
    let mut out = String::with_capacity(pattern.len() + 16);
    let bytes = pattern.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] != b'@' {
            out.push(bytes[i] as char);
            i += 1;
            continue;
        }
        // Found `@`. Read the capture name.
        let cap_start = i + 1;
        let mut j = cap_start;
        while j < bytes.len() && is_capture_ident_byte(bytes[j]) {
            j += 1;
        }
        let cap = &pattern[cap_start..j];
        let rewritten = rewrite_capture(cap, kind);
        out.push('@');
        out.push_str(&rewritten);
        i = j;
    }
    out.push('\n');
    out
}

fn rewrite_capture(cap: &str, kind: PatternKind) -> String {
    // `@name` is the identifier sub-capture; remap by section.
    if cap == "name" {
        return match kind {
            PatternKind::Definition => "symbol.name".to_string(),
            PatternKind::ReferenceCall => "call.callee".to_string(),
            PatternKind::Other => "name".to_string(),
        };
    }
    if let Some(suffix) = cap.strip_prefix("definition.") {
        return format!("symbol.{suffix}");
    }
    if cap == "reference.call" {
        return "call.range".to_string();
    }
    // Predicates like `@cap (#match? ...)` keep their original name in the pattern they
    // came from — leave untouched.
    cap.to_string()
}

/// Per-language cache of the adapted `tags.scm` string. Populated on first lookup;
/// stays for process lifetime. Bound is the size of TSLP's registry — small.
type AdaptedTagsMap = AHashMap<LangId, Arc<str>>;
static ADAPTED_TAGS: OnceLock<RwLock<AdaptedTagsMap>> = OnceLock::new();

fn tslp_tags_adapted(lang: LangId) -> Option<Arc<str>> {
    let lock = ADAPTED_TAGS.get_or_init(|| RwLock::new(AHashMap::new()));
    if let Some(cached) = lock.read().expect("adapted tags pool poisoned").get(&lang) {
        return Some(Arc::clone(cached));
    }
    let raw = tree_sitter_language_pack::get_tags_query(lang)?;
    let adapted: Arc<str> = Arc::from(adapt_tslp_tags(raw));
    lock.write()
        .expect("adapted tags pool poisoned")
        .insert(lang, Arc::clone(&adapted));
    Some(adapted)
}

/// Look up a `(lang, kind)` query, returning `Ok(Some(arc))` when one exists,
/// `Ok(None)` when neither the override file nor the TSLP fallback provides this section,
/// and `Err` only on a compile error in source we do have.
///
/// Lookup chain:
/// 1. Local override — `src/queries/<lang>.scm` `;; section: <kind>`.
/// 2. TSLP `tags.scm` via [`adapt_tslp_tags`] — covers the 14 languages that ship a vendored
///    `tags.scm` upstream (kotlin, csharp, swift, gleam, gap, al, enforce, gdshader, roc,
///    cfml, ql, tact, sourcepawn, mojo).
/// 3. None — file is still detected and indexed, but symbol/import/call extraction yields
///    empty vectors for this language.
pub fn try_get_query(lang: LangId, kind: QueryKind) -> Result<CachedQuery, LangError> {
    let lock = QUERIES.get_or_init(|| RwLock::new(AHashMap::new()));
    if let Some(slot) = lock.read().expect("query pool poisoned").get(&(lang, kind)) {
        return Ok(slot.as_ref().map(Arc::clone));
    }

    let source: Option<String> = if let Some(raw) = override_query_source(lang) {
        extract_section(raw, kind.name())
    } else if matches!(kind, QueryKind::Symbols | QueryKind::Calls) {
        tslp_tags_adapted(lang).and_then(|adapted| extract_section(&adapted, kind.name()))
    } else {
        None
    };

    let cached = match source {
        Some(src) => {
            let ts_lang = language(lang)?;
            let query = Query::new(&ts_lang, &src).map_err(|e| LangError::QueryCompile {
                lang,
                kind: kind.name(),
                msg: format!("{e}"),
            })?;
            Some(Arc::new(query))
        }
        None => None,
    };

    lock.write()
        .expect("query pool poisoned")
        .insert((lang, kind), cached.as_ref().map(Arc::clone));
    Ok(cached)
}

/// Strict variant of [`try_get_query`] for callers that treat missing sections as errors.
/// Prefer `try_get_query` in new code so unsupported languages degrade gracefully.
pub fn get_query(lang: LangId, kind: QueryKind) -> Result<Arc<Query>, LangError> {
    try_get_query(lang, kind)?.ok_or_else(|| LangError::QueryCompile {
        lang,
        kind: kind.name(),
        msg: format!("no override or TSLP fallback for {}/{}", lang, kind.name()),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_known_extensions() {
        assert_eq!(detect(Path::new("foo.rs")), Some("rust"));
        assert_eq!(detect(Path::new("foo.py")), Some("python"));
        assert_eq!(detect(Path::new("foo.go")), Some("go"));
    }

    #[test]
    fn detect_dynamic_extension_resolves() {
        // Any TSLP-registered grammar resolves through detect(); cpp is outside the override
        // set but ships in the language pack, so dynamic dispatch must produce its pack name.
        assert_eq!(detect(Path::new("foo.cpp")), Some("cpp"));
    }

    #[test]
    fn extract_section_basic() {
        let src = ";; section: a\n(foo)\n;; section: b\n(bar)\n";
        assert_eq!(extract_section(src, "a").unwrap().trim(), "(foo)");
        assert_eq!(extract_section(src, "b").unwrap().trim(), "(bar)");
        assert_eq!(extract_section(src, "c"), None);
    }

    #[test]
    fn has_override_for_each_supported() {
        for &name in OVERRIDE_LANGUAGES {
            assert!(has_override(name), "missing override source for {name}");
        }
    }

    #[test]
    fn intern_known_overrides_returns_static() {
        let owned = "rust".to_string();
        let id = intern(&owned).expect("rust must intern");
        assert!(std::ptr::eq(id, "rust"));
    }

    #[test]
    fn intern_unknown_returns_none() {
        assert!(intern("this-is-not-a-real-grammar-name").is_none());
    }

    #[test]
    fn try_get_query_returns_none_for_unsupported_lang() {
        // `json` ships in TSLP but has no override AND no upstream `tags.scm`, so both
        // lookup branches miss and the cache stores `None`. (Previously `cpp` — which now
        // resolves through the TSLP tags fallback; data-only formats like JSON / YAML /
        // TOML reliably ship no tags query.)
        let res = try_get_query("json", QueryKind::Symbols).expect("query lookup must not error");
        assert!(res.is_none());
    }

    #[test]
    fn adapt_tslp_tags_emits_two_sections() {
        let src = "(function_item name: (identifier) @name) @definition.function\n\
                   (call_expression function: (identifier) @name) @reference.call\n";
        let out = adapt_tslp_tags(src);
        assert!(out.contains(";; section: symbols"));
        assert!(out.contains(";; section: calls"));
        assert!(out.contains("@symbol.function"));
        assert!(out.contains("@symbol.name"));
        assert!(out.contains("@call.range"));
        assert!(out.contains("@call.callee"));
    }

    #[test]
    fn adapt_tslp_tags_drops_reference_class() {
        // `@reference.class` (kotlin constructor invocation, rust impl_item) has no consumer
        // in basemind today — must be excluded from both sections.
        let src = "(impl_item trait: (type_identifier) @name) @reference.implementation\n\
                   (call_expression function: (identifier) @name) @reference.call\n";
        let out = adapt_tslp_tags(src);
        // `reference.implementation` lacks a section, so its `@name` does NOT become
        // `@call.callee` — the whole pattern is dropped.
        assert!(!out.contains("@reference"));
        assert!(out.contains("@call.range"));
        assert!(out.contains("@call.callee"));
    }

    #[test]
    fn adapt_tslp_tags_handles_multiline_patterns() {
        let src = "(struct_item\n    name: (type_identifier) @name) @definition.class\n";
        let out = adapt_tslp_tags(src);
        assert!(out.contains("@symbol.class"));
        assert!(out.contains("@symbol.name"));
    }

    #[test]
    fn adapt_tslp_tags_real_rust_compiles() {
        // The rust tags.scm from TSLP must produce a query string that tree-sitter compiles
        // against the rust grammar — guards against ever drifting capture rewrites.
        let raw = tree_sitter_language_pack::get_tags_query("rust").expect("rust ships tags.scm");
        let adapted = adapt_tslp_tags(raw);
        let sym = extract_section(&adapted, "symbols").expect("symbols section");
        let calls = extract_section(&adapted, "calls").expect("calls section");
        let ts_lang = language("rust").expect("rust language resolves");
        Query::new(&ts_lang, &sym).expect("adapted symbols query compiles");
        Query::new(&ts_lang, &calls).expect("adapted calls query compiles");
    }
}