Skip to main content

codelens_engine/symbols/
mod.rs

1mod parser;
2mod ranking;
3mod reader;
4pub mod scoring;
5#[cfg(test)]
6mod tests;
7mod types;
8mod writer;
9
10use parser::{flatten_symbol_infos, flatten_symbols, parse_symbols, slice_source, to_symbol_info};
11use ranking::prune_to_budget;
12use scoring::score_symbol;
13pub use scoring::{
14    sparse_coverage_bonus_from_fields, sparse_max_bonus, sparse_threshold, sparse_weighting_enabled,
15};
16pub(crate) use types::ReadDb;
17pub use types::{
18    IndexStats, RankedContextEntry, RankedContextResult, SymbolInfo, SymbolKind, SymbolProvenance,
19    make_symbol_id, parse_symbol_id,
20};
21
22use crate::db::{self, IndexDb, content_hash, index_db_path};
23// Re-export language_for_path so downstream crate modules keep working.
24pub(crate) use crate::lang_config::{LanguageConfig, language_for_path};
25use crate::project::ProjectRoot;
26use anyhow::{Context, Result, bail};
27use std::fs;
28use std::path::{Path, PathBuf};
29use std::time::UNIX_EPOCH;
30use walkdir::WalkDir;
31
32use crate::project::{collect_files, is_excluded};
33
34// Types (SymbolKind, SymbolInfo, ParsedSymbol, IndexStats, RankedContextEntry,
35// RankedContextResult, ReadDb) are in types.rs, re-exported above.
36
37/// SQLite-backed symbol index for a project.
38///
39/// Architecture: writer `Mutex<IndexDb>` for mutations + per-query read-only
40/// connections for `_cached` methods. This makes `SymbolIndex: Send + Sync`,
41/// enabling `Arc<SymbolIndex>` without an external Mutex.
42pub struct SymbolIndex {
43    project: ProjectRoot,
44    db_path: PathBuf,
45    writer: std::sync::Mutex<IndexDb>,
46    /// In-memory mode flag (tests) — when true, _cached reads use the writer.
47    in_memory: bool,
48}
49
50impl SymbolIndex {
51    pub fn new(project: ProjectRoot) -> Self {
52        let db_path = index_db_path(project.as_path());
53        let db = IndexDb::open(&db_path).unwrap_or_else(|e| {
54            tracing::warn!(
55                path = %db_path.display(),
56                error = %e,
57                "failed to open DB, falling back to in-memory"
58            );
59            IndexDb::open_memory().unwrap()
60        });
61        let in_memory = !db_path.is_file();
62        let mut idx = Self {
63            project,
64            db_path,
65            writer: std::sync::Mutex::new(db),
66            in_memory,
67        };
68        // Auto-migrate from legacy JSON index if DB is empty
69        if idx.writer().file_count().unwrap_or(0) == 0 {
70            let _ = idx.migrate_from_json();
71        }
72        idx
73    }
74
75    /// Acquire the writer connection (poison-safe).
76    fn writer(&self) -> std::sync::MutexGuard<'_, IndexDb> {
77        self.writer
78            .lock()
79            .unwrap_or_else(|poisoned| poisoned.into_inner())
80    }
81
82    /// Open a read-only DB connection for queries (or fall back to writer for in-memory).
83    fn reader(&self) -> Result<ReadDb<'_>> {
84        if self.in_memory {
85            return Ok(ReadDb::Writer(self.writer()));
86        }
87        match IndexDb::open_readonly(&self.db_path)? {
88            Some(db) => Ok(ReadDb::Owned(db)),
89            None => Ok(ReadDb::Writer(self.writer())),
90        }
91    }
92
93    /// Create an in-memory index (for tests and benchmarks — no disk persistence).
94    pub fn new_memory(project: ProjectRoot) -> Self {
95        let db = IndexDb::open_memory().unwrap();
96        Self {
97            db_path: PathBuf::new(),
98            project,
99            writer: std::sync::Mutex::new(db),
100            in_memory: true,
101        }
102    }
103
104    pub fn stats(&self) -> Result<IndexStats> {
105        let db = self.reader()?;
106        let supported_files = collect_candidate_files(self.project.as_path())?;
107        let indexed_files = db.file_count()?;
108        let indexed_paths = db.all_file_paths()?;
109
110        let mut stale = 0usize;
111        for rel in &indexed_paths {
112            let path = self.project.as_path().join(rel);
113            if !path.is_file() {
114                stale += 1;
115                continue;
116            }
117            let content = match fs::read(&path) {
118                Ok(c) => c,
119                Err(_) => {
120                    stale += 1;
121                    continue;
122                }
123            };
124            let hash = content_hash(&content);
125            let mtime = file_modified_ms(&path).unwrap_or(0) as i64;
126            if db.get_fresh_file(rel, mtime, &hash)?.is_none() {
127                stale += 1;
128            }
129        }
130
131        Ok(IndexStats {
132            indexed_files,
133            supported_files: supported_files.len(),
134            stale_files: stale,
135        })
136    }
137
138    /// SelectSolve file pre-filtering: score files by name relevance to query,
139    /// then extract symbols only from top-scoring files.
140    /// Path-first retrieval with FTS5 boost: file paths scored by query token
141    /// matching, then boosted by FTS5 symbol hits in the same file.
142    fn select_solve_symbols(&self, query: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
143        // Collect file paths and compute top matches inside a block so the
144        // MutexGuard (ReadDb::Writer) is dropped before we call find_symbol /
145        // get_symbols_overview_cached, which also need the lock.  Holding the
146        // guard across those calls causes a deadlock with in-memory DBs.
147        //
148        // FTS5 boost: search each query token as a symbol name via FTS5,
149        // collect which files contain matching symbols, and boost those files.
150        // Token-level search is critical for NL queries like "how does dispatch
151        // work" — the full query won't match any symbol, but "dispatch" will
152        // find dispatch_tool in dispatch/mod.rs.
153        let fts_file_boost: std::collections::HashSet<String> = {
154            let query_lower = query.to_ascii_lowercase();
155            let tokens: Vec<&str> = query_lower
156                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
157                .filter(|t| t.len() >= 3)
158                .collect();
159            let mut boost_files = std::collections::HashSet::new();
160            // First try full query (catches exact symbol names like "dispatch_tool")
161            if let Ok(hits) = self.find_symbol(query, None, false, false, 15) {
162                for sym in hits {
163                    boost_files.insert(sym.file_path);
164                }
165            }
166            // Then try individual tokens (catches NL queries)
167            for token in &tokens {
168                if let Ok(hits) = self.find_symbol(token, None, false, false, 10) {
169                    for sym in hits {
170                        boost_files.insert(sym.file_path);
171                    }
172                }
173            }
174            boost_files
175        };
176
177        let (top_files, importer_files): (Vec<String>, Vec<String>) = {
178            let db = self.reader()?;
179            let all_paths = db.all_file_paths()?;
180
181            let query_lower = query.to_ascii_lowercase();
182            let query_tokens: Vec<&str> = query_lower
183                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
184                .filter(|t| t.len() >= 3)
185                .collect();
186
187            let mut file_scores: Vec<(String, usize)> = all_paths
188                .into_iter()
189                .map(|path| {
190                    let path_lower = path.to_ascii_lowercase();
191                    let mut score = query_tokens
192                        .iter()
193                        .filter(|token| path_lower.contains(**token))
194                        .count();
195                    // FTS5 boost: files containing matching symbols get +2
196                    if fts_file_boost.contains(&path) {
197                        score += 2;
198                    }
199                    (path, score)
200                })
201                .collect();
202
203            file_scores.sort_by_key(|b| std::cmp::Reverse(b.1));
204            let top: Vec<String> = file_scores
205                .into_iter()
206                .filter(|(_, score)| *score > 0)
207                .take(10)
208                .map(|(path, _)| path)
209                .collect();
210
211            // Import graph proximity: files that import top-matched files
212            // provide structural context (callers, consumers of matched code).
213            let mut importers = Vec::new();
214            if !top.is_empty() && top.len() <= 5 {
215                for file_path in top.iter().take(3) {
216                    if let Ok(imp) = db.get_importers(file_path) {
217                        for importer_path in imp.into_iter().take(3) {
218                            importers.push(importer_path);
219                        }
220                    }
221                }
222            }
223
224            (top, importers)
225            // db (MutexGuard) dropped here
226        };
227
228        // If no file matches (path + FTS5 both empty), fall back to broad symbol search
229        if top_files.is_empty() {
230            return self.find_symbol(query, None, false, false, 500);
231        }
232
233        // Collect symbols from top files
234        let mut all_symbols = Vec::new();
235        for file_path in &top_files {
236            if let Ok(symbols) = self.get_symbols_overview_cached(file_path, depth) {
237                all_symbols.extend(symbols);
238            }
239        }
240
241        // Import graph proximity: include symbols from files that import top matches.
242        // These provide structural context (callers, consumers of matched code).
243        for importer_path in &importer_files {
244            if let Ok(symbols) = self.get_symbols_overview_cached(importer_path, 1) {
245                all_symbols.extend(symbols);
246            }
247        }
248
249        // Also include direct symbol name matches (for exact/substring hits)
250        let mut seen_ids: std::collections::HashSet<String> =
251            all_symbols.iter().map(|s| s.id.clone()).collect();
252
253        if let Ok(direct) = self.find_symbol(query, None, false, false, 50) {
254            for sym in direct {
255                if seen_ids.insert(sym.id.clone()) {
256                    all_symbols.push(sym);
257                }
258            }
259        }
260
261        // For multi-word queries, also search individual tokens as symbol names
262        // (e.g., "dispatch tool call" → search for "dispatch", "tool", "call")
263        let query_lower = query.to_ascii_lowercase();
264        let tokens: Vec<&str> = query_lower
265            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
266            .filter(|t| t.len() >= 3)
267            .collect();
268        if tokens.len() >= 2 {
269            for token in &tokens {
270                match self.find_symbol(token, None, false, false, 10) {
271                    Ok(hits) => {
272                        for sym in hits {
273                            if seen_ids.insert(sym.id.clone()) {
274                                all_symbols.push(sym);
275                            }
276                        }
277                    }
278                    Err(e) => {
279                        tracing::debug!(token, error = %e, "token find_symbol failed");
280                    }
281                }
282            }
283        }
284
285        Ok(all_symbols)
286    }
287
288    /// Hierarchical project structure: per-directory file count + symbol count.
289    /// Used as Level 1 pruning — lets LLM decide which directories to drill into.
290    pub fn get_project_structure(&self) -> Result<Vec<db::DirStats>> {
291        let db = self.reader()?;
292        db.dir_stats()
293    }
294
295    pub fn indexed_file_paths(&self) -> Result<Vec<String>> {
296        let db = self.reader()?;
297        db.all_file_paths()
298    }
299
300    pub fn get_symbols_overview(&self, path: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
301        let resolved = self.project.resolve(path)?;
302        if resolved.is_dir() {
303            let mut symbols = Vec::new();
304            for file in WalkDir::new(&resolved)
305                .into_iter()
306                .filter_entry(|entry| !is_excluded(entry.path()))
307            {
308                let file = file?;
309                if !file.file_type().is_file() || language_for_path(file.path()).is_none() {
310                    continue;
311                }
312                let relative = self.project.to_relative(file.path());
313                let parsed = self.ensure_indexed(file.path(), &relative)?;
314                if !parsed.is_empty() {
315                    let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
316                    symbols.push(SymbolInfo {
317                        name: relative.clone(),
318                        kind: SymbolKind::File,
319                        file_path: relative.clone(),
320                        provenance: SymbolProvenance::from_path(&relative),
321                        line: 0,
322                        column: 0,
323                        signature: format!(
324                            "{} ({} symbols)",
325                            file.file_name().to_string_lossy(),
326                            parsed.len()
327                        ),
328                        name_path: relative,
329                        id,
330                        body: None,
331                        children: parsed
332                            .into_iter()
333                            .map(|symbol| to_symbol_info(symbol, depth))
334                            .collect(),
335                        start_byte: 0,
336                        end_byte: 0,
337                    });
338                }
339            }
340            return Ok(symbols);
341        }
342
343        let relative = self.project.to_relative(&resolved);
344        let parsed = self.ensure_indexed(&resolved, &relative)?;
345        Ok(parsed
346            .into_iter()
347            .map(|symbol| to_symbol_info(symbol, depth))
348            .collect())
349    }
350
351    pub fn find_symbol(
352        &self,
353        name: &str,
354        file_path: Option<&str>,
355        include_body: bool,
356        exact_match: bool,
357        max_matches: usize,
358    ) -> Result<Vec<SymbolInfo>> {
359        // Fast path: if name looks like a stable symbol ID, parse and do targeted lookup
360        if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
361            let resolved = self.project.resolve(id_file)?;
362            let relative = self.project.to_relative(&resolved);
363            self.ensure_indexed(&resolved, &relative)?;
364            // Extract the leaf name from name_path (after last '/')
365            let leaf_name = id_name_path.rsplit('/').next().unwrap_or(id_name_path);
366            let db = self.writer();
367            let db_rows = db.find_symbols_by_name(leaf_name, Some(id_file), true, max_matches)?;
368            let mut results = Vec::new();
369            for row in db_rows {
370                if row.name_path != id_name_path {
371                    continue;
372                }
373                let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
374                let body = if include_body {
375                    let abs = self.project.as_path().join(&rel_path);
376                    fs::read_to_string(&abs).ok().map(|source| {
377                        slice_source(&source, row.start_byte as u32, row.end_byte as u32)
378                    })
379                } else {
380                    None
381                };
382                let kind = SymbolKind::from_str_label(&row.kind);
383                let id = make_symbol_id(&rel_path, &kind, &row.name_path);
384                let prov = SymbolProvenance::from_path(&rel_path);
385                results.push(SymbolInfo {
386                    name: row.name,
387                    kind,
388                    provenance: prov,
389                    file_path: rel_path,
390                    line: row.line as usize,
391                    column: row.column_num as usize,
392                    signature: row.signature,
393                    name_path: row.name_path,
394                    id,
395                    body,
396                    children: Vec::new(),
397                    start_byte: row.start_byte as u32,
398                    end_byte: row.end_byte as u32,
399                });
400            }
401            return Ok(results);
402        }
403
404        // Ensure target files are indexed first
405        if let Some(fp) = file_path {
406            let resolved = self.project.resolve(fp)?;
407            let relative = self.project.to_relative(&resolved);
408            self.ensure_indexed(&resolved, &relative)?;
409        } else {
410            // Ensure all files are indexed for a global search
411            let files = collect_candidate_files(self.project.as_path())?;
412            for file in &files {
413                let relative = self.project.to_relative(file);
414                self.ensure_indexed(file, &relative)?;
415            }
416        }
417
418        let db = self.writer();
419        let db_rows = db.find_symbols_by_name(name, file_path, exact_match, max_matches)?;
420
421        let mut results = Vec::new();
422        for row in db_rows {
423            let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
424            let body = if include_body {
425                let abs = self.project.as_path().join(&rel_path);
426                fs::read_to_string(&abs)
427                    .ok()
428                    .map(|source| slice_source(&source, row.start_byte as u32, row.end_byte as u32))
429            } else {
430                None
431            };
432            let kind = SymbolKind::from_str_label(&row.kind);
433            let id = make_symbol_id(&rel_path, &kind, &row.name_path);
434            let prov = SymbolProvenance::from_path(&rel_path);
435            results.push(SymbolInfo {
436                name: row.name,
437                kind,
438                provenance: prov,
439                file_path: rel_path,
440                line: row.line as usize,
441                column: row.column_num as usize,
442                signature: row.signature,
443                name_path: row.name_path,
444                id,
445                body,
446                children: Vec::new(),
447                start_byte: row.start_byte as u32,
448                end_byte: row.end_byte as u32,
449            });
450        }
451        Ok(results)
452    }
453
454    pub fn get_ranked_context(
455        &self,
456        query: &str,
457        path: Option<&str>,
458        max_tokens: usize,
459        include_body: bool,
460        depth: usize,
461    ) -> Result<RankedContextResult> {
462        let all_symbols = if let Some(path) = path {
463            self.get_symbols_overview(path, depth)?
464        } else {
465            // SelectSolve: file pre-filtering → top files → symbol extraction
466            self.select_solve_symbols(query, depth)?
467        };
468
469        let mut scored = all_symbols
470            .into_iter()
471            .flat_map(flatten_symbol_infos)
472            .filter_map(|symbol| score_symbol(query, &symbol).map(|score| (symbol, score)))
473            .collect::<Vec<_>>();
474        scored.sort_by_key(|right| std::cmp::Reverse(right.1));
475
476        let (selected, chars_used) =
477            prune_to_budget(scored, max_tokens, include_body, self.project.as_path());
478
479        Ok(RankedContextResult {
480            query: query.to_owned(),
481            count: selected.len(),
482            symbols: selected,
483            token_budget: max_tokens,
484            chars_used,
485        })
486    }
487
488    /// Access the underlying database (e.g. for import graph queries).
489    pub fn db(&self) -> std::sync::MutexGuard<'_, IndexDb> {
490        self.writer()
491    }
492}
493
494pub fn get_symbols_overview(
495    project: &ProjectRoot,
496    path: &str,
497    depth: usize,
498) -> Result<Vec<SymbolInfo>> {
499    let resolved = project.resolve(path)?;
500    if resolved.is_dir() {
501        return get_directory_symbols(project, &resolved, depth);
502    }
503    get_file_symbols(project, &resolved, depth)
504}
505
506/// Find the byte range (start_byte, end_byte) of a named symbol in a file.
507/// If name_path is provided (e.g. "ClassName/method"), matches by full name_path;
508/// otherwise matches by symbol name alone.
509pub fn find_symbol_range(
510    project: &ProjectRoot,
511    relative_path: &str,
512    symbol_name: &str,
513    name_path: Option<&str>,
514) -> Result<(usize, usize)> {
515    let file = project.resolve(relative_path)?;
516    let rel = project.to_relative(&file);
517    let Some(language_config) = language_for_path(&file) else {
518        bail!("unsupported file type: {}", file.display());
519    };
520    let source =
521        fs::read_to_string(&file).with_context(|| format!("failed to read {}", file.display()))?;
522    let parsed = parse_symbols(&language_config, &rel, &source, false)?;
523    let flat = flatten_symbols(parsed);
524
525    let candidate = if let Some(np) = name_path {
526        flat.into_iter()
527            .find(|sym| sym.name_path == np || sym.name == symbol_name)
528    } else {
529        flat.into_iter().find(|sym| sym.name == symbol_name)
530    };
531
532    match candidate {
533        Some(sym) => Ok((sym.start_byte as usize, sym.end_byte as usize)),
534        None => bail!(
535            "symbol '{}' not found in {}",
536            name_path.unwrap_or(symbol_name),
537            relative_path
538        ),
539    }
540}
541
542pub fn find_symbol(
543    project: &ProjectRoot,
544    name: &str,
545    file_path: Option<&str>,
546    include_body: bool,
547    exact_match: bool,
548    max_matches: usize,
549) -> Result<Vec<SymbolInfo>> {
550    // Fast path: stable symbol ID
551    if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
552        let resolved = project.resolve(id_file)?;
553        let rel = project.to_relative(&resolved);
554        let Some(language_config) = language_for_path(&resolved) else {
555            return Ok(Vec::new());
556        };
557        let source = fs::read_to_string(&resolved)?;
558        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
559        let mut results = Vec::new();
560        for symbol in flatten_symbols(parsed) {
561            if symbol.name_path == id_name_path {
562                results.push(to_symbol_info(symbol, usize::MAX));
563                if results.len() >= max_matches {
564                    return Ok(results);
565                }
566            }
567        }
568        return Ok(results);
569    }
570
571    let files = match file_path {
572        Some(path) => vec![project.resolve(path)?],
573        None => collect_candidate_files(project.as_path())?,
574    };
575
576    let query = name.to_lowercase();
577    let mut results = Vec::new();
578
579    for file in files {
580        let rel = project.to_relative(&file);
581        let Some(language_config) = language_for_path(&file) else {
582            continue;
583        };
584        let source = match fs::read_to_string(&file) {
585            Ok(source) => source,
586            Err(_) => continue,
587        };
588        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
589        for symbol in flatten_symbols(parsed) {
590            let matched = if exact_match {
591                symbol.name == name
592            } else {
593                scoring::contains_ascii_ci(&symbol.name, &query)
594            };
595            if matched {
596                results.push(to_symbol_info(symbol, usize::MAX));
597                if results.len() >= max_matches {
598                    return Ok(results);
599                }
600            }
601        }
602    }
603
604    Ok(results)
605}
606
607fn get_directory_symbols(
608    project: &ProjectRoot,
609    dir: &Path,
610    depth: usize,
611) -> Result<Vec<SymbolInfo>> {
612    let mut symbols = Vec::new();
613    for entry in WalkDir::new(dir)
614        .into_iter()
615        .filter_entry(|entry| !is_excluded(entry.path()))
616    {
617        let entry = entry?;
618        if !entry.file_type().is_file() {
619            continue;
620        }
621        let path = entry.path();
622        if language_for_path(path).is_none() {
623            continue;
624        }
625        let file_symbols = get_file_symbols(project, path, depth)?;
626        if !file_symbols.is_empty() {
627            let relative = project.to_relative(path);
628            let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
629            symbols.push(SymbolInfo {
630                name: relative.clone(),
631                kind: SymbolKind::File,
632                file_path: relative.clone(),
633                provenance: SymbolProvenance::from_path(&relative),
634                line: 0,
635                column: 0,
636                signature: format!(
637                    "{} ({} symbols)",
638                    path.file_name()
639                        .and_then(|name| name.to_str())
640                        .unwrap_or_default(),
641                    file_symbols.len()
642                ),
643                name_path: relative,
644                id,
645                body: None,
646                children: file_symbols,
647                start_byte: 0,
648                end_byte: 0,
649            });
650        }
651    }
652    Ok(symbols)
653}
654
655fn get_file_symbols(project: &ProjectRoot, file: &Path, depth: usize) -> Result<Vec<SymbolInfo>> {
656    let relative = project.to_relative(file);
657    let Some(language_config) = language_for_path(file) else {
658        return Ok(Vec::new());
659    };
660    let source =
661        fs::read_to_string(file).with_context(|| format!("failed to read {}", file.display()))?;
662    let parsed = parse_symbols(&language_config, &relative, &source, false)?;
663    Ok(parsed
664        .into_iter()
665        .map(|symbol| to_symbol_info(symbol, depth))
666        .collect())
667}
668
669fn collect_candidate_files(root: &Path) -> Result<Vec<PathBuf>> {
670    collect_files(root, |path| language_for_path(path).is_some())
671}
672
673fn file_modified_ms(path: &Path) -> Result<u128> {
674    let modified = fs::metadata(path)
675        .with_context(|| format!("failed to stat {}", path.display()))?
676        .modified()
677        .with_context(|| format!("failed to read mtime for {}", path.display()))?;
678    Ok(modified
679        .duration_since(UNIX_EPOCH)
680        .unwrap_or_default()
681        .as_millis())
682}